Use real NVIDIA power-limit bounds in benchmark

2026-04-20 09:26:56 +03:00
parent a94e8007f8
commit ab802719f8
2 changed files with 101 additions and 35 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -37,6 +37,8 @@ type benchmarkGPUInfo struct {
 	VBIOS                string
 	PowerLimitW          float64
 	DefaultPowerLimitW   float64
+	MinPowerLimitW       float64
+	MaxPowerLimitW       float64
 	MaxGraphicsClockMHz  float64
 	MaxMemoryClockMHz    float64
 	BaseGraphicsClockMHz float64
@@ -349,9 +351,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
 		result.Normalization.Status = "partial"
 	}
-	// Enrich with max clocks from verbose output — covers GPUs where
-	// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
+	// Enrich with verbose nvidia-smi data — covers GPUs where some CSV fields
+	// are unsupported (e.g. clocks.max.* on Blackwell / driver 98.x).
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQOut)

 	activeApps, err := queryActiveComputeApps(selected)
 	if err == nil && len(activeApps) > 0 {
@@ -735,8 +737,8 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
 // (attribute.multiprocessor_count, power.default_limit) are not supported on
 // all driver versions, so we fall back to the base set if the full query fails.
 // The minimal fallback omits clock fields entirely — clocks.max.* returns
-// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
-// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
+// exit status 2 on some GPU generations (e.g. Blackwell); missing data is
+// then recovered from nvidia-smi -q.
 var benchmarkGPUInfoQueries = []struct {
 	fields   string
 	extended bool // whether this query includes optional extended fields
@@ -756,12 +758,9 @@ var benchmarkGPUInfoQueries = []struct {
 	},
 }

-// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
-// any GPU in infoByIndex where those values are still zero.  It parses the
-// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
-// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
-// return exit status 2 but the verbose query works fine.
-func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
+// enrichGPUInfoWithNvidiaSMIQ fills still-zero benchmark GPU metadata from
+// nvidia-smi -q for fields that --query-gpu may omit on some driver versions.
+func enrichGPUInfoWithNvidiaSMIQ(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
 	if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
 		return
 	}
@@ -782,6 +781,8 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
 	maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
 	defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
 	currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
+	minPwrRe := regexp.MustCompile(`(?i)Min Power Limit\s*:\s*([0-9.]+)\s*W`)
+	maxPwrRe := regexp.MustCompile(`(?i)Max Power Limit\s*:\s*([0-9.]+)\s*W`)
 	smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
 	shutdownTempRe := regexp.MustCompile(`(?i)GPU Shutdown Temp\s*:\s*(\d+)\s*C`)
 	slowdownTempRe := regexp.MustCompile(`(?i)GPU Slowdown Temp\s*:\s*(\d+)\s*C`)
@@ -841,6 +842,20 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
 				}
 			}
 		}
+		if info.MinPowerLimitW == 0 {
+			if m := minPwrRe.FindSubmatch(section); m != nil {
+				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
+					info.MinPowerLimitW = v
+				}
+			}
+		}
+		if info.MaxPowerLimitW == 0 {
+			if m := maxPwrRe.FindSubmatch(section); m != nil {
+				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
+					info.MaxPowerLimitW = v
+				}
+			}
+		}
 		if info.MultiprocessorCount == 0 {
 			if m := smCountRe.FindSubmatch(section); m != nil {
 				if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
@@ -3041,7 +3056,6 @@ func runBenchmarkPowerCalibration(
 	if calibDurationSec <= 0 {
 		calibDurationSec = 120
 	}
-	const maxDerateW = 150
 	// calibSearchTolerance is the binary-search convergence threshold in watts.
 	// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
 	const calibSearchTolerance = 10
@@ -3088,8 +3102,9 @@ func runBenchmarkPowerCalibration(
 		originalLimitW int
 		appliedLimitW  int
 		minLimitW      int
-		lo             int // highest verified-stable limit (assumed: minLimitW)
+		lo             int // highest verified-stable limit
 		hi             int // lowest verified-unstable limit (exclusive sentinel above start)
+		loVerified     bool
 		calib          benchmarkPowerCalibrationResult
 		converged      bool
 	}
@@ -3115,19 +3130,13 @@ func runBenchmarkPowerCalibration(
 		if appliedLimitW <= 0 {
 			appliedLimitW = defaultLimitW
 		}
-		minLimitW := appliedLimitW
-		switch {
-		case defaultLimitW > 0:
-			minLimitW = defaultLimitW - maxDerateW
-			floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
-			if minLimitW < floorByRatio {
-				minLimitW = floorByRatio
-			}
-		case appliedLimitW > 0:
-			minLimitW = appliedLimitW - maxDerateW
+		minLimitW := int(math.Round(info.MinPowerLimitW))
+		if minLimitW <= 0 {
+			minLimitW = appliedLimitW
 		}
-		if minLimitW < calibSearchTolerance {
-			minLimitW = calibSearchTolerance
+		maxLimitW := int(math.Round(info.MaxPowerLimitW))
+		if maxLimitW > 0 && appliedLimitW > maxLimitW {
+			appliedLimitW = maxLimitW
 		}
 		s := &gpuCalibState{
 			idx:            idx,
@@ -3139,11 +3148,24 @@ func runBenchmarkPowerCalibration(
 			hi:             appliedLimitW + 1, // not yet tested, not yet confirmed unstable
 			calib:          benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
 		}
+		if minLimitW > 0 && appliedLimitW > 0 && minLimitW >= appliedLimitW {
+			s.appliedLimitW = minLimitW
+			s.hi = minLimitW + 1
+		}
+		if info.MinPowerLimitW <= 0 {
+			s.calib.Notes = append(s.calib.Notes, "minimum power limit was not reported by nvidia-smi; calibration can only validate the current/default power limit")
+		}
 		if seedLimits != nil {
 			if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
 				// A previously validated limit is only a starting point. Re-run
 				// targeted_power under the current multi-GPU thermal load and derate
 				// again if this step shows new throttling.
+				if seedW < s.minLimitW {
+					seedW = s.minLimitW
+				}
+				if maxLimitW > 0 && seedW > maxLimitW {
+					seedW = maxLimitW
+				}
 				if canDerate {
 					_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
 				}
@@ -3331,6 +3353,7 @@ calibDone:
 				s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
 				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
 				s.lo = s.appliedLimitW
+				s.loVerified = true
 				if canDerate && s.hi-s.lo > calibSearchTolerance {
 					next := roundTo5W((s.lo + s.hi) / 2)
 					if next > s.lo && next < s.hi {
@@ -3369,7 +3392,23 @@ calibDone:
 			s.hi = s.appliedLimitW

 			if s.hi-s.lo <= calibSearchTolerance {
-				if s.lo > s.minLimitW {
+				if !s.loVerified && s.minLimitW > 0 && s.appliedLimitW != s.minLimitW {
+					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.minLimitW); err != nil {
+						s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
+						logFunc(fmt.Sprintf("power calibration: GPU %d failed to set minimum power limit %d W: %v", s.idx, s.minLimitW, err))
+						s.converged = true
+						continue
+					}
+					s.appliedLimitW = s.minLimitW
+					s.calib.AppliedPowerLimitW = float64(s.minLimitW)
+					s.calib.Derated = s.minLimitW < s.originalLimitW
+					s.info.PowerLimitW = float64(s.minLimitW)
+					infoByIndex[s.idx] = s.info
+					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: validating minimum settable limit %d W before concluding failure", s.minLimitW))
+					logFunc(fmt.Sprintf("power calibration: GPU %d binary search: validating minimum settable limit %d W", s.idx, s.minLimitW))
+					continue
+				}
+				if s.loVerified {
 					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
 					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
 						s.appliedLimitW = s.lo
@@ -3381,7 +3420,8 @@ calibDone:
 						s.calib.Completed = true
 					}
 				} else {
-					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
+					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit down to the minimum settable power limit %d W", engineLabel, s.minLimitW))
+					logFunc(fmt.Sprintf("power calibration: GPU %d no stable limit found down to minimum settable power limit %d W", s.idx, s.minLimitW))
 				}
 				s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
 				s.converged = true
@@ -3396,9 +3436,7 @@ calibDone:
 				next = (s.lo + s.hi) / 2
 			}
 			if next < s.minLimitW {
-				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
-				s.converged = true
-				continue
+				next = s.minLimitW
 			}
 			if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
 				s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
--- a/audit/internal/platform/benchmark_test.go
+++ b/audit/internal/platform/benchmark_test.go
@@ -356,12 +356,16 @@ func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
 	}
 }

-func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQ(t *testing.T) {
 	t.Parallel()

 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
+    Min Power Limit                       : 200.00 W
+    Max Power Limit                       : 600.00 W
+    Default Power Limit                   : 575.00 W
+    Current Power Limit                   : 560.00 W
    Clocks
        Graphics                          : 2422 MHz
        Memory                            : 12481 MHz
@@ -383,7 +387,7 @@ GPU 00000000:4F:00.0
 		1: {Index: 1, BusID: "00000000:4F:00.0"},
 	}

-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
@@ -397,25 +401,49 @@ GPU 00000000:4F:00.0
 	if infoByIndex[1].MaxMemoryClockMHz != 12481 {
 		t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
 	}
+	if infoByIndex[0].MinPowerLimitW != 200 {
+		t.Errorf("GPU 0 MinPowerLimitW = %v, want 200", infoByIndex[0].MinPowerLimitW)
+	}
+	if infoByIndex[0].MaxPowerLimitW != 600 {
+		t.Errorf("GPU 0 MaxPowerLimitW = %v, want 600", infoByIndex[0].MaxPowerLimitW)
+	}
+	if infoByIndex[0].DefaultPowerLimitW != 575 {
+		t.Errorf("GPU 0 DefaultPowerLimitW = %v, want 575", infoByIndex[0].DefaultPowerLimitW)
+	}
+	if infoByIndex[0].PowerLimitW != 560 {
+		t.Errorf("GPU 0 PowerLimitW = %v, want 560", infoByIndex[0].PowerLimitW)
+	}
 }

-func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
+func TestEnrichGPUInfoWithNvidiaSMIQSkipsPopulated(t *testing.T) {
 	t.Parallel()

 	nvsmiQ := []byte(`
 GPU 00000000:4E:00.0
+    Min Power Limit                       : 100.00 W
+    Max Power Limit                       : 900.00 W
    Max Clocks
        Graphics                          : 9999 MHz
        Memory                            : 9999 MHz
 `)
 	// Already populated — must not be overwritten.
 	infoByIndex := map[int]benchmarkGPUInfo{
-		0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
+		0: {
+			Index:               0,
+			BusID:               "00000000:4E:00.0",
+			MaxGraphicsClockMHz: 2430,
+			MaxMemoryClockMHz:   12481,
+			MinPowerLimitW:      200,
+			MaxPowerLimitW:      600,
+		},
 	}

-	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)
+	enrichGPUInfoWithNvidiaSMIQ(infoByIndex, nvsmiQ)

 	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
 		t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
 	}
+	if infoByIndex[0].MinPowerLimitW != 200 {
+		t.Errorf("expected existing min power limit to be preserved, got %v", infoByIndex[0].MinPowerLimitW)
+	}
 }