Run power calibration for all GPUs simultaneously

Previously each GPU was calibrated sequentially (one card fully done
before the next started), producing the staircase temperature pattern
seen on the graph.

Now all GPUs run together in a single dcgmi diag -r targeted_power
session per attempt. This means:
- All cards are under realistic thermal load at the same time.
- A single DCGM session handles the run — no resource-busy contention
  from concurrent dcgmi processes.
- Binary search state (lo/hi) is tracked independently per GPU; each
  card converges to its own highest stable power limit.
- Throttle counter polling covers all active GPUs in the shared ticker.
- Resource-busy exponential back-off is shared (one DCGM session).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-14 22:25:05 +03:00
parent 19dbabd71d
commit 3cf2e9c9dc

View File

@@ -2498,8 +2498,25 @@ func runBenchmarkPowerCalibration(
err error err error
} }
// gpuCalibState holds per-GPU binary search state during parallel calibration.
// One instance exists per GPU; the shared calibration loop mutates it after
// every targeted_power attempt until the search converges.
type gpuCalibState struct {
idx int // GPU index as passed to dcgmi / the power-limit setters
info benchmarkGPUInfo // GPU info snapshot; PowerLimitW is refreshed as the search moves
originalLimitW int // power limit (W) before calibration, used by the restore action
appliedLimitW int // power limit (W) currently applied for the next attempt
minLimitW int // floor of the search range (W); never derate below this
lo int // highest verified-stable limit (assumed: minLimitW)
hi int // lowest verified-unstable limit (exclusive sentinel above start)
calib benchmarkPowerCalibrationResult // accumulated per-GPU result returned to the caller
converged bool // true once this GPU's binary search has finished (stable, failed, or given up)
}
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices)) results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
var restore []benchmarkRestoreAction var restore []benchmarkRestoreAction
// Initialise per-GPU state.
states := make([]*gpuCalibState, 0, len(gpuIndices))
for _, idx := range gpuIndices { for _, idx := range gpuIndices {
info := infoByIndex[idx] info := infoByIndex[idx]
originalLimitW := int(math.Round(info.PowerLimitW)) originalLimitW := int(math.Round(info.PowerLimitW))
@@ -2528,17 +2545,17 @@ func runBenchmarkPowerCalibration(
if minLimitW < calibSearchTolerance { if minLimitW < calibSearchTolerance {
minLimitW = calibSearchTolerance minLimitW = calibSearchTolerance
} }
s := &gpuCalibState{
calib := benchmarkPowerCalibrationResult{ idx: idx,
AppliedPowerLimitW: float64(appliedLimitW), info: info,
originalLimitW: originalLimitW,
appliedLimitW: appliedLimitW,
minLimitW: minLimitW,
lo: minLimitW,
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
} }
// Binary search bounds for finding the highest stable power limit. states = append(states, s)
// lo = highest verified-stable level (assumed: minLimitW).
// hi = lowest verified-unstable level (assumed: above the starting limit).
lo := minLimitW
hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable
busyRetries := 0
busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
if canDerate && originalLimitW > 0 { if canDerate && originalLimitW > 0 {
idxCopy := idx idxCopy := idx
orig := originalLimitW orig := originalLimitW
@@ -2549,119 +2566,131 @@ func runBenchmarkPowerCalibration(
}, },
}) })
} }
}
calibLoop: // Shared DCGM resource-busy back-off state (single diagnostic session).
busyRetries := 0
busyDelaySec := 1
sharedAttempt := 0
// sharedAttemptResult carries the outcome of one shared targeted_power run
// that covers all GPUs simultaneously; delivered over doneCh by the runner
// goroutine.
type sharedAttemptResult struct {
out []byte // raw command output, persisted to the per-attempt log file
rows []GPUMetricRow // telemetry for all GPUs; filtered per GPU with filterRowsByGPU afterwards
err error // run error, if any; inspected for the DCGM resource-busy condition
}
calibDone:
for { for {
calib.Attempts++ // Collect non-converged GPUs.
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec)) var active []*gpuCalibState
for _, s := range states {
if !s.converged {
active = append(active, s)
}
}
if len(active) == 0 || ctx.Err() != nil {
break
}
beforeThrottle, _ := queryThrottleCounters(idx) sharedAttempt++
attemptCtx, cancel := context.WithCancel(ctx) for _, s := range active {
doneCh := make(chan calibrationAttemptResult, 1) s.calib.Attempts++
logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts) logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx}) }
// Snapshot throttle counters for all active GPUs before the run.
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
for _, s := range active {
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
}
// Run targeted_power for ALL gpuIndices simultaneously so every card
// is under load during calibration — this reflects real server thermals.
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
attemptCtx, cancelAttempt := context.WithCancel(ctx)
doneCh := make(chan sharedAttemptResult, 1)
go func() { go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc) out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err} doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
}() }()
ticker := time.NewTicker(time.Second) ticker := time.NewTicker(time.Second)
var ( throttleReasons := make(map[int]string, len(active))
attempt calibrationAttemptResult var ar sharedAttemptResult
throttleReason string
)
attemptLoop: attemptLoop:
for { for {
select { select {
case attempt = <-doneCh: case ar = <-doneCh:
break attemptLoop break attemptLoop
case <-ticker.C: case <-ticker.C:
afterThrottle, err := queryThrottleCounters(idx) // Poll throttle counters for each active GPU independently.
for _, s := range active {
if throttleReasons[s.idx] != "" {
continue // already detected for this GPU
}
after, err := queryThrottleCounters(s.idx)
if err != nil { if err != nil {
continue continue
} }
// Record the throttle reason but do NOT cancel the dcgmi // Record throttle but do NOT cancel — let dcgmi finish so
// process. Killing it mid-run leaves nv-hostengine holding // nv-hostengine releases the slot cleanly before the next attempt.
// the diagnostic slot, which causes DCGM_ST_IN_USE on every if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
// subsequent attempt. Let targeted_power run to its natural throttleReasons[s.idx] = reason
// end so the daemon releases the slot cleanly before we logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
// reduce power and retry. }
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" {
throttleReason = reason
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW))
} }
case <-ctx.Done(): case <-ctx.Done():
cancel() cancelAttempt()
attempt = <-doneCh ar = <-doneCh
break attemptLoop break attemptLoop
} }
} }
ticker.Stop() ticker.Stop()
cancel() cancelAttempt()
_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644) _ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
perGPU := filterRowsByGPU(attempt.rows, idx) // Resource busy: retry with exponential back-off (shared — one DCGM session).
summary := summarizeBenchmarkTelemetry(perGPU) if ar.err != nil && isDCGMResourceBusy(ar.err) {
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
// Stable at appliedLimitW: record it and binary-search upward.
calib.Summary = summary
calib.Completed = true
calib.AppliedPowerLimitW = float64(appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
lo = appliedLimitW
// If there is still headroom to search, try a higher level.
if canDerate && hi-lo > calibSearchTolerance {
nextLimitW := roundTo5W((lo + hi) / 2)
if nextLimitW > lo && nextLimitW < hi {
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil {
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW))
continue calibLoop
}
}
}
break
}
// If DCGM reports the resource is in use, nv-hostengine has not yet
// released the diagnostic slot from the previous attempt. Do not
// derate: wait with exponential back-off and retry at the same
// power limit. Once the back-off delay would exceed
// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
// held by something else.
if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
if busyDelaySec > dcgmResourceBusyMaxDelaySec { if busyDelaySec > dcgmResourceBusyMaxDelaySec {
calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries)) for _, s := range active {
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
break s.converged = true
}
logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
break calibDone
} }
busyRetries++ busyRetries++
logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec)) // Undo attempt counter: busy retries don't count as real attempts.
for _, s := range active {
s.calib.Attempts--
}
logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
select { select {
case <-ctx.Done(): case <-ctx.Done():
break calibLoop break calibDone
case <-time.After(time.Duration(busyDelaySec) * time.Second): case <-time.After(time.Duration(busyDelaySec) * time.Second):
} }
next := busyDelaySec * 2 next := busyDelaySec * 2
if next > dcgmResourceBusyMaxDelaySec { if next > dcgmResourceBusyMaxDelaySec {
next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail next = dcgmResourceBusyMaxDelaySec + 1
} }
busyDelaySec = next busyDelaySec = next
continue calibLoop sharedAttempt-- // retry same logical attempt number
continue
} }
busyRetries = 0 // reset on any non-busy outcome busyRetries = 0
busyDelaySec = 1 // reset back-off busyDelaySec = 1
switch { // Per-GPU analysis and binary search update.
case throttleReason != "": for _, s := range active {
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW)) perGPU := filterRowsByGPU(ar.rows, s.idx)
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW)) summary := summarizeBenchmarkTelemetry(perGPU)
// Check whether the thermal throttle coincided with fans below throttle := throttleReasons[s.idx]
// maximum: that combination suggests cooling misconfiguration
// rather than a fundamental power-delivery limit. // Cooling warning: thermal throttle with fans not at maximum.
if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" { if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
clocks := make([]float64, 0, len(perGPU)) clocks := make([]float64, 0, len(perGPU))
var fanDutyValues []float64 var fanDutyValues []float64
fanDutyAvail := false fanDutyAvail := false
@@ -2677,72 +2706,103 @@ func runBenchmarkPowerCalibration(
dropPct := benchmarkClockDrift(clocks) dropPct := benchmarkClockDrift(clocks)
p95FanDuty := benchmarkPercentile(fanDutyValues, 95) p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 { if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
calib.CoolingWarning = fmt.Sprintf( s.calib.CoolingWarning = fmt.Sprintf(
"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load", "thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
throttleReason, dropPct, p95FanDuty, throttle, dropPct, p95FanDuty,
) )
logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning)) logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
} }
} }
case attempt.err != nil:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err)) if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err)) // Stable at current limit — update lo and binary-search upward.
s.calib.Summary = summary
s.calib.Completed = true
s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
s.lo = s.appliedLimitW
if canDerate && s.hi-s.lo > calibSearchTolerance {
next := roundTo5W((s.lo + s.hi) / 2)
if next > s.lo && next < s.hi {
if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
s.appliedLimitW = next
s.calib.AppliedPowerLimitW = float64(next)
s.calib.Completed = false // keep searching
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
continue // next GPU in active list
}
}
}
s.converged = true
continue
}
// Failed or throttled — log and binary-search downward.
switch {
case throttle != "":
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
case ar.err != nil:
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
default: default:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW)) logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
} }
if !canDerate || appliedLimitW <= 0 { if !canDerate || s.appliedLimitW <= 0 {
break s.converged = true
continue
} }
// Binary-search for the highest stable power limit. s.hi = s.appliedLimitW
// This attempt failed or throttled, so update the upper bound.
hi = appliedLimitW
if hi-lo <= calibSearchTolerance { if s.hi-s.lo <= calibSearchTolerance {
// Search range exhausted: lo is the highest verified-stable level. if s.lo > s.minLimitW {
if lo > minLimitW { s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi)) if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil { s.appliedLimitW = s.lo
appliedLimitW = lo s.calib.AppliedPowerLimitW = float64(s.lo)
calib.AppliedPowerLimitW = float64(lo) s.calib.Derated = s.lo < s.originalLimitW
calib.Derated = lo < originalLimitW
} }
} else { } else {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
} }
break s.converged = true
continue
} }
// Binary midpoint within the remaining search range. next := roundTo5W((s.lo + s.hi) / 2)
nextLimitW := roundTo5W((lo + hi) / 2) if next <= s.lo {
// Ensure the candidate is strictly inside the search range. next = s.lo + calibSearchTolerance
if nextLimitW <= lo {
nextLimitW = lo + calibSearchTolerance
} }
if nextLimitW >= hi { if next >= s.hi {
nextLimitW = (lo + hi) / 2 next = (s.lo + s.hi) / 2
} }
if nextLimitW < minLimitW { if next < s.minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
break s.converged = true
continue
} }
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil { if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error()) s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err)) logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
break s.converged = true
continue
}
s.appliedLimitW = next
s.calib.AppliedPowerLimitW = float64(next)
s.calib.Derated = next < s.originalLimitW
s.info.PowerLimitW = float64(next)
infoByIndex[s.idx] = s.info
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
} }
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Derated = appliedLimitW < originalLimitW
info.PowerLimitW = float64(appliedLimitW)
infoByIndex[idx] = info
calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi))
logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi))
} }
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 { for _, s := range states {
results[idx] = calib if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
results[s.idx] = s.calib
} }
} }
return results, restore return results, restore