diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go
index 2fabde9..8c33be0 100644
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -2498,8 +2498,25 @@ func runBenchmarkPowerCalibration(
 		err  error
 	}
 
+
+	// gpuCalibState holds per-GPU binary search state during parallel calibration; calib accumulates the result reported for the GPU.
+	type gpuCalibState struct {
+		idx            int // GPU index, taken from gpuIndices
+		info           benchmarkGPUInfo
+		originalLimitW int // rounded info.PowerLimitW captured before calibration; baseline for calib.Derated
+		appliedLimitW  int // power limit (W) under test in the current attempt
+		minLimitW      int // lowest limit the search may try; clamped to at least calibSearchTolerance
+		lo             int // highest verified-stable limit; starts at minLimitW, which is assumed (not verified) stable
+		hi             int // lowest verified-unstable limit; starts at appliedLimitW+1, an exclusive sentinel meaning "untested"
+		calib          benchmarkPowerCalibrationResult
+		converged      bool // true once this GPU needs no further attempts; it is then left out of the active list
+	}
+
 	results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
 	var restore []benchmarkRestoreAction
+
+	// Initialise per-GPU state.
+	states := make([]*gpuCalibState, 0, len(gpuIndices))
 	for _, idx := range gpuIndices {
 		info := infoByIndex[idx]
 		originalLimitW := int(math.Round(info.PowerLimitW))
@@ -2528,17 +2545,17 @@ func runBenchmarkPowerCalibration(
 		if minLimitW < calibSearchTolerance {
 			minLimitW = calibSearchTolerance
 		}
-
-		calib := benchmarkPowerCalibrationResult{
-			AppliedPowerLimitW: float64(appliedLimitW),
+		s := &gpuCalibState{
+			idx:            idx,
+			info:           info,
+			originalLimitW: originalLimitW,
+			appliedLimitW:  appliedLimitW,
+			minLimitW:      minLimitW,
+			lo:             minLimitW,
+			hi:             appliedLimitW + 1, // not yet tested, not yet confirmed unstable
+			calib:          benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
 		}
-		// Binary search bounds for finding the highest stable power limit.
-		// lo = highest verified-stable level (assumed: minLimitW).
-		// hi = lowest verified-unstable level (assumed: above the starting limit).
-		lo := minLimitW
-		hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable
-		busyRetries := 0
-		busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec
+		states = append(states, s)
 		if canDerate && originalLimitW > 0 {
 			idxCopy := idx
 			orig := originalLimitW
@@ -2549,200 +2566,243 @@ func runBenchmarkPowerCalibration(
 				},
 			})
 		}
+	}
 
-	calibLoop:
+	// Shared DCGM resource-busy back-off state (single diagnostic session).
+	busyRetries := 0
+	busyDelaySec := 1
+	sharedAttempt := 0
+
+	type sharedAttemptResult struct { // outcome of one shared targeted_power run covering all gpuIndices
+		out  []byte         // output returned by runBenchmarkCommandWithMetrics; written to the per-attempt log file
+		rows []GPUMetricRow // metric rows returned for the run; split per GPU with filterRowsByGPU
+		err  error          // run error; checked for DCGM resource-busy before per-GPU analysis
+	}
+
+calibDone:
+	for {
+		// Collect non-converged GPUs.
+		var active []*gpuCalibState
+		for _, s := range states {
+			if !s.converged {
+				active = append(active, s)
+			}
+		}
+		if len(active) == 0 || ctx.Err() != nil {
+			break
+		}
+
+		sharedAttempt++
+		for _, s := range active {
+			s.calib.Attempts++
+			logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
+		}
+
+		// Snapshot throttle counters for all active GPUs before the run.
+		beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
+		for _, s := range active {
+			beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
+		}
+
+		// Run targeted_power for ALL gpuIndices simultaneously so every card
+		// is under load during calibration — this reflects real server thermals.
+		logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
+		cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
+		attemptCtx, cancelAttempt := context.WithCancel(ctx)
+		doneCh := make(chan sharedAttemptResult, 1)
+		go func() {
+			out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
+			doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
+		}()
+
+		ticker := time.NewTicker(time.Second)
+		throttleReasons := make(map[int]string, len(active))
+		var ar sharedAttemptResult
+
+	attemptLoop:
 		for {
-			calib.Attempts++
-			logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
-
-			beforeThrottle, _ := queryThrottleCounters(idx)
-			attemptCtx, cancel := context.WithCancel(ctx)
-			doneCh := make(chan calibrationAttemptResult, 1)
-			logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts)
-			cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx})
-			go func() {
-				out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc)
-				doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
-			}()
-
-			ticker := time.NewTicker(time.Second)
-			var (
-				attempt        calibrationAttemptResult
-				throttleReason string
-			)
-		attemptLoop:
-			for {
-				select {
-				case attempt = <-doneCh:
-					break attemptLoop
-				case <-ticker.C:
-					afterThrottle, err := queryThrottleCounters(idx)
+			select {
+			case ar = <-doneCh:
+				break attemptLoop
+			case <-ticker.C:
+				// Poll throttle counters for each active GPU independently.
+				for _, s := range active {
+					if throttleReasons[s.idx] != "" {
+						continue // already detected for this GPU
+					}
+					after, err := queryThrottleCounters(s.idx)
 					if err != nil {
 						continue
 					}
-					// Record the throttle reason but do NOT cancel the dcgmi
-					// process. Killing it mid-run leaves nv-hostengine holding
-					// the diagnostic slot, which causes DCGM_ST_IN_USE on every
-					// subsequent attempt. Let targeted_power run to its natural
-					// end so the daemon releases the slot cleanly before we
-					// reduce power and retry.
-					if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" {
-						throttleReason = reason
-						logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW))
+					// Record throttle but do NOT cancel — let dcgmi finish so
+					// nv-hostengine releases the slot cleanly before the next attempt.
+					if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
+						throttleReasons[s.idx] = reason
+						logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
 					}
-				case <-ctx.Done():
-					cancel()
-					attempt = <-doneCh
-					break attemptLoop
 				}
+			case <-ctx.Done():
+				cancelAttempt()
+				ar = <-doneCh
+				break attemptLoop
 			}
-			ticker.Stop()
-			cancel()
-			_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644)
+		}
+		ticker.Stop()
+		cancelAttempt()
+		_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
 
-			perGPU := filterRowsByGPU(attempt.rows, idx)
+		// Resource busy: retry with exponential back-off (shared — one DCGM session).
+		if ar.err != nil && isDCGMResourceBusy(ar.err) {
+			if busyDelaySec > dcgmResourceBusyMaxDelaySec {
+				for _, s := range active {
+					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
+					s.converged = true
+				}
+				logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
+				break calibDone
+			}
+			busyRetries++
+			// Undo attempt counter: busy retries don't count as real attempts.
+			for _, s := range active {
+				s.calib.Attempts--
+			}
+			logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
+			select {
+			case <-ctx.Done():
+				break calibDone
+			case <-time.After(time.Duration(busyDelaySec) * time.Second):
+			}
+			next := busyDelaySec * 2
+			if next > dcgmResourceBusyMaxDelaySec {
+				next = dcgmResourceBusyMaxDelaySec + 1
+			}
+			busyDelaySec = next
+			sharedAttempt-- // retry same logical attempt number
+			continue
+		}
+		busyRetries = 0
+		busyDelaySec = 1
+
+		// Per-GPU analysis and binary search update.
+		for _, s := range active {
+			perGPU := filterRowsByGPU(ar.rows, s.idx)
 			summary := summarizeBenchmarkTelemetry(perGPU)
-			if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
-				// Stable at appliedLimitW: record it and binary-search upward.
-				calib.Summary = summary
-				calib.Completed = true
-				calib.AppliedPowerLimitW = float64(appliedLimitW)
-				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
-				lo = appliedLimitW
-				// If there is still headroom to search, try a higher level.
-				if canDerate && hi-lo > calibSearchTolerance {
-					nextLimitW := roundTo5W((lo + hi) / 2)
-					if nextLimitW > lo && nextLimitW < hi {
-						if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil {
-							appliedLimitW = nextLimitW
-							calib.AppliedPowerLimitW = float64(appliedLimitW)
-							calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi))
-							logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW))
-							continue calibLoop
+			throttle := throttleReasons[s.idx]
+
+			// Cooling warning: thermal throttle with fans not at maximum.
+			if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
+				clocks := make([]float64, 0, len(perGPU))
+				var fanDutyValues []float64
+				fanDutyAvail := false
+				for _, r := range perGPU {
+					if r.ClockMHz > 0 {
+						clocks = append(clocks, r.ClockMHz)
+					}
+					if r.FanDutyCycleAvailable {
+						fanDutyAvail = true
+						fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
+					}
+				}
+				dropPct := benchmarkClockDrift(clocks)
+				p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
+				if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
+					s.calib.CoolingWarning = fmt.Sprintf(
+						"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
+						throttle, dropPct, p95FanDuty,
+					)
+					logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
+				}
+			}
+
+			if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
+				// Stable at current limit — update lo and binary-search upward.
+				s.calib.Summary = summary
+				s.calib.Completed = true
+				s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
+				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
+				s.lo = s.appliedLimitW
+				if canDerate && s.hi-s.lo > calibSearchTolerance {
+					next := roundTo5W((s.lo + s.hi) / 2)
+					if next > s.lo && next < s.hi {
+						if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
+							s.appliedLimitW = next
+							s.calib.AppliedPowerLimitW = float64(next)
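+							s.calib.Completed = false // keep searching; NOTE(review): nothing sets this back to true if the higher limit then fails and the search falls back to lo — confirm intended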
+							s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
+							logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
+							continue // next GPU in active list
 						}
 					}
 				}
-				break
+				s.converged = true
+				continue
 			}
 
-			// If DCGM reports the resource is in use, nv-hostengine has not yet
-			// released the diagnostic slot from the previous attempt. Do not
-			// derate: wait with exponential back-off and retry at the same
-			// power limit. Once the back-off delay would exceed
-			// dcgmResourceBusyMaxDelaySec, fail — the slot is persistently
-			// held by something else.
-			if attempt.err != nil && isDCGMResourceBusy(attempt.err) {
-				if busyDelaySec > dcgmResourceBusyMaxDelaySec {
-					calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
-					logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries))
-					break
-				}
-				busyRetries++
-				logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec))
-				select {
-				case <-ctx.Done():
-					break calibLoop
-				case <-time.After(time.Duration(busyDelaySec) * time.Second):
-				}
-				next := busyDelaySec * 2
-				if next > dcgmResourceBusyMaxDelaySec {
-					next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail
-				}
-				busyDelaySec = next
-				continue calibLoop
-			}
-			busyRetries = 0    // reset on any non-busy outcome
-			busyDelaySec = 1   // reset back-off
-
+			// Failed or throttled — log and binary-search downward.
 			switch {
-			case throttleReason != "":
-				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
-				logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
-				// Check whether the thermal throttle coincided with fans below
-				// maximum: that combination suggests cooling misconfiguration
-				// rather than a fundamental power-delivery limit.
-				if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" {
-					clocks := make([]float64, 0, len(perGPU))
-					var fanDutyValues []float64
-					fanDutyAvail := false
-					for _, r := range perGPU {
-						if r.ClockMHz > 0 {
-							clocks = append(clocks, r.ClockMHz)
-						}
-						if r.FanDutyCycleAvailable {
-							fanDutyAvail = true
-							fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
-						}
-					}
-					dropPct := benchmarkClockDrift(clocks)
-					p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
-					if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
-						calib.CoolingWarning = fmt.Sprintf(
-							"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
-							throttleReason, dropPct, p95FanDuty,
-						)
-						logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning))
-					}
-				}
-			case attempt.err != nil:
-				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
-				logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
+			case throttle != "":
+				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
+				logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
+			case ar.err != nil:
+				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
+				logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
 			default:
-				calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW))
-				logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW))
+				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
+				logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
 			}
 
-			if !canDerate || appliedLimitW <= 0 {
-				break
+			if !canDerate || s.appliedLimitW <= 0 {
+				s.converged = true
+				continue
 			}
-			// Binary-search for the highest stable power limit.
-			// This attempt failed or throttled, so update the upper bound.
-			hi = appliedLimitW
+			s.hi = s.appliedLimitW
 
-			if hi-lo <= calibSearchTolerance {
-				// Search range exhausted: lo is the highest verified-stable level.
-				if lo > minLimitW {
-					calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi))
-					if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil {
-						appliedLimitW = lo
-						calib.AppliedPowerLimitW = float64(lo)
-						calib.Derated = lo < originalLimitW
+			if s.hi-s.lo <= calibSearchTolerance {
+				if s.lo > s.minLimitW {
+					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
+					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
+						s.appliedLimitW = s.lo
+						s.calib.AppliedPowerLimitW = float64(s.lo)
+						s.calib.Derated = s.lo < s.originalLimitW
 					}
 				} else {
-					calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
+					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
 				}
-				break
+				s.converged = true
+				continue
 			}
 
-			// Binary midpoint within the remaining search range.
-			nextLimitW := roundTo5W((lo + hi) / 2)
-			// Ensure the candidate is strictly inside the search range.
-			if nextLimitW <= lo {
-				nextLimitW = lo + calibSearchTolerance
+			next := roundTo5W((s.lo + s.hi) / 2)
+			if next <= s.lo {
+				next = s.lo + calibSearchTolerance
 			}
-			if nextLimitW >= hi {
-				nextLimitW = (lo + hi) / 2
+			if next >= s.hi {
+				next = (s.lo + s.hi) / 2
 			}
-			if nextLimitW < minLimitW {
-				calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
-				break
+			if next < s.minLimitW {
+				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
+				s.converged = true
+				continue
 			}
-			if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
-				calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error())
-				logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err))
-				break
+			if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
+				s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
+				logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
+				s.converged = true
+				continue
 			}
-			appliedLimitW = nextLimitW
-			calib.AppliedPowerLimitW = float64(appliedLimitW)
-			calib.Derated = appliedLimitW < originalLimitW
-			info.PowerLimitW = float64(appliedLimitW)
-			infoByIndex[idx] = info
-			calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi))
-			logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi))
+			s.appliedLimitW = next
+			s.calib.AppliedPowerLimitW = float64(next)
+			s.calib.Derated = next < s.originalLimitW
+			s.info.PowerLimitW = float64(next)
+			infoByIndex[s.idx] = s.info
+			s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
+			logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
 		}
+	}
 
-		if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
-			results[idx] = calib
+	for _, s := range states {
+		if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
+			results[s.idx] = s.calib
 		}
 	}
 	return results, restore