diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 2fabde9..8c33be0 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -2498,8 +2498,25 @@ func runBenchmarkPowerCalibration( err error } + + // gpuCalibState holds per-GPU binary search state during parallel calibration. + type gpuCalibState struct { + idx int + info benchmarkGPUInfo + originalLimitW int + appliedLimitW int + minLimitW int + lo int // highest verified-stable limit (assumed: minLimitW) + hi int // lowest verified-unstable limit (exclusive sentinel above start) + calib benchmarkPowerCalibrationResult + converged bool + } + results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices)) var restore []benchmarkRestoreAction + + // Initialise per-GPU state. + states := make([]*gpuCalibState, 0, len(gpuIndices)) for _, idx := range gpuIndices { info := infoByIndex[idx] originalLimitW := int(math.Round(info.PowerLimitW)) @@ -2528,17 +2545,17 @@ func runBenchmarkPowerCalibration( if minLimitW < calibSearchTolerance { minLimitW = calibSearchTolerance } - - calib := benchmarkPowerCalibrationResult{ - AppliedPowerLimitW: float64(appliedLimitW), + s := &gpuCalibState{ + idx: idx, + info: info, + originalLimitW: originalLimitW, + appliedLimitW: appliedLimitW, + minLimitW: minLimitW, + lo: minLimitW, + hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable + calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)}, } - // Binary search bounds for finding the highest stable power limit. - // lo = highest verified-stable level (assumed: minLimitW). - // hi = lowest verified-unstable level (assumed: above the starting limit). - lo := minLimitW - hi := appliedLimitW + 1 // exclusive: not yet tested, so not yet confirmed unstable - busyRetries := 0 - busyDelaySec := 1 // exponential back-off seed; doubles each retry up to dcgmResourceBusyMaxDelaySec + states = append(states, s) if canDerate && originalLimitW > 0 { idxCopy := idx orig := originalLimitW @@ -2549,200 +2566,243 @@ func runBenchmarkPowerCalibration( }, }) } + } - calibLoop: + // Shared DCGM resource-busy back-off state (single diagnostic session). + busyRetries := 0 + busyDelaySec := 1 + sharedAttempt := 0 + + type sharedAttemptResult struct { + out []byte + rows []GPUMetricRow + err error + } + +calibDone: + for { + // Collect non-converged GPUs. + var active []*gpuCalibState + for _, s := range states { + if !s.converged { + active = append(active, s) + } + } + if len(active) == 0 || ctx.Err() != nil { + break + } + + sharedAttempt++ + for _, s := range active { + s.calib.Attempts++ + logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec)) + } + + // Snapshot throttle counters for all active GPUs before the run. + beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active)) + for _, s := range active { + beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx) + } + + // Run targeted_power for ALL gpuIndices simultaneously so every card + // is under load during calibration — this reflects real server thermals. + logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt) + cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices) + attemptCtx, cancelAttempt := context.WithCancel(ctx) + doneCh := make(chan sharedAttemptResult, 1) + go func() { + out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc) + doneCh <- sharedAttemptResult{out: out, rows: rows, err: err} + }() + + ticker := time.NewTicker(time.Second) + throttleReasons := make(map[int]string, len(active)) + var ar sharedAttemptResult + + attemptLoop: for { - calib.Attempts++ - logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec)) - - beforeThrottle, _ := queryThrottleCounters(idx) - attemptCtx, cancel := context.WithCancel(ctx) - doneCh := make(chan calibrationAttemptResult, 1) - logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts) - cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx}) - go func() { - out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc) - doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err} - }() - - ticker := time.NewTicker(time.Second) - var ( - attempt calibrationAttemptResult - throttleReason string - ) - attemptLoop: - for { - select { - case attempt = <-doneCh: - break attemptLoop - case <-ticker.C: - afterThrottle, err := queryThrottleCounters(idx) + select { + case ar = <-doneCh: + break attemptLoop + case <-ticker.C: + // Poll throttle counters for each active GPU independently. + for _, s := range active { + if throttleReasons[s.idx] != "" { + continue // already detected for this GPU + } + after, err := queryThrottleCounters(s.idx) if err != nil { continue } - // Record the throttle reason but do NOT cancel the dcgmi - // process. Killing it mid-run leaves nv-hostengine holding - // the diagnostic slot, which causes DCGM_ST_IN_USE on every - // subsequent attempt. Let targeted_power run to its natural - // end so the daemon releases the slot cleanly before we - // reduce power and retry. - if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" && throttleReason == "" { - throttleReason = reason - logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for current run to finish before reducing power limit", idx, reason, appliedLimitW)) + // Record throttle but do NOT cancel — let dcgmi finish so + // nv-hostengine releases the slot cleanly before the next attempt. + if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" { + throttleReasons[s.idx] = reason + logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW)) } - case <-ctx.Done(): - cancel() - attempt = <-doneCh - break attemptLoop } + case <-ctx.Done(): + cancelAttempt() + ar = <-doneCh + break attemptLoop } - ticker.Stop() - cancel() - _ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644) + } + ticker.Stop() + cancelAttempt() + _ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644) - perGPU := filterRowsByGPU(attempt.rows, idx) + // Resource busy: retry with exponential back-off (shared — one DCGM session). + if ar.err != nil && isDCGMResourceBusy(ar.err) { + if busyDelaySec > dcgmResourceBusyMaxDelaySec { + for _, s := range active { + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries)) + s.converged = true + } + logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries)) + break calibDone + } + busyRetries++ + // Undo attempt counter: busy retries don't count as real attempts. + for _, s := range active { + s.calib.Attempts-- + } + logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec)) + select { + case <-ctx.Done(): + break calibDone + case <-time.After(time.Duration(busyDelaySec) * time.Second): + } + next := busyDelaySec * 2 + if next > dcgmResourceBusyMaxDelaySec { + next = dcgmResourceBusyMaxDelaySec + 1 + } + busyDelaySec = next + sharedAttempt-- // retry same logical attempt number + continue + } + busyRetries = 0 + busyDelaySec = 1 + + // Per-GPU analysis and binary search update. + for _, s := range active { + perGPU := filterRowsByGPU(ar.rows, s.idx) summary := summarizeBenchmarkTelemetry(perGPU) - if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 { - // Stable at appliedLimitW: record it and binary-search upward. - calib.Summary = summary - calib.Completed = true - calib.AppliedPowerLimitW = float64(appliedLimitW) - logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples)) - lo = appliedLimitW - // If there is still headroom to search, try a higher level. - if canDerate && hi-lo > calibSearchTolerance { - nextLimitW := roundTo5W((lo + hi) / 2) - if nextLimitW > lo && nextLimitW < hi { - if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err == nil { - appliedLimitW = nextLimitW - calib.AppliedPowerLimitW = float64(appliedLimitW) - calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", lo, nextLimitW, lo, hi)) - logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", idx, lo, nextLimitW)) - continue calibLoop + throttle := throttleReasons[s.idx] + + // Cooling warning: thermal throttle with fans not at maximum. + if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" { + clocks := make([]float64, 0, len(perGPU)) + var fanDutyValues []float64 + fanDutyAvail := false + for _, r := range perGPU { + if r.ClockMHz > 0 { + clocks = append(clocks, r.ClockMHz) + } + if r.FanDutyCycleAvailable { + fanDutyAvail = true + fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct) + } + } + dropPct := benchmarkClockDrift(clocks) + p95FanDuty := benchmarkPercentile(fanDutyValues, 95) + if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 { + s.calib.CoolingWarning = fmt.Sprintf( + "thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load", + throttle, dropPct, p95FanDuty, + ) + logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning)) + } + } + + if throttle == "" && ar.err == nil && summary.P95PowerW > 0 { + // Stable at current limit — update lo and binary-search upward. + s.calib.Summary = summary + s.calib.Completed = true + s.calib.AppliedPowerLimitW = float64(s.appliedLimitW) + logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples)) + s.lo = s.appliedLimitW + if canDerate && s.hi-s.lo > calibSearchTolerance { + next := roundTo5W((s.lo + s.hi) / 2) + if next > s.lo && next < s.hi { + if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil { + s.appliedLimitW = next + s.calib.AppliedPowerLimitW = float64(next) + s.calib.Completed = false // keep searching + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi)) + logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next)) + continue // next GPU in active list } } } - break + s.converged = true + continue } - // If DCGM reports the resource is in use, nv-hostengine has not yet - // released the diagnostic slot from the previous attempt. Do not - // derate: wait with exponential back-off and retry at the same - // power limit. Once the back-off delay would exceed - // dcgmResourceBusyMaxDelaySec, fail — the slot is persistently - // held by something else. - if attempt.err != nil && isDCGMResourceBusy(attempt.err) { - if busyDelaySec > dcgmResourceBusyMaxDelaySec { - calib.Notes = append(calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries)) - logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource persistently busy after %d retries, stopping", idx, busyRetries)) - break - } - busyRetries++ - logFunc(fmt.Sprintf("power calibration: GPU %d DCGM resource busy (attempt %d), retrying in %ds", idx, calib.Attempts, busyDelaySec)) - select { - case <-ctx.Done(): - break calibLoop - case <-time.After(time.Duration(busyDelaySec) * time.Second): - } - next := busyDelaySec * 2 - if next > dcgmResourceBusyMaxDelaySec { - next = dcgmResourceBusyMaxDelaySec + 1 // sentinel: next busy → fail - } - busyDelaySec = next - continue calibLoop - } - busyRetries = 0 // reset on any non-busy outcome - busyDelaySec = 1 // reset back-off - + // Failed or throttled — log and binary-search downward. switch { - case throttleReason != "": - calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW)) - logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW)) - // Check whether the thermal throttle coincided with fans below - // maximum: that combination suggests cooling misconfiguration - // rather than a fundamental power-delivery limit. - if strings.Contains(throttleReason, "thermal") && calib.CoolingWarning == "" { - clocks := make([]float64, 0, len(perGPU)) - var fanDutyValues []float64 - fanDutyAvail := false - for _, r := range perGPU { - if r.ClockMHz > 0 { - clocks = append(clocks, r.ClockMHz) - } - if r.FanDutyCycleAvailable { - fanDutyAvail = true - fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct) - } - } - dropPct := benchmarkClockDrift(clocks) - p95FanDuty := benchmarkPercentile(fanDutyValues, 95) - if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 { - calib.CoolingWarning = fmt.Sprintf( - "thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load", - throttleReason, dropPct, p95FanDuty, - ) - logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", idx, calib.CoolingWarning)) - } - } - case attempt.err != nil: - calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err)) - logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err)) + case throttle != "": + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW)) + logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW)) + case ar.err != nil: + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err)) + logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err)) default: - calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW)) - logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW)) + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW)) + logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW)) } - if !canDerate || appliedLimitW <= 0 { - break + if !canDerate || s.appliedLimitW <= 0 { + s.converged = true + continue } - // Binary-search for the highest stable power limit. - // This attempt failed or throttled, so update the upper bound. - hi = appliedLimitW + s.hi = s.appliedLimitW - if hi-lo <= calibSearchTolerance { - // Search range exhausted: lo is the highest verified-stable level. - if lo > minLimitW { - calib.Notes = append(calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", lo, lo, hi)) - if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, lo); err == nil { - appliedLimitW = lo - calib.AppliedPowerLimitW = float64(lo) - calib.Derated = lo < originalLimitW + if s.hi-s.lo <= calibSearchTolerance { + if s.lo > s.minLimitW { + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi)) + if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil { + s.appliedLimitW = s.lo + s.calib.AppliedPowerLimitW = float64(s.lo) + s.calib.Derated = s.lo < s.originalLimitW } } else { - calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) } - break + s.converged = true + continue } - // Binary midpoint within the remaining search range. - nextLimitW := roundTo5W((lo + hi) / 2) - // Ensure the candidate is strictly inside the search range. - if nextLimitW <= lo { - nextLimitW = lo + calibSearchTolerance + next := roundTo5W((s.lo + s.hi) / 2) + if next <= s.lo { + next = s.lo + calibSearchTolerance } - if nextLimitW >= hi { - nextLimitW = (lo + hi) / 2 + if next >= s.hi { + next = (s.lo + s.hi) / 2 } - if nextLimitW < minLimitW { - calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) - break + if next < s.minLimitW { + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) + s.converged = true + continue } - if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil { - calib.Notes = append(calib.Notes, "failed to set power limit: "+err.Error()) - logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", idx, nextLimitW, err)) - break + if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil { + s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error()) + logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err)) + s.converged = true + continue } - appliedLimitW = nextLimitW - calib.AppliedPowerLimitW = float64(appliedLimitW) - calib.Derated = appliedLimitW < originalLimitW - info.PowerLimitW = float64(appliedLimitW) - infoByIndex[idx] = info - calib.Notes = append(calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", nextLimitW, lo, hi)) - logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", idx, nextLimitW, lo, hi)) + s.appliedLimitW = next + s.calib.AppliedPowerLimitW = float64(next) + s.calib.Derated = next < s.originalLimitW + s.info.PowerLimitW = float64(next) + infoByIndex[s.idx] = s.info + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi)) + logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi)) } + } - if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 { - results[idx] = calib + for _, s := range states { + if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 { + results[s.idx] = s.calib } } return results, restore