audit: switch power benchmark load to dcgmproftester
This commit is contained in:
@@ -240,6 +240,47 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
||||
return nil
|
||||
}
|
||||
|
||||
func benchmarkPowerEngine() string {
|
||||
switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
|
||||
case BenchmarkPowerEngineTargetedPower:
|
||||
return BenchmarkPowerEngineTargetedPower
|
||||
default:
|
||||
return BenchmarkPowerEngineDCGMProfTester
|
||||
}
|
||||
}
|
||||
|
||||
func benchmarkPowerEngineLabel(engine string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(engine)) {
|
||||
case BenchmarkPowerEngineTargetedPower:
|
||||
return "dcgmi diag targeted_power"
|
||||
default:
|
||||
return "dcgmproftester"
|
||||
}
|
||||
}
|
||||
|
||||
func resolveBenchmarkPowerLoadCommand(durationSec int, gpuIndices []int) ([]string, []string, error) {
|
||||
engine := benchmarkPowerEngine()
|
||||
durationSec = normalizeNvidiaBurnDuration(durationSec)
|
||||
switch engine {
|
||||
case BenchmarkPowerEngineTargetedPower:
|
||||
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), nil, nil
|
||||
default:
|
||||
if len(gpuIndices) > 1 {
|
||||
return []string{
|
||||
"bee-dcgmproftester-staggered",
|
||||
"--seconds", strconv.Itoa(durationSec),
|
||||
"--stagger-seconds", "0",
|
||||
"--devices", joinIndexList(gpuIndices),
|
||||
}, nil, nil
|
||||
}
|
||||
cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(durationSec))
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
return cmd, nvidiaVisibleDevicesEnv(gpuIndices), nil
|
||||
}
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
@@ -384,10 +425,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
|
||||
// Sample server idle power once (first GPU only — server state is global).
|
||||
if !serverIdleOK {
|
||||
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
|
||||
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
|
||||
serverIdleW = w
|
||||
serverIdleOK = true
|
||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -430,7 +471,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
|
||||
}
|
||||
logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
|
||||
serverPowerStopCh := make(chan struct{})
|
||||
serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
|
||||
close(serverPowerStopCh)
|
||||
if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
|
||||
serverLoadedWSum += benchmarkMean(serverPowerSamples)
|
||||
serverLoadedSamples++
|
||||
serverLoadedOK = true
|
||||
logFunc(fmt.Sprintf("GPU %d: server loaded power (%s avg): %.0f W", idx, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
|
||||
}
|
||||
for _, phaseSpec := range planPhases {
|
||||
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
|
||||
appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
|
||||
@@ -461,48 +511,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
|
||||
beforeThrottle, _ := queryThrottleCounters(idx)
|
||||
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
|
||||
|
||||
// Sample server power via IPMI in parallel with the steady phase.
|
||||
// We collect readings every 5s and average them.
|
||||
ipmiStopCh := make(chan struct{})
|
||||
ipmiResultCh := make(chan float64, 1)
|
||||
go func() {
|
||||
defer close(ipmiResultCh)
|
||||
var samples []float64
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
defer ticker.Stop()
|
||||
// First sample after a short warmup delay.
|
||||
select {
|
||||
case <-ipmiStopCh:
|
||||
return
|
||||
case <-time.After(15 * time.Second):
|
||||
}
|
||||
for {
|
||||
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||
samples = append(samples, w)
|
||||
}
|
||||
select {
|
||||
case <-ipmiStopCh:
|
||||
if len(samples) > 0 {
|
||||
var sum float64
|
||||
for _, w := range samples {
|
||||
sum += w
|
||||
}
|
||||
ipmiResultCh <- sum / float64(len(samples))
|
||||
}
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
close(ipmiStopCh)
|
||||
if loadedW, ok := <-ipmiResultCh; ok {
|
||||
serverLoadedWSum += loadedW
|
||||
serverLoadedSamples++
|
||||
serverLoadedOK = true
|
||||
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
|
||||
}
|
||||
afterThrottle, _ := queryThrottleCounters(idx)
|
||||
if planErr != nil {
|
||||
gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
|
||||
@@ -652,7 +660,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
if serverLoadedSamples > 0 {
|
||||
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
|
||||
}
|
||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
|
||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
|
||||
result.Cooling = summarizeBenchmarkCooling(metricRows)
|
||||
|
||||
// Apply server-power penalty when IPMI reports the server delta is much
|
||||
@@ -707,6 +715,7 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
|
||||
if opts.SizeMB < 0 {
|
||||
opts.SizeMB = 0
|
||||
}
|
||||
opts.ServerPowerSource = normalizeBenchmarkPowerSource(opts.ServerPowerSource)
|
||||
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||||
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||||
return opts
|
||||
@@ -2535,10 +2544,14 @@ loop:
|
||||
}
|
||||
|
||||
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
|
||||
// IPMI samples plus the GPU-reported average power during steady state.
|
||||
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
|
||||
sp := &BenchmarkServerPower{Available: ipmiAvailable}
|
||||
if !ipmiAvailable {
|
||||
// samples plus the GPU-reported average power during steady state.
|
||||
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, source string, available bool) *BenchmarkServerPower {
|
||||
sp := &BenchmarkServerPower{
|
||||
Available: available,
|
||||
Source: normalizeBenchmarkPowerSource(source),
|
||||
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
||||
}
|
||||
if !available {
|
||||
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
|
||||
return sp
|
||||
}
|
||||
@@ -2671,10 +2684,10 @@ func runNvidiaBenchmarkParallel(
|
||||
|
||||
// Sample server idle power once.
|
||||
if !*serverIdleOK {
|
||||
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
|
||||
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
|
||||
*serverIdleW = w
|
||||
*serverIdleOK = true
|
||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2728,7 +2741,16 @@ func runNvidiaBenchmarkParallel(
|
||||
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
|
||||
}
|
||||
logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
|
||||
serverPowerStopCh := make(chan struct{})
|
||||
serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
|
||||
close(serverPowerStopCh)
|
||||
if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
|
||||
*serverLoadedWSum += benchmarkMean(serverPowerSamples)
|
||||
(*serverLoadedSamples)++
|
||||
*serverLoadedOK = true
|
||||
logFunc(fmt.Sprintf("GPUs %s: server loaded power (%s avg): %.0f W", allDevices, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
|
||||
}
|
||||
for _, phaseSpec := range planPhases {
|
||||
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
|
||||
appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
|
||||
@@ -2770,46 +2792,6 @@ func runNvidiaBenchmarkParallel(
|
||||
}
|
||||
|
||||
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
|
||||
|
||||
// Sample server power via IPMI in parallel with steady phase.
|
||||
ipmiStopCh := make(chan struct{})
|
||||
ipmiResultCh := make(chan float64, 1)
|
||||
go func() {
|
||||
defer close(ipmiResultCh)
|
||||
var samples []float64
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
defer ticker.Stop()
|
||||
select {
|
||||
case <-ipmiStopCh:
|
||||
return
|
||||
case <-time.After(15 * time.Second):
|
||||
}
|
||||
for {
|
||||
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||
samples = append(samples, w)
|
||||
}
|
||||
select {
|
||||
case <-ipmiStopCh:
|
||||
if len(samples) > 0 {
|
||||
var sum float64
|
||||
for _, w := range samples {
|
||||
sum += w
|
||||
}
|
||||
ipmiResultCh <- sum / float64(len(samples))
|
||||
}
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
close(ipmiStopCh)
|
||||
if loadedW, ok := <-ipmiResultCh; ok {
|
||||
*serverLoadedWSum += loadedW
|
||||
(*serverLoadedSamples)++
|
||||
*serverLoadedOK = true
|
||||
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
|
||||
}
|
||||
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
|
||||
for _, idx := range selected {
|
||||
afterThrottle[idx], _ = queryThrottleCounters(idx)
|
||||
@@ -3040,8 +3022,8 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
|
||||
return cl
|
||||
}
|
||||
|
||||
// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
|
||||
// actively watches throttle counters. seedLimits, when provided, are treated as
|
||||
// runBenchmarkPowerCalibration runs the configured power-fit load for the supplied
|
||||
// GPU set and actively watches throttle counters. seedLimits, when provided, are treated as
|
||||
// the starting point for this calibration pass rather than as immutable fixed
|
||||
// limits. This matters during cumulative ramp-up: once an additional GPU is
|
||||
// introduced, every already-active GPU must be revalidated under the new
|
||||
@@ -3070,10 +3052,19 @@ func runBenchmarkPowerCalibration(
|
||||
// doubling each retry until it would exceed the cap, at which point the
|
||||
// next busy response fails the calibration immediately.
|
||||
const dcgmResourceBusyMaxDelaySec = 300
|
||||
engine := benchmarkPowerEngine()
|
||||
engineLabel := benchmarkPowerEngineLabel(engine)
|
||||
|
||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
||||
if engine == BenchmarkPowerEngineTargetedPower {
|
||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
||||
}
|
||||
} else {
|
||||
if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
|
||||
logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
|
||||
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
||||
}
|
||||
}
|
||||
if killed := KillTestWorkers(); len(killed) > 0 {
|
||||
for _, p := range killed {
|
||||
@@ -3206,7 +3197,7 @@ calibDone:
|
||||
sharedAttempt++
|
||||
for _, s := range active {
|
||||
s.calib.Attempts++
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d %s attempt %d at %d W for %ds", s.idx, engineLabel, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
|
||||
}
|
||||
|
||||
// Snapshot throttle counters for all active GPUs before the run.
|
||||
@@ -3215,14 +3206,22 @@ calibDone:
|
||||
beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
|
||||
}
|
||||
|
||||
// Run targeted_power for ALL gpuIndices simultaneously so every card
|
||||
// Run the selected power-fit load for ALL gpuIndices simultaneously so every card
|
||||
// is under load during calibration — this reflects real server thermals.
|
||||
logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
|
||||
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
|
||||
cmd, env, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices)
|
||||
if err != nil {
|
||||
for _, s := range active {
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("failed to resolve %s command: %v", engineLabel, err))
|
||||
s.converged = true
|
||||
}
|
||||
logFunc(fmt.Sprintf("power calibration: failed to resolve %s command: %v", engineLabel, err))
|
||||
break calibDone
|
||||
}
|
||||
attemptCtx, cancelAttempt := context.WithCancel(ctx)
|
||||
doneCh := make(chan sharedAttemptResult, 1)
|
||||
go func() {
|
||||
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
|
||||
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
|
||||
doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
|
||||
}()
|
||||
|
||||
@@ -3245,8 +3244,8 @@ calibDone:
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// Record throttle but do NOT cancel — let dcgmi finish so
|
||||
// nv-hostengine releases the slot cleanly before the next attempt.
|
||||
// Record throttle but do NOT cancel — let the load command finish so
|
||||
// runtime resources release cleanly before the next attempt.
|
||||
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
|
||||
throttleReasons[s.idx] = reason
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
|
||||
@@ -3359,9 +3358,9 @@ calibDone:
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
|
||||
case ar.err != nil:
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d %s failed at %d W: %v", s.idx, engineLabel, s.appliedLimitW, ar.err))
|
||||
default:
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("%s attempt %d at %d W: no valid power telemetry", engineLabel, s.calib.Attempts, s.appliedLimitW))
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
|
||||
}
|
||||
|
||||
@@ -3384,7 +3383,7 @@ calibDone:
|
||||
s.calib.Completed = true
|
||||
}
|
||||
} else {
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
||||
}
|
||||
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||
s.converged = true
|
||||
@@ -3399,7 +3398,7 @@ calibDone:
|
||||
next = (s.lo + s.hi) / 2
|
||||
}
|
||||
if next < s.minLimitW {
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
||||
s.converged = true
|
||||
continue
|
||||
}
|
||||
@@ -4117,13 +4116,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
}
|
||||
durationSec := powerBenchDurationSec(opts.Profile)
|
||||
|
||||
// Sample IPMI idle power before any GPU load.
|
||||
// Sample server idle power before any GPU load.
|
||||
var serverIdleW float64
|
||||
var serverIdleOK bool
|
||||
if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
|
||||
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
|
||||
serverIdleW = w
|
||||
serverIdleOK = true
|
||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
||||
}
|
||||
sdrIdle := sampleIPMISDRPowerSensors()
|
||||
psuBefore := psuStatusSnapshot()
|
||||
@@ -4141,26 +4140,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
_ = os.MkdirAll(singleDir, 0755)
|
||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||
ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
|
||||
ipmiSingleDone := make(chan float64, 1)
|
||||
go func() {
|
||||
defer close(ipmiSingleDone)
|
||||
if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
|
||||
ipmiSingleDone <- w
|
||||
}
|
||||
}()
|
||||
singlePowerStopCh := make(chan struct{})
|
||||
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
||||
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||
ipmiSingleCancel()
|
||||
close(singlePowerStopCh)
|
||||
sdrSingle := sampleIPMISDRPowerSensors()
|
||||
if sdrSingle.PSUInW > 0 {
|
||||
if samples := <-singlePowerCh; len(samples) > 0 {
|
||||
singleIPMILoadedW[idx] = benchmarkMean(samples)
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
|
||||
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
|
||||
singleIPMILoadedW[idx] = sdrSingle.PSUInW
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (SDR PSU AC input)", idx, sdrSingle.PSUInW))
|
||||
} else if w, ok := <-ipmiSingleDone; ok {
|
||||
singleIPMILoadedW[idx] = w
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (DCMI)", idx, w))
|
||||
} else {
|
||||
<-ipmiSingleDone // drain channel
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
|
||||
}
|
||||
allRestoreActions = append(allRestoreActions, restore...)
|
||||
if r, ok := c[idx]; ok {
|
||||
@@ -4234,11 +4225,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
|
||||
}
|
||||
if len(result.RecommendedSlotOrder) > 0 {
|
||||
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
|
||||
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card %s: %s.", benchmarkPowerEngineLabel(benchmarkPowerEngine()), joinIndexList(result.RecommendedSlotOrder)))
|
||||
}
|
||||
for _, gpu := range gpus {
|
||||
if gpu.Derated {
|
||||
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
|
||||
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete %s.", gpu.Index, gpu.AppliedPowerLimitW, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||
}
|
||||
if gpu.CoolingWarning != "" {
|
||||
result.Findings = append(result.Findings, fmt.Sprintf(
|
||||
@@ -4255,7 +4246,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
// Phase 2: cumulative thermal ramp.
|
||||
// Each step introduces one new GPU into an environment where all previously
|
||||
// calibrated GPUs are already running at their fixed stable limits. The new
|
||||
// GPU's stable TDP is searched via binary search (targeted_power) under real
|
||||
// GPU's stable TDP is searched via binary search under real
|
||||
// multi-GPU thermal load. Once found, its limit is fixed permanently for all
|
||||
// subsequent steps. This ensures each GPU's limit reflects actual sustained
|
||||
// power in the final full-system thermal state.
|
||||
@@ -4294,7 +4285,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
}
|
||||
if !firstCalib.Completed {
|
||||
ramp.Status = "FAILED"
|
||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
|
||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||
result.OverallStatus = "PARTIAL"
|
||||
} else if firstCalib.Derated {
|
||||
ramp.Status = "PARTIAL"
|
||||
@@ -4340,21 +4331,15 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
||||
|
||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
|
||||
ipmiStepDone := make(chan float64, 1)
|
||||
go func() {
|
||||
defer close(ipmiStepDone)
|
||||
if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
|
||||
ipmiStepDone <- w
|
||||
}
|
||||
}()
|
||||
stepPowerStopCh := make(chan struct{})
|
||||
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
||||
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||
ipmiStepCancel()
|
||||
close(stepPowerStopCh)
|
||||
var stepIPMILoadedW float64
|
||||
var stepIPMIOK bool
|
||||
if w, ok := <-ipmiStepDone; ok {
|
||||
stepIPMILoadedW = w
|
||||
if samples := <-stepPowerCh; len(samples) > 0 {
|
||||
stepIPMILoadedW = benchmarkMean(samples)
|
||||
stepIPMIOK = true
|
||||
}
|
||||
// Accumulate restore actions; they all run in the outer defer.
|
||||
@@ -4391,7 +4376,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
}
|
||||
ramp.Status = "FAILED"
|
||||
ramp.Notes = append(ramp.Notes,
|
||||
fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback))
|
||||
fmt.Sprintf("GPU %d did not complete %s in ramp step %d; keeping previous stable limit %d W", idx, benchmarkPowerEngineLabel(benchmarkPowerEngine()), step, fallback))
|
||||
result.OverallStatus = "PARTIAL"
|
||||
continue
|
||||
}
|
||||
@@ -4427,24 +4412,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
||||
}
|
||||
|
||||
if sdrStep.PSUInW > 0 {
|
||||
// SDR PSU sum is available — use it for server power (includes all PSUs).
|
||||
ramp.ServerLoadedW = sdrStep.PSUInW
|
||||
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
|
||||
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (SDR PSU AC input)", step, sdrStep.PSUInW))
|
||||
if step == len(result.RecommendedSlotOrder) {
|
||||
serverLoadedW = sdrStep.PSUInW
|
||||
serverLoadedOK = true
|
||||
sdrLastStep = sdrStep
|
||||
}
|
||||
} else if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
|
||||
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
|
||||
ramp.ServerLoadedW = stepIPMILoadedW
|
||||
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
|
||||
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (DCMI)", step, stepIPMILoadedW))
|
||||
logFunc(fmt.Sprintf("power ramp: step %d server loaded power (%s avg): %.0f W", step, opts.ServerPowerSource, stepIPMILoadedW))
|
||||
// The last step has all GPUs loaded — use it as the top-level loaded_w.
|
||||
if step == len(result.RecommendedSlotOrder) {
|
||||
serverLoadedW = stepIPMILoadedW
|
||||
serverLoadedOK = true
|
||||
sdrLastStep = sdrStep
|
||||
}
|
||||
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
|
||||
ramp.ServerLoadedW = sdrStep.PSUInW
|
||||
ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
|
||||
logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
|
||||
if step == len(result.RecommendedSlotOrder) {
|
||||
serverLoadedW = sdrStep.PSUInW
|
||||
serverLoadedOK = true
|
||||
sdrLastStep = sdrStep
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4502,7 +4487,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
gpuActualSumW = result.PlatformMaxTDPW
|
||||
}
|
||||
_ = serverIdleOK // used implicitly via characterizeServerPower
|
||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, serverIdleOK && serverLoadedOK)
|
||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
|
||||
// Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
|
||||
// Per-slot readings enable correlation with audit HardwarePowerSupply entries.
|
||||
if result.ServerPower != nil {
|
||||
|
||||
Reference in New Issue
Block a user