diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 264168e..1b4e4d2 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -240,6 +240,47 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po return nil } +func benchmarkPowerEngine() string { + switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) { + case BenchmarkPowerEngineTargetedPower: + return BenchmarkPowerEngineTargetedPower + default: + return BenchmarkPowerEngineDCGMProfTester + } +} + +func benchmarkPowerEngineLabel(engine string) string { + switch strings.TrimSpace(strings.ToLower(engine)) { + case BenchmarkPowerEngineTargetedPower: + return "dcgmi diag targeted_power" + default: + return "dcgmproftester" + } +} + +func resolveBenchmarkPowerLoadCommand(durationSec int, gpuIndices []int) ([]string, []string, error) { + engine := benchmarkPowerEngine() + durationSec = normalizeNvidiaBurnDuration(durationSec) + switch engine { + case BenchmarkPowerEngineTargetedPower: + return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), nil, nil + default: + if len(gpuIndices) > 1 { + return []string{ + "bee-dcgmproftester-staggered", + "--seconds", strconv.Itoa(durationSec), + "--stagger-seconds", "0", + "--devices", joinIndexList(gpuIndices), + }, nil, nil + } + cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(durationSec)) + if err != nil { + return nil, nil, err + } + return cmd, nvidiaVisibleDevicesEnv(gpuIndices), nil + } +} + func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { if ctx == nil { ctx = context.Background() @@ -384,10 +425,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv // Sample server idle power once (first GPU only — server state is global). 
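Review note on the engine plumbing above: the `BEE_BENCH_POWER_ENGINE` switch is the single point deciding between the two load engines, so its normalization behavior is worth pinning down. A standalone sketch, where `pickEngine` is an illustrative stand-in for `benchmarkPowerEngine` with the engine constants inlined:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// pickEngine mirrors benchmarkPowerEngine: the BEE_BENCH_POWER_ENGINE value
// is trimmed and lower-cased, and anything other than "targeted_power"
// falls back to the dcgmproftester engine.
func pickEngine() string {
	switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
	case "targeted_power":
		return "targeted_power"
	default:
		return "dcgmproftester"
	}
}

func main() {
	os.Setenv("BEE_BENCH_POWER_ENGINE", "  Targeted_Power ")
	fmt.Println(pickEngine()) // targeted_power

	os.Setenv("BEE_BENCH_POWER_ENGINE", "gpu-burn")
	fmt.Println(pickEngine()) // dcgmproftester
}
```

Any unrecognized or empty value silently selects `dcgmproftester`, matching the patch's default branch.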
if !serverIdleOK { - if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok { + if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok { serverIdleW = w serverIdleOK = true - logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) + logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w)) } } @@ -430,7 +471,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), } logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec)) + serverPowerStopCh := make(chan struct{}) + serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval) _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc) + close(serverPowerStopCh) + if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 { + serverLoadedWSum += benchmarkMean(serverPowerSamples) + serverLoadedSamples++ + serverLoadedOK = true + logFunc(fmt.Sprintf("GPU %d: server loaded power (%s avg): %.0f W", idx, opts.ServerPowerSource, benchmarkMean(serverPowerSamples))) + } for _, phaseSpec := range planPhases { if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec)) @@ -461,48 +511,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv beforeThrottle, _ := queryThrottleCounters(idx) logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec)) - - // Sample server power via IPMI in parallel with the steady phase. - // We collect readings every 5s and average them. - ipmiStopCh := make(chan struct{}) - ipmiResultCh := make(chan float64, 1) - go func() { - defer close(ipmiResultCh) - var samples []float64 - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - // First sample after a short warmup delay. 
- select { - case <-ipmiStopCh: - return - case <-time.After(15 * time.Second): - } - for { - if w, err := queryIPMIServerPowerW(); err == nil { - samples = append(samples, w) - } - select { - case <-ipmiStopCh: - if len(samples) > 0 { - var sum float64 - for _, w := range samples { - sum += w - } - ipmiResultCh <- sum / float64(len(samples)) - } - return - case <-ticker.C: - } - } - }() - - close(ipmiStopCh) - if loadedW, ok := <-ipmiResultCh; ok { - serverLoadedWSum += loadedW - serverLoadedSamples++ - serverLoadedOK = true - logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW)) - } afterThrottle, _ := queryThrottleCounters(idx) if planErr != nil { gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error()) @@ -652,7 +660,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv if serverLoadedSamples > 0 { serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples) } - result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK) + result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK) result.Cooling = summarizeBenchmarkCooling(metricRows) // Apply server-power penalty when IPMI reports the server delta is much @@ -707,6 +715,7 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv if opts.SizeMB < 0 { opts.SizeMB = 0 } + opts.ServerPowerSource = normalizeBenchmarkPowerSource(opts.ServerPowerSource) opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices) opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices) return opts @@ -2535,10 +2544,14 @@ loop: } // characterizeServerPower computes BenchmarkServerPower from idle and loaded -// IPMI samples plus the GPU-reported average power during steady state. -func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower { - sp := &BenchmarkServerPower{Available: ipmiAvailable} - if !ipmiAvailable { +// samples plus the GPU-reported average power during steady state. +func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, source string, available bool) *BenchmarkServerPower { + sp := &BenchmarkServerPower{ + Available: available, + Source: normalizeBenchmarkPowerSource(source), + SampleIntervalSec: benchmarkPowerAutotuneSampleInterval, + } + if !available { sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped") return sp } @@ -2671,10 +2684,10 @@ func runNvidiaBenchmarkParallel( // Sample server idle power once. 
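Review note: the inline IPMI goroutine deleted in both code paths (sequential above, parallel below) is superseded by the reusable stop-channel sampler, `startSelectedPowerSourceSampler`, introduced in benchmark_power_autotune.go. A minimal usage sketch of that pattern, with `fakeRead` standing in for the real `queryBenchmarkPowerSourceW`:

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// startSampler mirrors the stop-channel pattern: sample on a ticker until
// stopCh closes, then deliver the collected series exactly once on the
// returned channel.
func startSampler(stopCh <-chan struct{}, interval time.Duration, read func() (float64, error)) <-chan []float64 {
	out := make(chan []float64, 1)
	go func() {
		defer close(out)
		var samples []float64
		record := func() {
			if w, err := read(); err == nil && w > 0 {
				samples = append(samples, w)
			}
		}
		record() // first reading immediately, as in the patch
		t := time.NewTicker(interval)
		defer t.Stop()
		for {
			select {
			case <-stopCh:
				out <- samples
				return
			case <-t.C:
				record()
			}
		}
	}()
	return out
}

func main() {
	fakeRead := func() (float64, error) { return 900 + rand.Float64()*50, nil } // fake watts
	stop := make(chan struct{})
	ch := startSampler(stop, 100*time.Millisecond, fakeRead)

	time.Sleep(500 * time.Millisecond) // the benchmark workload would run here
	close(stop)

	samples := <-ch
	var sum float64
	for _, w := range samples {
		sum += w
	}
	fmt.Printf("server loaded power avg over %d samples: %.0f W\n", len(samples), sum/float64(len(samples)))
}
```

Unlike the removed goroutine, the new sampler takes its first reading immediately rather than after a 15-second warmup delay.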
if !*serverIdleOK { - if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok { + if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok { *serverIdleW = w *serverIdleOK = true - logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) + logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w)) } } @@ -2728,7 +2741,16 @@ func runNvidiaBenchmarkParallel( "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), } logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec)) + serverPowerStopCh := make(chan struct{}) + serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval) _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc) + close(serverPowerStopCh) + if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 { + *serverLoadedWSum += benchmarkMean(serverPowerSamples) + (*serverLoadedSamples)++ + *serverLoadedOK = true + logFunc(fmt.Sprintf("GPUs %s: server loaded power (%s avg): %.0f W", allDevices, opts.ServerPowerSource, benchmarkMean(serverPowerSamples))) + } for _, phaseSpec := range planPhases { if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec)) @@ -2770,46 +2792,6 @@ func runNvidiaBenchmarkParallel( } logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec)) - - // Sample server power via IPMI in parallel with steady phase. - ipmiStopCh := make(chan struct{}) - ipmiResultCh := make(chan float64, 1) - go func() { - defer close(ipmiResultCh) - var samples []float64 - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - select { - case <-ipmiStopCh: - return - case <-time.After(15 * time.Second): - } - for { - if w, err := queryIPMIServerPowerW(); err == nil { - samples = append(samples, w) - } - select { - case <-ipmiStopCh: - if len(samples) > 0 { - var sum float64 - for _, w := range samples { - sum += w - } - ipmiResultCh <- sum / float64(len(samples)) - } - return - case <-ticker.C: - } - } - }() - - close(ipmiStopCh) - if loadedW, ok := <-ipmiResultCh; ok { - *serverLoadedWSum += loadedW - (*serverLoadedSamples)++ - *serverLoadedOK = true - logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW)) - } afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected)) for _, idx := range selected { afterThrottle[idx], _ = queryThrottleCounters(idx) @@ -3040,8 +3022,8 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad { return cl } -// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and -// actively watches throttle counters. seedLimits, when provided, are treated as +// runBenchmarkPowerCalibration runs the configured power-fit load for the supplied +// GPU set and actively watches throttle counters. seedLimits, when provided, are treated as // the starting point for this calibration pass rather than as immutable fixed // limits. 
This matters during cumulative ramp-up: once an additional GPU is // introduced, every already-active GPU must be revalidated under the new @@ -3070,10 +3052,19 @@ func runBenchmarkPowerCalibration( // doubling each retry until it would exceed the cap, at which point the // next busy response fails the calibration immediately. const dcgmResourceBusyMaxDelaySec = 300 + engine := benchmarkPowerEngine() + engineLabel := benchmarkPowerEngineLabel(engine) - if _, err := exec.LookPath("dcgmi"); err != nil { - logFunc("power calibration: dcgmi not found, skipping (will use default power limit)") - return map[int]benchmarkPowerCalibrationResult{}, nil, nil + if engine == BenchmarkPowerEngineTargetedPower { + if _, err := exec.LookPath("dcgmi"); err != nil { + logFunc("power calibration: dcgmi not found, skipping (will use default power limit)") + return map[int]benchmarkPowerCalibrationResult{}, nil, nil + } + } else { + if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil { + logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)") + return map[int]benchmarkPowerCalibrationResult{}, nil, nil + } } if killed := KillTestWorkers(); len(killed) > 0 { for _, p := range killed { @@ -3206,7 +3197,7 @@ calibDone: sharedAttempt++ for _, s := range active { s.calib.Attempts++ - logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec)) + logFunc(fmt.Sprintf("power calibration: GPU %d %s attempt %d at %d W for %ds", s.idx, engineLabel, s.calib.Attempts, s.appliedLimitW, calibDurationSec)) } // Snapshot throttle counters for all active GPUs before the run. @@ -3215,14 +3206,22 @@ calibDone: beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx) } - // Run targeted_power for ALL gpuIndices simultaneously so every card + // Run the selected power-fit load for ALL gpuIndices simultaneously so every card // is under load during calibration — this reflects real server thermals. logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt) - cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices) + cmd, env, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices) + if err != nil { + for _, s := range active { + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("failed to resolve %s command: %v", engineLabel, err)) + s.converged = true + } + logFunc(fmt.Sprintf("power calibration: failed to resolve %s command: %v", engineLabel, err)) + break calibDone + } attemptCtx, cancelAttempt := context.WithCancel(ctx) doneCh := make(chan sharedAttemptResult, 1) go func() { - out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc) + out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc) doneCh <- sharedAttemptResult{out: out, rows: rows, err: err} }() @@ -3245,8 +3244,8 @@ calibDone: if err != nil { continue } - // Record throttle but do NOT cancel — let dcgmi finish so - // nv-hostengine releases the slot cleanly before the next attempt. + // Record throttle but do NOT cancel — let the load command finish so + // runtime resources release cleanly before the next attempt. 
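Review note on the busy-retry path: the patch keeps the doubling-until-cap schedule around `dcgmResourceBusyMaxDelaySec`. A tiny sketch of that schedule; the 15-second seed below is an assumption for illustration, since only the 300-second cap is visible in this hunk:

```go
package main

import "fmt"

const busyMaxDelaySec = 300 // dcgmResourceBusyMaxDelaySec in the patch

// nextBusyDelay doubles the wait after each "resource busy" response and
// reports failure once the doubled delay would exceed the cap, at which
// point the next busy response fails the calibration immediately.
func nextBusyDelay(curSec int) (int, bool) {
	next := curSec * 2
	if next > busyMaxDelaySec {
		return 0, false
	}
	return next, true
}

func main() {
	delaySec := 15 // assumed seed; not specified in this hunk
	for {
		fmt.Printf("dcgmi busy, retrying in %ds\n", delaySec)
		next, ok := nextBusyDelay(delaySec)
		if !ok {
			fmt.Println("delay cap exceeded; failing calibration")
			return
		}
		delaySec = next
	}
}
```

With a 15-second seed this yields retries at 15, 30, 60, 120, and 240 seconds before the cap trips.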
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" { throttleReasons[s.idx] = reason logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW)) @@ -3359,9 +3358,9 @@ calibDone: logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW)) case ar.err != nil: s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err)) - logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err)) + logFunc(fmt.Sprintf("power calibration: GPU %d %s failed at %d W: %v", s.idx, engineLabel, s.appliedLimitW, ar.err)) default: - s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW)) + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("%s attempt %d at %d W: no valid power telemetry", engineLabel, s.calib.Attempts, s.appliedLimitW)) logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW)) } @@ -3384,7 +3383,7 @@ calibDone: s.calib.Completed = true } } else { - s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW)) } s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx) s.converged = true @@ -3399,7 +3398,7 @@ calibDone: next = (s.lo + s.hi) / 2 } if next < s.minLimitW { - s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) + s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW)) s.converged = true continue } @@ -4117,13 +4116,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N } durationSec := powerBenchDurationSec(opts.Profile) - // Sample IPMI idle power before any GPU load. + // Sample server idle power before any GPU load. 
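Review note before the power-bench hunks: the lo/hi midpoint updates above amount to a per-GPU binary search over power limits, floored at `minLimitW` (the default limit minus `maxDerateW`). A simplified standalone sketch assuming a boolean stability probe; the real loop also folds in throttle reasons, telemetry validity, and seed limits:

```go
package main

import "fmt"

// searchStableLimit sketches the calibration search: stable attempts raise
// lo, throttled attempts lower hi, and lo starts at the derate floor so the
// search never probes below defaultW - maxDerateW.
func searchStableLimit(defaultW, maxDerateW int, stableAt func(limitW int) bool) (int, bool) {
	lo := defaultW - maxDerateW // derate floor
	hi := defaultW
	best, found := 0, false
	for lo <= hi {
		mid := (lo + hi) / 2
		if stableAt(mid) {
			best, found = mid, true
			lo = mid + 1 // stable: probe a higher limit
		} else {
			hi = mid - 1 // throttled: probe a lower limit
		}
	}
	return best, found
}

func main() {
	// Hypothetical card: default limit 480 W, throttle-free only at <= 430 W.
	limit, ok := searchStableLimit(480, 100, func(w int) bool { return w <= 430 })
	fmt.Println(limit, ok) // 430 true
}
```

A `false` result corresponds to the patch's "could not find a stable limit within maxDerateW of the default" note.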
var serverIdleW float64 var serverIdleOK bool - if w, ok := sampleIPMIPowerSeries(ctx, 10); ok { + if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok { serverIdleW = w serverIdleOK = true - logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) + logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w)) } sdrIdle := sampleIPMISDRPowerSensors() psuBefore := psuStatusSnapshot() @@ -4141,26 +4140,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N _ = os.MkdirAll(singleDir, 0755) singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex) logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx)) - ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx) - ipmiSingleDone := make(chan float64, 1) - go func() { - defer close(ipmiSingleDone) - if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok { - ipmiSingleDone <- w - } - }() + singlePowerStopCh := make(chan struct{}) + singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval) c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec) appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0) - ipmiSingleCancel() + close(singlePowerStopCh) sdrSingle := sampleIPMISDRPowerSensors() - if sdrSingle.PSUInW > 0 { + if samples := <-singlePowerCh; len(samples) > 0 { + singleIPMILoadedW[idx] = benchmarkMean(samples) + logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx])) + } else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 { singleIPMILoadedW[idx] = sdrSingle.PSUInW - logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (SDR PSU AC input)", idx, sdrSingle.PSUInW)) - } else if w, ok := <-ipmiSingleDone; ok { - singleIPMILoadedW[idx] = w - logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (DCMI)", idx, w)) - } else { - <-ipmiSingleDone // drain channel + logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW)) } allRestoreActions = append(allRestoreActions, restore...) 
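The single-card loaded-power selection above follows a deliberate preference order; distilled into a helper (names simplified, `sdrSnapshotW` standing in for `sdrSingle.PSUInW`):

```go
package main

import "fmt"

// loadedWatts mirrors the fallback order in the hunk above: prefer the
// averaged series from the selected source; otherwise, if the selected
// source is the SDR PSU input and a one-shot SDR snapshot returned a value,
// use the snapshot. Anything else reports no reading.
func loadedWatts(samples []float64, source string, sdrSnapshotW float64) (float64, string, bool) {
	if len(samples) > 0 {
		var sum float64
		for _, w := range samples {
			sum += w
		}
		return sum / float64(len(samples)), "series avg", true
	}
	if source == "sdr_psu_input" && sdrSnapshotW > 0 {
		return sdrSnapshotW, "SDR snapshot fallback", true
	}
	return 0, "", false
}

func main() {
	fmt.Println(loadedWatts([]float64{910, 930, 920}, "dcmi", 0)) // 920 series avg true
	fmt.Println(loadedWatts(nil, "sdr_psu_input", 905))           // 905 SDR snapshot fallback true
	fmt.Println(loadedWatts(nil, "dcmi", 905))                    // 0 "" false
}
```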
if r, ok := c[idx]; ok { @@ -4234,11 +4225,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index) } if len(result.RecommendedSlotOrder) > 0 { - result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder))) + result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card %s: %s.", benchmarkPowerEngineLabel(benchmarkPowerEngine()), joinIndexList(result.RecommendedSlotOrder))) } for _, gpu := range gpus { if gpu.Derated { - result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW)) + result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete %s.", gpu.Index, gpu.AppliedPowerLimitW, benchmarkPowerEngineLabel(benchmarkPowerEngine()))) } if gpu.CoolingWarning != "" { result.Findings = append(result.Findings, fmt.Sprintf( @@ -4255,7 +4246,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N // Phase 2: cumulative thermal ramp. // Each step introduces one new GPU into an environment where all previously // calibrated GPUs are already running at their fixed stable limits. The new - // GPU's stable TDP is searched via binary search (targeted_power) under real + // GPU's stable TDP is searched via binary search under real // multi-GPU thermal load. Once found, its limit is fixed permanently for all // subsequent steps. This ensures each GPU's limit reflects actual sustained // power in the final full-system thermal state. @@ -4294,7 +4285,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N } if !firstCalib.Completed { ramp.Status = "FAILED" - ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx)) + ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine()))) result.OverallStatus = "PARTIAL" } else if firstCalib.Derated { ramp.Status = "PARTIAL" @@ -4340,21 +4331,15 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx)) stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) - ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx) - ipmiStepDone := make(chan float64, 1) - go func() { - defer close(ipmiStepDone) - if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok { - ipmiStepDone <- w - } - }() + stepPowerStopCh := make(chan struct{}) + stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval) stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec) appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0) - ipmiStepCancel() + close(stepPowerStopCh) var stepIPMILoadedW float64 var stepIPMIOK bool - if w, ok := <-ipmiStepDone; ok { - stepIPMILoadedW = w + if samples := <-stepPowerCh; len(samples) > 0 { + stepIPMILoadedW = benchmarkMean(samples) stepIPMIOK = true } // Accumulate restore actions; they all run in the outer defer. 
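The phase-2 description above reduces to a compact schedule. A sketch under the assumption that `calibrate` (a stand-in for `runBenchmarkPowerCalibration`) returns a converged limit per GPU; it is simplified in one respect: revalidation results for already-fixed GPUs are discarded here, whereas the real code can keep or adjust them:

```go
package main

import "fmt"

// rampSchedule sketches the cumulative thermal ramp: step N loads the first
// N GPUs of the recommended slot order, seeding previously found limits so
// they are revalidated under the new thermal load, then freezes the newly
// introduced GPU's converged limit for all later steps.
func rampSchedule(order []int, calibrate func(subset []int, seed map[int]int) map[int]int) map[int]int {
	fixed := map[int]int{} // GPU index -> stable power limit (W)
	for step := 1; step <= len(order); step++ {
		subset := order[:step]
		results := calibrate(subset, fixed)
		newIdx := order[step-1]
		if w, ok := results[newIdx]; ok {
			fixed[newIdx] = w
		}
	}
	return fixed
}

func main() {
	// Hypothetical calibrator: each added card costs every card 10 W.
	calibrate := func(subset []int, seed map[int]int) map[int]int {
		out := map[int]int{}
		for _, idx := range subset {
			out[idx] = 480 - 10*(len(subset)-1)
		}
		return out
	}
	fmt.Println(rampSchedule([]int{2, 0, 3, 1}, calibrate)) // map[0:470 1:450 2:480 3:460]
}
```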
@@ -4391,7 +4376,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N } ramp.Status = "FAILED" ramp.Notes = append(ramp.Notes, - fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback)) + fmt.Sprintf("GPU %d did not complete %s in ramp step %d; keeping previous stable limit %d W", idx, benchmarkPowerEngineLabel(benchmarkPowerEngine()), step, fallback)) result.OverallStatus = "PARTIAL" continue } @@ -4427,24 +4412,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N ramp.PSUSlotReadings = sdrStep.PSUSlots } - if sdrStep.PSUInW > 0 { - // SDR PSU sum is available — use it for server power (includes all PSUs). - ramp.ServerLoadedW = sdrStep.PSUInW - ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW - logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (SDR PSU AC input)", step, sdrStep.PSUInW)) - if step == len(result.RecommendedSlotOrder) { - serverLoadedW = sdrStep.PSUInW - serverLoadedOK = true - sdrLastStep = sdrStep - } - } else if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 { + if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 { ramp.ServerLoadedW = stepIPMILoadedW ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW - logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (DCMI)", step, stepIPMILoadedW)) + logFunc(fmt.Sprintf("power ramp: step %d server loaded power (%s avg): %.0f W", step, opts.ServerPowerSource, stepIPMILoadedW)) // The last step has all GPUs loaded — use it as the top-level loaded_w. if step == len(result.RecommendedSlotOrder) { serverLoadedW = stepIPMILoadedW serverLoadedOK = true + sdrLastStep = sdrStep + } + } else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 { + ramp.ServerLoadedW = sdrStep.PSUInW + ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW + logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW)) + if step == len(result.RecommendedSlotOrder) { + serverLoadedW = sdrStep.PSUInW + serverLoadedOK = true + sdrLastStep = sdrStep } } @@ -4502,7 +4487,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N gpuActualSumW = result.PlatformMaxTDPW } _ = serverIdleOK // used implicitly via characterizeServerPower - result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, serverIdleOK && serverLoadedOK) + result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK) // Supplement DCMI with SDR multi-source data via collector's PSU slot patterns. // Per-slot readings enable correlation with audit HardwarePowerSupply entries. 
if result.ServerPower != nil { diff --git a/audit/internal/platform/benchmark_power_autotune.go b/audit/internal/platform/benchmark_power_autotune.go new file mode 100644 index 0000000..7ab7776 --- /dev/null +++ b/audit/internal/platform/benchmark_power_autotune.go @@ -0,0 +1,735 @@ +package platform + +import ( + "context" + "encoding/json" + "fmt" + "math" + "os" + "os/exec" + "path/filepath" + "sort" + "strings" + "time" +) + +const ( + benchmarkPowerAutotuneVersion = 1 + benchmarkPowerAutotuneIdleSec = 60 + benchmarkPowerAutotuneLoadSec = 90 + benchmarkPowerAutotuneSampleInterval = 3 + defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json" +) + +func BenchmarkPowerSourceConfigPath(baseDir string) string { + baseDir = strings.TrimSpace(baseDir) + if baseDir == "" { + return defaultBenchmarkPowerSourceConfigPath + } + return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json") +} + +func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) { + raw, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var cfg BenchmarkPowerAutotuneConfig + if err := json.Unmarshal(raw, &cfg); err != nil { + return nil, err + } + if strings.TrimSpace(cfg.SelectedSource) == "" { + return nil, fmt.Errorf("autotune config missing selected_source") + } + return &cfg, nil +} + +func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error { + if strings.TrimSpace(path) == "" { + return fmt.Errorf("empty autotune config path") + } + if cfg.Version <= 0 { + cfg.Version = benchmarkPowerAutotuneVersion + } + if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + return err + } + data, err := json.MarshalIndent(cfg, "", " ") + if err != nil { + return err + } + tmp := path + ".tmp" + if err := os.WriteFile(tmp, data, 0644); err != nil { + return err + } + return os.Rename(tmp, path) +} + +func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) { + return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir)) +} + +func ResetBenchmarkPowerAutotuneConfig(path string) error { + if strings.TrimSpace(path) == "" { + return fmt.Errorf("empty autotune config path") + } + if err := os.Remove(path); err != nil && !os.IsNotExist(err) { + return err + } + return nil +} + +func normalizeBenchmarkPowerSource(source string) string { + switch strings.TrimSpace(strings.ToLower(source)) { + case BenchmarkPowerSourceSDRPSUInput: + return BenchmarkPowerSourceSDRPSUInput + default: + return BenchmarkPowerSourceDCMI + } +} + +func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision { + cfg, err := LoadSystemPowerSourceConfig(exportDir) + if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" { + selected := normalizeBenchmarkPowerSource(cfg.SelectedSource) + return SystemPowerSourceDecision{ + Configured: true, + SelectedSource: selected, + EffectiveSource: selected, + Mode: "autotuned", + Reason: strings.TrimSpace(cfg.Reason), + ConfiguredAt: cfg.UpdatedAt, + } + } + + sources := sampleBenchmarkPowerSources() + if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 { + return SystemPowerSourceDecision{ + Configured: false, + EffectiveSource: BenchmarkPowerSourceSDRPSUInput, + Mode: "fallback", + Reason: "autotune config not found; using temporary fallback source sdr_psu_input", + } + } + return SystemPowerSourceDecision{ + Configured: false, + EffectiveSource: BenchmarkPowerSourceDCMI, + Mode: 
"fallback", + Reason: "autotune config not found; using temporary fallback source dcmi", + } +} + +func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) { + decision := ResolveSystemPowerDecision(exportDir) + if decision.EffectiveSource != "" { + if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 { + return value, decision, nil + } else if decision.Configured { + fallback := BenchmarkPowerSourceDCMI + if decision.EffectiveSource == BenchmarkPowerSourceDCMI { + fallback = BenchmarkPowerSourceSDRPSUInput + } + if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 { + decision.Mode = "degraded" + decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback) + decision.EffectiveSource = fallback + return fallbackValue, decision, nil + } + decision.Mode = "degraded" + decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource) + return 0, decision, err + } + } + return 0, decision, fmt.Errorf("system power source unavailable") +} + +func queryBenchmarkPowerSourceW(source string) (float64, error) { + switch normalizeBenchmarkPowerSource(source) { + case BenchmarkPowerSourceSDRPSUInput: + sdr := sampleIPMISDRPowerSensors() + if sdr.PSUInW > 0 { + return sdr.PSUInW, nil + } + return 0, fmt.Errorf("sdr psu input unavailable") + default: + return queryIPMIServerPowerW() + } +} + +func sampleBenchmarkPowerSources() map[string]float64 { + out := map[string]float64{} + if w, err := queryIPMIServerPowerW(); err == nil && w > 0 { + out[BenchmarkPowerSourceDCMI] = w + } + if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 { + out[BenchmarkPowerSourceSDRPSUInput] = w + } + return out +} + +func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) { + if durationSec <= 0 { + return 0, false + } + samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec) + if len(samples) == 0 { + return 0, false + } + return benchmarkMean(samples), true +} + +func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 { + if durationSec <= 0 { + return nil + } + stopCh := make(chan struct{}) + doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec) + select { + case <-ctx.Done(): + case <-time.After(time.Duration(durationSec) * time.Second): + } + close(stopCh) + return <-doneCh +} + +func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 { + if intervalSec <= 0 { + intervalSec = benchmarkPowerAutotuneSampleInterval + } + ch := make(chan []float64, 1) + go func() { + defer close(ch) + var samples []float64 + record := func() { + if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 { + samples = append(samples, w) + } + } + record() + ticker := time.NewTicker(time.Duration(intervalSec) * time.Second) + defer ticker.Stop() + for { + select { + case <-stopCh: + ch <- samples + return + case <-ticker.C: + record() + } + } + }() + return ch +} + +type benchmarkPowerAutotuneSample struct { + ElapsedSec float64 + GPUAvgUsagePct float64 + CPUUsagePct float64 + GPUSumPowerW float64 + Sources map[string]float64 +} + +func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, 
gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample { + if durationSec <= 0 { + return nil + } + var out []benchmarkPowerAutotuneSample + deadline := time.Now().Add(time.Duration(durationSec) * time.Second) + start := time.Now() + for { + if ctx.Err() != nil { + return out + } + row := benchmarkPowerAutotuneSample{ + ElapsedSec: time.Since(start).Seconds(), + CPUUsagePct: sampleCPULoadPct(), + Sources: sampleBenchmarkPowerSources(), + } + if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 { + var usageSum float64 + for _, gpu := range gpuRows { + row.GPUSumPowerW += gpu.PowerW + usageSum += gpu.UsagePct + } + row.GPUAvgUsagePct = usageSum / float64(len(gpuRows)) + } + out = append(out, row) + logBenchmarkPowerAutotuneSample(phase, row, logFunc) + if time.Now().After(deadline) { + return out + } + select { + case <-ctx.Done(): + return out + case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second): + } + } +} + +func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) { + if logFunc == nil { + return + } + var sourceParts []string + for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} { + if value, ok := sample.Sources[source]; ok && value > 0 { + sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value)) + } else { + sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source)) + } + } + logFunc(fmt.Sprintf( + "autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s", + phase, + sample.ElapsedSec, + sample.GPUAvgUsagePct, + sample.GPUSumPowerW, + sample.CPUUsagePct, + strings.Join(sourceParts, " "), + )) +} + +func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) { + if logFunc == nil || len(samples) == 0 { + return + } + var gpuUsage []float64 + var cpuUsage []float64 + var gpuPower []float64 + sourceBuckets := map[string][]float64{} + for _, sample := range samples { + gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct) + cpuUsage = append(cpuUsage, sample.CPUUsagePct) + gpuPower = append(gpuPower, sample.GPUSumPowerW) + for source, value := range sample.Sources { + if value > 0 { + sourceBuckets[source] = append(sourceBuckets[source], value) + } + } + } + var sourceParts []string + for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} { + values := sourceBuckets[source] + if len(values) == 0 { + sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source)) + continue + } + sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values))) + } + logFunc(fmt.Sprintf( + "autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s", + phase, + len(samples), + benchmarkMean(gpuUsage), + benchmarkPercentile(gpuUsage, 95), + benchmarkMean(gpuPower), + benchmarkMean(cpuUsage), + benchmarkPercentile(cpuUsage, 95), + strings.Join(sourceParts, " "), + )) +} + +func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) { + if logFunc == nil { + return + } + for _, candidate := range candidates { + if !candidate.Available { + logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source)) + continue + } + logFunc(fmt.Sprintf( + "autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW 
gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s", + candidate.Source, + candidate.IdleAvgW, + candidate.LoadAvgW, + candidate.DeltaW, + gpuDelta, + candidate.RelativeError, + candidate.Confidence*100, + map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource], + )) + if strings.TrimSpace(candidate.SelectionNotes) != "" { + logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes)) + } + } +} + +func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation { + result := &BenchmarkPowerAutotuneValidation{} + if len(samples) == 0 { + result.Reason = "no idle telemetry samples collected" + return result + } + var gpuUsage []float64 + var cpuUsage []float64 + for _, sample := range samples { + gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct) + if sample.CPUUsagePct > 0 { + cpuUsage = append(cpuUsage, sample.CPUUsagePct) + } + } + result.GPUSamples = len(gpuUsage) + result.CPUSamples = len(cpuUsage) + result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10 + result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10 + result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10 + result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10 + switch { + case result.GPUAvgUsagePct > 5: + result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct) + case result.GPUP95UsagePct > 10: + result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct) + case result.CPUAvgUsagePct > 20: + result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct) + case result.CPUP95UsagePct > 35: + result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct) + default: + result.Valid = true + } + return result +} + +func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) { + idleBySource := map[string][]float64{} + loadBySource := map[string][]float64{} + var idleGPU []float64 + var loadGPU []float64 + for _, sample := range idle { + idleGPU = append(idleGPU, sample.GPUSumPowerW) + for source, value := range sample.Sources { + if value > 0 { + idleBySource[source] = append(idleBySource[source], value) + } + } + } + for _, sample := range load { + loadGPU = append(loadGPU, sample.GPUSumPowerW) + for source, value := range sample.Sources { + if value > 0 { + loadBySource[source] = append(loadBySource[source], value) + } + } + } + idleGPUAvg := benchmarkMean(idleGPU) + loadGPUAvg := benchmarkMean(loadGPU) + gpuDelta := loadGPUAvg - idleGPUAvg + if gpuDelta <= 0 { + gpuDelta = loadGPUAvg + } + + candidates := []BenchmarkPowerAutotuneCandidate{ + buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta), + buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta), + } + available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates)) + for _, candidate := range candidates { + if candidate.Available && candidate.DeltaW > 0 { + available = append(available, candidate) + } + } + if len(available) == 0 { + return "", candidates, 
idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected") + } + sort.Slice(available, func(i, j int) bool { + if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 { + if available[i].Source != available[j].Source { + return available[i].Source == BenchmarkPowerSourceSDRPSUInput + } + } + if available[i].RelativeError != available[j].RelativeError { + return available[i].RelativeError < available[j].RelativeError + } + return available[i].Samples > available[j].Samples + }) + selected := available[0] + for idx := range candidates { + if candidates[idx].Source == selected.Source { + candidates[idx].Selected = true + candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError) + } + } + return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil +} + +func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate { + candidate := BenchmarkPowerAutotuneCandidate{ + Source: source, + Available: len(idle) > 0 && len(load) > 0, + Samples: minInt(len(idle), len(load)), + } + if !candidate.Available { + return candidate + } + candidate.IdleAvgW = benchmarkMean(idle) + candidate.LoadAvgW = benchmarkMean(load) + candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW + if gpuDelta > 0 { + candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta + candidate.Confidence = math.Max(0, 1-candidate.RelativeError) + } + return candidate +} + +func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string { + var b strings.Builder + fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339)) + fmt.Fprintf(&b, "status=%s\n", result.Status) + fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind) + fmt.Fprintf(&b, "profile=%s\n", result.Profile) + fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec) + fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec) + fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec) + if result.SelectedSource != "" { + fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource) + } + if result.IdleValidation != nil { + fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid) + fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct) + fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct) + fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct) + fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct) + if result.IdleValidation.Reason != "" { + fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason) + } + } + for _, candidate := range result.Candidates { + fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available) + if candidate.Available { + fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW) + fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW) + fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW) + fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError) + } + } + return b.String() +} + +func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string { + var b strings.Builder + b.WriteString("# 
Bee Bench Power Source Autotune\n\n") + fmt.Fprintf(&b, "**Status:** %s \n", result.Status) + fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind) + fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile) + fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec) + fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec) + fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec) + if result.SelectedSource != "" { + fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource) + } + b.WriteString("\n") + if result.IdleValidation != nil { + b.WriteString("## Idle Validation\n\n") + fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid) + fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct) + fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct) + fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct) + fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct) + if result.IdleValidation.Reason != "" { + fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason) + } + b.WriteString("\n") + } + if len(result.Candidates) > 0 { + b.WriteString("## Candidates\n\n") + b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n") + b.WriteString("|--------|------------|------------|---------|----------------|----------|\n") + for _, candidate := range result.Candidates { + if !candidate.Available { + fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source) + continue + } + selected := "no" + if candidate.Selected { + selected = "yes" + } + fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n", + candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected) + } + b.WriteString("\n") + } + for _, note := range result.Notes { + fmt.Fprintf(&b, "- %s\n", note) + } + return b.String() +} + +func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) { + allDevices := joinIndexList(gpuIndices) + switch strings.TrimSpace(strings.ToLower(kind)) { + case "power-fit", "power", "nvidia-bench-power": + cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices) + if err == nil { + return cmd, "power-fit" + } + return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit" + default: + cmd := []string{ + "bee-gpu-burn", + "--seconds", fmt.Sprintf("%d", durationSec), + "--devices", allDevices, + } + if sizeMB > 0 { + cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB)) + } + return cmd, "performance" + } +} + +func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) { + if ctx == nil { + ctx = context.Background() + } + if logFunc == nil { + logFunc = func(string) {} + } + if strings.TrimSpace(baseDir) == "" { + baseDir = "/var/log/bee-bench/autotune" + } + if err := os.MkdirAll(baseDir, 0755); err != nil { + return "", fmt.Errorf("mkdir %s: %w", baseDir, err) + } + selected, err := resolveNvidiaGPUSelection(nil, nil) + if err != nil { + return "", err + } + if len(selected) == 0 { + return "", fmt.Errorf("no NVIDIA GPUs detected for autotune") + } + ts := time.Now().UTC().Format("20060102-150405") + runDir := filepath.Join(baseDir, "autotune-"+ts) + if err := os.MkdirAll(runDir, 0755); err != nil { + return "", fmt.Errorf("mkdir 
%s: %w", runDir, err) + } + verboseLog := filepath.Join(runDir, "verbose.log") + hostname, _ := os.Hostname() + loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB) + result := BenchmarkPowerAutotuneResult{ + GeneratedAt: time.Now().UTC(), + Hostname: hostname, + ServerModel: readServerModel(), + BenchmarkKind: normalizedKind, + Profile: opts.Profile, + Status: "FAILED", + IdleDurationSec: benchmarkPowerAutotuneIdleSec, + LoadDurationSec: benchmarkPowerAutotuneLoadSec, + SampleIntervalSec: benchmarkPowerAutotuneSampleInterval, + } + + logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected))) + idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc) + logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc) + result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples) + if result.IdleValidation == nil || !result.IdleValidation.Valid { + if result.IdleValidation != nil { + result.IdleValidationError = result.IdleValidation.Reason + logFunc(result.IdleValidation.Reason) + } + result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed") + if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil { + return "", err + } + return runDir, fmt.Errorf("%s", result.IdleValidationError) + } + + logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec)) + loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1) + go func() { + loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc) + }() + out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc) + _ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644) + loadSamples := <-loadSamplesCh + logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc) + if runErr != nil { + result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error()) + if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil { + return "", err + } + return runDir, fmt.Errorf("autotune load stage: %w", runErr) + } + + selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples) + result.Candidates = candidates + result.GPUPowerIdleW = idleGPUAvg + result.GPUPowerLoadW = loadGPUAvg + if chooseErr != nil { + result.Notes = append(result.Notes, chooseErr.Error()) + if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil { + return "", err + } + return runDir, chooseErr + } + gpuDelta := loadGPUAvg - idleGPUAvg + if gpuDelta <= 0 { + gpuDelta = loadGPUAvg + } + logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc) + result.SelectedSource = selectedSource + result.Status = "OK" + var confidence float64 + selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource) + for _, candidate := range candidates { + if candidate.Selected { + confidence = candidate.Confidence + if strings.TrimSpace(candidate.SelectionNotes) != "" { + selectionReason = candidate.SelectionNotes + } + break + } + } + cfg := BenchmarkPowerAutotuneConfig{ + Version: benchmarkPowerAutotuneVersion, + UpdatedAt: time.Now().UTC(), + SelectedSource: selectedSource, + 
BenchmarkKind: normalizedKind, + Profile: opts.Profile, + IdleDurationSec: benchmarkPowerAutotuneIdleSec, + LoadDurationSec: benchmarkPowerAutotuneLoadSec, + SampleIntervalSec: benchmarkPowerAutotuneSampleInterval, + Confidence: confidence, + Reason: selectionReason, + } + result.Config = &cfg + configPath := BenchmarkPowerSourceConfigPath(baseDir) + if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil { + result.Status = "FAILED" + result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error()) + if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil { + return "", writeErr + } + return runDir, err + } + logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason)) + result.Notes = append(result.Notes, "saved autotune config to "+configPath) + if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil { + return "", err + } + return runDir, nil +} + +func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error { + resultJSON, err := json.MarshalIndent(result, "", " ") + if err != nil { + return fmt.Errorf("marshal autotune result: %w", err) + } + if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil { + return fmt.Errorf("write autotune result.json: %w", err) + } + if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil { + return fmt.Errorf("write autotune summary.txt: %w", err) + } + if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil { + return fmt.Errorf("write autotune report.md: %w", err) + } + return nil +} + +func minInt(a, b int) int { + if a < b { + return a + } + return b +} + +var _ = exec.ErrNotFound diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index d0c83ef..da764c8 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -43,6 +43,11 @@ const ( NvidiaBenchmarkProfileOvernight = "overnight" ) +const ( + BenchmarkPowerEngineDCGMProfTester = "dcgmproftester" + BenchmarkPowerEngineTargetedPower = "targeted_power" +) + // Estimated wall-clock durations for benchmark runs, derived from real _v8 logs. // Rule: when changing profile phase durations in resolveBenchmarkProfile(), // re-measure from actual task logs and update the constants here. @@ -61,7 +66,7 @@ const ( BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured BenchmarkEstimatedPerfOvernightSec = 8 * 3600 - // Power / Thermal Fit (dcgmi targeted_power binary-search calibration). + // Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search). // Duration is for the full ramp-up run; individual steps vary with convergence speed. 
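Stepping back to `chooseBenchmarkPowerAutotuneSource` in the new file: the sort prefers the candidate whose idle-to-load delta best tracks the GPU-reported delta, with a 0.10 relative-error band that tie-breaks toward `sdr_psu_input` (the SDR sum covers all PSUs). A distilled sketch of that comparator over precomputed relative errors:

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

type candidate struct {
	Source        string
	RelativeError float64 // |sourceDelta - gpuDelta| / gpuDelta
	Samples       int
}

// rankCandidates mirrors the sort in chooseBenchmarkPowerAutotuneSource:
// within a 0.10 relative-error band the SDR PSU input wins; otherwise the
// lower relative error wins, then the larger sample count.
func rankCandidates(cs []candidate) []candidate {
	sort.Slice(cs, func(i, j int) bool {
		if math.Abs(cs[i].RelativeError-cs[j].RelativeError) <= 0.10 && cs[i].Source != cs[j].Source {
			return cs[i].Source == "sdr_psu_input"
		}
		if cs[i].RelativeError != cs[j].RelativeError {
			return cs[i].RelativeError < cs[j].RelativeError
		}
		return cs[i].Samples > cs[j].Samples
	})
	return cs
}

func main() {
	ranked := rankCandidates([]candidate{
		{Source: "dcmi", RelativeError: 0.08, Samples: 30},
		{Source: "sdr_psu_input", RelativeError: 0.12, Samples: 28},
	})
	fmt.Println(ranked[0].Source) // sdr_psu_input: within the 0.10 band of dcmi
}
```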
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts @@ -74,12 +79,84 @@ type NvidiaBenchmarkOptions struct { GPUIndices []int ExcludeGPUIndices []int RunNCCL bool + ServerPowerSource string ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially RampStep int // 1-based step index within a ramp-up run (0 = not a ramp-up) RampTotal int // total number of ramp-up steps in this run RampRunID string // shared identifier across all steps of the same ramp-up run } +const ( + BenchmarkPowerSourceDCMI = "dcmi" + BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input" +) + +type BenchmarkPowerAutotuneConfig struct { + Version int `json:"version"` + UpdatedAt time.Time `json:"updated_at"` + SelectedSource string `json:"selected_source"` + BenchmarkKind string `json:"benchmark_kind,omitempty"` + Profile string `json:"profile,omitempty"` + IdleDurationSec int `json:"idle_duration_sec,omitempty"` + LoadDurationSec int `json:"load_duration_sec,omitempty"` + SampleIntervalSec int `json:"sample_interval_sec,omitempty"` + Confidence float64 `json:"confidence,omitempty"` + Reason string `json:"reason,omitempty"` +} + +type SystemPowerSourceDecision struct { + Configured bool `json:"configured"` + SelectedSource string `json:"selected_source,omitempty"` + EffectiveSource string `json:"effective_source,omitempty"` + Mode string `json:"mode,omitempty"` // autotuned, fallback, degraded + Reason string `json:"reason,omitempty"` + ConfiguredAt time.Time `json:"configured_at,omitempty"` +} + +type BenchmarkPowerAutotuneResult struct { + GeneratedAt time.Time `json:"generated_at"` + Hostname string `json:"hostname,omitempty"` + ServerModel string `json:"server_model,omitempty"` + BenchmarkKind string `json:"benchmark_kind,omitempty"` + Profile string `json:"profile,omitempty"` + Status string `json:"status"` + IdleDurationSec int `json:"idle_duration_sec"` + LoadDurationSec int `json:"load_duration_sec"` + SampleIntervalSec int `json:"sample_interval_sec"` + SelectedSource string `json:"selected_source,omitempty"` + IdleValidationError string `json:"idle_validation_error,omitempty"` + IdleValidation *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"` + GPUPowerIdleW float64 `json:"gpu_power_idle_w,omitempty"` + GPUPowerLoadW float64 `json:"gpu_power_load_w,omitempty"` + Candidates []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"` + Notes []string `json:"notes,omitempty"` + Config *BenchmarkPowerAutotuneConfig `json:"config,omitempty"` +} + +type BenchmarkPowerAutotuneValidation struct { + Valid bool `json:"valid"` + GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"` + GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"` + CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"` + CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"` + GPUSamples int `json:"gpu_samples,omitempty"` + CPUSamples int `json:"cpu_samples,omitempty"` + Reason string `json:"reason,omitempty"` +} + +type BenchmarkPowerAutotuneCandidate struct { + Source string `json:"source"` + IdleAvgW float64 `json:"idle_avg_w,omitempty"` + LoadAvgW float64 `json:"load_avg_w,omitempty"` + DeltaW float64 `json:"delta_w,omitempty"` + Samples int `json:"samples,omitempty"` + RelativeError float64 `json:"relative_error,omitempty"` + Confidence float64 `json:"confidence,omitempty"` + Selected bool `json:"selected,omitempty"` + Available bool 
`json:"available"` + SelectionNotes string `json:"selection_notes,omitempty"` +} + type NvidiaBenchmarkResult struct { BenchmarkVersion string `json:"benchmark_version"` GeneratedAt time.Time `json:"generated_at"` @@ -294,12 +371,16 @@ type BenchmarkPSUSlotPower struct { // - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable // - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load type BenchmarkServerPower struct { - Available bool `json:"available"` - IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle - LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load - DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle - GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"` - ReportingRatio float64 `json:"reporting_ratio,omitempty"` + Available bool `json:"available"` + Source string `json:"source,omitempty"` + Mode string `json:"mode,omitempty"` + Reason string `json:"reason,omitempty"` + SampleIntervalSec int `json:"sample_interval_sec,omitempty"` + IdleW float64 `json:"idle_w,omitempty"` // DCMI at idle + LoadedW float64 `json:"loaded_w,omitempty"` // DCMI at peak load + DeltaW float64 `json:"delta_w,omitempty"` // DCMI loaded − idle + GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"` + ReportingRatio float64 `json:"reporting_ratio,omitempty"` // PSU AC input sum — sampled at idle and at peak load using collector's // slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).
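One last condensation: the idle gates in `validateBenchmarkPowerAutotuneIdle` decide whether the autotune load stage runs at all. A sketch of those thresholds over the 60-second idle window, taking the already-computed avg/p95 utilizations as inputs:

```go
package main

import "fmt"

// idleOK condenses the validateBenchmarkPowerAutotuneIdle gates: the host
// must be near-idle before the load stage, judged on average and p95 GPU
// and CPU utilization over the idle window.
func idleOK(gpuAvg, gpuP95, cpuAvg, cpuP95 float64) (bool, string) {
	switch {
	case gpuAvg > 5:
		return false, fmt.Sprintf("average GPU load %.1f%% exceeds 5%%", gpuAvg)
	case gpuP95 > 10:
		return false, fmt.Sprintf("p95 GPU load %.1f%% exceeds 10%%", gpuP95)
	case cpuAvg > 20:
		return false, fmt.Sprintf("average CPU load %.1f%% exceeds 20%%", cpuAvg)
	case cpuP95 > 35:
		return false, fmt.Sprintf("p95 CPU load %.1f%% exceeds 35%%", cpuP95)
	default:
		return true, ""
	}
}

func main() {
	fmt.Println(idleOK(1.2, 4.0, 8.5, 22.0)) // true
	fmt.Println(idleOK(1.2, 4.0, 8.5, 40.0)) // false: p95 CPU load 40.0% exceeds 35%
}
```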