audit: switch power benchmark load to dcgmproftester
@@ -240,6 +240,47 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
     return nil
 }
 
+func benchmarkPowerEngine() string {
+    switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
+    case BenchmarkPowerEngineTargetedPower:
+        return BenchmarkPowerEngineTargetedPower
+    default:
+        return BenchmarkPowerEngineDCGMProfTester
+    }
+}
+
+func benchmarkPowerEngineLabel(engine string) string {
+    switch strings.TrimSpace(strings.ToLower(engine)) {
+    case BenchmarkPowerEngineTargetedPower:
+        return "dcgmi diag targeted_power"
+    default:
+        return "dcgmproftester"
+    }
+}
+
+func resolveBenchmarkPowerLoadCommand(durationSec int, gpuIndices []int) ([]string, []string, error) {
+    engine := benchmarkPowerEngine()
+    durationSec = normalizeNvidiaBurnDuration(durationSec)
+    switch engine {
+    case BenchmarkPowerEngineTargetedPower:
+        return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), nil, nil
+    default:
+        if len(gpuIndices) > 1 {
+            return []string{
+                "bee-dcgmproftester-staggered",
+                "--seconds", strconv.Itoa(durationSec),
+                "--stagger-seconds", "0",
+                "--devices", joinIndexList(gpuIndices),
+            }, nil, nil
+        }
+        cmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(durationSec))
+        if err != nil {
+            return nil, nil, err
+        }
+        return cmd, nvidiaVisibleDevicesEnv(gpuIndices), nil
+    }
+}
+
 func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
     if ctx == nil {
         ctx = context.Background()
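Aside: the helpers added above pick the load generator from the BEE_BENCH_POWER_ENGINE environment variable, with dcgmproftester as the default and targeted_power kept as an opt-in. Below is a minimal standalone sketch of that selection and of the command shape it resolves to; the constant string values, the dcgmi invocation shape, and the plain dcgmproftester binary name are illustrative assumptions (the real code goes through resolveDCGMProfTesterCommand, and multi-GPU runs use bee-dcgmproftester-staggered).

```go
// Standalone sketch of the engine switch above; constant values and binary
// names are assumptions for illustration, not taken from this commit.
package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

const (
	engineTargetedPower  = "targeted_power" // assumed value of BenchmarkPowerEngineTargetedPower
	engineDCGMProfTester = "dcgmproftester" // assumed value of BenchmarkPowerEngineDCGMProfTester
)

func selectEngine() string {
	switch strings.TrimSpace(strings.ToLower(os.Getenv("BEE_BENCH_POWER_ENGINE"))) {
	case engineTargetedPower:
		return engineTargetedPower
	default:
		// dcgmproftester is the new default load generator.
		return engineDCGMProfTester
	}
}

func loadCommand(engine string, durationSec int) []string {
	if engine == engineTargetedPower {
		// Shape of a dcgmi diag targeted_power style invocation (illustrative only).
		return []string{"dcgmi", "diag", "-r", "targeted_power"}
	}
	// Single-GPU dcgmproftester invocation mirroring the flags used above.
	return []string{"dcgmproftester", "--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(durationSec)}
}

func main() {
	engine := selectEngine()
	fmt.Println(engine, loadCommand(engine, 60))
}
```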
@@ -384,10 +425,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 
     // Sample server idle power once (first GPU only — server state is global).
     if !serverIdleOK {
-        if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
+        if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
             serverIdleW = w
             serverIdleOK = true
-            logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
+            logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
         }
     }
 
@@ -430,7 +471,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
         "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
     }
     logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
+    serverPowerStopCh := make(chan struct{})
+    serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
     _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
+    close(serverPowerStopCh)
+    if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
+        serverLoadedWSum += benchmarkMean(serverPowerSamples)
+        serverLoadedSamples++
+        serverLoadedOK = true
+        logFunc(fmt.Sprintf("GPU %d: server loaded power (%s avg): %.0f W", idx, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
+    }
     for _, phaseSpec := range planPhases {
         if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
             appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
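Aside: the sampler started here comes from startSelectedPowerSourceSampler in the new benchmark_power_autotune.go and replaces the inline IPMI goroutines removed further down. A self-contained sketch of the same stop-channel pattern, with readPowerW standing in for the repo's queryBenchmarkPowerSourceW:

```go
package main

import (
	"fmt"
	"time"
)

// startSampler collects one reading per interval until stop is closed, then
// delivers all collected samples on the returned channel. This mirrors the
// shape of startSelectedPowerSourceSampler; readPowerW is a stand-in.
func startSampler(stop <-chan struct{}, interval time.Duration, readPowerW func() float64) <-chan []float64 {
	out := make(chan []float64, 1)
	go func() {
		defer close(out)
		var samples []float64
		samples = append(samples, readPowerW()) // first sample immediately
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-stop:
				out <- samples
				return
			case <-ticker.C:
				samples = append(samples, readPowerW())
			}
		}
	}()
	return out
}

func main() {
	stop := make(chan struct{})
	ch := startSampler(stop, 100*time.Millisecond, func() float64 { return 420 }) // fake reading
	time.Sleep(350 * time.Millisecond)                                            // stand-in for the benchmark phase
	close(stop)
	samples := <-ch
	var sum float64
	for _, w := range samples {
		sum += w
	}
	fmt.Printf("avg %.0f W over %d samples\n", sum/float64(len(samples)), len(samples))
}
```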
@@ -461,48 +511,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 
     beforeThrottle, _ := queryThrottleCounters(idx)
     logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
-
-    // Sample server power via IPMI in parallel with the steady phase.
-    // We collect readings every 5s and average them.
-    ipmiStopCh := make(chan struct{})
-    ipmiResultCh := make(chan float64, 1)
-    go func() {
-        defer close(ipmiResultCh)
-        var samples []float64
-        ticker := time.NewTicker(5 * time.Second)
-        defer ticker.Stop()
-        // First sample after a short warmup delay.
-        select {
-        case <-ipmiStopCh:
-            return
-        case <-time.After(15 * time.Second):
-        }
-        for {
-            if w, err := queryIPMIServerPowerW(); err == nil {
-                samples = append(samples, w)
-            }
-            select {
-            case <-ipmiStopCh:
-                if len(samples) > 0 {
-                    var sum float64
-                    for _, w := range samples {
-                        sum += w
-                    }
-                    ipmiResultCh <- sum / float64(len(samples))
-                }
-                return
-            case <-ticker.C:
-            }
-        }
-    }()
-
-    close(ipmiStopCh)
-    if loadedW, ok := <-ipmiResultCh; ok {
-        serverLoadedWSum += loadedW
-        serverLoadedSamples++
-        serverLoadedOK = true
-        logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
-    }
     afterThrottle, _ := queryThrottleCounters(idx)
     if planErr != nil {
         gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
@@ -652,7 +660,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
     if serverLoadedSamples > 0 {
         serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
     }
-    result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
+    result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
     result.Cooling = summarizeBenchmarkCooling(metricRows)
 
     // Apply server-power penalty when IPMI reports the server delta is much
@@ -707,6 +715,7 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
     if opts.SizeMB < 0 {
         opts.SizeMB = 0
     }
+    opts.ServerPowerSource = normalizeBenchmarkPowerSource(opts.ServerPowerSource)
     opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
     opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
     return opts
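Aside: opts.ServerPowerSource is now folded through normalizeBenchmarkPowerSource (defined in the new benchmark_power_autotune.go). A small behavioral sketch; the literal values "dcmi" and "sdr_psu_input" are assumed from the fallback reasons elsewhere in this commit rather than shown as constant definitions:

```go
package main

import (
	"fmt"
	"strings"
)

// Assumed constant values; the diff only shows the identifiers.
const (
	sourceDCMI        = "dcmi"
	sourceSDRPSUInput = "sdr_psu_input"
)

// normalize mirrors normalizeBenchmarkPowerSource: anything other than the
// SDR PSU input source collapses to the DCMI default.
func normalize(source string) string {
	switch strings.TrimSpace(strings.ToLower(source)) {
	case sourceSDRPSUInput:
		return sourceSDRPSUInput
	default:
		return sourceDCMI
	}
}

func main() {
	fmt.Println(normalize("SDR_PSU_Input ")) // "sdr_psu_input"
	fmt.Println(normalize(""))               // "dcmi" (default)
	fmt.Println(normalize("bogus"))          // "dcmi"
}
```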
@@ -2535,10 +2544,14 @@ loop:
 }
 
 // characterizeServerPower computes BenchmarkServerPower from idle and loaded
-// IPMI samples plus the GPU-reported average power during steady state.
-func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
-    sp := &BenchmarkServerPower{Available: ipmiAvailable}
-    if !ipmiAvailable {
+// samples plus the GPU-reported average power during steady state.
+func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, source string, available bool) *BenchmarkServerPower {
+    sp := &BenchmarkServerPower{
+        Available:         available,
+        Source:            normalizeBenchmarkPowerSource(source),
+        SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
+    }
+    if !available {
         sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
         return sp
     }
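Aside: characterizeServerPower now records the power source and sample interval alongside availability. A hedged sketch of the new prologue; the BenchmarkServerPower field set below is inferred from the lines above and the real struct likely carries more:

```go
package main

import "fmt"

// Field set inferred from the diff above; the real BenchmarkServerPower
// likely also carries idle/loaded watts and derived deltas.
type BenchmarkServerPower struct {
	Available         bool
	Source            string
	SampleIntervalSec int
	Notes             []string
}

// newServerPower mirrors the prologue of the rewritten characterizeServerPower:
// record the (already normalized) source and sample interval, and short-circuit
// with a note when no server-side readings were available.
func newServerPower(source string, available bool) *BenchmarkServerPower {
	sp := &BenchmarkServerPower{
		Available:         available,
		Source:            source,
		SampleIntervalSec: 3, // benchmarkPowerAutotuneSampleInterval in this commit
	}
	if !available {
		sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
	}
	return sp
}

func main() {
	fmt.Printf("%+v\n", newServerPower("dcmi", false))
}
```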
@@ -2671,10 +2684,10 @@ func runNvidiaBenchmarkParallel(
 
     // Sample server idle power once.
     if !*serverIdleOK {
-        if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
+        if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, maxInt(spec.BaselineSec, 10), benchmarkPowerAutotuneSampleInterval); ok {
             *serverIdleW = w
             *serverIdleOK = true
-            logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
+            logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
         }
     }
 
@@ -2728,7 +2741,16 @@ func runNvidiaBenchmarkParallel(
         "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
     }
     logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
+    serverPowerStopCh := make(chan struct{})
+    serverPowerCh := startSelectedPowerSourceSampler(serverPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
     _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
+    close(serverPowerStopCh)
+    if serverPowerSamples := <-serverPowerCh; len(serverPowerSamples) > 0 {
+        *serverLoadedWSum += benchmarkMean(serverPowerSamples)
+        (*serverLoadedSamples)++
+        *serverLoadedOK = true
+        logFunc(fmt.Sprintf("GPUs %s: server loaded power (%s avg): %.0f W", allDevices, opts.ServerPowerSource, benchmarkMean(serverPowerSamples)))
+    }
     for _, phaseSpec := range planPhases {
         if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
             appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
@@ -2770,46 +2792,6 @@ func runNvidiaBenchmarkParallel(
     }
 
     logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
-
-    // Sample server power via IPMI in parallel with steady phase.
-    ipmiStopCh := make(chan struct{})
-    ipmiResultCh := make(chan float64, 1)
-    go func() {
-        defer close(ipmiResultCh)
-        var samples []float64
-        ticker := time.NewTicker(5 * time.Second)
-        defer ticker.Stop()
-        select {
-        case <-ipmiStopCh:
-            return
-        case <-time.After(15 * time.Second):
-        }
-        for {
-            if w, err := queryIPMIServerPowerW(); err == nil {
-                samples = append(samples, w)
-            }
-            select {
-            case <-ipmiStopCh:
-                if len(samples) > 0 {
-                    var sum float64
-                    for _, w := range samples {
-                        sum += w
-                    }
-                    ipmiResultCh <- sum / float64(len(samples))
-                }
-                return
-            case <-ticker.C:
-            }
-        }
-    }()
-
-    close(ipmiStopCh)
-    if loadedW, ok := <-ipmiResultCh; ok {
-        *serverLoadedWSum += loadedW
-        (*serverLoadedSamples)++
-        *serverLoadedOK = true
-        logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
-    }
     afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
     for _, idx := range selected {
         afterThrottle[idx], _ = queryThrottleCounters(idx)
@@ -3040,8 +3022,8 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
     return cl
 }
 
-// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
-// actively watches throttle counters. seedLimits, when provided, are treated as
+// runBenchmarkPowerCalibration runs the configured power-fit load for the supplied
+// GPU set and actively watches throttle counters. seedLimits, when provided, are treated as
 // the starting point for this calibration pass rather than as immutable fixed
 // limits. This matters during cumulative ramp-up: once an additional GPU is
 // introduced, every already-active GPU must be revalidated under the new
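Aside: the calibration described by this comment narrows each GPU's power limit by bisection: a throttled attempt lowers the upper bound, a clean attempt becomes the current best and raises the lower bound, and the search gives up once the midpoint would fall below the derate floor (later hunks show next = (s.lo + s.hi) / 2 and the minLimitW check). A self-contained sketch of that loop follows; runLoad stands in for launching the power-fit load and reading throttle counters, and the attempt cap is arbitrary:

```go
package main

import "fmt"

// findStableLimitW bisects between minLimitW and defaultLimitW, treating a
// throttled run as "too high" and a clean run as "stable so far". The real
// calibration differs in details (shared attempts across GPUs, retry handling).
func findStableLimitW(defaultLimitW, minLimitW int, runLoad func(limitW int) (throttled bool)) (int, bool) {
	lo, hi := minLimitW, defaultLimitW
	limit := defaultLimitW
	stable := -1
	for attempts := 0; attempts < 8 && lo <= hi; attempts++ {
		if runLoad(limit) {
			hi = limit - 1 // throttled: try a lower limit
		} else {
			stable = limit // clean pass: remember it, then probe higher
			lo = limit + 1
		}
		limit = (lo + hi) / 2
		if limit < minLimitW {
			break // outside the allowed derate window
		}
	}
	return stable, stable > 0
}

func main() {
	// Pretend the card throttles above 320 W.
	limit, ok := findStableLimitW(350, 250, func(w int) bool { return w > 320 })
	fmt.Println(limit, ok)
}
```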
@@ -3070,10 +3052,19 @@ func runBenchmarkPowerCalibration(
     // doubling each retry until it would exceed the cap, at which point the
     // next busy response fails the calibration immediately.
     const dcgmResourceBusyMaxDelaySec = 300
+    engine := benchmarkPowerEngine()
+    engineLabel := benchmarkPowerEngineLabel(engine)
 
-    if _, err := exec.LookPath("dcgmi"); err != nil {
-        logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
-        return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+    if engine == BenchmarkPowerEngineTargetedPower {
+        if _, err := exec.LookPath("dcgmi"); err != nil {
+            logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
+            return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+        }
+    } else {
+        if _, _, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices); err != nil {
+            logFunc("power calibration: dcgmproftester not found, skipping (will use default power limit)")
+            return map[int]benchmarkPowerCalibrationResult{}, nil, nil
+        }
     }
     if killed := KillTestWorkers(); len(killed) > 0 {
         for _, p := range killed {
@@ -3206,7 +3197,7 @@ calibDone:
     sharedAttempt++
     for _, s := range active {
         s.calib.Attempts++
-        logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
+        logFunc(fmt.Sprintf("power calibration: GPU %d %s attempt %d at %d W for %ds", s.idx, engineLabel, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
     }
 
     // Snapshot throttle counters for all active GPUs before the run.
@@ -3215,14 +3206,22 @@ calibDone:
         beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
     }
 
-    // Run targeted_power for ALL gpuIndices simultaneously so every card
+    // Run the selected power-fit load for ALL gpuIndices simultaneously so every card
     // is under load during calibration — this reflects real server thermals.
     logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
-    cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
+    cmd, env, err := resolveBenchmarkPowerLoadCommand(calibDurationSec, gpuIndices)
+    if err != nil {
+        for _, s := range active {
+            s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("failed to resolve %s command: %v", engineLabel, err))
+            s.converged = true
+        }
+        logFunc(fmt.Sprintf("power calibration: failed to resolve %s command: %v", engineLabel, err))
+        break calibDone
+    }
     attemptCtx, cancelAttempt := context.WithCancel(ctx)
     doneCh := make(chan sharedAttemptResult, 1)
     go func() {
-        out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
+        out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, env, gpuIndices, logFunc)
         doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
     }()
 
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Record throttle but do NOT cancel — let dcgmi finish so
|
// Record throttle but do NOT cancel — let the load command finish so
|
||||||
// nv-hostengine releases the slot cleanly before the next attempt.
|
// runtime resources release cleanly before the next attempt.
|
||||||
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
|
if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
|
||||||
throttleReasons[s.idx] = reason
|
throttleReasons[s.idx] = reason
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
|
||||||
@@ -3359,9 +3358,9 @@ calibDone:
|
|||||||
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
|
||||||
case ar.err != nil:
|
case ar.err != nil:
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
|
logFunc(fmt.Sprintf("power calibration: GPU %d %s failed at %d W: %v", s.idx, engineLabel, s.appliedLimitW, ar.err))
|
||||||
default:
|
default:
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("%s attempt %d at %d W: no valid power telemetry", engineLabel, s.calib.Attempts, s.appliedLimitW))
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3384,7 +3383,7 @@ calibDone:
|
|||||||
s.calib.Completed = true
|
s.calib.Completed = true
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
||||||
}
|
}
|
||||||
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||||
s.converged = true
|
s.converged = true
|
||||||
@@ -3399,7 +3398,7 @@ calibDone:
|
|||||||
next = (s.lo + s.hi) / 2
|
next = (s.lo + s.hi) / 2
|
||||||
}
|
}
|
||||||
if next < s.minLimitW {
|
if next < s.minLimitW {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable %s limit within %d W of the default", engineLabel, maxDerateW))
|
||||||
s.converged = true
|
s.converged = true
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -4117,13 +4116,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
durationSec := powerBenchDurationSec(opts.Profile)
|
durationSec := powerBenchDurationSec(opts.Profile)
|
||||||
|
|
||||||
// Sample IPMI idle power before any GPU load.
|
// Sample server idle power before any GPU load.
|
||||||
var serverIdleW float64
|
var serverIdleW float64
|
||||||
var serverIdleOK bool
|
var serverIdleOK bool
|
||||||
if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
|
if w, ok := sampleBenchmarkPowerSourceSeries(ctx, opts.ServerPowerSource, 10, benchmarkPowerAutotuneSampleInterval); ok {
|
||||||
serverIdleW = w
|
serverIdleW = w
|
||||||
serverIdleOK = true
|
serverIdleOK = true
|
||||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
logFunc(fmt.Sprintf("server idle power (%s): %.0f W", opts.ServerPowerSource, w))
|
||||||
}
|
}
|
||||||
sdrIdle := sampleIPMISDRPowerSensors()
|
sdrIdle := sampleIPMISDRPowerSensors()
|
||||||
psuBefore := psuStatusSnapshot()
|
psuBefore := psuStatusSnapshot()
|
||||||
@@ -4141,26 +4140,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
|
singlePowerStopCh := make(chan struct{})
|
||||||
ipmiSingleDone := make(chan float64, 1)
|
singlePowerCh := startSelectedPowerSourceSampler(singlePowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
go func() {
|
|
||||||
defer close(ipmiSingleDone)
|
|
||||||
if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
|
|
||||||
ipmiSingleDone <- w
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||||
ipmiSingleCancel()
|
close(singlePowerStopCh)
|
||||||
sdrSingle := sampleIPMISDRPowerSensors()
|
sdrSingle := sampleIPMISDRPowerSensors()
|
||||||
if sdrSingle.PSUInW > 0 {
|
if samples := <-singlePowerCh; len(samples) > 0 {
|
||||||
|
singleIPMILoadedW[idx] = benchmarkMean(samples)
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card server power (%s avg): %.0f W", idx, opts.ServerPowerSource, singleIPMILoadedW[idx]))
|
||||||
|
} else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrSingle.PSUInW > 0 {
|
||||||
singleIPMILoadedW[idx] = sdrSingle.PSUInW
|
singleIPMILoadedW[idx] = sdrSingle.PSUInW
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (SDR PSU AC input)", idx, sdrSingle.PSUInW))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card fallback server power (SDR snapshot): %.0f W", idx, sdrSingle.PSUInW))
|
||||||
} else if w, ok := <-ipmiSingleDone; ok {
|
|
||||||
singleIPMILoadedW[idx] = w
|
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W (DCMI)", idx, w))
|
|
||||||
} else {
|
|
||||||
<-ipmiSingleDone // drain channel
|
|
||||||
}
|
}
|
||||||
allRestoreActions = append(allRestoreActions, restore...)
|
allRestoreActions = append(allRestoreActions, restore...)
|
||||||
if r, ok := c[idx]; ok {
|
if r, ok := c[idx]; ok {
|
||||||
@@ -4234,11 +4225,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
|
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
|
||||||
}
|
}
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
|
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card %s: %s.", benchmarkPowerEngineLabel(benchmarkPowerEngine()), joinIndexList(result.RecommendedSlotOrder)))
|
||||||
}
|
}
|
||||||
for _, gpu := range gpus {
|
for _, gpu := range gpus {
|
||||||
if gpu.Derated {
|
if gpu.Derated {
|
||||||
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
|
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete %s.", gpu.Index, gpu.AppliedPowerLimitW, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||||
}
|
}
|
||||||
if gpu.CoolingWarning != "" {
|
if gpu.CoolingWarning != "" {
|
||||||
result.Findings = append(result.Findings, fmt.Sprintf(
|
result.Findings = append(result.Findings, fmt.Sprintf(
|
||||||
@@ -4255,7 +4246,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// Phase 2: cumulative thermal ramp.
|
// Phase 2: cumulative thermal ramp.
|
||||||
// Each step introduces one new GPU into an environment where all previously
|
// Each step introduces one new GPU into an environment where all previously
|
||||||
// calibrated GPUs are already running at their fixed stable limits. The new
|
// calibrated GPUs are already running at their fixed stable limits. The new
|
||||||
// GPU's stable TDP is searched via binary search (targeted_power) under real
|
// GPU's stable TDP is searched via binary search under real
|
||||||
// multi-GPU thermal load. Once found, its limit is fixed permanently for all
|
// multi-GPU thermal load. Once found, its limit is fixed permanently for all
|
||||||
// subsequent steps. This ensures each GPU's limit reflects actual sustained
|
// subsequent steps. This ensures each GPU's limit reflects actual sustained
|
||||||
// power in the final full-system thermal state.
|
// power in the final full-system thermal state.
|
||||||
@@ -4294,7 +4285,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
if !firstCalib.Completed {
|
if !firstCalib.Completed {
|
||||||
ramp.Status = "FAILED"
|
ramp.Status = "FAILED"
|
||||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
|
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||||
result.OverallStatus = "PARTIAL"
|
result.OverallStatus = "PARTIAL"
|
||||||
} else if firstCalib.Derated {
|
} else if firstCalib.Derated {
|
||||||
ramp.Status = "PARTIAL"
|
ramp.Status = "PARTIAL"
|
||||||
@@ -4340,21 +4331,15 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
||||||
|
|
||||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
|
stepPowerStopCh := make(chan struct{})
|
||||||
ipmiStepDone := make(chan float64, 1)
|
stepPowerCh := startSelectedPowerSourceSampler(stepPowerStopCh, opts.ServerPowerSource, benchmarkPowerAutotuneSampleInterval)
|
||||||
go func() {
|
|
||||||
defer close(ipmiStepDone)
|
|
||||||
if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
|
|
||||||
ipmiStepDone <- w
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||||
ipmiStepCancel()
|
close(stepPowerStopCh)
|
||||||
var stepIPMILoadedW float64
|
var stepIPMILoadedW float64
|
||||||
var stepIPMIOK bool
|
var stepIPMIOK bool
|
||||||
if w, ok := <-ipmiStepDone; ok {
|
if samples := <-stepPowerCh; len(samples) > 0 {
|
||||||
stepIPMILoadedW = w
|
stepIPMILoadedW = benchmarkMean(samples)
|
||||||
stepIPMIOK = true
|
stepIPMIOK = true
|
||||||
}
|
}
|
||||||
// Accumulate restore actions; they all run in the outer defer.
|
// Accumulate restore actions; they all run in the outer defer.
|
||||||
@@ -4391,7 +4376,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
ramp.Status = "FAILED"
|
ramp.Status = "FAILED"
|
||||||
ramp.Notes = append(ramp.Notes,
|
ramp.Notes = append(ramp.Notes,
|
||||||
fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback))
|
fmt.Sprintf("GPU %d did not complete %s in ramp step %d; keeping previous stable limit %d W", idx, benchmarkPowerEngineLabel(benchmarkPowerEngine()), step, fallback))
|
||||||
result.OverallStatus = "PARTIAL"
|
result.OverallStatus = "PARTIAL"
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -4427,24 +4412,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
             ramp.PSUSlotReadings = sdrStep.PSUSlots
         }
 
-        if sdrStep.PSUInW > 0 {
-            // SDR PSU sum is available — use it for server power (includes all PSUs).
-            ramp.ServerLoadedW = sdrStep.PSUInW
-            ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
-            logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (SDR PSU AC input)", step, sdrStep.PSUInW))
-            if step == len(result.RecommendedSlotOrder) {
-                serverLoadedW = sdrStep.PSUInW
-                serverLoadedOK = true
-                sdrLastStep = sdrStep
-            }
-        } else if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
+        if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
             ramp.ServerLoadedW = stepIPMILoadedW
             ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
-            logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W (DCMI)", step, stepIPMILoadedW))
+            logFunc(fmt.Sprintf("power ramp: step %d server loaded power (%s avg): %.0f W", step, opts.ServerPowerSource, stepIPMILoadedW))
             // The last step has all GPUs loaded — use it as the top-level loaded_w.
             if step == len(result.RecommendedSlotOrder) {
                 serverLoadedW = stepIPMILoadedW
                 serverLoadedOK = true
+                sdrLastStep = sdrStep
+            }
+        } else if opts.ServerPowerSource == BenchmarkPowerSourceSDRPSUInput && sdrStep.PSUInW > 0 {
+            ramp.ServerLoadedW = sdrStep.PSUInW
+            ramp.ServerDeltaW = sdrStep.PSUInW - sdrIdle.PSUInW
+            logFunc(fmt.Sprintf("power ramp: step %d fallback server loaded power (SDR snapshot): %.0f W", step, sdrStep.PSUInW))
+            if step == len(result.RecommendedSlotOrder) {
+                serverLoadedW = sdrStep.PSUInW
+                serverLoadedOK = true
+                sdrLastStep = sdrStep
             }
         }
 
@@ -4502,7 +4487,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
         gpuActualSumW = result.PlatformMaxTDPW
     }
     _ = serverIdleOK // used implicitly via characterizeServerPower
-    result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, serverIdleOK && serverLoadedOK)
+    result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuActualSumW, opts.ServerPowerSource, serverIdleOK && serverLoadedOK)
     // Supplement DCMI with SDR multi-source data via collector's PSU slot patterns.
     // Per-slot readings enable correlation with audit HardwarePowerSupply entries.
     if result.ServerPower != nil {
audit/internal/platform/benchmark_power_autotune.go (new file, 735 lines)
@@ -0,0 +1,735 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
benchmarkPowerAutotuneVersion = 1
|
||||||
|
benchmarkPowerAutotuneIdleSec = 60
|
||||||
|
benchmarkPowerAutotuneLoadSec = 90
|
||||||
|
benchmarkPowerAutotuneSampleInterval = 3
|
||||||
|
defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
|
||||||
|
)
|
||||||
|
|
||||||
|
func BenchmarkPowerSourceConfigPath(baseDir string) string {
|
||||||
|
baseDir = strings.TrimSpace(baseDir)
|
||||||
|
if baseDir == "" {
|
||||||
|
return defaultBenchmarkPowerSourceConfigPath
|
||||||
|
}
|
||||||
|
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var cfg BenchmarkPowerAutotuneConfig
|
||||||
|
if err := json.Unmarshal(raw, &cfg); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(cfg.SelectedSource) == "" {
|
||||||
|
return nil, fmt.Errorf("autotune config missing selected_source")
|
||||||
|
}
|
||||||
|
return &cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return fmt.Errorf("empty autotune config path")
|
||||||
|
}
|
||||||
|
if cfg.Version <= 0 {
|
||||||
|
cfg.Version = benchmarkPowerAutotuneVersion
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := json.MarshalIndent(cfg, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tmp := path + ".tmp"
|
||||||
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return os.Rename(tmp, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
|
||||||
|
}
|
||||||
|
|
||||||
|
func ResetBenchmarkPowerAutotuneConfig(path string) error {
|
||||||
|
if strings.TrimSpace(path) == "" {
|
||||||
|
return fmt.Errorf("empty autotune config path")
|
||||||
|
}
|
||||||
|
if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeBenchmarkPowerSource(source string) string {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(source)) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
return BenchmarkPowerSourceSDRPSUInput
|
||||||
|
default:
|
||||||
|
return BenchmarkPowerSourceDCMI
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
|
||||||
|
cfg, err := LoadSystemPowerSourceConfig(exportDir)
|
||||||
|
if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
|
||||||
|
selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: true,
|
||||||
|
SelectedSource: selected,
|
||||||
|
EffectiveSource: selected,
|
||||||
|
Mode: "autotuned",
|
||||||
|
Reason: strings.TrimSpace(cfg.Reason),
|
||||||
|
ConfiguredAt: cfg.UpdatedAt,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sources := sampleBenchmarkPowerSources()
|
||||||
|
if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: false,
|
||||||
|
EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
|
||||||
|
Mode: "fallback",
|
||||||
|
Reason: "autotune config not found; using temporary fallback source sdr_psu_input",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return SystemPowerSourceDecision{
|
||||||
|
Configured: false,
|
||||||
|
EffectiveSource: BenchmarkPowerSourceDCMI,
|
||||||
|
Mode: "fallback",
|
||||||
|
Reason: "autotune config not found; using temporary fallback source dcmi",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
|
||||||
|
decision := ResolveSystemPowerDecision(exportDir)
|
||||||
|
if decision.EffectiveSource != "" {
|
||||||
|
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
|
||||||
|
return value, decision, nil
|
||||||
|
} else if decision.Configured {
|
||||||
|
fallback := BenchmarkPowerSourceDCMI
|
||||||
|
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
|
||||||
|
fallback = BenchmarkPowerSourceSDRPSUInput
|
||||||
|
}
|
||||||
|
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
|
||||||
|
decision.EffectiveSource = fallback
|
||||||
|
return fallbackValue, decision, nil
|
||||||
|
}
|
||||||
|
decision.Mode = "degraded"
|
||||||
|
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
|
||||||
|
return 0, decision, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, decision, fmt.Errorf("system power source unavailable")
|
||||||
|
}
|
||||||
|
|
||||||
|
func queryBenchmarkPowerSourceW(source string) (float64, error) {
|
||||||
|
switch normalizeBenchmarkPowerSource(source) {
|
||||||
|
case BenchmarkPowerSourceSDRPSUInput:
|
||||||
|
sdr := sampleIPMISDRPowerSensors()
|
||||||
|
if sdr.PSUInW > 0 {
|
||||||
|
return sdr.PSUInW, nil
|
||||||
|
}
|
||||||
|
return 0, fmt.Errorf("sdr psu input unavailable")
|
||||||
|
default:
|
||||||
|
return queryIPMIServerPowerW()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSources() map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceDCMI] = w
|
||||||
|
}
|
||||||
|
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
|
||||||
|
out[BenchmarkPowerSourceSDRPSUInput] = w
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return benchmarkMean(samples), true
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
stopCh := make(chan struct{})
|
||||||
|
doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
case <-time.After(time.Duration(durationSec) * time.Second):
|
||||||
|
}
|
||||||
|
close(stopCh)
|
||||||
|
return <-doneCh
|
||||||
|
}
|
||||||
|
|
||||||
|
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
|
||||||
|
if intervalSec <= 0 {
|
||||||
|
intervalSec = benchmarkPowerAutotuneSampleInterval
|
||||||
|
}
|
||||||
|
ch := make(chan []float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ch)
|
||||||
|
var samples []float64
|
||||||
|
record := func() {
|
||||||
|
if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
|
||||||
|
samples = append(samples, w)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
record()
|
||||||
|
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
ch <- samples
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
record()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
|
||||||
|
type benchmarkPowerAutotuneSample struct {
|
||||||
|
ElapsedSec float64
|
||||||
|
GPUAvgUsagePct float64
|
||||||
|
CPUUsagePct float64
|
||||||
|
GPUSumPowerW float64
|
||||||
|
Sources map[string]float64
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var out []benchmarkPowerAutotuneSample
|
||||||
|
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||||
|
start := time.Now()
|
||||||
|
for {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
row := benchmarkPowerAutotuneSample{
|
||||||
|
ElapsedSec: time.Since(start).Seconds(),
|
||||||
|
CPUUsagePct: sampleCPULoadPct(),
|
||||||
|
Sources: sampleBenchmarkPowerSources(),
|
||||||
|
}
|
||||||
|
if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
|
||||||
|
var usageSum float64
|
||||||
|
for _, gpu := range gpuRows {
|
||||||
|
row.GPUSumPowerW += gpu.PowerW
|
||||||
|
usageSum += gpu.UsagePct
|
||||||
|
}
|
||||||
|
row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
|
||||||
|
}
|
||||||
|
out = append(out, row)
|
||||||
|
logBenchmarkPowerAutotuneSample(phase, row, logFunc)
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return out
|
||||||
|
case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
if value, ok := sample.Sources[source]; ok && value > 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
|
||||||
|
} else {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
sample.ElapsedSec,
|
||||||
|
sample.GPUAvgUsagePct,
|
||||||
|
sample.GPUSumPowerW,
|
||||||
|
sample.CPUUsagePct,
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
|
||||||
|
if logFunc == nil || len(samples) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var gpuUsage []float64
|
||||||
|
var cpuUsage []float64
|
||||||
|
var gpuPower []float64
|
||||||
|
sourceBuckets := map[string][]float64{}
|
||||||
|
for _, sample := range samples {
|
||||||
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||||
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||||
|
gpuPower = append(gpuPower, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
sourceBuckets[source] = append(sourceBuckets[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var sourceParts []string
|
||||||
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
||||||
|
values := sourceBuckets[source]
|
||||||
|
if len(values) == 0 {
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
|
||||||
|
phase,
|
||||||
|
len(samples),
|
||||||
|
benchmarkMean(gpuUsage),
|
||||||
|
benchmarkPercentile(gpuUsage, 95),
|
||||||
|
benchmarkMean(gpuPower),
|
||||||
|
benchmarkMean(cpuUsage),
|
||||||
|
benchmarkPercentile(cpuUsage, 95),
|
||||||
|
strings.Join(sourceParts, " "),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
|
||||||
|
if logFunc == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if !candidate.Available {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf(
|
||||||
|
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
|
||||||
|
candidate.Source,
|
||||||
|
candidate.IdleAvgW,
|
||||||
|
candidate.LoadAvgW,
|
||||||
|
candidate.DeltaW,
|
||||||
|
gpuDelta,
|
||||||
|
candidate.RelativeError,
|
||||||
|
candidate.Confidence*100,
|
||||||
|
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
|
||||||
|
))
|
||||||
|
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
||||||
|
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
|
||||||
|
result := &BenchmarkPowerAutotuneValidation{}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
result.Reason = "no idle telemetry samples collected"
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
var gpuUsage []float64
|
||||||
|
var cpuUsage []float64
|
||||||
|
for _, sample := range samples {
|
||||||
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
||||||
|
if sample.CPUUsagePct > 0 {
|
||||||
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.GPUSamples = len(gpuUsage)
|
||||||
|
result.CPUSamples = len(cpuUsage)
|
||||||
|
result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
|
||||||
|
result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
|
||||||
|
result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
|
||||||
|
result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
|
||||||
|
switch {
|
||||||
|
case result.GPUAvgUsagePct > 5:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
|
||||||
|
case result.GPUP95UsagePct > 10:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
|
||||||
|
case result.CPUAvgUsagePct > 20:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
|
||||||
|
case result.CPUP95UsagePct > 35:
|
||||||
|
result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
|
||||||
|
default:
|
||||||
|
result.Valid = true
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
|
||||||
|
idleBySource := map[string][]float64{}
|
||||||
|
loadBySource := map[string][]float64{}
|
||||||
|
var idleGPU []float64
|
||||||
|
var loadGPU []float64
|
||||||
|
for _, sample := range idle {
|
||||||
|
idleGPU = append(idleGPU, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
idleBySource[source] = append(idleBySource[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, sample := range load {
|
||||||
|
loadGPU = append(loadGPU, sample.GPUSumPowerW)
|
||||||
|
for source, value := range sample.Sources {
|
||||||
|
if value > 0 {
|
||||||
|
loadBySource[source] = append(loadBySource[source], value)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
idleGPUAvg := benchmarkMean(idleGPU)
|
||||||
|
loadGPUAvg := benchmarkMean(loadGPU)
|
||||||
|
gpuDelta := loadGPUAvg - idleGPUAvg
|
||||||
|
if gpuDelta <= 0 {
|
||||||
|
gpuDelta = loadGPUAvg
|
||||||
|
}
|
||||||
|
|
||||||
|
candidates := []BenchmarkPowerAutotuneCandidate{
|
||||||
|
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
|
||||||
|
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
|
||||||
|
}
|
||||||
|
available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if candidate.Available && candidate.DeltaW > 0 {
|
||||||
|
available = append(available, candidate)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(available) == 0 {
|
||||||
|
return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
|
||||||
|
}
|
||||||
|
sort.Slice(available, func(i, j int) bool {
|
||||||
|
if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
|
||||||
|
if available[i].Source != available[j].Source {
|
||||||
|
return available[i].Source == BenchmarkPowerSourceSDRPSUInput
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if available[i].RelativeError != available[j].RelativeError {
|
||||||
|
return available[i].RelativeError < available[j].RelativeError
|
||||||
|
}
|
||||||
|
return available[i].Samples > available[j].Samples
|
||||||
|
})
|
||||||
|
selected := available[0]
|
||||||
|
for idx := range candidates {
|
||||||
|
if candidates[idx].Source == selected.Source {
|
||||||
|
candidates[idx].Selected = true
|
||||||
|
candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
|
||||||
|
candidate := BenchmarkPowerAutotuneCandidate{
|
||||||
|
Source: source,
|
||||||
|
Available: len(idle) > 0 && len(load) > 0,
|
||||||
|
Samples: minInt(len(idle), len(load)),
|
||||||
|
}
|
||||||
|
if !candidate.Available {
|
||||||
|
return candidate
|
||||||
|
}
|
||||||
|
candidate.IdleAvgW = benchmarkMean(idle)
|
||||||
|
candidate.LoadAvgW = benchmarkMean(load)
|
||||||
|
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
|
||||||
|
if gpuDelta > 0 {
|
||||||
|
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
|
||||||
|
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
|
||||||
|
}
|
||||||
|
return candidate
|
||||||
|
}

func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
	var b strings.Builder
	fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
	fmt.Fprintf(&b, "status=%s\n", result.Status)
	fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
	fmt.Fprintf(&b, "profile=%s\n", result.Profile)
	fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
	fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
	fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
	}
	if result.IdleValidation != nil {
		fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
		fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
		fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
		fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
		fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
		if result.IdleValidation.Reason != "" {
			fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
		}
	}
	for _, candidate := range result.Candidates {
		fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
		if candidate.Available {
			fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
			fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
			fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
			fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
		}
	}
	return b.String()
}
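
// With illustrative values (all numbers and timestamps are placeholders), an
// abridged summary reads roughly:
//   generated_at=2024-06-01T12:00:00Z
//   status=OK
//   benchmark_kind=power-fit
//   selected_source=sdr_psu_input
//   candidate_dcmi_available=true
//   candidate_dcmi_delta_w=3150
//   candidate_sdr_psu_input_available=true
//   candidate_sdr_psu_input_delta_w=3250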

func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
	var b strings.Builder
	b.WriteString("# Bee Bench Power Source Autotune\n\n")
	fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
	fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
	fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
	fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
	fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
	fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
	}
	b.WriteString("\n")
	if result.IdleValidation != nil {
		b.WriteString("## Idle Validation\n\n")
		fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
		fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
		fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
		fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
		fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
		if result.IdleValidation.Reason != "" {
			fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
		}
		b.WriteString("\n")
	}
	if len(result.Candidates) > 0 {
		b.WriteString("## Candidates\n\n")
		b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
		b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
		for _, candidate := range result.Candidates {
			if !candidate.Available {
				fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
				continue
			}
			selected := "no"
			if candidate.Selected {
				selected = "yes"
			}
			fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
				candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
		}
		b.WriteString("\n")
	}
	for _, note := range result.Notes {
		fmt.Fprintf(&b, "- %s\n", note)
	}
	return b.String()
}

func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
	allDevices := joinIndexList(gpuIndices)
	switch strings.TrimSpace(strings.ToLower(kind)) {
	case "power-fit", "power", "nvidia-bench-power":
		cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
		if err == nil {
			return cmd, "power-fit"
		}
		return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
	default:
		cmd := []string{
			"bee-gpu-burn",
			"--seconds", fmt.Sprintf("%d", durationSec),
			"--devices", allDevices,
		}
		if sizeMB > 0 {
			cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
		}
		return cmd, "performance"
	}
}
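
// Illustrative only: for a non-power kind, 120 s on GPUs 0-3 with a 1024 MB
// working set, the default branch above yields
//   bee-gpu-burn --seconds 120 --devices 0,1,2,3 --size-mb 1024
// and the normalized kind "performance". (The exact device-list format comes
// from joinIndexList; "0,1,2,3" is an assumption here.)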

func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/autotune"
	}
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
	}
	selected, err := resolveNvidiaGPUSelection(nil, nil)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "autotune-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	hostname, _ := os.Hostname()
	loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
	result := BenchmarkPowerAutotuneResult{
		GeneratedAt:       time.Now().UTC(),
		Hostname:          hostname,
		ServerModel:       readServerModel(),
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		Status:            "FAILED",
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
	}

	logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
	idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
	logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
	result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
	if result.IdleValidation == nil || !result.IdleValidation.Valid {
		if result.IdleValidation != nil {
			result.IdleValidationError = result.IdleValidation.Reason
			logFunc(result.IdleValidation.Reason)
		}
		result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("%s", result.IdleValidationError)
	}

	logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
	loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
	go func() {
		loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
	}()
	out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
	_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
	loadSamples := <-loadSamplesCh
	logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
	if runErr != nil {
		result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("autotune load stage: %w", runErr)
	}

	selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
	result.Candidates = candidates
	result.GPUPowerIdleW = idleGPUAvg
	result.GPUPowerLoadW = loadGPUAvg
	if chooseErr != nil {
		result.Notes = append(result.Notes, chooseErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, chooseErr
	}
	gpuDelta := loadGPUAvg - idleGPUAvg
	if gpuDelta <= 0 {
		gpuDelta = loadGPUAvg
	}
	logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
	result.SelectedSource = selectedSource
	result.Status = "OK"
	var confidence float64
	selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
	for _, candidate := range candidates {
		if candidate.Selected {
			confidence = candidate.Confidence
			if strings.TrimSpace(candidate.SelectionNotes) != "" {
				selectionReason = candidate.SelectionNotes
			}
			break
		}
	}
	cfg := BenchmarkPowerAutotuneConfig{
		Version:           benchmarkPowerAutotuneVersion,
		UpdatedAt:         time.Now().UTC(),
		SelectedSource:    selectedSource,
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
		Confidence:        confidence,
		Reason:            selectionReason,
	}
	result.Config = &cfg
	configPath := BenchmarkPowerSourceConfigPath(baseDir)
	if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
		result.Status = "FAILED"
		result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
		if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
			return "", writeErr
		}
		return runDir, err
	}
	logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
	result.Notes = append(result.Notes, "saved autotune config to "+configPath)
	if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
		return "", err
	}
	return runDir, nil
}
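
// Hypothetical caller sketch (the sys value and log wiring are assumptions, not
// part of this change):
//   runDir, err := sys.RunNvidiaPowerSourceAutotune(ctx, "",
//       NvidiaBenchmarkOptions{Profile: NvidiaBenchmarkProfileOvernight},
//       "power-fit", func(line string) { log.Println(line) })
// An empty baseDir falls back to /var/log/bee-bench/autotune, and runDir points
// at the timestamped autotune-<ts> directory created above.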

func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return fmt.Errorf("marshal autotune result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return fmt.Errorf("write autotune result.json: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
		return fmt.Errorf("write autotune summary.txt: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
		return fmt.Errorf("write autotune report.md: %w", err)
	}
	return nil
}
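
// A completed autotune run therefore leaves, under the run directory:
//   autotune-load.log  raw output of the load command
//   result.json        the full BenchmarkPowerAutotuneResult
//   summary.txt        key=value summary from renderBenchmarkPowerAutotuneSummary
//   report.md          markdown report from renderBenchmarkPowerAutotuneReport
// plus verbose.log, assuming runSATCommandCtx writes its transcript there.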

func minInt(a, b int) int {
	if a < b {
		return a
	}
	return b
}

var _ = exec.ErrNotFound
@@ -43,6 +43,11 @@ const (
	NvidiaBenchmarkProfileOvernight = "overnight"
)

const (
	BenchmarkPowerEngineDCGMProfTester = "dcgmproftester"
	BenchmarkPowerEngineTargetedPower  = "targeted_power"
)

// Estimated wall-clock durations for benchmark runs, derived from real _v8 logs.
// Rule: when changing profile phase durations in resolveBenchmarkProfile(),
// re-measure from actual task logs and update the constants here.
@@ -61,7 +66,7 @@ const (
	BenchmarkEstimatedPerfStabilitySec = 5532 // ~92 min; ramp-up 1-8 measured
	BenchmarkEstimatedPerfOvernightSec = 8 * 3600

	// Power / Thermal Fit (dcgmproftester load + nvidia-smi power-limit binary search).
	// Duration is for the full ramp-up run; individual steps vary with convergence speed.
	BenchmarkEstimatedPowerStandardSec  = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
	BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
@@ -74,12 +79,84 @@ type NvidiaBenchmarkOptions struct {
	GPUIndices        []int
	ExcludeGPUIndices []int
	RunNCCL           bool
	ServerPowerSource string
	ParallelGPUs      bool   // run all selected GPUs simultaneously instead of sequentially
	RampStep          int    // 1-based step index within a ramp-up run (0 = not a ramp-up)
	RampTotal         int    // total number of ramp-up steps in this run
	RampRunID         string // shared identifier across all steps of the same ramp-up run
}

const (
	BenchmarkPowerSourceDCMI        = "dcmi"
	BenchmarkPowerSourceSDRPSUInput = "sdr_psu_input"
)

type BenchmarkPowerAutotuneConfig struct {
	Version           int       `json:"version"`
	UpdatedAt         time.Time `json:"updated_at"`
	SelectedSource    string    `json:"selected_source"`
	BenchmarkKind     string    `json:"benchmark_kind,omitempty"`
	Profile           string    `json:"profile,omitempty"`
	IdleDurationSec   int       `json:"idle_duration_sec,omitempty"`
	LoadDurationSec   int       `json:"load_duration_sec,omitempty"`
	SampleIntervalSec int       `json:"sample_interval_sec,omitempty"`
	Confidence        float64   `json:"confidence,omitempty"`
	Reason            string    `json:"reason,omitempty"`
}
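
// Illustrative on-disk shape of the saved config (all values are placeholders;
// the real version and durations come from the benchmarkPowerAutotune* constants):
//   {
//     "version": 1,
//     "updated_at": "2024-06-01T12:00:00Z",
//     "selected_source": "sdr_psu_input",
//     "benchmark_kind": "power-fit",
//     "idle_duration_sec": 60,
//     "load_duration_sec": 120,
//     "sample_interval_sec": 2,
//     "confidence": 0.93,
//     "reason": "selected because delta 3200 W is closest to GPU delta 3000 W (relative error 0.067)"
//   }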

type SystemPowerSourceDecision struct {
	Configured      bool      `json:"configured"`
	SelectedSource  string    `json:"selected_source,omitempty"`
	EffectiveSource string    `json:"effective_source,omitempty"`
	Mode            string    `json:"mode,omitempty"` // autotuned, fallback, degraded
	Reason          string    `json:"reason,omitempty"`
	ConfiguredAt    time.Time `json:"configured_at,omitempty"`
}

type BenchmarkPowerAutotuneResult struct {
	GeneratedAt         time.Time                         `json:"generated_at"`
	Hostname            string                            `json:"hostname,omitempty"`
	ServerModel         string                            `json:"server_model,omitempty"`
	BenchmarkKind       string                            `json:"benchmark_kind,omitempty"`
	Profile             string                            `json:"profile,omitempty"`
	Status              string                            `json:"status"`
	IdleDurationSec     int                               `json:"idle_duration_sec"`
	LoadDurationSec     int                               `json:"load_duration_sec"`
	SampleIntervalSec   int                               `json:"sample_interval_sec"`
	SelectedSource      string                            `json:"selected_source,omitempty"`
	IdleValidationError string                            `json:"idle_validation_error,omitempty"`
	IdleValidation      *BenchmarkPowerAutotuneValidation `json:"idle_validation,omitempty"`
	GPUPowerIdleW       float64                           `json:"gpu_power_idle_w,omitempty"`
	GPUPowerLoadW       float64                           `json:"gpu_power_load_w,omitempty"`
	Candidates          []BenchmarkPowerAutotuneCandidate `json:"candidates,omitempty"`
	Notes               []string                          `json:"notes,omitempty"`
	Config              *BenchmarkPowerAutotuneConfig     `json:"config,omitempty"`
}

type BenchmarkPowerAutotuneValidation struct {
	Valid          bool    `json:"valid"`
	GPUAvgUsagePct float64 `json:"gpu_avg_usage_pct,omitempty"`
	GPUP95UsagePct float64 `json:"gpu_p95_usage_pct,omitempty"`
	CPUAvgUsagePct float64 `json:"cpu_avg_usage_pct,omitempty"`
	CPUP95UsagePct float64 `json:"cpu_p95_usage_pct,omitempty"`
	GPUSamples     int     `json:"gpu_samples,omitempty"`
	CPUSamples     int     `json:"cpu_samples,omitempty"`
	Reason         string  `json:"reason,omitempty"`
}

type BenchmarkPowerAutotuneCandidate struct {
	Source         string  `json:"source"`
	IdleAvgW       float64 `json:"idle_avg_w,omitempty"`
	LoadAvgW       float64 `json:"load_avg_w,omitempty"`
	DeltaW         float64 `json:"delta_w,omitempty"`
	Samples        int     `json:"samples,omitempty"`
	RelativeError  float64 `json:"relative_error,omitempty"`
	Confidence     float64 `json:"confidence,omitempty"`
	Selected       bool    `json:"selected,omitempty"`
	Available      bool    `json:"available"`
	SelectionNotes string  `json:"selection_notes,omitempty"`
}

type NvidiaBenchmarkResult struct {
	BenchmarkVersion string    `json:"benchmark_version"`
	GeneratedAt      time.Time `json:"generated_at"`
@@ -294,12 +371,16 @@ type BenchmarkPSUSlotPower struct {
// - SDR — `ipmitool sdr` PSUx_POWER_IN/OUT; per-PSU, reliable
// - nvidia-smi — GPU self-reported via internal shunt; accurate for GPU load
type BenchmarkServerPower struct {
	Available         bool    `json:"available"`
	Source            string  `json:"source,omitempty"`
	Mode              string  `json:"mode,omitempty"`
	Reason            string  `json:"reason,omitempty"`
	SampleIntervalSec int     `json:"sample_interval_sec,omitempty"`
	IdleW             float64 `json:"idle_w,omitempty"`   // DCMI at idle
	LoadedW           float64 `json:"loaded_w,omitempty"` // DCMI at peak load
	DeltaW            float64 `json:"delta_w,omitempty"`  // DCMI loaded − idle
	GPUReportedSumW   float64 `json:"gpu_reported_sum_w,omitempty"`
	ReportingRatio    float64 `json:"reporting_ratio,omitempty"`

	// PSU AC input sum — sampled at idle and at peak load using collector's
	// slot patterns (PSU1_POWER_IN, PSU1_PIN, PS1 POut, Power1…).