@@ -94,9 +94,13 @@ var (
)
// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = [ ] string { "int8" , "fp8" , "fp16" , "fp32" , "fp64" , "fp4" }
//
// fp64 and fp4 are intentionally disabled for now: both are currently unstable
// on the target fleet and can abort the mixed steady stage after the earlier
// phases already collected useful telemetry.
var benchmarkPrecisionPhases = [ ] string { "int8" , "fp8" , "fp16" , "fp32" }
func computeCapabilityCode ( raw string ) int {
raw = strings . TrimSpace ( raw )
@@ -124,6 +128,15 @@ func benchmarkSupportedPrecisions(computeCapability string) []string {
return out
}
// benchmarkPrecisionEnabled reports whether the given precision category is
// currently allowed to contribute to benchmark scoring. fp64 and fp4 are
// intentionally excluded (see the note on benchmarkPrecisionPhases).
func benchmarkPrecisionEnabled(category string) bool {
	enabled := []string{"int8", "fp8", "fp16", "fp16_bf16", "fp32", "fp32_tf32"}
	for _, name := range enabled {
		if category == name {
			return true
		}
	}
	return false
}
func buildBenchmarkSteadyPlan ( spec benchmarkProfileSpec , precisions [ ] string , metricStage func ( string ) string ) ( planLabels [ ] string , planPhases [ ] benchmarkPlannedPhase , basePhaseSec int , mixedPhaseSec int ) {
if len ( precisions ) == 0 {
precisions = append ( [ ] string ( nil ) , benchmarkPrecisionPhases ... )
@@ -514,6 +527,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
appendBenchmarkMetrics ( & metricRows , cooldownRows , fmt . Sprintf ( "gpu-%d-cooldown" , idx ) , & metricTimelineSec , float64 ( spec . CooldownSec ) )
}
applyBenchmarkSteadyFallback ( & gpuResult )
gpuResult . Scores = scoreBenchmarkGPUResult ( gpuResult )
gpuResult . DegradationReasons = detectBenchmarkDegradationReasons ( gpuResult , result . Normalization . Status )
if anomaly := detectPowerAnomaly ( metricRows , idx ) ; anomaly != "" {
@@ -1398,19 +1412,58 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
return summary
}
// benchmarkTelemetryAvailable reports whether the summary carries any real
// telemetry: at least one collected sample or a nonzero measured duration.
func benchmarkTelemetryAvailable(summary BenchmarkTelemetrySummary) bool {
	if summary.Samples > 0 {
		return true
	}
	return summary.DurationSec > 0
}
// benchmarkPrecisionSteadyFallback scans the per-precision steady phases and
// picks the most representative one to stand in for a missing mixed steady
// summary. Preference order: longest steady window first; on equal duration,
// the higher p95 power draw wins. It returns the chosen summary, the
// precision label it came from, and whether any usable phase was found.
func benchmarkPrecisionSteadyFallback(phases []BenchmarkPrecisionSteadyPhase) (BenchmarkTelemetrySummary, string, bool) {
	var (
		best      BenchmarkTelemetrySummary
		bestLabel string
		found     bool
	)
	for _, phase := range phases {
		candidate := phase.Steady
		if !benchmarkTelemetryAvailable(candidate) {
			continue
		}
		better := !found
		if !better {
			switch {
			case candidate.DurationSec > best.DurationSec:
				better = true
			case candidate.DurationSec == best.DurationSec && candidate.P95PowerW > best.P95PowerW:
				better = true
			}
		}
		if better {
			best, bestLabel, found = candidate, phase.Precision, true
		}
	}
	return best, bestLabel, found
}
// applyBenchmarkSteadyFallback substitutes the best per-precision steady
// summary when the mixed steady window produced no telemetry. It is a no-op
// when gpu is nil or the mixed summary already has data. When a fallback is
// applied, a note recording the source precision phase is appended to
// gpu.Notes.
func applyBenchmarkSteadyFallback(gpu *BenchmarkGPUResult) {
	if gpu == nil {
		return
	}
	if benchmarkTelemetryAvailable(gpu.Steady) {
		return
	}
	fallback, label, ok := benchmarkPrecisionSteadyFallback(gpu.PrecisionSteady)
	if !ok {
		return
	}
	gpu.Steady = fallback
	gpu.Notes = append(gpu.Notes,
		fmt.Sprintf("mixed steady telemetry unavailable; reporting steady-state fallback from %s precision phase", label))
}
func scoreBenchmarkGPUResult ( gpu BenchmarkGPUResult ) BenchmarkScorecard {
score := BenchmarkScorecard { }
// SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases.
// Each precision ran alone with full GPU dedicated — peak capability.
for _ , p := range gpu . PrecisionSteady {
if ! benchmarkPrecisionEnabled ( p . Precision ) {
continue
}
score . SyntheticScore += p . WeightedTeraOpsPerSec
}
// MixedScore: sum of fp32-equivalent TOPS from the combined phase.
// All precisions compete simultaneously — closer to real inference workloads.
for _ , p := range gpu . PrecisionResults {
if p . Supported {
if p . Supported && benchmarkPrecisionEnabled ( p . Category ) {
score . MixedScore += p . WeightedTeraOpsPerSec
}
}
@@ -1441,10 +1494,17 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
// so CV reflects genuine power regulation, not workload switching).
if len ( gpu . PrecisionSteady ) > 0 {
var sum float64
var count int
for _ , p := range gpu . PrecisionSteady {
if ! benchmarkPrecisionEnabled ( p . Precision ) {
continue
}
sum += clampScore ( 100 - p . Steady . PowerCVPct * 3 )
count ++
}
if count > 0 {
score . PowerSustainScore = sum / float64 ( count )
}
score . PowerSustainScore = sum / float64 ( len ( gpu . PrecisionSteady ) )
} else if gpu . Steady . PowerCVPct > 0 {
score . PowerSustainScore = clampScore ( 100 - gpu . Steady . PowerCVPct * 3 )
}
@@ -2512,6 +2572,7 @@ func runNvidiaBenchmarkParallel(
// Score and finalize each GPU.
for _ , idx := range selected {
r := gpuResults [ idx ]
applyBenchmarkSteadyFallback ( r )
r . Scores = scoreBenchmarkGPUResult ( * r )
r . DegradationReasons = detectBenchmarkDegradationReasons ( * r , result . Normalization . Status )
pr := parseResults [ idx ]
@@ -2694,18 +2755,21 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
return cl
}
// runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches
// throttle counters. If a GPU starts throttling, the current targeted_power run
// is canceled immediately, the power limit is reduced, and a fresh full cycle
// is started again from the beginning. The selected reduced power limit stays
// active for the main benchmark and is restored by the caller afterwards.
// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
// actively watches throttle counters. seedLimits, when provided, are treated as
// the starting point for this calibration pass rather than as immutable fixed
// limits. This matters during cumulative ramp-up: once an additional GPU is
// introduced, every already-active GPU must be revalidated under the new
// thermal state instead of assuming its previous single-step limit is still
// valid. The selected reduced power limits stay active for the main benchmark
// and are restored by the caller afterwards.
func runBenchmarkPowerCalibration (
ctx context . Context ,
verboseLog , runDir string ,
gpuIndices [ ] int ,
infoByIndex map [ int ] benchmarkGPUInfo ,
logFunc func ( string ) ,
fix edLimits map [ int ] int ,
se edLimits map [ int ] int ,
) ( map [ int ] benchmarkPowerCalibrationResult , [ ] benchmarkRestoreAction ) {
const calibDurationSec = 120
const maxDerateW = 150
@@ -2739,7 +2803,6 @@ func runBenchmarkPowerCalibration(
err error
}
// gpuCalibState holds per-GPU binary search state during parallel calibration.
type gpuCalibState struct {
idx int
@@ -2796,19 +2859,20 @@ func runBenchmarkPowerCalibration(
hi : appliedLimitW + 1 , // not yet tested, not yet confirmed unstable
calib : benchmarkPowerCalibrationResult { AppliedPowerLimitW : float64 ( appliedLimitW ) } ,
}
if fix edLimits != nil {
if fix edW, ok := fix edLimits[ idx ] ; ok {
// This GPU's limit was established in a prior ramp step and must
// remain unchanged. Apply it immediately and skip the binary search.
if canDerate && fixedW > 0 {
_ = setBenchmarkPowerLimit ( ctx , verboseLog , idx , fixedW )
if se edLimits != nil {
if se edW, ok := se edLimits[ idx ] ; ok && seedW > 0 {
// A previously validated limit is only a starting point. Re-run
// targeted_power under the current multi-GPU thermal load and derate
// again if this step shows new throttling.
if canDerate {
_ = setBenchmarkPowerLimit ( ctx , verboseLog , idx , seedW )
}
s . appliedLimitW = fix edW
s . calib . AppliedPowerLimitW = float64 ( fixedW )
s . calib . Completed = true
s . converged = true
s . appliedLimitW = se edW
s . hi = seedW + 1
s . calib . AppliedPowerLimitW = float64 ( seedW )
s . calib . Derated = seedW < s . originalLimitW
s . calib . Notes = append ( s . calib . Notes ,
fmt . Sprintf ( "fix ed limit: %d W (held from prior ramp step )" , fix edW) )
fmt . Sprintf ( "se ed limit: %d W (revalidating under current thermal load )" , se edW) )
}
}
states = append ( states , s )
@@ -3091,7 +3155,6 @@ func powerBenchDurationSec(profile string) int {
}
}
func cloneBenchmarkGPUInfoMap ( src map [ int ] benchmarkGPUInfo ) map [ int ] benchmarkGPUInfo {
out := make ( map [ int ] benchmarkGPUInfo , len ( src ) )
for k , v := range src {
@@ -3107,7 +3170,42 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fmt . Fprintf ( & b , "**Profile:** %s \n" , result . BenchmarkProfile )
fmt . Fprintf ( & b , "**Generated:** %s \n" , result . GeneratedAt . Format ( "2006-01-02 15:04:05 UTC" ) )
fmt . Fprintf ( & b , "**Overall status:** %s \n" , result . OverallStatus )
fmt . Fprintf ( & b , "**Platform max TDP:** %.0f W \n \n" , result . PlatformMaxTDPW )
fmt . Fprintf ( & b , "**Platform max TDP (GPU-reported) :** %.0f W \n" , result . PlatformMaxTDPW )
if sp := result . ServerPower ; sp != nil && sp . Available {
fmt . Fprintf ( & b , "**Server power delta (IPMI):** %.0f W \n" , sp . DeltaW )
fmt . Fprintf ( & b , "**Reporting ratio (IPMI Δ / GPU sum):** %.2f \n" , sp . ReportingRatio )
}
b . WriteString ( "\n" )
// Server power comparison table.
if sp := result . ServerPower ; sp != nil {
b . WriteString ( "## Server vs GPU Power Comparison\n\n" )
b . WriteString ( "| Metric | Value |\n" )
b . WriteString ( "|--------|-------|\n" )
fmt . Fprintf ( & b , "| GPU stable limits sum (nvidia-smi) | %.0f W |\n" , result . PlatformMaxTDPW )
if sp . Available {
fmt . Fprintf ( & b , "| Server idle power (IPMI) | %.0f W |\n" , sp . IdleW )
fmt . Fprintf ( & b , "| Server loaded power (IPMI) | %.0f W |\n" , sp . LoadedW )
fmt . Fprintf ( & b , "| Server Δ power (loaded − idle) | %.0f W |\n" , sp . DeltaW )
ratio := sp . ReportingRatio
ratioNote := ""
switch {
case ratio >= 0.9 :
ratioNote = "✓ GPU telemetry matches server power"
case ratio >= 0.75 :
ratioNote = "⚠ minor discrepancy — GPU may slightly over-report TDP"
default :
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
}
fmt . Fprintf ( & b , "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n" , ratio , ratioNote )
} else {
b . WriteString ( "| IPMI availability | not available — IPMI not supported or ipmitool not found |\n" )
}
for _ , note := range sp . Notes {
fmt . Fprintf ( & b , "\n> %s\n" , note )
}
b . WriteString ( "\n" )
}
if len ( result . Findings ) > 0 {
b . WriteString ( "## Summary\n\n" )
for _ , finding := range result . Findings {
@@ -3181,6 +3279,12 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
fmt . Fprintf ( & b , "gpu_%d_stable_limit_w=%.0f\n" , gpu . Index , gpu . StablePowerLimitW )
}
}
if sp := result . ServerPower ; sp != nil && sp . Available {
fmt . Fprintf ( & b , "server_idle_w=%.0f\n" , sp . IdleW )
fmt . Fprintf ( & b , "server_loaded_w=%.0f\n" , sp . LoadedW )
fmt . Fprintf ( & b , "server_delta_w=%.0f\n" , sp . DeltaW )
fmt . Fprintf ( & b , "server_reporting_ratio=%.2f\n" , sp . ReportingRatio )
}
return b . String ( )
}
@@ -3224,6 +3328,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
durationSec := powerBenchDurationSec ( opts . Profile )
_ = durationSec
// Sample IPMI idle power before any GPU load.
var serverIdleW float64
var serverIdleOK bool
if w , ok := sampleIPMIPowerSeries ( ctx , 10 ) ; ok {
serverIdleW = w
serverIdleOK = true
logFunc ( fmt . Sprintf ( "server idle power (IPMI): %.0f W" , w ) )
}
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make ( map [ int ] benchmarkPowerCalibrationResult , len ( selected ) )
@@ -3320,20 +3434,35 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
stableLimits := make ( map [ int ] int , len ( result . RecommendedSlotOrder ) )
// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
// server-side loaded power while GPUs are under stress. The goroutine is
// cancelled as soon as Phase 2 finishes, and the average is used to compare
// against PlatformMaxTDPW (GPU-reported stable limits sum).
var serverLoadedW float64
var serverLoadedOK bool
ipmiPhase2Ctx , ipmiPhase2Cancel := context . WithCancel ( ctx )
ipmiPhase2Done := make ( chan float64 , 1 )
go func ( ) {
defer close ( ipmiPhase2Done )
if w , ok := sampleIPMIPowerSeries ( ipmiPhase2Ctx , 3600 ) ; ok {
ipmiPhase2Done <- w
}
} ( )
// Step 1: reuse single-card calibration result directly.
if len ( result . RecommendedSlotOrder ) > 0 {
firstIdx := result . RecommendedSlotOrder [ 0 ]
firstCalib := calibByIndex [ firstIdx ]
stableLimits [ firstIdx ] = int ( math . Round ( firstCalib . AppliedPowerLimitW ) )
ramp := NvidiaPowerBenchStep {
StepIndex : 1 ,
GPUIndices : [ ] int { firstIdx } ,
NewGPUIndex : firstIdx ,
NewGPUStableLimitW : firstCalib . AppliedPowerLimitW ,
StepIndex : 1 ,
GPUIndices : [ ] int { firstIdx } ,
NewGPUIndex : firstIdx ,
NewGPUStableLimitW : firstCalib . AppliedPowerLimitW ,
TotalObservedPowerW : firstCalib . Summary . P95PowerW ,
AvgObservedPowerW : firstCalib . Summary . P95PowerW ,
Derated : firstCalib . Derated ,
Status : "OK" ,
Derated : firstCalib . Derated ,
Status : "OK" ,
}
if ! firstCalib . Completed {
ramp . Status = "FAILED"
@@ -3351,8 +3480,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
len ( result . RecommendedSlotOrder ) , firstIdx , firstCalib . AppliedPowerLimitW ) )
}
// Steps 2..N: each step fixes previously calibrated GPUs and searches only
// the new GPU's stable limit in the combined thermal environment.
// Steps 2..N: each step revalidates every already-active GPU under the new
// cumulative thermal environment and also calibrates the newly introduced
// GPU. Previously found limits are used only as seeds for the search.
for stepNum := 1 ; stepNum < len ( result . RecommendedSlotOrder ) ; stepNum ++ {
step := stepNum + 1
subset := append ( [ ] int ( nil ) , result . RecommendedSlotOrder [ : step ] ... )
@@ -3360,17 +3490,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
stepDir := filepath . Join ( runDir , fmt . Sprintf ( "step-%02d" , step ) )
_ = os . MkdirAll ( stepDir , 0755 )
// All previously calibrated GPUs are fixed at their stable limits.
fixedForStep := make ( map [ int ] int , len ( stableLimits ) )
// Reuse the latest stable limits as starting points, but re-check every
// active GPU in this hotter configuration.
seedForStep := make ( map [ int ] int , len ( stableLimits ) )
for k , v := range stableLimits {
fix edForStep[ k ] = v
se edForStep[ k ] = v
}
logFunc ( fmt . Sprintf ( "power ramp: step %d/%d — c alibr ating GPU %d with %d fixed GPU(s) " ,
step , len ( result . RecommendedSlotOrder ) , newGPUIdx , len ( fixedForStep ) ) )
logFunc ( fmt . Sprintf ( "power ramp: step %d/%d — rev alid ating %d active GPU(s) including new GPU %d " ,
step , len ( result . RecommendedSlotOrder ) , len ( subset ) , newGPUIdx ) )
stepInfo := cloneBenchmarkGPUInfoMap ( infoByIndex )
stepCalib , stepRestore := runBenchmarkPowerCalibration ( ctx , verboseLog , stepDir , subset , stepInfo , logFunc , fix edForStep)
stepCalib , stepRestore := runBenchmarkPowerCalibration ( ctx , verboseLog , stepDir , subset , stepInfo , logFunc , se edForStep)
// Accumulate restore actions; they all run in the outer defer.
allRestoreActions = append ( allRestoreActions , stepRestore ... )
@@ -3391,36 +3522,72 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ramp . AvgObservedPowerW = ramp . TotalObservedPowerW / float64 ( len ( subset ) )
}
// Determine stable limit for the new GPU.
if c , ok := stepCalib [ newGPUIdx ] ; ok && c . Completed {
stableLimits [ newGPUIdx ] = int ( math . Round ( c . AppliedPowerLimitW ) )
ramp . NewGPUStableLimitW = c . AppliedPowerLimitW
ramp . Derated = c . Derated
for _ , idx := range subset {
c , ok := stepCalib [ idx ]
if ! ok || ! c . Completed {
fallback : = 0
if lim , ok := stableLimits [ idx ] ; ok && lim > 0 {
fallback = lim
} else if fb , ok := calibByIndex [ idx ] ; ok {
fallback = int ( math . Round ( fb . AppliedPowerLimitW ) )
}
if fallback > 0 {
stableLimits [ idx ] = fallback
}
ramp . Status = "FAILED"
ramp . Notes = append ( ramp . Notes ,
fmt . Sprintf ( "GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W" , idx , step , fallback ) )
result . OverallStatus = "PARTIAL"
continue
}
prevLimit , hadPrev := stableLimits [ idx ]
newLimit := int ( math . Round ( c . AppliedPowerLimitW ) )
stableLimits [ idx ] = newLimit
if idx == newGPUIdx {
ramp . NewGPUStableLimitW = c . AppliedPowerLimitW
ramp . Derated = c . Derated
}
if c . Derated {
ramp . Status = "PARTIAL"
if result . OverallStatus == "OK" {
result . OverallStatus = "PARTIAL"
}
result . Findings = append ( result . Findings , fmt . Sprintf ( "Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load." , step , newGPUIdx , c . AppliedPowerLimitW ) )
}
} else {
// Calibration failed — fall back to single-card limit.
fb := calibByIndex [ newGPUIdx ]
stableLimits [ newGPUIdx ] = int ( math . Round ( fb . AppliedPowerLimitW ) )
ramp . NewGPUStableLimitW = fb . AppliedPowerLimitW
ramp . Status = "FAILED"
ramp . Notes = append ( ramp . Notes , fmt . Sprintf ( "GPU %d did not complete targeted_power in ramp step %d; using single-card limit %.0f W" , newGPUIdx , step , fb . AppliedPowerLimitW ) )
result . OverallStatus = "PARTIAL"
if hadPrev && newLimit < prevLimit {
ramp . Notes = append ( ramp . Notes ,
fmt . Sprintf ( "GPU %d was re-derated from %d W to %d W under combined thermal load." , idx , prevLimit , newLimit ) )
}
}
if c , ok := stepCalib [ newGPUIdx ] ; ok && c . Completed && c . Derated {
result . Findings = append ( result . Findings , fmt . Sprintf ( "Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load." , step , newGPUIdx , c . AppliedPowerLimitW ) )
}
result . RampSteps = append ( result . RampSteps , ramp )
}
// Stop IPMI Phase 2 sampling and collect result.
ipmiPhase2Cancel ( )
if w , ok := <- ipmiPhase2Done ; ok {
serverLoadedW = w
serverLoadedOK = true
logFunc ( fmt . Sprintf ( "server loaded power (IPMI, Phase 2 avg): %.0f W" , w ) )
}
// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
for i := range result . GPUs {
if lim , ok := stableLimits [ result . GPUs [ i ] . Index ] ; ok {
result . GPUs [ i ] . StablePowerLimitW = float64 ( lim )
}
if result . GPUs [ i ] . StablePowerLimitW > 0 && result . GPUs [ i ] . AppliedPowerLimitW > 0 &&
result . GPUs [ i ] . StablePowerLimitW < result . GPUs [ i ] . AppliedPowerLimitW {
result . GPUs [ i ] . Derated = true
result . Findings = append ( result . Findings , fmt . Sprintf (
"GPU %d required additional derating from %.0f W (single-card) to %.0f W under full-system thermal load." ,
result . GPUs [ i ] . Index , result . GPUs [ i ] . AppliedPowerLimitW , result . GPUs [ i ] . StablePowerLimitW ,
) )
}
}
// PlatformMaxTDPW = sum of all stable limits — the actual sustained power
@@ -3428,6 +3595,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
for _ , lim := range stableLimits {
result . PlatformMaxTDPW += float64 ( lim )
}
// Characterize server power from IPMI idle/loaded samples.
// GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
// ReportingRatio = IPMI_delta / GPU_reported_sum:
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
_ = serverIdleOK // used implicitly via characterizeServerPower
result . ServerPower = characterizeServerPower ( serverIdleW , serverLoadedW , result . PlatformMaxTDPW , serverIdleOK && serverLoadedOK )
resultJSON , err := json . MarshalIndent ( result , "" , " " )
if err != nil {
return "" , fmt . Errorf ( "marshal power result: %w" , err )