@@ -76,7 +76,56 @@ var (
// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
// buildBenchmarkSteadyPlan assembles the ordered steady-state phase plan for
// a benchmark profile: one window per precision category followed by a final
// combined ("mixed") window. metricStage maps a plan label (a precision name
// or "mixed") to the telemetry stage name used when recording its rows.
// Known profiles use fixed durations; any other profile splits spec.SteadySec
// across the precision phases plus a mixed slot weighted 5x.
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
	switch spec.Name {
	case NvidiaBenchmarkProfileStandard:
		basePhaseSec, mixedPhaseSec = 60, 300
	case NvidiaBenchmarkProfileStability:
		basePhaseSec, mixedPhaseSec = 300, 3600
	case NvidiaBenchmarkProfileOvernight:
		basePhaseSec, mixedPhaseSec = 3600, 14400
	default:
		// The mixed window counts as 5 slots so it gets 5x the
		// per-precision budget.
		totalWeight := len(benchmarkPrecisionPhases) + 5
		if totalWeight <= 0 {
			return nil, nil, 0, 0
		}
		basePhaseSec = spec.SteadySec / totalWeight
		if basePhaseSec <= 0 {
			basePhaseSec = 1
		}
		mixedPhaseSec = basePhaseSec * 5
	}
	capacity := len(benchmarkPrecisionPhases) + 1
	planLabels = make([]string, 0, capacity)
	planPhases = make([]benchmarkPlannedPhase, 0, capacity)
	addPhase := func(label string, durationSec int) {
		planLabels = append(planLabels, label)
		planPhases = append(planPhases, benchmarkPlannedPhase{
			PlanLabel:   label,
			MetricStage: metricStage(label),
			DurationSec: durationSec,
		})
	}
	for _, prec := range benchmarkPrecisionPhases {
		addPhase(prec, basePhaseSec)
	}
	addPhase("mixed", mixedPhaseSec)
	return planLabels, planPhases, basePhaseSec, mixedPhaseSec
}
// benchmarkPlanDurationsCSV renders the duration (in seconds) of each planned
// phase as a comma-separated list, preserving phase order.
func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
	var b strings.Builder
	for i, phase := range phases {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(strconv.Itoa(phase.DurationSec))
	}
	return b.String()
}
func ( s * System ) RunNvidiaBenchmark ( ctx context . Context , baseDir string , opts NvidiaBenchmarkOptions , logFunc func ( string ) ) ( string , error ) {
if ctx == nil {
@@ -233,42 +282,42 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
continue
}
// ── Per-precision stability phases ────────────────────────────────────────
// Run each precision category alone so PowerCVPct reflects genuine GPU
// power stability, not kernel-mix variance.
// Time budget: each phase gets steadySec/numPhases, minimum 60 s.
// SteadySec is split equally across all precision phases + 1 combined slot.
// Skipped phases (unsupported precision) are simply omitted; combined is fixed.
totalSlots := len ( benchmarkPrecisionPhases ) + 1
perPhaseSec := spec . SteadySec / totalSlots
if perPhaseSec < 60 {
perPhaseSec = 60
}
// Run synthetic precision phases and the combined steady phase as one
// uninterrupted command so the GPU stays hot between windows.
eccBase , _ := queryECCCounters ( idx )
for _ , prec := range benchmarkPrecisionPhases {
phaseCmd : = [ ] string {
"bee-gpu-burn" ,
"--seconds" , strconv . Itoa ( perPhaseSec ) ,
"--size-mb" , strconv . Itoa ( opts . SizeMB ) ,
"--devices" , strconv . Itoa ( idx ) ,
"--precision" , prec ,
planLabels , planPhases , basePhaseSec , mixedPhaseSec := buildBenchmarkSteadyPlan ( spec , func ( label string ) string {
if label = = "mixed" {
return fmt . Sprintf ( "gpu-%d-steady" , idx )
}
logFunc ( fmt . Sprintf ( "GPU %d: %s stability phase (%ds)" , idx , prec , perPhaseSec ) )
phaseLogName := fmt . Sprintf ( "gpu-%d-steady-%s" , idx , prec )
eccBefore , _ := queryECCCounters ( idx )
phaseOut , phaseRows , phaseErr := runBenchmarkCommandWithMetrics ( ctx , verboseLog , phaseLogName + ".log" , phaseCmd , nil , [ ] int { idx } , logFunc )
appendBenchmarkMetrics ( & metricRows , phaseRows , phaseLogName )
appendBenchmarkStageLog ( gpuBurnLog , "bee-gpu-burn" , phaseLogName , phaseOut )
eccAfter , _ := queryECCCounters ( idx )
if phaseErr != nil || len ( phaseRows ) == 0 {
return fmt . Sprintf ( "gpu-%d-steady-%s" , idx , label )
} )
planCmd := [ ] string {
"bee-gpu-burn" ,
"--seconds" , strconv . Itoa ( basePhaseSec ) ,
"--size-mb" , strconv . Itoa ( opts . SizeMB ) ,
"--devices" , strconv . Itoa ( idx ) ,
"--precision-plan" , strings . Join ( planLabels , "," ) ,
"--precision-plan-seconds" , benchmarkPlanDurationsCSV ( planPhases ) ,
}
logFunc ( fmt . Sprintf ( "GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)" , idx , len ( benchmarkPrecisionPhases ) , basePhaseSec , mixedPhaseSec ) )
_ , phaseRowsByStage , phaseLogs , planErr := runBenchmarkPlannedCommandWithMetrics ( ctx , verboseLog , fmt . Sprintf ( "gpu-%d-precision-plan.log" , idx ) , planCmd , nil , [ ] int { idx } , planPhases , logFunc )
for _ , phaseSpec := range planPhases {
if rows := phaseRowsByStage [ phaseSpec . MetricStage ] ; len ( rows ) > 0 {
appendBenchmarkMetrics ( & metricRows , rows , phaseSpec . MetricStage )
}
appendBenchmarkStageLog ( gpuBurnLog , "bee-gpu-burn" , phaseSpec . MetricStage , phaseLogs [ phaseSpec . PlanLabel ] )
}
for _ , prec := range benchmarkPrecisionPhases {
stageName := fmt . Sprintf ( "gpu-%d-steady-%s" , idx , prec )
phaseRows := phaseRowsByStage [ stageName ]
if len ( phaseRows ) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase {
Precision : prec ,
Steady : summarizeBenchmarkTelemetry ( phaseRows ) ,
ECC : diffECCCounters ( eccBefore , eccAfter ) ,
}
for _ , p := range parseBenchmarkBurnLog ( string ( phaseOut ) ) . Profiles {
for _ , p := range parseBenchmarkBurnLog ( string ( phaseLogs [ prec ] ) ) . Profiles {
if p . Supported {
phase . TeraOpsPerSec += p . TeraOpsPerSec
phase . WeightedTeraOpsPerSec += p . WeightedTeraOpsPerSec
@@ -278,13 +327,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
}
beforeThrottle , _ := queryThrottleCounters ( idx )
steadyCmd := [ ] string {
"bee-gpu-burn" ,
"--seconds" , strconv . Itoa ( perPhaseSec ) ,
"--size-mb" , strconv . Itoa ( opts . SizeMB ) ,
"--devices" , strconv . Itoa ( idx ) ,
}
logFunc ( fmt . Sprintf ( "GPU %d: steady compute (combined, %ds)" , idx , perPhaseSec ) )
logFunc ( fmt . Sprintf ( "GPU %d: steady compute (combined, %ds)" , idx , mixedPhaseSec ) )
// Sample server power via IPMI in parallel with the steady phase.
// We collect readings every 5s and average them.
@@ -320,9 +363,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
}
} ( )
steadyOut , steadyRows , steadyErr := runBenchmarkCommandWithMetrics ( ctx , verboseLog , fmt . Sprintf ( "gpu-%d-steady.log" , idx ) , steadyCmd , nil , [ ] int { idx } , logFunc )
appendBenchmarkMetrics ( & metricRows , steadyRows , fmt . Sprintf ( "gpu-%d-steady" , idx ) )
appendBenchmarkStageLog ( gpuBurnLog , "bee-gpu-burn" , fmt . Sprintf ( "gpu-%d-steady" , idx ) , steadyOut )
close ( ipmiStopCh )
if loadedW , ok := <- ipmiResultCh ; ok {
serverLoadedWSum += loadedW
@@ -331,11 +371,12 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
logFunc ( fmt . Sprintf ( "GPU %d: server loaded power (IPMI): %.0f W" , idx , loadedW ) )
}
afterThrottle , _ := queryThrottleCounters ( idx )
if steady Err != nil {
gpuResult . Notes = append ( gpuResult . Notes , "steady compute failed: " + steady Err. Error ( ) )
if plan Err != nil {
gpuResult . Notes = append ( gpuResult . Notes , "precision plan failed: " + plan Err. Error ( ) )
}
parseResult := parseBenchmarkBurnLog ( st ring ( steadyOut ) )
steadyRows := phaseRowsByStage [ fmt . Sp rintf ( "gpu-%d-steady" , idx ) ]
parseResult := parseBenchmarkBurnLog ( string ( phaseLogs [ "mixed" ] ) )
gpuResult . ComputeCapability = parseResult . ComputeCapability
gpuResult . Backend = parseResult . Backend
gpuResult . PrecisionResults = parseResult . Profiles
@@ -349,17 +390,19 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult . ECC = diffECCCounters ( eccBase , eccFinal )
}
cooldownRows , err := collectBenchmarkSamples ( ctx , spec . CooldownSec , [ ] int { idx } )
if err ! = nil && err != context . Canceled {
gpuResult . Notes = append ( gpuResult . Notes , "cooldown sampling failed: " + err . Error ( ) )
if spec . CooldownSec > 0 {
cooldownRows , err : = collectBenchmarkSamples ( ctx , spec . CooldownSec , [ ] int { idx } )
if err ! = nil && err != context . Canceled {
gpuResult . Notes = append ( gpuResult . Notes , "cooldown sampling failed: " + err . Error ( ) )
}
gpuResult . Cooldown = summarizeBenchmarkTelemetry ( cooldownRows )
appendBenchmarkMetrics ( & metricRows , cooldownRows , fmt . Sprintf ( "gpu-%d-cooldown" , idx ) )
}
gpuResult . Cooldown = summarizeBenchmarkTelemetry ( cooldownRows )
appendBenchmarkMetrics ( & metricRows , cooldownRows , fmt . Sprintf ( "gpu-%d-cooldown" , idx ) )
gpuResult . Scores = scoreBenchmarkGPUResult ( gpuResult )
gpuResult . DegradationReasons = detectBenchmarkDegradationReasons ( gpuResult , result . Normalization . Status )
if steady Err != nil {
gpuResult . Status = classifySATErrorStatus ( steadyOut , steady Err)
if plan Err != nil {
gpuResult . Status = classifySATErrorStatus ( phaseLogs [ "mixed" ] , plan Err)
} else if parseResult . Fallback {
gpuResult . Status = "PARTIAL"
} else {
@@ -462,11 +505,11 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
func resolveBenchmarkProfile ( profile string ) benchmarkProfileSpec {
switch strings . TrimSpace ( strings . ToLower ( profile ) ) {
case NvidiaBenchmarkProfileStability :
return benchmarkProfileSpec { Name : NvidiaBenchmarkProfileStability , BaselineSec : 30 , WarmupSec : 30 0, SteadySec : 3600 , NCCLSec : 300 , CooldownSec : 30 0}
return benchmarkProfileSpec { Name : NvidiaBenchmarkProfileStability , BaselineSec : 30 , WarmupSec : 12 0, SteadySec : 3600 , NCCLSec : 300 , CooldownSec : 0 }
case NvidiaBenchmarkProfileOvernight :
return benchmarkProfileSpec { Name : NvidiaBenchmarkProfileOvernight , BaselineSec : 60 , WarmupSec : 60 0, SteadySec : 27000 , NCCLSec : 600 , CooldownSec : 30 0}
return benchmarkProfileSpec { Name : NvidiaBenchmarkProfileOvernight , BaselineSec : 60 , WarmupSec : 18 0, SteadySec : 27000 , NCCLSec : 600 , CooldownSec : 0 }
default :
return benchmarkProfileSpec { Name : NvidiaBenchmarkProfileStandard , BaselineSec : 15 , WarmupSec : 120 , SteadySec : 480 , NCCLSec : 180 , CooldownSec : 12 0}
return benchmarkProfileSpec { Name : NvidiaBenchmarkProfileStandard , BaselineSec : 15 , WarmupSec : 45 , SteadySec : 480 , NCCLSec : 180 , CooldownSec : 0 }
}
}
@@ -795,6 +838,66 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string
return out , metricRows , err
}
// benchmarkPlannedPhase describes one window of an uninterrupted
// bee-gpu-burn precision plan.
type benchmarkPlannedPhase struct {
	// PlanLabel is the label passed in --precision-plan (a precision name
	// or "mixed").
	PlanLabel string
	// MetricStage is the telemetry stage name rows from this window are
	// recorded under.
	MetricStage string
	// DurationSec is how long this window runs, in seconds.
	DurationSec int
}
// runBenchmarkPlannedCommandWithMetrics runs a multi-phase bee-gpu-burn
// command and splits its results per phase: metric rows are bucketed by the
// phase's MetricStage, and the raw log is chunked by PlanLabel using the
// phase markers in the output. The full combined output and any command
// error are returned alongside the per-phase maps.
func runBenchmarkPlannedCommandWithMetrics(
	ctx context.Context,
	verboseLog, name string,
	cmd []string,
	env []string,
	gpuIndices []int,
	phases []benchmarkPlannedPhase,
	logFunc func(string),
) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) {
	out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc)
	rowsByStage := splitBenchmarkRowsByPlannedPhase(rows, phases)
	logsByLabel := splitBenchmarkLogByPlannedPhase(out)
	return out, rowsByStage, logsByLabel, err
}
// splitBenchmarkRowsByPlannedPhase buckets telemetry rows into the planned
// phase whose cumulative time window contains the row's elapsed time. Rows
// beyond the final window are attributed to the last phase. Buckets are
// keyed by each phase's MetricStage name.
func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow {
	out := make(map[string][]GPUMetricRow, len(phases))
	if len(rows) == 0 || len(phases) == 0 {
		return out
	}
	// Precompute each phase's cumulative end time in seconds since start,
	// treating non-positive durations as one second so no window is empty.
	ends := make([]float64, len(phases))
	var total float64
	for i, phase := range phases {
		d := phase.DurationSec
		if d <= 0 {
			d = 1
		}
		total += float64(d)
		ends[i] = total
	}
	for _, row := range rows {
		target := len(phases) - 1
		for i, end := range ends {
			if row.ElapsedSec < end {
				target = i
				break
			}
		}
		stage := phases[target].MetricStage
		out[stage] = append(out[stage], row)
	}
	return out
}
// splitBenchmarkLogByPlannedPhase splits raw bee-gpu-burn output into
// per-phase chunks using the "phase_begin=<label>" / "phase_end=" markers in
// the (prefix-stripped) log lines. Lines outside any phase are dropped;
// lines inside a phase keep their original text plus a trailing newline.
func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte {
	out := make(map[string][]byte)
	normalized := strings.ReplaceAll(string(raw), "\r\n", "\n")
	current := ""
	for _, line := range strings.Split(normalized, "\n") {
		trimmed := strings.TrimSpace(stripBenchmarkPrefix(line))
		if strings.HasPrefix(trimmed, "phase_begin=") {
			current = strings.TrimSpace(strings.TrimPrefix(trimmed, "phase_begin="))
		} else if strings.HasPrefix(trimmed, "phase_end=") {
			current = ""
		} else if current != "" {
			out[current] = append(out[current], line...)
			out[current] = append(out[current], '\n')
		}
	}
	return out
}
type benchmarkCoolingSample struct {
AvgFanRPM float64
AvgFanDutyCyclePct float64
@@ -968,6 +1071,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
category = "fp32_tf32"
case strings . HasPrefix ( name , "fp16" ) :
category = "fp16_bf16"
case strings . HasPrefix ( name , "int8" ) :
category = "int8"
case strings . HasPrefix ( name , "fp8" ) :
category = "fp8"
case strings . HasPrefix ( name , "fp4" ) :
@@ -985,6 +1090,7 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
// fp64 = 2.0 — double precision, 2× more bits per operand
// fp32 = 1.0 — single precision baseline
// fp16 = 0.5 — half precision
// int8 = 0.25 — 8-bit integer ops, quarter-width operands
// fp8 = 0.25 — quarter precision
// fp4 = 0.125 — eighth precision
//
@@ -998,6 +1104,8 @@ func precisionWeight(category string) float64 {
return 1.0
case "fp16_bf16" :
return 0.5
case "int8" :
return 0.25
case "fp8" :
return 0.25
case "fp4" :
@@ -1861,41 +1969,41 @@ func runNvidiaBenchmarkParallel(
}
}
// ── Per-precision stability phases (parallel) ─────────────────────────────
totalSlots := len ( benchmarkPrecisionPhases ) + 1
perPhaseSec := spec . SteadySec / totalSlots
if perPhaseSec < 60 {
perPhaseSec = 60
}
// Run synthetic precision phases and the combined steady phase as one
// uninterrupted command so the GPUs stay hot between windows.
eccBase := make ( map [ int ] BenchmarkECCCounters , len ( selected ) )
for _ , idx := range selected {
eccBase [ idx ] , _ = queryECCCounters ( idx )
}
planLabels , planPhases , basePhaseSec , mixedPhaseSec := buildBenchmarkSteadyPlan ( spec , func ( label string ) string {
if label == "mixed" {
return "steady"
}
return "gpu-all-steady-" + label
} )
planCmd := [ ] string {
"bee-gpu-burn" ,
"--seconds" , strconv . Itoa ( basePhaseSec ) ,
"--size-mb" , strconv . Itoa ( opts . SizeMB ) ,
"--devices" , allDevices ,
"--precision-plan" , strings . Join ( planLabels , "," ) ,
"--precision-plan-seconds" , benchmarkPlanDurationsCSV ( planPhases ) ,
}
logFunc ( fmt . Sprintf ( "GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)" , allDevices , len ( benchmarkPrecisionPhases ) , basePhaseSec , mixedPhaseSec ) )
_ , phaseRowsByStage , phaseLogs , planErr := runBenchmarkPlannedCommandWithMetrics ( ctx , verboseLog , "gpu-all-precision-plan.log" , planCmd , nil , selected , planPhases , logFunc )
for _ , phaseSpec := range planPhases {
if rows := phaseRowsByStage [ phaseSpec . MetricStage ] ; len ( rows ) > 0 {
appendBenchmarkMetrics ( allMetricRows , rows , phaseSpec . MetricStage )
}
appendBenchmarkStageLog ( gpuBurnLog , "bee-gpu-burn" , phaseSpec . MetricStage , phaseLogs [ phaseSpec . PlanLabel ] )
}
for _ , prec := range benchmarkPrecisionPhases {
phaseCmd := [ ] string {
"bee-gpu-burn" ,
"--seconds" , strconv . Itoa ( perPhaseSec ) ,
"--size-mb" , strconv . Itoa ( opts . SizeMB ) ,
"--devices" , allDevices ,
"--precision" , prec ,
}
logFunc ( fmt . Sprintf ( "GPUs %s: %s stability phase (%ds)" , allDevices , prec , perPhaseSec ) )
phaseLogName := "gpu-all-steady-" + prec
eccBeforePhase := make ( map [ int ] BenchmarkECCCounters , len ( selected ) )
for _ , idx := range selected {
eccBeforePhase [ idx ] , _ = queryECCCounters ( idx )
}
phaseOut , phaseRows , phaseErr := runBenchmarkCommandWithMetrics ( ctx , verboseLog , phaseLogName + ".log" , phaseCmd , nil , selected , logFunc )
appendBenchmarkMetrics ( allMetricRows , phaseRows , phaseLogName )
appendBenchmarkStageLog ( gpuBurnLog , "bee-gpu-burn" , phaseLogName , phaseOut )
eccAfterPhase := make ( map [ int ] BenchmarkECCCounters , len ( selected ) )
for _ , idx := range selected {
eccAfterPhase [ idx ] , _ = queryECCCounters ( idx )
}
if phaseErr != nil || len ( phaseRows ) == 0 {
phaseRows := phaseRowsByStage [ phaseLogName ]
i f len ( phaseRows ) == 0 {
continue
}
parseByGPU := parseBenchmarkBurnLogByGPU ( string ( phaseOut ) )
parseByGPU := parseBenchmarkBurnLogByGPU ( string ( phaseLogs [ prec ] ) )
for _ , idx := range selected {
perGPU := filterRowsByGPU ( phaseRows , idx )
if len ( perGPU ) == 0 {
@@ -1904,7 +2012,6 @@ func runNvidiaBenchmarkParallel(
phase := BenchmarkPrecisionSteadyPhase {
Precision : prec ,
Steady : summarizeBenchmarkTelemetry ( perGPU ) ,
ECC : diffECCCounters ( eccBeforePhase [ idx ] , eccAfterPhase [ idx ] ) ,
}
if pr , ok := parseByGPU [ idx ] ; ok {
for _ , p := range pr . Profiles {
@@ -1924,14 +2031,7 @@ func runNvidiaBenchmarkParallel(
beforeThrottle [ idx ] , _ = queryThrottleCounters ( idx )
}
// Steady: all GPUs simultaneously (combined). Fixed at one slot = per PhaseSec.
steadyCmd := [ ] string {
"bee-gpu-burn" ,
"--seconds" , strconv . Itoa ( perPhaseSec ) ,
"--size-mb" , strconv . Itoa ( opts . SizeMB ) ,
"--devices" , allDevices ,
}
logFunc ( fmt . Sprintf ( "GPUs %s: parallel steady compute (combined, %ds)" , allDevices , perPhaseSec ) )
logFunc ( fmt . Sprintf ( "GPUs %s: parallel steady compute (combined, %ds)" , allDevices , mixed PhaseSec) )
// Sample server power via IPMI in parallel with steady phase.
ipmiStopCh := make ( chan struct { } )
@@ -1965,9 +2065,6 @@ func runNvidiaBenchmarkParallel(
}
} ( )
steadyOut , steadyRows , steadyErr := runBenchmarkCommandWithMetrics ( ctx , verboseLog , "gpu-all-steady.log" , steadyCmd , nil , selected , logFunc )
appendBenchmarkMetrics ( allMetricRows , steadyRows , "steady" )
appendBenchmarkStageLog ( gpuBurnLog , "bee-gpu-burn" , "steady" , steadyOut )
close ( ipmiStopCh )
if loadedW , ok := <- ipmiResultCh ; ok {
* serverLoadedWSum += loadedW
@@ -1980,7 +2077,8 @@ func runNvidiaBenchmarkParallel(
afterThrottle [ idx ] , _ = queryThrottleCounters ( idx )
}
parseResult s := parseBenchmarkBurnLogByGPU ( string ( steadyOut ) )
steadyRow s := phaseRowsByStage [ "steady" ]
parseResults := parseBenchmarkBurnLogByGPU ( string ( phaseLogs [ "mixed" ] ) )
for _ , idx := range selected {
perGPU := filterRowsByGPU ( steadyRows , idx )
@@ -1998,23 +2096,25 @@ func runNvidiaBenchmarkParallel(
gpuResults [ idx ] . Notes = append ( gpuResults [ idx ] . Notes , "benchmark used driver PTX fallback; tensor throughput score is not comparable" )
}
}
if steady Err != nil {
gpuResults [ idx ] . Notes = append ( gpuResults [ idx ] . Notes , "parallel steady compute failed: " + steady Err. Error ( ) )
if plan Err != nil {
gpuResults [ idx ] . Notes = append ( gpuResults [ idx ] . Notes , "precision plan failed: " + plan Err. Error ( ) )
}
}
// Cooldown: all GPUs together.
cooldownRows , err := collectBenchmarkSamples ( ctx , spec . CooldownSec , selected )
if err ! = nil && err != context . Canceled {
for _ , idx := range select ed {
gpuResults [ idx ] . Notes = append ( gpuResults [ idx ] . Notes , "cooldown sampling failed: " + err . Error ( ) )
if spec . CooldownSec > 0 {
cooldownRows , err : = collectBenchmarkSamples ( ctx , spec . CooldownSec , selected )
i f err != nil && err != context . Cancel ed {
for _ , idx : = range selected {
gpuResults [ idx ] . Notes = append ( gpuResults [ idx ] . Notes , "cooldown sampling failed: " + err . Error ( ) )
}
}
for _ , idx := range selected {
perGPU := filterRowsByGPU ( cooldownRows , idx )
gpuResults [ idx ] . Cooldown = summarizeBenchmarkTelemetry ( perGPU )
}
appendBenchmarkMetrics ( allMetricRows , cooldownRows , "cooldown" )
}
for _ , idx := range selected {
perGPU := filterRowsByGPU ( cooldownRows , idx )
gpuResults [ idx ] . Cooldown = summarizeBenchmarkTelemetry ( perGPU )
}
appendBenchmarkMetrics ( allMetricRows , cooldownRows , "cooldown" )
// Score and finalize each GPU.
for _ , idx := range selected {
@@ -2023,8 +2123,8 @@ func runNvidiaBenchmarkParallel(
r . DegradationReasons = detectBenchmarkDegradationReasons ( * r , result . Normalization . Status )
pr := parseResults [ idx ]
switch {
case steady Err != nil :
r . Status = classifySATErrorStatus ( steadyOut , steady Err)
case plan Err != nil :
r . Status = classifySATErrorStatus ( phaseLogs [ "mixed" ] , plan Err)
case pr . Fallback :
r . Status = "PARTIAL"
default :
@@ -2213,7 +2313,7 @@ func runBenchmarkPowerCalibration(
gpuIndices [ ] int ,
logFunc func ( string ) ,
) map [ int ] float64 {
const calibDurationSec = 45
const calibDurationSec = 120
// dcgmi must be present.
if _ , err := exec . LookPath ( "dcgmi" ) ; err != nil {