Refine NVIDIA benchmark phase timing

This commit is contained in:
Mikhail Chusavitin
2026-04-14 14:12:06 +03:00
parent b1a5035edd
commit 2be7ae6d28
6 changed files with 450 additions and 133 deletions

View File

@@ -76,7 +76,56 @@ var (
// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"fp8", "fp16", "fp32", "fp64", "fp4"}
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
// buildBenchmarkSteadyPlan returns the ordered plan labels and phase specs for
// the steady portion of a benchmark run, plus the derived per-precision and
// mixed-phase durations. Known profiles use fixed durations; any other profile
// splits spec.SteadySec across all precision phases and one mixed slot that is
// weighted 5x a single precision phase. metricStage maps a plan label to the
// telemetry stage name recorded for that phase.
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
	switch spec.Name {
	case NvidiaBenchmarkProfileStandard:
		basePhaseSec, mixedPhaseSec = 60, 300
	case NvidiaBenchmarkProfileStability:
		basePhaseSec, mixedPhaseSec = 300, 3600
	case NvidiaBenchmarkProfileOvernight:
		basePhaseSec, mixedPhaseSec = 3600, 14400
	default:
		// One weight unit per precision phase; the mixed phase counts for 5.
		weight := len(benchmarkPrecisionPhases) + 5
		if weight <= 0 {
			return nil, nil, 0, 0
		}
		if basePhaseSec = spec.SteadySec / weight; basePhaseSec <= 0 {
			basePhaseSec = 1
		}
		mixedPhaseSec = 5 * basePhaseSec
	}
	total := len(benchmarkPrecisionPhases) + 1
	planLabels = make([]string, 0, total)
	planPhases = make([]benchmarkPlannedPhase, 0, total)
	appendPhase := func(label string, durationSec int) {
		planLabels = append(planLabels, label)
		planPhases = append(planPhases, benchmarkPlannedPhase{
			PlanLabel:   label,
			MetricStage: metricStage(label),
			DurationSec: durationSec,
		})
	}
	for _, prec := range benchmarkPrecisionPhases {
		appendPhase(prec, basePhaseSec)
	}
	appendPhase("mixed", mixedPhaseSec)
	return planLabels, planPhases, basePhaseSec, mixedPhaseSec
}
// benchmarkPlanDurationsCSV renders each phase's DurationSec as a
// comma-separated list, the format expected by --precision-plan-seconds.
func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
	var b strings.Builder
	for i, phase := range phases {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(strconv.Itoa(phase.DurationSec))
	}
	return b.String()
}
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if ctx == nil {
@@ -233,42 +282,42 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
continue
}
// ── Per-precision stability phases ────────────────────────────────────────
// Run each precision category alone so PowerCVPct reflects genuine GPU
// power stability, not kernel-mix variance.
// Time budget: each phase gets steadySec/numPhases, minimum 60 s.
// SteadySec is split equally across all precision phases + 1 combined slot.
// Skipped phases (unsupported precision) are simply omitted; combined is fixed.
totalSlots := len(benchmarkPrecisionPhases) + 1
perPhaseSec := spec.SteadySec / totalSlots
if perPhaseSec < 60 {
perPhaseSec = 60
}
// Run synthetic precision phases and the combined steady phase as one
// uninterrupted command so the GPU stays hot between windows.
eccBase, _ := queryECCCounters(idx)
for _, prec := range benchmarkPrecisionPhases {
phaseCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
"--precision", prec,
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string {
if label == "mixed" {
return fmt.Sprintf("gpu-%d-steady", idx)
}
logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec))
phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
eccBefore, _ := queryECCCounters(idx)
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, phaseRows, phaseLogName)
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
eccAfter, _ := queryECCCounters(idx)
if phaseErr != nil || len(phaseRows) == 0 {
return fmt.Sprintf("gpu-%d-steady-%s", idx, label)
})
planCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(basePhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
"--precision-plan", strings.Join(planLabels, ","),
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
}
logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec))
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage)
}
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
}
for _, prec := range benchmarkPrecisionPhases {
stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
phaseRows := phaseRowsByStage[stageName]
if len(phaseRows) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase{
Precision: prec,
Steady: summarizeBenchmarkTelemetry(phaseRows),
ECC: diffECCCounters(eccBefore, eccAfter),
}
for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles {
for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
@@ -278,13 +327,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
}
beforeThrottle, _ := queryThrottleCounters(idx)
steadyCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec))
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
// Sample server power via IPMI in parallel with the steady phase.
// We collect readings every 5s and average them.
@@ -320,9 +363,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
}
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, steadyRows, fmt.Sprintf("gpu-%d-steady", idx))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-steady", idx), steadyOut)
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
serverLoadedWSum += loadedW
@@ -331,11 +371,12 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
}
afterThrottle, _ := queryThrottleCounters(idx)
if steadyErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
if planErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
}
parseResult := parseBenchmarkBurnLog(string(steadyOut))
steadyRows := phaseRowsByStage[fmt.Sprintf("gpu-%d-steady", idx)]
parseResult := parseBenchmarkBurnLog(string(phaseLogs["mixed"]))
gpuResult.ComputeCapability = parseResult.ComputeCapability
gpuResult.Backend = parseResult.Backend
gpuResult.PrecisionResults = parseResult.Profiles
@@ -349,17 +390,19 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
}
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
if spec.CooldownSec > 0 {
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
}
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
}
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if steadyErr != nil {
gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
if planErr != nil {
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
} else if parseResult.Fallback {
gpuResult.Status = "PARTIAL"
} else {
@@ -462,11 +505,11 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0}
case NvidiaBenchmarkProfileOvernight:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0}
default:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0}
}
}
@@ -795,6 +838,66 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string
return out, metricRows, err
}
// benchmarkPlannedPhase describes one window of the uninterrupted
// precision plan executed as a single bee-gpu-burn invocation.
type benchmarkPlannedPhase struct {
	// PlanLabel is the precision label passed via --precision-plan
	// (e.g. "fp16" or "mixed").
	PlanLabel string
	// MetricStage names the telemetry stage used when filing metric rows
	// and stage logs for this phase.
	MetricStage string
	// DurationSec is the planned length of this phase in seconds.
	DurationSec int
}
// runBenchmarkPlannedCommandWithMetrics runs a gpu-burn command that executes
// a precision plan and returns the raw combined output, the telemetry rows
// bucketed per planned phase (keyed by MetricStage), the log bytes bucketed
// per plan label, and the command error, if any.
func runBenchmarkPlannedCommandWithMetrics(
	ctx context.Context,
	verboseLog, name string,
	cmd []string,
	env []string,
	gpuIndices []int,
	phases []benchmarkPlannedPhase,
	logFunc func(string),
) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) {
	out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc)
	rowsByStage := splitBenchmarkRowsByPlannedPhase(rows, phases)
	logsByLabel := splitBenchmarkLogByPlannedPhase(out)
	return out, rowsByStage, logsByLabel, err
}
// splitBenchmarkRowsByPlannedPhase buckets telemetry rows into planned phases
// by elapsed time. Phase i owns the half-open window [start_i, start_i+d_i);
// non-positive durations are clamped to 1 s, and any row at or past the final
// boundary is attributed to the last phase. Keys are phase MetricStage names.
func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow {
	out := make(map[string][]GPUMetricRow, len(phases))
	if len(rows) == 0 || len(phases) == 0 {
		return out
	}
	// Precompute each phase's cumulative upper bound in seconds.
	bounds := make([]float64, len(phases))
	var total float64
	for i, phase := range phases {
		d := phase.DurationSec
		if d <= 0 {
			d = 1
		}
		total += float64(d)
		bounds[i] = total
	}
	for _, row := range rows {
		stage := phases[len(phases)-1].MetricStage
		for i, upper := range bounds {
			if row.ElapsedSec < upper {
				stage = phases[i].MetricStage
				break
			}
		}
		out[stage] = append(out[stage], row)
	}
	return out
}
// splitBenchmarkLogByPlannedPhase slices a combined burn log into per-phase
// logs using the phase_begin=<label> / phase_end=<label> markers in the
// output. Marker lines themselves are not copied, and lines outside any
// begin/end pair are dropped. Keys are the labels taken from phase_begin.
func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte {
	logs := make(map[string][]byte)
	normalized := strings.ReplaceAll(string(raw), "\r\n", "\n")
	label := ""
	for _, line := range strings.Split(normalized, "\n") {
		marker := strings.TrimSpace(stripBenchmarkPrefix(line))
		switch {
		case strings.HasPrefix(marker, "phase_begin="):
			label = strings.TrimSpace(marker[len("phase_begin="):])
		case strings.HasPrefix(marker, "phase_end="):
			label = ""
		case label != "":
			logs[label] = append(logs[label], line...)
			logs[label] = append(logs[label], '\n')
		}
	}
	return logs
}
type benchmarkCoolingSample struct {
AvgFanRPM float64
AvgFanDutyCyclePct float64
@@ -968,6 +1071,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
category = "fp32_tf32"
case strings.HasPrefix(name, "fp16"):
category = "fp16_bf16"
case strings.HasPrefix(name, "int8"):
category = "int8"
case strings.HasPrefix(name, "fp8"):
category = "fp8"
case strings.HasPrefix(name, "fp4"):
@@ -985,6 +1090,7 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
// fp64 = 2.0 — double precision, 2× more bits per operand
// fp32 = 1.0 — single precision baseline
// fp16 = 0.5 — half precision
// int8 = 0.25 — quarter precision
// fp8 = 0.25 — quarter precision
// fp4 = 0.125 — eighth precision
//
@@ -998,6 +1104,8 @@ func precisionWeight(category string) float64 {
return 1.0
case "fp16_bf16":
return 0.5
case "int8":
return 0.25
case "fp8":
return 0.25
case "fp4":
@@ -1861,41 +1969,41 @@ func runNvidiaBenchmarkParallel(
}
}
// ── Per-precision stability phases (parallel) ─────────────────────────────
totalSlots := len(benchmarkPrecisionPhases) + 1
perPhaseSec := spec.SteadySec / totalSlots
if perPhaseSec < 60 {
perPhaseSec = 60
}
// Run synthetic precision phases and the combined steady phase as one
// uninterrupted command so the GPUs stay hot between windows.
eccBase := make(map[int]BenchmarkECCCounters, len(selected))
for _, idx := range selected {
eccBase[idx], _ = queryECCCounters(idx)
}
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string {
if label == "mixed" {
return "steady"
}
return "gpu-all-steady-" + label
})
planCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(basePhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
"--precision-plan", strings.Join(planLabels, ","),
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
}
logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec))
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage)
}
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
}
for _, prec := range benchmarkPrecisionPhases {
phaseCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
"--precision", prec,
}
logFunc(fmt.Sprintf("GPUs %s: %s stability phase (%ds)", allDevices, prec, perPhaseSec))
phaseLogName := "gpu-all-steady-" + prec
eccBeforePhase := make(map[int]BenchmarkECCCounters, len(selected))
for _, idx := range selected {
eccBeforePhase[idx], _ = queryECCCounters(idx)
}
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, phaseRows, phaseLogName)
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected))
for _, idx := range selected {
eccAfterPhase[idx], _ = queryECCCounters(idx)
}
if phaseErr != nil || len(phaseRows) == 0 {
phaseRows := phaseRowsByStage[phaseLogName]
if len(phaseRows) == 0 {
continue
}
parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseOut))
parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec]))
for _, idx := range selected {
perGPU := filterRowsByGPU(phaseRows, idx)
if len(perGPU) == 0 {
@@ -1904,7 +2012,6 @@ func runNvidiaBenchmarkParallel(
phase := BenchmarkPrecisionSteadyPhase{
Precision: prec,
Steady: summarizeBenchmarkTelemetry(perGPU),
ECC: diffECCCounters(eccBeforePhase[idx], eccAfterPhase[idx]),
}
if pr, ok := parseByGPU[idx]; ok {
for _, p := range pr.Profiles {
@@ -1924,14 +2031,7 @@ func runNvidiaBenchmarkParallel(
beforeThrottle[idx], _ = queryThrottleCounters(idx)
}
// Steady: all GPUs simultaneously (combined). Fixed at one slot = perPhaseSec.
steadyCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
}
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, perPhaseSec))
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
// Sample server power via IPMI in parallel with steady phase.
ipmiStopCh := make(chan struct{})
@@ -1965,9 +2065,6 @@ func runNvidiaBenchmarkParallel(
}
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, steadyRows, "steady")
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "steady", steadyOut)
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
*serverLoadedWSum += loadedW
@@ -1980,7 +2077,8 @@ func runNvidiaBenchmarkParallel(
afterThrottle[idx], _ = queryThrottleCounters(idx)
}
parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut))
steadyRows := phaseRowsByStage["steady"]
parseResults := parseBenchmarkBurnLogByGPU(string(phaseLogs["mixed"]))
for _, idx := range selected {
perGPU := filterRowsByGPU(steadyRows, idx)
@@ -1998,23 +2096,25 @@ func runNvidiaBenchmarkParallel(
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
}
}
if steadyErr != nil {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error())
if planErr != nil {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "precision plan failed: "+planErr.Error())
}
}
// Cooldown: all GPUs together.
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
if err != nil && err != context.Canceled {
for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
if spec.CooldownSec > 0 {
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
if err != nil && err != context.Canceled {
for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
}
}
for _, idx := range selected {
perGPU := filterRowsByGPU(cooldownRows, idx)
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
}
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
}
for _, idx := range selected {
perGPU := filterRowsByGPU(cooldownRows, idx)
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
}
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
// Score and finalize each GPU.
for _, idx := range selected {
@@ -2023,8 +2123,8 @@ func runNvidiaBenchmarkParallel(
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
pr := parseResults[idx]
switch {
case steadyErr != nil:
r.Status = classifySATErrorStatus(steadyOut, steadyErr)
case planErr != nil:
r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
case pr.Fallback:
r.Status = "PARTIAL"
default:
@@ -2213,7 +2313,7 @@ func runBenchmarkPowerCalibration(
gpuIndices []int,
logFunc func(string),
) map[int]float64 {
const calibDurationSec = 45
const calibDurationSec = 120
// dcgmi must be present.
if _, err := exec.LookPath("dcgmi"); err != nil {

View File

@@ -88,10 +88,10 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
b.WriteString("**Compute score** is derived from two phases:\n\n")
b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n")
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")

View File

@@ -16,17 +16,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
{
name: "default",
profile: "",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
},
{
name: "stability",
profile: "stability",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
},
{
name: "overnight",
profile: "overnight",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
},
}
@@ -41,6 +41,92 @@ func TestResolveBenchmarkProfile(t *testing.T) {
}
}
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
t.Parallel()
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
func(label string) string { return label },
)
if len(labels) != 7 || len(phases) != 7 {
t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
}
if basePhaseSec != 60 {
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
}
if mixedPhaseSec != 300 {
t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
}
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
}
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
t.Parallel()
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
func(label string) string { return label },
)
if basePhaseSec != 300 {
t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
}
if mixedPhaseSec != 3600 {
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
t.Parallel()
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
func(label string) string { return label },
)
if basePhaseSec != 3600 {
t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
}
if mixedPhaseSec != 14400 {
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
// TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations verifies rows are
// bucketed by cumulative phase durations and that overshoot lands in the
// final (mixed) phase.
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
	t.Parallel()
	phases := []benchmarkPlannedPhase{
		{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
		{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
		{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
	}
	// Elapsed 5 -> fp8, 15 -> fp16, 25 and 65 -> mixed (65 is past the
	// 70 s total but still inside the last phase's bucket).
	rows := []GPUMetricRow{
		{ElapsedSec: 5},
		{ElapsedSec: 15},
		{ElapsedSec: 25},
		{ElapsedSec: 65},
	}
	byStage := splitBenchmarkRowsByPlannedPhase(rows, phases)
	if len(byStage["fp8"]) != 1 {
		t.Fatalf("fp8 rows=%d want 1", len(byStage["fp8"]))
	}
	if len(byStage["fp16"]) != 1 {
		t.Fatalf("fp16 rows=%d want 1", len(byStage["fp16"]))
	}
	if len(byStage["mixed"]) != 2 {
		t.Fatalf("mixed rows=%d want 2", len(byStage["mixed"]))
	}
}
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
t.Parallel()
@@ -65,8 +151,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
"[gpu 0] compute_capability=9.0",
"[gpu 0] backend=cublasLt",
"[gpu 0] duration_s=10",
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
"[gpu 0] int8_tensor_iterations=80",
"[gpu 0] fp16_tensor_iterations=200",
"[gpu 0] fp8_e4m3_iterations=50",
"[gpu 0] status=OK",
@@ -79,15 +167,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
if got.ComputeCapability != "9.0" {
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
}
if len(got.Profiles) != 2 {
t.Fatalf("profiles=%d want 2", len(got.Profiles))
if len(got.Profiles) != 3 {
t.Fatalf("profiles=%d want 3", len(got.Profiles))
}
if got.Profiles[0].TeraOpsPerSec <= 0 {
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
}
if got.Profiles[0].Category != "fp16_bf16" {
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
}
if got.Profiles[1].Category != "fp8" {
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
}
if got.Profiles[2].Category != "int8" {
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
}
if got.Profiles[2].Weight != 0.25 {
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
}
}
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {

View File

@@ -179,7 +179,7 @@ type BenchmarkPrecisionResult struct {
Iterations uint64 `json:"iterations,omitempty"`
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
// Weight is the fp32-equivalence factor for this precision category.
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125.
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
Weight float64 `json:"weight,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`