Refine NVIDIA benchmark phase timing

This commit is contained in:
Mikhail Chusavitin
2026-04-14 14:12:06 +03:00
parent b1a5035edd
commit 2be7ae6d28
6 changed files with 450 additions and 133 deletions

View File

@@ -16,17 +16,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
{
name: "default",
profile: "",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
},
{
name: "stability",
profile: "stability",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
},
{
name: "overnight",
profile: "overnight",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
},
}
@@ -41,6 +41,92 @@ func TestResolveBenchmarkProfile(t *testing.T) {
}
}
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
t.Parallel()
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
func(label string) string { return label },
)
if len(labels) != 7 || len(phases) != 7 {
t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
}
if basePhaseSec != 60 {
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
}
if mixedPhaseSec != 300 {
t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
}
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
}
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
t.Parallel()
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
func(label string) string { return label },
)
if basePhaseSec != 300 {
t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
}
if mixedPhaseSec != 3600 {
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
t.Parallel()
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
func(label string) string { return label },
)
if basePhaseSec != 3600 {
t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
}
if mixedPhaseSec != 14400 {
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
t.Parallel()
phases := []benchmarkPlannedPhase{
{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
}
rows := []GPUMetricRow{
{ElapsedSec: 5},
{ElapsedSec: 15},
{ElapsedSec: 25},
{ElapsedSec: 65},
}
got := splitBenchmarkRowsByPlannedPhase(rows, phases)
if len(got["fp8"]) != 1 {
t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
}
if len(got["fp16"]) != 1 {
t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
}
if len(got["mixed"]) != 2 {
t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
}
}
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
t.Parallel()
@@ -65,8 +151,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
"[gpu 0] compute_capability=9.0",
"[gpu 0] backend=cublasLt",
"[gpu 0] duration_s=10",
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
"[gpu 0] int8_tensor_iterations=80",
"[gpu 0] fp16_tensor_iterations=200",
"[gpu 0] fp8_e4m3_iterations=50",
"[gpu 0] status=OK",
@@ -79,15 +167,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
if got.ComputeCapability != "9.0" {
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
}
if len(got.Profiles) != 2 {
t.Fatalf("profiles=%d want 2", len(got.Profiles))
if len(got.Profiles) != 3 {
t.Fatalf("profiles=%d want 3", len(got.Profiles))
}
if got.Profiles[0].TeraOpsPerSec <= 0 {
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
}
if got.Profiles[0].Category != "fp16_bf16" {
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
}
if got.Profiles[1].Category != "fp8" {
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
}
if got.Profiles[2].Category != "int8" {
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
}
if got.Profiles[2].Weight != 0.25 {
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
}
}
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {