Files
bee/audit/internal/platform/benchmark_test.go

148 lines
4.0 KiB
Go

package platform
import (
"strings"
"testing"
)
func TestResolveBenchmarkProfile(t *testing.T) {
t.Parallel()
cases := []struct {
name string
profile string
want benchmarkProfileSpec
}{
{
name: "default",
profile: "",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
},
{
name: "stability",
profile: "stability",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
},
{
name: "overnight",
profile: "overnight",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
},
}
for _, tc := range cases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
got := resolveBenchmarkProfile(tc.profile)
if got != tc.want {
t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
}
})
}
}
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
t.Parallel()
opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
Profile: "stability",
RunNCCL: false,
})
if opts.Profile != NvidiaBenchmarkProfileStability {
t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
}
if opts.RunNCCL {
t.Fatalf("RunNCCL should stay false when explicitly disabled")
}
}
func TestParseBenchmarkBurnLog(t *testing.T) {
t.Parallel()
raw := strings.Join([]string{
"loader=bee-gpu-burn",
"[gpu 0] device=NVIDIA H100",
"[gpu 0] compute_capability=9.0",
"[gpu 0] backend=cublasLt",
"[gpu 0] duration_s=10",
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
"[gpu 0] fp16_tensor_iterations=200",
"[gpu 0] fp8_e4m3_iterations=50",
"[gpu 0] status=OK",
}, "\n")
got := parseBenchmarkBurnLog(raw)
if got.Backend != "cublasLt" {
t.Fatalf("backend=%q want cublasLt", got.Backend)
}
if got.ComputeCapability != "9.0" {
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
}
if len(got.Profiles) != 2 {
t.Fatalf("profiles=%d want 2", len(got.Profiles))
}
if got.Profiles[0].TeraOpsPerSec <= 0 {
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
}
if got.Profiles[1].Category != "fp8" {
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
}
}
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
t.Parallel()
result := NvidiaBenchmarkResult{
BenchmarkVersion: benchmarkVersion,
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
OverallStatus: "PARTIAL",
SelectedGPUIndices: []int{0},
Normalization: BenchmarkNormalization{
Status: "partial",
},
Findings: []string{"GPU 0 spent measurable time under SW power cap."},
GPUs: []BenchmarkGPUResult{
{
Index: 0,
Name: "NVIDIA H100",
Status: "OK",
Steady: BenchmarkTelemetrySummary{
AvgPowerW: 680,
AvgTempC: 79,
AvgGraphicsClockMHz: 1725,
P95PowerW: 700,
P95TempC: 82,
P95GraphicsClockMHz: 1800,
},
Scores: BenchmarkScorecard{
ComputeScore: 1200,
PowerSustainScore: 96,
ThermalSustainScore: 88,
StabilityScore: 92,
CompositeScore: 1176,
},
PrecisionResults: []BenchmarkPrecisionResult{
{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
},
Throttle: BenchmarkThrottleCounters{
SWPowerCapUS: 1000000,
},
DegradationReasons: []string{"power_capped"},
},
},
}
report := renderBenchmarkReport(result)
for _, needle := range []string{
"Executive Summary",
"GPU 0 spent measurable time under SW power cap.",
"Composite score: 1176.00",
"fp16_tensor: 700.00 TOPS",
} {
if !strings.Contains(report, needle) {
t.Fatalf("report missing %q\n%s", needle, report)
}
}
}