bee/audit/internal/platform/benchmark_test.go

package platform

import (
	"strings"
	"testing"
)

func TestResolveBenchmarkProfile(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name    string
		profile string
		want    benchmarkProfileSpec
	}{
		{
			name:    "default",
			profile: "",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
		},
		{
			name:    "stability",
			profile: "stability",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
		},
		{
			name:    "overnight",
			profile: "overnight",
			want:    benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
		},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			got := resolveBenchmarkProfile(tc.profile)
			if got != tc.want {
				t.Fatalf("profile=%q got %+v want %+v", tc.profile, got, tc.want)
			}
		})
	}
}

func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
	t.Parallel()

	labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
		benchmarkPrecisionPhases,
		func(label string) string { return label },
	)
	if len(labels) != 7 || len(phases) != 7 {
		t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
	}
	if basePhaseSec != 60 {
		t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
	}
	if mixedPhaseSec != 300 {
		t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
	}
	if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
		t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
	}
	if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
	}
}

func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
	t.Parallel()

	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
		benchmarkPrecisionPhases,
		func(label string) string { return label },
	)
	if basePhaseSec != 300 {
		t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
	}
	if mixedPhaseSec != 3600 {
		t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
	}
	if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
	}
}

func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
	t.Parallel()

	_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
		benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
		benchmarkPrecisionPhases,
		func(label string) string { return label },
	)
	if basePhaseSec != 3600 {
		t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
	}
	if mixedPhaseSec != 14400 {
		t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
	}
	if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
		t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
	}
}

func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
	t.Parallel()

	phases := []benchmarkPlannedPhase{
		{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
		{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
		{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
	}
	rows := []GPUMetricRow{
		{ElapsedSec: 5},
		{ElapsedSec: 15},
		{ElapsedSec: 25},
		{ElapsedSec: 65},
	}
	got := splitBenchmarkRowsByPlannedPhase(rows, phases)
	if len(got["fp8"]) != 1 {
		t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
	}
	if len(got["fp16"]) != 1 {
		t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
	}
	if len(got["mixed"]) != 2 {
		t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
	}
}

func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
	t.Parallel()

	if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
		t.Fatalf("supported=%v", got)
	}
	if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
		t.Fatalf("supported=%v", got)
	}
}

func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
	t.Parallel()

	cases := []struct {
		name       string
		raw        string
		wantStatus string
	}{
		{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
		{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
		{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
	}
	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
			if got != tc.wantStatus {
				t.Fatalf("status=%q want %q", got, tc.wantStatus)
			}
		})
	}
}

func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
	t.Parallel()

	opts := normalizeNvidiaBenchmarkOptionsForBenchmark(NvidiaBenchmarkOptions{
		Profile: "stability",
		RunNCCL: false,
	})
	if opts.Profile != NvidiaBenchmarkProfileStability {
		t.Fatalf("profile=%q want %q", opts.Profile, NvidiaBenchmarkProfileStability)
	}
	if opts.RunNCCL {
		t.Fatalf("RunNCCL should stay false when explicitly disabled")
	}
}

func TestParseBenchmarkBurnLog(t *testing.T) {
	t.Parallel()

	raw := strings.Join([]string{
		"loader=bee-gpu-burn",
		"[gpu 0] device=NVIDIA H100",
		"[gpu 0] compute_capability=9.0",
		"[gpu 0] backend=cublasLt",
		"[gpu 0] duration_s=10",
		"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
		"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
		"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
		"[gpu 0] int8_tensor_iterations=80",
		"[gpu 0] fp16_tensor_iterations=200",
		"[gpu 0] fp8_e4m3_iterations=50",
		"[gpu 0] status=OK",
	}, "\n")

	got := parseBenchmarkBurnLog(raw)
	if got.Backend != "cublasLt" {
		t.Fatalf("backend=%q want cublasLt", got.Backend)
	}
	if got.ComputeCapability != "9.0" {
		t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
	}
	if len(got.Profiles) != 3 {
		t.Fatalf("profiles=%d want 3", len(got.Profiles))
	}
	if got.Profiles[0].TeraOpsPerSec <= 0 {
		t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
	}
	if got.Profiles[0].Category != "fp16_bf16" {
		t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
	}
	if got.Profiles[1].Category != "fp8" {
		t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
	}
	if got.Profiles[2].Category != "int8" {
		t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
	}
	if got.Profiles[2].Weight != 0.25 {
		t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
	}
}

func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
	t.Parallel()

	result := NvidiaBenchmarkResult{
		BenchmarkVersion:   benchmarkVersion,
		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
		OverallStatus:      "PARTIAL",
		SelectedGPUIndices: []int{0},
		Normalization: BenchmarkNormalization{
			Status: "partial",
		},
		Findings: []string{"GPU 0 spent measurable time under SW power cap."},
		GPUs: []BenchmarkGPUResult{
			{
				Index:  0,
				Name:   "NVIDIA H100",
				Status: "OK",
				Steady: BenchmarkTelemetrySummary{
					AvgPowerW:           680,
					AvgTempC:            79,
					AvgGraphicsClockMHz: 1725,
					P95PowerW:           700,
					P95TempC:            82,
					P95GraphicsClockMHz: 1800,
				},
				Scores: BenchmarkScorecard{
					ComputeScore:        1200,
					PowerSustainScore:   96,
					ThermalSustainScore: 88,
					StabilityScore:      92,
					CompositeScore:      1176,
				},
				PrecisionResults: []BenchmarkPrecisionResult{
					{Name: "fp16_tensor", Supported: true, TeraOpsPerSec: 700},
				},
				Throttle: BenchmarkThrottleCounters{
					SWPowerCapUS: 1000000,
				},
				DegradationReasons: []string{"power_capped"},
			},
		},
		Cooling: &BenchmarkCoolingSummary{
			Available:             true,
			AvgFanRPM:             9200,
			FanDutyCycleAvailable: true,
			AvgFanDutyCyclePct:    47.5,
			P95FanDutyCyclePct:    62.0,
		},
	}

	report := renderBenchmarkReport(result)
	for _, needle := range []string{
		"Executive Summary",
		"GPU 0 spent measurable time under SW power cap.",
		"1176.00",
		"fp16_tensor",
		"700.00",
		"Cooling",
		"Average fan duty cycle",
		"47.5%",
	} {
		if !strings.Contains(report, needle) {
			t.Fatalf("report missing %q\n%s", needle, report)
		}
	}
}

func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
	t.Parallel()

	report := renderBenchmarkReport(NvidiaBenchmarkResult{
		BenchmarkProfile:   NvidiaBenchmarkProfileStandard,
		OverallStatus:      "OK",
		SelectedGPUIndices: []int{0},
		Normalization: BenchmarkNormalization{
			Status: "full",
		},
	})

	for _, needle := range []string{
		"gpu-metrics.csv",
		"gpu-metrics.html",
		"gpu-burn.log",
	} {
		if !strings.Contains(report, needle) {
			t.Fatalf("report missing %q\n%s", needle, report)
		}
	}
}

func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
	t.Parallel()

	nvsmiQ := []byte(`
GPU 00000000:4E:00.0
    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
    Clocks
        Graphics                          : 2422 MHz
        Memory                            : 12481 MHz
    Max Clocks
        Graphics                          : 2430 MHz
        SM                                : 2430 MHz
        Memory                            : 12481 MHz
        Video                             : 2107 MHz

GPU 00000000:4F:00.0
    Product Name                          : NVIDIA RTX PRO 6000 Blackwell Server Edition
    Max Clocks
        Graphics                          : 2430 MHz
        Memory                            : 12481 MHz
`)

	infoByIndex := map[int]benchmarkGPUInfo{
		0: {Index: 0, BusID: "00000000:4E:00.0"},
		1: {Index: 1, BusID: "00000000:4F:00.0"},
	}

	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)

	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
		t.Errorf("GPU 0 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[0].MaxGraphicsClockMHz)
	}
	if infoByIndex[0].MaxMemoryClockMHz != 12481 {
		t.Errorf("GPU 0 MaxMemoryClockMHz = %v, want 12481", infoByIndex[0].MaxMemoryClockMHz)
	}
	if infoByIndex[1].MaxGraphicsClockMHz != 2430 {
		t.Errorf("GPU 1 MaxGraphicsClockMHz = %v, want 2430", infoByIndex[1].MaxGraphicsClockMHz)
	}
	if infoByIndex[1].MaxMemoryClockMHz != 12481 {
		t.Errorf("GPU 1 MaxMemoryClockMHz = %v, want 12481", infoByIndex[1].MaxMemoryClockMHz)
	}
}

func TestEnrichGPUInfoWithMaxClocksSkipsPopulated(t *testing.T) {
	t.Parallel()

	nvsmiQ := []byte(`
GPU 00000000:4E:00.0
    Max Clocks
        Graphics                          : 9999 MHz
        Memory                            : 9999 MHz
`)
	// Already populated — must not be overwritten.
	infoByIndex := map[int]benchmarkGPUInfo{
		0: {Index: 0, BusID: "00000000:4E:00.0", MaxGraphicsClockMHz: 2430, MaxMemoryClockMHz: 12481},
	}

	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQ)

	if infoByIndex[0].MaxGraphicsClockMHz != 2430 {
		t.Errorf("expected existing value to be preserved, got %v", infoByIndex[0].MaxGraphicsClockMHz)
	}
}