From 2be7ae6d28243e8571e8f56b90ea40fc75737c38 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Tue, 14 Apr 2026 14:12:06 +0300 Subject: [PATCH] Refine NVIDIA benchmark phase timing --- audit/internal/platform/benchmark.go | 312 +++++++++++++------- audit/internal/platform/benchmark_report.go | 4 +- audit/internal/platform/benchmark_test.go | 107 ++++++- audit/internal/platform/benchmark_types.go | 2 +- iso/builder/bee-gpu-stress.c | 146 +++++++-- iso/overlay/usr/local/bin/bee-gpu-burn | 12 +- 6 files changed, 450 insertions(+), 133 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index bddf377..6ad9754 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -76,7 +76,56 @@ var ( // benchmarkPrecisionPhases lists the precision categories run as individual // steady-state windows before the combined steady pass. Order is from lowest // to highest power draw so thermal ramp-up is gradual. -var benchmarkPrecisionPhases = []string{"fp8", "fp16", "fp32", "fp64", "fp4"} +var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"} + +func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) { + switch spec.Name { + case NvidiaBenchmarkProfileStandard: + basePhaseSec = 60 + mixedPhaseSec = 300 + case NvidiaBenchmarkProfileStability: + basePhaseSec = 300 + mixedPhaseSec = 3600 + case NvidiaBenchmarkProfileOvernight: + basePhaseSec = 3600 + mixedPhaseSec = 14400 + default: + totalWeight := len(benchmarkPrecisionPhases) + 5 + if totalWeight <= 0 { + return nil, nil, 0, 0 + } + basePhaseSec = spec.SteadySec / totalWeight + if basePhaseSec <= 0 { + basePhaseSec = 1 + } + mixedPhaseSec = basePhaseSec * 5 + } + planLabels = make([]string, 0, len(benchmarkPrecisionPhases)+1) + planPhases = make([]benchmarkPlannedPhase, 0, len(benchmarkPrecisionPhases)+1) + for _, prec := range benchmarkPrecisionPhases { + planLabels = append(planLabels, prec) + planPhases = append(planPhases, benchmarkPlannedPhase{ + PlanLabel: prec, + MetricStage: metricStage(prec), + DurationSec: basePhaseSec, + }) + } + planLabels = append(planLabels, "mixed") + planPhases = append(planPhases, benchmarkPlannedPhase{ + PlanLabel: "mixed", + MetricStage: metricStage("mixed"), + DurationSec: mixedPhaseSec, + }) + return planLabels, planPhases, basePhaseSec, mixedPhaseSec +} + +func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string { + values := make([]string, 0, len(phases)) + for _, phase := range phases { + values = append(values, strconv.Itoa(phase.DurationSec)) + } + return strings.Join(values, ",") +} func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { if ctx == nil { @@ -233,42 +282,42 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv continue } - // ── Per-precision stability phases ──────────────────────────────────────── - // Run each precision category alone so PowerCVPct reflects genuine GPU - // power stability, not kernel-mix variance. - // Time budget: each phase gets steadySec/numPhases, minimum 60 s. - // SteadySec is split equally across all precision phases + 1 combined slot. - // Skipped phases (unsupported precision) are simply omitted; combined is fixed. - totalSlots := len(benchmarkPrecisionPhases) + 1 - perPhaseSec := spec.SteadySec / totalSlots - if perPhaseSec < 60 { - perPhaseSec = 60 - } + // Run synthetic precision phases and the combined steady phase as one + // uninterrupted command so the GPU stays hot between windows. eccBase, _ := queryECCCounters(idx) - for _, prec := range benchmarkPrecisionPhases { - phaseCmd := []string{ - "bee-gpu-burn", - "--seconds", strconv.Itoa(perPhaseSec), - "--size-mb", strconv.Itoa(opts.SizeMB), - "--devices", strconv.Itoa(idx), - "--precision", prec, + planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string { + if label == "mixed" { + return fmt.Sprintf("gpu-%d-steady", idx) } - logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec)) - phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec) - eccBefore, _ := queryECCCounters(idx) - phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, logFunc) - appendBenchmarkMetrics(&metricRows, phaseRows, phaseLogName) - appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut) - eccAfter, _ := queryECCCounters(idx) - if phaseErr != nil || len(phaseRows) == 0 { + return fmt.Sprintf("gpu-%d-steady-%s", idx, label) + }) + planCmd := []string{ + "bee-gpu-burn", + "--seconds", strconv.Itoa(basePhaseSec), + "--size-mb", strconv.Itoa(opts.SizeMB), + "--devices", strconv.Itoa(idx), + "--precision-plan", strings.Join(planLabels, ","), + "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), + } + logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec)) + _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc) + for _, phaseSpec := range planPhases { + if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { + appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage) + } + appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel]) + } + for _, prec := range benchmarkPrecisionPhases { + stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec) + phaseRows := phaseRowsByStage[stageName] + if len(phaseRows) == 0 { continue } phase := BenchmarkPrecisionSteadyPhase{ Precision: prec, Steady: summarizeBenchmarkTelemetry(phaseRows), - ECC: diffECCCounters(eccBefore, eccAfter), } - for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles { + for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles { if p.Supported { phase.TeraOpsPerSec += p.TeraOpsPerSec phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec @@ -278,13 +327,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } beforeThrottle, _ := queryThrottleCounters(idx) - steadyCmd := []string{ - "bee-gpu-burn", - "--seconds", strconv.Itoa(perPhaseSec), - "--size-mb", strconv.Itoa(opts.SizeMB), - "--devices", strconv.Itoa(idx), - } - logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec)) + logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec)) // Sample server power via IPMI in parallel with the steady phase. // We collect readings every 5s and average them. @@ -320,9 +363,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } }() - steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, logFunc) - appendBenchmarkMetrics(&metricRows, steadyRows, fmt.Sprintf("gpu-%d-steady", idx)) - appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-steady", idx), steadyOut) close(ipmiStopCh) if loadedW, ok := <-ipmiResultCh; ok { serverLoadedWSum += loadedW @@ -331,11 +371,12 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW)) } afterThrottle, _ := queryThrottleCounters(idx) - if steadyErr != nil { - gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error()) + if planErr != nil { + gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error()) } - parseResult := parseBenchmarkBurnLog(string(steadyOut)) + steadyRows := phaseRowsByStage[fmt.Sprintf("gpu-%d-steady", idx)] + parseResult := parseBenchmarkBurnLog(string(phaseLogs["mixed"])) gpuResult.ComputeCapability = parseResult.ComputeCapability gpuResult.Backend = parseResult.Backend gpuResult.PrecisionResults = parseResult.Profiles @@ -349,17 +390,19 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv gpuResult.ECC = diffECCCounters(eccBase, eccFinal) } - cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx}) - if err != nil && err != context.Canceled { - gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error()) + if spec.CooldownSec > 0 { + cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx}) + if err != nil && err != context.Canceled { + gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error()) + } + gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows) + appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx)) } - gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows) - appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx)) gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) - if steadyErr != nil { - gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr) + if planErr != nil { + gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) } else if parseResult.Fallback { gpuResult.Status = "PARTIAL" } else { @@ -462,11 +505,11 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv func resolveBenchmarkProfile(profile string) benchmarkProfileSpec { switch strings.TrimSpace(strings.ToLower(profile)) { case NvidiaBenchmarkProfileStability: - return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300} + return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0} case NvidiaBenchmarkProfileOvernight: - return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300} + return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0} default: - return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120} + return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0} } } @@ -795,6 +838,66 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string return out, metricRows, err } +type benchmarkPlannedPhase struct { + PlanLabel string + MetricStage string + DurationSec int +} + +func runBenchmarkPlannedCommandWithMetrics( + ctx context.Context, + verboseLog, name string, + cmd []string, + env []string, + gpuIndices []int, + phases []benchmarkPlannedPhase, + logFunc func(string), +) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) { + out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc) + return out, splitBenchmarkRowsByPlannedPhase(rows, phases), splitBenchmarkLogByPlannedPhase(out), err +} + +func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow { + out := make(map[string][]GPUMetricRow, len(phases)) + if len(rows) == 0 || len(phases) == 0 { + return out + } + for _, row := range rows { + idx := len(phases) - 1 + var elapsed float64 + for i, phase := range phases { + durationSec := phase.DurationSec + if durationSec <= 0 { + durationSec = 1 + } + elapsed += float64(durationSec) + if row.ElapsedSec < elapsed { + idx = i + break + } + } + out[phases[idx].MetricStage] = append(out[phases[idx].MetricStage], row) + } + return out +} + +func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte { + out := make(map[string][]byte) + var current string + for _, line := range strings.Split(strings.ReplaceAll(string(raw), "\r\n", "\n"), "\n") { + trimmed := strings.TrimSpace(stripBenchmarkPrefix(line)) + switch { + case strings.HasPrefix(trimmed, "phase_begin="): + current = strings.TrimSpace(strings.TrimPrefix(trimmed, "phase_begin=")) + case strings.HasPrefix(trimmed, "phase_end="): + current = "" + case current != "": + out[current] = append(out[current], []byte(line+"\n")...) + } + } + return out +} + type benchmarkCoolingSample struct { AvgFanRPM float64 AvgFanDutyCyclePct float64 @@ -968,6 +1071,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri category = "fp32_tf32" case strings.HasPrefix(name, "fp16"): category = "fp16_bf16" + case strings.HasPrefix(name, "int8"): + category = "int8" case strings.HasPrefix(name, "fp8"): category = "fp8" case strings.HasPrefix(name, "fp4"): @@ -985,6 +1090,7 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri // fp64 = 2.0 — double precision, 2× more bits per operand // fp32 = 1.0 — single precision baseline // fp16 = 0.5 — half precision +// int8 = 0.25 — quarter precision // fp8 = 0.25 — quarter precision // fp4 = 0.125 — eighth precision // @@ -998,6 +1104,8 @@ func precisionWeight(category string) float64 { return 1.0 case "fp16_bf16": return 0.5 + case "int8": + return 0.25 case "fp8": return 0.25 case "fp4": @@ -1861,41 +1969,41 @@ func runNvidiaBenchmarkParallel( } } - // ── Per-precision stability phases (parallel) ───────────────────────────── - totalSlots := len(benchmarkPrecisionPhases) + 1 - perPhaseSec := spec.SteadySec / totalSlots - if perPhaseSec < 60 { - perPhaseSec = 60 - } + // Run synthetic precision phases and the combined steady phase as one + // uninterrupted command so the GPUs stay hot between windows. eccBase := make(map[int]BenchmarkECCCounters, len(selected)) for _, idx := range selected { eccBase[idx], _ = queryECCCounters(idx) } + planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string { + if label == "mixed" { + return "steady" + } + return "gpu-all-steady-" + label + }) + planCmd := []string{ + "bee-gpu-burn", + "--seconds", strconv.Itoa(basePhaseSec), + "--size-mb", strconv.Itoa(opts.SizeMB), + "--devices", allDevices, + "--precision-plan", strings.Join(planLabels, ","), + "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), + } + logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec)) + _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc) + for _, phaseSpec := range planPhases { + if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { + appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage) + } + appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel]) + } for _, prec := range benchmarkPrecisionPhases { - phaseCmd := []string{ - "bee-gpu-burn", - "--seconds", strconv.Itoa(perPhaseSec), - "--size-mb", strconv.Itoa(opts.SizeMB), - "--devices", allDevices, - "--precision", prec, - } - logFunc(fmt.Sprintf("GPUs %s: %s stability phase (%ds)", allDevices, prec, perPhaseSec)) phaseLogName := "gpu-all-steady-" + prec - eccBeforePhase := make(map[int]BenchmarkECCCounters, len(selected)) - for _, idx := range selected { - eccBeforePhase[idx], _ = queryECCCounters(idx) - } - phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, logFunc) - appendBenchmarkMetrics(allMetricRows, phaseRows, phaseLogName) - appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut) - eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected)) - for _, idx := range selected { - eccAfterPhase[idx], _ = queryECCCounters(idx) - } - if phaseErr != nil || len(phaseRows) == 0 { + phaseRows := phaseRowsByStage[phaseLogName] + if len(phaseRows) == 0 { continue } - parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseOut)) + parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec])) for _, idx := range selected { perGPU := filterRowsByGPU(phaseRows, idx) if len(perGPU) == 0 { @@ -1904,7 +2012,6 @@ func runNvidiaBenchmarkParallel( phase := BenchmarkPrecisionSteadyPhase{ Precision: prec, Steady: summarizeBenchmarkTelemetry(perGPU), - ECC: diffECCCounters(eccBeforePhase[idx], eccAfterPhase[idx]), } if pr, ok := parseByGPU[idx]; ok { for _, p := range pr.Profiles { @@ -1924,14 +2031,7 @@ func runNvidiaBenchmarkParallel( beforeThrottle[idx], _ = queryThrottleCounters(idx) } - // Steady: all GPUs simultaneously (combined). Fixed at one slot = perPhaseSec. - steadyCmd := []string{ - "bee-gpu-burn", - "--seconds", strconv.Itoa(perPhaseSec), - "--size-mb", strconv.Itoa(opts.SizeMB), - "--devices", allDevices, - } - logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, perPhaseSec)) + logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec)) // Sample server power via IPMI in parallel with steady phase. ipmiStopCh := make(chan struct{}) @@ -1965,9 +2065,6 @@ func runNvidiaBenchmarkParallel( } }() - steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, logFunc) - appendBenchmarkMetrics(allMetricRows, steadyRows, "steady") - appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "steady", steadyOut) close(ipmiStopCh) if loadedW, ok := <-ipmiResultCh; ok { *serverLoadedWSum += loadedW @@ -1980,7 +2077,8 @@ func runNvidiaBenchmarkParallel( afterThrottle[idx], _ = queryThrottleCounters(idx) } - parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut)) + steadyRows := phaseRowsByStage["steady"] + parseResults := parseBenchmarkBurnLogByGPU(string(phaseLogs["mixed"])) for _, idx := range selected { perGPU := filterRowsByGPU(steadyRows, idx) @@ -1998,23 +2096,25 @@ func runNvidiaBenchmarkParallel( gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable") } } - if steadyErr != nil { - gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error()) + if planErr != nil { + gpuResults[idx].Notes = append(gpuResults[idx].Notes, "precision plan failed: "+planErr.Error()) } } // Cooldown: all GPUs together. - cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected) - if err != nil && err != context.Canceled { - for _, idx := range selected { - gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error()) + if spec.CooldownSec > 0 { + cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected) + if err != nil && err != context.Canceled { + for _, idx := range selected { + gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error()) + } } + for _, idx := range selected { + perGPU := filterRowsByGPU(cooldownRows, idx) + gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU) + } + appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown") } - for _, idx := range selected { - perGPU := filterRowsByGPU(cooldownRows, idx) - gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU) - } - appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown") // Score and finalize each GPU. for _, idx := range selected { @@ -2023,8 +2123,8 @@ func runNvidiaBenchmarkParallel( r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status) pr := parseResults[idx] switch { - case steadyErr != nil: - r.Status = classifySATErrorStatus(steadyOut, steadyErr) + case planErr != nil: + r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) case pr.Fallback: r.Status = "PARTIAL" default: @@ -2213,7 +2313,7 @@ func runBenchmarkPowerCalibration( gpuIndices []int, logFunc func(string), ) map[int]float64 { - const calibDurationSec = 45 + const calibDurationSec = 120 // dcgmi must be present. if _, err := exec.LookPath("dcgmi"); err != nil { diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index 0b66d92..b75fa5d 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -88,10 +88,10 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n") b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n") b.WriteString("**Compute score** is derived from two phases:\n\n") - b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ") + b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ") b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ") b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ") - b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n") + b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n") b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ") b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n") b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n") diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go index 1b2f08a..57219d9 100644 --- a/audit/internal/platform/benchmark_test.go +++ b/audit/internal/platform/benchmark_test.go @@ -16,17 +16,17 @@ func TestResolveBenchmarkProfile(t *testing.T) { { name: "default", profile: "", - want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}, + want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0}, }, { name: "stability", profile: "stability", - want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}, + want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0}, }, { name: "overnight", profile: "overnight", - want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}, + want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0}, }, } @@ -41,6 +41,92 @@ func TestResolveBenchmarkProfile(t *testing.T) { } } +func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) { + t.Parallel() + + labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( + benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480}, + func(label string) string { return label }, + ) + if len(labels) != 7 || len(phases) != 7 { + t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases)) + } + if basePhaseSec != 60 { + t.Fatalf("basePhaseSec=%d want 60", basePhaseSec) + } + if mixedPhaseSec != 300 { + t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec) + } + if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 { + t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1]) + } + if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" { + t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases)) + } +} + +func TestBuildBenchmarkSteadyPlanStability(t *testing.T) { + t.Parallel() + + _, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( + benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600}, + func(label string) string { return label }, + ) + if basePhaseSec != 300 { + t.Fatalf("basePhaseSec=%d want 300", basePhaseSec) + } + if mixedPhaseSec != 3600 { + t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec) + } + if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" { + t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases)) + } +} + +func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) { + t.Parallel() + + _, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( + benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000}, + func(label string) string { return label }, + ) + if basePhaseSec != 3600 { + t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec) + } + if mixedPhaseSec != 14400 { + t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec) + } + if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" { + t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases)) + } +} + +func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) { + t.Parallel() + + phases := []benchmarkPlannedPhase{ + {PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10}, + {PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10}, + {PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50}, + } + rows := []GPUMetricRow{ + {ElapsedSec: 5}, + {ElapsedSec: 15}, + {ElapsedSec: 25}, + {ElapsedSec: 65}, + } + got := splitBenchmarkRowsByPlannedPhase(rows, phases) + if len(got["fp8"]) != 1 { + t.Fatalf("fp8 rows=%d want 1", len(got["fp8"])) + } + if len(got["fp16"]) != 1 { + t.Fatalf("fp16 rows=%d want 1", len(got["fp16"])) + } + if len(got["mixed"]) != 2 { + t.Fatalf("mixed rows=%d want 2", len(got["mixed"])) + } +} + func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) { t.Parallel() @@ -65,8 +151,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) { "[gpu 0] compute_capability=9.0", "[gpu 0] backend=cublasLt", "[gpu 0] duration_s=10", + "[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0", "[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0", "[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0", + "[gpu 0] int8_tensor_iterations=80", "[gpu 0] fp16_tensor_iterations=200", "[gpu 0] fp8_e4m3_iterations=50", "[gpu 0] status=OK", @@ -79,15 +167,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) { if got.ComputeCapability != "9.0" { t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability) } - if len(got.Profiles) != 2 { - t.Fatalf("profiles=%d want 2", len(got.Profiles)) + if len(got.Profiles) != 3 { + t.Fatalf("profiles=%d want 3", len(got.Profiles)) } if got.Profiles[0].TeraOpsPerSec <= 0 { t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec) } + if got.Profiles[0].Category != "fp16_bf16" { + t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category) + } if got.Profiles[1].Category != "fp8" { t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category) } + if got.Profiles[2].Category != "int8" { + t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category) + } + if got.Profiles[2].Weight != 0.25 { + t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight) + } } func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) { diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index bb0690b..2c1544b 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -179,7 +179,7 @@ type BenchmarkPrecisionResult struct { Iterations uint64 `json:"iterations,omitempty"` TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"` // Weight is the fp32-equivalence factor for this precision category. - // fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125. + // fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125. // WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput. Weight float64 `json:"weight,omitempty"` WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"` diff --git a/iso/builder/bee-gpu-stress.c b/iso/builder/bee-gpu-stress.c index 65f0674..3bba297 100644 --- a/iso/builder/bee-gpu-stress.c +++ b/iso/builder/bee-gpu-stress.c @@ -642,6 +642,20 @@ static const struct profile_desc k_profiles[] = { CUDA_R_16F, CUBLAS_COMPUTE_32F_FAST_16F, }, + { + "int8_tensor", + "int8", + 75, + 1, + 0, + 0, + 128, + CUDA_R_8I, + CUDA_R_8I, + CUDA_R_32I, + CUDA_R_32I, + CUBLAS_COMPUTE_32I, + }, { "fp8_e4m3", "fp8", @@ -760,10 +774,12 @@ static int check_cublas(const char *step, cublasStatus_t status) { static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) { switch (type) { case CUDA_R_32F: + case CUDA_R_32I: return (size_t)(elements * 4u); case CUDA_R_16F: case CUDA_R_16BF: return (size_t)(elements * 2u); + case CUDA_R_8I: case CUDA_R_8F_E4M3: case CUDA_R_8F_E5M2: return (size_t)(elements); @@ -776,6 +792,13 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) { } } +static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) { + if (desc->compute_type == CUBLAS_COMPUTE_32I) { + return CUDA_R_32I; + } + return CUDA_R_32F; +} + static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) { uint64_t row_tiles = (rows + 127u) / 128u; uint64_t col_tiles = (cols + 63u) / 64u; @@ -944,8 +967,9 @@ static int prepare_profile(struct cublaslt_api *cublas, return 0; } + cudaDataType_t scale_type = matmul_scale_type(desc); if (!check_cublas("cublasLtMatmulDescCreate", - cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) { + cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) { destroy_profile(cublas, cuda, out); return 0; } @@ -1094,17 +1118,25 @@ static int prepare_profile(struct cublaslt_api *cublas, static int run_cublas_profile(cublasLtHandle_t handle, struct cublaslt_api *cublas, struct prepared_profile *profile) { + int32_t alpha_i32 = 1; + int32_t beta_i32 = 0; float alpha = 1.0f; float beta = 0.0f; + const void *alpha_ptr = α + const void *beta_ptr = β + if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) { + alpha_ptr = &alpha_i32; + beta_ptr = &beta_i32; + } return check_cublas(profile->desc.name, cublas->cublasLtMatmul(handle, profile->op_desc, - &alpha, + alpha_ptr, (const void *)(uintptr_t)profile->a_dev, profile->a_layout, (const void *)(uintptr_t)profile->b_dev, profile->b_layout, - &beta, + beta_ptr, (const void *)(uintptr_t)profile->c_dev, profile->c_layout, (void *)(uintptr_t)profile->d_dev, @@ -1359,11 +1391,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda, } #endif +static void print_stress_report(const struct stress_report *report, int device_index, int seconds) { + printf("device=%s\n", report->device); + printf("device_index=%d\n", device_index); + printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor); + printf("backend=%s\n", report->backend); + printf("duration_s=%d\n", seconds); + printf("buffer_mb=%d\n", report->buffer_mb); + printf("streams=%d\n", report->stream_count); + printf("iterations=%lu\n", report->iterations); + printf("checksum=%llu\n", (unsigned long long)report->checksum); + if (report->details[0] != '\0') { + printf("%s", report->details); + } + printf("status=OK\n"); +} + int main(int argc, char **argv) { int seconds = 5; int size_mb = 64; int device_index = 0; const char *precision_filter = NULL; /* NULL = all; else block_label to match */ + const char *precision_plan = NULL; + const char *precision_plan_seconds = NULL; for (int i = 1; i < argc; i++) { if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) { seconds = atoi(argv[++i]); @@ -1373,9 +1423,13 @@ int main(int argc, char **argv) { device_index = atoi(argv[++i]); } else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) { precision_filter = argv[++i]; + } else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) { + precision_plan = argv[++i]; + } else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) { + precision_plan_seconds = argv[++i]; } else { fprintf(stderr, - "usage: %s [--seconds N] [--size-mb N] [--device N] [--precision fp8|fp16|fp32|fp64|fp4]\n", + "usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n", argv[0]); return 2; } @@ -1436,6 +1490,76 @@ int main(int argc, char **argv) { int ok = 0; #if HAVE_CUBLASLT_HEADERS + if (precision_plan != NULL && precision_plan[0] != '\0') { + char *plan_copy = strdup(precision_plan); + char *plan_seconds_copy = NULL; + int phase_seconds[32] = {0}; + int phase_seconds_count = 0; + int phase_ok = 0; + if (plan_copy == NULL) { + fprintf(stderr, "failed to allocate precision plan buffer\n"); + return 1; + } + if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') { + plan_seconds_copy = strdup(precision_plan_seconds); + if (plan_seconds_copy == NULL) { + free(plan_copy); + fprintf(stderr, "failed to allocate precision plan seconds buffer\n"); + return 1; + } + for (char *sec_token = strtok(plan_seconds_copy, ","); + sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0])); + sec_token = strtok(NULL, ",")) { + while (*sec_token == ' ' || *sec_token == '\t') { + sec_token++; + } + if (*sec_token == '\0') { + continue; + } + phase_seconds[phase_seconds_count++] = atoi(sec_token); + } + } + int phase_idx = 0; + for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) { + while (*token == ' ' || *token == '\t') { + token++; + } + if (*token == '\0') { + continue; + } + const char *phase_name = token; + const char *phase_filter = token; + if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) { + phase_filter = NULL; + } + int phase_duration = seconds; + if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) { + phase_duration = phase_seconds[phase_idx]; + } + printf("phase_begin=%s\n", phase_name); + fflush(stdout); + memset(&report, 0, sizeof(report)); + ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report); + if (ok) { + print_stress_report(&report, device_index, phase_duration); + phase_ok = 1; + } else { + printf("phase_error=%s\n", phase_name); + if (report.details[0] != '\0') { + printf("%s", report.details); + if (report.details[strlen(report.details) - 1] != '\n') { + printf("\n"); + } + } + printf("status=FAILED\n"); + } + printf("phase_end=%s\n", phase_name); + fflush(stdout); + } + free(plan_seconds_copy); + free(plan_copy); + return phase_ok ? 0 : 1; + } ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report); #endif if (!ok) { @@ -1454,18 +1578,6 @@ int main(int argc, char **argv) { } } - printf("device=%s\n", report.device); - printf("device_index=%d\n", device_index); - printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor); - printf("backend=%s\n", report.backend); - printf("duration_s=%d\n", seconds); - printf("buffer_mb=%d\n", report.buffer_mb); - printf("streams=%d\n", report.stream_count); - printf("iterations=%lu\n", report.iterations); - printf("checksum=%llu\n", (unsigned long long)report.checksum); - if (report.details[0] != '\0') { - printf("%s", report.details); - } - printf("status=OK\n"); + print_stress_report(&report, device_index, seconds); return 0; } diff --git a/iso/overlay/usr/local/bin/bee-gpu-burn b/iso/overlay/usr/local/bin/bee-gpu-burn index d736022..7b6018c 100755 --- a/iso/overlay/usr/local/bin/bee-gpu-burn +++ b/iso/overlay/usr/local/bin/bee-gpu-burn @@ -7,10 +7,12 @@ SIZE_MB=0 DEVICES="" EXCLUDE="" PRECISION="" +PRECISION_PLAN="" +PRECISION_PLAN_SECONDS="" WORKER="/usr/local/lib/bee/bee-gpu-burn-worker" usage() { - echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision fp8|fp16|fp32|fp64|fp4]" >&2 + echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]" >&2 exit 2 } @@ -32,6 +34,8 @@ while [ "$#" -gt 0 ]; do --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;; --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;; --precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;; + --precision-plan) [ "$#" -ge 2 ] || usage; PRECISION_PLAN="$2"; shift 2 ;; + --precision-plan-seconds) [ "$#" -ge 2 ] || usage; PRECISION_PLAN_SECONDS="$2"; shift 2 ;; *) usage ;; esac done @@ -92,8 +96,12 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}" precision_arg="" [ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}" + precision_plan_arg="" + [ -n "${PRECISION_PLAN}" ] && precision_plan_arg="--precision-plan ${PRECISION_PLAN}" + precision_plan_seconds_arg="" + [ -n "${PRECISION_PLAN_SECONDS}" ] && precision_plan_seconds_arg="--precision-plan-seconds ${PRECISION_PLAN_SECONDS}" CUDA_VISIBLE_DEVICES="${id}" \ - "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} >"${log}" 2>&1 & + "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} ${precision_plan_arg} ${precision_plan_seconds_arg} >"${log}" 2>&1 & pid=$! WORKERS="${WORKERS} ${pid}:${id}:${log}" if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then