Refine NVIDIA benchmark phase timing
This commit is contained in:
@@ -76,7 +76,56 @@ var (
|
|||||||
// benchmarkPrecisionPhases lists the precision categories run as individual
|
// benchmarkPrecisionPhases lists the precision categories run as individual
|
||||||
// steady-state windows before the combined steady pass. Order is from lowest
|
// steady-state windows before the combined steady pass. Order is from lowest
|
||||||
// to highest power draw so thermal ramp-up is gradual.
|
// to highest power draw so thermal ramp-up is gradual.
|
||||||
var benchmarkPrecisionPhases = []string{"fp8", "fp16", "fp32", "fp64", "fp4"}
|
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
|
||||||
|
|
||||||
|
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
|
||||||
|
switch spec.Name {
|
||||||
|
case NvidiaBenchmarkProfileStandard:
|
||||||
|
basePhaseSec = 60
|
||||||
|
mixedPhaseSec = 300
|
||||||
|
case NvidiaBenchmarkProfileStability:
|
||||||
|
basePhaseSec = 300
|
||||||
|
mixedPhaseSec = 3600
|
||||||
|
case NvidiaBenchmarkProfileOvernight:
|
||||||
|
basePhaseSec = 3600
|
||||||
|
mixedPhaseSec = 14400
|
||||||
|
default:
|
||||||
|
totalWeight := len(benchmarkPrecisionPhases) + 5
|
||||||
|
if totalWeight <= 0 {
|
||||||
|
return nil, nil, 0, 0
|
||||||
|
}
|
||||||
|
basePhaseSec = spec.SteadySec / totalWeight
|
||||||
|
if basePhaseSec <= 0 {
|
||||||
|
basePhaseSec = 1
|
||||||
|
}
|
||||||
|
mixedPhaseSec = basePhaseSec * 5
|
||||||
|
}
|
||||||
|
planLabels = make([]string, 0, len(benchmarkPrecisionPhases)+1)
|
||||||
|
planPhases = make([]benchmarkPlannedPhase, 0, len(benchmarkPrecisionPhases)+1)
|
||||||
|
for _, prec := range benchmarkPrecisionPhases {
|
||||||
|
planLabels = append(planLabels, prec)
|
||||||
|
planPhases = append(planPhases, benchmarkPlannedPhase{
|
||||||
|
PlanLabel: prec,
|
||||||
|
MetricStage: metricStage(prec),
|
||||||
|
DurationSec: basePhaseSec,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
planLabels = append(planLabels, "mixed")
|
||||||
|
planPhases = append(planPhases, benchmarkPlannedPhase{
|
||||||
|
PlanLabel: "mixed",
|
||||||
|
MetricStage: metricStage("mixed"),
|
||||||
|
DurationSec: mixedPhaseSec,
|
||||||
|
})
|
||||||
|
return planLabels, planPhases, basePhaseSec, mixedPhaseSec
|
||||||
|
}
|
||||||
|
|
||||||
|
func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
|
||||||
|
values := make([]string, 0, len(phases))
|
||||||
|
for _, phase := range phases {
|
||||||
|
values = append(values, strconv.Itoa(phase.DurationSec))
|
||||||
|
}
|
||||||
|
return strings.Join(values, ",")
|
||||||
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
if ctx == nil {
|
if ctx == nil {
|
||||||
@@ -233,42 +282,42 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Per-precision stability phases ────────────────────────────────────────
|
// Run synthetic precision phases and the combined steady phase as one
|
||||||
// Run each precision category alone so PowerCVPct reflects genuine GPU
|
// uninterrupted command so the GPU stays hot between windows.
|
||||||
// power stability, not kernel-mix variance.
|
|
||||||
// Time budget: each phase gets steadySec/numPhases, minimum 60 s.
|
|
||||||
// SteadySec is split equally across all precision phases + 1 combined slot.
|
|
||||||
// Skipped phases (unsupported precision) are simply omitted; combined is fixed.
|
|
||||||
totalSlots := len(benchmarkPrecisionPhases) + 1
|
|
||||||
perPhaseSec := spec.SteadySec / totalSlots
|
|
||||||
if perPhaseSec < 60 {
|
|
||||||
perPhaseSec = 60
|
|
||||||
}
|
|
||||||
eccBase, _ := queryECCCounters(idx)
|
eccBase, _ := queryECCCounters(idx)
|
||||||
for _, prec := range benchmarkPrecisionPhases {
|
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string {
|
||||||
phaseCmd := []string{
|
if label == "mixed" {
|
||||||
|
return fmt.Sprintf("gpu-%d-steady", idx)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("gpu-%d-steady-%s", idx, label)
|
||||||
|
})
|
||||||
|
planCmd := []string{
|
||||||
"bee-gpu-burn",
|
"bee-gpu-burn",
|
||||||
"--seconds", strconv.Itoa(perPhaseSec),
|
"--seconds", strconv.Itoa(basePhaseSec),
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
"--devices", strconv.Itoa(idx),
|
"--devices", strconv.Itoa(idx),
|
||||||
"--precision", prec,
|
"--precision-plan", strings.Join(planLabels, ","),
|
||||||
|
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
|
||||||
}
|
}
|
||||||
logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec))
|
logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec))
|
||||||
phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
|
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
|
||||||
eccBefore, _ := queryECCCounters(idx)
|
for _, phaseSpec := range planPhases {
|
||||||
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, logFunc)
|
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
|
||||||
appendBenchmarkMetrics(&metricRows, phaseRows, phaseLogName)
|
appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage)
|
||||||
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
|
}
|
||||||
eccAfter, _ := queryECCCounters(idx)
|
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
|
||||||
if phaseErr != nil || len(phaseRows) == 0 {
|
}
|
||||||
|
for _, prec := range benchmarkPrecisionPhases {
|
||||||
|
stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
|
||||||
|
phaseRows := phaseRowsByStage[stageName]
|
||||||
|
if len(phaseRows) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
phase := BenchmarkPrecisionSteadyPhase{
|
phase := BenchmarkPrecisionSteadyPhase{
|
||||||
Precision: prec,
|
Precision: prec,
|
||||||
Steady: summarizeBenchmarkTelemetry(phaseRows),
|
Steady: summarizeBenchmarkTelemetry(phaseRows),
|
||||||
ECC: diffECCCounters(eccBefore, eccAfter),
|
|
||||||
}
|
}
|
||||||
for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles {
|
for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
|
||||||
if p.Supported {
|
if p.Supported {
|
||||||
phase.TeraOpsPerSec += p.TeraOpsPerSec
|
phase.TeraOpsPerSec += p.TeraOpsPerSec
|
||||||
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
|
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
|
||||||
@@ -278,13 +327,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
|
|
||||||
beforeThrottle, _ := queryThrottleCounters(idx)
|
beforeThrottle, _ := queryThrottleCounters(idx)
|
||||||
steadyCmd := []string{
|
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
|
||||||
"bee-gpu-burn",
|
|
||||||
"--seconds", strconv.Itoa(perPhaseSec),
|
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
|
||||||
"--devices", strconv.Itoa(idx),
|
|
||||||
}
|
|
||||||
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec))
|
|
||||||
|
|
||||||
// Sample server power via IPMI in parallel with the steady phase.
|
// Sample server power via IPMI in parallel with the steady phase.
|
||||||
// We collect readings every 5s and average them.
|
// We collect readings every 5s and average them.
|
||||||
@@ -320,9 +363,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, logFunc)
|
|
||||||
appendBenchmarkMetrics(&metricRows, steadyRows, fmt.Sprintf("gpu-%d-steady", idx))
|
|
||||||
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-steady", idx), steadyOut)
|
|
||||||
close(ipmiStopCh)
|
close(ipmiStopCh)
|
||||||
if loadedW, ok := <-ipmiResultCh; ok {
|
if loadedW, ok := <-ipmiResultCh; ok {
|
||||||
serverLoadedWSum += loadedW
|
serverLoadedWSum += loadedW
|
||||||
@@ -331,11 +371,12 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
|
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
|
||||||
}
|
}
|
||||||
afterThrottle, _ := queryThrottleCounters(idx)
|
afterThrottle, _ := queryThrottleCounters(idx)
|
||||||
if steadyErr != nil {
|
if planErr != nil {
|
||||||
gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
|
gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
|
||||||
}
|
}
|
||||||
|
|
||||||
parseResult := parseBenchmarkBurnLog(string(steadyOut))
|
steadyRows := phaseRowsByStage[fmt.Sprintf("gpu-%d-steady", idx)]
|
||||||
|
parseResult := parseBenchmarkBurnLog(string(phaseLogs["mixed"]))
|
||||||
gpuResult.ComputeCapability = parseResult.ComputeCapability
|
gpuResult.ComputeCapability = parseResult.ComputeCapability
|
||||||
gpuResult.Backend = parseResult.Backend
|
gpuResult.Backend = parseResult.Backend
|
||||||
gpuResult.PrecisionResults = parseResult.Profiles
|
gpuResult.PrecisionResults = parseResult.Profiles
|
||||||
@@ -349,17 +390,19 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
|
gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if spec.CooldownSec > 0 {
|
||||||
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
|
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
|
||||||
if err != nil && err != context.Canceled {
|
if err != nil && err != context.Canceled {
|
||||||
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
|
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
|
||||||
}
|
}
|
||||||
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
|
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
|
||||||
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
|
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
|
||||||
|
}
|
||||||
|
|
||||||
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
|
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
|
||||||
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
|
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
|
||||||
if steadyErr != nil {
|
if planErr != nil {
|
||||||
gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
|
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
|
||||||
} else if parseResult.Fallback {
|
} else if parseResult.Fallback {
|
||||||
gpuResult.Status = "PARTIAL"
|
gpuResult.Status = "PARTIAL"
|
||||||
} else {
|
} else {
|
||||||
@@ -462,11 +505,11 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
|
|||||||
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
||||||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||||||
case NvidiaBenchmarkProfileStability:
|
case NvidiaBenchmarkProfileStability:
|
||||||
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}
|
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0}
|
||||||
case NvidiaBenchmarkProfileOvernight:
|
case NvidiaBenchmarkProfileOvernight:
|
||||||
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}
|
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0}
|
||||||
default:
|
default:
|
||||||
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}
|
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -795,6 +838,66 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string
|
|||||||
return out, metricRows, err
|
return out, metricRows, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// benchmarkPlannedPhase describes one window of a multi-phase benchmark
// command.
type benchmarkPlannedPhase struct {
	// PlanLabel names the phase inside the plan: a precision name or "mixed".
	PlanLabel string
	// MetricStage is the stage name under which the phase's metric rows are
	// grouped.
	MetricStage string
	// DurationSec is the planned length of the phase in seconds.
	DurationSec int
}
|
||||||
|
|
||||||
|
func runBenchmarkPlannedCommandWithMetrics(
|
||||||
|
ctx context.Context,
|
||||||
|
verboseLog, name string,
|
||||||
|
cmd []string,
|
||||||
|
env []string,
|
||||||
|
gpuIndices []int,
|
||||||
|
phases []benchmarkPlannedPhase,
|
||||||
|
logFunc func(string),
|
||||||
|
) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) {
|
||||||
|
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc)
|
||||||
|
return out, splitBenchmarkRowsByPlannedPhase(rows, phases), splitBenchmarkLogByPlannedPhase(out), err
|
||||||
|
}
|
||||||
|
|
||||||
|
func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow {
|
||||||
|
out := make(map[string][]GPUMetricRow, len(phases))
|
||||||
|
if len(rows) == 0 || len(phases) == 0 {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
for _, row := range rows {
|
||||||
|
idx := len(phases) - 1
|
||||||
|
var elapsed float64
|
||||||
|
for i, phase := range phases {
|
||||||
|
durationSec := phase.DurationSec
|
||||||
|
if durationSec <= 0 {
|
||||||
|
durationSec = 1
|
||||||
|
}
|
||||||
|
elapsed += float64(durationSec)
|
||||||
|
if row.ElapsedSec < elapsed {
|
||||||
|
idx = i
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out[phases[idx].MetricStage] = append(out[phases[idx].MetricStage], row)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte {
|
||||||
|
out := make(map[string][]byte)
|
||||||
|
var current string
|
||||||
|
for _, line := range strings.Split(strings.ReplaceAll(string(raw), "\r\n", "\n"), "\n") {
|
||||||
|
trimmed := strings.TrimSpace(stripBenchmarkPrefix(line))
|
||||||
|
switch {
|
||||||
|
case strings.HasPrefix(trimmed, "phase_begin="):
|
||||||
|
current = strings.TrimSpace(strings.TrimPrefix(trimmed, "phase_begin="))
|
||||||
|
case strings.HasPrefix(trimmed, "phase_end="):
|
||||||
|
current = ""
|
||||||
|
case current != "":
|
||||||
|
out[current] = append(out[current], []byte(line+"\n")...)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
type benchmarkCoolingSample struct {
|
type benchmarkCoolingSample struct {
|
||||||
AvgFanRPM float64
|
AvgFanRPM float64
|
||||||
AvgFanDutyCyclePct float64
|
AvgFanDutyCyclePct float64
|
||||||
@@ -968,6 +1071,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
|
|||||||
category = "fp32_tf32"
|
category = "fp32_tf32"
|
||||||
case strings.HasPrefix(name, "fp16"):
|
case strings.HasPrefix(name, "fp16"):
|
||||||
category = "fp16_bf16"
|
category = "fp16_bf16"
|
||||||
|
case strings.HasPrefix(name, "int8"):
|
||||||
|
category = "int8"
|
||||||
case strings.HasPrefix(name, "fp8"):
|
case strings.HasPrefix(name, "fp8"):
|
||||||
category = "fp8"
|
category = "fp8"
|
||||||
case strings.HasPrefix(name, "fp4"):
|
case strings.HasPrefix(name, "fp4"):
|
||||||
@@ -985,6 +1090,7 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
|
|||||||
// fp64 = 2.0 — double precision, 2× more bits per operand
|
// fp64 = 2.0 — double precision, 2× more bits per operand
|
||||||
// fp32 = 1.0 — single precision baseline
|
// fp32 = 1.0 — single precision baseline
|
||||||
// fp16 = 0.5 — half precision
|
// fp16 = 0.5 — half precision
|
||||||
|
// int8 = 0.25 — quarter precision
|
||||||
// fp8 = 0.25 — quarter precision
|
// fp8 = 0.25 — quarter precision
|
||||||
// fp4 = 0.125 — eighth precision
|
// fp4 = 0.125 — eighth precision
|
||||||
//
|
//
|
||||||
@@ -998,6 +1104,8 @@ func precisionWeight(category string) float64 {
|
|||||||
return 1.0
|
return 1.0
|
||||||
case "fp16_bf16":
|
case "fp16_bf16":
|
||||||
return 0.5
|
return 0.5
|
||||||
|
case "int8":
|
||||||
|
return 0.25
|
||||||
case "fp8":
|
case "fp8":
|
||||||
return 0.25
|
return 0.25
|
||||||
case "fp4":
|
case "fp4":
|
||||||
@@ -1861,41 +1969,41 @@ func runNvidiaBenchmarkParallel(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Per-precision stability phases (parallel) ─────────────────────────────
|
// Run synthetic precision phases and the combined steady phase as one
|
||||||
totalSlots := len(benchmarkPrecisionPhases) + 1
|
// uninterrupted command so the GPUs stay hot between windows.
|
||||||
perPhaseSec := spec.SteadySec / totalSlots
|
|
||||||
if perPhaseSec < 60 {
|
|
||||||
perPhaseSec = 60
|
|
||||||
}
|
|
||||||
eccBase := make(map[int]BenchmarkECCCounters, len(selected))
|
eccBase := make(map[int]BenchmarkECCCounters, len(selected))
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
eccBase[idx], _ = queryECCCounters(idx)
|
eccBase[idx], _ = queryECCCounters(idx)
|
||||||
}
|
}
|
||||||
for _, prec := range benchmarkPrecisionPhases {
|
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string {
|
||||||
phaseCmd := []string{
|
if label == "mixed" {
|
||||||
|
return "steady"
|
||||||
|
}
|
||||||
|
return "gpu-all-steady-" + label
|
||||||
|
})
|
||||||
|
planCmd := []string{
|
||||||
"bee-gpu-burn",
|
"bee-gpu-burn",
|
||||||
"--seconds", strconv.Itoa(perPhaseSec),
|
"--seconds", strconv.Itoa(basePhaseSec),
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
"--devices", allDevices,
|
"--devices", allDevices,
|
||||||
"--precision", prec,
|
"--precision-plan", strings.Join(planLabels, ","),
|
||||||
|
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
|
||||||
}
|
}
|
||||||
logFunc(fmt.Sprintf("GPUs %s: %s stability phase (%ds)", allDevices, prec, perPhaseSec))
|
logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec))
|
||||||
|
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
|
||||||
|
for _, phaseSpec := range planPhases {
|
||||||
|
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
|
||||||
|
appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage)
|
||||||
|
}
|
||||||
|
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
|
||||||
|
}
|
||||||
|
for _, prec := range benchmarkPrecisionPhases {
|
||||||
phaseLogName := "gpu-all-steady-" + prec
|
phaseLogName := "gpu-all-steady-" + prec
|
||||||
eccBeforePhase := make(map[int]BenchmarkECCCounters, len(selected))
|
phaseRows := phaseRowsByStage[phaseLogName]
|
||||||
for _, idx := range selected {
|
if len(phaseRows) == 0 {
|
||||||
eccBeforePhase[idx], _ = queryECCCounters(idx)
|
|
||||||
}
|
|
||||||
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, logFunc)
|
|
||||||
appendBenchmarkMetrics(allMetricRows, phaseRows, phaseLogName)
|
|
||||||
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
|
|
||||||
eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected))
|
|
||||||
for _, idx := range selected {
|
|
||||||
eccAfterPhase[idx], _ = queryECCCounters(idx)
|
|
||||||
}
|
|
||||||
if phaseErr != nil || len(phaseRows) == 0 {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseOut))
|
parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec]))
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
perGPU := filterRowsByGPU(phaseRows, idx)
|
perGPU := filterRowsByGPU(phaseRows, idx)
|
||||||
if len(perGPU) == 0 {
|
if len(perGPU) == 0 {
|
||||||
@@ -1904,7 +2012,6 @@ func runNvidiaBenchmarkParallel(
|
|||||||
phase := BenchmarkPrecisionSteadyPhase{
|
phase := BenchmarkPrecisionSteadyPhase{
|
||||||
Precision: prec,
|
Precision: prec,
|
||||||
Steady: summarizeBenchmarkTelemetry(perGPU),
|
Steady: summarizeBenchmarkTelemetry(perGPU),
|
||||||
ECC: diffECCCounters(eccBeforePhase[idx], eccAfterPhase[idx]),
|
|
||||||
}
|
}
|
||||||
if pr, ok := parseByGPU[idx]; ok {
|
if pr, ok := parseByGPU[idx]; ok {
|
||||||
for _, p := range pr.Profiles {
|
for _, p := range pr.Profiles {
|
||||||
@@ -1924,14 +2031,7 @@ func runNvidiaBenchmarkParallel(
|
|||||||
beforeThrottle[idx], _ = queryThrottleCounters(idx)
|
beforeThrottle[idx], _ = queryThrottleCounters(idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Steady: all GPUs simultaneously (combined). Fixed at one slot = perPhaseSec.
|
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
|
||||||
steadyCmd := []string{
|
|
||||||
"bee-gpu-burn",
|
|
||||||
"--seconds", strconv.Itoa(perPhaseSec),
|
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
|
||||||
"--devices", allDevices,
|
|
||||||
}
|
|
||||||
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, perPhaseSec))
|
|
||||||
|
|
||||||
// Sample server power via IPMI in parallel with steady phase.
|
// Sample server power via IPMI in parallel with steady phase.
|
||||||
ipmiStopCh := make(chan struct{})
|
ipmiStopCh := make(chan struct{})
|
||||||
@@ -1965,9 +2065,6 @@ func runNvidiaBenchmarkParallel(
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, logFunc)
|
|
||||||
appendBenchmarkMetrics(allMetricRows, steadyRows, "steady")
|
|
||||||
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "steady", steadyOut)
|
|
||||||
close(ipmiStopCh)
|
close(ipmiStopCh)
|
||||||
if loadedW, ok := <-ipmiResultCh; ok {
|
if loadedW, ok := <-ipmiResultCh; ok {
|
||||||
*serverLoadedWSum += loadedW
|
*serverLoadedWSum += loadedW
|
||||||
@@ -1980,7 +2077,8 @@ func runNvidiaBenchmarkParallel(
|
|||||||
afterThrottle[idx], _ = queryThrottleCounters(idx)
|
afterThrottle[idx], _ = queryThrottleCounters(idx)
|
||||||
}
|
}
|
||||||
|
|
||||||
parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut))
|
steadyRows := phaseRowsByStage["steady"]
|
||||||
|
parseResults := parseBenchmarkBurnLogByGPU(string(phaseLogs["mixed"]))
|
||||||
|
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
perGPU := filterRowsByGPU(steadyRows, idx)
|
perGPU := filterRowsByGPU(steadyRows, idx)
|
||||||
@@ -1998,12 +2096,13 @@ func runNvidiaBenchmarkParallel(
|
|||||||
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if steadyErr != nil {
|
if planErr != nil {
|
||||||
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error())
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "precision plan failed: "+planErr.Error())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cooldown: all GPUs together.
|
// Cooldown: all GPUs together.
|
||||||
|
if spec.CooldownSec > 0 {
|
||||||
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
|
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
|
||||||
if err != nil && err != context.Canceled {
|
if err != nil && err != context.Canceled {
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
@@ -2015,6 +2114,7 @@ func runNvidiaBenchmarkParallel(
|
|||||||
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
|
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
|
||||||
}
|
}
|
||||||
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
|
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
|
||||||
|
}
|
||||||
|
|
||||||
// Score and finalize each GPU.
|
// Score and finalize each GPU.
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
@@ -2023,8 +2123,8 @@ func runNvidiaBenchmarkParallel(
|
|||||||
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
|
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
|
||||||
pr := parseResults[idx]
|
pr := parseResults[idx]
|
||||||
switch {
|
switch {
|
||||||
case steadyErr != nil:
|
case planErr != nil:
|
||||||
r.Status = classifySATErrorStatus(steadyOut, steadyErr)
|
r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
|
||||||
case pr.Fallback:
|
case pr.Fallback:
|
||||||
r.Status = "PARTIAL"
|
r.Status = "PARTIAL"
|
||||||
default:
|
default:
|
||||||
@@ -2213,7 +2313,7 @@ func runBenchmarkPowerCalibration(
|
|||||||
gpuIndices []int,
|
gpuIndices []int,
|
||||||
logFunc func(string),
|
logFunc func(string),
|
||||||
) map[int]float64 {
|
) map[int]float64 {
|
||||||
const calibDurationSec = 45
|
const calibDurationSec = 120
|
||||||
|
|
||||||
// dcgmi must be present.
|
// dcgmi must be present.
|
||||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||||
|
|||||||
@@ -88,10 +88,10 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
|
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
|
||||||
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
|
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
|
||||||
b.WriteString("**Compute score** is derived from two phases:\n\n")
|
b.WriteString("**Compute score** is derived from two phases:\n\n")
|
||||||
b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
|
b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
|
||||||
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
|
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
|
||||||
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
|
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
|
||||||
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n")
|
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
|
||||||
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
|
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
|
||||||
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
|
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
|
||||||
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
|
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
|
||||||
|
|||||||
@@ -16,17 +16,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
|
|||||||
{
|
{
|
||||||
name: "default",
|
name: "default",
|
||||||
profile: "",
|
profile: "",
|
||||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120},
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "stability",
|
name: "stability",
|
||||||
profile: "stability",
|
profile: "stability",
|
||||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300},
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "overnight",
|
name: "overnight",
|
||||||
profile: "overnight",
|
profile: "overnight",
|
||||||
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300},
|
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -41,6 +41,92 @@ func TestResolveBenchmarkProfile(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||||
|
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
|
||||||
|
func(label string) string { return label },
|
||||||
|
)
|
||||||
|
if len(labels) != 7 || len(phases) != 7 {
|
||||||
|
t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
|
||||||
|
}
|
||||||
|
if basePhaseSec != 60 {
|
||||||
|
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
|
||||||
|
}
|
||||||
|
if mixedPhaseSec != 300 {
|
||||||
|
t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
|
||||||
|
}
|
||||||
|
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
|
||||||
|
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
|
||||||
|
}
|
||||||
|
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
|
||||||
|
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||||
|
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
|
||||||
|
func(label string) string { return label },
|
||||||
|
)
|
||||||
|
if basePhaseSec != 300 {
|
||||||
|
t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
|
||||||
|
}
|
||||||
|
if mixedPhaseSec != 3600 {
|
||||||
|
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
|
||||||
|
}
|
||||||
|
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
|
||||||
|
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
|
||||||
|
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
|
||||||
|
func(label string) string { return label },
|
||||||
|
)
|
||||||
|
if basePhaseSec != 3600 {
|
||||||
|
t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
|
||||||
|
}
|
||||||
|
if mixedPhaseSec != 14400 {
|
||||||
|
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
|
||||||
|
}
|
||||||
|
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
|
||||||
|
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
phases := []benchmarkPlannedPhase{
|
||||||
|
{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
|
||||||
|
{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
|
||||||
|
{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
|
||||||
|
}
|
||||||
|
rows := []GPUMetricRow{
|
||||||
|
{ElapsedSec: 5},
|
||||||
|
{ElapsedSec: 15},
|
||||||
|
{ElapsedSec: 25},
|
||||||
|
{ElapsedSec: 65},
|
||||||
|
}
|
||||||
|
got := splitBenchmarkRowsByPlannedPhase(rows, phases)
|
||||||
|
if len(got["fp8"]) != 1 {
|
||||||
|
t.Fatalf("fp8 rows=%d want 1", len(got["fp8"]))
|
||||||
|
}
|
||||||
|
if len(got["fp16"]) != 1 {
|
||||||
|
t.Fatalf("fp16 rows=%d want 1", len(got["fp16"]))
|
||||||
|
}
|
||||||
|
if len(got["mixed"]) != 2 {
|
||||||
|
t.Fatalf("mixed rows=%d want 2", len(got["mixed"]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -65,8 +151,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
|
|||||||
"[gpu 0] compute_capability=9.0",
|
"[gpu 0] compute_capability=9.0",
|
||||||
"[gpu 0] backend=cublasLt",
|
"[gpu 0] backend=cublasLt",
|
||||||
"[gpu 0] duration_s=10",
|
"[gpu 0] duration_s=10",
|
||||||
|
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
|
||||||
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
|
||||||
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
|
||||||
|
"[gpu 0] int8_tensor_iterations=80",
|
||||||
"[gpu 0] fp16_tensor_iterations=200",
|
"[gpu 0] fp16_tensor_iterations=200",
|
||||||
"[gpu 0] fp8_e4m3_iterations=50",
|
"[gpu 0] fp8_e4m3_iterations=50",
|
||||||
"[gpu 0] status=OK",
|
"[gpu 0] status=OK",
|
||||||
@@ -79,15 +167,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
|
|||||||
if got.ComputeCapability != "9.0" {
|
if got.ComputeCapability != "9.0" {
|
||||||
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
|
||||||
}
|
}
|
||||||
if len(got.Profiles) != 2 {
|
if len(got.Profiles) != 3 {
|
||||||
t.Fatalf("profiles=%d want 2", len(got.Profiles))
|
t.Fatalf("profiles=%d want 3", len(got.Profiles))
|
||||||
}
|
}
|
||||||
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
if got.Profiles[0].TeraOpsPerSec <= 0 {
|
||||||
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
|
||||||
}
|
}
|
||||||
|
if got.Profiles[0].Category != "fp16_bf16" {
|
||||||
|
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
|
||||||
|
}
|
||||||
if got.Profiles[1].Category != "fp8" {
|
if got.Profiles[1].Category != "fp8" {
|
||||||
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
|
||||||
}
|
}
|
||||||
|
if got.Profiles[2].Category != "int8" {
|
||||||
|
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
|
||||||
|
}
|
||||||
|
if got.Profiles[2].Weight != 0.25 {
|
||||||
|
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||||
|
|||||||
@@ -179,7 +179,7 @@ type BenchmarkPrecisionResult struct {
|
|||||||
Iterations uint64 `json:"iterations,omitempty"`
|
Iterations uint64 `json:"iterations,omitempty"`
|
||||||
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
|
||||||
// Weight is the fp32-equivalence factor for this precision category.
|
// Weight is the fp32-equivalence factor for this precision category.
|
||||||
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125.
|
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
|
||||||
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
|
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
|
||||||
Weight float64 `json:"weight,omitempty"`
|
Weight float64 `json:"weight,omitempty"`
|
||||||
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
|
||||||
|
|||||||
@@ -642,6 +642,20 @@ static const struct profile_desc k_profiles[] = {
|
|||||||
CUDA_R_16F,
|
CUDA_R_16F,
|
||||||
CUBLAS_COMPUTE_32F_FAST_16F,
|
CUBLAS_COMPUTE_32F_FAST_16F,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"int8_tensor",
|
||||||
|
"int8",
|
||||||
|
75,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
128,
|
||||||
|
CUDA_R_8I,
|
||||||
|
CUDA_R_8I,
|
||||||
|
CUDA_R_32I,
|
||||||
|
CUDA_R_32I,
|
||||||
|
CUBLAS_COMPUTE_32I,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"fp8_e4m3",
|
"fp8_e4m3",
|
||||||
"fp8",
|
"fp8",
|
||||||
@@ -760,10 +774,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
|
|||||||
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case CUDA_R_32F:
|
case CUDA_R_32F:
|
||||||
|
case CUDA_R_32I:
|
||||||
return (size_t)(elements * 4u);
|
return (size_t)(elements * 4u);
|
||||||
case CUDA_R_16F:
|
case CUDA_R_16F:
|
||||||
case CUDA_R_16BF:
|
case CUDA_R_16BF:
|
||||||
return (size_t)(elements * 2u);
|
return (size_t)(elements * 2u);
|
||||||
|
case CUDA_R_8I:
|
||||||
case CUDA_R_8F_E4M3:
|
case CUDA_R_8F_E4M3:
|
||||||
case CUDA_R_8F_E5M2:
|
case CUDA_R_8F_E5M2:
|
||||||
return (size_t)(elements);
|
return (size_t)(elements);
|
||||||
@@ -776,6 +792,13 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
|
||||||
|
if (desc->compute_type == CUBLAS_COMPUTE_32I) {
|
||||||
|
return CUDA_R_32I;
|
||||||
|
}
|
||||||
|
return CUDA_R_32F;
|
||||||
|
}
|
||||||
|
|
||||||
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
|
||||||
uint64_t row_tiles = (rows + 127u) / 128u;
|
uint64_t row_tiles = (rows + 127u) / 128u;
|
||||||
uint64_t col_tiles = (cols + 63u) / 64u;
|
uint64_t col_tiles = (cols + 63u) / 64u;
|
||||||
@@ -944,8 +967,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cudaDataType_t scale_type = matmul_scale_type(desc);
|
||||||
if (!check_cublas("cublasLtMatmulDescCreate",
|
if (!check_cublas("cublasLtMatmulDescCreate",
|
||||||
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) {
|
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
|
||||||
destroy_profile(cublas, cuda, out);
|
destroy_profile(cublas, cuda, out);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@@ -1094,17 +1118,25 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
static int run_cublas_profile(cublasLtHandle_t handle,
|
static int run_cublas_profile(cublasLtHandle_t handle,
|
||||||
struct cublaslt_api *cublas,
|
struct cublaslt_api *cublas,
|
||||||
struct prepared_profile *profile) {
|
struct prepared_profile *profile) {
|
||||||
|
int32_t alpha_i32 = 1;
|
||||||
|
int32_t beta_i32 = 0;
|
||||||
float alpha = 1.0f;
|
float alpha = 1.0f;
|
||||||
float beta = 0.0f;
|
float beta = 0.0f;
|
||||||
|
const void *alpha_ptr = &alpha;
|
||||||
|
const void *beta_ptr = &beta;
|
||||||
|
if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
|
||||||
|
alpha_ptr = &alpha_i32;
|
||||||
|
beta_ptr = &beta_i32;
|
||||||
|
}
|
||||||
return check_cublas(profile->desc.name,
|
return check_cublas(profile->desc.name,
|
||||||
cublas->cublasLtMatmul(handle,
|
cublas->cublasLtMatmul(handle,
|
||||||
profile->op_desc,
|
profile->op_desc,
|
||||||
&alpha,
|
alpha_ptr,
|
||||||
(const void *)(uintptr_t)profile->a_dev,
|
(const void *)(uintptr_t)profile->a_dev,
|
||||||
profile->a_layout,
|
profile->a_layout,
|
||||||
(const void *)(uintptr_t)profile->b_dev,
|
(const void *)(uintptr_t)profile->b_dev,
|
||||||
profile->b_layout,
|
profile->b_layout,
|
||||||
&beta,
|
beta_ptr,
|
||||||
(const void *)(uintptr_t)profile->c_dev,
|
(const void *)(uintptr_t)profile->c_dev,
|
||||||
profile->c_layout,
|
profile->c_layout,
|
||||||
(void *)(uintptr_t)profile->d_dev,
|
(void *)(uintptr_t)profile->d_dev,
|
||||||
@@ -1359,11 +1391,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
|
||||||
|
printf("device=%s\n", report->device);
|
||||||
|
printf("device_index=%d\n", device_index);
|
||||||
|
printf("compute_capability=%d.%d\n", report->cc_major, report->cc_minor);
|
||||||
|
printf("backend=%s\n", report->backend);
|
||||||
|
printf("duration_s=%d\n", seconds);
|
||||||
|
printf("buffer_mb=%d\n", report->buffer_mb);
|
||||||
|
printf("streams=%d\n", report->stream_count);
|
||||||
|
printf("iterations=%lu\n", report->iterations);
|
||||||
|
printf("checksum=%llu\n", (unsigned long long)report->checksum);
|
||||||
|
if (report->details[0] != '\0') {
|
||||||
|
printf("%s", report->details);
|
||||||
|
}
|
||||||
|
printf("status=OK\n");
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
int seconds = 5;
|
int seconds = 5;
|
||||||
int size_mb = 64;
|
int size_mb = 64;
|
||||||
int device_index = 0;
|
int device_index = 0;
|
||||||
const char *precision_filter = NULL; /* NULL = all; else block_label to match */
|
const char *precision_filter = NULL; /* NULL = all; else block_label to match */
|
||||||
|
const char *precision_plan = NULL;
|
||||||
|
const char *precision_plan_seconds = NULL;
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||||
seconds = atoi(argv[++i]);
|
seconds = atoi(argv[++i]);
|
||||||
@@ -1373,9 +1423,13 @@ int main(int argc, char **argv) {
|
|||||||
device_index = atoi(argv[++i]);
|
device_index = atoi(argv[++i]);
|
||||||
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
|
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
|
||||||
precision_filter = argv[++i];
|
precision_filter = argv[++i];
|
||||||
|
} else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
|
||||||
|
precision_plan = argv[++i];
|
||||||
|
} else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
|
||||||
|
precision_plan_seconds = argv[++i];
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision fp8|fp16|fp32|fp64|fp4]\n",
|
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
|
||||||
argv[0]);
|
argv[0]);
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
@@ -1436,6 +1490,76 @@ int main(int argc, char **argv) {
|
|||||||
int ok = 0;
|
int ok = 0;
|
||||||
|
|
||||||
#if HAVE_CUBLASLT_HEADERS
|
#if HAVE_CUBLASLT_HEADERS
|
||||||
|
if (precision_plan != NULL && precision_plan[0] != '\0') {
|
||||||
|
char *plan_copy = strdup(precision_plan);
|
||||||
|
char *plan_seconds_copy = NULL;
|
||||||
|
int phase_seconds[32] = {0};
|
||||||
|
int phase_seconds_count = 0;
|
||||||
|
int phase_ok = 0;
|
||||||
|
if (plan_copy == NULL) {
|
||||||
|
fprintf(stderr, "failed to allocate precision plan buffer\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
|
||||||
|
plan_seconds_copy = strdup(precision_plan_seconds);
|
||||||
|
if (plan_seconds_copy == NULL) {
|
||||||
|
free(plan_copy);
|
||||||
|
fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
for (char *sec_token = strtok(plan_seconds_copy, ",");
|
||||||
|
sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
|
||||||
|
sec_token = strtok(NULL, ",")) {
|
||||||
|
while (*sec_token == ' ' || *sec_token == '\t') {
|
||||||
|
sec_token++;
|
||||||
|
}
|
||||||
|
if (*sec_token == '\0') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
phase_seconds[phase_seconds_count++] = atoi(sec_token);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int phase_idx = 0;
|
||||||
|
for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) {
|
||||||
|
while (*token == ' ' || *token == '\t') {
|
||||||
|
token++;
|
||||||
|
}
|
||||||
|
if (*token == '\0') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const char *phase_name = token;
|
||||||
|
const char *phase_filter = token;
|
||||||
|
if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
|
||||||
|
phase_filter = NULL;
|
||||||
|
}
|
||||||
|
int phase_duration = seconds;
|
||||||
|
if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
|
||||||
|
phase_duration = phase_seconds[phase_idx];
|
||||||
|
}
|
||||||
|
printf("phase_begin=%s\n", phase_name);
|
||||||
|
fflush(stdout);
|
||||||
|
memset(&report, 0, sizeof(report));
|
||||||
|
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
|
||||||
|
if (ok) {
|
||||||
|
print_stress_report(&report, device_index, phase_duration);
|
||||||
|
phase_ok = 1;
|
||||||
|
} else {
|
||||||
|
printf("phase_error=%s\n", phase_name);
|
||||||
|
if (report.details[0] != '\0') {
|
||||||
|
printf("%s", report.details);
|
||||||
|
if (report.details[strlen(report.details) - 1] != '\n') {
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("status=FAILED\n");
|
||||||
|
}
|
||||||
|
printf("phase_end=%s\n", phase_name);
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
free(plan_seconds_copy);
|
||||||
|
free(plan_copy);
|
||||||
|
return phase_ok ? 0 : 1;
|
||||||
|
}
|
||||||
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
|
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
|
||||||
#endif
|
#endif
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
@@ -1454,18 +1578,6 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("device=%s\n", report.device);
|
print_stress_report(&report, device_index, seconds);
|
||||||
printf("device_index=%d\n", device_index);
|
|
||||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
|
||||||
printf("backend=%s\n", report.backend);
|
|
||||||
printf("duration_s=%d\n", seconds);
|
|
||||||
printf("buffer_mb=%d\n", report.buffer_mb);
|
|
||||||
printf("streams=%d\n", report.stream_count);
|
|
||||||
printf("iterations=%lu\n", report.iterations);
|
|
||||||
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
|
||||||
if (report.details[0] != '\0') {
|
|
||||||
printf("%s", report.details);
|
|
||||||
}
|
|
||||||
printf("status=OK\n");
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,10 +7,12 @@ SIZE_MB=0
|
|||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
PRECISION=""
|
PRECISION=""
|
||||||
|
PRECISION_PLAN=""
|
||||||
|
PRECISION_PLAN_SECONDS=""
|
||||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision fp8|fp16|fp32|fp64|fp4]" >&2
|
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]" >&2
|
||||||
exit 2
|
exit 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -32,6 +34,8 @@ while [ "$#" -gt 0 ]; do
|
|||||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
--precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;;
|
--precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;;
|
||||||
|
--precision-plan) [ "$#" -ge 2 ] || usage; PRECISION_PLAN="$2"; shift 2 ;;
|
||||||
|
--precision-plan-seconds) [ "$#" -ge 2 ] || usage; PRECISION_PLAN_SECONDS="$2"; shift 2 ;;
|
||||||
*) usage ;;
|
*) usage ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
@@ -92,8 +96,12 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
|||||||
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
|
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
|
||||||
precision_arg=""
|
precision_arg=""
|
||||||
[ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}"
|
[ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}"
|
||||||
|
precision_plan_arg=""
|
||||||
|
[ -n "${PRECISION_PLAN}" ] && precision_plan_arg="--precision-plan ${PRECISION_PLAN}"
|
||||||
|
precision_plan_seconds_arg=""
|
||||||
|
[ -n "${PRECISION_PLAN_SECONDS}" ] && precision_plan_seconds_arg="--precision-plan-seconds ${PRECISION_PLAN_SECONDS}"
|
||||||
CUDA_VISIBLE_DEVICES="${id}" \
|
CUDA_VISIBLE_DEVICES="${id}" \
|
||||||
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} >"${log}" 2>&1 &
|
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} ${precision_plan_arg} ${precision_plan_seconds_arg} >"${log}" 2>&1 &
|
||||||
pid=$!
|
pid=$!
|
||||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||||
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||||
|
|||||||
Reference in New Issue
Block a user