Refine NVIDIA benchmark phase timing

This commit is contained in:
Mikhail Chusavitin
2026-04-14 14:12:06 +03:00
parent b1a5035edd
commit 2be7ae6d28
6 changed files with 450 additions and 133 deletions

View File

@@ -76,7 +76,56 @@ var (
// benchmarkPrecisionPhases lists the precision categories run as individual // benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest // steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual. // to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"fp8", "fp16", "fp32", "fp64", "fp4"} var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
// buildBenchmarkSteadyPlan lays out the steady-state part of a benchmark run
// as an ordered list of phases: one phase per entry of
// benchmarkPrecisionPhases, in that order, followed by a single "mixed" phase.
//
// Phase durations depend on spec.Name:
//
//	standard:    60 s per precision phase,   300 s mixed
//	stability:  300 s per precision phase,  3600 s mixed
//	overnight: 3600 s per precision phase, 14400 s mixed
//
// For any other profile name the budget is derived from spec.SteadySec: the
// mixed phase counts as five precision-phase slots, every precision phase gets
// one slot, and a slot is never shorter than one second.
//
// metricStage maps a plan label (a precision name or "mixed") to the stage name
// stored in each phase's MetricStage field. It is called once per phase, in
// plan order.
//
// The returned planLabels and planPhases are parallel slices; basePhaseSec and
// mixedPhaseSec repeat the per-precision and mixed durations for convenience.
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
	// mixedSlotWeight is how many precision-phase slots the mixed phase is
	// worth when durations are derived from spec.SteadySec.
	const mixedSlotWeight = 5

	switch spec.Name {
	case NvidiaBenchmarkProfileStandard:
		basePhaseSec, mixedPhaseSec = 60, 300
	case NvidiaBenchmarkProfileStability:
		basePhaseSec, mixedPhaseSec = 300, 3600
	case NvidiaBenchmarkProfileOvernight:
		basePhaseSec, mixedPhaseSec = 3600, 14400
	default:
		// The divisor is at least mixedSlotWeight, so it can never be zero
		// and no extra guard is needed before dividing.
		slots := len(benchmarkPrecisionPhases) + mixedSlotWeight
		basePhaseSec = spec.SteadySec / slots
		if basePhaseSec <= 0 {
			basePhaseSec = 1
		}
		mixedPhaseSec = basePhaseSec * mixedSlotWeight
	}

	phaseCount := len(benchmarkPrecisionPhases) + 1
	planLabels = make([]string, 0, phaseCount)
	planPhases = make([]benchmarkPlannedPhase, 0, phaseCount)

	for _, prec := range benchmarkPrecisionPhases {
		planLabels = append(planLabels, prec)
		planPhases = append(planPhases, benchmarkPlannedPhase{
			PlanLabel:   prec,
			MetricStage: metricStage(prec),
			DurationSec: basePhaseSec,
		})
	}

	// The combined phase always runs last.
	planLabels = append(planLabels, "mixed")
	planPhases = append(planPhases, benchmarkPlannedPhase{
		PlanLabel:   "mixed",
		MetricStage: metricStage("mixed"),
		DurationSec: mixedPhaseSec,
	})

	return planLabels, planPhases, basePhaseSec, mixedPhaseSec
}
// benchmarkPlanDurationsCSV renders the DurationSec of every phase, in order,
// as a comma-separated list of decimal integers (for example "60,60,300").
// An empty or nil phase list yields the empty string.
func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
	durations := make([]string, len(phases))
	for i := range phases {
		durations[i] = strconv.Itoa(phases[i].DurationSec)
	}
	return strings.Join(durations, ",")
}
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if ctx == nil { if ctx == nil {
@@ -233,42 +282,42 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
continue continue
} }
// ── Per-precision stability phases ──────────────────────────────────────── // Run synthetic precision phases and the combined steady phase as one
// Run each precision category alone so PowerCVPct reflects genuine GPU // uninterrupted command so the GPU stays hot between windows.
// power stability, not kernel-mix variance.
// Time budget: each phase gets steadySec/numPhases, minimum 60 s.
// SteadySec is split equally across all precision phases + 1 combined slot.
// Skipped phases (unsupported precision) are simply omitted; combined is fixed.
totalSlots := len(benchmarkPrecisionPhases) + 1
perPhaseSec := spec.SteadySec / totalSlots
if perPhaseSec < 60 {
perPhaseSec = 60
}
eccBase, _ := queryECCCounters(idx) eccBase, _ := queryECCCounters(idx)
for _, prec := range benchmarkPrecisionPhases { planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string {
phaseCmd := []string{ if label == "mixed" {
"bee-gpu-burn", return fmt.Sprintf("gpu-%d-steady", idx)
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
"--precision", prec,
} }
logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec)) return fmt.Sprintf("gpu-%d-steady-%s", idx, label)
phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec) })
eccBefore, _ := queryECCCounters(idx) planCmd := []string{
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, logFunc) "bee-gpu-burn",
appendBenchmarkMetrics(&metricRows, phaseRows, phaseLogName) "--seconds", strconv.Itoa(basePhaseSec),
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut) "--size-mb", strconv.Itoa(opts.SizeMB),
eccAfter, _ := queryECCCounters(idx) "--devices", strconv.Itoa(idx),
if phaseErr != nil || len(phaseRows) == 0 { "--precision-plan", strings.Join(planLabels, ","),
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
}
logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec))
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage)
}
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
}
for _, prec := range benchmarkPrecisionPhases {
stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
phaseRows := phaseRowsByStage[stageName]
if len(phaseRows) == 0 {
continue continue
} }
phase := BenchmarkPrecisionSteadyPhase{ phase := BenchmarkPrecisionSteadyPhase{
Precision: prec, Precision: prec,
Steady: summarizeBenchmarkTelemetry(phaseRows), Steady: summarizeBenchmarkTelemetry(phaseRows),
ECC: diffECCCounters(eccBefore, eccAfter),
} }
for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles { for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
if p.Supported { if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
@@ -278,13 +327,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
} }
beforeThrottle, _ := queryThrottleCounters(idx) beforeThrottle, _ := queryThrottleCounters(idx)
steadyCmd := []string{ logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec))
// Sample server power via IPMI in parallel with the steady phase. // Sample server power via IPMI in parallel with the steady phase.
// We collect readings every 5s and average them. // We collect readings every 5s and average them.
@@ -320,9 +363,6 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
} }
}() }()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, steadyRows, fmt.Sprintf("gpu-%d-steady", idx))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-steady", idx), steadyOut)
close(ipmiStopCh) close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok { if loadedW, ok := <-ipmiResultCh; ok {
serverLoadedWSum += loadedW serverLoadedWSum += loadedW
@@ -331,11 +371,12 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW)) logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
} }
afterThrottle, _ := queryThrottleCounters(idx) afterThrottle, _ := queryThrottleCounters(idx)
if steadyErr != nil { if planErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error()) gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
} }
parseResult := parseBenchmarkBurnLog(string(steadyOut)) steadyRows := phaseRowsByStage[fmt.Sprintf("gpu-%d-steady", idx)]
parseResult := parseBenchmarkBurnLog(string(phaseLogs["mixed"]))
gpuResult.ComputeCapability = parseResult.ComputeCapability gpuResult.ComputeCapability = parseResult.ComputeCapability
gpuResult.Backend = parseResult.Backend gpuResult.Backend = parseResult.Backend
gpuResult.PrecisionResults = parseResult.Profiles gpuResult.PrecisionResults = parseResult.Profiles
@@ -349,17 +390,19 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.ECC = diffECCCounters(eccBase, eccFinal) gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
} }
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx}) if spec.CooldownSec > 0 {
if err != nil && err != context.Canceled { cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error()) if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
}
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
} }
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if steadyErr != nil { if planErr != nil {
gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr) gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
} else if parseResult.Fallback { } else if parseResult.Fallback {
gpuResult.Status = "PARTIAL" gpuResult.Status = "PARTIAL"
} else { } else {
@@ -462,11 +505,11 @@ func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) Nv
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec { func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
switch strings.TrimSpace(strings.ToLower(profile)) { switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability: case NvidiaBenchmarkProfileStability:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300} return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0}
case NvidiaBenchmarkProfileOvernight: case NvidiaBenchmarkProfileOvernight:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300} return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0}
default: default:
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120} return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0}
} }
} }
@@ -795,6 +838,66 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string
return out, metricRows, err return out, metricRows, err
} }
// benchmarkPlannedPhase describes one window of a multi-phase bee-gpu-burn
// run that is executed as a single uninterrupted command.
type benchmarkPlannedPhase struct {
	// PlanLabel is the phase name handed to the burn tool via
	// --precision-plan (a precision name or "mixed"); callers also use it as
	// the key into the per-phase log map.
	PlanLabel string

	// MetricStage is the stage name under which telemetry rows belonging to
	// this phase are grouped and recorded.
	MetricStage string

	// DurationSec is the planned length of the phase in seconds. It is passed
	// to the burn tool via --precision-plan-seconds and is also used to
	// attribute telemetry rows to phases by elapsed time.
	DurationSec int
}
// runBenchmarkPlannedCommandWithMetrics runs cmd through
// runBenchmarkCommandWithMetrics and then slices the results per planned phase.
//
// It returns, in order:
//   - the raw command output, unchanged;
//   - the telemetry rows grouped by each phase's MetricStage (bucketed by
//     elapsed time against the planned durations);
//   - the command output grouped by the phase label found in the log's
//     phase_begin markers;
//   - the error from the underlying command, passed through as-is.
//
// The per-phase maps are built and returned even when the command failed, so
// callers can still inspect whatever was captured.
func runBenchmarkPlannedCommandWithMetrics(
	ctx context.Context,
	verboseLog, name string,
	cmd []string,
	env []string,
	gpuIndices []int,
	phases []benchmarkPlannedPhase,
	logFunc func(string),
) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) {
	rawOut, allRows, runErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc)

	rowsByStage := splitBenchmarkRowsByPlannedPhase(allRows, phases)
	logsByLabel := splitBenchmarkLogByPlannedPhase(rawOut)

	return rawOut, rowsByStage, logsByLabel, runErr
}
// splitBenchmarkRowsByPlannedPhase assigns each telemetry row to a planned
// phase using the row's ElapsedSec and the cumulative planned durations, and
// returns the rows grouped by the phase's MetricStage.
//
// A row belongs to the first phase whose cumulative end time is strictly
// greater than the row's ElapsedSec. Rows at or beyond the end of the whole
// plan are attributed to the last phase. Phases with a non-positive
// DurationSec are treated as lasting one second.
//
// The result is always a non-nil map; it is empty when rows or phases is empty.
func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow {
	byStage := make(map[string][]GPUMetricRow, len(phases))
	if len(rows) == 0 || len(phases) == 0 {
		return byStage
	}

	// Cumulative end time of every phase, computed once up front.
	phaseEnds := make([]float64, len(phases))
	var total float64
	for i := range phases {
		sec := phases[i].DurationSec
		if sec <= 0 {
			sec = 1
		}
		total += float64(sec)
		phaseEnds[i] = total
	}

	lastPhase := len(phases) - 1
	for _, row := range rows {
		// Default to the final phase so overrun samples are not lost.
		target := lastPhase
		for i, end := range phaseEnds {
			if row.ElapsedSec < end {
				target = i
				break
			}
		}
		stage := phases[target].MetricStage
		byStage[stage] = append(byStage[stage], row)
	}
	return byStage
}
// splitBenchmarkLogByPlannedPhase cuts a combined burn log into per-phase
// sections keyed by phase label.
//
// The log is scanned line by line (CRLF is normalised to LF first). A line
// whose content starts with "phase_begin=" opens a section named by the
// trimmed text after the marker; a line starting with "phase_end=" closes the
// current section. Marker detection is done on the line after
// stripBenchmarkPrefix and whitespace trimming (presumably this drops a
// per-GPU prefix such as "[gpu 0]" — confirm against stripBenchmarkPrefix).
//
// Lines inside an open section are stored verbatim, each followed by "\n".
// Marker lines themselves and lines outside any section are not included.
func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte {
	const (
		beginMarker = "phase_begin="
		endMarker   = "phase_end="
	)

	sections := make(map[string][]byte)
	active := ""

	normalized := strings.ReplaceAll(string(raw), "\r\n", "\n")
	for _, line := range strings.Split(normalized, "\n") {
		content := strings.TrimSpace(stripBenchmarkPrefix(line))

		if strings.HasPrefix(content, beginMarker) {
			active = strings.TrimSpace(strings.TrimPrefix(content, beginMarker))
			continue
		}
		if strings.HasPrefix(content, endMarker) {
			active = ""
			continue
		}
		if active == "" {
			// Outside any phase window: nothing to attribute this line to.
			continue
		}

		section := append(sections[active], line...)
		sections[active] = append(section, '\n')
	}
	return sections
}
type benchmarkCoolingSample struct { type benchmarkCoolingSample struct {
AvgFanRPM float64 AvgFanRPM float64
AvgFanDutyCyclePct float64 AvgFanDutyCyclePct float64
@@ -968,6 +1071,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
category = "fp32_tf32" category = "fp32_tf32"
case strings.HasPrefix(name, "fp16"): case strings.HasPrefix(name, "fp16"):
category = "fp16_bf16" category = "fp16_bf16"
case strings.HasPrefix(name, "int8"):
category = "int8"
case strings.HasPrefix(name, "fp8"): case strings.HasPrefix(name, "fp8"):
category = "fp8" category = "fp8"
case strings.HasPrefix(name, "fp4"): case strings.HasPrefix(name, "fp4"):
@@ -985,6 +1090,7 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
// fp64 = 2.0 — double precision, 2× more bits per operand // fp64 = 2.0 — double precision, 2× more bits per operand
// fp32 = 1.0 — single precision baseline // fp32 = 1.0 — single precision baseline
// fp16 = 0.5 — half precision // fp16 = 0.5 — half precision
// int8 = 0.25 — quarter precision
// fp8 = 0.25 — quarter precision // fp8 = 0.25 — quarter precision
// fp4 = 0.125 — eighth precision // fp4 = 0.125 — eighth precision
// //
@@ -998,6 +1104,8 @@ func precisionWeight(category string) float64 {
return 1.0 return 1.0
case "fp16_bf16": case "fp16_bf16":
return 0.5 return 0.5
case "int8":
return 0.25
case "fp8": case "fp8":
return 0.25 return 0.25
case "fp4": case "fp4":
@@ -1861,41 +1969,41 @@ func runNvidiaBenchmarkParallel(
} }
} }
// ── Per-precision stability phases (parallel) ───────────────────────────── // Run synthetic precision phases and the combined steady phase as one
totalSlots := len(benchmarkPrecisionPhases) + 1 // uninterrupted command so the GPUs stay hot between windows.
perPhaseSec := spec.SteadySec / totalSlots
if perPhaseSec < 60 {
perPhaseSec = 60
}
eccBase := make(map[int]BenchmarkECCCounters, len(selected)) eccBase := make(map[int]BenchmarkECCCounters, len(selected))
for _, idx := range selected { for _, idx := range selected {
eccBase[idx], _ = queryECCCounters(idx) eccBase[idx], _ = queryECCCounters(idx)
} }
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string {
if label == "mixed" {
return "steady"
}
return "gpu-all-steady-" + label
})
planCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(basePhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
"--precision-plan", strings.Join(planLabels, ","),
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
}
logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec))
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage)
}
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
}
for _, prec := range benchmarkPrecisionPhases { for _, prec := range benchmarkPrecisionPhases {
phaseCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
"--precision", prec,
}
logFunc(fmt.Sprintf("GPUs %s: %s stability phase (%ds)", allDevices, prec, perPhaseSec))
phaseLogName := "gpu-all-steady-" + prec phaseLogName := "gpu-all-steady-" + prec
eccBeforePhase := make(map[int]BenchmarkECCCounters, len(selected)) phaseRows := phaseRowsByStage[phaseLogName]
for _, idx := range selected { if len(phaseRows) == 0 {
eccBeforePhase[idx], _ = queryECCCounters(idx)
}
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, phaseRows, phaseLogName)
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected))
for _, idx := range selected {
eccAfterPhase[idx], _ = queryECCCounters(idx)
}
if phaseErr != nil || len(phaseRows) == 0 {
continue continue
} }
parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseOut)) parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec]))
for _, idx := range selected { for _, idx := range selected {
perGPU := filterRowsByGPU(phaseRows, idx) perGPU := filterRowsByGPU(phaseRows, idx)
if len(perGPU) == 0 { if len(perGPU) == 0 {
@@ -1904,7 +2012,6 @@ func runNvidiaBenchmarkParallel(
phase := BenchmarkPrecisionSteadyPhase{ phase := BenchmarkPrecisionSteadyPhase{
Precision: prec, Precision: prec,
Steady: summarizeBenchmarkTelemetry(perGPU), Steady: summarizeBenchmarkTelemetry(perGPU),
ECC: diffECCCounters(eccBeforePhase[idx], eccAfterPhase[idx]),
} }
if pr, ok := parseByGPU[idx]; ok { if pr, ok := parseByGPU[idx]; ok {
for _, p := range pr.Profiles { for _, p := range pr.Profiles {
@@ -1924,14 +2031,7 @@ func runNvidiaBenchmarkParallel(
beforeThrottle[idx], _ = queryThrottleCounters(idx) beforeThrottle[idx], _ = queryThrottleCounters(idx)
} }
// Steady: all GPUs simultaneously (combined). Fixed at one slot = perPhaseSec. logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
steadyCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
}
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, perPhaseSec))
// Sample server power via IPMI in parallel with steady phase. // Sample server power via IPMI in parallel with steady phase.
ipmiStopCh := make(chan struct{}) ipmiStopCh := make(chan struct{})
@@ -1965,9 +2065,6 @@ func runNvidiaBenchmarkParallel(
} }
}() }()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, steadyRows, "steady")
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "steady", steadyOut)
close(ipmiStopCh) close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok { if loadedW, ok := <-ipmiResultCh; ok {
*serverLoadedWSum += loadedW *serverLoadedWSum += loadedW
@@ -1980,7 +2077,8 @@ func runNvidiaBenchmarkParallel(
afterThrottle[idx], _ = queryThrottleCounters(idx) afterThrottle[idx], _ = queryThrottleCounters(idx)
} }
parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut)) steadyRows := phaseRowsByStage["steady"]
parseResults := parseBenchmarkBurnLogByGPU(string(phaseLogs["mixed"]))
for _, idx := range selected { for _, idx := range selected {
perGPU := filterRowsByGPU(steadyRows, idx) perGPU := filterRowsByGPU(steadyRows, idx)
@@ -1998,23 +2096,25 @@ func runNvidiaBenchmarkParallel(
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable") gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
} }
} }
if steadyErr != nil { if planErr != nil {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error()) gpuResults[idx].Notes = append(gpuResults[idx].Notes, "precision plan failed: "+planErr.Error())
} }
} }
// Cooldown: all GPUs together. // Cooldown: all GPUs together.
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected) if spec.CooldownSec > 0 {
if err != nil && err != context.Canceled { cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
for _, idx := range selected { if err != nil && err != context.Canceled {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error()) for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
}
} }
for _, idx := range selected {
perGPU := filterRowsByGPU(cooldownRows, idx)
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
}
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
} }
for _, idx := range selected {
perGPU := filterRowsByGPU(cooldownRows, idx)
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
}
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
// Score and finalize each GPU. // Score and finalize each GPU.
for _, idx := range selected { for _, idx := range selected {
@@ -2023,8 +2123,8 @@ func runNvidiaBenchmarkParallel(
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status) r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
pr := parseResults[idx] pr := parseResults[idx]
switch { switch {
case steadyErr != nil: case planErr != nil:
r.Status = classifySATErrorStatus(steadyOut, steadyErr) r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
case pr.Fallback: case pr.Fallback:
r.Status = "PARTIAL" r.Status = "PARTIAL"
default: default:
@@ -2213,7 +2313,7 @@ func runBenchmarkPowerCalibration(
gpuIndices []int, gpuIndices []int,
logFunc func(string), logFunc func(string),
) map[int]float64 { ) map[int]float64 {
const calibDurationSec = 45 const calibDurationSec = 120
// dcgmi must be present. // dcgmi must be present.
if _, err := exec.LookPath("dcgmi"); err != nil { if _, err := exec.LookPath("dcgmi"); err != nil {

View File

@@ -88,10 +88,10 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n") b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n") b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
b.WriteString("**Compute score** is derived from two phases:\n\n") b.WriteString("**Compute score** is derived from two phases:\n\n")
b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ") b.WriteString("- **Synthetic** — each precision type (int8, fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ") b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ") b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n") b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · int8 ×0.25 · fp8 ×0.25 · fp4 ×0.125.\n")
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ") b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n") b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n") b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")

View File

@@ -16,17 +16,17 @@ func TestResolveBenchmarkProfile(t *testing.T) {
{ {
name: "default", name: "default",
profile: "", profile: "",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}, want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0},
}, },
{ {
name: "stability", name: "stability",
profile: "stability", profile: "stability",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}, want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0},
}, },
{ {
name: "overnight", name: "overnight",
profile: "overnight", profile: "overnight",
want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}, want: benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0},
}, },
} }
@@ -41,6 +41,92 @@ func TestResolveBenchmarkProfile(t *testing.T) {
} }
} }
func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
t.Parallel()
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
func(label string) string { return label },
)
if len(labels) != 7 || len(phases) != 7 {
t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
}
if basePhaseSec != 60 {
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
}
if mixedPhaseSec != 300 {
t.Fatalf("mixedPhaseSec=%d want 300", mixedPhaseSec)
}
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
}
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
t.Parallel()
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
func(label string) string { return label },
)
if basePhaseSec != 300 {
t.Fatalf("basePhaseSec=%d want 300", basePhaseSec)
}
if mixedPhaseSec != 3600 {
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
t.Parallel()
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
func(label string) string { return label },
)
if basePhaseSec != 3600 {
t.Fatalf("basePhaseSec=%d want 3600", basePhaseSec)
}
if mixedPhaseSec != 14400 {
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
// TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations verifies that rows are
// bucketed by cumulative phase duration rather than by equal-sized windows:
// with phases of 10 s, 10 s and 50 s, samples at 5 s and 15 s land in the first
// two phases while samples at 25 s and 65 s both land in the long final phase.
func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
	t.Parallel()

	plan := []benchmarkPlannedPhase{
		{PlanLabel: "fp8", MetricStage: "fp8", DurationSec: 10},
		{PlanLabel: "fp16", MetricStage: "fp16", DurationSec: 10},
		{PlanLabel: "mixed", MetricStage: "mixed", DurationSec: 50},
	}
	samples := []GPUMetricRow{
		{ElapsedSec: 5},
		{ElapsedSec: 15},
		{ElapsedSec: 25},
		{ElapsedSec: 65},
	}

	byStage := splitBenchmarkRowsByPlannedPhase(samples, plan)

	if n := len(byStage["fp8"]); n != 1 {
		t.Fatalf("fp8 rows=%d want 1", n)
	}
	if n := len(byStage["fp16"]); n != 1 {
		t.Fatalf("fp16 rows=%d want 1", n)
	}
	if n := len(byStage["mixed"]); n != 2 {
		t.Fatalf("mixed rows=%d want 2", n)
	}
}
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) { func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
t.Parallel() t.Parallel()
@@ -65,8 +151,10 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
"[gpu 0] compute_capability=9.0", "[gpu 0] compute_capability=9.0",
"[gpu 0] backend=cublasLt", "[gpu 0] backend=cublasLt",
"[gpu 0] duration_s=10", "[gpu 0] duration_s=10",
"[gpu 0] int8_tensor[0]=READY dim=16384x16384x8192 block=128 stream=0",
"[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0", "[gpu 0] fp16_tensor[0]=READY dim=4096x4096x4096 block=128 stream=0",
"[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0", "[gpu 0] fp8_e4m3[0]=READY dim=8192x8192x4096 block=128 stream=0",
"[gpu 0] int8_tensor_iterations=80",
"[gpu 0] fp16_tensor_iterations=200", "[gpu 0] fp16_tensor_iterations=200",
"[gpu 0] fp8_e4m3_iterations=50", "[gpu 0] fp8_e4m3_iterations=50",
"[gpu 0] status=OK", "[gpu 0] status=OK",
@@ -79,15 +167,24 @@ func TestParseBenchmarkBurnLog(t *testing.T) {
if got.ComputeCapability != "9.0" { if got.ComputeCapability != "9.0" {
t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability) t.Fatalf("compute capability=%q want 9.0", got.ComputeCapability)
} }
if len(got.Profiles) != 2 { if len(got.Profiles) != 3 {
t.Fatalf("profiles=%d want 2", len(got.Profiles)) t.Fatalf("profiles=%d want 3", len(got.Profiles))
} }
if got.Profiles[0].TeraOpsPerSec <= 0 { if got.Profiles[0].TeraOpsPerSec <= 0 {
t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec) t.Fatalf("profile[0] teraops=%f want >0", got.Profiles[0].TeraOpsPerSec)
} }
if got.Profiles[0].Category != "fp16_bf16" {
t.Fatalf("profile[0] category=%q want fp16_bf16", got.Profiles[0].Category)
}
if got.Profiles[1].Category != "fp8" { if got.Profiles[1].Category != "fp8" {
t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category) t.Fatalf("profile[1] category=%q want fp8", got.Profiles[1].Category)
} }
if got.Profiles[2].Category != "int8" {
t.Fatalf("profile[2] category=%q want int8", got.Profiles[2].Category)
}
if got.Profiles[2].Weight != 0.25 {
t.Fatalf("profile[2] weight=%f want 0.25", got.Profiles[2].Weight)
}
} }
func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) { func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {

View File

@@ -179,7 +179,7 @@ type BenchmarkPrecisionResult struct {
Iterations uint64 `json:"iterations,omitempty"` Iterations uint64 `json:"iterations,omitempty"`
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"` TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
// Weight is the fp32-equivalence factor for this precision category. // Weight is the fp32-equivalence factor for this precision category.
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125. // fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, int8/fp8 = 0.25, fp4 = 0.125.
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput. // WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
Weight float64 `json:"weight,omitempty"` Weight float64 `json:"weight,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"` WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`

View File

@@ -642,6 +642,20 @@ static const struct profile_desc k_profiles[] = {
CUDA_R_16F, CUDA_R_16F,
CUBLAS_COMPUTE_32F_FAST_16F, CUBLAS_COMPUTE_32F_FAST_16F,
}, },
{
"int8_tensor",
"int8",
75,
1,
0,
0,
128,
CUDA_R_8I,
CUDA_R_8I,
CUDA_R_32I,
CUDA_R_32I,
CUBLAS_COMPUTE_32I,
},
{ {
"fp8_e4m3", "fp8_e4m3",
"fp8", "fp8",
@@ -760,10 +774,12 @@ static int check_cublas(const char *step, cublasStatus_t status) {
static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) { static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
switch (type) { switch (type) {
case CUDA_R_32F: case CUDA_R_32F:
case CUDA_R_32I:
return (size_t)(elements * 4u); return (size_t)(elements * 4u);
case CUDA_R_16F: case CUDA_R_16F:
case CUDA_R_16BF: case CUDA_R_16BF:
return (size_t)(elements * 2u); return (size_t)(elements * 2u);
case CUDA_R_8I:
case CUDA_R_8F_E4M3: case CUDA_R_8F_E4M3:
case CUDA_R_8F_E5M2: case CUDA_R_8F_E5M2:
return (size_t)(elements); return (size_t)(elements);
@@ -776,6 +792,13 @@ static size_t bytes_for_elements(cudaDataType_t type, uint64_t elements) {
} }
} }
/*
 * Pick the scale data type that goes with a profile's compute type.
 * Profiles computing in CUBLAS_COMPUTE_32I get CUDA_R_32I; every other
 * compute type gets CUDA_R_32F.
 */
static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
    return (desc->compute_type == CUBLAS_COMPUTE_32I) ? CUDA_R_32I : CUDA_R_32F;
}
static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) { static size_t fp4_scale_bytes(uint64_t rows, uint64_t cols) {
uint64_t row_tiles = (rows + 127u) / 128u; uint64_t row_tiles = (rows + 127u) / 128u;
uint64_t col_tiles = (cols + 63u) / 64u; uint64_t col_tiles = (cols + 63u) / 64u;
@@ -944,8 +967,9 @@ static int prepare_profile(struct cublaslt_api *cublas,
return 0; return 0;
} }
cudaDataType_t scale_type = matmul_scale_type(desc);
if (!check_cublas("cublasLtMatmulDescCreate", if (!check_cublas("cublasLtMatmulDescCreate",
cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, CUDA_R_32F))) { cublas->cublasLtMatmulDescCreate(&out->op_desc, desc->compute_type, scale_type))) {
destroy_profile(cublas, cuda, out); destroy_profile(cublas, cuda, out);
return 0; return 0;
} }
@@ -1094,17 +1118,25 @@ static int prepare_profile(struct cublaslt_api *cublas,
static int run_cublas_profile(cublasLtHandle_t handle, static int run_cublas_profile(cublasLtHandle_t handle,
struct cublaslt_api *cublas, struct cublaslt_api *cublas,
struct prepared_profile *profile) { struct prepared_profile *profile) {
int32_t alpha_i32 = 1;
int32_t beta_i32 = 0;
float alpha = 1.0f; float alpha = 1.0f;
float beta = 0.0f; float beta = 0.0f;
const void *alpha_ptr = &alpha;
const void *beta_ptr = &beta;
if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
alpha_ptr = &alpha_i32;
beta_ptr = &beta_i32;
}
return check_cublas(profile->desc.name, return check_cublas(profile->desc.name,
cublas->cublasLtMatmul(handle, cublas->cublasLtMatmul(handle,
profile->op_desc, profile->op_desc,
&alpha, alpha_ptr,
(const void *)(uintptr_t)profile->a_dev, (const void *)(uintptr_t)profile->a_dev,
profile->a_layout, profile->a_layout,
(const void *)(uintptr_t)profile->b_dev, (const void *)(uintptr_t)profile->b_dev,
profile->b_layout, profile->b_layout,
&beta, beta_ptr,
(const void *)(uintptr_t)profile->c_dev, (const void *)(uintptr_t)profile->c_dev,
profile->c_layout, profile->c_layout,
(void *)(uintptr_t)profile->d_dev, (void *)(uintptr_t)profile->d_dev,
@@ -1359,11 +1391,29 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
} }
#endif #endif
/*
 * Write one stress report to stdout as key=value lines.
 *
 * report       - report to print; its details text, when non-empty, is
 *                emitted verbatim right before the final status line.
 * device_index - value printed for the device_index key.
 * seconds      - value printed for the duration_s key.
 *
 * The last line written is always "status=OK".
 */
static void print_stress_report(const struct stress_report *report, int device_index, int seconds) {
    /* Fixed header fields, in the same order as the individual lines. */
    printf("device=%s\n"
           "device_index=%d\n"
           "compute_capability=%d.%d\n"
           "backend=%s\n"
           "duration_s=%d\n"
           "buffer_mb=%d\n"
           "streams=%d\n"
           "iterations=%lu\n"
           "checksum=%llu\n",
           report->device,
           device_index,
           report->cc_major, report->cc_minor,
           report->backend,
           seconds,
           report->buffer_mb,
           report->stream_count,
           report->iterations,
           (unsigned long long)report->checksum);

    /* Details are passed through untouched; skip the call when empty. */
    if (report->details[0] != '\0') {
        fputs(report->details, stdout);
    }

    fputs("status=OK\n", stdout);
}
int main(int argc, char **argv) { int main(int argc, char **argv) {
int seconds = 5; int seconds = 5;
int size_mb = 64; int size_mb = 64;
int device_index = 0; int device_index = 0;
const char *precision_filter = NULL; /* NULL = all; else block_label to match */ const char *precision_filter = NULL; /* NULL = all; else block_label to match */
const char *precision_plan = NULL;
const char *precision_plan_seconds = NULL;
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) { if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
seconds = atoi(argv[++i]); seconds = atoi(argv[++i]);
@@ -1373,9 +1423,13 @@ int main(int argc, char **argv) {
device_index = atoi(argv[++i]); device_index = atoi(argv[++i]);
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) { } else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
precision_filter = argv[++i]; precision_filter = argv[++i];
} else if (strcmp(argv[i], "--precision-plan") == 0 && i + 1 < argc) {
precision_plan = argv[++i];
} else if (strcmp(argv[i], "--precision-plan-seconds") == 0 && i + 1 < argc) {
precision_plan_seconds = argv[++i];
} else { } else {
fprintf(stderr, fprintf(stderr,
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision fp8|fp16|fp32|fp64|fp4]\n", "usage: %s [--seconds N] [--size-mb N] [--device N] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]\n",
argv[0]); argv[0]);
return 2; return 2;
} }
@@ -1436,6 +1490,76 @@ int main(int argc, char **argv) {
int ok = 0; int ok = 0;
#if HAVE_CUBLASLT_HEADERS #if HAVE_CUBLASLT_HEADERS
if (precision_plan != NULL && precision_plan[0] != '\0') {
char *plan_copy = strdup(precision_plan);
char *plan_seconds_copy = NULL;
int phase_seconds[32] = {0};
int phase_seconds_count = 0;
int phase_ok = 0;
if (plan_copy == NULL) {
fprintf(stderr, "failed to allocate precision plan buffer\n");
return 1;
}
if (precision_plan_seconds != NULL && precision_plan_seconds[0] != '\0') {
plan_seconds_copy = strdup(precision_plan_seconds);
if (plan_seconds_copy == NULL) {
free(plan_copy);
fprintf(stderr, "failed to allocate precision plan seconds buffer\n");
return 1;
}
for (char *sec_token = strtok(plan_seconds_copy, ",");
sec_token != NULL && phase_seconds_count < (int)(sizeof(phase_seconds) / sizeof(phase_seconds[0]));
sec_token = strtok(NULL, ",")) {
while (*sec_token == ' ' || *sec_token == '\t') {
sec_token++;
}
if (*sec_token == '\0') {
continue;
}
phase_seconds[phase_seconds_count++] = atoi(sec_token);
}
}
int phase_idx = 0;
for (char *token = strtok(plan_copy, ","); token != NULL; token = strtok(NULL, ","), phase_idx++) {
while (*token == ' ' || *token == '\t') {
token++;
}
if (*token == '\0') {
continue;
}
const char *phase_name = token;
const char *phase_filter = token;
if (strcmp(token, "mixed") == 0 || strcmp(token, "all") == 0) {
phase_filter = NULL;
}
int phase_duration = seconds;
if (phase_idx < phase_seconds_count && phase_seconds[phase_idx] > 0) {
phase_duration = phase_seconds[phase_idx];
}
printf("phase_begin=%s\n", phase_name);
fflush(stdout);
memset(&report, 0, sizeof(report));
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, phase_duration, size_mb, phase_filter, &report);
if (ok) {
print_stress_report(&report, device_index, phase_duration);
phase_ok = 1;
} else {
printf("phase_error=%s\n", phase_name);
if (report.details[0] != '\0') {
printf("%s", report.details);
if (report.details[strlen(report.details) - 1] != '\n') {
printf("\n");
}
}
printf("status=FAILED\n");
}
printf("phase_end=%s\n", phase_name);
fflush(stdout);
}
free(plan_seconds_copy);
free(plan_copy);
return phase_ok ? 0 : 1;
}
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report); ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
#endif #endif
if (!ok) { if (!ok) {
@@ -1454,18 +1578,6 @@ int main(int argc, char **argv) {
} }
} }
printf("device=%s\n", report.device); print_stress_report(&report, device_index, seconds);
printf("device_index=%d\n", device_index);
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
printf("backend=%s\n", report.backend);
printf("duration_s=%d\n", seconds);
printf("buffer_mb=%d\n", report.buffer_mb);
printf("streams=%d\n", report.stream_count);
printf("iterations=%lu\n", report.iterations);
printf("checksum=%llu\n", (unsigned long long)report.checksum);
if (report.details[0] != '\0') {
printf("%s", report.details);
}
printf("status=OK\n");
return 0; return 0;
} }

View File

@@ -7,10 +7,12 @@ SIZE_MB=0
DEVICES="" DEVICES=""
EXCLUDE="" EXCLUDE=""
PRECISION="" PRECISION=""
PRECISION_PLAN=""
PRECISION_PLAN_SECONDS=""
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker" WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
usage() { usage() {
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision fp8|fp16|fp32|fp64|fp4]" >&2 echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision int8|fp8|fp16|fp32|fp64|fp4] [--precision-plan p1,p2,...,mixed] [--precision-plan-seconds s1,s2,...]" >&2
exit 2 exit 2
} }
@@ -32,6 +34,8 @@ while [ "$#" -gt 0 ]; do
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;; --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;; --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
--precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;; --precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;;
--precision-plan) [ "$#" -ge 2 ] || usage; PRECISION_PLAN="$2"; shift 2 ;;
--precision-plan-seconds) [ "$#" -ge 2 ] || usage; PRECISION_PLAN_SECONDS="$2"; shift 2 ;;
*) usage ;; *) usage ;;
esac esac
done done
@@ -92,8 +96,12 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}" echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
precision_arg="" precision_arg=""
[ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}" [ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}"
precision_plan_arg=""
[ -n "${PRECISION_PLAN}" ] && precision_plan_arg="--precision-plan ${PRECISION_PLAN}"
precision_plan_seconds_arg=""
[ -n "${PRECISION_PLAN_SECONDS}" ] && precision_plan_seconds_arg="--precision-plan-seconds ${PRECISION_PLAN_SECONDS}"
CUDA_VISIBLE_DEVICES="${id}" \ CUDA_VISIBLE_DEVICES="${id}" \
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} >"${log}" 2>&1 & "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} ${precision_plan_arg} ${precision_plan_seconds_arg} >"${log}" 2>&1 &
pid=$! pid=$!
WORKERS="${WORKERS} ${pid}:${id}:${log}" WORKERS="${WORKERS} ${pid}:${id}:${log}"
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then