Compare commits

...

7 Commits
v7.13 ... v7.19

Author SHA1 Message Date
Mikhail Chusavitin
88b5e0edf2 Harden IPMI power probe timeout 2026-04-14 10:18:23 +03:00
Mikhail Chusavitin
82fe1f6d26 Disable precision fallback and pin cuBLAS 13.1 2026-04-14 10:17:44 +03:00
81e7c921f8 дебаг при сборке 2026-04-14 07:02:37 +03:00
0fb8f2777f Fix combined gpu burn profile capacity for fp4 2026-04-14 00:00:40 +03:00
bf182daa89 Fix benchmark report methodology and rebuild gpu burn worker on toolchain changes 2026-04-13 23:43:12 +03:00
457ea1cf04 Unify benchmark exports and drop ASCII charts 2026-04-13 21:38:28 +03:00
bf6ecab4f0 Add per-precision benchmark phases, weighted TOPS scoring, and ECC tracking
- Split steady window into 6 equal slots: fp8/fp16/fp32/fp64/fp4 + combined
- Each precision phase runs bee-gpu-burn with --precision filter so PowerCVPct reflects single-kernel stability (not round-robin artifact)
- Add fp4 support in bee-gpu-stress.c for Blackwell (cc>=100) via existing CUDA_R_4F_E2M1 guard
- Weighted TOPS: fp64×2.0, fp32×1.0, fp16×0.5, fp8×0.25, fp4×0.125
- SyntheticScore = sum of weighted TOPS from per-precision phases
- MixedScore = sum from combined phase; MixedEfficiency = Mixed/Synthetic
- ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3)
- ECC volatile counters sampled before/after each phase and overall
- DegradationReasons: ecc_uncorrected_errors, ecc_corrected_errors
- Report: per-precision stability table with ECC columns, methodology section
- Ramp-up history table redesign: GPU indices as columns, runs as rows

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 10:49:49 +03:00
16 changed files with 906 additions and 654 deletions

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@
.DS_Store
dist/
iso/out/
build-cache/

View File

@@ -73,6 +73,11 @@ var (
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
)
// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"fp8", "fp16", "fp32", "fp64", "fp4"}
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if ctx == nil {
ctx = context.Background()
@@ -120,6 +125,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
}
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
var metricRows []GPUMetricRow
gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
// Server power characterization state — populated during per-GPU phases.
var serverIdleW, serverLoadedWSum float64
@@ -166,154 +173,202 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
if opts.ParallelGPUs {
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, gpuBurnLog)
} else {
for _, idx := range selected {
gpuResult := BenchmarkGPUResult{
Index: idx,
Status: "FAILED",
}
if info, ok := infoByIndex[idx]; ok {
gpuResult.UUID = info.UUID
gpuResult.Name = info.Name
gpuResult.BusID = info.BusID
gpuResult.VBIOS = info.VBIOS
gpuResult.PowerLimitW = info.PowerLimitW
gpuResult.MultiprocessorCount = info.MultiprocessorCount
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
}
if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
gpuResult.CalibratedPeakPowerW = w
}
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
}
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
}
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
// Sample server idle power once (first GPU only — server state is global).
if !serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
for _, idx := range selected {
gpuResult := BenchmarkGPUResult{
Index: idx,
Status: "FAILED",
}
}
warmupCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(spec.WarmupSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
warmupOut, _, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-warmup", idx), logFunc)
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-warmup.log", idx)), warmupOut, 0644)
if warmupErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
continue
}
beforeThrottle, _ := queryThrottleCounters(idx)
steadyCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(spec.SteadySec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
// Sample server power via IPMI in parallel with the steady phase.
// We collect readings every 5s and average them.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// First sample after a short warmup delay.
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
if info, ok := infoByIndex[idx]; ok {
gpuResult.UUID = info.UUID
gpuResult.Name = info.Name
gpuResult.BusID = info.BusID
gpuResult.VBIOS = info.VBIOS
gpuResult.PowerLimitW = info.PowerLimitW
gpuResult.MultiprocessorCount = info.MultiprocessorCount
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
gpuResult.CalibratedPeakPowerW = w
}
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
}
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
}
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx))
// Sample server idle power once (first GPU only — server state is global).
if !serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
}
}
warmupCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(spec.WarmupSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut)
if warmupErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
continue
}
// ── Per-precision stability phases ────────────────────────────────────────
// Run each precision category alone so PowerCVPct reflects genuine GPU
// power stability, not kernel-mix variance.
// Time budget: each phase gets steadySec/numPhases, minimum 60 s.
// SteadySec is split equally across all precision phases + 1 combined slot.
// Skipped phases (unsupported precision) are simply omitted; combined is fixed.
totalSlots := len(benchmarkPrecisionPhases) + 1
perPhaseSec := spec.SteadySec / totalSlots
if perPhaseSec < 60 {
perPhaseSec = 60
}
eccBase, _ := queryECCCounters(idx)
for _, prec := range benchmarkPrecisionPhases {
phaseCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
"--precision", prec,
}
logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec))
phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
eccBefore, _ := queryECCCounters(idx)
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, phaseRows, phaseLogName)
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
eccAfter, _ := queryECCCounters(idx)
if phaseErr != nil || len(phaseRows) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase{
Precision: prec,
Steady: summarizeBenchmarkTelemetry(phaseRows),
ECC: diffECCCounters(eccBefore, eccAfter),
}
for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles {
if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
}
}
gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase)
}
beforeThrottle, _ := queryThrottleCounters(idx)
steadyCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec))
// Sample server power via IPMI in parallel with the steady phase.
// We collect readings every 5s and average them.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// First sample after a short warmup delay.
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, steadyRows, fmt.Sprintf("gpu-%d-steady", idx))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-steady", idx), steadyOut)
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
serverLoadedWSum += loadedW
serverLoadedSamples++
serverLoadedOK = true
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
}
afterThrottle, _ := queryThrottleCounters(idx)
if steadyErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
}
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
serverLoadedWSum += loadedW
serverLoadedSamples++
serverLoadedOK = true
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
parseResult := parseBenchmarkBurnLog(string(steadyOut))
gpuResult.ComputeCapability = parseResult.ComputeCapability
gpuResult.Backend = parseResult.Backend
gpuResult.PrecisionResults = parseResult.Profiles
if parseResult.Fallback {
gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
}
gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
if eccFinal, err := queryECCCounters(idx); err == nil {
gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
}
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
}
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if steadyErr != nil {
gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
} else if parseResult.Fallback {
gpuResult.Status = "PARTIAL"
} else {
gpuResult.Status = "OK"
}
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
}
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
afterThrottle, _ := queryThrottleCounters(idx)
if steadyErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
}
parseResult := parseBenchmarkBurnLog(string(steadyOut))
gpuResult.ComputeCapability = parseResult.ComputeCapability
gpuResult.Backend = parseResult.Backend
gpuResult.PrecisionResults = parseResult.Profiles
if parseResult.Fallback {
gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
}
gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
}
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), cooldownRows)
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if steadyErr != nil {
gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
} else if parseResult.Fallback {
gpuResult.Status = "PARTIAL"
} else {
gpuResult.Status = "OK"
}
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
}
} // end sequential path
if len(selected) > 1 && opts.RunNCCL {
@@ -363,6 +418,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
result.Findings = buildBenchmarkFindings(result)
result.OverallStatus = benchmarkOverallStatus(result)
writeBenchmarkMetricsFiles(runDir, metricRows)
resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil {
@@ -372,7 +428,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
return "", fmt.Errorf("write result.json: %w", err)
}
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
report := renderBenchmarkReportWithCharts(result)
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
return "", fmt.Errorf("write report.md: %w", err)
}
@@ -461,11 +517,11 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
// Split the verbose output into per-GPU sections on "^GPU " lines.
gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
for i, loc := range sectionStarts {
@@ -601,7 +657,6 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
return nil, lastErr
}
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
if os.Geteuid() != 0 {
result.Normalization.Status = "partial"
@@ -704,7 +759,7 @@ func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []
return rows, nil
}
func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir, baseName string, logFunc func(string)) ([]byte, []GPUMetricRow, error) {
func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, logFunc func(string)) ([]byte, []GPUMetricRow, error) {
stopCh := make(chan struct{})
doneCh := make(chan struct{})
var metricRows []GPUMetricRow
@@ -736,18 +791,65 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string
close(stopCh)
<-doneCh
writeBenchmarkMetricsFiles(runDir, baseName, metricRows)
return out, metricRows, err
}
func writeBenchmarkMetricsFiles(runDir, baseName string, rows []GPUMetricRow) {
func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset float64) []GPUMetricRow {
if len(rows) == 0 {
return nil
}
out := make([]GPUMetricRow, len(rows))
for i, row := range rows {
row.Stage = stage
row.ElapsedSec += offset
out[i] = row
}
return out
}
func benchmarkMetricOffset(rows []GPUMetricRow) float64 {
if len(rows) == 0 {
return 0
}
var maxElapsed float64
for _, row := range rows {
if row.ElapsedSec > maxElapsed {
maxElapsed = row.ElapsedSec
}
}
return maxElapsed
}
func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string) {
annotated := annotateBenchmarkMetricRows(rows, stage, benchmarkMetricOffset(*allRows))
*allRows = append(*allRows, annotated...)
}
func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) {
if len(rows) == 0 {
return
}
_ = WriteGPUMetricsCSV(filepath.Join(runDir, baseName+"-metrics.csv"), rows)
_ = WriteGPUMetricsHTML(filepath.Join(runDir, baseName+"-metrics.html"), rows)
chart := RenderGPUTerminalChart(rows)
_ = os.WriteFile(filepath.Join(runDir, baseName+"-metrics-term.txt"), []byte(chart), 0644)
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), rows)
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), rows)
}
func appendBenchmarkStageLog(path, source, stage string, raw []byte) {
if path == "" || len(raw) == 0 {
return
}
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
if err != nil {
return
}
defer f.Close()
header := fmt.Sprintf("\n========== %s | stage=%s ==========\n", source, stage)
_, _ = f.WriteString(header)
if len(raw) > 0 {
_, _ = f.Write(raw)
if raw[len(raw)-1] != '\n' {
_, _ = f.WriteString("\n")
}
}
}
func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
@@ -811,8 +913,11 @@ func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
Iterations: profile.iterations,
Notes: profile.notes,
}
w := precisionWeight(profile.category)
precision.Weight = w
if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 {
precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12
precision.WeightedTeraOpsPerSec = precision.TeraOpsPerSec * w
}
result.Profiles = append(result.Profiles, precision)
}
@@ -841,6 +946,35 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
return profile
}
// precisionWeight returns the fp32-equivalence factor for a precision category.
// Each factor represents how much "real" numeric work one operation of that
// type performs relative to fp32 (single precision = 1.0 baseline):
//
// fp64 = 2.0 — double precision, 2× more bits per operand
// fp32 = 1.0 — single precision baseline
// fp16 = 0.5 — half precision
// fp8 = 0.25 — quarter precision
// fp4 = 0.125 — eighth precision
//
// Multiplying raw TOPS by the weight gives fp32-equivalent TOPS, enabling
// cross-precision comparison on the same numeric scale.
func precisionWeight(category string) float64 {
switch category {
case "fp64":
return 2.0
case "fp32_tf32":
return 1.0
case "fp16_bf16":
return 0.5
case "fp8":
return 0.25
case "fp4":
return 0.125
default:
return 1.0
}
}
func stripBenchmarkPrefix(line string) string {
if strings.HasPrefix(line, "[gpu ") {
if idx := strings.Index(line, "] "); idx >= 0 {
@@ -890,11 +1024,39 @@ func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
score := BenchmarkScorecard{}
for _, precision := range gpu.PrecisionResults {
if precision.Supported {
score.ComputeScore += precision.TeraOpsPerSec
// SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases.
// Each precision ran alone with full GPU dedicated — peak capability.
for _, p := range gpu.PrecisionSteady {
score.SyntheticScore += p.WeightedTeraOpsPerSec
}
// MixedScore: sum of fp32-equivalent TOPS from the combined phase.
// All precisions compete simultaneously — closer to real inference workloads.
for _, p := range gpu.PrecisionResults {
if p.Supported {
score.MixedScore += p.WeightedTeraOpsPerSec
}
}
// MixedEfficiency = MixedScore / SyntheticScore.
// Measures how well the GPU sustains throughput under concurrent mixed load.
// A healthy GPU scores ~0.80.95; severe degradation suggests bandwidth
// contention or scheduler inefficiency.
if score.SyntheticScore > 0 && score.MixedScore > 0 {
score.MixedEfficiency = score.MixedScore / score.SyntheticScore
}
// ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3).
// SyntheticScore is the primary signal; MixedEfficiency adds up to +30%
// bonus for GPUs that handle mixed-precision concurrency well.
// Falls back to MixedScore alone when per-precision data is absent.
switch {
case score.SyntheticScore > 0:
score.ComputeScore = score.SyntheticScore * (1 + score.MixedEfficiency*0.3)
case score.MixedScore > 0:
score.ComputeScore = score.MixedScore
}
// PowerSustainScore: measures how close the GPU came to its rated TDP under
// a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP.
// Penalty applied symmetrically for both under- and over-TDP deviations:
@@ -915,7 +1077,19 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
// StabilityScore: prefer per-precision steady phases where each window runs a
// single kernel type so PowerCVPct is a genuine stability signal (not a
// workload-mix artifact). Fall back to combined steady using clock-only metrics
// when per-precision data is absent (older results, short profiles).
if len(gpu.PrecisionSteady) > 0 {
var sum float64
for _, p := range gpu.PrecisionSteady {
sum += clampScore(100 - (p.Steady.ClockCVPct*4 + p.Steady.PowerCVPct*2 + p.Steady.ClockDriftPct*2))
}
score.StabilityScore = sum / float64(len(gpu.PrecisionSteady))
} else {
score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.ClockDriftPct*2))
}
score.CompositeScore = compositeBenchmarkScore(score)
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
@@ -963,6 +1137,12 @@ func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStat
if normalizationStatus != "full" {
reasons = append(reasons, "normalization_partial")
}
if gpu.ECC.Uncorrected > 0 {
reasons = append(reasons, "ecc_uncorrected_errors")
}
if gpu.ECC.Corrected > 0 {
reasons = append(reasons, "ecc_corrected_errors")
}
return dedupeStrings(reasons)
}
@@ -1064,6 +1244,36 @@ func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThro
}
}
func queryECCCounters(gpuIndex int) (BenchmarkECCCounters, error) {
out, err := satExecCommand(
"nvidia-smi",
"--id="+strconv.Itoa(gpuIndex),
"--query-gpu=ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total",
"--format=csv,noheader,nounits",
).Output()
if err != nil {
return BenchmarkECCCounters{}, err
}
fields := strings.Split(strings.TrimSpace(string(out)), ",")
if len(fields) < 2 {
return BenchmarkECCCounters{}, fmt.Errorf("unexpected ECC counter columns: %q", strings.TrimSpace(string(out)))
}
corrected, err1 := strconv.ParseUint(strings.TrimSpace(fields[0]), 10, 64)
uncorrected, err2 := strconv.ParseUint(strings.TrimSpace(fields[1]), 10, 64)
if err1 != nil || err2 != nil {
// ECC may be disabled on this GPU — return zero counters silently.
return BenchmarkECCCounters{}, nil
}
return BenchmarkECCCounters{Corrected: corrected, Uncorrected: uncorrected}, nil
}
func diffECCCounters(before, after BenchmarkECCCounters) BenchmarkECCCounters {
return BenchmarkECCCounters{
Corrected: saturatingSub(after.Corrected, before.Corrected),
Uncorrected: saturatingSub(after.Uncorrected, before.Uncorrected),
}
}
func queryActiveComputeApps(gpuIndices []int) ([]string, error) {
args := []string{
"--query-compute-apps=gpu_uuid,pid,process_name",
@@ -1141,6 +1351,10 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
case "normalization_partial":
findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
case "ecc_uncorrected_errors":
findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected))
case "ecc_corrected_errors":
findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
}
}
if gpu.Backend == "driver-ptx" {
@@ -1387,7 +1601,10 @@ func maxInt(a, b int) int {
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
func queryIPMIServerPowerW() (float64, error) {
out, err := satExecCommand("ipmitool", "dcmi", "power", "reading").Output()
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, "ipmitool", "dcmi", "power", "reading")
out, err := cmd.Output()
if err != nil {
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
}
@@ -1406,6 +1623,7 @@ func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64,
}
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
var samples []float64
loop:
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
@@ -1415,7 +1633,7 @@ func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64,
}
select {
case <-ctx.Done():
break
break loop
case <-time.After(2 * time.Second):
}
}
@@ -1510,6 +1728,8 @@ func runNvidiaBenchmarkParallel(
calibPowerByIndex map[int]float64,
serverIdleW *float64, serverLoadedWSum *float64,
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
allMetricRows *[]GPUMetricRow,
gpuBurnLog string,
) {
allDevices := joinIndexList(selected)
@@ -1549,8 +1769,8 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected {
perGPU := filterRowsByGPU(baselineRows, idx)
gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU)
}
appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline")
// Sample server idle power once.
if !*serverIdleOK {
@@ -1569,31 +1789,86 @@ func runNvidiaBenchmarkParallel(
"--devices", allDevices,
}
logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc)
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644)
for _, idx := range selected {
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx))
}
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup")
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut)
if warmupErr != nil {
for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
}
}
// ── Per-precision stability phases (parallel) ─────────────────────────────
totalSlots := len(benchmarkPrecisionPhases) + 1
perPhaseSec := spec.SteadySec / totalSlots
if perPhaseSec < 60 {
perPhaseSec = 60
}
eccBase := make(map[int]BenchmarkECCCounters, len(selected))
for _, idx := range selected {
eccBase[idx], _ = queryECCCounters(idx)
}
for _, prec := range benchmarkPrecisionPhases {
phaseCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
"--precision", prec,
}
logFunc(fmt.Sprintf("GPUs %s: %s stability phase (%ds)", allDevices, prec, perPhaseSec))
phaseLogName := "gpu-all-steady-" + prec
eccBeforePhase := make(map[int]BenchmarkECCCounters, len(selected))
for _, idx := range selected {
eccBeforePhase[idx], _ = queryECCCounters(idx)
}
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, phaseRows, phaseLogName)
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected))
for _, idx := range selected {
eccAfterPhase[idx], _ = queryECCCounters(idx)
}
if phaseErr != nil || len(phaseRows) == 0 {
continue
}
parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseOut))
for _, idx := range selected {
perGPU := filterRowsByGPU(phaseRows, idx)
if len(perGPU) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase{
Precision: prec,
Steady: summarizeBenchmarkTelemetry(perGPU),
ECC: diffECCCounters(eccBeforePhase[idx], eccAfterPhase[idx]),
}
if pr, ok := parseByGPU[idx]; ok {
for _, p := range pr.Profiles {
if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
}
}
}
gpuResults[idx].PrecisionSteady = append(gpuResults[idx].PrecisionSteady, phase)
}
}
// Snapshot throttle counters before steady.
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
for _, idx := range selected {
beforeThrottle[idx], _ = queryThrottleCounters(idx)
}
// Steady: all GPUs simultaneously.
// Steady: all GPUs simultaneously (combined). Fixed at one slot = perPhaseSec.
steadyCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(spec.SteadySec),
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", allDevices,
}
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (%ds)", allDevices, spec.SteadySec))
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, perPhaseSec))
// Sample server power via IPMI in parallel with steady phase.
ipmiStopCh := make(chan struct{})
@@ -1627,7 +1902,9 @@ func runNvidiaBenchmarkParallel(
}
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc)
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, steadyRows, "steady")
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "steady", steadyOut)
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
*serverLoadedWSum += loadedW
@@ -1635,8 +1912,6 @@ func runNvidiaBenchmarkParallel(
*serverLoadedOK = true
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
}
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644)
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
for _, idx := range selected {
afterThrottle[idx], _ = queryThrottleCounters(idx)
@@ -1646,9 +1921,11 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected {
perGPU := filterRowsByGPU(steadyRows, idx)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
if eccFinal, err := queryECCCounters(idx); err == nil {
gpuResults[idx].ECC = diffECCCounters(eccBase[idx], eccFinal)
}
if pr, ok := parseResults[idx]; ok {
gpuResults[idx].ComputeCapability = pr.ComputeCapability
@@ -1673,8 +1950,8 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected {
perGPU := filterRowsByGPU(cooldownRows, idx)
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU)
}
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
// Score and finalize each GPU.
for _, idx := range selected {
@@ -1884,7 +2161,7 @@ func runBenchmarkPowerCalibration(
logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, runDir, "power-calibration", logFunc)
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
if err != nil {
logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))

View File

@@ -2,25 +2,15 @@ package platform
import (
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
"time"
)
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
return renderBenchmarkReportWithCharts(result, nil)
return renderBenchmarkReportWithCharts(result)
}
type benchmarkReportChart struct {
Title string
Content string
}
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
var b strings.Builder
// ── Header ────────────────────────────────────────────────────────────────
@@ -91,10 +81,28 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
b.WriteString("\n")
}
// ── Methodology ───────────────────────────────────────────────────────────
b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile)
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
b.WriteString("**Compute score** is derived from two phases:\n\n")
b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
b.WriteString("Each result is normalised to fp32-equivalent TOPS using precision weights: ")
b.WriteString("fp64 ×2.0 · fp32 ×1.0 · fp16 ×0.5 · fp8 ×0.25 · fp4 ×0.125.\n")
b.WriteString("- **Mixed** — all precision types run simultaneously (combined phase). ")
b.WriteString("Reflects real inference workloads where fp8 matrix ops, fp16 attention and fp32 accumulation compete for bandwidth and SM scheduler slots.\n\n")
b.WriteString("**Formula:** `Compute = Synthetic × (1 + MixedEfficiency × 0.3)`\n\n")
b.WriteString("where `MixedEfficiency = Mixed / Synthetic`. A GPU that sustains 90 % throughput under mixed load ")
b.WriteString("receives a +27 % bonus over its synthetic score; one that drops to 60 % receives +18 %.\n\n")
b.WriteString("**Composite score** = `Compute × quality_factor` where quality factors in power sustain, thermal sustain, stability, and interconnect.\n\n")
// ── Scorecard table ───────────────────────────────────────────────────────
b.WriteString("## Scorecard\n\n")
b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
b.WriteString("| GPU | Status | Composite | Compute | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
b.WriteString("|-----|--------|-----------|---------|-----------|-------|------------|-------------|---------------|-----------------|-----------|-------------|\n")
for _, gpu := range result.GPUs {
name := strings.TrimSpace(gpu.Name)
if name == "" {
@@ -108,11 +116,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
if gpu.Scores.TOPSPerSMPerGHz > 0 {
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
}
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
synthetic := "-"
if gpu.Scores.SyntheticScore > 0 {
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
}
mixed := "-"
if gpu.Scores.MixedScore > 0 {
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
}
mixedEff := "-"
if gpu.Scores.MixedEfficiency > 0 {
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
}
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %s | %s | %s | %.1f | %.1f | %.1f | %s |\n",
gpu.Index, name,
gpu.Status,
gpu.Scores.CompositeScore,
gpu.Scores.ComputeScore,
synthetic,
mixed,
mixedEff,
topsPerSM,
gpu.Scores.PowerSustainScore,
gpu.Scores.ThermalSustainScore,
@@ -162,6 +185,34 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
b.WriteString("\n")
// Per-precision stability phases.
if len(gpu.PrecisionSteady) > 0 {
b.WriteString("**Per-precision stability:**\n\n")
b.WriteString("| Precision | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|----------|----------|-------------|----------|------------|\n")
for _, p := range gpu.PrecisionSteady {
eccCorr := "—"
eccUncorr := "—"
if !p.ECC.IsZero() {
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
}
fmt.Fprintf(&b, "| %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
p.Precision, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
eccCorr, eccUncorr)
}
b.WriteString("\n")
} else {
// Legacy: show combined-window variance.
fmt.Fprintf(&b, "**Clock/power variance (combined window):** clock CV %.1f%% · power CV %.1f%% · clock drift %.1f%%\n\n",
gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
}
// ECC summary
if !gpu.ECC.IsZero() {
fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
gpu.ECC.Corrected, gpu.ECC.Uncorrected)
}
// Throttle
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
if throttle != "none" {
@@ -171,12 +222,14 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
// Precision results
if len(gpu.PrecisionResults) > 0 {
b.WriteString("**Precision results:**\n\n")
b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
for _, p := range gpu.PrecisionResults {
if p.Supported {
fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
weightStr := fmt.Sprintf("×%.3g", p.Weight)
fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
} else {
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
}
}
b.WriteString("\n")
@@ -237,61 +290,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
}
}
// ── Terminal charts (steady-state only) ───────────────────────────────────
if len(charts) > 0 {
b.WriteString("## Steady-State Charts\n\n")
for _, chart := range charts {
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
if content == "" {
continue
}
fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
}
}
// ── Methodology ───────────────────────────────────────────────────────────
b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
// ── Raw files ─────────────────────────────────────────────────────────────
b.WriteString("## Raw Files\n\n")
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
b.WriteString("- `gpu-*-warmup.log`\n")
b.WriteString("- `gpu-*-steady.log`\n")
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
if result.Interconnect != nil {
b.WriteString("- `nccl-all-reduce.log`\n")
}
return b.String()
}
// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
// cooldown charts are not useful for human review).
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
var charts []benchmarkReportChart
for _, idx := range gpuIndices {
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
raw, err := os.ReadFile(path)
if err != nil || len(raw) == 0 {
continue
}
charts = append(charts, benchmarkReportChart{
Title: fmt.Sprintf("GPU %d — Steady State", idx),
Content: string(raw),
})
}
return charts
}
func stripANSIEscapeSequences(raw string) string {
return ansiEscapePattern.ReplaceAllString(raw, "")
}
// formatThrottleLine renders throttle counters as human-readable percentages of
// the steady-state window. Only non-zero counters are shown. When the steady
// duration is unknown (0), raw seconds are shown instead.

View File

@@ -147,36 +147,27 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
}
}
func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
t.Parallel()
report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
report := renderBenchmarkReport(NvidiaBenchmarkResult{
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
OverallStatus: "OK",
SelectedGPUIndices: []int{0},
Normalization: BenchmarkNormalization{
Status: "full",
},
}, []benchmarkReportChart{
{
Title: "GPU 0 Steady State",
Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
},
})
for _, needle := range []string{
"Steady-State Charts",
"GPU 0 Steady State",
"GPU 0 chart",
"42┤───",
"gpu-metrics.csv",
"gpu-metrics.html",
"gpu-burn.log",
} {
if !strings.Contains(report, needle) {
t.Fatalf("report missing %q\n%s", needle, report)
}
}
if strings.Contains(report, "\x1b[31m") {
t.Fatalf("report should not contain ANSI escapes\n%s", report)
}
}
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {

View File

@@ -43,7 +43,6 @@ type NvidiaBenchmarkOptions struct {
RampRunID string // shared identifier across all steps of the same ramp-up run
}
type NvidiaBenchmarkResult struct {
BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"`
@@ -84,35 +83,38 @@ type BenchmarkNormalizationGPU struct {
}
type BenchmarkGPUResult struct {
Index int `json:"index"`
UUID string `json:"uuid,omitempty"`
Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"`
VBIOS string `json:"vbios,omitempty"`
ComputeCapability string `json:"compute_capability,omitempty"`
Backend string `json:"backend,omitempty"`
Status string `json:"status"`
PowerLimitW float64 `json:"power_limit_w,omitempty"`
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
Index int `json:"index"`
UUID string `json:"uuid,omitempty"`
Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"`
VBIOS string `json:"vbios,omitempty"`
ComputeCapability string `json:"compute_capability,omitempty"`
Backend string `json:"backend,omitempty"`
Status string `json:"status"`
PowerLimitW float64 `json:"power_limit_w,omitempty"`
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// CalibratedPeakPowerW is the p95 power measured during a short
// dcgmi targeted_power calibration run before the main benchmark.
// Used as the reference denominator for PowerSustainScore instead of
// the hardware default limit, which bee-gpu-burn cannot reach.
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
Baseline BenchmarkTelemetrySummary `json:"baseline"`
Steady BenchmarkTelemetrySummary `json:"steady"`
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
Scores BenchmarkScorecard `json:"scores"`
DegradationReasons []string `json:"degradation_reasons,omitempty"`
Notes []string `json:"notes,omitempty"`
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
Baseline BenchmarkTelemetrySummary `json:"baseline"`
Steady BenchmarkTelemetrySummary `json:"steady"`
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
// ECC error delta accumulated over the full benchmark (all phases combined).
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
Scores BenchmarkScorecard `json:"scores"`
DegradationReasons []string `json:"degradation_reasons,omitempty"`
Notes []string `json:"notes,omitempty"`
}
type BenchmarkTelemetrySummary struct {
@@ -142,6 +144,18 @@ type BenchmarkThrottleCounters struct {
HWPowerBrakeSlowdownUS uint64 `json:"hw_power_brake_slowdown_us"`
}
// BenchmarkECCCounters holds ECC error counts sampled at a point in time.
// Corrected = single-bit errors fixed by ECC (DRAM degradation).
// Uncorrected = double-bit errors that could not be corrected (serious fault).
// Both are volatile (since last driver reset), not persistent.
type BenchmarkECCCounters struct {
Corrected uint64 `json:"corrected"`
Uncorrected uint64 `json:"uncorrected"`
}
func (e BenchmarkECCCounters) Total() uint64 { return e.Corrected + e.Uncorrected }
func (e BenchmarkECCCounters) IsZero() bool { return e.Corrected == 0 && e.Uncorrected == 0 }
type BenchmarkPrecisionResult struct {
Name string `json:"name"`
Category string `json:"category"`
@@ -152,19 +166,31 @@ type BenchmarkPrecisionResult struct {
K uint64 `json:"k,omitempty"`
Iterations uint64 `json:"iterations,omitempty"`
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
Notes string `json:"notes,omitempty"`
// Weight is the fp32-equivalence factor for this precision category.
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125.
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
Weight float64 `json:"weight,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
Notes string `json:"notes,omitempty"`
}
type BenchmarkScorecard struct {
ComputeScore float64 `json:"compute_score"`
ComputeScore float64 `json:"compute_score"`
// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
// steady phases (each precision ran alone, full GPU dedicated).
SyntheticScore float64 `json:"synthetic_score,omitempty"`
// MixedScore is the sum of fp32-equivalent TOPS from the combined phase
// (all precisions competing simultaneously — closer to real workloads).
MixedScore float64 `json:"mixed_score,omitempty"`
// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
// sustains throughput under concurrent mixed-precision load.
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
PowerSustainScore float64 `json:"power_sustain_score"`
ThermalSustainScore float64 `json:"thermal_sustain_score"`
StabilityScore float64 `json:"stability_score"`
InterconnectScore float64 `json:"interconnect_score"`
CompositeScore float64 `json:"composite_score"`
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
// Comparable across throttle levels and GPU generations. Low value at normal
// clocks indicates silicon degradation.
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
}
@@ -182,6 +208,20 @@ type BenchmarkServerPower struct {
Notes []string `json:"notes,omitempty"`
}
// BenchmarkPrecisionSteadyPhase holds per-precision-category telemetry collected
// during a dedicated single-precision steady window. Because only one kernel
// type runs at a time the PowerCVPct here is a genuine stability signal.
type BenchmarkPrecisionSteadyPhase struct {
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
Steady BenchmarkTelemetrySummary `json:"steady"`
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
// ECC errors accumulated during this precision phase only.
// Non-zero corrected = stress-induced DRAM errors for this kernel type.
// Any uncorrected = serious fault triggered by this precision workload.
ECC BenchmarkECCCounters `json:"ecc,omitempty"`
}
type BenchmarkInterconnectResult struct {
Status string `json:"status"`
Attempted bool `json:"attempted"`

View File

@@ -13,6 +13,7 @@ import (
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
type GPUMetricRow struct {
Stage string `json:"stage,omitempty"`
ElapsedSec float64 `json:"elapsed_sec"`
GPUIndex int `json:"index"`
TempC float64 `json:"temp_c"`
@@ -141,14 +142,20 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
// WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
for _, r := range rows {
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
}
return os.WriteFile(path, b.Bytes(), 0644)
}
type gpuMetricStageSpan struct {
Name string
Start float64
End float64
}
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
// Group by GPU index preserving order.
@@ -163,9 +170,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
}
stageSpans := buildGPUMetricStageSpans(rows)
stageColorByName := make(map[string]string, len(stageSpans))
for i, span := range stageSpans {
stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
}
var legend strings.Builder
if len(stageSpans) > 0 {
legend.WriteString(`<div class="stage-legend">`)
for _, span := range stageSpans {
fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
stageColorByName[span.Name], gpuHTMLEscape(span.Name))
}
legend.WriteString(`</div>`)
}
var svgs strings.Builder
for _, gpuIdx := range order {
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
svgs.WriteString("\n")
}
@@ -175,21 +198,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
<meta charset="utf-8">
<title>GPU Stress Test Metrics</title>
<style>
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; }
h1 { text-align: center; color: #333; margin: 0 0 8px; }
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; }
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
*{box-sizing:border-box}
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
.page{padding:24px}
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
.card-body{padding:16px}
h1{font-size:22px;margin:0 0 6px}
p{color:var(--muted);font-size:13px;margin:0 0 16px}
.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
.chart-block{margin-top:16px}
</style>
</head><body>
<div class="page">
<div class="card">
<div class="card-head">GPU Stress Test Metrics</div>
<div class="card-body">
<h1>GPU Stress Test Metrics</h1>
<p>Generated %s</p>
%s
</body></html>`, ts, svgs.String())
<div class="chart-block">%s</div>
</div>
</div>
</div>
</body></html>`, ts, legend.String(), svgs.String())
return os.WriteFile(path, []byte(html), 0644)
}
// drawGPUChartSVG generates a self-contained SVG chart for one GPU.
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
// Layout
const W, H = 960, 520
const plotX1 = 120 // usage axis / chart left border
@@ -284,6 +325,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
}
b.WriteString("</g>\n")
// Stage backgrounds
for _, span := range stageSpans {
x1 := xv(span.Start)
x2 := xv(span.End)
if x2 < x1 {
x1, x2 = x2, x1
}
if x2-x1 < 1 {
x2 = x1 + 1
}
color := stageColorByName[span.Name]
fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
x1, plotY1, x2-x1, PH, color)
fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
}
// Chart border
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
@@ -382,221 +440,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
return b.String()
}
const (
ansiAmber = "\033[38;5;214m"
ansiReset = "\033[0m"
)
const (
termChartWidth = 70
termChartHeight = 12
)
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
// Used in SAT stress-test logs.
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
seen := make(map[int]bool)
var order []int
gpuMap := make(map[int][]GPUMetricRow)
for _, r := range rows {
if !seen[r.GPUIndex] {
seen[r.GPUIndex] = true
order = append(order, r.GPUIndex)
}
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
}
type seriesDef struct {
caption string
color string
fn func(GPUMetricRow) float64
}
defs := []seriesDef{
{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
}
var b strings.Builder
for _, gpuIdx := range order {
gr := gpuMap[gpuIdx]
if len(gr) == 0 {
continue
}
tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax)
for _, d := range defs {
b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
termChartHeight, termChartWidth))
b.WriteRune('\n')
}
}
return strings.TrimRight(b.String(), "\n")
}
// renderLineChart draws a single time-series line chart using box-drawing characters.
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
func renderLineChart(vals []float64, color, caption string, height, width int) string {
if len(vals) == 0 {
return caption + "\n"
}
mn, mx := gpuMinMax(vals)
if mn == mx {
mx = mn + 1
}
// Use the smaller of width or len(vals) to avoid stretching sparse data.
w := width
if len(vals) < w {
w = len(vals)
}
data := gpuDownsample(vals, w)
// row[i] = display row index: 0 = top = max value, height = bottom = min value.
row := make([]int, w)
for i, v := range data {
r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
if r < 0 {
r = 0
}
if r > height {
r = height
}
row[i] = r
}
// Fill the character grid.
grid := make([][]rune, height+1)
for i := range grid {
grid[i] = make([]rune, w)
for j := range grid[i] {
grid[i][j] = ' '
}
}
for x := 0; x < w; x++ {
r := row[x]
if x == 0 {
grid[r][0] = '─'
continue
}
p := row[x-1]
switch {
case r == p:
grid[r][x] = '─'
case r < p: // value went up (row index decreased toward top)
grid[r][x] = '╭'
grid[p][x] = '╯'
for y := r + 1; y < p; y++ {
grid[y][x] = '│'
}
default: // r > p, value went down
grid[p][x] = '╮'
grid[r][x] = '╰'
for y := p + 1; y < r; y++ {
grid[y][x] = '│'
}
}
}
// Y axis tick labels.
ticks := gpuNiceTicks(mn, mx, height/2)
tickAtRow := make(map[int]string)
labelWidth := 4
for _, t := range ticks {
r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
if r < 0 || r > height {
continue
}
s := gpuFormatTick(t)
tickAtRow[r] = s
if len(s) > labelWidth {
labelWidth = len(s)
}
}
var b strings.Builder
for r := 0; r <= height; r++ {
label := tickAtRow[r]
fmt.Fprintf(&b, "%*s", labelWidth, label)
switch {
case label != "":
b.WriteRune('┤')
case r == height:
b.WriteRune('┼')
default:
b.WriteRune('│')
}
b.WriteString(color)
b.WriteString(string(grid[r]))
b.WriteString(ansiReset)
b.WriteRune('\n')
}
// Bottom axis.
b.WriteString(strings.Repeat(" ", labelWidth))
b.WriteRune('└')
b.WriteString(strings.Repeat("─", w))
b.WriteRune('\n')
// Caption centered under the chart.
if caption != "" {
total := labelWidth + 1 + w
if pad := (total - len(caption)) / 2; pad > 0 {
b.WriteString(strings.Repeat(" ", pad))
}
b.WriteString(caption)
b.WriteRune('\n')
}
return b.String()
}
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
v := make([]float64, len(rows))
for i, r := range rows {
v[i] = fn(r)
}
return v
}
// gpuDownsample averages vals into w buckets (or nearest-neighbor upsamples if len(vals) < w).
func gpuDownsample(vals []float64, w int) []float64 {
n := len(vals)
if n == 0 {
return make([]float64, w)
}
result := make([]float64, w)
if n >= w {
counts := make([]int, w)
for i, v := range vals {
bucket := i * w / n
if bucket >= w {
bucket = w - 1
}
result[bucket] += v
counts[bucket]++
}
for i := range result {
if counts[i] > 0 {
result[i] /= float64(counts[i])
}
}
} else {
// Nearest-neighbour upsample.
for i := range result {
src := i * (n - 1) / (w - 1)
if src >= n {
src = n - 1
}
result[i] = vals[src]
}
}
return result
}
func gpuMinMax(vals []float64) (float64, float64) {
if len(vals) == 0 {
return 0, 1
@@ -641,3 +484,46 @@ func gpuFormatTick(v float64) string {
}
return strconv.FormatFloat(v, 'f', 1, 64)
}
var gpuMetricStagePalette = []string{
"#d95c5c",
"#2185d0",
"#21ba45",
"#f2c037",
"#6435c9",
"#00b5ad",
"#a5673f",
}
func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
var spans []gpuMetricStageSpan
for _, row := range rows {
name := strings.TrimSpace(row.Stage)
if name == "" {
name = "run"
}
if len(spans) == 0 || spans[len(spans)-1].Name != name {
spans = append(spans, gpuMetricStageSpan{Name: name, Start: row.ElapsedSec, End: row.ElapsedSec})
continue
}
spans[len(spans)-1].End = row.ElapsedSec
}
for i := range spans {
if spans[i].End <= spans[i].Start {
spans[i].End = spans[i].Start + 1
}
}
return spans
}
var gpuHTMLReplacer = strings.NewReplacer(
"&", "&amp;",
"<", "&lt;",
">", "&gt;",
`"`, "&quot;",
"'", "&#39;",
)
func gpuHTMLEscape(s string) string {
return gpuHTMLReplacer.Replace(s)
}

View File

@@ -0,0 +1,65 @@
package platform
import (
"os"
"path/filepath"
"strings"
"testing"
)
func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
t.Parallel()
dir := t.TempDir()
path := filepath.Join(dir, "gpu-metrics.csv")
rows := []GPUMetricRow{
{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200},
}
if err := WriteGPUMetricsCSV(path, rows); err != nil {
t.Fatalf("WriteGPUMetricsCSV: %v", err)
}
raw, err := os.ReadFile(path)
if err != nil {
t.Fatalf("ReadFile: %v", err)
}
text := string(raw)
for _, needle := range []string{
"stage,elapsed_sec,gpu_index",
`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
} {
if !strings.Contains(text, needle) {
t.Fatalf("csv missing %q\n%s", needle, text)
}
}
}
func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
t.Parallel()
dir := t.TempDir()
path := filepath.Join(dir, "gpu-metrics.html")
rows := []GPUMetricRow{
{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
}
if err := WriteGPUMetricsHTML(path, rows); err != nil {
t.Fatalf("WriteGPUMetricsHTML: %v", err)
}
raw, err := os.ReadFile(path)
if err != nil {
t.Fatalf("ReadFile: %v", err)
}
text := string(raw)
for _, needle := range []string{
"stage-legend",
"baseline",
"steady-fp16",
"GPU Stress Test Metrics",
} {
if !strings.Contains(text, needle) {
t.Fatalf("html missing %q\n%s", needle, text)
}
}
}

View File

@@ -108,15 +108,15 @@ type nvidiaGPUHealth struct {
}
type nvidiaGPUStatusFile struct {
Index int
Name string
RunStatus string
Reason string
Health string
HealthRaw string
Observed bool
Selected bool
FailingJob string
Index int
Name string
RunStatus string
Reason string
Health string
HealthRaw string
Observed bool
Selected bool
FailingJob string
}
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
@@ -410,13 +410,13 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
satJob{
name: "03-dcgmproftester.log",
cmd: profCmd,
env: profEnv,
collectGPU: true,
gpuIndices: selected,
},
satJob{
name: "03-dcgmproftester.log",
cmd: profCmd,
env: profEnv,
collectGPU: true,
gpuIndices: selected,
},
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
), logFunc)
}
@@ -1382,8 +1382,6 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
if len(metricRows) > 0 {
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
chart := RenderGPUTerminalChart(metricRows)
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
}
return out, err

View File

@@ -497,6 +497,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
StaggerGPUStart bool `json:"stagger_gpu_start"`
ParallelGPUs bool `json:"parallel_gpus"`
Loader string `json:"loader"`
Profile string `json:"profile"`
DisplayName string `json:"display_name"`
@@ -519,6 +520,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
StaggerGPUStart: body.StaggerGPUStart,
ParallelGPUs: body.ParallelGPUs,
Loader: body.Loader,
BurnProfile: body.Profile,
DisplayName: body.DisplayName,

View File

@@ -1928,23 +1928,10 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
// ── Benchmark ─────────────────────────────────────────────────────────────────
type benchmarkHistoryColumn struct {
key string
label string
name string
index int
parallel bool
}
type benchmarkHistoryCell struct {
score float64
present bool
}
type benchmarkHistoryRun struct {
generatedAt time.Time
displayTime string
cells map[string]benchmarkHistoryCell
gpuScores map[int]float64 // GPU index → composite score
}
func renderBenchmark(opts HandlerOptions) string {
@@ -2206,17 +2193,17 @@ benchmarkLoadGPUs();
}
func renderBenchmarkResultsCard(exportDir string) string {
columns, runs := loadBenchmarkHistory(exportDir)
maxIdx, runs := loadBenchmarkHistory(exportDir)
return renderBenchmarkResultsCardFromRuns(
"Benchmark Results",
"Composite score by saved benchmark run and GPU.",
"No saved benchmark runs yet.",
columns,
maxIdx,
runs,
)
}
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, columns []benchmarkHistoryColumn, runs []benchmarkHistoryRun) string {
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, maxGPUIndex int, runs []benchmarkHistoryRun) string {
if len(runs) == 0 {
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
}
@@ -2226,22 +2213,22 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
}
b.WriteString(`<div style="overflow-x:auto">`)
b.WriteString(`<table><thead><tr><th>Test</th><th>Time</th>`)
for _, col := range columns {
b.WriteString(`<th>` + html.EscapeString(col.label) + `</th>`)
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`)
for i := 0; i <= maxGPUIndex; i++ {
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
}
b.WriteString(`</tr></thead><tbody>`)
for i, run := range runs {
b.WriteString(`<tr>`)
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
for _, col := range columns {
cell, ok := run.cells[col.key]
if !ok || !cell.present {
for idx := 0; idx <= maxGPUIndex; idx++ {
score, ok := run.gpuScores[idx]
if !ok {
b.WriteString(`<td style="color:var(--muted)">-</td>`)
continue
}
b.WriteString(`<td>` + fmt.Sprintf("%.2f", cell.score) + `</td>`)
b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`)
}
b.WriteString(`</tr>`)
}
@@ -2249,22 +2236,22 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
return b.String()
}
func loadBenchmarkHistory(exportDir string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
baseDir := app.DefaultBenchmarkBaseDir
if strings.TrimSpace(exportDir) != "" {
baseDir = filepath.Join(exportDir, "bee-benchmark")
}
paths, err := filepath.Glob(filepath.Join(baseDir, "gpu-benchmark-*", "result.json"))
if err != nil || len(paths) == 0 {
return nil, nil
return -1, nil
}
sort.Strings(paths)
return loadBenchmarkHistoryFromPaths(paths)
}
func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
columnByKey := make(map[string]benchmarkHistoryColumn)
func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) {
runs := make([]benchmarkHistoryRun, 0, len(paths))
maxGPUIndex := -1
for _, path := range paths {
raw, err := os.ReadFile(path)
if err != nil {
@@ -2277,102 +2264,22 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
run := benchmarkHistoryRun{
generatedAt: result.GeneratedAt,
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
cells: make(map[string]benchmarkHistoryCell),
gpuScores: make(map[int]float64),
}
if result.ParallelGPUs {
// All GPUs ran simultaneously — one column per server, score = avg composite.
gpuModelCount := make(map[string]int)
for _, gpu := range result.GPUs {
gpuModelCount[strings.TrimSpace(gpu.Name)]++
}
scoreSum := make(map[string]float64)
scoreCnt := make(map[string]int)
for _, gpu := range result.GPUs {
key := "parallel|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name)
scoreSum[key] += gpu.Scores.CompositeScore
scoreCnt[key]++
count := gpuModelCount[strings.TrimSpace(gpu.Name)]
columnByKey[key] = benchmarkHistoryColumn{
key: key,
label: benchmarkHistoryParallelLabel(result.ServerModel, gpu.Name, count),
name: strings.TrimSpace(gpu.Name),
index: -1,
parallel: true,
}
}
for key, sum := range scoreSum {
run.cells[key] = benchmarkHistoryCell{score: sum / float64(scoreCnt[key]), present: true}
}
} else {
// Each GPU ran independently — one column per GPU index.
for _, gpu := range result.GPUs {
key := "gpu|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name) + "|" + strconv.Itoa(gpu.Index)
columnByKey[key] = benchmarkHistoryColumn{
key: key,
label: benchmarkHistoryPerGPULabel(gpu.Name, gpu.Index),
name: strings.TrimSpace(gpu.Name),
index: gpu.Index,
parallel: false,
}
run.cells[key] = benchmarkHistoryCell{score: gpu.Scores.CompositeScore, present: true}
for _, gpu := range result.GPUs {
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
if gpu.Index > maxGPUIndex {
maxGPUIndex = gpu.Index
}
}
runs = append(runs, run)
}
columns := make([]benchmarkHistoryColumn, 0, len(columnByKey))
for _, col := range columnByKey {
columns = append(columns, col)
}
// Sequential GPU columns first (sorted by GPU index), then parallel server columns.
sort.Slice(columns, func(i, j int) bool {
if columns[i].parallel != columns[j].parallel {
return !columns[i].parallel // sequential first
}
if columns[i].parallel {
li := strings.ToLower(columns[i].label)
lj := strings.ToLower(columns[j].label)
if li != lj {
return li < lj
}
return columns[i].key < columns[j].key
}
// Sequential: sort by GPU index, then name.
if columns[i].index != columns[j].index {
return columns[i].index < columns[j].index
}
return strings.ToLower(columns[i].name) < strings.ToLower(columns[j].name)
})
sort.Slice(runs, func(i, j int) bool {
return runs[i].generatedAt.After(runs[j].generatedAt)
})
return columns, runs
return maxGPUIndex, runs
}
// benchmarkHistoryPerGPULabel formats a label for a single-GPU column: "GPU #N — ModelName".
func benchmarkHistoryPerGPULabel(gpuName string, index int) string {
gpuName = strings.TrimSpace(gpuName)
if gpuName == "" {
gpuName = "Unknown GPU"
}
return fmt.Sprintf("GPU #%d — %s", index, gpuName)
}
// benchmarkHistoryParallelLabel formats a label for an all-GPU parallel column:
// "ServerModel — N× ModelName (All GPUs)" or "N× ModelName (All GPUs)" if no server.
func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) string {
serverModel = strings.TrimSpace(serverModel)
gpuName = strings.TrimSpace(gpuName)
if gpuName == "" {
gpuName = "Unknown GPU"
}
gpuPart := fmt.Sprintf("%d× %s (All GPUs)", count, gpuName)
if serverModel == "" {
return gpuPart
}
return fmt.Sprintf("%s — %s", serverModel, gpuPart)
}
// ── Burn ──────────────────────────────────────────────────────────────────────

View File

@@ -693,8 +693,8 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
for _, needle := range []string{
`Benchmark Results`,
`Composite score by saved benchmark run and GPU.`,
`GPU #0 — NVIDIA H100 PCIe`,
`GPU #1 — NVIDIA H100 PCIe`,
`GPU 0`,
`GPU 1`,
`#1`,
wantTime,
`1176.25`,

View File

@@ -422,7 +422,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
for _, needle := range []string{
`Benchmark Results`,
`Composite score for this benchmark task.`,
`GPU #0 — NVIDIA H100 PCIe`,
`GPU 0`,
`1176.25`,
} {
if !strings.Contains(html, needle) {

View File

@@ -6,7 +6,7 @@ NCCL_CUDA_VERSION=13.0
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
NCCL_TESTS_VERSION=2.13.10
NVCC_VERSION=12.8
CUBLAS_VERSION=13.0.2.14-1
CUBLAS_VERSION=13.1.1.3-1
CUDA_USERSPACE_VERSION=13.0.96-1
DCGM_VERSION=4.5.3-1
JOHN_JUMBO_COMMIT=67fcf9fe5a

View File

@@ -33,7 +33,6 @@ typedef void *CUstream;
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
#define MAX_STRESS_STREAMS 16
#define MAX_CUBLAS_PROFILES 5
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
@@ -689,6 +688,8 @@ static const struct profile_desc k_profiles[] = {
#endif
};
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
static int load_cublaslt(struct cublaslt_api *api) {
memset(api, 0, sizeof(*api));
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -1121,9 +1122,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
int cc_minor,
int seconds,
int size_mb,
const char *precision_filter,
struct stress_report *report) {
struct cublaslt_api cublas;
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
cublasLtHandle_t handle = NULL;
CUcontext ctx = NULL;
CUstream streams[MAX_STRESS_STREAMS] = {0};
@@ -1133,7 +1135,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
int active = 0;
int mp_count = 0;
int stream_count = 1;
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
int profile_count = PROFILE_COUNT;
int prepared_count = 0;
size_t requested_budget = 0;
size_t total_budget = 0;
@@ -1158,8 +1160,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
return 0;
}
/* Count profiles matching the filter (for deciding what to run). */
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
(precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
planned++;
}
}
@@ -1170,18 +1174,31 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
return 0;
}
/* Count all profiles active on this GPU regardless of filter.
* Used as the budget divisor so matrix sizes stay consistent whether
* running all precisions together or a single-precision phase. */
int planned_total = 0;
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
planned_total++;
}
}
if (planned_total < planned) {
planned_total = planned;
}
requested_budget = (size_t)size_mb * 1024u * 1024u;
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
}
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
}
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
cuda->cuStreamCreate &&
cuda->cuStreamDestroy) {
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
}
if (stream_count > 1) {
int created = 0;
@@ -1194,7 +1211,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
}
}
report->stream_count = stream_count;
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
}
@@ -1218,6 +1235,13 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
desc->min_cc);
continue;
}
if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
append_detail(report->details,
sizeof(report->details),
"%s=SKIPPED precision_filter\n",
desc->name);
continue;
}
for (int lane = 0; lane < stream_count; lane++) {
CUstream stream = streams[lane];
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
@@ -1339,6 +1363,7 @@ int main(int argc, char **argv) {
int seconds = 5;
int size_mb = 64;
int device_index = 0;
const char *precision_filter = NULL; /* NULL = all; else block_label to match */
for (int i = 1; i < argc; i++) {
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
seconds = atoi(argv[++i]);
@@ -1346,8 +1371,12 @@ int main(int argc, char **argv) {
size_mb = atoi(argv[++i]);
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
device_index = atoi(argv[++i]);
} else if (strcmp(argv[i], "--precision") == 0 && i + 1 < argc) {
precision_filter = argv[++i];
} else {
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
fprintf(stderr,
"usage: %s [--seconds N] [--size-mb N] [--device N] [--precision fp8|fp16|fp32|fp64|fp4]\n",
argv[0]);
return 2;
}
}
@@ -1407,10 +1436,20 @@ int main(int argc, char **argv) {
int ok = 0;
#if HAVE_CUBLASLT_HEADERS
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report);
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
#endif
if (!ok) {
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) {
if (precision_filter != NULL) {
fprintf(stderr,
"requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
precision_filter,
name,
cc_major,
cc_minor);
return 1;
}
int ptx_mb = size_mb;
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
return 1;
}
}

View File

@@ -873,9 +873,37 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
echo "=== bee-gpu-burn FP4 header probe ==="
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
fp4_scale_match="$(grep -Rsnm 1 'CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
if [ -n "$fp4_type_match" ]; then
echo "fp4_header_symbol=present"
echo "$fp4_type_match"
else
echo "fp4_header_symbol=missing"
fi
if [ -n "$fp4_scale_match" ]; then
echo "fp4_scale_mode_symbol=present"
echo "$fp4_scale_match"
else
echo "fp4_scale_mode_symbol=missing"
fi
GPU_STRESS_NEED_BUILD=1
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
if [ -f "$GPU_BURN_WORKER_BIN" ]; then
GPU_STRESS_NEED_BUILD=0
for dep in \
"${BUILDER_DIR}/bee-gpu-stress.c" \
"${BUILDER_DIR}/VERSIONS"; do
if [ "$dep" -nt "$GPU_BURN_WORKER_BIN" ]; then
GPU_STRESS_NEED_BUILD=1
break
fi
done
if [ "$GPU_STRESS_NEED_BUILD" = "0" ] && \
find "${CUBLAS_CACHE}/include" "${CUBLAS_CACHE}/lib" -type f -newer "$GPU_BURN_WORKER_BIN" | grep -q .; then
GPU_STRESS_NEED_BUILD=1
fi
fi
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
@@ -889,6 +917,12 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
else
echo "=== bee-gpu-burn worker up to date, skipping build ==="
fi
echo "=== bee-gpu-burn compiled profile probe ==="
if grep -aq 'fp4_e2m1' "$GPU_BURN_WORKER_BIN"; then
echo "fp4_profile_string=present"
else
echo "fp4_profile_string=missing"
fi
fi
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="

View File

@@ -6,10 +6,11 @@ STAGGER_SECONDS=0
SIZE_MB=0
DEVICES=""
EXCLUDE=""
PRECISION=""
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
usage() {
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3] [--precision fp8|fp16|fp32|fp64|fp4]" >&2
exit 2
}
@@ -30,6 +31,7 @@ while [ "$#" -gt 0 ]; do
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
--precision) [ "$#" -ge 2 ] || usage; PRECISION="$2"; shift 2 ;;
*) usage ;;
esac
done
@@ -88,8 +90,10 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
gpu_seconds=$(( SECONDS + extra_sec ))
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
precision_arg=""
[ -n "${PRECISION}" ] && precision_arg="--precision ${PRECISION}"
CUDA_VISIBLE_DEVICES="${id}" \
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" ${precision_arg} >"${log}" 2>&1 &
pid=$!
WORKERS="${WORKERS} ${pid}:${id}:${log}"
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then