Compare commits

..

8 Commits
v7.14 ... v7.20

Author SHA1 Message Date
Mikhail Chusavitin
b1a5035edd Normalize task queue priorities by workflow 2026-04-14 11:13:54 +03:00
Mikhail Chusavitin
8fc986c933 Add benchmark fan duty cycle summary to report 2026-04-14 10:24:02 +03:00
Mikhail Chusavitin
88b5e0edf2 Harden IPMI power probe timeout 2026-04-14 10:18:23 +03:00
Mikhail Chusavitin
82fe1f6d26 Disable precision fallback and pin cuBLAS 13.1 2026-04-14 10:17:44 +03:00
81e7c921f8 дебаг при сборке 2026-04-14 07:02:37 +03:00
0fb8f2777f Fix combined gpu burn profile capacity for fp4 2026-04-14 00:00:40 +03:00
bf182daa89 Fix benchmark report methodology and rebuild gpu burn worker on toolchain changes 2026-04-13 23:43:12 +03:00
457ea1cf04 Unify benchmark exports and drop ASCII charts 2026-04-13 21:38:28 +03:00
15 changed files with 898 additions and 579 deletions

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@
.DS_Store .DS_Store
dist/ dist/
iso/out/ iso/out/
build-cache/

View File

@@ -125,6 +125,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
} }
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected))) logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
var metricRows []GPUMetricRow
gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
// Server power characterization state — populated during per-GPU phases. // Server power characterization state — populated during per-GPU phases.
var serverIdleW, serverLoadedWSum float64 var serverIdleW, serverLoadedWSum float64
@@ -171,199 +173,202 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10) cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
if opts.ParallelGPUs { if opts.ParallelGPUs {
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples) runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, gpuBurnLog)
} else { } else {
for _, idx := range selected { for _, idx := range selected {
gpuResult := BenchmarkGPUResult{ gpuResult := BenchmarkGPUResult{
Index: idx, Index: idx,
Status: "FAILED", Status: "FAILED",
} }
if info, ok := infoByIndex[idx]; ok { if info, ok := infoByIndex[idx]; ok {
gpuResult.UUID = info.UUID gpuResult.UUID = info.UUID
gpuResult.Name = info.Name gpuResult.Name = info.Name
gpuResult.BusID = info.BusID gpuResult.BusID = info.BusID
gpuResult.VBIOS = info.VBIOS gpuResult.VBIOS = info.VBIOS
gpuResult.PowerLimitW = info.PowerLimitW gpuResult.PowerLimitW = info.PowerLimitW
gpuResult.MultiprocessorCount = info.MultiprocessorCount gpuResult.MultiprocessorCount = info.MultiprocessorCount
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
} }
if w, ok := calibPowerByIndex[idx]; ok && w > 0 { if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
gpuResult.CalibratedPeakPowerW = w gpuResult.CalibratedPeakPowerW = w
} }
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
}
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
}
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
// Sample server idle power once (first GPU only — server state is global).
if !serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
} }
}
warmupCmd := []string{ baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
"bee-gpu-burn", if err != nil && err != context.Canceled {
"--seconds", strconv.Itoa(spec.WarmupSec), gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
"--size-mb", strconv.Itoa(opts.SizeMB), }
"--devices", strconv.Itoa(idx), gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
} appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx))
logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
warmupOut, _, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-warmup", idx), logFunc)
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-warmup.log", idx)), warmupOut, 0644)
if warmupErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
continue
}
// ── Per-precision stability phases ──────────────────────────────────────── // Sample server idle power once (first GPU only — server state is global).
// Run each precision category alone so PowerCVPct reflects genuine GPU if !serverIdleOK {
// power stability, not kernel-mix variance. if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
// Time budget: each phase gets steadySec/numPhases, minimum 60 s. serverIdleW = w
// SteadySec is split equally across all precision phases + 1 combined slot. serverIdleOK = true
// Skipped phases (unsupported precision) are simply omitted; combined is fixed. logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
totalSlots := len(benchmarkPrecisionPhases) + 1 }
perPhaseSec := spec.SteadySec / totalSlots }
if perPhaseSec < 60 {
perPhaseSec = 60 warmupCmd := []string{
} "bee-gpu-burn",
eccBase, _ := queryECCCounters(idx) "--seconds", strconv.Itoa(spec.WarmupSec),
for _, prec := range benchmarkPrecisionPhases { "--size-mb", strconv.Itoa(opts.SizeMB),
phaseCmd := []string{ "--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut)
if warmupErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
continue
}
// ── Per-precision stability phases ────────────────────────────────────────
// Run each precision category alone so PowerCVPct reflects genuine GPU
// power stability, not kernel-mix variance.
// Time budget: each phase gets steadySec/numPhases, minimum 60 s.
// SteadySec is split equally across all precision phases + 1 combined slot.
// Skipped phases (unsupported precision) are simply omitted; combined is fixed.
totalSlots := len(benchmarkPrecisionPhases) + 1
perPhaseSec := spec.SteadySec / totalSlots
if perPhaseSec < 60 {
perPhaseSec = 60
}
eccBase, _ := queryECCCounters(idx)
for _, prec := range benchmarkPrecisionPhases {
phaseCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
"--precision", prec,
}
logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec))
phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
eccBefore, _ := queryECCCounters(idx)
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, phaseRows, phaseLogName)
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
eccAfter, _ := queryECCCounters(idx)
if phaseErr != nil || len(phaseRows) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase{
Precision: prec,
Steady: summarizeBenchmarkTelemetry(phaseRows),
ECC: diffECCCounters(eccBefore, eccAfter),
}
for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles {
if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
}
}
gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase)
}
beforeThrottle, _ := queryThrottleCounters(idx)
steadyCmd := []string{
"bee-gpu-burn", "bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec), "--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB), "--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx), "--devices", strconv.Itoa(idx),
"--precision", prec,
} }
logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec)) logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec))
phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
eccBefore, _ := queryECCCounters(idx)
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, runDir, phaseLogName, logFunc)
eccAfter, _ := queryECCCounters(idx)
if phaseErr != nil || len(phaseRows) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase{
Precision: prec,
Steady: summarizeBenchmarkTelemetry(phaseRows),
ECC: diffECCCounters(eccBefore, eccAfter),
}
for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles {
if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
}
}
gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase)
}
beforeThrottle, _ := queryThrottleCounters(idx) // Sample server power via IPMI in parallel with the steady phase.
steadyCmd := []string{ // We collect readings every 5s and average them.
"bee-gpu-burn", ipmiStopCh := make(chan struct{})
"--seconds", strconv.Itoa(perPhaseSec), ipmiResultCh := make(chan float64, 1)
"--size-mb", strconv.Itoa(opts.SizeMB), go func() {
"--devices", strconv.Itoa(idx), defer close(ipmiResultCh)
} var samples []float64
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec)) ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// Sample server power via IPMI in parallel with the steady phase. // First sample after a short warmup delay.
// We collect readings every 5s and average them.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// First sample after a short warmup delay.
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select { select {
case <-ipmiStopCh: case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return return
case <-ticker.C: case <-time.After(15 * time.Second):
} }
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, steadyRows, fmt.Sprintf("gpu-%d-steady", idx))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-steady", idx), steadyOut)
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
serverLoadedWSum += loadedW
serverLoadedSamples++
serverLoadedOK = true
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
}
afterThrottle, _ := queryThrottleCounters(idx)
if steadyErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
} }
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc) parseResult := parseBenchmarkBurnLog(string(steadyOut))
close(ipmiStopCh) gpuResult.ComputeCapability = parseResult.ComputeCapability
if loadedW, ok := <-ipmiResultCh; ok { gpuResult.Backend = parseResult.Backend
serverLoadedWSum += loadedW gpuResult.PrecisionResults = parseResult.Profiles
serverLoadedSamples++ if parseResult.Fallback {
serverLoadedOK = true gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW)) }
gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
if eccFinal, err := queryECCCounters(idx); err == nil {
gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
}
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
}
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if steadyErr != nil {
gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
} else if parseResult.Fallback {
gpuResult.Status = "PARTIAL"
} else {
gpuResult.Status = "OK"
}
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
} }
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
afterThrottle, _ := queryThrottleCounters(idx)
if steadyErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
}
parseResult := parseBenchmarkBurnLog(string(steadyOut))
gpuResult.ComputeCapability = parseResult.ComputeCapability
gpuResult.Backend = parseResult.Backend
gpuResult.PrecisionResults = parseResult.Profiles
if parseResult.Fallback {
gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
}
gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
if eccFinal, err := queryECCCounters(idx); err == nil {
gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
}
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
}
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), cooldownRows)
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if steadyErr != nil {
gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
} else if parseResult.Fallback {
gpuResult.Status = "PARTIAL"
} else {
gpuResult.Status = "OK"
}
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
}
} // end sequential path } // end sequential path
if len(selected) > 1 && opts.RunNCCL { if len(selected) > 1 && opts.RunNCCL {
@@ -396,6 +401,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples) serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
} }
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK) result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
result.Cooling = summarizeBenchmarkCooling(metricRows)
// Apply server-power penalty when IPMI reports the server delta is much // Apply server-power penalty when IPMI reports the server delta is much
// lower than GPU-reported sum: GPU power telemetry is over-stated, making // lower than GPU-reported sum: GPU power telemetry is over-stated, making
@@ -413,6 +419,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
result.Findings = buildBenchmarkFindings(result) result.Findings = buildBenchmarkFindings(result)
result.OverallStatus = benchmarkOverallStatus(result) result.OverallStatus = benchmarkOverallStatus(result)
writeBenchmarkMetricsFiles(runDir, metricRows)
resultJSON, err := json.MarshalIndent(result, "", " ") resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil { if err != nil {
@@ -422,7 +429,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
return "", fmt.Errorf("write result.json: %w", err) return "", fmt.Errorf("write result.json: %w", err)
} }
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected)) report := renderBenchmarkReportWithCharts(result)
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil { if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
return "", fmt.Errorf("write report.md: %w", err) return "", fmt.Errorf("write report.md: %w", err)
} }
@@ -511,11 +518,11 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
// Split the verbose output into per-GPU sections on "^GPU " lines. // Split the verbose output into per-GPU sections on "^GPU " lines.
gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`) gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`) maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`) maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`) defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`) currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`) smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1) sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
for i, loc := range sectionStarts { for i, loc := range sectionStarts {
@@ -651,7 +658,6 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
return nil, lastErr return nil, lastErr
} }
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction { func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
if os.Geteuid() != 0 { if os.Geteuid() != 0 {
result.Normalization.Status = "partial" result.Normalization.Status = "partial"
@@ -734,7 +740,7 @@ func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []
if ctx.Err() != nil { if ctx.Err() != nil {
return rows, ctx.Err() return rows, ctx.Err()
} }
samples, err := sampleGPUMetrics(gpuIndices) samples, err := sampleBenchmarkTelemetry(gpuIndices)
if err == nil { if err == nil {
elapsed := time.Since(start).Seconds() elapsed := time.Since(start).Seconds()
for i := range samples { for i := range samples {
@@ -754,7 +760,7 @@ func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []
return rows, nil return rows, nil
} }
func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir, baseName string, logFunc func(string)) ([]byte, []GPUMetricRow, error) { func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, logFunc func(string)) ([]byte, []GPUMetricRow, error) {
stopCh := make(chan struct{}) stopCh := make(chan struct{})
doneCh := make(chan struct{}) doneCh := make(chan struct{})
var metricRows []GPUMetricRow var metricRows []GPUMetricRow
@@ -769,7 +775,7 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string
case <-stopCh: case <-stopCh:
return return
case <-ticker.C: case <-ticker.C:
samples, err := sampleGPUMetrics(gpuIndices) samples, err := sampleBenchmarkTelemetry(gpuIndices)
if err != nil { if err != nil {
continue continue
} }
@@ -786,18 +792,96 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string
close(stopCh) close(stopCh)
<-doneCh <-doneCh
writeBenchmarkMetricsFiles(runDir, baseName, metricRows)
return out, metricRows, err return out, metricRows, err
} }
func writeBenchmarkMetricsFiles(runDir, baseName string, rows []GPUMetricRow) { type benchmarkCoolingSample struct {
AvgFanRPM float64
AvgFanDutyCyclePct float64
FanDutyCycleAvailable bool
}
// sampleBenchmarkTelemetry takes one round of per-GPU metric samples for the
// given GPU indices and attaches the current host fan telemetry (average RPM
// and duty cycle, when available) to every returned row, so cooling data
// travels alongside the GPU metrics through the benchmark pipeline.
func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) {
	rows, err := sampleGPUMetrics(gpuIndices)
	if err != nil {
		return nil, err
	}
	// One cooling snapshot per sampling round — fans are host-global, so the
	// same reading applies to every GPU row in this batch.
	cooling := sampleBenchmarkCoolingSample()
	for i := range rows {
		rows[i].FanAvgRPM = cooling.AvgFanRPM
		rows[i].FanDutyCyclePct = cooling.AvgFanDutyCyclePct
		rows[i].FanDutyCycleAvailable = cooling.FanDutyCycleAvailable
	}
	return rows, nil
}
// sampleBenchmarkCoolingSample captures a single snapshot of host cooling
// telemetry: the mean fan RPM across all fans plus the fan duty cycle
// percentage (with a flag recording whether the host exposes duty cycle).
// Fan-read errors are deliberately ignored — cooling data is best-effort.
func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
	var sample benchmarkCoolingSample
	fans, _ := sampleFanSpeeds()
	sample.AvgFanRPM, _, _ = fanRPMStats(fans)
	sample.AvgFanDutyCyclePct, sample.FanDutyCycleAvailable = sampleFanDutyCyclePct()
	return sample
}
// annotateBenchmarkMetricRows returns a copy of rows with each row tagged
// with the given stage name and its ElapsedSec shifted by offset, so that
// samples from successive stages form one continuous timeline.
// Returns nil for empty input; the input slice itself is never mutated
// (rows are copied by value before being modified).
func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset float64) []GPUMetricRow {
	if len(rows) == 0 {
		return nil
	}
	annotated := make([]GPUMetricRow, 0, len(rows))
	for _, row := range rows {
		row.Stage = stage
		row.ElapsedSec += offset
		annotated = append(annotated, row)
	}
	return annotated
}
// benchmarkMetricOffset returns the largest ElapsedSec found in rows — the
// point on the combined timeline at which the next stage's samples should
// start. An empty slice yields 0.
func benchmarkMetricOffset(rows []GPUMetricRow) float64 {
	var offset float64
	for _, row := range rows {
		if row.ElapsedSec > offset {
			offset = row.ElapsedSec
		}
	}
	return offset
}
// appendBenchmarkMetrics tags rows with the stage name, shifts their elapsed
// times past the end of what *allRows already holds, and appends the result
// in place — building one stage-annotated timeline across the whole run.
func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string) {
	offset := benchmarkMetricOffset(*allRows)
	*allRows = append(*allRows, annotateBenchmarkMetricRows(rows, stage, offset)...)
}
func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) {
if len(rows) == 0 { if len(rows) == 0 {
return return
} }
_ = WriteGPUMetricsCSV(filepath.Join(runDir, baseName+"-metrics.csv"), rows) _ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), rows)
_ = WriteGPUMetricsHTML(filepath.Join(runDir, baseName+"-metrics.html"), rows) _ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), rows)
chart := RenderGPUTerminalChart(rows) }
_ = os.WriteFile(filepath.Join(runDir, baseName+"-metrics-term.txt"), []byte(chart), 0644)
func appendBenchmarkStageLog(path, source, stage string, raw []byte) {
if path == "" || len(raw) == 0 {
return
}
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
if err != nil {
return
}
defer f.Close()
header := fmt.Sprintf("\n========== %s | stage=%s ==========\n", source, stage)
_, _ = f.WriteString(header)
if len(raw) > 0 {
_, _ = f.Write(raw)
if raw[len(raw)-1] != '\n' {
_, _ = f.WriteString("\n")
}
}
} }
func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult { func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
@@ -897,11 +981,13 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
// precisionWeight returns the fp32-equivalence factor for a precision category. // precisionWeight returns the fp32-equivalence factor for a precision category.
// Each factor represents how much "real" numeric work one operation of that // Each factor represents how much "real" numeric work one operation of that
// type performs relative to fp32 (single precision = 1.0 baseline): // type performs relative to fp32 (single precision = 1.0 baseline):
// fp64 = 2.0 — double precision, 2× more bits per operand //
// fp32 = 1.0 — single precision baseline // fp64 = 2.0 — double precision, 2× more bits per operand
// fp16 = 0.5half precision // fp32 = 1.0single precision baseline
// fp8 = 0.25 — quarter precision // fp16 = 0.5 half precision
// fp4 = 0.125 — eighth precision // fp8 = 0.25 — quarter precision
// fp4 = 0.125 — eighth precision
//
// Multiplying raw TOPS by the weight gives fp32-equivalent TOPS, enabling // Multiplying raw TOPS by the weight gives fp32-equivalent TOPS, enabling
// cross-precision comparison on the same numeric scale. // cross-precision comparison on the same numeric scale.
func precisionWeight(category string) float64 { func precisionWeight(category string) float64 {
@@ -968,6 +1054,37 @@ func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary
return summary return summary
} }
// summarizeBenchmarkCooling aggregates host fan telemetry collected across
// all benchmark stages into a single cooling summary. RPM readings of zero
// are treated as "no reading" and skipped; duty-cycle values are counted
// whenever the row reports the metric as available. Returns nil when rows
// is empty or no row carries usable fan data.
func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
	if len(rows) == 0 {
		return nil
	}
	var rpm, duty []float64
	for _, row := range rows {
		if row.FanAvgRPM > 0 {
			rpm = append(rpm, row.FanAvgRPM)
		}
		if row.FanDutyCycleAvailable {
			duty = append(duty, row.FanDutyCyclePct)
		}
	}
	if len(rpm) == 0 && len(duty) == 0 {
		return nil
	}
	summary := &BenchmarkCoolingSummary{
		Available: true,
		AvgFanRPM: benchmarkMean(rpm),
	}
	if len(duty) == 0 {
		// RPM-only hosts: flag the gap so report readers know why the
		// duty-cycle columns are empty.
		summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
		return summary
	}
	summary.FanDutyCycleAvailable = true
	summary.AvgFanDutyCyclePct = benchmarkMean(duty)
	summary.P95FanDutyCyclePct = benchmarkPercentile(duty, 95)
	return summary
}
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
score := BenchmarkScorecard{} score := BenchmarkScorecard{}
@@ -1547,7 +1664,10 @@ func maxInt(a, b int) int {
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi. // queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed. // Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
func queryIPMIServerPowerW() (float64, error) { func queryIPMIServerPowerW() (float64, error) {
out, err := satExecCommand("ipmitool", "dcmi", "power", "reading").Output() ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
cmd := exec.CommandContext(ctx, "ipmitool", "dcmi", "power", "reading")
out, err := cmd.Output()
if err != nil { if err != nil {
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err) return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
} }
@@ -1566,6 +1686,7 @@ func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64,
} }
deadline := time.Now().Add(time.Duration(durationSec) * time.Second) deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
var samples []float64 var samples []float64
loop:
for { for {
if w, err := queryIPMIServerPowerW(); err == nil { if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w) samples = append(samples, w)
@@ -1575,7 +1696,7 @@ func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64,
} }
select { select {
case <-ctx.Done(): case <-ctx.Done():
break break loop
case <-time.After(2 * time.Second): case <-time.After(2 * time.Second):
} }
} }
@@ -1670,6 +1791,8 @@ func runNvidiaBenchmarkParallel(
calibPowerByIndex map[int]float64, calibPowerByIndex map[int]float64,
serverIdleW *float64, serverLoadedWSum *float64, serverIdleW *float64, serverLoadedWSum *float64,
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int, serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
allMetricRows *[]GPUMetricRow,
gpuBurnLog string,
) { ) {
allDevices := joinIndexList(selected) allDevices := joinIndexList(selected)
@@ -1709,8 +1832,8 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected { for _, idx := range selected {
perGPU := filterRowsByGPU(baselineRows, idx) perGPU := filterRowsByGPU(baselineRows, idx)
gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU)
} }
appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline")
// Sample server idle power once. // Sample server idle power once.
if !*serverIdleOK { if !*serverIdleOK {
@@ -1729,11 +1852,9 @@ func runNvidiaBenchmarkParallel(
"--devices", allDevices, "--devices", allDevices,
} }
logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec)) logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc) warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644) appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup")
for _, idx := range selected { appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx))
}
if warmupErr != nil { if warmupErr != nil {
for _, idx := range selected { for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error()) gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
@@ -1764,7 +1885,9 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected { for _, idx := range selected {
eccBeforePhase[idx], _ = queryECCCounters(idx) eccBeforePhase[idx], _ = queryECCCounters(idx)
} }
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, runDir, phaseLogName, logFunc) phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, phaseRows, phaseLogName)
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected)) eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected))
for _, idx := range selected { for _, idx := range selected {
eccAfterPhase[idx], _ = queryECCCounters(idx) eccAfterPhase[idx], _ = queryECCCounters(idx)
@@ -1842,7 +1965,9 @@ func runNvidiaBenchmarkParallel(
} }
}() }()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc) steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, steadyRows, "steady")
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "steady", steadyOut)
close(ipmiStopCh) close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok { if loadedW, ok := <-ipmiResultCh; ok {
*serverLoadedWSum += loadedW *serverLoadedWSum += loadedW
@@ -1850,8 +1975,6 @@ func runNvidiaBenchmarkParallel(
*serverLoadedOK = true *serverLoadedOK = true
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW)) logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
} }
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644)
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected)) afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
for _, idx := range selected { for _, idx := range selected {
afterThrottle[idx], _ = queryThrottleCounters(idx) afterThrottle[idx], _ = queryThrottleCounters(idx)
@@ -1861,7 +1984,6 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected { for _, idx := range selected {
perGPU := filterRowsByGPU(steadyRows, idx) perGPU := filterRowsByGPU(steadyRows, idx)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx]) gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
if eccFinal, err := queryECCCounters(idx); err == nil { if eccFinal, err := queryECCCounters(idx); err == nil {
@@ -1891,8 +2013,8 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected { for _, idx := range selected {
perGPU := filterRowsByGPU(cooldownRows, idx) perGPU := filterRowsByGPU(cooldownRows, idx)
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU)
} }
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
// Score and finalize each GPU. // Score and finalize each GPU.
for _, idx := range selected { for _, idx := range selected {
@@ -2102,7 +2224,7 @@ func runBenchmarkPowerCalibration(
logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices))) logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices) cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, runDir, "power-calibration", logFunc) out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644) _ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
if err != nil { if err != nil {
logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err)) logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))

View File

@@ -2,25 +2,15 @@ package platform
import ( import (
"fmt" "fmt"
"os"
"path/filepath"
"regexp"
"strings" "strings"
"time" "time"
) )
func renderBenchmarkReport(result NvidiaBenchmarkResult) string { func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
return renderBenchmarkReportWithCharts(result, nil) return renderBenchmarkReportWithCharts(result)
} }
type benchmarkReportChart struct { func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
Title string
Content string
}
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
var b strings.Builder var b strings.Builder
// ── Header ──────────────────────────────────────────────────────────────── // ── Header ────────────────────────────────────────────────────────────────
@@ -91,8 +81,12 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
b.WriteString("\n") b.WriteString("\n")
} }
// ── Scoring methodology ─────────────────────────────────────────────────── // ── Methodology ───────────────────────────────────────────────────────────
b.WriteString("## Scoring Methodology\n\n") b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile)
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
b.WriteString("**Compute score** is derived from two phases:\n\n") b.WriteString("**Compute score** is derived from two phases:\n\n")
b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ") b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ") b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
@@ -213,7 +207,6 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct) gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
} }
// ECC summary // ECC summary
if !gpu.ECC.IsZero() { if !gpu.ECC.IsZero() {
fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n", fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
@@ -297,61 +290,41 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
} }
} }
// ── Terminal charts (steady-state only) ─────────────────────────────────── // ── Cooling ───────────────────────────────────────────────────────────────
if len(charts) > 0 { if cooling := result.Cooling; cooling != nil {
b.WriteString("## Steady-State Charts\n\n") b.WriteString("## Cooling\n\n")
for _, chart := range charts { if cooling.Available {
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content)) b.WriteString("| Metric | Value |\n|--------|-------|\n")
if content == "" { fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
continue if cooling.FanDutyCycleAvailable {
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
} else {
b.WriteString("| Average fan duty cycle | N/A |\n")
b.WriteString("| P95 fan duty cycle | N/A |\n")
} }
fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content) b.WriteString("\n")
} else {
b.WriteString("Cooling telemetry unavailable.\n\n")
}
for _, note := range cooling.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
if len(cooling.Notes) > 0 {
b.WriteString("\n")
} }
} }
// ── Methodology ───────────────────────────────────────────────────────────
b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
// ── Raw files ───────────────────────────────────────────────────────────── // ── Raw files ─────────────────────────────────────────────────────────────
b.WriteString("## Raw Files\n\n") b.WriteString("## Raw Files\n\n")
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n") b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n") b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
b.WriteString("- `gpu-*-warmup.log`\n")
b.WriteString("- `gpu-*-steady.log`\n")
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
if result.Interconnect != nil { if result.Interconnect != nil {
b.WriteString("- `nccl-all-reduce.log`\n") b.WriteString("- `nccl-all-reduce.log`\n")
} }
return b.String() return b.String()
} }
// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
// cooldown charts are not useful for human review).
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
var charts []benchmarkReportChart
for _, idx := range gpuIndices {
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
raw, err := os.ReadFile(path)
if err != nil || len(raw) == 0 {
continue
}
charts = append(charts, benchmarkReportChart{
Title: fmt.Sprintf("GPU %d — Steady State", idx),
Content: string(raw),
})
}
return charts
}
func stripANSIEscapeSequences(raw string) string {
return ansiEscapePattern.ReplaceAllString(raw, "")
}
// formatThrottleLine renders throttle counters as human-readable percentages of // formatThrottleLine renders throttle counters as human-readable percentages of
// the steady-state window. Only non-zero counters are shown. When the steady // the steady-state window. Only non-zero counters are shown. When the steady
// duration is unknown (0), raw seconds are shown instead. // duration is unknown (0), raw seconds are shown instead.

View File

@@ -131,6 +131,13 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
DegradationReasons: []string{"power_capped"}, DegradationReasons: []string{"power_capped"},
}, },
}, },
Cooling: &BenchmarkCoolingSummary{
Available: true,
AvgFanRPM: 9200,
FanDutyCycleAvailable: true,
AvgFanDutyCyclePct: 47.5,
P95FanDutyCyclePct: 62.0,
},
} }
report := renderBenchmarkReport(result) report := renderBenchmarkReport(result)
@@ -140,6 +147,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
"1176.00", "1176.00",
"fp16_tensor", "fp16_tensor",
"700.00", "700.00",
"Cooling",
"Average fan duty cycle",
"47.5%",
} { } {
if !strings.Contains(report, needle) { if !strings.Contains(report, needle) {
t.Fatalf("report missing %q\n%s", needle, report) t.Fatalf("report missing %q\n%s", needle, report)
@@ -147,36 +157,27 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
} }
} }
func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) { func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
t.Parallel() t.Parallel()
report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{ report := renderBenchmarkReport(NvidiaBenchmarkResult{
BenchmarkProfile: NvidiaBenchmarkProfileStandard, BenchmarkProfile: NvidiaBenchmarkProfileStandard,
OverallStatus: "OK", OverallStatus: "OK",
SelectedGPUIndices: []int{0}, SelectedGPUIndices: []int{0},
Normalization: BenchmarkNormalization{ Normalization: BenchmarkNormalization{
Status: "full", Status: "full",
}, },
}, []benchmarkReportChart{
{
Title: "GPU 0 Steady State",
Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
},
}) })
for _, needle := range []string{ for _, needle := range []string{
"Steady-State Charts", "gpu-metrics.csv",
"GPU 0 Steady State", "gpu-metrics.html",
"GPU 0 chart", "gpu-burn.log",
"42┤───",
} { } {
if !strings.Contains(report, needle) { if !strings.Contains(report, needle) {
t.Fatalf("report missing %q\n%s", needle, report) t.Fatalf("report missing %q\n%s", needle, report)
} }
} }
if strings.Contains(report, "\x1b[31m") {
t.Fatalf("report should not contain ANSI escapes\n%s", report)
}
} }
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) { func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {

View File

@@ -25,6 +25,17 @@ type BenchmarkCPULoad struct {
Note string `json:"note,omitempty"` Note string `json:"note,omitempty"`
} }
// BenchmarkCoolingSummary captures fan telemetry averaged across the full
// benchmark run.
type BenchmarkCoolingSummary struct {
Available bool `json:"available"`
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
P95FanDutyCyclePct float64 `json:"p95_fan_duty_cycle_pct,omitempty"`
Notes []string `json:"notes,omitempty"`
}
const ( const (
NvidiaBenchmarkProfileStandard = "standard" NvidiaBenchmarkProfileStandard = "standard"
NvidiaBenchmarkProfileStability = "stability" NvidiaBenchmarkProfileStability = "stability"
@@ -43,7 +54,6 @@ type NvidiaBenchmarkOptions struct {
RampRunID string // shared identifier across all steps of the same ramp-up run RampRunID string // shared identifier across all steps of the same ramp-up run
} }
type NvidiaBenchmarkResult struct { type NvidiaBenchmarkResult struct {
BenchmarkVersion string `json:"benchmark_version"` BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"` GeneratedAt time.Time `json:"generated_at"`
@@ -62,6 +72,7 @@ type NvidiaBenchmarkResult struct {
Normalization BenchmarkNormalization `json:"normalization"` Normalization BenchmarkNormalization `json:"normalization"`
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"` HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"` CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
Cooling *BenchmarkCoolingSummary `json:"cooling,omitempty"`
GPUs []BenchmarkGPUResult `json:"gpus"` GPUs []BenchmarkGPUResult `json:"gpus"`
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"` Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
@@ -84,38 +95,38 @@ type BenchmarkNormalizationGPU struct {
} }
type BenchmarkGPUResult struct { type BenchmarkGPUResult struct {
Index int `json:"index"` Index int `json:"index"`
UUID string `json:"uuid,omitempty"` UUID string `json:"uuid,omitempty"`
Name string `json:"name,omitempty"` Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"` BusID string `json:"bus_id,omitempty"`
VBIOS string `json:"vbios,omitempty"` VBIOS string `json:"vbios,omitempty"`
ComputeCapability string `json:"compute_capability,omitempty"` ComputeCapability string `json:"compute_capability,omitempty"`
Backend string `json:"backend,omitempty"` Backend string `json:"backend,omitempty"`
Status string `json:"status"` Status string `json:"status"`
PowerLimitW float64 `json:"power_limit_w,omitempty"` PowerLimitW float64 `json:"power_limit_w,omitempty"`
MultiprocessorCount int `json:"multiprocessor_count,omitempty"` MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// CalibratedPeakPowerW is the p95 power measured during a short // CalibratedPeakPowerW is the p95 power measured during a short
// dcgmi targeted_power calibration run before the main benchmark. // dcgmi targeted_power calibration run before the main benchmark.
// Used as the reference denominator for PowerSustainScore instead of // Used as the reference denominator for PowerSustainScore instead of
// the hardware default limit, which bee-gpu-burn cannot reach. // the hardware default limit, which bee-gpu-burn cannot reach.
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"` CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"` MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"` BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"` MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"` LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"` LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
Baseline BenchmarkTelemetrySummary `json:"baseline"` Baseline BenchmarkTelemetrySummary `json:"baseline"`
Steady BenchmarkTelemetrySummary `json:"steady"` Steady BenchmarkTelemetrySummary `json:"steady"`
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"` PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
Cooldown BenchmarkTelemetrySummary `json:"cooldown"` Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
Throttle BenchmarkThrottleCounters `json:"throttle_counters"` Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
// ECC error delta accumulated over the full benchmark (all phases combined). // ECC error delta accumulated over the full benchmark (all phases combined).
ECC BenchmarkECCCounters `json:"ecc,omitempty"` ECC BenchmarkECCCounters `json:"ecc,omitempty"`
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"` PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
Scores BenchmarkScorecard `json:"scores"` Scores BenchmarkScorecard `json:"scores"`
DegradationReasons []string `json:"degradation_reasons,omitempty"` DegradationReasons []string `json:"degradation_reasons,omitempty"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
} }
type BenchmarkTelemetrySummary struct { type BenchmarkTelemetrySummary struct {
@@ -170,19 +181,19 @@ type BenchmarkPrecisionResult struct {
// Weight is the fp32-equivalence factor for this precision category. // Weight is the fp32-equivalence factor for this precision category.
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125. // fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125.
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput. // WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
Weight float64 `json:"weight,omitempty"` Weight float64 `json:"weight,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"` WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
Notes string `json:"notes,omitempty"` Notes string `json:"notes,omitempty"`
} }
type BenchmarkScorecard struct { type BenchmarkScorecard struct {
ComputeScore float64 `json:"compute_score"` ComputeScore float64 `json:"compute_score"`
// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision // SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
// steady phases (each precision ran alone, full GPU dedicated). // steady phases (each precision ran alone, full GPU dedicated).
SyntheticScore float64 `json:"synthetic_score,omitempty"` SyntheticScore float64 `json:"synthetic_score,omitempty"`
// MixedScore is the sum of fp32-equivalent TOPS from the combined phase // MixedScore is the sum of fp32-equivalent TOPS from the combined phase
// (all precisions competing simultaneously — closer to real workloads). // (all precisions competing simultaneously — closer to real workloads).
MixedScore float64 `json:"mixed_score,omitempty"` MixedScore float64 `json:"mixed_score,omitempty"`
// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU // MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
// sustains throughput under concurrent mixed-precision load. // sustains throughput under concurrent mixed-precision load.
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"` MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
@@ -220,7 +231,7 @@ type BenchmarkPrecisionSteadyPhase struct {
// ECC errors accumulated during this precision phase only. // ECC errors accumulated during this precision phase only.
// Non-zero corrected = stress-induced DRAM errors for this kernel type. // Non-zero corrected = stress-induced DRAM errors for this kernel type.
// Any uncorrected = serious fault triggered by this precision workload. // Any uncorrected = serious fault triggered by this precision workload.
ECC BenchmarkECCCounters `json:"ecc,omitempty"` ECC BenchmarkECCCounters `json:"ecc,omitempty"`
} }
type BenchmarkInterconnectResult struct { type BenchmarkInterconnectResult struct {

View File

@@ -13,14 +13,18 @@ import (
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test. // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
type GPUMetricRow struct { type GPUMetricRow struct {
ElapsedSec float64 `json:"elapsed_sec"` Stage string `json:"stage,omitempty"`
GPUIndex int `json:"index"` ElapsedSec float64 `json:"elapsed_sec"`
TempC float64 `json:"temp_c"` GPUIndex int `json:"index"`
UsagePct float64 `json:"usage_pct"` TempC float64 `json:"temp_c"`
MemUsagePct float64 `json:"mem_usage_pct"` UsagePct float64 `json:"usage_pct"`
PowerW float64 `json:"power_w"` MemUsagePct float64 `json:"mem_usage_pct"`
ClockMHz float64 `json:"clock_mhz"` PowerW float64 `json:"power_w"`
MemClockMHz float64 `json:"mem_clock_mhz"` ClockMHz float64 `json:"clock_mhz"`
MemClockMHz float64 `json:"mem_clock_mhz"`
FanAvgRPM float64 `json:"fan_avg_rpm,omitempty"`
FanDutyCyclePct float64 `json:"fan_duty_cycle_pct,omitempty"`
FanDutyCycleAvailable bool `json:"fan_duty_cycle_available,omitempty"`
} }
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU. // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
@@ -141,14 +145,24 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
// WriteGPUMetricsCSV writes collected rows as a CSV file. // WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer var b bytes.Buffer
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n") b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
for _, r := range rows { for _, r := range rows {
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n", dutyAvail := 0
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz) if r.FanDutyCycleAvailable {
dutyAvail = 1
}
fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
} }
return os.WriteFile(path, b.Bytes(), 0644) return os.WriteFile(path, b.Bytes(), 0644)
} }
type gpuMetricStageSpan struct {
Name string
Start float64
End float64
}
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU. // WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error { func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
// Group by GPU index preserving order. // Group by GPU index preserving order.
@@ -163,9 +177,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r) gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
} }
stageSpans := buildGPUMetricStageSpans(rows)
stageColorByName := make(map[string]string, len(stageSpans))
for i, span := range stageSpans {
stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
}
var legend strings.Builder
if len(stageSpans) > 0 {
legend.WriteString(`<div class="stage-legend">`)
for _, span := range stageSpans {
fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
stageColorByName[span.Name], gpuHTMLEscape(span.Name))
}
legend.WriteString(`</div>`)
}
var svgs strings.Builder var svgs strings.Builder
for _, gpuIdx := range order { for _, gpuIdx := range order {
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx)) svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
svgs.WriteString("\n") svgs.WriteString("\n")
} }
@@ -175,21 +205,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
<meta charset="utf-8"> <meta charset="utf-8">
<title>GPU Stress Test Metrics</title> <title>GPU Stress Test Metrics</title>
<style> <style>
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; } :root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
h1 { text-align: center; color: #333; margin: 0 0 8px; } *{box-sizing:border-box}
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; } body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
.page{padding:24px}
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
.card-body{padding:16px}
h1{font-size:22px;margin:0 0 6px}
p{color:var(--muted);font-size:13px;margin:0 0 16px}
.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
.chart-block{margin-top:16px}
</style> </style>
</head><body> </head><body>
<div class="page">
<div class="card">
<div class="card-head">GPU Stress Test Metrics</div>
<div class="card-body">
<h1>GPU Stress Test Metrics</h1> <h1>GPU Stress Test Metrics</h1>
<p>Generated %s</p> <p>Generated %s</p>
%s %s
</body></html>`, ts, svgs.String()) <div class="chart-block">%s</div>
</div>
</div>
</div>
</body></html>`, ts, legend.String(), svgs.String())
return os.WriteFile(path, []byte(html), 0644) return os.WriteFile(path, []byte(html), 0644)
} }
// drawGPUChartSVG generates a self-contained SVG chart for one GPU. // drawGPUChartSVG generates a self-contained SVG chart for one GPU.
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string { func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
// Layout // Layout
const W, H = 960, 520 const W, H = 960, 520
const plotX1 = 120 // usage axis / chart left border const plotX1 = 120 // usage axis / chart left border
@@ -284,6 +332,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
} }
b.WriteString("</g>\n") b.WriteString("</g>\n")
// Stage backgrounds
for _, span := range stageSpans {
x1 := xv(span.Start)
x2 := xv(span.End)
if x2 < x1 {
x1, x2 = x2, x1
}
if x2-x1 < 1 {
x2 = x1 + 1
}
color := stageColorByName[span.Name]
fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
x1, plotY1, x2-x1, PH, color)
fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
}
// Chart border // Chart border
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+ fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
` fill="none" stroke="#333" stroke-width="1"/>`+"\n", ` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
@@ -382,221 +447,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
return b.String() return b.String()
} }
const (
ansiAmber = "\033[38;5;214m"
ansiReset = "\033[0m"
)
const (
termChartWidth = 70
termChartHeight = 12
)
// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
// Used in SAT stress-test logs.
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
seen := make(map[int]bool)
var order []int
gpuMap := make(map[int][]GPUMetricRow)
for _, r := range rows {
if !seen[r.GPUIndex] {
seen[r.GPUIndex] = true
order = append(order, r.GPUIndex)
}
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
}
type seriesDef struct {
caption string
color string
fn func(GPUMetricRow) float64
}
defs := []seriesDef{
{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
}
var b strings.Builder
for _, gpuIdx := range order {
gr := gpuMap[gpuIdx]
if len(gr) == 0 {
continue
}
tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec
fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax)
for _, d := range defs {
b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption,
termChartHeight, termChartWidth))
b.WriteRune('\n')
}
}
return strings.TrimRight(b.String(), "\n")
}
// renderLineChart draws a single time-series line chart using box-drawing characters.
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
//
// vals is the raw series (downsampled to fit the plot area), color is an ANSI
// escape prefix applied to the plotted glyphs (terminated with ansiReset per
// row), caption is centered under the chart, and height/width bound the plot
// area in character cells. An empty series renders just the caption.
func renderLineChart(vals []float64, color, caption string, height, width int) string {
	if len(vals) == 0 {
		return caption + "\n"
	}
	mn, mx := gpuMinMax(vals)
	if mn == mx {
		// Flat series: widen the range so the row-scaling below never divides by zero.
		mx = mn + 1
	}
	// Use the smaller of width or len(vals) to avoid stretching sparse data.
	w := width
	if len(vals) < w {
		w = len(vals)
	}
	data := gpuDownsample(vals, w)
	// row[i] = display row index: 0 = top = max value, height = bottom = min value.
	row := make([]int, w)
	for i, v := range data {
		r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
		if r < 0 {
			r = 0
		}
		if r > height {
			r = height
		}
		row[i] = r
	}
	// Fill the character grid (height+1 rows so both min and max rows exist).
	grid := make([][]rune, height+1)
	for i := range grid {
		grid[i] = make([]rune, w)
		for j := range grid[i] {
			grid[i][j] = ' '
		}
	}
	for x := 0; x < w; x++ {
		r := row[x]
		if x == 0 {
			grid[r][0] = '─'
			continue
		}
		// Join column x to column x-1: corners at the two endpoints,
		// vertical bars filling any rows in between.
		p := row[x-1]
		switch {
		case r == p:
			grid[r][x] = '─'
		case r < p: // value went up (row index decreased toward top)
			grid[r][x] = '╭'
			grid[p][x] = '╯'
			for y := r + 1; y < p; y++ {
				grid[y][x] = '│'
			}
		default: // r > p, value went down
			grid[p][x] = '╮'
			grid[r][x] = '╰'
			for y := p + 1; y < r; y++ {
				grid[y][x] = '│'
			}
		}
	}
	// Y axis tick labels: map each tick value to a display row, dropping
	// ticks that round outside the plot; track the widest label for padding.
	ticks := gpuNiceTicks(mn, mx, height/2)
	tickAtRow := make(map[int]string)
	labelWidth := 4
	for _, t := range ticks {
		r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
		if r < 0 || r > height {
			continue
		}
		s := gpuFormatTick(t)
		tickAtRow[r] = s
		if len(s) > labelWidth {
			labelWidth = len(s)
		}
	}
	var b strings.Builder
	for r := 0; r <= height; r++ {
		label := tickAtRow[r]
		fmt.Fprintf(&b, "%*s", labelWidth, label)
		// Axis glyph: ┤ at labeled tick rows, ┼ at the bottom row, │ elsewhere.
		switch {
		case label != "":
			b.WriteRune('┤')
		case r == height:
			b.WriteRune('┼')
		default:
			b.WriteRune('│')
		}
		b.WriteString(color)
		b.WriteString(string(grid[r]))
		b.WriteString(ansiReset)
		b.WriteRune('\n')
	}
	// Bottom axis.
	b.WriteString(strings.Repeat(" ", labelWidth))
	b.WriteRune('└')
	b.WriteString(strings.Repeat("─", w))
	b.WriteRune('\n')
	// Caption centered under the chart (label gutter + axis + plot width).
	if caption != "" {
		total := labelWidth + 1 + w
		if pad := (total - len(caption)) / 2; pad > 0 {
			b.WriteString(strings.Repeat(" ", pad))
		}
		b.WriteString(caption)
		b.WriteRune('\n')
	}
	return b.String()
}
// extractGPUField projects one numeric field out of each metric row using the
// supplied accessor, preserving row order.
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
	out := make([]float64, 0, len(rows))
	for _, row := range rows {
		out = append(out, fn(row))
	}
	return out
}
// gpuDownsample resamples vals to exactly w points.
//
// When len(vals) >= w, each output point is the mean of its bucket of
// consecutive inputs; when len(vals) < w, the series is nearest-neighbour
// upsampled. An empty input yields w zeros. A non-positive w yields nil —
// previously w == 0 with a non-empty input indexed result[-1] and panicked.
func gpuDownsample(vals []float64, w int) []float64 {
	if w <= 0 {
		// Guard: the bucket clamp below would produce index w-1 == -1 for w == 0.
		return nil
	}
	n := len(vals)
	if n == 0 {
		return make([]float64, w)
	}
	result := make([]float64, w)
	if n >= w {
		// Average each bucket of ~n/w consecutive samples.
		counts := make([]int, w)
		for i, v := range vals {
			bucket := i * w / n
			if bucket >= w {
				bucket = w - 1
			}
			result[bucket] += v
			counts[bucket]++
		}
		for i := range result {
			if counts[i] > 0 {
				result[i] /= float64(counts[i])
			}
		}
		return result
	}
	// Nearest-neighbour upsample. Here n >= 1 and n < w, so w >= 2 and the
	// (w-1) divisor is safe.
	for i := range result {
		src := i * (n - 1) / (w - 1)
		if src >= n {
			src = n - 1
		}
		result[i] = vals[src]
	}
	return result
}
func gpuMinMax(vals []float64) (float64, float64) { func gpuMinMax(vals []float64) (float64, float64) {
if len(vals) == 0 { if len(vals) == 0 {
return 0, 1 return 0, 1
@@ -641,3 +491,46 @@ func gpuFormatTick(v float64) string {
} }
return strconv.FormatFloat(v, 'f', 1, 64) return strconv.FormatFloat(v, 'f', 1, 64)
} }
// gpuMetricStagePalette holds the hex colors used to distinguish benchmark
// stage spans in the generated metrics report.
// NOTE(review): presumably indexed modulo its length when there are more
// stages than colors — confirm at the consumer, which is outside this view.
var gpuMetricStagePalette = []string{
	"#d95c5c",
	"#2185d0",
	"#21ba45",
	"#f2c037",
	"#6435c9",
	"#00b5ad",
	"#a5673f",
}
// buildGPUMetricStageSpans collapses consecutive rows sharing a stage name
// into [Start, End] spans over elapsed seconds. Blank stage names map to
// "run", and zero-width spans are widened to one second so they stay visible.
func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
	var spans []gpuMetricStageSpan
	for _, row := range rows {
		stage := strings.TrimSpace(row.Stage)
		if stage == "" {
			stage = "run"
		}
		last := len(spans) - 1
		if last >= 0 && spans[last].Name == stage {
			// Same stage as the previous row: extend the open span.
			spans[last].End = row.ElapsedSec
			continue
		}
		spans = append(spans, gpuMetricStageSpan{Name: stage, Start: row.ElapsedSec, End: row.ElapsedSec})
	}
	for i := range spans {
		if span := &spans[i]; span.End <= span.Start {
			span.End = span.Start + 1
		}
	}
	return spans
}
// gpuHTMLReplacer escapes the five characters with special meaning in HTML
// text and attribute contexts. Coverage matches html.EscapeString, but the
// double quote is emitted as &quot; rather than &#34;, so the output bytes
// differ from the stdlib helper.
var gpuHTMLReplacer = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	`"`, "&quot;",
	"'", "&#39;",
)

// gpuHTMLEscape returns s with HTML-special characters replaced by entities.
func gpuHTMLEscape(s string) string {
	return gpuHTMLReplacer.Replace(s)
}

View File

@@ -0,0 +1,65 @@
package platform
import (
"os"
"path/filepath"
"strings"
"testing"
)
// TestWriteGPUMetricsCSVIncludesStageColumn verifies the CSV export carries
// the stage column in both the header and each data row.
func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
	t.Parallel()
	path := filepath.Join(t.TempDir(), "gpu-metrics.csv")
	rows := []GPUMetricRow{
		{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200},
	}
	if err := WriteGPUMetricsCSV(path, rows); err != nil {
		t.Fatalf("WriteGPUMetricsCSV: %v", err)
	}
	raw, err := os.ReadFile(path)
	if err != nil {
		t.Fatalf("ReadFile: %v", err)
	}
	text := string(raw)
	wanted := []string{
		"stage,elapsed_sec,gpu_index",
		`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
	}
	for _, needle := range wanted {
		if !strings.Contains(text, needle) {
			t.Fatalf("csv missing %q\n%s", needle, text)
		}
	}
}
// TestWriteGPUMetricsHTMLShowsStageLegendAndLabels verifies the HTML report
// renders the stage legend, both stage names, and the page title.
func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
	t.Parallel()
	path := filepath.Join(t.TempDir(), "gpu-metrics.html")
	rows := []GPUMetricRow{
		{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
		{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
		{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
		{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
	}
	if err := WriteGPUMetricsHTML(path, rows); err != nil {
		t.Fatalf("WriteGPUMetricsHTML: %v", err)
	}
	raw, err := os.ReadFile(path)
	if err != nil {
		t.Fatalf("ReadFile: %v", err)
	}
	text := string(raw)
	wanted := []string{
		"stage-legend",
		"baseline",
		"steady-fp16",
		"GPU Stress Test Metrics",
	}
	for _, needle := range wanted {
		if !strings.Contains(text, needle) {
			t.Fatalf("html missing %q\n%s", needle, text)
		}
	}
}

View File

@@ -108,15 +108,15 @@ type nvidiaGPUHealth struct {
} }
type nvidiaGPUStatusFile struct { type nvidiaGPUStatusFile struct {
Index int Index int
Name string Name string
RunStatus string RunStatus string
Reason string Reason string
Health string Health string
HealthRaw string HealthRaw string
Observed bool Observed bool
Selected bool Selected bool
FailingJob string FailingJob string
} }
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi. // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
@@ -410,13 +410,13 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode( return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}}, satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
satJob{ satJob{
name: "03-dcgmproftester.log", name: "03-dcgmproftester.log",
cmd: profCmd, cmd: profCmd,
env: profEnv, env: profEnv,
collectGPU: true, collectGPU: true,
gpuIndices: selected, gpuIndices: selected,
}, },
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
), logFunc) ), logFunc)
} }
@@ -1382,8 +1382,6 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
if len(metricRows) > 0 { if len(metricRows) > 0 {
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows) _ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows) _ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
chart := RenderGPUTerminalChart(metricRows)
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
} }
return out, err return out, err

View File

@@ -426,6 +426,101 @@ func sampleFanSpeedsViaSensorsJSON() ([]FanReading, error) {
return fans, nil return fans, nil
} }
// sampleFanDutyCyclePct reads fan PWM/duty-cycle controls from lm-sensors.
// Returns the average duty cycle across all exposed PWM controls; the second
// result is false when `sensors -j` fails or produces no output.
func sampleFanDutyCyclePct() (float64, bool) {
	raw, err := exec.Command("sensors", "-j").Output()
	if err != nil {
		return 0, false
	}
	if len(raw) == 0 {
		return 0, false
	}
	return parseFanDutyCyclePctSensorsJSON(raw)
}
// parseFanDutyCyclePctSensorsJSON extracts PWM duty-cycle readings from the
// JSON document produced by `sensors -j` and averages them into a single
// percentage. The second result is false when the document cannot be parsed
// or contains no PWM telemetry.
func parseFanDutyCyclePctSensorsJSON(raw []byte) (float64, bool) {
	var doc map[string]map[string]any
	if err := json.Unmarshal(raw, &doc); err != nil {
		return 0, false
	}
	var samples []float64
	for _, features := range doc {
		for name, feature := range features {
			// Each chip carries an "Adapter" string entry; skip it.
			if strings.EqualFold(name, "Adapter") {
				continue
			}
			fields, ok := feature.(map[string]any)
			if !ok {
				continue
			}
			duty, found := firstFanDutyValue(name, fields)
			if found {
				samples = append(samples, duty)
			}
		}
	}
	if len(samples) == 0 {
		return 0, false
	}
	return benchmarkMean(samples), true
}
// firstFanDutyValue hunts for a PWM duty-cycle value inside one sensors
// feature. Features whose name marks them as control metadata (enable/mode/
// alarm) are rejected outright. A pwm-named feature is probed through the
// well-known value keys first; otherwise all pwm-like sub-keys are scanned in
// sorted order for determinism.
func firstFanDutyValue(featureName string, feature map[string]any) (float64, bool) {
	isControlMeta := func(s string) bool {
		return strings.Contains(s, "enable") || strings.Contains(s, "mode") || strings.Contains(s, "alarm")
	}
	featureName = strings.ToLower(strings.TrimSpace(featureName))
	if isControlMeta(featureName) {
		return 0, false
	}
	if strings.Contains(featureName, "pwm") {
		for _, key := range []string{"input", "value", "current"} {
			raw, present := feature[key]
			if !present {
				continue
			}
			if duty, parsed := parseFanDutyValue(raw); parsed {
				return duty, true
			}
		}
	}
	keys := make([]string, 0, len(feature))
	for key := range feature {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		lower := strings.ToLower(key)
		if !strings.Contains(lower, "pwm") || isControlMeta(lower) {
			continue
		}
		if duty, parsed := parseFanDutyValue(feature[key]); parsed {
			return duty, true
		}
	}
	return 0, false
}
// parseFanDutyValue coerces a sensors JSON value (JSON number, or a numeric
// string) into a duty-cycle percentage via normalizePWMAsDutyPct.
func parseFanDutyValue(value any) (float64, bool) {
	switch v := value.(type) {
	case float64:
		return normalizePWMAsDutyPct(v)
	case string:
		f, err := strconv.ParseFloat(strings.TrimSpace(v), 64)
		if err != nil {
			return 0, false
		}
		return normalizePWMAsDutyPct(f)
	default:
		return 0, false
	}
}
// normalizePWMAsDutyPct maps a raw PWM reading onto a 0-100 duty-cycle
// percentage. Values already in [0, 100] are taken as literal percentages;
// values in (100, 255] are treated as 8-bit PWM counts and rescaled; anything
// else (including NaN, which fails every comparison) is rejected.
func normalizePWMAsDutyPct(raw float64) (float64, bool) {
	switch {
	case raw < 0:
		return 0, false
	case raw <= 100:
		return raw, true
	case raw <= 255:
		return raw / 255.0 * 100.0, true
	default:
		return 0, false
	}
}
func firstFanInputValue(feature map[string]any) (float64, bool) { func firstFanInputValue(feature map[string]any) (float64, bool) {
keys := make([]string, 0, len(feature)) keys := make([]string, 0, len(feature))
for key := range feature { for key := range feature {

View File

@@ -29,6 +29,27 @@ func TestFirstFanInputValue(t *testing.T) {
} }
} }
// TestParseFanDutyCyclePctSensorsJSON feeds a representative `sensors -j`
// document through the duty-cycle parser.
//
// Expected mean: pwm1 input 128 is above 100, so it is scaled as an 8-bit PWM
// count (128/255 ≈ 50.2%); pwm2 input 64 is within [0, 100], so it is taken
// as a literal percentage (64%); fan1 (an RPM tachometer) and pwm1_enable
// (control metadata) are both ignored. (50.2 + 64) / 2 ≈ 57.1.
func TestParseFanDutyCyclePctSensorsJSON(t *testing.T) {
	raw := []byte(`{
		"chip0": {
			"fan1": {"input": 9000},
			"pwm1": {"input": 128},
			"pwm1_enable": {"input": 1}
		},
		"chip1": {
			"pwm2": {"input": 64}
		}
	}`)
	got, ok := parseFanDutyCyclePctSensorsJSON(raw)
	if !ok {
		t.Fatalf("expected duty cycle telemetry to be parsed")
	}
	if got < 57 || got > 58 {
		t.Fatalf("got=%v want ~57.1", got)
	}
}
func TestParseDCMIPowerReading(t *testing.T) { func TestParseDCMIPowerReading(t *testing.T) {
raw := ` raw := `
Instantaneous power reading: 512 Watts Instantaneous power reading: 512 Watts

View File

@@ -36,6 +36,16 @@ var apiListNvidiaGPUStatuses = func(a *app.App) ([]platform.NvidiaGPUStatus, err
return a.ListNvidiaGPUStatuses() return a.ListNvidiaGPUStatuses()
} }
// Default task-queue priorities, one tier per workflow stage.
// NOTE(review): TestDefaultTaskPriorityOrder asserts install-to-ram > audit >
// validate > validate-stress > burn > benchmark, i.e. larger values appear to
// be scheduled first — confirm against the queue's ordering logic, which is
// not visible here.
const (
	taskPriorityBenchmark      = 10
	taskPriorityBurn           = 20
	taskPriorityValidateStress = 30
	taskPriorityValidate       = 40
	taskPriorityAudit          = 50
	taskPriorityInstallToRAM   = 60
	taskPriorityInstall        = 70
)
// ── Job ID counter ──────────────────────────────────────────────────────────── // ── Job ID counter ────────────────────────────────────────────────────────────
var jobCounter atomic.Uint64 var jobCounter atomic.Uint64
@@ -109,6 +119,30 @@ func shouldSplitHomogeneousNvidiaTarget(target string) bool {
} }
} }
// defaultTaskPriority maps a queue target name to its default scheduling
// priority tier. Validation-style targets are promoted to the stress tier
// when params.StressMode is set; unrecognized targets fall back to 0.
func defaultTaskPriority(target string, params taskParams) int {
	switch strings.TrimSpace(target) {
	case "install":
		return taskPriorityInstall
	case "install-to-ram":
		return taskPriorityInstallToRAM
	case "audit":
		return taskPriorityAudit
	case "nvidia-benchmark":
		return taskPriorityBenchmark
	// Burn-in style workloads share one tier.
	case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
		return taskPriorityBurn
	// Validation targets: StressMode bumps them into the stress tier.
	case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
		"nvidia-interconnect", "nvidia-bandwidth", "memory", "storage", "cpu",
		"amd", "amd-mem", "amd-bandwidth":
		if params.StressMode {
			return taskPriorityValidateStress
		}
		return taskPriorityValidate
	default:
		return 0
	}
}
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) { func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
if len(gpus) == 0 { if len(gpus) == 0 {
return nil, fmt.Errorf("no NVIDIA GPUs detected") return nil, fmt.Errorf("no NVIDIA GPUs detected")
@@ -458,6 +492,7 @@ func (h *handler) handleAPIAuditRun(w http.ResponseWriter, _ *http.Request) {
ID: newJobID("audit"), ID: newJobID("audit"),
Name: "Audit", Name: "Audit",
Target: "audit", Target: "audit",
Priority: defaultTaskPriority("audit", taskParams{}),
Status: TaskPending, Status: TaskPending,
CreatedAt: time.Now(), CreatedAt: time.Now(),
} }
@@ -526,7 +561,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
DisplayName: body.DisplayName, DisplayName: body.DisplayName,
PlatformComponents: body.PlatformComponents, PlatformComponents: body.PlatformComponents,
} }
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target) tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "sat-"+target)
if err != nil { if err != nil {
writeError(w, http.StatusBadRequest, err.Error()) writeError(w, http.StatusBadRequest, err.Error())
return return
@@ -613,7 +648,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
ID: newJobID("benchmark-nvidia"), ID: newJobID("benchmark-nvidia"),
Name: stepName, Name: stepName,
Target: "nvidia-benchmark", Target: "nvidia-benchmark",
Priority: 15, Priority: defaultTaskPriority("nvidia-benchmark", taskParams{}),
Status: TaskPending, Status: TaskPending,
CreatedAt: now, CreatedAt: now,
params: taskParams{ params: taskParams{
@@ -645,7 +680,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
name = fmt.Sprintf("%s · sequential", name) name = fmt.Sprintf("%s · sequential", name)
} }
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{ params := taskParams{
GPUIndices: body.GPUIndices, GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices, ExcludeGPUIndices: body.ExcludeGPUIndices,
SizeMB: body.SizeMB, SizeMB: body.SizeMB,
@@ -653,7 +688,8 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
RunNCCL: runNCCL, RunNCCL: runNCCL,
ParallelGPUs: parallelGPUs, ParallelGPUs: parallelGPUs,
DisplayName: body.DisplayName, DisplayName: body.DisplayName,
}, name, h.opts.App, "benchmark-nvidia") }
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", defaultTaskPriority("nvidia-benchmark", params), time.Now(), params, name, h.opts.App, "benchmark-nvidia")
if err != nil { if err != nil {
writeError(w, http.StatusBadRequest, err.Error()) writeError(w, http.StatusBadRequest, err.Error())
return return
@@ -1054,7 +1090,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
ID: newJobID("install-to-ram"), ID: newJobID("install-to-ram"),
Name: "Install to RAM", Name: "Install to RAM",
Target: "install-to-ram", Target: "install-to-ram",
Priority: 10, Priority: defaultTaskPriority("install-to-ram", taskParams{}),
Status: TaskPending, Status: TaskPending,
CreatedAt: time.Now(), CreatedAt: time.Now(),
} }
@@ -1169,7 +1205,7 @@ func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) {
ID: newJobID("install"), ID: newJobID("install"),
Name: "Install to Disk", Name: "Install to Disk",
Target: "install", Target: "install",
Priority: 20, Priority: defaultTaskPriority("install", taskParams{}),
Status: TaskPending, Status: TaskPending,
CreatedAt: time.Now(), CreatedAt: time.Now(),
params: taskParams{ params: taskParams{
@@ -1461,4 +1497,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
} }
return nil return nil
} }

View File

@@ -39,6 +39,9 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" { if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
t.Fatalf("burn profile=%q want smoke", got) t.Fatalf("burn profile=%q want smoke", got)
} }
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
t.Fatalf("priority=%d want %d", got, taskPriorityValidate)
}
} }
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) { func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
@@ -84,6 +87,9 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
if task.params.RunNCCL { if task.params.RunNCCL {
t.Fatal("RunNCCL should reflect explicit false from request") t.Fatal("RunNCCL should reflect explicit false from request")
} }
if task.Priority != taskPriorityBenchmark {
t.Fatalf("priority=%d want %d", task.Priority, taskPriorityBenchmark)
}
} }
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) { func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
@@ -133,6 +139,12 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 { if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
t.Fatalf("task[1] gpu indices=%v want [2]", got) t.Fatalf("task[1] gpu indices=%v want [2]", got)
} }
if got := globalQueue.tasks[0].Priority; got != taskPriorityBenchmark {
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityBenchmark)
}
if got := globalQueue.tasks[1].Priority; got != taskPriorityBenchmark {
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityBenchmark)
}
} }
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) { func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
@@ -175,6 +187,39 @@ func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 { if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
t.Fatalf("task[1] gpu indices=%v want [2]", got) t.Fatalf("task[1] gpu indices=%v want [2]", got)
} }
if got := globalQueue.tasks[0].Priority; got != taskPriorityValidate {
t.Fatalf("task[0] priority=%d want %d", got, taskPriorityValidate)
}
if got := globalQueue.tasks[1].Priority; got != taskPriorityValidate {
t.Fatalf("task[1] priority=%d want %d", got, taskPriorityValidate)
}
}
func TestDefaultTaskPriorityOrder(t *testing.T) {
got := []int{
defaultTaskPriority("install-to-ram", taskParams{}),
defaultTaskPriority("audit", taskParams{}),
defaultTaskPriority("cpu", taskParams{}),
defaultTaskPriority("cpu", taskParams{StressMode: true}),
defaultTaskPriority("nvidia-stress", taskParams{}),
defaultTaskPriority("nvidia-benchmark", taskParams{}),
}
want := []int{
taskPriorityInstallToRAM,
taskPriorityAudit,
taskPriorityValidate,
taskPriorityValidateStress,
taskPriorityBurn,
taskPriorityBenchmark,
}
for i := range want {
if got[i] != want[i] {
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
}
}
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5]) {
t.Fatalf("priority order=%v", got)
}
} }
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) { func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {

View File

@@ -6,7 +6,7 @@ NCCL_CUDA_VERSION=13.0
NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186 NCCL_SHA256=2e6faafd2c19cffc7738d9283976a3200ea9db9895907f337f0c7e5a25563186
NCCL_TESTS_VERSION=2.13.10 NCCL_TESTS_VERSION=2.13.10
NVCC_VERSION=12.8 NVCC_VERSION=12.8
CUBLAS_VERSION=13.0.2.14-1 CUBLAS_VERSION=13.1.1.3-1
CUDA_USERSPACE_VERSION=13.0.96-1 CUDA_USERSPACE_VERSION=13.0.96-1
DCGM_VERSION=4.5.3-1 DCGM_VERSION=4.5.3-1
JOHN_JUMBO_COMMIT=67fcf9fe5a JOHN_JUMBO_COMMIT=67fcf9fe5a

View File

@@ -33,7 +33,6 @@ typedef void *CUstream;
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
#define MAX_STRESS_STREAMS 16 #define MAX_STRESS_STREAMS 16
#define MAX_CUBLAS_PROFILES 5
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u) #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u) #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
@@ -689,6 +688,8 @@ static const struct profile_desc k_profiles[] = {
#endif #endif
}; };
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
static int load_cublaslt(struct cublaslt_api *api) { static int load_cublaslt(struct cublaslt_api *api) {
memset(api, 0, sizeof(*api)); memset(api, 0, sizeof(*api));
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL); api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -1124,7 +1125,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
const char *precision_filter, const char *precision_filter,
struct stress_report *report) { struct stress_report *report) {
struct cublaslt_api cublas; struct cublaslt_api cublas;
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES]; struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
cublasLtHandle_t handle = NULL; cublasLtHandle_t handle = NULL;
CUcontext ctx = NULL; CUcontext ctx = NULL;
CUstream streams[MAX_STRESS_STREAMS] = {0}; CUstream streams[MAX_STRESS_STREAMS] = {0};
@@ -1134,7 +1135,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
int active = 0; int active = 0;
int mp_count = 0; int mp_count = 0;
int stream_count = 1; int stream_count = 1;
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0])); int profile_count = PROFILE_COUNT;
int prepared_count = 0; int prepared_count = 0;
size_t requested_budget = 0; size_t requested_budget = 0;
size_t total_budget = 0; size_t total_budget = 0;
@@ -1159,6 +1160,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
return 0; return 0;
} }
/* Count profiles matching the filter (for deciding what to run). */
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) { for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc && if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
(precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) { (precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
@@ -1172,18 +1174,31 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
return 0; return 0;
} }
/* Count all profiles active on this GPU regardless of filter.
* Used as the budget divisor so matrix sizes stay consistent whether
* running all precisions together or a single-precision phase. */
int planned_total = 0;
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
planned_total++;
}
}
if (planned_total < planned) {
planned_total = planned;
}
requested_budget = (size_t)size_mb * 1024u * 1024u; requested_budget = (size_t)size_mb * 1024u * 1024u;
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) { if (requested_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES; requested_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
} }
total_budget = clamp_budget_to_free_memory(cuda, requested_budget); total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) { if (total_budget < (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES) {
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES; total_budget = (size_t)planned_total * MIN_PROFILE_BUDGET_BYTES;
} }
if (query_multiprocessor_count(cuda, dev, &mp_count) && if (query_multiprocessor_count(cuda, dev, &mp_count) &&
cuda->cuStreamCreate && cuda->cuStreamCreate &&
cuda->cuStreamDestroy) { cuda->cuStreamDestroy) {
stream_count = choose_stream_count(mp_count, planned, total_budget, 1); stream_count = choose_stream_count(mp_count, planned_total, total_budget, 1);
} }
if (stream_count > 1) { if (stream_count > 1) {
int created = 0; int created = 0;
@@ -1196,7 +1211,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
} }
} }
report->stream_count = stream_count; report->stream_count = stream_count;
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count); per_profile_budget = total_budget / ((size_t)planned_total * (size_t)stream_count);
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) { if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
per_profile_budget = MIN_PROFILE_BUDGET_BYTES; per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
} }
@@ -1424,7 +1439,17 @@ int main(int argc, char **argv) {
ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report); ok = run_cublaslt_stress(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, precision_filter, &report);
#endif #endif
if (!ok) { if (!ok) {
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, size_mb, &report)) { if (precision_filter != NULL) {
fprintf(stderr,
"requested precision path unavailable: precision=%s device=%s cc=%d.%d\n",
precision_filter,
name,
cc_major,
cc_minor);
return 1;
}
int ptx_mb = size_mb;
if (!run_ptx_fallback(&cuda, dev, name, cc_major, cc_minor, seconds, ptx_mb, &report)) {
return 1; return 1;
} }
} }

View File

@@ -873,9 +873,37 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}" CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
echo "=== bee-gpu-burn FP4 header probe ==="
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
fp4_scale_match="$(grep -Rsnm 1 'CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
if [ -n "$fp4_type_match" ]; then
echo "fp4_header_symbol=present"
echo "$fp4_type_match"
else
echo "fp4_header_symbol=missing"
fi
if [ -n "$fp4_scale_match" ]; then
echo "fp4_scale_mode_symbol=present"
echo "$fp4_scale_match"
else
echo "fp4_scale_mode_symbol=missing"
fi
GPU_STRESS_NEED_BUILD=1 GPU_STRESS_NEED_BUILD=1
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then if [ -f "$GPU_BURN_WORKER_BIN" ]; then
GPU_STRESS_NEED_BUILD=0 GPU_STRESS_NEED_BUILD=0
for dep in \
"${BUILDER_DIR}/bee-gpu-stress.c" \
"${BUILDER_DIR}/VERSIONS"; do
if [ "$dep" -nt "$GPU_BURN_WORKER_BIN" ]; then
GPU_STRESS_NEED_BUILD=1
break
fi
done
if [ "$GPU_STRESS_NEED_BUILD" = "0" ] && \
find "${CUBLAS_CACHE}/include" "${CUBLAS_CACHE}/lib" -type f -newer "$GPU_BURN_WORKER_BIN" | grep -q .; then
GPU_STRESS_NEED_BUILD=1
fi
fi fi
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
@@ -889,6 +917,12 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
else else
echo "=== bee-gpu-burn worker up to date, skipping build ===" echo "=== bee-gpu-burn worker up to date, skipping build ==="
fi fi
echo "=== bee-gpu-burn compiled profile probe ==="
if grep -aq 'fp4_e2m1' "$GPU_BURN_WORKER_BIN"; then
echo "fp4_profile_string=present"
else
echo "fp4_profile_string=missing"
fi
fi fi
echo "=== preparing staged overlay (${BUILD_VARIANT}) ===" echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="