Split bee-bench into perf and power workflows

This commit is contained in:
Mikhail Chusavitin
2026-04-14 17:33:13 +03:00
parent 54338dbae5
commit 95124d228f
17 changed files with 718 additions and 259 deletions

View File

@@ -30,7 +30,9 @@ var (
DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log" DefaultRuntimeLogPath = DefaultExportDir + "/runtime-health.log"
DefaultTechDumpDir = DefaultExportDir + "/techdump" DefaultTechDumpDir = DefaultExportDir + "/techdump"
DefaultSATBaseDir = DefaultExportDir + "/bee-sat" DefaultSATBaseDir = DefaultExportDir + "/bee-sat"
DefaultBenchmarkBaseDir = DefaultExportDir + "/bee-benchmark" DefaultBeeBenchBaseDir = DefaultExportDir + "/bee-bench"
DefaultBeeBenchPerfDir = DefaultBeeBenchBaseDir + "/perf"
DefaultBeeBenchPowerDir = DefaultBeeBenchBaseDir + "/power"
) )
type App struct { type App struct {
@@ -567,7 +569,7 @@ func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOp
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBenchmarkBaseDir baseDir = DefaultBeeBenchPerfDir
} }
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
} }

View File

@@ -16,7 +16,7 @@ import (
"time" "time"
) )
const benchmarkVersion = "1" const benchmarkVersion = "2"
type benchmarkProfileSpec struct { type benchmarkProfileSpec struct {
Name string Name string
@@ -41,6 +41,15 @@ type benchmarkGPUInfo struct {
MultiprocessorCount int MultiprocessorCount int
} }
// benchmarkPowerCalibrationResult captures the outcome of the adaptive
// targeted_power calibration for a single GPU.
type benchmarkPowerCalibrationResult struct {
	Summary BenchmarkTelemetrySummary // telemetry summarized over the calibration window
	AppliedPowerLimitW float64 // power limit (W) left active for the main benchmark; presumably 0 when the limit was untouched — confirm with caller
	Attempts int // number of calibration cycles that were run
	Derated bool // true when the power limit was reduced below the default budget
	Completed bool // true when a calibration cycle finished
	Notes []string // human-readable notes appended to the per-GPU result
}
type benchmarkBurnProfile struct { type benchmarkBurnProfile struct {
name string name string
category string category string
@@ -78,7 +87,36 @@ var (
// to highest power draw so thermal ramp-up is gradual. // to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"} var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) { func computeCapabilityCode(raw string) int {
raw = strings.TrimSpace(raw)
if raw == "" {
return 0
}
parts := strings.SplitN(raw, ".", 2)
major, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
minor := 0
if len(parts) > 1 {
minor, _ = strconv.Atoi(strings.TrimSpace(parts[1]))
}
return major*10 + minor
}
// benchmarkSupportedPrecisions returns the ordered list of precision phases
// this GPU can run, derived from its compute capability string. fp4 is dropped
// when the capability is known and below 10.0 (NOTE(review): presumably fp4
// hardware only exists on CC >= 10.0 parts — confirm). An unknown/empty
// capability (code 0) keeps every phase.
func benchmarkSupportedPrecisions(computeCapability string) []string {
	code := computeCapabilityCode(computeCapability)
	skipFP4 := code > 0 && code < 100
	supported := make([]string, 0, len(benchmarkPrecisionPhases))
	for _, precision := range benchmarkPrecisionPhases {
		if skipFP4 && precision == "fp4" {
			continue
		}
		supported = append(supported, precision)
	}
	return supported
}
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
if len(precisions) == 0 {
precisions = append([]string(nil), benchmarkPrecisionPhases...)
}
switch spec.Name { switch spec.Name {
case NvidiaBenchmarkProfileStandard: case NvidiaBenchmarkProfileStandard:
basePhaseSec = 60 basePhaseSec = 60
@@ -90,7 +128,7 @@ func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string
basePhaseSec = 3600 basePhaseSec = 3600
mixedPhaseSec = 14400 mixedPhaseSec = 14400
default: default:
totalWeight := len(benchmarkPrecisionPhases) + 5 totalWeight := len(precisions) + 5
if totalWeight <= 0 { if totalWeight <= 0 {
return nil, nil, 0, 0 return nil, nil, 0, 0
} }
@@ -100,9 +138,9 @@ func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string
} }
mixedPhaseSec = basePhaseSec * 5 mixedPhaseSec = basePhaseSec * 5
} }
planLabels = make([]string, 0, len(benchmarkPrecisionPhases)+1) planLabels = make([]string, 0, len(precisions)+1)
planPhases = make([]benchmarkPlannedPhase, 0, len(benchmarkPrecisionPhases)+1) planPhases = make([]benchmarkPlannedPhase, 0, len(precisions)+1)
for _, prec := range benchmarkPrecisionPhases { for _, prec := range precisions {
planLabels = append(planLabels, prec) planLabels = append(planLabels, prec)
planPhases = append(planPhases, benchmarkPlannedPhase{ planPhases = append(planPhases, benchmarkPlannedPhase{
PlanLabel: prec, PlanLabel: prec,
@@ -127,6 +165,53 @@ func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
return strings.Join(values, ",") return strings.Join(values, ",")
} }
// benchmarkPlannedPhaseStatus classifies the raw log output of one planned
// precision phase. It returns a status string ("OK", "FAILED", or
// "UNSUPPORTED") and a short human-readable note; the note is empty when the
// phase succeeded. Matching is case-insensitive.
func benchmarkPlannedPhaseStatus(raw []byte) (string, string) {
	text := strings.ToLower(strings.TrimSpace(string(raw)))
	if text == "" {
		return "FAILED", "phase produced no output"
	}
	// Both failure markers share the same "unsupported" refinement, so the
	// check is computed once instead of being duplicated per branch. (The
	// former extra match on "cublaslt_profiles=unsupported" was redundant:
	// any text containing it already contains "unsupported".)
	unsupported := strings.Contains(text, "unsupported") || strings.Contains(text, "not supported")
	switch {
	case strings.Contains(text, "phase_error="), strings.Contains(text, "status=failed"):
		if unsupported {
			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
		}
		return "FAILED", "precision phase failed"
	default:
		return "OK", ""
	}
}
// benchmarkCalibrationThrottleReason inspects the throttle-counter delta over a
// calibration window and names the first throttle class that fired, checked in
// priority order: hardware thermal, software thermal, hardware power brake.
// An empty string means none of those counters advanced.
func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters) string {
	delta := diffThrottleCounters(before, after)
	if delta.HWThermalSlowdownUS > 0 {
		return "hw_thermal"
	}
	if delta.SWThermalSlowdownUS > 0 {
		return "sw_thermal"
	}
	if delta.HWPowerBrakeSlowdownUS > 0 {
		return "hw_power_brake"
	}
	return ""
}
// setBenchmarkPowerLimit applies a software power limit (in watts) to a single
// GPU via `nvidia-smi -pl`. Non-positive limits are rejected up front. On
// failure, the tool's trimmed output is folded into the returned error so
// callers can surface nvidia-smi's own diagnostics.
func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, powerLimitW int) error {
	if powerLimitW <= 0 {
		return fmt.Errorf("invalid power limit %d", powerLimitW)
	}
	label := fmt.Sprintf("gpu-%d-set-power-limit-%dw", gpuIndex, powerLimitW)
	args := []string{"nvidia-smi", "-i", strconv.Itoa(gpuIndex), "-pl", strconv.Itoa(powerLimitW)}
	out, err := runSATCommandCtx(ctx, verboseLog, label, args, nil, nil)
	if err != nil {
		return fmt.Errorf("set power limit gpu=%d limit=%dw: %w (%s)", gpuIndex, powerLimitW, err, strings.TrimSpace(string(out)))
	}
	return nil
}
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if ctx == nil { if ctx == nil {
ctx = context.Background() ctx = context.Background()
@@ -135,7 +220,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
logFunc = func(string) {} logFunc = func(string) {}
} }
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = "/var/log/bee-benchmark" baseDir = "/var/log/bee-bench/perf"
} }
spec := resolveBenchmarkProfile(opts.Profile) spec := resolveBenchmarkProfile(opts.Profile)
opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts) opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
@@ -149,7 +234,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
} }
ts := time.Now().UTC().Format("20060102-150405") ts := time.Now().UTC().Format("20060102-150405")
runDir := filepath.Join(baseDir, "gpu-benchmark-"+ts) runDir := filepath.Join(baseDir, "perf-"+ts)
if err := os.MkdirAll(runDir, 0755); err != nil { if err := os.MkdirAll(runDir, 0755); err != nil {
return "", fmt.Errorf("mkdir %s: %w", runDir, err) return "", fmt.Errorf("mkdir %s: %w", runDir, err)
} }
@@ -175,6 +260,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected))) logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
var metricRows []GPUMetricRow var metricRows []GPUMetricRow
metricTimelineSec := 0.0
gpuBurnLog := filepath.Join(runDir, "gpu-burn.log") gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
// Server power characterization state — populated during per-GPU phases. // Server power characterization state — populated during per-GPU phases.
@@ -215,14 +301,23 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
// Power calibration: run dcgmi targeted_power while sampling nvidia-smi power. // Power calibration: run dcgmi targeted_power while sampling nvidia-smi power.
// Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore. // Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore.
calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc) calibByIndex, powerRestoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
restoreActions = append(restoreActions, powerRestoreActions...)
for _, idx := range selected {
if calib, ok := calibByIndex[idx]; ok && calib.Derated && calib.AppliedPowerLimitW > 0 {
result.Warnings = append(result.Warnings, fmt.Sprintf(
"GPU %d could not complete targeted_power at its default server power budget; benchmark ran at reduced power limit %.0f W.",
idx, calib.AppliedPowerLimitW,
))
}
}
// Start background CPU load sampler — samples every 10s during GPU phases. // Start background CPU load sampler — samples every 10s during GPU phases.
cpuStopCh := make(chan struct{}) cpuStopCh := make(chan struct{})
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10) cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
if opts.ParallelGPUs { if opts.ParallelGPUs {
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, gpuBurnLog) runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, &metricTimelineSec, gpuBurnLog)
} else { } else {
for _, idx := range selected { for _, idx := range selected {
@@ -242,8 +337,12 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
} }
if w, ok := calibPowerByIndex[idx]; ok && w > 0 { if calib, ok := calibByIndex[idx]; ok {
gpuResult.CalibratedPeakPowerW = w gpuResult.CalibratedPeakPowerW = calib.Summary.P95PowerW
gpuResult.CalibratedPeakTempC = calib.Summary.P95TempC
gpuResult.PowerCalibrationTries = calib.Attempts
gpuResult.PowerLimitDerated = calib.Derated
gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
} }
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -255,7 +354,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error()) gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
} }
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows) gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx)) appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx), &metricTimelineSec, float64(spec.BaselineSec))
// Sample server idle power once (first GPU only — server state is global). // Sample server idle power once (first GPU only — server state is global).
if !serverIdleOK { if !serverIdleOK {
@@ -274,18 +373,23 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
} }
logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec)) logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc) warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx)) appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx), &metricTimelineSec, float64(spec.WarmupSec))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut)
if warmupErr != nil { if warmupErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error()) gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
continue continue
} }
warmupParse := parseBenchmarkBurnLog(string(warmupOut))
if gpuResult.ComputeCapability == "" {
gpuResult.ComputeCapability = warmupParse.ComputeCapability
}
// Run synthetic precision phases and the combined steady phase as one // Run synthetic precision phases and the combined steady phase as one
// uninterrupted command so the GPU stays hot between windows. // uninterrupted command so the GPU stays hot between windows.
eccBase, _ := queryECCCounters(idx) eccBase, _ := queryECCCounters(idx)
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string { supportedPrecisions := benchmarkSupportedPrecisions(gpuResult.ComputeCapability)
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
if label == "mixed" { if label == "mixed" {
return fmt.Sprintf("gpu-%d-steady", idx) return fmt.Sprintf("gpu-%d-steady", idx)
} }
@@ -299,24 +403,27 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
"--precision-plan", strings.Join(planLabels, ","), "--precision-plan", strings.Join(planLabels, ","),
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
} }
logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec)) logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc) _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
for _, phaseSpec := range planPhases { for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage) appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
} }
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel]) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
} }
for _, prec := range benchmarkPrecisionPhases { for _, prec := range supportedPrecisions {
stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec) stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
phaseRows := phaseRowsByStage[stageName] phaseRows := phaseRowsByStage[stageName]
if len(phaseRows) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase{ phase := BenchmarkPrecisionSteadyPhase{
Precision: prec, Precision: prec,
Status: "OK",
Steady: summarizeBenchmarkTelemetry(phaseRows), Steady: summarizeBenchmarkTelemetry(phaseRows),
} }
if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
phase.Status = status
phase.Notes = note
gpuResult.PrecisionFailures = append(gpuResult.PrecisionFailures, prec+":"+status)
}
for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles { for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
if p.Supported { if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec phase.TeraOpsPerSec += p.TeraOpsPerSec
@@ -396,13 +503,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error()) gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
} }
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows) gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx)) appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
} }
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if planErr != nil { if planErr != nil {
gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
} else if len(gpuResult.PrecisionFailures) > 0 {
gpuResult.Status = "PARTIAL"
} else if parseResult.Fallback { } else if parseResult.Fallback {
gpuResult.Status = "PARTIAL" gpuResult.Status = "PARTIAL"
} else { } else {
@@ -929,35 +1038,34 @@ func sampleBenchmarkCoolingSample() benchmarkCoolingSample {
} }
} }
func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset float64) []GPUMetricRow { func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset, durationSec float64) []GPUMetricRow {
if len(rows) == 0 { if len(rows) == 0 {
return nil return nil
} }
stageEnd := offset + durationSec
if stageEnd <= offset {
stageEnd = offset
for _, row := range rows {
if row.ElapsedSec+offset > stageEnd {
stageEnd = row.ElapsedSec + offset
}
}
}
out := make([]GPUMetricRow, len(rows)) out := make([]GPUMetricRow, len(rows))
for i, row := range rows { for i, row := range rows {
row.Stage = stage row.Stage = stage
row.ElapsedSec += offset row.ElapsedSec += offset
row.StageStartSec = offset
row.StageEndSec = stageEnd
out[i] = row out[i] = row
} }
return out return out
} }
func benchmarkMetricOffset(rows []GPUMetricRow) float64 { func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string, cursor *float64, durationSec float64) {
if len(rows) == 0 { annotated := annotateBenchmarkMetricRows(rows, stage, *cursor, durationSec)
return 0
}
var maxElapsed float64
for _, row := range rows {
if row.ElapsedSec > maxElapsed {
maxElapsed = row.ElapsedSec
}
}
return maxElapsed
}
func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string) {
annotated := annotateBenchmarkMetricRows(rows, stage, benchmarkMetricOffset(*allRows))
*allRows = append(*allRows, annotated...) *allRows = append(*allRows, annotated...)
*cursor += durationSec
} }
func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) { func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) {
@@ -1308,6 +1416,9 @@ func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStat
if normalizationStatus != "full" { if normalizationStatus != "full" {
reasons = append(reasons, "normalization_partial") reasons = append(reasons, "normalization_partial")
} }
if gpu.PowerLimitDerated {
reasons = append(reasons, "power_limit_derated")
}
if gpu.ECC.Uncorrected > 0 { if gpu.ECC.Uncorrected > 0 {
reasons = append(reasons, "ecc_uncorrected_errors") reasons = append(reasons, "ecc_uncorrected_errors")
} }
@@ -1522,12 +1633,17 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index)) findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
case "normalization_partial": case "normalization_partial":
findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index)) findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
case "power_limit_derated":
findings = append(findings, fmt.Sprintf("GPU %d could not sustain targeted_power in this server at the default limit; benchmark ran derated at %.0f W.", gpu.Index, gpu.PowerLimitW))
case "ecc_uncorrected_errors": case "ecc_uncorrected_errors":
findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected)) findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected))
case "ecc_corrected_errors": case "ecc_corrected_errors":
findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected)) findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
} }
} }
if len(gpu.PrecisionFailures) > 0 {
findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", ")))
}
if gpu.Backend == "driver-ptx" { if gpu.Backend == "driver-ptx" {
findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index)) findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
} }
@@ -1896,10 +2012,11 @@ func runNvidiaBenchmarkParallel(
spec benchmarkProfileSpec, spec benchmarkProfileSpec,
logFunc func(string), logFunc func(string),
result *NvidiaBenchmarkResult, result *NvidiaBenchmarkResult,
calibPowerByIndex map[int]float64, calibByIndex map[int]benchmarkPowerCalibrationResult,
serverIdleW *float64, serverLoadedWSum *float64, serverIdleW *float64, serverLoadedWSum *float64,
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int, serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
allMetricRows *[]GPUMetricRow, allMetricRows *[]GPUMetricRow,
metricTimelineSec *float64,
gpuBurnLog string, gpuBurnLog string,
) { ) {
allDevices := joinIndexList(selected) allDevices := joinIndexList(selected)
@@ -1920,8 +2037,12 @@ func runNvidiaBenchmarkParallel(
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
} }
if w, ok := calibPowerByIndex[idx]; ok && w > 0 { if calib, ok := calibByIndex[idx]; ok {
r.CalibratedPeakPowerW = w r.CalibratedPeakPowerW = calib.Summary.P95PowerW
r.CalibratedPeakTempC = calib.Summary.P95TempC
r.PowerCalibrationTries = calib.Attempts
r.PowerLimitDerated = calib.Derated
r.Notes = append(r.Notes, calib.Notes...)
} }
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
@@ -1941,7 +2062,7 @@ func runNvidiaBenchmarkParallel(
perGPU := filterRowsByGPU(baselineRows, idx) perGPU := filterRowsByGPU(baselineRows, idx)
gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
} }
appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline") appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline", metricTimelineSec, float64(spec.BaselineSec))
// Sample server idle power once. // Sample server idle power once.
if !*serverIdleOK { if !*serverIdleOK {
@@ -1961,13 +2082,25 @@ func runNvidiaBenchmarkParallel(
} }
logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec)) logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc) warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup") appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup", metricTimelineSec, float64(spec.WarmupSec))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut)
if warmupErr != nil { if warmupErr != nil {
for _, idx := range selected { for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error()) gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
} }
} }
warmupParseByGPU := parseBenchmarkBurnLogByGPU(string(warmupOut))
supportedPrecisions := append([]string(nil), benchmarkPrecisionPhases...)
for _, idx := range selected {
if pr, ok := warmupParseByGPU[idx]; ok && pr.ComputeCapability != "" {
if gpuResults[idx].ComputeCapability == "" {
gpuResults[idx].ComputeCapability = pr.ComputeCapability
}
if ccPrecisions := benchmarkSupportedPrecisions(pr.ComputeCapability); len(ccPrecisions) < len(supportedPrecisions) {
supportedPrecisions = ccPrecisions
}
}
}
// Run synthetic precision phases and the combined steady phase as one // Run synthetic precision phases and the combined steady phase as one
// uninterrupted command so the GPUs stay hot between windows. // uninterrupted command so the GPUs stay hot between windows.
@@ -1975,7 +2108,7 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected { for _, idx := range selected {
eccBase[idx], _ = queryECCCounters(idx) eccBase[idx], _ = queryECCCounters(idx)
} }
planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string { planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
if label == "mixed" { if label == "mixed" {
return "steady" return "steady"
} }
@@ -1989,30 +2122,30 @@ func runNvidiaBenchmarkParallel(
"--precision-plan", strings.Join(planLabels, ","), "--precision-plan", strings.Join(planLabels, ","),
"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
} }
logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec)) logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc) _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
for _, phaseSpec := range planPhases { for _, phaseSpec := range planPhases {
if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage) appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec))
} }
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel]) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
} }
for _, prec := range benchmarkPrecisionPhases { for _, prec := range supportedPrecisions {
phaseLogName := "gpu-all-steady-" + prec phaseLogName := "gpu-all-steady-" + prec
phaseRows := phaseRowsByStage[phaseLogName] phaseRows := phaseRowsByStage[phaseLogName]
if len(phaseRows) == 0 {
continue
}
parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec])) parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec]))
for _, idx := range selected { for _, idx := range selected {
perGPU := filterRowsByGPU(phaseRows, idx) perGPU := filterRowsByGPU(phaseRows, idx)
if len(perGPU) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase{ phase := BenchmarkPrecisionSteadyPhase{
Precision: prec, Precision: prec,
Status: "OK",
Steady: summarizeBenchmarkTelemetry(perGPU), Steady: summarizeBenchmarkTelemetry(perGPU),
} }
if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
phase.Status = status
phase.Notes = note
gpuResults[idx].PrecisionFailures = append(gpuResults[idx].PrecisionFailures, prec+":"+status)
}
if pr, ok := parseByGPU[idx]; ok { if pr, ok := parseByGPU[idx]; ok {
for _, p := range pr.Profiles { for _, p := range pr.Profiles {
if p.Supported { if p.Supported {
@@ -2113,7 +2246,7 @@ func runNvidiaBenchmarkParallel(
perGPU := filterRowsByGPU(cooldownRows, idx) perGPU := filterRowsByGPU(cooldownRows, idx)
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
} }
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown") appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown", metricTimelineSec, float64(spec.CooldownSec))
} }
// Score and finalize each GPU. // Score and finalize each GPU.
@@ -2125,6 +2258,8 @@ func runNvidiaBenchmarkParallel(
switch { switch {
case planErr != nil: case planErr != nil:
r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
case len(r.PrecisionFailures) > 0:
r.Status = "PARTIAL"
case pr.Fallback: case pr.Fallback:
r.Status = "PARTIAL" r.Status = "PARTIAL"
default: default:
@@ -2299,59 +2434,172 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
return cl return cl
} }
// runBenchmarkPowerCalibration runs a short dcgmi targeted_power test while // runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches
// collecting nvidia-smi power samples in parallel. It returns a map from GPU // throttle counters. If a GPU starts throttling, the current targeted_power run
// index to p95 observed power (watts), which is used as the reference for // is canceled immediately, the power limit is reduced, and a fresh full cycle
// PowerSustainScore instead of the hardware default limit. // is started again from the beginning. The selected reduced power limit stays
// // active for the main benchmark and is restored by the caller afterwards.
// If dcgmi is unavailable or the run fails the function returns an empty map
// and the caller falls back to DefaultPowerLimitW. The calibration is skipped
// gracefully — it must never block or fail the main benchmark.
func runBenchmarkPowerCalibration( func runBenchmarkPowerCalibration(
ctx context.Context, ctx context.Context,
verboseLog, runDir string, verboseLog, runDir string,
gpuIndices []int, gpuIndices []int,
infoByIndex map[int]benchmarkGPUInfo,
logFunc func(string), logFunc func(string),
) map[int]float64 { ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
const calibDurationSec = 120 const calibDurationSec = 120
const derateStepW = 25
const maxDerateW = 150
// dcgmi must be present.
if _, err := exec.LookPath("dcgmi"); err != nil { if _, err := exec.LookPath("dcgmi"); err != nil {
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)") logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
return map[int]float64{} return map[int]benchmarkPowerCalibrationResult{}, nil
} }
logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices))) canDerate := os.Geteuid() == 0
if !canDerate {
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices) logFunc("power calibration: root privileges unavailable, adaptive power-limit derating disabled")
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
if err != nil {
logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))
return map[int]float64{}
} }
// Group rows by GPU index and compute p95 power for each. type calibrationAttemptResult struct {
result := make(map[int]float64, len(gpuIndices)) out []byte
rows []GPUMetricRow
err error
}
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
var restore []benchmarkRestoreAction
for _, idx := range gpuIndices { for _, idx := range gpuIndices {
perGPU := filterRowsByGPU(rows, idx) info := infoByIndex[idx]
if len(perGPU) == 0 { originalLimitW := int(math.Round(info.PowerLimitW))
continue if originalLimitW <= 0 {
originalLimitW = int(math.Round(info.DefaultPowerLimitW))
} }
powers := make([]float64, 0, len(perGPU)) defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
for _, r := range perGPU { if defaultLimitW <= 0 {
if r.PowerW > 0 { defaultLimitW = originalLimitW
powers = append(powers, r.PowerW) }
appliedLimitW := originalLimitW
if appliedLimitW <= 0 {
appliedLimitW = defaultLimitW
}
minLimitW := appliedLimitW
switch {
case defaultLimitW > 0:
minLimitW = defaultLimitW - maxDerateW
floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
if minLimitW < floorByRatio {
minLimitW = floorByRatio
} }
case appliedLimitW > 0:
minLimitW = appliedLimitW - maxDerateW
} }
if len(powers) == 0 { if minLimitW < derateStepW {
continue minLimitW = derateStepW
} }
p95 := benchmarkPercentile(powers, 95)
if p95 > 0 { calib := benchmarkPowerCalibrationResult{
result[idx] = p95 AppliedPowerLimitW: float64(appliedLimitW),
logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers))) }
if canDerate && originalLimitW > 0 {
idxCopy := idx
orig := originalLimitW
restore = append(restore, benchmarkRestoreAction{
name: fmt.Sprintf("gpu-%d-restore-power-limit", idxCopy),
fn: func() {
_ = setBenchmarkPowerLimit(context.Background(), verboseLog, idxCopy, orig)
},
})
}
for {
calib.Attempts++
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", idx, calib.Attempts, appliedLimitW, calibDurationSec))
beforeThrottle, _ := queryThrottleCounters(idx)
attemptCtx, cancel := context.WithCancel(ctx)
doneCh := make(chan calibrationAttemptResult, 1)
logName := fmt.Sprintf("power-calibration-gpu-%d-attempt-%d.log", idx, calib.Attempts)
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, []int{idx})
go func() {
out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, []int{idx}, logFunc)
doneCh <- calibrationAttemptResult{out: out, rows: rows, err: err}
}()
ticker := time.NewTicker(time.Second)
var (
attempt calibrationAttemptResult
throttleReason string
)
attemptLoop:
for {
select {
case attempt = <-doneCh:
break attemptLoop
case <-ticker.C:
afterThrottle, err := queryThrottleCounters(idx)
if err != nil {
continue
}
if reason := benchmarkCalibrationThrottleReason(beforeThrottle, afterThrottle); reason != "" {
throttleReason = reason
cancel()
}
case <-ctx.Done():
cancel()
attempt = <-doneCh
break attemptLoop
}
}
ticker.Stop()
cancel()
_ = os.WriteFile(filepath.Join(runDir, logName), attempt.out, 0644)
perGPU := filterRowsByGPU(attempt.rows, idx)
summary := summarizeBenchmarkTelemetry(perGPU)
if throttleReason == "" && attempt.err == nil && summary.P95PowerW > 0 {
calib.Summary = summary
calib.Completed = true
calib.AppliedPowerLimitW = float64(appliedLimitW)
logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", idx, appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
break
}
switch {
case throttleReason != "":
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power was canceled on attempt %d after %s throttling at %d W", calib.Attempts, throttleReason, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", idx, throttleReason, appliedLimitW))
case attempt.err != nil:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", calib.Attempts, appliedLimitW, attempt.err))
logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", idx, appliedLimitW, attempt.err))
default:
calib.Notes = append(calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W produced no valid power telemetry", calib.Attempts, appliedLimitW))
logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W produced no valid telemetry", idx, calib.Attempts, appliedLimitW))
}
if !canDerate || appliedLimitW <= 0 {
break
}
nextLimitW := appliedLimitW - derateStepW
if nextLimitW < minLimitW {
calib.Notes = append(calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default/current limit", maxDerateW))
break
}
if err := setBenchmarkPowerLimit(ctx, verboseLog, idx, nextLimitW); err != nil {
calib.Notes = append(calib.Notes, "failed to lower power limit: "+err.Error())
logFunc(fmt.Sprintf("power calibration: GPU %d failed to set reduced power limit %d W: %v", idx, nextLimitW, err))
break
}
appliedLimitW = nextLimitW
calib.AppliedPowerLimitW = float64(appliedLimitW)
calib.Derated = true
info.PowerLimitW = float64(appliedLimitW)
infoByIndex[idx] = info
calib.Notes = append(calib.Notes, fmt.Sprintf("reduced power limit to %d W and restarted targeted_power from the beginning", appliedLimitW))
}
if calib.Completed || calib.Attempts > 0 || len(calib.Notes) > 0 {
results[idx] = calib
} }
} }
return result return results, restore
} }

View File

@@ -48,7 +48,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", ")) fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
} }
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile) fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion) fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC")) fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
if result.RampStep > 0 && result.RampTotal > 0 { if result.RampStep > 0 && result.RampTotal > 0 {
fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal) fmt.Fprintf(&b, "**Ramp-up step:** %d of %d \n", result.RampStep, result.RampTotal)
@@ -83,7 +83,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// ── Methodology ─────────────────────────────────────────────────────────── // ── Methodology ───────────────────────────────────────────────────────────
b.WriteString("## Methodology\n\n") b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile) fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect phases.\n", result.BenchmarkProfile)
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n") b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n") b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n") b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
@@ -170,6 +170,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
if gpu.PowerLimitW > 0 { if gpu.PowerLimitW > 0 {
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW) fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
} }
if gpu.PowerLimitDerated {
fmt.Fprintf(&b, "- **Power limit derating:** active after %d targeted_power attempt(s)\n", gpu.PowerCalibrationTries)
}
if gpu.CalibratedPeakPowerW > 0 {
if gpu.CalibratedPeakTempC > 0 {
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95 at %.1f °C p95\n", gpu.CalibratedPeakPowerW, gpu.CalibratedPeakTempC)
} else {
fmt.Fprintf(&b, "- **Power calibration (`dcgmi targeted_power`):** %.0f W p95\n", gpu.CalibratedPeakPowerW)
}
}
if gpu.LockedGraphicsClockMHz > 0 { if gpu.LockedGraphicsClockMHz > 0 {
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz) fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
} }
@@ -188,7 +198,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// Per-precision stability phases. // Per-precision stability phases.
if len(gpu.PrecisionSteady) > 0 { if len(gpu.PrecisionSteady) > 0 {
b.WriteString("**Per-precision stability:**\n\n") b.WriteString("**Per-precision stability:**\n\n")
b.WriteString("| Precision | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|----------|----------|-------------|----------|------------|\n") b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
for _, p := range gpu.PrecisionSteady { for _, p := range gpu.PrecisionSteady {
eccCorr := "—" eccCorr := "—"
eccUncorr := "—" eccUncorr := "—"
@@ -196,8 +206,12 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
eccCorr = fmt.Sprintf("%d", p.ECC.Corrected) eccCorr = fmt.Sprintf("%d", p.ECC.Corrected)
eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected) eccUncorr = fmt.Sprintf("%d", p.ECC.Uncorrected)
} }
fmt.Fprintf(&b, "| %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n", status := p.Status
p.Precision, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct, if strings.TrimSpace(status) == "" {
status = "OK"
}
fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
eccCorr, eccUncorr) eccCorr, eccUncorr)
} }
b.WriteString("\n") b.WriteString("\n")
@@ -364,6 +378,7 @@ func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64)
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string { func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
var b strings.Builder var b strings.Builder
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339)) fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile) fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus) fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs)) fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))

View File

@@ -46,6 +46,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( labels, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480}, benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, SteadySec: 480},
benchmarkPrecisionPhases,
func(label string) string { return label }, func(label string) string { return label },
) )
if len(labels) != 7 || len(phases) != 7 { if len(labels) != 7 || len(phases) != 7 {
@@ -70,6 +71,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( _, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600}, benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, SteadySec: 3600},
benchmarkPrecisionPhases,
func(label string) string { return label }, func(label string) string { return label },
) )
if basePhaseSec != 300 { if basePhaseSec != 300 {
@@ -88,6 +90,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
_, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan( _, phases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(
benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000}, benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, SteadySec: 27000},
benchmarkPrecisionPhases,
func(label string) string { return label }, func(label string) string { return label },
) )
if basePhaseSec != 3600 { if basePhaseSec != 3600 {
@@ -127,6 +130,40 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
} }
} }
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
t.Parallel()
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
t.Fatalf("supported=%v", got)
}
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
t.Fatalf("supported=%v", got)
}
}
func TestBenchmarkPlannedPhaseStatus(t *testing.T) {
t.Parallel()
cases := []struct {
name string
raw string
wantStatus string
}{
{name: "ok", raw: "status=OK\n", wantStatus: "OK"},
{name: "failed", raw: "phase_error=fp16\n", wantStatus: "FAILED"},
{name: "unsupported", raw: "cublasLt_profiles=unsupported\nphase_error=fp4\n", wantStatus: "UNSUPPORTED"},
}
for _, tc := range cases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
got, _ := benchmarkPlannedPhaseStatus([]byte(tc.raw))
if got != tc.wantStatus {
t.Fatalf("status=%q want %q", got, tc.wantStatus)
}
})
}
}
func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) { func TestNormalizeNvidiaBenchmarkOptionsPreservesRunNCCLChoice(t *testing.T) {
t.Parallel() t.Parallel()

View File

@@ -104,6 +104,7 @@ type BenchmarkGPUResult struct {
Backend string `json:"backend,omitempty"` Backend string `json:"backend,omitempty"`
Status string `json:"status"` Status string `json:"status"`
PowerLimitW float64 `json:"power_limit_w,omitempty"` PowerLimitW float64 `json:"power_limit_w,omitempty"`
PowerLimitDerated bool `json:"power_limit_derated,omitempty"`
MultiprocessorCount int `json:"multiprocessor_count,omitempty"` MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// CalibratedPeakPowerW is the p95 power measured during a short // CalibratedPeakPowerW is the p95 power measured during a short
@@ -111,6 +112,8 @@ type BenchmarkGPUResult struct {
// Used as the reference denominator for PowerSustainScore instead of // Used as the reference denominator for PowerSustainScore instead of
// the hardware default limit, which bee-gpu-burn cannot reach. // the hardware default limit, which bee-gpu-burn cannot reach.
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"` CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
CalibratedPeakTempC float64 `json:"calibrated_peak_temp_c,omitempty"`
PowerCalibrationTries int `json:"power_calibration_tries,omitempty"`
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"` MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"` BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"` MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
@@ -119,6 +122,7 @@ type BenchmarkGPUResult struct {
Baseline BenchmarkTelemetrySummary `json:"baseline"` Baseline BenchmarkTelemetrySummary `json:"baseline"`
Steady BenchmarkTelemetrySummary `json:"steady"` Steady BenchmarkTelemetrySummary `json:"steady"`
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"` PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
PrecisionFailures []string `json:"precision_failures,omitempty"`
Cooldown BenchmarkTelemetrySummary `json:"cooldown"` Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
Throttle BenchmarkThrottleCounters `json:"throttle_counters"` Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
// ECC error delta accumulated over the full benchmark (all phases combined). // ECC error delta accumulated over the full benchmark (all phases combined).
@@ -225,13 +229,15 @@ type BenchmarkServerPower struct {
// type runs at a time the PowerCVPct here is a genuine stability signal. // type runs at a time the PowerCVPct here is a genuine stability signal.
type BenchmarkPrecisionSteadyPhase struct { type BenchmarkPrecisionSteadyPhase struct {
Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32" Precision string `json:"precision"` // e.g. "fp8", "fp16", "fp32"
Status string `json:"status,omitempty"`
Steady BenchmarkTelemetrySummary `json:"steady"` Steady BenchmarkTelemetrySummary `json:"steady"`
TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"` TeraOpsPerSec float64 `json:"teraops_per_sec,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"` WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
// ECC errors accumulated during this precision phase only. // ECC errors accumulated during this precision phase only.
// Non-zero corrected = stress-induced DRAM errors for this kernel type. // Non-zero corrected = stress-induced DRAM errors for this kernel type.
// Any uncorrected = serious fault triggered by this precision workload. // Any uncorrected = serious fault triggered by this precision workload.
ECC BenchmarkECCCounters `json:"ecc,omitempty"` ECC BenchmarkECCCounters `json:"ecc,omitempty"`
Notes string `json:"notes,omitempty"`
} }
type BenchmarkInterconnectResult struct { type BenchmarkInterconnectResult struct {

View File

@@ -14,6 +14,8 @@ import (
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test. // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
type GPUMetricRow struct { type GPUMetricRow struct {
Stage string `json:"stage,omitempty"` Stage string `json:"stage,omitempty"`
StageStartSec float64 `json:"stage_start_sec,omitempty"`
StageEndSec float64 `json:"stage_end_sec,omitempty"`
ElapsedSec float64 `json:"elapsed_sec"` ElapsedSec float64 `json:"elapsed_sec"`
GPUIndex int `json:"index"` GPUIndex int `json:"index"`
TempC float64 `json:"temp_c"` TempC float64 `json:"temp_c"`
@@ -509,11 +511,22 @@ func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
if name == "" { if name == "" {
name = "run" name = "run"
} }
start := row.StageStartSec
end := row.StageEndSec
if end <= start {
start = row.ElapsedSec
end = row.ElapsedSec
}
if len(spans) == 0 || spans[len(spans)-1].Name != name { if len(spans) == 0 || spans[len(spans)-1].Name != name {
spans = append(spans, gpuMetricStageSpan{Name: name, Start: row.ElapsedSec, End: row.ElapsedSec}) spans = append(spans, gpuMetricStageSpan{Name: name, Start: start, End: end})
continue continue
} }
spans[len(spans)-1].End = row.ElapsedSec if start < spans[len(spans)-1].Start {
spans[len(spans)-1].Start = start
}
if end > spans[len(spans)-1].End {
spans[len(spans)-1].End = end
}
} }
for i := range spans { for i := range spans {
if spans[i].End <= spans[i].Start { if spans[i].End <= spans[i].Start {

View File

@@ -110,7 +110,7 @@ func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
func shouldSplitHomogeneousNvidiaTarget(target string) bool { func shouldSplitHomogeneousNvidiaTarget(target string) bool {
switch strings.TrimSpace(target) { switch strings.TrimSpace(target) {
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute",
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect", "nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
"nvidia-bandwidth", "nvidia-stress": "nvidia-bandwidth", "nvidia-stress":
return true return true
@@ -127,7 +127,7 @@ func defaultTaskPriority(target string, params taskParams) int {
return taskPriorityInstallToRAM return taskPriorityInstallToRAM
case "audit": case "audit":
return taskPriorityAudit return taskPriorityAudit
case "nvidia-benchmark": case "nvidia-bench-perf", "nvidia-bench-power":
return taskPriorityBenchmark return taskPriorityBenchmark
case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute": case "nvidia-stress", "amd-stress", "memory-stress", "sat-stress", "platform-stress", "nvidia-compute":
return taskPriorityBurn return taskPriorityBurn
@@ -573,131 +573,142 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
} }
} }
func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPIBenchmarkNvidiaRunKind(target string) http.HandlerFunc {
if h.opts.App == nil { return func(w http.ResponseWriter, r *http.Request) {
writeError(w, http.StatusServiceUnavailable, "app not configured") if h.opts.App == nil {
return writeError(w, http.StatusServiceUnavailable, "app not configured")
}
var body struct {
Profile string `json:"profile"`
SizeMB int `json:"size_mb"`
GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
RunNCCL *bool `json:"run_nccl"`
ParallelGPUs *bool `json:"parallel_gpus"`
RampUp *bool `json:"ramp_up"`
DisplayName string `json:"display_name"`
}
if r.Body != nil {
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
writeError(w, http.StatusBadRequest, "invalid request body")
return return
} }
}
runNCCL := true var body struct {
if body.RunNCCL != nil { Profile string `json:"profile"`
runNCCL = *body.RunNCCL SizeMB int `json:"size_mb"`
} GPUIndices []int `json:"gpu_indices"`
parallelGPUs := false ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
if body.ParallelGPUs != nil { RunNCCL *bool `json:"run_nccl"`
parallelGPUs = *body.ParallelGPUs ParallelGPUs *bool `json:"parallel_gpus"`
} RampUp *bool `json:"ramp_up"`
rampUp := false DisplayName string `json:"display_name"`
if body.RampUp != nil { }
rampUp = *body.RampUp if r.Body != nil {
} if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
// Build a descriptive base name that includes profile and mode so the task writeError(w, http.StatusBadRequest, "invalid request body")
// list is self-explanatory without opening individual task detail pages. return
profile := strings.TrimSpace(body.Profile) }
if profile == "" { }
profile = "standard"
}
name := taskDisplayName("nvidia-benchmark", "", "")
if strings.TrimSpace(body.DisplayName) != "" {
name = body.DisplayName
}
// Append profile tag.
name = fmt.Sprintf("%s · %s", name, profile)
if rampUp && len(body.GPUIndices) > 1 { runNCCL := true
// Ramp-up mode: resolve GPU list, then create one task per prefix if body.RunNCCL != nil {
// [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel. runNCCL = *body.RunNCCL
gpus, err := apiListNvidiaGPUs(h.opts.App) }
if err != nil { parallelGPUs := false
writeError(w, http.StatusBadRequest, err.Error()) if body.ParallelGPUs != nil {
parallelGPUs = *body.ParallelGPUs
}
rampUp := false
if body.RampUp != nil {
rampUp = *body.RampUp
}
// Build a descriptive base name that includes profile and mode so the task
// list is self-explanatory without opening individual task detail pages.
profile := strings.TrimSpace(body.Profile)
if profile == "" {
profile = "standard"
}
name := taskDisplayName(target, "", "")
if strings.TrimSpace(body.DisplayName) != "" {
name = body.DisplayName
}
// Append profile tag.
name = fmt.Sprintf("%s · %s", name, profile)
if target == "nvidia-bench-power" && parallelGPUs {
writeError(w, http.StatusBadRequest, "power / thermal fit benchmark uses sequential or ramp-up modes only")
return return
} }
resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
if err != nil { if rampUp && len(body.GPUIndices) > 1 {
writeError(w, http.StatusBadRequest, err.Error()) // Ramp-up mode: resolve GPU list, then create one task per prefix
return // [gpu0], [gpu0,gpu1], ..., [gpu0,...,gpuN-1], each running in parallel.
} gpus, err := apiListNvidiaGPUs(h.opts.App)
if len(resolved) < 2 { if err != nil {
// Fall through to normal single-task path. writeError(w, http.StatusBadRequest, err.Error())
rampUp = false return
} else { }
now := time.Now() resolved, err := expandSelectedGPUIndices(gpus, body.GPUIndices, body.ExcludeGPUIndices)
rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405")) if err != nil {
var allTasks []*Task writeError(w, http.StatusBadRequest, err.Error())
for step := 1; step <= len(resolved); step++ { return
subset := resolved[:step] }
stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset)) if len(resolved) < 2 {
t := &Task{ // Fall through to normal single-task path.
ID: newJobID("benchmark-nvidia"), rampUp = false
Name: stepName, } else {
Target: "nvidia-benchmark", now := time.Now()
Priority: defaultTaskPriority("nvidia-benchmark", taskParams{}), rampRunID := fmt.Sprintf("ramp-%s", now.UTC().Format("20060102-150405"))
Status: TaskPending, var allTasks []*Task
CreatedAt: now, for step := 1; step <= len(resolved); step++ {
params: taskParams{ subset := resolved[:step]
GPUIndices: append([]int(nil), subset...), stepName := fmt.Sprintf("%s · ramp %d/%d · GPU %s", name, step, len(resolved), formatGPUIndexList(subset))
SizeMB: body.SizeMB, t := &Task{
BenchmarkProfile: body.Profile, ID: newJobID("bee-bench-nvidia"),
RunNCCL: runNCCL && step == len(resolved), Name: stepName,
ParallelGPUs: true, Target: target,
RampStep: step, Priority: defaultTaskPriority(target, taskParams{}),
RampTotal: len(resolved), Status: TaskPending,
RampRunID: rampRunID, CreatedAt: now,
DisplayName: stepName, params: taskParams{
}, GPUIndices: append([]int(nil), subset...),
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL && step == len(resolved),
ParallelGPUs: true,
RampStep: step,
RampTotal: len(resolved),
RampRunID: rampRunID,
DisplayName: stepName,
},
}
allTasks = append(allTasks, t)
} }
allTasks = append(allTasks, t) for _, t := range allTasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, allTasks)
return
} }
for _, t := range allTasks { }
globalQueue.enqueue(t)
} // For non-ramp tasks append mode tag.
writeTaskRunResponse(w, allTasks) if parallelGPUs {
name = fmt.Sprintf("%s · parallel", name)
} else {
name = fmt.Sprintf("%s · sequential", name)
}
params := taskParams{
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
ParallelGPUs: parallelGPUs,
DisplayName: body.DisplayName,
}
tasks, err := buildNvidiaTaskSet(target, defaultTaskPriority(target, params), time.Now(), params, name, h.opts.App, "bee-bench-nvidia")
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return return
} }
for _, t := range tasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, tasks)
} }
}
// For non-ramp tasks append mode tag. func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Request) {
if parallelGPUs { h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf").ServeHTTP(w, r)
name = fmt.Sprintf("%s · parallel", name)
} else {
name = fmt.Sprintf("%s · sequential", name)
}
params := taskParams{
GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices,
SizeMB: body.SizeMB,
BenchmarkProfile: body.Profile,
RunNCCL: runNCCL,
ParallelGPUs: parallelGPUs,
DisplayName: body.DisplayName,
}
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", defaultTaskPriority("nvidia-benchmark", params), time.Now(), params, name, h.opts.App, "benchmark-nvidia")
if err != nil {
writeError(w, http.StatusBadRequest, err.Error())
return
}
for _, t := range tasks {
globalQueue.enqueue(t)
}
writeTaskRunResponse(w, tasks)
} }
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) { func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {

View File

@@ -64,7 +64,7 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
t.Cleanup(func() { apiListNvidiaGPUs = prevList }) t.Cleanup(func() { apiListNvidiaGPUs = prevList })
h := &handler{opts: HandlerOptions{App: &app.App{}}} h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`)) req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
h.handleAPIBenchmarkNvidiaRun(rec, req) h.handleAPIBenchmarkNvidiaRun(rec, req)
@@ -78,8 +78,8 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks)) t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
} }
task := globalQueue.tasks[0] task := globalQueue.tasks[0]
if task.Target != "nvidia-benchmark" { if task.Target != "nvidia-bench-perf" {
t.Fatalf("target=%q want nvidia-benchmark", task.Target) t.Fatalf("target=%q want nvidia-bench-perf", task.Target)
} }
if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 { if got := task.params.GPUIndices; len(got) != 2 || got[0] != 1 || got[1] != 3 {
t.Fatalf("gpu indices=%v want [1 3]", got) t.Fatalf("gpu indices=%v want [1 3]", got)
@@ -113,7 +113,7 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
t.Cleanup(func() { apiListNvidiaGPUs = prevList }) t.Cleanup(func() { apiListNvidiaGPUs = prevList })
h := &handler{opts: HandlerOptions{App: &app.App{}}} h := &handler{opts: HandlerOptions{App: &app.App{}}}
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`)) req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/perf/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
rec := httptest.NewRecorder() rec := httptest.NewRecorder()
h.handleAPIBenchmarkNvidiaRun(rec, req) h.handleAPIBenchmarkNvidiaRun(rec, req)
@@ -147,6 +147,50 @@ func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
} }
} }
func TestHandleAPIBenchmarkPowerFitRampQueuesBenchmarkPowerFitTasks(t *testing.T) {
	// Swap the shared queue's task list for an empty one and restore it afterwards.
	globalQueue.mu.Lock()
	savedTasks := globalQueue.tasks
	globalQueue.tasks = nil
	globalQueue.mu.Unlock()
	t.Cleanup(func() {
		globalQueue.mu.Lock()
		globalQueue.tasks = savedTasks
		globalQueue.mu.Unlock()
	})
	// Stub GPU discovery with three identical H100 cards; restore on cleanup.
	savedList := apiListNvidiaGPUs
	apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
		gpus := []platform.NvidiaGPU{
			{Index: 0, Name: "NVIDIA H100 PCIe"},
			{Index: 1, Name: "NVIDIA H100 PCIe"},
			{Index: 2, Name: "NVIDIA H100 PCIe"},
		}
		return gpus, nil
	}
	t.Cleanup(func() { apiListNvidiaGPUs = savedList })
	hnd := &handler{opts: HandlerOptions{App: &app.App{}}}
	// A ramp-up power run over three GPUs should enqueue one power task per step.
	payload := strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"ramp_up":true}`)
	req := httptest.NewRequest("POST", "/api/bee-bench/nvidia/power/run", payload)
	rec := httptest.NewRecorder()
	hnd.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power").ServeHTTP(rec, req)
	if rec.Code != 200 {
		t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
	}
	globalQueue.mu.Lock()
	defer globalQueue.mu.Unlock()
	if len(globalQueue.tasks) != 3 {
		t.Fatalf("tasks=%d want 3", len(globalQueue.tasks))
	}
	// Every queued task must target the power workflow at benchmark priority.
	for i := range globalQueue.tasks {
		task := globalQueue.tasks[i]
		if task.Target != "nvidia-bench-power" {
			t.Fatalf("task[%d] target=%q", i, task.Target)
		}
		if task.Priority != taskPriorityBenchmark {
			t.Fatalf("task[%d] priority=%d want %d", i, task.Priority, taskPriorityBenchmark)
		}
	}
}
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) { func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
globalQueue.mu.Lock() globalQueue.mu.Lock()
originalTasks := globalQueue.tasks originalTasks := globalQueue.tasks
@@ -202,7 +246,8 @@ func TestDefaultTaskPriorityOrder(t *testing.T) {
defaultTaskPriority("cpu", taskParams{}), defaultTaskPriority("cpu", taskParams{}),
defaultTaskPriority("cpu", taskParams{StressMode: true}), defaultTaskPriority("cpu", taskParams{StressMode: true}),
defaultTaskPriority("nvidia-stress", taskParams{}), defaultTaskPriority("nvidia-stress", taskParams{}),
defaultTaskPriority("nvidia-benchmark", taskParams{}), defaultTaskPriority("nvidia-bench-perf", taskParams{}),
defaultTaskPriority("nvidia-bench-power", taskParams{}),
} }
want := []int{ want := []int{
taskPriorityInstallToRAM, taskPriorityInstallToRAM,
@@ -211,13 +256,14 @@ func TestDefaultTaskPriorityOrder(t *testing.T) {
taskPriorityValidateStress, taskPriorityValidateStress,
taskPriorityBurn, taskPriorityBurn,
taskPriorityBenchmark, taskPriorityBenchmark,
taskPriorityBenchmark,
} }
for i := range want { for i := range want {
if got[i] != want[i] { if got[i] != want[i] {
t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i]) t.Fatalf("priority[%d]=%d want %d", i, got[i], want[i])
} }
} }
if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5]) { if !(got[0] > got[1] && got[1] > got[2] && got[2] > got[3] && got[3] > got[4] && got[4] > got[5] && got[5] == got[6]) {
t.Fatalf("priority order=%v", got) t.Fatalf("priority order=%v", got)
} }
} }

View File

@@ -232,7 +232,7 @@ func truncate(s string, max int) string {
// isSATTarget returns true for task targets that run hardware acceptance tests. // isSATTarget returns true for task targets that run hardware acceptance tests.
func isSATTarget(target string) bool { func isSATTarget(target string) bool {
switch target { switch target {
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse", case "nvidia", "nvidia-targeted-stress", "nvidia-bench-perf", "nvidia-bench-power", "nvidia-compute", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage", "nvidia-interconnect", "nvidia-bandwidth", "nvidia-stress", "memory", "memory-stress", "storage",
"cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress", "cpu", "sat-stress", "amd", "amd-mem", "amd-bandwidth", "amd-stress",
"platform-stress": "platform-stress":

View File

@@ -1946,7 +1946,7 @@ func renderBenchmark(opts HandlerOptions) string {
<div class="grid2"> <div class="grid2">
<div class="card"> <div class="card">
<div class="card-head">NVIDIA Benchmark</div> <div class="card-head">Benchmark Setup</div>
<div class="card-body"> <div class="card-body">
<div class="form-row"> <div class="form-row">
<label>Profile</label> <label>Profile</label>
@@ -1979,21 +1979,25 @@ func renderBenchmark(opts HandlerOptions) string {
<span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span> <span>Ramp-up — 1 GPU → 2 → … → all selected (separate tasks)</span>
</label> </label>
<p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p> <p id="benchmark-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 14px">Select one GPU for single-card benchmarking or several GPUs for a constrained multi-GPU run.</p>
<button id="benchmark-run-btn" class="btn btn-primary" onclick="runNvidiaBenchmark()" disabled>&#9654; Run Benchmark</button> <div style="display:flex;gap:8px;flex-wrap:wrap;align-items:center">
<button id="benchmark-run-performance-btn" class="btn btn-primary" onclick="runNvidiaBenchmark('performance')" disabled>&#9654; Run Performance Benchmark</button>
<button id="benchmark-run-power-fit-btn" class="btn btn-secondary" onclick="runNvidiaBenchmark('power-fit')" disabled>&#9654; Run Power / Thermal Fit</button>
</div>
<span id="benchmark-run-nccl" hidden>nccl-auto</span>
<span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span> <span id="benchmark-run-status" style="margin-left:10px;font-size:12px;color:var(--muted)"></span>
</div> </div>
</div> </div>
<div class="card"> <div class="card">
<div class="card-head">Method</div> <div class="card-head">Method Split</div>
<div class="card-body"> <div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:10px">Each benchmark run performs warmup, sustained compute, telemetry capture, cooldown, and optional NCCL interconnect checks.</p> <p style="font-size:13px;color:var(--muted);margin-bottom:10px">The benchmark page now exposes two fundamentally different test families so compute score and server power-fit are not mixed into one number.</p>
<table> <table>
<tr><th>Profile</th><th>Purpose</th></tr> <tr><th>Run Type</th><th>Engine</th><th>Question</th></tr>
<tr><td>Standard</td><td>Fast, repeatable performance check for server-to-server comparison.</td></tr> <tr><td>Performance Benchmark</td><td><code>bee-gpu-burn</code></td><td>How much isolated compute performance does the GPU realize in this server?</td></tr>
<tr><td>Stability</td><td>Longer run for thermal drift, power caps, and clock instability.</td></tr> <tr><td>Power / Thermal Fit</td><td><code>dcgmi targeted_power</code></td><td>How much power per GPU can this server sustain as GPU count ramps up?</td></tr>
<tr><td>Overnight</td><td>Extended verification of long-run stability and late throttling.</td></tr>
</table> </table>
<p style="font-size:12px;color:var(--muted);margin-top:10px">Use ramp-up mode for capacity work: it creates 1 GPU → 2 GPU → … → all selected steps so analysis software can derive server total score and watts-per-GPU curves.</p>
</div> </div>
</div> </div>
</div> </div>
@@ -2036,21 +2040,24 @@ function benchmarkMode() {
function benchmarkUpdateSelectionNote() { function benchmarkUpdateSelectionNote() {
const selected = benchmarkSelectedGPUIndices(); const selected = benchmarkSelectedGPUIndices();
const btn = document.getElementById('benchmark-run-btn'); const perfBtn = document.getElementById('benchmark-run-performance-btn');
const fitBtn = document.getElementById('benchmark-run-power-fit-btn');
const note = document.getElementById('benchmark-selection-note'); const note = document.getElementById('benchmark-selection-note');
if (!selected.length) { if (!selected.length) {
btn.disabled = true; perfBtn.disabled = true;
fitBtn.disabled = true;
note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.'; note.textContent = 'Select at least one NVIDIA GPU to run the benchmark.';
return; return;
} }
btn.disabled = false; perfBtn.disabled = false;
fitBtn.disabled = false;
const mode = benchmarkMode(); const mode = benchmarkMode();
if (mode === 'ramp-up') { if (mode === 'ramp-up') {
note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). NCCL on final step.'; note.textContent = 'Ramp-up: ' + selected.length + ' tasks (1 GPU → ' + selected.length + ' GPUs). Performance uses compute benchmark; Power / Thermal Fit uses targeted_power per step.';
} else if (mode === 'parallel') { } else if (mode === 'parallel') {
note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously.' + (selected.length > 1 ? ' NCCL included.' : ''); note.textContent = 'Parallel: all ' + selected.length + ' GPU(s) simultaneously. Only the performance benchmark supports this mode.';
} else { } else {
note.textContent = 'Sequential: each GPU benchmarked separately.' + (selected.length > 1 ? ' NCCL included on each.' : ''); note.textContent = 'Sequential: each selected GPU benchmarked separately.';
} }
} }
@@ -2124,7 +2131,7 @@ function benchmarkSelectNone() {
benchmarkUpdateSelectionNote(); benchmarkUpdateSelectionNote();
} }
function runNvidiaBenchmark() { function runNvidiaBenchmark(kind) {
const selected = benchmarkSelectedGPUIndices(); const selected = benchmarkSelectedGPUIndices();
const status = document.getElementById('benchmark-run-status'); const status = document.getElementById('benchmark-run-status');
if (!selected.length) { if (!selected.length) {
@@ -2134,21 +2141,26 @@ function runNvidiaBenchmark() {
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; } if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
const mode = benchmarkMode(); const mode = benchmarkMode();
const rampUp = mode === 'ramp-up' && selected.length > 1; const rampUp = mode === 'ramp-up' && selected.length > 1;
const parallelGPUs = mode === 'parallel'; const parallelGPUs = mode === 'parallel' && kind === 'performance';
if (kind === 'power-fit' && mode === 'parallel') {
status.textContent = 'Power / Thermal Fit supports sequential or ramp-up only.';
return;
}
const body = { const body = {
profile: document.getElementById('benchmark-profile').value || 'standard', profile: document.getElementById('benchmark-profile').value || 'standard',
gpu_indices: selected, gpu_indices: selected,
run_nccl: selected.length > 1, run_nccl: kind === 'performance' && selected.length > 1,
parallel_gpus: parallelGPUs, parallel_gpus: parallelGPUs,
ramp_up: rampUp, ramp_up: rampUp,
display_name: 'NVIDIA Benchmark' display_name: kind === 'power-fit' ? 'NVIDIA Power / Thermal Fit' : 'NVIDIA Performance Benchmark'
}; };
document.getElementById('benchmark-output').style.display = 'block'; document.getElementById('benchmark-output').style.display = 'block';
document.getElementById('benchmark-title').textContent = '— ' + body.profile + ' [' + selected.join(', ') + ']'; document.getElementById('benchmark-title').textContent = '— ' + body.display_name + ' · ' + body.profile + ' [' + selected.join(', ') + ']';
const term = document.getElementById('benchmark-terminal'); const term = document.getElementById('benchmark-terminal');
term.textContent = 'Enqueuing benchmark for GPUs ' + selected.join(', ') + '...\n'; term.textContent = 'Enqueuing ' + body.display_name + ' for GPUs ' + selected.join(', ') + '...\n';
status.textContent = 'Queueing...'; status.textContent = 'Queueing...';
fetch('/api/benchmark/nvidia/run', { const endpoint = kind === 'power-fit' ? '/api/bee-bench/nvidia/power/run' : '/api/bee-bench/nvidia/perf/run';
fetch(endpoint, {
method: 'POST', method: 'POST',
headers: {'Content-Type':'application/json'}, headers: {'Content-Type':'application/json'},
body: JSON.stringify(body) body: JSON.stringify(body)
@@ -2202,7 +2214,7 @@ benchmarkLoadGPUs();
func renderBenchmarkResultsCard(exportDir string) string { func renderBenchmarkResultsCard(exportDir string) string {
maxIdx, runs := loadBenchmarkHistory(exportDir) maxIdx, runs := loadBenchmarkHistory(exportDir)
return renderBenchmarkResultsCardFromRuns( return renderBenchmarkResultsCardFromRuns(
"Benchmark Results", "Perf Results",
"Composite score by saved benchmark run and GPU.", "Composite score by saved benchmark run and GPU.",
"No saved benchmark runs yet.", "No saved benchmark runs yet.",
maxIdx, maxIdx,
@@ -2244,11 +2256,11 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
} }
func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) { func loadBenchmarkHistory(exportDir string) (int, []benchmarkHistoryRun) {
baseDir := app.DefaultBenchmarkBaseDir baseDir := app.DefaultBeeBenchPerfDir
if strings.TrimSpace(exportDir) != "" { if strings.TrimSpace(exportDir) != "" {
baseDir = filepath.Join(exportDir, "bee-benchmark") baseDir = filepath.Join(exportDir, "bee-bench", "perf")
} }
paths, err := filepath.Glob(filepath.Join(baseDir, "gpu-benchmark-*", "result.json")) paths, err := filepath.Glob(filepath.Join(baseDir, "perf-*", "result.json"))
if err != nil || len(paths) == 0 { if err != nil || len(paths) == 0 {
return -1, nil return -1, nil
} }

View File

@@ -261,7 +261,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress")) mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream) mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort) mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
mux.HandleFunc("POST /api/benchmark/nvidia/run", h.handleAPIBenchmarkNvidiaRun) mux.HandleFunc("POST /api/bee-bench/nvidia/perf/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-perf"))
mux.HandleFunc("POST /api/bee-bench/nvidia/power/run", h.handleAPIBenchmarkNvidiaRunKind("nvidia-bench-power"))
// Tasks // Tasks
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList) mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)

View File

@@ -648,8 +648,11 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
`href="/benchmark"`, `href="/benchmark"`,
`id="benchmark-gpu-list"`, `id="benchmark-gpu-list"`,
`/api/gpu/nvidia`, `/api/gpu/nvidia`,
`/api/benchmark/nvidia/run`, `/api/bee-bench/nvidia/perf/run`,
`/api/bee-bench/nvidia/power/run`,
`benchmark-run-nccl`, `benchmark-run-nccl`,
`Run Performance Benchmark`,
`Run Power / Thermal Fit`,
} { } {
if !strings.Contains(body, needle) { if !strings.Contains(body, needle) {
t.Fatalf("benchmark page missing %q: %s", needle, body) t.Fatalf("benchmark page missing %q: %s", needle, body)
@@ -660,7 +663,7 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) { func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
exportDir := filepath.Join(dir, "export") exportDir := filepath.Join(dir, "export")
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000") runDir := filepath.Join(exportDir, "bee-bench", "perf", "perf-20260406-120000")
if err := os.MkdirAll(runDir, 0755); err != nil { if err := os.MkdirAll(runDir, 0755); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@@ -702,7 +705,7 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
body := rec.Body.String() body := rec.Body.String()
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05") wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
for _, needle := range []string{ for _, needle := range []string{
`Benchmark Results`, `Perf Results`,
`Composite score by saved benchmark run and GPU.`, `Composite score by saved benchmark run and GPU.`,
`GPU 0`, `GPU 0`,
`GPU 1`, `GPU 1`,

View File

@@ -251,7 +251,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
} }
func renderTaskBenchmarkResultsCard(target, logText string) string { func renderTaskBenchmarkResultsCard(target, logText string) string {
if strings.TrimSpace(target) != "nvidia-benchmark" { switch strings.TrimSpace(target) {
case "nvidia-bench-perf":
default:
return "" return ""
} }
resultPath := taskBenchmarkResultPath(logText) resultPath := taskBenchmarkResultPath(logText)
@@ -263,7 +265,7 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
return "" return ""
} }
return renderBenchmarkResultsCardFromRuns( return renderBenchmarkResultsCardFromRuns(
"Benchmark Results", "Perf Results",
"Composite score for this benchmark task.", "Composite score for this benchmark task.",
"No benchmark results were saved for this task.", "No benchmark results were saved for this task.",
columns, columns,

View File

@@ -32,7 +32,8 @@ const (
var taskNames = map[string]string{ var taskNames = map[string]string{
"nvidia": "NVIDIA SAT", "nvidia": "NVIDIA SAT",
"nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", "nvidia-targeted-stress": "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)",
"nvidia-benchmark": "NVIDIA Benchmark", "nvidia-bench-perf": "NVIDIA Bee Bench Perf",
"nvidia-bench-power": "NVIDIA Bee Bench Power",
"nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)", "nvidia-compute": "NVIDIA Max Compute Load (dcgmproftester)",
"nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)", "nvidia-targeted-power": "NVIDIA Targeted Power (dcgmi diag targeted_power)",
"nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)", "nvidia-pulse": "NVIDIA Pulse Test (dcgmi diag pulse_test)",
@@ -628,7 +629,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
dur = 300 dur = 300
} }
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append) archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
case "nvidia-benchmark": case "nvidia-bench-perf":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
break break
@@ -644,6 +645,31 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
RampTotal: t.params.RampTotal, RampTotal: t.params.RampTotal,
RampRunID: t.params.RampRunID, RampRunID: t.params.RampRunID,
}, j.append) }, j.append)
case "nvidia-bench-power":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
dur := t.params.Duration
if dur <= 0 {
switch strings.TrimSpace(strings.ToLower(t.params.BenchmarkProfile)) {
case platform.NvidiaBenchmarkProfileStability:
dur = 300
case platform.NvidiaBenchmarkProfileOvernight:
dur = 600
default:
dur = 120
}
}
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BenchmarkProfile, t.params.RampTotal > 0, t.params.GPUIndices)
if planErr != nil {
err = planErr
break
}
if t.params.RampTotal > 0 && t.params.RampStep > 0 && dur <= 0 {
dur = rampPlan.DurationSec
}
archive, err = a.RunNvidiaTargetedPowerPack(ctx, app.DefaultBeeBenchPowerDir, dur, t.params.GPUIndices, j.append)
case "nvidia-compute": case "nvidia-compute":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")

View File

@@ -366,7 +366,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
taskReportMetricsDBPath = metricsPath taskReportMetricsDBPath = metricsPath
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath }) t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000") benchmarkDir := filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000")
if err := os.MkdirAll(benchmarkDir, 0755); err != nil { if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@@ -398,14 +398,14 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
} }
task := &Task{ task := &Task{
ID: "task-bench", ID: "task-bench",
Name: "NVIDIA Benchmark", Name: "NVIDIA Bee Bench Perf",
Target: "nvidia-benchmark", Target: "nvidia-bench-perf",
Status: TaskDone, Status: TaskDone,
CreatedAt: time.Now().UTC().Add(-time.Minute), CreatedAt: time.Now().UTC().Add(-time.Minute),
ArtifactsDir: artifactsDir, ArtifactsDir: artifactsDir,
} }
ensureTaskReportPaths(task) ensureTaskReportPaths(task)
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n" logText := "line-1\nArchive: " + filepath.Join(dir, "bee-bench", "perf", "perf-20260406-120000.tar.gz") + "\n"
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil { if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
t.Fatal(err) t.Fatal(err)
} }
@@ -420,7 +420,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
} }
html := string(body) html := string(body)
for _, needle := range []string{ for _, needle := range []string{
`Benchmark Results`, `Perf Results`,
`Composite score for this benchmark task.`, `Composite score for this benchmark task.`,
`GPU 0`, `GPU 0`,
`1176.25`, `1176.25`,

View File

@@ -1,5 +1,34 @@
# Benchmark clock calibration research
## Benchmark methodology versioning
Every benchmark methodology change must bump the benchmark version constant in
source code by exactly `+1`.
Methodology change means any change that affects comparability of benchmark
results, including for example:
- phase durations or phase order
- enabled/disabled precisions
- fallback rules
- normalization rules
- score formulas or weights
- degradation thresholds
- power calibration logic
- thermal/power penalty logic
Requirements:
- benchmark version must be stored in source code as an explicit version
constant, not inferred from git tag or build metadata
- benchmark report must always print the benchmark version
- `result.json` must always include the benchmark version
- results from different benchmark versions must be treated as non-comparable by
default
Purpose:
- prevent accidental comparison of runs produced by different methodologies
- make historical benchmark archives self-describing even when detached from git
- force deliberate version bumps whenever scoring or execution semantics change
## Status
In progress. Baseline data from production servers pending.

View File

@@ -796,6 +796,9 @@ static cudaDataType_t matmul_scale_type(const struct profile_desc *desc) {
if (desc->compute_type == CUBLAS_COMPUTE_32I) { if (desc->compute_type == CUBLAS_COMPUTE_32I) {
return CUDA_R_32I; return CUDA_R_32I;
} }
if (desc->compute_type == CUBLAS_COMPUTE_64F) {
return CUDA_R_64F;
}
return CUDA_R_32F; return CUDA_R_32F;
} }
@@ -1120,6 +1123,8 @@ static int run_cublas_profile(cublasLtHandle_t handle,
struct prepared_profile *profile) { struct prepared_profile *profile) {
int32_t alpha_i32 = 1; int32_t alpha_i32 = 1;
int32_t beta_i32 = 0; int32_t beta_i32 = 0;
double alpha_f64 = 1.0;
double beta_f64 = 0.0;
float alpha = 1.0f; float alpha = 1.0f;
float beta = 0.0f; float beta = 0.0f;
const void *alpha_ptr = &alpha; const void *alpha_ptr = &alpha;
@@ -1127,6 +1132,9 @@ static int run_cublas_profile(cublasLtHandle_t handle,
if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) { if (profile->desc.compute_type == CUBLAS_COMPUTE_32I) {
alpha_ptr = &alpha_i32; alpha_ptr = &alpha_i32;
beta_ptr = &beta_i32; beta_ptr = &beta_i32;
} else if (profile->desc.compute_type == CUBLAS_COMPUTE_64F) {
alpha_ptr = &alpha_f64;
beta_ptr = &beta_f64;
} }
return check_cublas(profile->desc.name, return check_cublas(profile->desc.name,
cublas->cublasLtMatmul(handle, cublas->cublasLtMatmul(handle,