Compare commits

...

3 Commits
v8.17 ... v8.18

Author SHA1 Message Date
Mikhail Chusavitin
e306250da7 Disable fp64/fp4 in mixed gpu burn 2026-04-16 10:00:03 +03:00
Mikhail Chusavitin
c5b2081ac9 Disable unstable fp4/fp64 benchmark phases 2026-04-16 09:58:02 +03:00
434528083e Power bench: compare GPU-reported TDP vs IPMI server power delta
- NvidiaPowerBenchResult gains ServerPower *BenchmarkServerPower
- RunNvidiaPowerBench samples IPMI idle before Phase 1 and loaded via
  background goroutine throughout Phase 2 ramp
- renderPowerBenchReport: new "Server vs GPU Power Comparison" table
  with ratio annotation (✓ match / ⚠ minor / ✗ over-report)
- renderPowerBenchSummary: server_idle_w, server_loaded_w, server_delta_w,
  server_reporting_ratio keys

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-16 07:21:02 +03:00
5 changed files with 292 additions and 74 deletions

View File

@@ -94,9 +94,13 @@ var (
)
// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}
//
// fp64 and fp4 are intentionally disabled for now: both are currently unstable
// on the target fleet and can abort the mixed steady stage after the earlier
// phases already collected useful telemetry.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32"}
func computeCapabilityCode(raw string) int {
raw = strings.TrimSpace(raw)
@@ -124,6 +128,15 @@ func benchmarkSupportedPrecisions(computeCapability string) []string {
return out
}
// benchmarkPrecisionEnabled reports whether category names one of the
// precision categories that are currently switched on. Matching is exact and
// case-sensitive; any category not in the list below (for example "fp64" or
// "fp4") yields false.
func benchmarkPrecisionEnabled(category string) bool {
	enabled := [...]string{"int8", "fp8", "fp16", "fp16_bf16", "fp32", "fp32_tf32"}
	for _, name := range enabled {
		if category == name {
			return true
		}
	}
	return false
}
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
if len(precisions) == 0 {
precisions = append([]string(nil), benchmarkPrecisionPhases...)
@@ -514,6 +527,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
}
applyBenchmarkSteadyFallback(&gpuResult)
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if anomaly := detectPowerAnomaly(metricRows, idx); anomaly != "" {
@@ -1398,19 +1412,58 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
return summary
}
// benchmarkTelemetryAvailable reports whether summary carries any telemetry:
// it is true when the summary has a positive sample count or a positive
// duration, and false when neither is positive.
func benchmarkTelemetryAvailable(summary BenchmarkTelemetrySummary) bool {
	if summary.Samples > 0 {
		return true
	}
	return summary.DurationSec > 0
}
// benchmarkPrecisionSteadyFallback picks one per-precision steady summary out
// of phases. Phases without telemetry (per benchmarkTelemetryAvailable) are
// ignored. Among the rest, the one with the largest DurationSec wins; a tie on
// duration goes to the higher P95PowerW, and on a full tie the earliest phase
// is kept.
//
// It returns the chosen summary, the Precision label of the phase it came
// from, and true. When no phase has telemetry it returns the zero summary, an
// empty label and false.
func benchmarkPrecisionSteadyFallback(phases []BenchmarkPrecisionSteadyPhase) (BenchmarkTelemetrySummary, string, bool) {
	var chosen BenchmarkTelemetrySummary
	chosenLabel := ""
	haveChoice := false

	for i := range phases {
		candidate := phases[i].Steady
		if !benchmarkTelemetryAvailable(candidate) {
			continue
		}
		if haveChoice {
			// Only displace the current pick when the candidate is strictly better.
			longer := candidate.DurationSec > chosen.DurationSec
			tieBreak := candidate.DurationSec == chosen.DurationSec &&
				candidate.P95PowerW > chosen.P95PowerW
			if !longer && !tieBreak {
				continue
			}
		}
		chosen, chosenLabel, haveChoice = candidate, phases[i].Precision, true
	}

	return chosen, chosenLabel, haveChoice
}
// applyBenchmarkSteadyFallback fills gpu.Steady from a per-precision phase when
// gpu.Steady has no telemetry of its own. It does nothing when gpu is nil, when
// gpu.Steady already has telemetry, or when no entry in gpu.PrecisionSteady
// qualifies as a fallback. When a fallback is applied, a note naming the
// source precision phase is appended to gpu.Notes.
func applyBenchmarkSteadyFallback(gpu *BenchmarkGPUResult) {
	if gpu == nil {
		return
	}
	if benchmarkTelemetryAvailable(gpu.Steady) {
		return
	}

	fallback, label, ok := benchmarkPrecisionSteadyFallback(gpu.PrecisionSteady)
	if !ok {
		return
	}

	gpu.Steady = fallback
	note := fmt.Sprintf("mixed steady telemetry unavailable; reporting steady-state fallback from %s precision phase", label)
	gpu.Notes = append(gpu.Notes, note)
}
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
score := BenchmarkScorecard{}
// SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases.
// Each precision ran alone with full GPU dedicated — peak capability.
for _, p := range gpu.PrecisionSteady {
if !benchmarkPrecisionEnabled(p.Precision) {
continue
}
score.SyntheticScore += p.WeightedTeraOpsPerSec
}
// MixedScore: sum of fp32-equivalent TOPS from the combined phase.
// All precisions compete simultaneously — closer to real inference workloads.
for _, p := range gpu.PrecisionResults {
if p.Supported {
if p.Supported && benchmarkPrecisionEnabled(p.Category) {
score.MixedScore += p.WeightedTeraOpsPerSec
}
}
@@ -1441,10 +1494,17 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
// so CV reflects genuine power regulation, not workload switching).
if len(gpu.PrecisionSteady) > 0 {
var sum float64
var count int
for _, p := range gpu.PrecisionSteady {
if !benchmarkPrecisionEnabled(p.Precision) {
continue
}
sum += clampScore(100 - p.Steady.PowerCVPct*3)
count++
}
if count > 0 {
score.PowerSustainScore = sum / float64(count)
}
score.PowerSustainScore = sum / float64(len(gpu.PrecisionSteady))
} else if gpu.Steady.PowerCVPct > 0 {
score.PowerSustainScore = clampScore(100 - gpu.Steady.PowerCVPct*3)
}
@@ -2512,6 +2572,7 @@ func runNvidiaBenchmarkParallel(
// Score and finalize each GPU.
for _, idx := range selected {
r := gpuResults[idx]
applyBenchmarkSteadyFallback(r)
r.Scores = scoreBenchmarkGPUResult(*r)
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
pr := parseResults[idx]
@@ -2694,18 +2755,21 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
return cl
}
// runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches
// throttle counters. If a GPU starts throttling, the current targeted_power run
// is canceled immediately, the power limit is reduced, and a fresh full cycle
// is started again from the beginning. The selected reduced power limit stays
// active for the main benchmark and is restored by the caller afterwards.
// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and
// actively watches throttle counters. seedLimits, when provided, are treated as
// the starting point for this calibration pass rather than as immutable fixed
// limits. This matters during cumulative ramp-up: once an additional GPU is
// introduced, every already-active GPU must be revalidated under the new
// thermal state instead of assuming its previous single-step limit is still
// valid. The selected reduced power limits stay active for the main benchmark
// and are restored by the caller afterwards.
func runBenchmarkPowerCalibration(
ctx context.Context,
verboseLog, runDir string,
gpuIndices []int,
infoByIndex map[int]benchmarkGPUInfo,
logFunc func(string),
fixedLimits map[int]int,
seedLimits map[int]int,
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
const calibDurationSec = 120
const maxDerateW = 150
@@ -2739,7 +2803,6 @@ func runBenchmarkPowerCalibration(
err error
}
// gpuCalibState holds per-GPU binary search state during parallel calibration.
type gpuCalibState struct {
idx int
@@ -2796,19 +2859,20 @@ func runBenchmarkPowerCalibration(
hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable
calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
}
if fixedLimits != nil {
if fixedW, ok := fixedLimits[idx]; ok {
// This GPU's limit was established in a prior ramp step and must
// remain unchanged. Apply it immediately and skip the binary search.
if canDerate && fixedW > 0 {
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, fixedW)
if seedLimits != nil {
if seedW, ok := seedLimits[idx]; ok && seedW > 0 {
// A previously validated limit is only a starting point. Re-run
// targeted_power under the current multi-GPU thermal load and derate
// again if this step shows new throttling.
if canDerate {
_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW)
}
s.appliedLimitW = fixedW
s.calib.AppliedPowerLimitW = float64(fixedW)
s.calib.Completed = true
s.converged = true
s.appliedLimitW = seedW
s.hi = seedW + 1
s.calib.AppliedPowerLimitW = float64(seedW)
s.calib.Derated = seedW < s.originalLimitW
s.calib.Notes = append(s.calib.Notes,
fmt.Sprintf("fixed limit: %d W (held from prior ramp step)", fixedW))
fmt.Sprintf("seed limit: %d W (revalidating under current thermal load)", seedW))
}
}
states = append(states, s)
@@ -3091,7 +3155,6 @@ func powerBenchDurationSec(profile string) int {
}
}
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
out := make(map[int]benchmarkGPUInfo, len(src))
for k, v := range src {
@@ -3107,7 +3170,42 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
fmt.Fprintf(&b, "**Platform max TDP:** %.0f W \n\n", result.PlatformMaxTDPW)
fmt.Fprintf(&b, "**Platform max TDP (GPU-reported):** %.0f W \n", result.PlatformMaxTDPW)
if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "**Server power delta (IPMI):** %.0f W \n", sp.DeltaW)
fmt.Fprintf(&b, "**Reporting ratio (IPMI Δ / GPU sum):** %.2f \n", sp.ReportingRatio)
}
b.WriteString("\n")
// Server power comparison table.
if sp := result.ServerPower; sp != nil {
b.WriteString("## Server vs GPU Power Comparison\n\n")
b.WriteString("| Metric | Value |\n")
b.WriteString("|--------|-------|\n")
fmt.Fprintf(&b, "| GPU stable limits sum (nvidia-smi) | %.0f W |\n", result.PlatformMaxTDPW)
if sp.Available {
fmt.Fprintf(&b, "| Server idle power (IPMI) | %.0f W |\n", sp.IdleW)
fmt.Fprintf(&b, "| Server loaded power (IPMI) | %.0f W |\n", sp.LoadedW)
fmt.Fprintf(&b, "| Server Δ power (loaded idle) | %.0f W |\n", sp.DeltaW)
ratio := sp.ReportingRatio
ratioNote := ""
switch {
case ratio >= 0.9:
ratioNote = "✓ GPU telemetry matches server power"
case ratio >= 0.75:
ratioNote = "⚠ minor discrepancy — GPU may slightly over-report TDP"
default:
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
}
fmt.Fprintf(&b, "| Reporting ratio (IPMI Δ / GPU sum) | %.2f — %s |\n", ratio, ratioNote)
} else {
b.WriteString("| IPMI availability | not available — IPMI not supported or ipmitool not found |\n")
}
for _, note := range sp.Notes {
fmt.Fprintf(&b, "\n> %s\n", note)
}
b.WriteString("\n")
}
if len(result.Findings) > 0 {
b.WriteString("## Summary\n\n")
for _, finding := range result.Findings {
@@ -3181,6 +3279,12 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
}
}
if sp := result.ServerPower; sp != nil && sp.Available {
fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
fmt.Fprintf(&b, "server_loaded_w=%.0f\n", sp.LoadedW)
fmt.Fprintf(&b, "server_delta_w=%.0f\n", sp.DeltaW)
fmt.Fprintf(&b, "server_reporting_ratio=%.2f\n", sp.ReportingRatio)
}
return b.String()
}
@@ -3224,6 +3328,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec
// Sample IPMI idle power before any GPU load.
var serverIdleW float64
var serverIdleOK bool
if w, ok := sampleIPMIPowerSeries(ctx, 10); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
}
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
@@ -3320,20 +3434,35 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
// server-side loaded power while GPUs are under stress. The goroutine is
// cancelled as soon as Phase 2 finishes, and the average is used to compare
// against PlatformMaxTDPW (GPU-reported stable limits sum).
var serverLoadedW float64
var serverLoadedOK bool
ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
ipmiPhase2Done := make(chan float64, 1)
go func() {
defer close(ipmiPhase2Done)
if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
ipmiPhase2Done <- w
}
}()
// Step 1: reuse single-card calibration result directly.
if len(result.RecommendedSlotOrder) > 0 {
firstIdx := result.RecommendedSlotOrder[0]
firstCalib := calibByIndex[firstIdx]
stableLimits[firstIdx] = int(math.Round(firstCalib.AppliedPowerLimitW))
ramp := NvidiaPowerBenchStep{
StepIndex: 1,
GPUIndices: []int{firstIdx},
NewGPUIndex: firstIdx,
NewGPUStableLimitW: firstCalib.AppliedPowerLimitW,
StepIndex: 1,
GPUIndices: []int{firstIdx},
NewGPUIndex: firstIdx,
NewGPUStableLimitW: firstCalib.AppliedPowerLimitW,
TotalObservedPowerW: firstCalib.Summary.P95PowerW,
AvgObservedPowerW: firstCalib.Summary.P95PowerW,
Derated: firstCalib.Derated,
Status: "OK",
Derated: firstCalib.Derated,
Status: "OK",
}
if !firstCalib.Completed {
ramp.Status = "FAILED"
@@ -3351,8 +3480,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
len(result.RecommendedSlotOrder), firstIdx, firstCalib.AppliedPowerLimitW))
}
// Steps 2..N: each step fixes previously calibrated GPUs and searches only
// the new GPU's stable limit in the combined thermal environment.
// Steps 2..N: each step revalidates every already-active GPU under the new
// cumulative thermal environment and also calibrates the newly introduced
// GPU. Previously found limits are used only as seeds for the search.
for stepNum := 1; stepNum < len(result.RecommendedSlotOrder); stepNum++ {
step := stepNum + 1
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
@@ -3360,17 +3490,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
_ = os.MkdirAll(stepDir, 0755)
// All previously calibrated GPUs are fixed at their stable limits.
fixedForStep := make(map[int]int, len(stableLimits))
// Reuse the latest stable limits as starting points, but re-check every
// active GPU in this hotter configuration.
seedForStep := make(map[int]int, len(stableLimits))
for k, v := range stableLimits {
fixedForStep[k] = v
seedForStep[k] = v
}
logFunc(fmt.Sprintf("power ramp: step %d/%d — calibrating GPU %d with %d fixed GPU(s)",
step, len(result.RecommendedSlotOrder), newGPUIdx, len(fixedForStep)))
logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d",
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, fixedForStep)
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
// Accumulate restore actions; they all run in the outer defer.
allRestoreActions = append(allRestoreActions, stepRestore...)
@@ -3391,36 +3522,72 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
}
// Determine stable limit for the new GPU.
if c, ok := stepCalib[newGPUIdx]; ok && c.Completed {
stableLimits[newGPUIdx] = int(math.Round(c.AppliedPowerLimitW))
ramp.NewGPUStableLimitW = c.AppliedPowerLimitW
ramp.Derated = c.Derated
for _, idx := range subset {
c, ok := stepCalib[idx]
if !ok || !c.Completed {
fallback := 0
if lim, ok := stableLimits[idx]; ok && lim > 0 {
fallback = lim
} else if fb, ok := calibByIndex[idx]; ok {
fallback = int(math.Round(fb.AppliedPowerLimitW))
}
if fallback > 0 {
stableLimits[idx] = fallback
}
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes,
fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback))
result.OverallStatus = "PARTIAL"
continue
}
prevLimit, hadPrev := stableLimits[idx]
newLimit := int(math.Round(c.AppliedPowerLimitW))
stableLimits[idx] = newLimit
if idx == newGPUIdx {
ramp.NewGPUStableLimitW = c.AppliedPowerLimitW
ramp.Derated = c.Derated
}
if c.Derated {
ramp.Status = "PARTIAL"
if result.OverallStatus == "OK" {
result.OverallStatus = "PARTIAL"
}
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
}
} else {
// Calibration failed — fall back to single-card limit.
fb := calibByIndex[newGPUIdx]
stableLimits[newGPUIdx] = int(math.Round(fb.AppliedPowerLimitW))
ramp.NewGPUStableLimitW = fb.AppliedPowerLimitW
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; using single-card limit %.0f W", newGPUIdx, step, fb.AppliedPowerLimitW))
result.OverallStatus = "PARTIAL"
if hadPrev && newLimit < prevLimit {
ramp.Notes = append(ramp.Notes,
fmt.Sprintf("GPU %d was re-derated from %d W to %d W under combined thermal load.", idx, prevLimit, newLimit))
}
}
if c, ok := stepCalib[newGPUIdx]; ok && c.Completed && c.Derated {
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
}
result.RampSteps = append(result.RampSteps, ramp)
}
// Stop IPMI Phase 2 sampling and collect result.
ipmiPhase2Cancel()
if w, ok := <-ipmiPhase2Done; ok {
serverLoadedW = w
serverLoadedOK = true
logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
}
// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
for i := range result.GPUs {
if lim, ok := stableLimits[result.GPUs[i].Index]; ok {
result.GPUs[i].StablePowerLimitW = float64(lim)
}
if result.GPUs[i].StablePowerLimitW > 0 && result.GPUs[i].AppliedPowerLimitW > 0 &&
result.GPUs[i].StablePowerLimitW < result.GPUs[i].AppliedPowerLimitW {
result.GPUs[i].Derated = true
result.Findings = append(result.Findings, fmt.Sprintf(
"GPU %d required additional derating from %.0f W (single-card) to %.0f W under full-system thermal load.",
result.GPUs[i].Index, result.GPUs[i].AppliedPowerLimitW, result.GPUs[i].StablePowerLimitW,
))
}
}
// PlatformMaxTDPW = sum of all stable limits — the actual sustained power
@@ -3428,6 +3595,13 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
for _, lim := range stableLimits {
result.PlatformMaxTDPW += float64(lim)
}
// Characterize server power from IPMI idle/loaded samples.
// GPUReportedSumW = PlatformMaxTDPW (sum of stable GPU limits, nvidia-smi).
// ReportingRatio = IPMI_delta / GPU_reported_sum:
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
_ = serverIdleOK // used implicitly via characterizeServerPower
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil {
return "", fmt.Errorf("marshal power result: %w", err)

View File

@@ -261,14 +261,18 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
b.WriteString("\n")
// Steady-state telemetry
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
b.WriteString("\n")
if benchmarkTelemetryAvailable(gpu.Steady) {
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
b.WriteString("\n")
} else {
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
}
// Per-precision stability phases.
if len(gpu.PrecisionSteady) > 0 {

View File

@@ -49,8 +49,8 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
benchmarkPrecisionPhases,
func(label string) string { return label },
)
if len(labels) != 7 || len(phases) != 7 {
t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases))
if len(labels) != 5 || len(phases) != 5 {
t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases))
}
if basePhaseSec != 60 {
t.Fatalf("basePhaseSec=%d want 60", basePhaseSec)
@@ -61,7 +61,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) {
if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 {
t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1])
}
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" {
if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
@@ -80,7 +80,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) {
if mixedPhaseSec != 3600 {
t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" {
if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
@@ -99,7 +99,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) {
if mixedPhaseSec != 14400 {
t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec)
}
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" {
if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" {
t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases))
}
}
@@ -133,10 +133,10 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) {
func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) {
t.Parallel()
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" {
if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
t.Fatalf("supported=%v", got)
}
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" {
if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" {
t.Fatalf("supported=%v", got)
}
}
@@ -314,6 +314,30 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
}
}
func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) {
t.Parallel()
score := scoreBenchmarkGPUResult(BenchmarkGPUResult{
PrecisionSteady: []BenchmarkPrecisionSteadyPhase{
{Precision: "fp16", WeightedTeraOpsPerSec: 100},
{Precision: "fp64", WeightedTeraOpsPerSec: 999},
{Precision: "fp4", WeightedTeraOpsPerSec: 999},
},
PrecisionResults: []BenchmarkPrecisionResult{
{Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50},
{Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999},
{Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999},
},
})
if score.SyntheticScore != 100 {
t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore)
}
if score.MixedScore != 50 {
t.Fatalf("MixedScore=%f want 50", score.MixedScore)
}
}
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {
t.Parallel()

View File

@@ -300,8 +300,12 @@ type NvidiaPowerBenchResult struct {
// PlatformMaxTDPW is the sum of per-GPU stable power limits found during the
// cumulative thermal ramp. Represents the actual sustained power budget of
// this server under full GPU load. Use for rack power planning.
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
Findings []string `json:"findings,omitempty"`
PlatformMaxTDPW float64 `json:"platform_max_tdp_w"`
// ServerPower captures IPMI server power delta (idle→loaded) measured in
// parallel with the thermal ramp. Use to compare GPU-reported TDP against
// actual wall-power draw as seen by the server's power supply.
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
Findings []string `json:"findings,omitempty"`
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
}

View File

@@ -713,6 +713,19 @@ static const struct profile_desc k_profiles[] = {
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
/* Decide whether a profile takes part in the current run.
 *
 * Returns 1 when the profile should run and 0 otherwise:
 *   - a profile that is not enabled, or whose min_cc exceeds cc, never runs;
 *   - with a precision_filter, only the profile whose block_label equals the
 *     filter runs;
 *   - without a filter (mixed/all run), every remaining profile runs except
 *     those labelled "fp64" or "fp4".
 */
static int profile_allowed_for_run(const struct profile_desc *desc, int cc, const char *precision_filter) {
    if (!desc->enabled || cc < desc->min_cc) {
        return 0;
    }

    if (precision_filter != NULL) {
        return strcmp(desc->block_label, precision_filter) == 0;
    }

    /* Mixed/all phases intentionally exclude fp64/fp4 for now: both paths are
     * unstable on the current benchmark fleet and can abort the whole mixed
     * pass after earlier phases already collected useful telemetry. */
    if (strcmp(desc->block_label, "fp64") == 0) {
        return 0;
    }
    if (strcmp(desc->block_label, "fp4") == 0) {
        return 0;
    }
    return 1;
}
static int load_cublaslt(struct cublaslt_api *api) {
memset(api, 0, sizeof(*api));
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -1222,8 +1235,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
/* Count profiles matching the filter (for deciding what to run). */
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc &&
(precision_filter == NULL || strcmp(k_profiles[i].block_label, precision_filter) == 0)) {
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
planned++;
}
}
@@ -1240,7 +1252,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
* profiles matching precision_filter. */
int planned_total = 0;
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
if (k_profiles[i].enabled && cc >= k_profiles[i].min_cc) {
if (profile_allowed_for_run(&k_profiles[i], cc, precision_filter)) {
planned_total++;
}
}
@@ -1310,10 +1322,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
desc->min_cc);
continue;
}
if (precision_filter != NULL && strcmp(desc->block_label, precision_filter) != 0) {
if (!profile_allowed_for_run(desc, cc, precision_filter)) {
append_detail(report->details,
sizeof(report->details),
"%s=SKIPPED precision_filter\n",
"%s=SKIPPED benchmark_disabled\n",
desc->name);
continue;
}