From c5b2081ac98ba9f8776f97966371ca938fe8c209 Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Thu, 16 Apr 2026 09:58:02 +0300 Subject: [PATCH] Disable unstable fp4/fp64 benchmark phases --- audit/internal/platform/benchmark.go | 195 +++++++++++++++----- audit/internal/platform/benchmark_report.go | 20 +- audit/internal/platform/benchmark_test.go | 38 +++- 3 files changed, 187 insertions(+), 66 deletions(-) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 5cc1426..0714d77 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -94,9 +94,13 @@ var ( ) // benchmarkPrecisionPhases lists the precision categories run as individual -// steady-state windows before the combined steady pass. Order is from lowest +// steady-state windows before the combined steady pass. Order is from lowest // to highest power draw so thermal ramp-up is gradual. -var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"} +// +// fp64 and fp4 are intentionally disabled for now: both are currently unstable +// on the target fleet and can abort the mixed steady stage after the earlier +// phases already collected useful telemetry. +var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32"} func computeCapabilityCode(raw string) int { raw = strings.TrimSpace(raw) @@ -124,6 +128,15 @@ func benchmarkSupportedPrecisions(computeCapability string) []string { return out } +func benchmarkPrecisionEnabled(category string) bool { + switch category { + case "int8", "fp8", "fp16", "fp16_bf16", "fp32", "fp32_tf32": + return true + default: + return false + } +} + func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) { if len(precisions) == 0 { precisions = append([]string(nil), benchmarkPrecisionPhases...) @@ -514,6 +527,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec)) } + applyBenchmarkSteadyFallback(&gpuResult) gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) if anomaly := detectPowerAnomaly(metricRows, idx); anomaly != "" { @@ -1398,19 +1412,58 @@ func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary { return summary } +func benchmarkTelemetryAvailable(summary BenchmarkTelemetrySummary) bool { + return summary.Samples > 0 || summary.DurationSec > 0 +} + +func benchmarkPrecisionSteadyFallback(phases []BenchmarkPrecisionSteadyPhase) (BenchmarkTelemetrySummary, string, bool) { + var ( + best BenchmarkTelemetrySummary + bestLabel string + found bool + ) + for _, phase := range phases { + if !benchmarkTelemetryAvailable(phase.Steady) { + continue + } + if !found || + phase.Steady.DurationSec > best.DurationSec || + (phase.Steady.DurationSec == best.DurationSec && phase.Steady.P95PowerW > best.P95PowerW) { + best = phase.Steady + bestLabel = phase.Precision + found = true + } + } + return best, bestLabel, found +} + +func applyBenchmarkSteadyFallback(gpu *BenchmarkGPUResult) { + if gpu == nil || benchmarkTelemetryAvailable(gpu.Steady) { + return + } + if fallback, label, ok := benchmarkPrecisionSteadyFallback(gpu.PrecisionSteady); ok { + gpu.Steady = fallback + gpu.Notes = append(gpu.Notes, + fmt.Sprintf("mixed steady telemetry unavailable; reporting steady-state fallback from %s precision phase", label)) + } +} + func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { score := BenchmarkScorecard{} // SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases. // Each precision ran alone with full GPU dedicated — peak capability. for _, p := range gpu.PrecisionSteady { + if !benchmarkPrecisionEnabled(p.Precision) { + continue + } score.SyntheticScore += p.WeightedTeraOpsPerSec } // MixedScore: sum of fp32-equivalent TOPS from the combined phase. // All precisions compete simultaneously — closer to real inference workloads. for _, p := range gpu.PrecisionResults { - if p.Supported { + if p.Supported && benchmarkPrecisionEnabled(p.Category) { score.MixedScore += p.WeightedTeraOpsPerSec } } @@ -1441,10 +1494,17 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { // so CV reflects genuine power regulation, not workload switching). if len(gpu.PrecisionSteady) > 0 { var sum float64 + var count int for _, p := range gpu.PrecisionSteady { + if !benchmarkPrecisionEnabled(p.Precision) { + continue + } sum += clampScore(100 - p.Steady.PowerCVPct*3) + count++ + } + if count > 0 { + score.PowerSustainScore = sum / float64(count) } - score.PowerSustainScore = sum / float64(len(gpu.PrecisionSteady)) } else if gpu.Steady.PowerCVPct > 0 { score.PowerSustainScore = clampScore(100 - gpu.Steady.PowerCVPct*3) } @@ -2512,6 +2572,7 @@ func runNvidiaBenchmarkParallel( // Score and finalize each GPU. for _, idx := range selected { r := gpuResults[idx] + applyBenchmarkSteadyFallback(r) r.Scores = scoreBenchmarkGPUResult(*r) r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status) pr := parseResults[idx] @@ -2694,18 +2755,21 @@ func summarizeCPULoad(samples []float64) *BenchmarkCPULoad { return cl } -// runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches -// throttle counters. If a GPU starts throttling, the current targeted_power run -// is canceled immediately, the power limit is reduced, and a fresh full cycle -// is started again from the beginning. The selected reduced power limit stays -// active for the main benchmark and is restored by the caller afterwards. +// runBenchmarkPowerCalibration runs targeted_power for the supplied GPU set and +// actively watches throttle counters. seedLimits, when provided, are treated as +// the starting point for this calibration pass rather than as immutable fixed +// limits. This matters during cumulative ramp-up: once an additional GPU is +// introduced, every already-active GPU must be revalidated under the new +// thermal state instead of assuming its previous single-step limit is still +// valid. The selected reduced power limits stay active for the main benchmark +// and are restored by the caller afterwards. func runBenchmarkPowerCalibration( ctx context.Context, verboseLog, runDir string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, logFunc func(string), - fixedLimits map[int]int, + seedLimits map[int]int, ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) { const calibDurationSec = 120 const maxDerateW = 150 @@ -2739,7 +2803,6 @@ func runBenchmarkPowerCalibration( err error } - // gpuCalibState holds per-GPU binary search state during parallel calibration. type gpuCalibState struct { idx int @@ -2796,19 +2859,20 @@ func runBenchmarkPowerCalibration( hi: appliedLimitW + 1, // not yet tested, not yet confirmed unstable calib: benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)}, } - if fixedLimits != nil { - if fixedW, ok := fixedLimits[idx]; ok { - // This GPU's limit was established in a prior ramp step and must - // remain unchanged. Apply it immediately and skip the binary search. - if canDerate && fixedW > 0 { - _ = setBenchmarkPowerLimit(ctx, verboseLog, idx, fixedW) + if seedLimits != nil { + if seedW, ok := seedLimits[idx]; ok && seedW > 0 { + // A previously validated limit is only a starting point. Re-run + // targeted_power under the current multi-GPU thermal load and derate + // again if this step shows new throttling. + if canDerate { + _ = setBenchmarkPowerLimit(ctx, verboseLog, idx, seedW) } - s.appliedLimitW = fixedW - s.calib.AppliedPowerLimitW = float64(fixedW) - s.calib.Completed = true - s.converged = true + s.appliedLimitW = seedW + s.hi = seedW + 1 + s.calib.AppliedPowerLimitW = float64(seedW) + s.calib.Derated = seedW < s.originalLimitW s.calib.Notes = append(s.calib.Notes, - fmt.Sprintf("fixed limit: %d W (held from prior ramp step)", fixedW)) + fmt.Sprintf("seed limit: %d W (revalidating under current thermal load)", seedW)) } } states = append(states, s) @@ -3091,7 +3155,6 @@ func powerBenchDurationSec(profile string) int { } } - func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo { out := make(map[int]benchmarkGPUInfo, len(src)) for k, v := range src { @@ -3392,14 +3455,14 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N firstCalib := calibByIndex[firstIdx] stableLimits[firstIdx] = int(math.Round(firstCalib.AppliedPowerLimitW)) ramp := NvidiaPowerBenchStep{ - StepIndex: 1, - GPUIndices: []int{firstIdx}, - NewGPUIndex: firstIdx, - NewGPUStableLimitW: firstCalib.AppliedPowerLimitW, + StepIndex: 1, + GPUIndices: []int{firstIdx}, + NewGPUIndex: firstIdx, + NewGPUStableLimitW: firstCalib.AppliedPowerLimitW, TotalObservedPowerW: firstCalib.Summary.P95PowerW, AvgObservedPowerW: firstCalib.Summary.P95PowerW, - Derated: firstCalib.Derated, - Status: "OK", + Derated: firstCalib.Derated, + Status: "OK", } if !firstCalib.Completed { ramp.Status = "FAILED" @@ -3417,8 +3480,9 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N len(result.RecommendedSlotOrder), firstIdx, firstCalib.AppliedPowerLimitW)) } - // Steps 2..N: each step fixes previously calibrated GPUs and searches only - // the new GPU's stable limit in the combined thermal environment. + // Steps 2..N: each step revalidates every already-active GPU under the new + // cumulative thermal environment and also calibrates the newly introduced + // GPU. Previously found limits are used only as seeds for the search. for stepNum := 1; stepNum < len(result.RecommendedSlotOrder); stepNum++ { step := stepNum + 1 subset := append([]int(nil), result.RecommendedSlotOrder[:step]...) @@ -3426,17 +3490,18 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step)) _ = os.MkdirAll(stepDir, 0755) - // All previously calibrated GPUs are fixed at their stable limits. - fixedForStep := make(map[int]int, len(stableLimits)) + // Reuse the latest stable limits as starting points, but re-check every + // active GPU in this hotter configuration. + seedForStep := make(map[int]int, len(stableLimits)) for k, v := range stableLimits { - fixedForStep[k] = v + seedForStep[k] = v } - logFunc(fmt.Sprintf("power ramp: step %d/%d — calibrating GPU %d with %d fixed GPU(s)", - step, len(result.RecommendedSlotOrder), newGPUIdx, len(fixedForStep))) + logFunc(fmt.Sprintf("power ramp: step %d/%d — revalidating %d active GPU(s) including new GPU %d", + step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx)) stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) - stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, fixedForStep) + stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep) // Accumulate restore actions; they all run in the outer defer. allRestoreActions = append(allRestoreActions, stepRestore...) @@ -3457,26 +3522,46 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset)) } - // Determine stable limit for the new GPU. - if c, ok := stepCalib[newGPUIdx]; ok && c.Completed { - stableLimits[newGPUIdx] = int(math.Round(c.AppliedPowerLimitW)) - ramp.NewGPUStableLimitW = c.AppliedPowerLimitW - ramp.Derated = c.Derated + for _, idx := range subset { + c, ok := stepCalib[idx] + if !ok || !c.Completed { + fallback := 0 + if lim, ok := stableLimits[idx]; ok && lim > 0 { + fallback = lim + } else if fb, ok := calibByIndex[idx]; ok { + fallback = int(math.Round(fb.AppliedPowerLimitW)) + } + if fallback > 0 { + stableLimits[idx] = fallback + } + ramp.Status = "FAILED" + ramp.Notes = append(ramp.Notes, + fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; keeping previous stable limit %d W", idx, step, fallback)) + result.OverallStatus = "PARTIAL" + continue + } + + prevLimit, hadPrev := stableLimits[idx] + newLimit := int(math.Round(c.AppliedPowerLimitW)) + stableLimits[idx] = newLimit + if idx == newGPUIdx { + ramp.NewGPUStableLimitW = c.AppliedPowerLimitW + ramp.Derated = c.Derated + } if c.Derated { ramp.Status = "PARTIAL" if result.OverallStatus == "OK" { result.OverallStatus = "PARTIAL" } - result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW)) } - } else { - // Calibration failed — fall back to single-card limit. - fb := calibByIndex[newGPUIdx] - stableLimits[newGPUIdx] = int(math.Round(fb.AppliedPowerLimitW)) - ramp.NewGPUStableLimitW = fb.AppliedPowerLimitW - ramp.Status = "FAILED" - ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; using single-card limit %.0f W", newGPUIdx, step, fb.AppliedPowerLimitW)) - result.OverallStatus = "PARTIAL" + if hadPrev && newLimit < prevLimit { + ramp.Notes = append(ramp.Notes, + fmt.Sprintf("GPU %d was re-derated from %d W to %d W under combined thermal load.", idx, prevLimit, newLimit)) + } + } + + if c, ok := stepCalib[newGPUIdx]; ok && c.Completed && c.Derated { + result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW)) } result.RampSteps = append(result.RampSteps, ramp) @@ -3495,6 +3580,14 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N if lim, ok := stableLimits[result.GPUs[i].Index]; ok { result.GPUs[i].StablePowerLimitW = float64(lim) } + if result.GPUs[i].StablePowerLimitW > 0 && result.GPUs[i].AppliedPowerLimitW > 0 && + result.GPUs[i].StablePowerLimitW < result.GPUs[i].AppliedPowerLimitW { + result.GPUs[i].Derated = true + result.Findings = append(result.Findings, fmt.Sprintf( + "GPU %d required additional derating from %.0f W (single-card) to %.0f W under full-system thermal load.", + result.GPUs[i].Index, result.GPUs[i].AppliedPowerLimitW, result.GPUs[i].StablePowerLimitW, + )) + } } // PlatformMaxTDPW = sum of all stable limits — the actual sustained power diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index 22ca59e..c285234 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -261,14 +261,18 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { b.WriteString("\n") // Steady-state telemetry - fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec)) - b.WriteString("| | Avg | P95 |\n|---|---|---|\n") - fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW) - fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC) - fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz) - fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz) - fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct) - b.WriteString("\n") + if benchmarkTelemetryAvailable(gpu.Steady) { + fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec)) + b.WriteString("| | Avg | P95 |\n|---|---|---|\n") + fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW) + fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC) + fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz) + fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz) + fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct) + b.WriteString("\n") + } else { + b.WriteString("**Steady-state telemetry:** unavailable\n\n") + } // Per-precision stability phases. if len(gpu.PrecisionSteady) > 0 { diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go index 7c0d540..92b26a4 100644 --- a/audit/internal/platform/benchmark_test.go +++ b/audit/internal/platform/benchmark_test.go @@ -49,8 +49,8 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) { benchmarkPrecisionPhases, func(label string) string { return label }, ) - if len(labels) != 7 || len(phases) != 7 { - t.Fatalf("labels=%d phases=%d want 7", len(labels), len(phases)) + if len(labels) != 5 || len(phases) != 5 { + t.Fatalf("labels=%d phases=%d want 5", len(labels), len(phases)) } if basePhaseSec != 60 { t.Fatalf("basePhaseSec=%d want 60", basePhaseSec) @@ -61,7 +61,7 @@ func TestBuildBenchmarkSteadyPlanStandard(t *testing.T) { if phases[len(phases)-1].PlanLabel != "mixed" || phases[len(phases)-1].DurationSec != 300 { t.Fatalf("mixed phase=%+v want duration 300", phases[len(phases)-1]) } - if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,60,60,300" { + if benchmarkPlanDurationsCSV(phases) != "60,60,60,60,300" { t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases)) } } @@ -80,7 +80,7 @@ func TestBuildBenchmarkSteadyPlanStability(t *testing.T) { if mixedPhaseSec != 3600 { t.Fatalf("mixedPhaseSec=%d want 3600", mixedPhaseSec) } - if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,300,300,3600" { + if benchmarkPlanDurationsCSV(phases) != "300,300,300,300,3600" { t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases)) } } @@ -99,7 +99,7 @@ func TestBuildBenchmarkSteadyPlanOvernight(t *testing.T) { if mixedPhaseSec != 14400 { t.Fatalf("mixedPhaseSec=%d want 14400", mixedPhaseSec) } - if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,3600,3600,14400" { + if benchmarkPlanDurationsCSV(phases) != "3600,3600,3600,3600,14400" { t.Fatalf("durations=%q", benchmarkPlanDurationsCSV(phases)) } } @@ -133,10 +133,10 @@ func TestSplitBenchmarkRowsByPlannedPhaseUsesPhaseDurations(t *testing.T) { func TestBenchmarkSupportedPrecisionsSkipsFP4BeforeBlackwell(t *testing.T) { t.Parallel() - if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64" { + if got := benchmarkSupportedPrecisions("9.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" { t.Fatalf("supported=%v", got) } - if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32,fp64,fp4" { + if got := benchmarkSupportedPrecisions("10.0"); strings.Join(got, ",") != "int8,fp8,fp16,fp32" { t.Fatalf("supported=%v", got) } } @@ -314,6 +314,30 @@ func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) { } } +func TestScoreBenchmarkGPUIgnoresDisabledPrecisions(t *testing.T) { + t.Parallel() + + score := scoreBenchmarkGPUResult(BenchmarkGPUResult{ + PrecisionSteady: []BenchmarkPrecisionSteadyPhase{ + {Precision: "fp16", WeightedTeraOpsPerSec: 100}, + {Precision: "fp64", WeightedTeraOpsPerSec: 999}, + {Precision: "fp4", WeightedTeraOpsPerSec: 999}, + }, + PrecisionResults: []BenchmarkPrecisionResult{ + {Category: "fp32_tf32", Supported: true, WeightedTeraOpsPerSec: 50}, + {Category: "fp64", Supported: true, WeightedTeraOpsPerSec: 999}, + {Category: "fp4", Supported: true, WeightedTeraOpsPerSec: 999}, + }, + }) + + if score.SyntheticScore != 100 { + t.Fatalf("SyntheticScore=%f want 100", score.SyntheticScore) + } + if score.MixedScore != 50 { + t.Fatalf("MixedScore=%f want 50", score.MixedScore) + } +} + func TestEnrichGPUInfoWithMaxClocks(t *testing.T) { t.Parallel()