Add per-precision benchmark phases, weighted TOPS scoring, and ECC tracking

- Split steady window into 6 equal slots: fp8/fp16/fp32/fp64/fp4 + combined
- Each precision phase runs bee-gpu-burn with --precision filter so PowerCVPct reflects single-kernel stability (not round-robin artifact)
- Add fp4 support in bee-gpu-stress.c for Blackwell (cc>=100) via existing CUDA_R_4F_E2M1 guard
- Weighted TOPS: fp64×2.0, fp32×1.0, fp16×0.5, fp8×0.25, fp4×0.125
- SyntheticScore = sum of weighted TOPS from per-precision phases
- MixedScore = sum from combined phase; MixedEfficiency = Mixed/Synthetic
- ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3)
- ECC volatile counters sampled before/after each phase and overall
- DegradationReasons: ecc_uncorrected_errors, ecc_corrected_errors
- Report: per-precision stability table with ECC columns, methodology section
- Ramp-up history table redesign: GPU indices as columns, runs as rows

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-13 10:49:49 +03:00
parent 02e44b1172
commit bf6ecab4f0
9 changed files with 390 additions and 144 deletions
--- a/audit/internal/platform/benchmark.go
+++ b/audit/internal/platform/benchmark.go
@@ -73,6 +73,11 @@ var (
 	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
 )

+// benchmarkPrecisionPhases lists the precision categories run as individual
+// steady-state windows before the combined steady pass.  Order is from lowest
+// to highest power draw so thermal ramp-up is gradual.
+var benchmarkPrecisionPhases = []string{"fp8", "fp16", "fp32", "fp64", "fp4"}
+
 func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
 	if ctx == nil {
 		ctx = context.Background()
@@ -225,14 +230,56 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 			continue
 		}

+		// ── Per-precision stability phases ────────────────────────────────────────
+		// Run each precision category alone so PowerCVPct reflects genuine GPU
+		// power stability, not kernel-mix variance.
+		// Time budget: SteadySec is split equally across all precision phases + 1
+		// combined slot, with a 60 s floor per slot (so the total may exceed SteadySec).
+		// Skipped phases (unsupported precision) are simply omitted; combined is fixed.
+		totalSlots := len(benchmarkPrecisionPhases) + 1
+		perPhaseSec := spec.SteadySec / totalSlots
+		if perPhaseSec < 60 {
+			perPhaseSec = 60
+		}
+		eccBase, _ := queryECCCounters(idx)
+		for _, prec := range benchmarkPrecisionPhases {
+			phaseCmd := []string{
+				"bee-gpu-burn",
+				"--seconds", strconv.Itoa(perPhaseSec),
+				"--size-mb", strconv.Itoa(opts.SizeMB),
+				"--devices", strconv.Itoa(idx),
+				"--precision", prec,
+			}
+			logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec))
+			phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
+			eccBefore, _ := queryECCCounters(idx)
+			phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, runDir, phaseLogName, logFunc)
+			eccAfter, _ := queryECCCounters(idx)
+			if phaseErr != nil || len(phaseRows) == 0 {
+				continue
+			}
+			phase := BenchmarkPrecisionSteadyPhase{
+				Precision: prec,
+				Steady:    summarizeBenchmarkTelemetry(phaseRows),
+				ECC:       diffECCCounters(eccBefore, eccAfter),
+			}
+			for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles {
+				if p.Supported {
+					phase.TeraOpsPerSec += p.TeraOpsPerSec
+					phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
+				}
+			}
+			gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase)
+		}
+
 		beforeThrottle, _ := queryThrottleCounters(idx)
 		steadyCmd := []string{
 			"bee-gpu-burn",
-			"--seconds", strconv.Itoa(spec.SteadySec),
+			"--seconds", strconv.Itoa(perPhaseSec),
 			"--size-mb", strconv.Itoa(opts.SizeMB),
 			"--devices", strconv.Itoa(idx),
 		}
-		logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
+		logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec))

 		// Sample server power via IPMI in parallel with the steady phase.
 		// We collect readings every 5s and average them.
@@ -293,6 +340,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv

 		gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
 		gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
+		if eccFinal, err := queryECCCounters(idx); err == nil {
+			gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
+		}

 		cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
 		if err != nil && err != context.Canceled {
@@ -811,8 +861,11 @@ func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
 			Iterations: profile.iterations,
 			Notes:      profile.notes,
 		}
+		w := precisionWeight(profile.category)
+		precision.Weight = w
 		if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 {
 			precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12
+			precision.WeightedTeraOpsPerSec = precision.TeraOpsPerSec * w
 		}
 		result.Profiles = append(result.Profiles, precision)
 	}
@@ -841,6 +894,33 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
 	return profile
 }

+// precisionWeight returns the fp32-equivalence factor for a precision category.
+// Each factor represents how much "real" numeric work one operation of that
+// type performs relative to fp32 (single precision = 1.0 baseline):
+//   fp64      = 2.0   — double precision, 2× more bits per operand
+//   fp32_tf32 = 1.0   — single precision baseline
+//   fp16_bf16 = 0.5   — half precision
+//   fp8       = 0.25  — quarter precision
+//   fp4       = 0.125 — eighth precision
+// Unrecognized categories default to 1.0. Multiplying raw TOPS by the weight
+// gives fp32-equivalent TOPS for cross-precision comparison on one scale.
+func precisionWeight(category string) float64 {
+	switch category {
+	case "fp64":
+		return 2.0
+	case "fp32_tf32":
+		return 1.0
+	case "fp16_bf16":
+		return 0.5
+	case "fp8":
+		return 0.25
+	case "fp4":
+		return 0.125
+	default:
+		return 1.0
+	}
+}
+
 func stripBenchmarkPrefix(line string) string {
 	if strings.HasPrefix(line, "[gpu ") {
 		if idx := strings.Index(line, "] "); idx >= 0 {
@@ -890,11 +970,39 @@ func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary

 func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 	score := BenchmarkScorecard{}
-	for _, precision := range gpu.PrecisionResults {
-		if precision.Supported {
-			score.ComputeScore += precision.TeraOpsPerSec
+
+	// SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases.
+	// Each precision ran alone with full GPU dedicated — peak capability.
+	for _, p := range gpu.PrecisionSteady {
+		score.SyntheticScore += p.WeightedTeraOpsPerSec
+	}
+
+	// MixedScore: sum of fp32-equivalent TOPS from the combined phase.
+	// All precisions compete simultaneously — closer to real inference workloads.
+	for _, p := range gpu.PrecisionResults {
+		if p.Supported {
+			score.MixedScore += p.WeightedTeraOpsPerSec
 		}
 	}
+
+	// MixedEfficiency = MixedScore / SyntheticScore.
+	// Measures how well the GPU sustains throughput under concurrent mixed load.
+	// A healthy GPU scores ~0.8–0.95; severe degradation suggests bandwidth
+	// contention or scheduler inefficiency.
+	if score.SyntheticScore > 0 && score.MixedScore > 0 {
+		score.MixedEfficiency = score.MixedScore / score.SyntheticScore
+	}
+
+	// ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3).
+	// SyntheticScore is the primary signal; MixedEfficiency adds up to +30%
+	// bonus for GPUs that handle mixed-precision concurrency well.
+	// Falls back to MixedScore alone when per-precision data is absent.
+	switch {
+	case score.SyntheticScore > 0:
+		score.ComputeScore = score.SyntheticScore * (1 + score.MixedEfficiency*0.3)
+	case score.MixedScore > 0:
+		score.ComputeScore = score.MixedScore
+	}
 	// PowerSustainScore: measures how close the GPU came to its rated TDP under
 	// a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP.
 	// Penalty applied symmetrically for both under- and over-TDP deviations:
@@ -915,7 +1023,19 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
 	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
 	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
 	score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
-	score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
+	// StabilityScore: prefer per-precision steady phases where each window runs a
+	// single kernel type so PowerCVPct is a genuine stability signal (not a
+	// workload-mix artifact). Fall back to combined steady using clock-only metrics
+	// when per-precision data is absent (older results, short profiles).
+	if len(gpu.PrecisionSteady) > 0 {
+		var sum float64
+		for _, p := range gpu.PrecisionSteady {
+			sum += clampScore(100 - (p.Steady.ClockCVPct*4 + p.Steady.PowerCVPct*2 + p.Steady.ClockDriftPct*2))
+		}
+		score.StabilityScore = sum / float64(len(gpu.PrecisionSteady))
+	} else {
+		score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.ClockDriftPct*2))
+	}
 	score.CompositeScore = compositeBenchmarkScore(score)
 	if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
 		score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
@@ -963,6 +1083,12 @@ func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStat
 	if normalizationStatus != "full" {
 		reasons = append(reasons, "normalization_partial")
 	}
+	if gpu.ECC.Uncorrected > 0 {
+		reasons = append(reasons, "ecc_uncorrected_errors")
+	}
+	if gpu.ECC.Corrected > 0 {
+		reasons = append(reasons, "ecc_corrected_errors")
+	}
 	return dedupeStrings(reasons)
 }

@@ -1064,6 +1190,36 @@ func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThro
 	}
 }

+func queryECCCounters(gpuIndex int) (BenchmarkECCCounters, error) {
+	out, err := satExecCommand(
+		"nvidia-smi",
+		"--id="+strconv.Itoa(gpuIndex),
+		"--query-gpu=ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total",
+		"--format=csv,noheader,nounits",
+	).Output()
+	if err != nil {
+		return BenchmarkECCCounters{}, err
+	}
+	fields := strings.Split(strings.TrimSpace(string(out)), ",")
+	if len(fields) < 2 {
+		return BenchmarkECCCounters{}, fmt.Errorf("unexpected ECC counter columns: %q", strings.TrimSpace(string(out)))
+	}
+	corrected, err1 := strconv.ParseUint(strings.TrimSpace(fields[0]), 10, 64)
+	uncorrected, err2 := strconv.ParseUint(strings.TrimSpace(fields[1]), 10, 64)
+	if err1 != nil || err2 != nil {
+		// ECC may be disabled on this GPU — return zero counters silently.
+		return BenchmarkECCCounters{}, nil
+	}
+	return BenchmarkECCCounters{Corrected: corrected, Uncorrected: uncorrected}, nil
+}
+
+func diffECCCounters(before, after BenchmarkECCCounters) BenchmarkECCCounters {
+	return BenchmarkECCCounters{
+		Corrected:   saturatingSub(after.Corrected, before.Corrected),
+		Uncorrected: saturatingSub(after.Uncorrected, before.Uncorrected),
+	}
+}
+
 func queryActiveComputeApps(gpuIndices []int) ([]string, error) {
 	args := []string{
 		"--query-compute-apps=gpu_uuid,pid,process_name",
@@ -1141,6 +1297,10 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 				findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
 			case "normalization_partial":
 				findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
+			case "ecc_uncorrected_errors":
+				findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected))
+			case "ecc_corrected_errors":
+				findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
 			}
 		}
 		if gpu.Backend == "driver-ptx" {
@@ -1580,20 +1740,75 @@ func runNvidiaBenchmarkParallel(
 		}
 	}

+	// ── Per-precision stability phases (parallel) ─────────────────────────────
+	totalSlots := len(benchmarkPrecisionPhases) + 1
+	perPhaseSec := spec.SteadySec / totalSlots
+	if perPhaseSec < 60 {
+		perPhaseSec = 60
+	}
+	eccBase := make(map[int]BenchmarkECCCounters, len(selected))
+	for _, idx := range selected {
+		eccBase[idx], _ = queryECCCounters(idx)
+	}
+	for _, prec := range benchmarkPrecisionPhases {
+		phaseCmd := []string{
+			"bee-gpu-burn",
+			"--seconds", strconv.Itoa(perPhaseSec),
+			"--size-mb", strconv.Itoa(opts.SizeMB),
+			"--devices", allDevices,
+			"--precision", prec,
+		}
+		logFunc(fmt.Sprintf("GPUs %s: %s stability phase (%ds)", allDevices, prec, perPhaseSec))
+		phaseLogName := "gpu-all-steady-" + prec
+		eccBeforePhase := make(map[int]BenchmarkECCCounters, len(selected))
+		for _, idx := range selected {
+			eccBeforePhase[idx], _ = queryECCCounters(idx)
+		}
+		phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, runDir, phaseLogName, logFunc)
+		eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected))
+		for _, idx := range selected {
+			eccAfterPhase[idx], _ = queryECCCounters(idx)
+		}
+		if phaseErr != nil || len(phaseRows) == 0 {
+			continue
+		}
+		parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseOut))
+		for _, idx := range selected {
+			perGPU := filterRowsByGPU(phaseRows, idx)
+			if len(perGPU) == 0 {
+				continue
+			}
+			phase := BenchmarkPrecisionSteadyPhase{
+				Precision: prec,
+				Steady:    summarizeBenchmarkTelemetry(perGPU),
+				ECC:       diffECCCounters(eccBeforePhase[idx], eccAfterPhase[idx]),
+			}
+			if pr, ok := parseByGPU[idx]; ok {
+				for _, p := range pr.Profiles {
+					if p.Supported {
+						phase.TeraOpsPerSec += p.TeraOpsPerSec
+						phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
+					}
+				}
+			}
+			gpuResults[idx].PrecisionSteady = append(gpuResults[idx].PrecisionSteady, phase)
+		}
+	}
+
 	// Snapshot throttle counters before steady.
 	beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
 	for _, idx := range selected {
 		beforeThrottle[idx], _ = queryThrottleCounters(idx)
 	}

-	// Steady: all GPUs simultaneously.
+	// Steady: all GPUs simultaneously (combined). Fixed at one slot = perPhaseSec.
 	steadyCmd := []string{
 		"bee-gpu-burn",
-		"--seconds", strconv.Itoa(spec.SteadySec),
+		"--seconds", strconv.Itoa(perPhaseSec),
 		"--size-mb", strconv.Itoa(opts.SizeMB),
 		"--devices", allDevices,
 	}
-	logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (%ds)", allDevices, spec.SteadySec))
+	logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, perPhaseSec))

 	// Sample server power via IPMI in parallel with steady phase.
 	ipmiStopCh := make(chan struct{})
@@ -1649,6 +1864,9 @@ func runNvidiaBenchmarkParallel(
 		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
 		gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
 		gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
+		if eccFinal, err := queryECCCounters(idx); err == nil {
+			gpuResults[idx].ECC = diffECCCounters(eccBase[idx], eccFinal)
+		}

 		if pr, ok := parseResults[idx]; ok {
 			gpuResults[idx].ComputeCapability = pr.ComputeCapability