diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 586a290..57cbb8a 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -444,8 +444,11 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b // Split the verbose output into per-GPU sections on "^GPU " lines. gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`) - maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`) - maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`) + maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`) + maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`) + defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`) + currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`) + smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`) sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1) for i, loc := range sectionStarts { @@ -466,17 +469,14 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b continue } - info := infoByIndex[benchIdx] - if info.MaxGraphicsClockMHz > 0 && info.MaxMemoryClockMHz > 0 { - continue // already populated - } - end := len(nvsmiQ) if i+1 < len(sectionStarts) { end = sectionStarts[i+1][0] } section := nvsmiQ[loc[0]:end] + info := infoByIndex[benchIdx] + if info.MaxGraphicsClockMHz == 0 { if m := maxGfxRe.FindSubmatch(section); m != nil { if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil { @@ -491,6 +491,27 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b } } } + if info.DefaultPowerLimitW == 0 { + if m := defaultPwrRe.FindSubmatch(section); m != nil { + if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 { + info.DefaultPowerLimitW = v + } + } + } + if info.PowerLimitW == 0 { + if m := currentPwrRe.FindSubmatch(section); m != nil { + if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 { + info.PowerLimitW = v + } + } + } + if info.MultiprocessorCount == 0 { + if m := smCountRe.FindSubmatch(section); m != nil { + if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 { + info.MultiprocessorCount = v + } + } + } infoByIndex[benchIdx] = info } } @@ -857,19 +878,22 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { score.ComputeScore += precision.TeraOpsPerSec } } - // PowerSustainScore: prefer calibrated peak power (measured under targeted_power - // load) as the reference — it reflects what this GPU actually reaches under a - // full-spectrum workload, unlike the hardware default limit which bee-gpu-burn - // cannot reach. Fall back to default limit, then enforced limit. - referencePowerW := gpu.CalibratedPeakPowerW - if referencePowerW <= 0 { - referencePowerW = gpu.DefaultPowerLimitW - } - if referencePowerW <= 0 { - referencePowerW = gpu.PowerLimitW - } - if referencePowerW > 0 { - score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/referencePowerW)*100) + // PowerSustainScore: measures how close the GPU came to its rated TDP under + // a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP. + // Penalty applied symmetrically for both under- and over-TDP deviations: + // score = max(0, 100 − |measured − rated| / rated × 100) + // Under-TDP → power delivery / cooling issue. + // Over-TDP → power limit not properly enforced / power regulation fault. + // Falls back to 0 if calibration was not performed (dcgmi unavailable). + { + ref := gpu.DefaultPowerLimitW + if ref <= 0 { + ref = gpu.PowerLimitW + } + if gpu.CalibratedPeakPowerW > 0 && ref > 0 { + deviationPct := math.Abs(gpu.CalibratedPeakPowerW-ref) / ref * 100 + score.PowerSustainScore = clampScore(100 - deviationPct) + } } runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS @@ -887,8 +911,8 @@ func compositeBenchmarkScore(score BenchmarkScorecard) float64 { // base 0.35 — floor so a GPU that fails all sustain checks still scores // thermal 0.25 — heaviest: throttle counters are the most reliable signal // stability 0.25 — clock/power variance matters for reproducibility - // power 0.15 — honest with calibrated reference; lower because - // bee-gpu-burn is compute-only (not mem+compute like TDP test) + // power 0.15 — GPU reaches rated TDP under targeted_power? lower weight + // because calibration may be absent (dcgmi not installed) // NCCL bonus 0.10 — interconnect health // cap 1.10 quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0) @@ -1111,6 +1135,28 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100, )) } + // Flag significant TDP deviation (over or under) from calibration. + if gpu.CalibratedPeakPowerW > 0 { + ref := gpu.DefaultPowerLimitW + if ref <= 0 { + ref = gpu.PowerLimitW + } + if ref > 0 { + deviationPct := (gpu.CalibratedPeakPowerW - ref) / ref * 100 + switch { + case deviationPct < -10: + findings = append(findings, fmt.Sprintf( + "GPU %d reached only %.0f W (%.0f%% of rated %.0f W) under targeted_power. Check power delivery or cooling.", + gpu.Index, gpu.CalibratedPeakPowerW, gpu.CalibratedPeakPowerW/ref*100, ref, + )) + case deviationPct > 5: + findings = append(findings, fmt.Sprintf( + "GPU %d exceeded rated TDP: %.0f W measured vs %.0f W rated (+%.0f%%). Power limit may not be enforced correctly.", + gpu.Index, gpu.CalibratedPeakPowerW, ref, deviationPct, + )) + } + } + } } if result.Interconnect != nil && result.Interconnect.Supported { findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))