diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index df350e3..9533315 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -27,14 +27,17 @@ type benchmarkProfileSpec struct { } type benchmarkGPUInfo struct { - Index int - UUID string - Name string - BusID string - VBIOS string - PowerLimitW float64 - MaxGraphicsClockMHz float64 - MaxMemoryClockMHz float64 + Index int + UUID string + Name string + BusID string + VBIOS string + PowerLimitW float64 + DefaultPowerLimitW float64 + MaxGraphicsClockMHz float64 + MaxMemoryClockMHz float64 + BaseGraphicsClockMHz float64 + MultiprocessorCount int } type benchmarkBurnProfile struct { @@ -111,6 +114,11 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected))) + // Server power characterization state — populated during per-GPU phases. + var serverIdleW, serverLoadedWSum float64 + var serverIdleOK, serverLoadedOK bool + var serverLoadedSamples int + infoByIndex, infoErr := queryBenchmarkGPUInfo(selected) if infoErr != nil { result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error()) @@ -146,7 +154,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv gpuResult.BusID = info.BusID gpuResult.VBIOS = info.VBIOS gpuResult.PowerLimitW = info.PowerLimitW + gpuResult.MultiprocessorCount = info.MultiprocessorCount + gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz + gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz } if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { @@ -161,6 +172,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv gpuResult.Baseline = 
summarizeBenchmarkTelemetry(baselineRows) writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows) + // Sample server idle power once (first GPU only — server state is global). + if !serverIdleOK { + if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok { + serverIdleW = w + serverIdleOK = true + logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) + } + } + warmupCmd := []string{ "bee-gpu-burn", "--seconds", strconv.Itoa(spec.WarmupSec), @@ -184,7 +204,50 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv "--devices", strconv.Itoa(idx), } logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec)) + + // Sample server power via IPMI in parallel with the steady phase. + // We collect readings every 5s and average them. + ipmiStopCh := make(chan struct{}) + ipmiResultCh := make(chan float64, 1) + go func() { + defer close(ipmiResultCh) + var samples []float64 + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + // First sample after a short warmup delay. 
+ select { + case <-ipmiStopCh: + return + case <-time.After(15 * time.Second): + } + for { + if w, err := queryIPMIServerPowerW(); err == nil { + samples = append(samples, w) + } + select { + case <-ipmiStopCh: + if len(samples) > 0 { + var sum float64 + for _, w := range samples { + sum += w + } + ipmiResultCh <- sum / float64(len(samples)) + } + return + case <-ticker.C: + } + } + }() + steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc) + close(ipmiStopCh) + if loadedW, ok := <-ipmiResultCh; ok { + serverLoadedWSum += loadedW + serverLoadedSamples++ + serverLoadedOK = true + logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW)) + } + _ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644) afterThrottle, _ := queryThrottleCounters(idx) if steadyErr != nil { @@ -232,6 +295,17 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } } + // Compute server power characterization from accumulated IPMI samples. 
+ var gpuReportedSumW float64 + for _, gpu := range result.GPUs { + gpuReportedSumW += gpu.Steady.AvgPowerW + } + var serverLoadedW float64 + if serverLoadedSamples > 0 { + serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples) + } + result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK) + result.Findings = buildBenchmarkFindings(result) result.OverallStatus = benchmarkOverallStatus(result) @@ -290,7 +364,7 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec { func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { args := []string{ - "--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory", + "--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit", "--format=csv,noheader,nounits", } if len(gpuIndices) > 0 { @@ -311,14 +385,14 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { infoByIndex := make(map[int]benchmarkGPUInfo, len(rows)) for _, row := range rows { - if len(row) < 8 { + if len(row) < 9 { continue } idx, err := strconv.Atoi(strings.TrimSpace(row[0])) if err != nil { continue } - infoByIndex[idx] = benchmarkGPUInfo{ + info := benchmarkGPUInfo{ Index: idx, UUID: strings.TrimSpace(row[1]), Name: strings.TrimSpace(row[2]), @@ -328,6 +402,16 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]), MaxMemoryClockMHz: parseBenchmarkFloat(row[7]), } + if len(row) >= 9 { + info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8]) + } + if len(row) >= 10 { + info.MultiprocessorCount = int(parseBenchmarkFloat(row[9])) + } + if len(row) >= 11 { + info.DefaultPowerLimitW = parseBenchmarkFloat(row[10]) + } + infoByIndex[idx] = info } return infoByIndex, nil } @@ -551,6 +635,8 @@ 
func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri } category := "other" switch { + case strings.HasPrefix(name, "fp64"): + category = "fp64" case strings.HasPrefix(name, "fp32"): category = "fp32_tf32" case strings.HasPrefix(name, "fp16"): @@ -627,6 +713,9 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { score.ThermalSustainScore = clampScore(100 - thermalRatio*100) score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2)) score.CompositeScore = compositeBenchmarkScore(score) + if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 { + score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0) + } return score } @@ -798,10 +887,30 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult { func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { var findings []string + + passed := 0 + for _, gpu := range result.GPUs { + if gpu.Status == "OK" { + passed++ + } + } + total := len(result.GPUs) + if total > 0 { + if passed == total { + findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total)) + } else { + findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total)) + } + } + if result.Normalization.Status != "full" { findings = append(findings, "Environment normalization was partial; compare results with caution.") } for _, gpu := range result.GPUs { + if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 { + findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index)) + continue + } if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" { findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index)) continue @@ -825,10 
+934,24 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { if gpu.Backend == "driver-ptx" { findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index)) } + if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 { + findings = append(findings, fmt.Sprintf( + "GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.", + gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100, + )) + } } if result.Interconnect != nil && result.Interconnect.Supported { findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps)) } + if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 { + if sp.ReportingRatio < 0.75 { + findings = append(findings, fmt.Sprintf( + "GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.", + sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio, + )) + } + } return dedupeStrings(findings) } @@ -1007,3 +1130,76 @@ func maxInt(a, b int) int { } return b } + +// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi. +// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed. 
+func queryIPMIServerPowerW() (float64, error) { + out, err := satExecCommand("ipmitool", "dcmi", "power", "reading").Output() + if err != nil { + return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err) + } + for _, line := range strings.Split(string(out), "\n") { + if strings.Contains(line, "Current Power") { + parts := strings.SplitN(line, ":", 2) + if len(parts) == 2 { + val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts")) + val = strings.TrimSpace(val) + w, err := strconv.ParseFloat(val, 64) + if err == nil && w > 0 { + return w, nil + } + } + } + } + return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output") +} + +// sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for +// durationSec seconds. Returns the mean of all successful samples. +// Returns 0, false if IPMI is unavailable. +func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) { + if durationSec <= 0 { + return 0, false + } + deadline := time.Now().Add(time.Duration(durationSec) * time.Second) + var samples []float64 + for { + if w, err := queryIPMIServerPowerW(); err == nil { + samples = append(samples, w) + } + if time.Now().After(deadline) { + break + } + select { + case <-ctx.Done(): + break + case <-time.After(2 * time.Second): + } + } + if len(samples) == 0 { + return 0, false + } + var sum float64 + for _, w := range samples { + sum += w + } + return sum / float64(len(samples)), true +} + +// characterizeServerPower computes BenchmarkServerPower from idle and loaded +// IPMI samples plus the GPU-reported average power during steady state. 
+func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower { + sp := &BenchmarkServerPower{Available: ipmiAvailable} + if !ipmiAvailable { + sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped") + return sp + } + sp.IdleW = idleW + sp.LoadedW = loadedW + sp.DeltaW = loadedW - idleW + sp.GPUReportedSumW = gpuReportedSumW + if gpuReportedSumW > 0 && sp.DeltaW > 0 { + sp.ReportingRatio = sp.DeltaW / gpuReportedSumW + } + return sp +} diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index b709090..79c6a49 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -56,6 +56,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc fmt.Fprintf(&b, " Status: %s\n", gpu.Status) fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore) fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore) + if gpu.Scores.TOPSPerSMPerGHz > 0 { + fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz) + } fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore) fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore) fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore) @@ -77,13 +80,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc } } } - fmt.Fprintf(&b, " Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n", - gpu.Throttle.SWPowerCapUS, - gpu.Throttle.SWThermalSlowdownUS, - gpu.Throttle.SyncBoostUS, - gpu.Throttle.HWThermalSlowdownUS, - gpu.Throttle.HWPowerBrakeSlowdownUS, - ) + fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)) if len(gpu.Notes) > 0 { fmt.Fprintf(&b, " Notes:\n") for _, note := range gpu.Notes { @@ -121,6 +118,26 @@ 
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc } } + if sp := result.ServerPower; sp != nil { + fmt.Fprintf(&b, "Server Power (IPMI)\n") + fmt.Fprintf(&b, "-------------------\n") + if !sp.Available { + fmt.Fprintf(&b, "Unavailable\n") + } else { + fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW) + fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW) + fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW) + fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW) + if sp.ReportingRatio > 0 { + fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio) + } + } + for _, note := range sp.Notes { + fmt.Fprintf(&b, " Note: %s\n", note) + } + b.WriteString("\n") + } + fmt.Fprintf(&b, "Methodology\n") fmt.Fprintf(&b, "-----------\n") fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile) @@ -175,6 +192,42 @@ func stripANSIEscapeSequences(raw string) string { return ansiEscapePattern.ReplaceAllString(raw, "") } +// formatThrottleLine renders throttle counters as human-readable percentages of +// the steady-state window. Only non-zero counters are shown. When the steady +// duration is unknown (0), raw seconds are shown instead. 
+func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string { + type counter struct { + label string + us uint64 + } + counters := []counter{ + {"sw_power", t.SWPowerCapUS}, + {"sw_thermal", t.SWThermalSlowdownUS}, + {"sync_boost", t.SyncBoostUS}, + {"hw_thermal", t.HWThermalSlowdownUS}, + {"hw_power_brake", t.HWPowerBrakeSlowdownUS}, + } + var parts []string + for _, c := range counters { + if c.us == 0 { + continue + } + sec := float64(c.us) / 1e6 + if steadyDurationSec > 0 { + pct := sec / steadyDurationSec * 100 + parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec)) + } else if sec < 1 { + parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000)) + } else { + parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec)) + } + } + if len(parts) == 0 { + return "none" + } + return strings.Join(parts, " ") +} + func renderBenchmarkSummary(result NvidiaBenchmarkResult) string { var b strings.Builder fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339)) diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index a8b5618..8861e61 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -28,6 +28,7 @@ type NvidiaBenchmarkResult struct { Normalization BenchmarkNormalization `json:"normalization"` GPUs []BenchmarkGPUResult `json:"gpus"` Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"` + ServerPower *BenchmarkServerPower `json:"server_power,omitempty"` } type BenchmarkNormalization struct { @@ -56,7 +57,10 @@ type BenchmarkGPUResult struct { Backend string `json:"backend,omitempty"` Status string `json:"status"` PowerLimitW float64 `json:"power_limit_w,omitempty"` + MultiprocessorCount int `json:"multiprocessor_count,omitempty"` + DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"` + 
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"` MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"` LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"` LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"` @@ -117,6 +121,24 @@ type BenchmarkScorecard struct { StabilityScore float64 `json:"stability_score"` InterconnectScore float64 `json:"interconnect_score"` CompositeScore float64 `json:"composite_score"` + // TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count. + // Comparable across throttle levels and GPU generations. Low value at normal + // clocks indicates silicon degradation. + TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"` +} + +// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported +// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power +// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is +// over-reporting its power consumption. +type BenchmarkServerPower struct { + Available bool `json:"available"` + IdleW float64 `json:"idle_w,omitempty"` + LoadedW float64 `json:"loaded_w,omitempty"` + DeltaW float64 `json:"delta_w,omitempty"` + GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"` + ReportingRatio float64 `json:"reporting_ratio,omitempty"` + Notes []string `json:"notes,omitempty"` } type BenchmarkInterconnectResult struct { diff --git a/bible-local/docs/benchmark-clock-calibration.md b/bible-local/docs/benchmark-clock-calibration.md new file mode 100644 index 0000000..4437467 --- /dev/null +++ b/bible-local/docs/benchmark-clock-calibration.md @@ -0,0 +1,248 @@ +# Benchmark clock calibration research + +## Status +In progress. Baseline data from production servers pending. + +## Background + +The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc` +before the steady-state phase. 
The metric `low_sm_clock_vs_target` fires when
+`avg_steady_clock < locked_target * 0.90`.
+
+Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
+even a healthy GPU in a non-ideal server will sustain clocks well below boost.
+The 90% threshold has no empirical basis.
+
+## Key observations (2026-04-06)
+
+### H100 PCIe — new card, server not designed for it
+- avg clock 1384 MHz, P95 1560 MHz (unstable, vs boost 1755 MHz)
+- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
+- Stability: 70.0 — clocks erratic, no equilibrium found
+- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
+
+### H200 NVL — new card, server not designed for it
+- avg clock = P95 = 1635 MHz (perfectly stable)
+- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
+- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
+- Degradation: power_capped, thermal_limited
+- Compute: 989 TOPS — card is computing correctly for its frequency
+
+### Key insight
+The meaningful distinction is not *whether* the card throttles but *how stably*
+it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
+H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
+instability may reflect a more severe thermal mismatch or a card issue.
+
+`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
+`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
+
+## Hypothesis for baseline
+
+After testing on servers designed for their GPUs (proper cooling):
+- Healthy GPU under sustained load will run at a stable fraction of boost
+- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
+- Base clock (`clocks.base.gr`) may be a better reference than boost:
+  a healthy card under real workload should comfortably exceed base clock
+
+## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
+
+Source: external stress test tool, ~90s runs, designed server, adequate power.
+
+### Healthy fingerprint
+
+- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
+- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
+  - Avg steady (visual): **~1580–1620 MHz**
+  - vs boost 1755 MHz: **~91–92%**
+  - Oscillation is NORMAL — this is the boost algorithm balancing under power cap
+  - Stable power + oscillating clocks = healthy power-cap behavior
+- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
+- **Consistency**: all 10 samples within ±20 MHz — very repeatable
+
+### Characteristic pattern
+Flat power line + oscillating/declining clock line = GPU correctly managed by
+power cap algorithm. Do NOT flag this as instability.
+
+### Clock CV implication
+The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
+The current `variance_too_high` threshold (StabilityScore < 85) may fire on
+healthy HBM2e PCIe cards. Needs recalibration.
+
+---
+
+## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
+
+Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
+Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
+
+
+ +### GPU clock reference (from nvidia-smi, idle): +- base_clock_mhz: **1095** +- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle) +- achieved_max_clock_mhz: **1980** (actual burst max observed by tool) +- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip + +### Observed under 700W sustained load (both samples nearly identical): +- Power: ~700W flat — SXM slot, adequate power confirmed +- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz** +- vs 1980 MHz (lock target): **72–74%** — severely below +- vs 1755 MHz (nvidia-smi boost): **81–83%** +- vs 1095 MHz (base): 130% — above base but far below expected for SXM +- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency +- Temperature: 38°C → 79–80°C (same rate as HBM2e) +- Oscillation: present, similar character to HBM2e but at much lower frequency + +### Diagnosis +These restored cards are degraded. A healthy H100 SXM in a designed server +(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980). +The 72–74% result is a clear signal of silicon or VRM degradation from the +refurbishment process. + +### Clock pattern note +Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical +to images 19/20. Both sample sets show same degraded pattern — same batch. + +--- + +## Baseline matrix (filled where data available) + +| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes | +|---|---|---|---|---|---| +| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy | +| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded | +| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. 
| need real baseline | +| H200 NVL | designed | TBD | TBD | TBD | need baseline | + +--- + +## H100 official spec (from NVIDIA datasheet) + +Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06). +All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense. + +| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory | +|---|---|---|---|---|---| +| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e | +| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 | +| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 | +| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e | + +Notes: +- SXM boards do NOT list FP8 peak in this table (field empty) +- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests +- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM) + +## Observed efficiency (H100 80GB PCIe, throttled server) + +From the report in this session (power+thermal throttle throughout steady): + +| Precision | Measured | Spec (dense) | % of spec | +|---|---|---|---| +| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% | +| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% | +| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% | + +33–44% of spec is expected given sustained power+thermal throttle (avg clock +1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its +actual frequency — the low TOPS comes from throttle, not silicon defect. + +## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06) + +Format: without sparsity / with sparsity. + +| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory | +|---|---|---|---|---|---| +| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB | +| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB | + +## Observed efficiency (H200 NVL PCIe, throttled non-designed server) + +Avg clock 1635 MHz (62% of boost ~2619 MHz). 
Entire steady in thermal throttle. + +| Precision | Measured | Spec (dense) | % of spec | +|---|---|---|---| +| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% | +| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% | +| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% | + +Comparable to H100 PCIe efficiency (33–44%) despite different architecture — +both are throttle-limited. Confirms that % of spec is not a quality signal, +it reflects the thermal environment. tops_per_sm_per_ghz is the right metric. + +## Real-world GEMM efficiency reference (2026-04-06, web research) + +Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization +worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis. + +### What healthy systems actually achieve: +- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec** +- cuBLAS large square GEMM (8192³): up to **~83% flop utilization** +- H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16 + +### Our results vs expectation: +| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap | +|---|---|---|---|---| +| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below | +| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below | + +Our results are roughly **half** of what a healthy system achieves even under throttle. +This is NOT normal — 30-44% is not the industry baseline. + +### Likely causes of the gap (in order of probability): +1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window +2. **Power limit below TDP** — GPU may be software-limited below 350W/600W. + Previous user may have set a lower limit via nvidia-smi -pl and it was not + reset. Our normalization sets clock locks but does NOT reset power limit. + Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced. +3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16, + 8192×8192×4096 for fp8. These are large enough for peak tensor utilization. 
+ +### Power limit gap analysis (H100 PCIe): +- Avg clock 1384 MHz = 79% of boost 1755 MHz +- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS +- Actually measured: 329 TOPS = 55% of that estimate +- Remaining gap after accounting for clock throttle: ~45% +- Most likely explanation: enforced power limit < 350W TDP, further reducing + sustainable clock beyond what sw_thermal alone would cause. + +### Action item: +Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo +so result.json shows if the card was pre-configured with a non-default limit. +If enforced < default × 0.95 → add finding "GPU power limit is below default TDP". + +### CPU/RAM impact on GPU FLOPS: +None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM. +CPU core count and host RAM are irrelevant. + +## Compute efficiency metric (proposed, no hardcode) + +Instead of comparing TOPS to a hardcoded spec, compute: + tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz) + +This is model-agnostic. A GPU computing correctly at its actual frequency +will show a consistent tops_per_sm_per_ghz regardless of throttle level. +A GPU with degraded silicon will show low tops_per_sm_per_ghz even at +normal clocks. + +SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count +(needs to be added to queryBenchmarkGPUInfo). + +Reference values to establish after baseline runs: +- H100 PCIe fp16_tensor: TBD tops/SM/GHz +- H100 SXM fp16_tensor: TBD tops/SM/GHz + +## Proposed threshold changes (pending more data) + +1. **`low_sm_clock_vs_target`**: raise threshold from 90% to 85% based on observed + 91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already + capture the root cause. + +2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate + under power cap. Consider suppressing this flag when power is flat and usage + is 100% (oscillation is expected). Or lower threshold to 70. + +3. 
**New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available, + ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1 + would have been caught by this). + +Decision deferred until baseline on SXM designed servers collected. diff --git a/iso/builder/bee-gpu-stress.c b/iso/builder/bee-gpu-stress.c index ef25e9a..f84f995 100644 --- a/iso/builder/bee-gpu-stress.c +++ b/iso/builder/bee-gpu-stress.c @@ -606,6 +606,20 @@ struct prepared_profile { }; static const struct profile_desc k_profiles[] = { + { + "fp64", + "fp64", + 80, + 1, + 0, + 0, + 8, + CUDA_R_64F, + CUDA_R_64F, + CUDA_R_64F, + CUDA_R_64F, + CUBLAS_COMPUTE_64F, + }, { "fp32_tf32", "fp32",