Enhance benchmark: server power via IPMI, efficiency metrics, FP64, power limit check
- Sample server power (IPMI dcmi) during baseline+steady phases in parallel; compute delta vs GPU-reported sum; flag ratio < 0.75 as unreliable reporting - Collect base_graphics_clock_mhz, multiprocessor_count, default_power_limit_w from nvidia-smi alongside existing GPU info - Add tops_per_sm_per_ghz efficiency metric (model-agnostic silicon quality signal) - Flag when enforced power limit is below default TDP by >5% - Add fp64 profile to bee-gpu-burn worker (CUDA_R_64F, CUBLAS_COMPUTE_64F, min cc 8.0) - Improve Executive Summary: overall pass count, FAILED GPU finding - Throttle counters now shown as % of steady window instead of raw microseconds - bible-local: clock calibration research, H100/H200 spec, real-world GEMM baselines Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -33,8 +33,11 @@ type benchmarkGPUInfo struct {
|
|||||||
BusID string
|
BusID string
|
||||||
VBIOS string
|
VBIOS string
|
||||||
PowerLimitW float64
|
PowerLimitW float64
|
||||||
|
DefaultPowerLimitW float64
|
||||||
MaxGraphicsClockMHz float64
|
MaxGraphicsClockMHz float64
|
||||||
MaxMemoryClockMHz float64
|
MaxMemoryClockMHz float64
|
||||||
|
BaseGraphicsClockMHz float64
|
||||||
|
MultiprocessorCount int
|
||||||
}
|
}
|
||||||
|
|
||||||
type benchmarkBurnProfile struct {
|
type benchmarkBurnProfile struct {
|
||||||
@@ -111,6 +114,11 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
|
|
||||||
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
|
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
|
||||||
|
|
||||||
|
// Server power characterization state — populated during per-GPU phases.
|
||||||
|
var serverIdleW, serverLoadedWSum float64
|
||||||
|
var serverIdleOK, serverLoadedOK bool
|
||||||
|
var serverLoadedSamples int
|
||||||
|
|
||||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||||
if infoErr != nil {
|
if infoErr != nil {
|
||||||
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
||||||
@@ -146,7 +154,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
gpuResult.BusID = info.BusID
|
gpuResult.BusID = info.BusID
|
||||||
gpuResult.VBIOS = info.VBIOS
|
gpuResult.VBIOS = info.VBIOS
|
||||||
gpuResult.PowerLimitW = info.PowerLimitW
|
gpuResult.PowerLimitW = info.PowerLimitW
|
||||||
|
gpuResult.MultiprocessorCount = info.MultiprocessorCount
|
||||||
|
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
|
||||||
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
|
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
|
||||||
|
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||||
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||||
}
|
}
|
||||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||||
@@ -161,6 +172,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
|
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
|
||||||
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
|
||||||
|
|
||||||
|
// Sample server idle power once (first GPU only — server state is global).
|
||||||
|
if !serverIdleOK {
|
||||||
|
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
|
||||||
|
serverIdleW = w
|
||||||
|
serverIdleOK = true
|
||||||
|
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
warmupCmd := []string{
|
warmupCmd := []string{
|
||||||
"bee-gpu-burn",
|
"bee-gpu-burn",
|
||||||
"--seconds", strconv.Itoa(spec.WarmupSec),
|
"--seconds", strconv.Itoa(spec.WarmupSec),
|
||||||
@@ -184,7 +204,50 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
"--devices", strconv.Itoa(idx),
|
"--devices", strconv.Itoa(idx),
|
||||||
}
|
}
|
||||||
logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
|
logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
|
||||||
|
|
||||||
|
// Sample server power via IPMI in parallel with the steady phase.
|
||||||
|
// We collect readings every 5s and average them.
|
||||||
|
ipmiStopCh := make(chan struct{})
|
||||||
|
ipmiResultCh := make(chan float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ipmiResultCh)
|
||||||
|
var samples []float64
|
||||||
|
ticker := time.NewTicker(5 * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
// First sample after a short warmup delay.
|
||||||
|
select {
|
||||||
|
case <-ipmiStopCh:
|
||||||
|
return
|
||||||
|
case <-time.After(15 * time.Second):
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||||
|
samples = append(samples, w)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ipmiStopCh:
|
||||||
|
if len(samples) > 0 {
|
||||||
|
var sum float64
|
||||||
|
for _, w := range samples {
|
||||||
|
sum += w
|
||||||
|
}
|
||||||
|
ipmiResultCh <- sum / float64(len(samples))
|
||||||
|
}
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
|
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
|
||||||
|
close(ipmiStopCh)
|
||||||
|
if loadedW, ok := <-ipmiResultCh; ok {
|
||||||
|
serverLoadedWSum += loadedW
|
||||||
|
serverLoadedSamples++
|
||||||
|
serverLoadedOK = true
|
||||||
|
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
|
||||||
|
}
|
||||||
|
|
||||||
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
|
||||||
afterThrottle, _ := queryThrottleCounters(idx)
|
afterThrottle, _ := queryThrottleCounters(idx)
|
||||||
if steadyErr != nil {
|
if steadyErr != nil {
|
||||||
@@ -232,6 +295,17 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compute server power characterization from accumulated IPMI samples.
|
||||||
|
var gpuReportedSumW float64
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
gpuReportedSumW += gpu.Steady.AvgPowerW
|
||||||
|
}
|
||||||
|
var serverLoadedW float64
|
||||||
|
if serverLoadedSamples > 0 {
|
||||||
|
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
|
||||||
|
}
|
||||||
|
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
|
||||||
|
|
||||||
result.Findings = buildBenchmarkFindings(result)
|
result.Findings = buildBenchmarkFindings(result)
|
||||||
result.OverallStatus = benchmarkOverallStatus(result)
|
result.OverallStatus = benchmarkOverallStatus(result)
|
||||||
|
|
||||||
@@ -290,7 +364,7 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
|||||||
|
|
||||||
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
||||||
args := []string{
|
args := []string{
|
||||||
"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory",
|
"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
}
|
}
|
||||||
if len(gpuIndices) > 0 {
|
if len(gpuIndices) > 0 {
|
||||||
@@ -311,14 +385,14 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
|||||||
|
|
||||||
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
||||||
for _, row := range rows {
|
for _, row := range rows {
|
||||||
if len(row) < 8 {
|
if len(row) < 9 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
infoByIndex[idx] = benchmarkGPUInfo{
|
info := benchmarkGPUInfo{
|
||||||
Index: idx,
|
Index: idx,
|
||||||
UUID: strings.TrimSpace(row[1]),
|
UUID: strings.TrimSpace(row[1]),
|
||||||
Name: strings.TrimSpace(row[2]),
|
Name: strings.TrimSpace(row[2]),
|
||||||
@@ -328,6 +402,16 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
|||||||
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
|
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
|
||||||
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
|
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
|
||||||
}
|
}
|
||||||
|
if len(row) >= 9 {
|
||||||
|
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
||||||
|
}
|
||||||
|
if len(row) >= 10 {
|
||||||
|
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
||||||
|
}
|
||||||
|
if len(row) >= 11 {
|
||||||
|
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
||||||
|
}
|
||||||
|
infoByIndex[idx] = info
|
||||||
}
|
}
|
||||||
return infoByIndex, nil
|
return infoByIndex, nil
|
||||||
}
|
}
|
||||||
@@ -551,6 +635,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
|
|||||||
}
|
}
|
||||||
category := "other"
|
category := "other"
|
||||||
switch {
|
switch {
|
||||||
|
case strings.HasPrefix(name, "fp64"):
|
||||||
|
category = "fp64"
|
||||||
case strings.HasPrefix(name, "fp32"):
|
case strings.HasPrefix(name, "fp32"):
|
||||||
category = "fp32_tf32"
|
category = "fp32_tf32"
|
||||||
case strings.HasPrefix(name, "fp16"):
|
case strings.HasPrefix(name, "fp16"):
|
||||||
@@ -627,6 +713,9 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
|||||||
score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
|
score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
|
||||||
score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
|
score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
|
||||||
score.CompositeScore = compositeBenchmarkScore(score)
|
score.CompositeScore = compositeBenchmarkScore(score)
|
||||||
|
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
|
||||||
|
score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
|
||||||
|
}
|
||||||
return score
|
return score
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -798,10 +887,30 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
|
|||||||
|
|
||||||
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
||||||
var findings []string
|
var findings []string
|
||||||
|
|
||||||
|
passed := 0
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
if gpu.Status == "OK" {
|
||||||
|
passed++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
total := len(result.GPUs)
|
||||||
|
if total > 0 {
|
||||||
|
if passed == total {
|
||||||
|
findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
|
||||||
|
} else {
|
||||||
|
findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if result.Normalization.Status != "full" {
|
if result.Normalization.Status != "full" {
|
||||||
findings = append(findings, "Environment normalization was partial; compare results with caution.")
|
findings = append(findings, "Environment normalization was partial; compare results with caution.")
|
||||||
}
|
}
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
|
if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
|
||||||
|
findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
|
||||||
|
continue
|
||||||
|
}
|
||||||
if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
|
if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
|
||||||
findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
|
findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
|
||||||
continue
|
continue
|
||||||
@@ -825,10 +934,24 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
|||||||
if gpu.Backend == "driver-ptx" {
|
if gpu.Backend == "driver-ptx" {
|
||||||
findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
|
findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
|
||||||
}
|
}
|
||||||
|
if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
|
||||||
|
gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
|
||||||
|
))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||||
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
||||||
}
|
}
|
||||||
|
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
||||||
|
if sp.ReportingRatio < 0.75 {
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
|
||||||
|
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
return dedupeStrings(findings)
|
return dedupeStrings(findings)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1007,3 +1130,76 @@ func maxInt(a, b int) int {
|
|||||||
}
|
}
|
||||||
return b
|
return b
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
||||||
|
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
||||||
|
func queryIPMIServerPowerW() (float64, error) {
|
||||||
|
out, err := satExecCommand("ipmitool", "dcmi", "power", "reading").Output()
|
||||||
|
if err != nil {
|
||||||
|
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
|
||||||
|
}
|
||||||
|
for _, line := range strings.Split(string(out), "\n") {
|
||||||
|
if strings.Contains(line, "Current Power") {
|
||||||
|
parts := strings.SplitN(line, ":", 2)
|
||||||
|
if len(parts) == 2 {
|
||||||
|
val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts"))
|
||||||
|
val = strings.TrimSpace(val)
|
||||||
|
w, err := strconv.ParseFloat(val, 64)
|
||||||
|
if err == nil && w > 0 {
|
||||||
|
return w, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
|
||||||
|
}
|
||||||
|
|
||||||
|
// sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for
|
||||||
|
// durationSec seconds. Returns the mean of all successful samples.
|
||||||
|
// Returns 0, false if IPMI is unavailable.
|
||||||
|
func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
|
||||||
|
if durationSec <= 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||||
|
var samples []float64
|
||||||
|
for {
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||||
|
samples = append(samples, w)
|
||||||
|
}
|
||||||
|
if time.Now().After(deadline) {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
break
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
var sum float64
|
||||||
|
for _, w := range samples {
|
||||||
|
sum += w
|
||||||
|
}
|
||||||
|
return sum / float64(len(samples)), true
|
||||||
|
}
|
||||||
|
|
||||||
|
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
|
||||||
|
// IPMI samples plus the GPU-reported average power during steady state.
|
||||||
|
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
|
||||||
|
sp := &BenchmarkServerPower{Available: ipmiAvailable}
|
||||||
|
if !ipmiAvailable {
|
||||||
|
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
|
||||||
|
return sp
|
||||||
|
}
|
||||||
|
sp.IdleW = idleW
|
||||||
|
sp.LoadedW = loadedW
|
||||||
|
sp.DeltaW = loadedW - idleW
|
||||||
|
sp.GPUReportedSumW = gpuReportedSumW
|
||||||
|
if gpuReportedSumW > 0 && sp.DeltaW > 0 {
|
||||||
|
sp.ReportingRatio = sp.DeltaW / gpuReportedSumW
|
||||||
|
}
|
||||||
|
return sp
|
||||||
|
}
|
||||||
|
|||||||
@@ -56,6 +56,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
|
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
|
||||||
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
|
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
|
||||||
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
|
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
|
||||||
|
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||||
|
fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
|
||||||
|
}
|
||||||
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
|
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
|
||||||
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
|
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
|
||||||
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
|
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
|
||||||
@@ -77,13 +80,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, " Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
|
fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
|
||||||
gpu.Throttle.SWPowerCapUS,
|
|
||||||
gpu.Throttle.SWThermalSlowdownUS,
|
|
||||||
gpu.Throttle.SyncBoostUS,
|
|
||||||
gpu.Throttle.HWThermalSlowdownUS,
|
|
||||||
gpu.Throttle.HWPowerBrakeSlowdownUS,
|
|
||||||
)
|
|
||||||
if len(gpu.Notes) > 0 {
|
if len(gpu.Notes) > 0 {
|
||||||
fmt.Fprintf(&b, " Notes:\n")
|
fmt.Fprintf(&b, " Notes:\n")
|
||||||
for _, note := range gpu.Notes {
|
for _, note := range gpu.Notes {
|
||||||
@@ -121,6 +118,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if sp := result.ServerPower; sp != nil {
|
||||||
|
fmt.Fprintf(&b, "Server Power (IPMI)\n")
|
||||||
|
fmt.Fprintf(&b, "-------------------\n")
|
||||||
|
if !sp.Available {
|
||||||
|
fmt.Fprintf(&b, "Unavailable\n")
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW)
|
||||||
|
fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW)
|
||||||
|
fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW)
|
||||||
|
fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW)
|
||||||
|
if sp.ReportingRatio > 0 {
|
||||||
|
fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, note := range sp.Notes {
|
||||||
|
fmt.Fprintf(&b, " Note: %s\n", note)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Fprintf(&b, "Methodology\n")
|
fmt.Fprintf(&b, "Methodology\n")
|
||||||
fmt.Fprintf(&b, "-----------\n")
|
fmt.Fprintf(&b, "-----------\n")
|
||||||
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
||||||
@@ -175,6 +192,42 @@ func stripANSIEscapeSequences(raw string) string {
|
|||||||
return ansiEscapePattern.ReplaceAllString(raw, "")
|
return ansiEscapePattern.ReplaceAllString(raw, "")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||||
|
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||||
|
// duration is unknown (0), raw seconds are shown instead.
|
||||||
|
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||||||
|
type counter struct {
|
||||||
|
label string
|
||||||
|
us uint64
|
||||||
|
}
|
||||||
|
counters := []counter{
|
||||||
|
{"sw_power", t.SWPowerCapUS},
|
||||||
|
{"sw_thermal", t.SWThermalSlowdownUS},
|
||||||
|
{"sync_boost", t.SyncBoostUS},
|
||||||
|
{"hw_thermal", t.HWThermalSlowdownUS},
|
||||||
|
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||||||
|
}
|
||||||
|
var parts []string
|
||||||
|
for _, c := range counters {
|
||||||
|
if c.us == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sec := float64(c.us) / 1e6
|
||||||
|
if steadyDurationSec > 0 {
|
||||||
|
pct := sec / steadyDurationSec * 100
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||||||
|
} else if sec < 1 {
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||||||
|
} else {
|
||||||
|
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(parts) == 0 {
|
||||||
|
return "none"
|
||||||
|
}
|
||||||
|
return strings.Join(parts, " ")
|
||||||
|
}
|
||||||
|
|
||||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ type NvidiaBenchmarkResult struct {
|
|||||||
Normalization BenchmarkNormalization `json:"normalization"`
|
Normalization BenchmarkNormalization `json:"normalization"`
|
||||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BenchmarkNormalization struct {
|
type BenchmarkNormalization struct {
|
||||||
@@ -56,7 +57,10 @@ type BenchmarkGPUResult struct {
|
|||||||
Backend string `json:"backend,omitempty"`
|
Backend string `json:"backend,omitempty"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||||
|
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||||
|
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||||
|
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||||
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||||
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||||
@@ -117,6 +121,24 @@ type BenchmarkScorecard struct {
|
|||||||
StabilityScore float64 `json:"stability_score"`
|
StabilityScore float64 `json:"stability_score"`
|
||||||
InterconnectScore float64 `json:"interconnect_score"`
|
InterconnectScore float64 `json:"interconnect_score"`
|
||||||
CompositeScore float64 `json:"composite_score"`
|
CompositeScore float64 `json:"composite_score"`
|
||||||
|
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||||
|
// Comparable across throttle levels and GPU generations. Low value at normal
|
||||||
|
// clocks indicates silicon degradation.
|
||||||
|
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
|
||||||
|
// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
|
||||||
|
// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
|
||||||
|
// over-reporting its power consumption.
|
||||||
|
type BenchmarkServerPower struct {
|
||||||
|
Available bool `json:"available"`
|
||||||
|
IdleW float64 `json:"idle_w,omitempty"`
|
||||||
|
LoadedW float64 `json:"loaded_w,omitempty"`
|
||||||
|
DeltaW float64 `json:"delta_w,omitempty"`
|
||||||
|
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||||
|
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||||
|
Notes []string `json:"notes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type BenchmarkInterconnectResult struct {
|
type BenchmarkInterconnectResult struct {
|
||||||
|
|||||||
248
bible-local/docs/benchmark-clock-calibration.md
Normal file
248
bible-local/docs/benchmark-clock-calibration.md
Normal file
@@ -0,0 +1,248 @@
|
|||||||
|
# Benchmark clock calibration research
|
||||||
|
|
||||||
|
## Status
|
||||||
|
In progress. Baseline data from production servers pending.
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
The benchmark locks GPU clocks to `MaxGraphicsClockMHz` (boost) via `nvidia-smi -lgc`
|
||||||
|
before the steady-state phase. The metric `low_sm_clock_vs_target` fires when
|
||||||
|
`avg_steady_clock < locked_target * 0.90`.
|
||||||
|
|
||||||
|
Problem: boost clock is the theoretical maximum under ideal cooling. In practice,
|
||||||
|
even a healthy GPU in a non-ideal server will sustain clocks well below boost.
|
||||||
|
The 90% threshold has no empirical basis.
|
||||||
|
|
||||||
|
## Key observations (2026-04-06)
|
||||||
|
|
||||||
|
### H100 PCIe — new card, server not designed for it
|
||||||
|
- avg clock 1384 MHz, P95 1560 MHz (unstable, vs boost 1755 MHz)
|
||||||
|
- Thermal sustain: 0.0 (sw_thermal covers entire steady window)
|
||||||
|
- Stability: 70.0 — clocks erratic, no equilibrium found
|
||||||
|
- Degradation: power_capped, thermal_limited, low_sm_clock_vs_target, variance_too_high
|
||||||
|
|
||||||
|
### H200 NVL — new card, server not designed for it
|
||||||
|
- avg clock = P95 = 1635 MHz (perfectly stable)
|
||||||
|
- Thermal sustain: 0.0 (sw_thermal + sw_power cover entire steady window)
|
||||||
|
- Stability: 92.0 — found stable thermal equilibrium at 1635 MHz
|
||||||
|
- Degradation: power_capped, thermal_limited
|
||||||
|
- Compute: 989 TOPS — card is computing correctly for its frequency
|
||||||
|
|
||||||
|
### Key insight
|
||||||
|
The meaningful distinction is not *whether* the card throttles but *how stably*
|
||||||
|
it throttles. H200 found a thermal equilibrium (avg == P95, Stability 92),
|
||||||
|
H100 did not (avg << P95, Stability 70). Both are new cards; the H100's
|
||||||
|
instability may reflect a more severe thermal mismatch or a card issue.
|
||||||
|
|
||||||
|
`sw_power ≈ sw_thermal` pattern = server cooling constraint, card likely OK.
|
||||||
|
`hw_thermal >> sw_thermal` pattern = card itself overheating, investigate.
|
||||||
|
|
||||||
|
## Hypothesis for baseline
|
||||||
|
|
||||||
|
After testing on servers designed for their GPUs (proper cooling):
|
||||||
|
- Healthy GPU under sustained load will run at a stable fraction of boost
|
||||||
|
- Expected: avg_steady ≈ 80–95% of boost depending on model and TDP class
|
||||||
|
- Base clock (`clocks.base.gr`) may be a better reference than boost:
|
||||||
|
a healthy card under real workload should comfortably exceed base clock
|
||||||
|
|
||||||
|
## Baseline: H100 PCIe HBM2e — designed server (2026-04-06, 10 samples)
|
||||||
|
|
||||||
|
Source: external stress test tool, ~90s runs, designed server, adequate power.
|
||||||
|
|
||||||
|
### Healthy fingerprint
|
||||||
|
|
||||||
|
- **Power**: hits cap ~340–360W immediately, stays flat throughout — HEALTHY
|
||||||
|
- **Clock**: starts ~1750 MHz, oscillates and declines to ~1540–1600 MHz by 90s
|
||||||
|
- Avg steady (visual): **~1580–1620 MHz**
|
||||||
|
- vs boost 1755 MHz: **~91–92%**
|
||||||
|
- Oscillation is NORMAL — this is the boost algorithm balancing under power cap
|
||||||
|
- Stable power + oscillating clocks = healthy power-cap behavior
|
||||||
|
- **Temperature**: linear rise ~38°C → 75–80°C over 90s (no runaway)
|
||||||
|
- **Consistency**: all 10 samples within ±20 MHz — very repeatable
|
||||||
|
|
||||||
|
### Characteristic pattern
|
||||||
|
Flat power line + oscillating/declining clock line = GPU correctly managed by
|
||||||
|
power cap algorithm. Do NOT flag this as instability.
|
||||||
|
|
||||||
|
### Clock CV implication
|
||||||
|
The healthy oscillation WILL produce moderate ClockCVPct (~5–10%).
|
||||||
|
The current `variance_too_high` threshold (StabilityScore < 85) may fire on
|
||||||
|
healthy HBM2e PCIe cards. Needs recalibration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Baseline: H100 HBM3 OEM SXM Custom (restored) — 2 confirmed samples
|
||||||
|
|
||||||
|
Source: pytorch_training_loop stress test, 120s (90s stress + 30s cooldown).
|
||||||
|
Confirmed GPU: NVIDIA H100 80GB HBM3, GH100 rev a1.
|
||||||
|
|
||||||
|
### GPU clock reference (from nvidia-smi, idle):
|
||||||
|
- base_clock_mhz: **1095**
|
||||||
|
- boost_clock_mhz: **1755** (nvidia-smi `clocks.max.graphics` at idle)
|
||||||
|
- achieved_max_clock_mhz: **1980** (actual burst max observed by tool)
|
||||||
|
- Our benchmark locks to `clocks.max.graphics` = likely 1980 MHz for this chip
|
||||||
|
|
||||||
|
### Observed under 700W sustained load (both samples nearly identical):
|
||||||
|
- Power: ~700W flat — SXM slot, adequate power confirmed
|
||||||
|
- Clock steady range: **~1380–1480 MHz**, avg **~1420–1460 MHz**
|
||||||
|
- vs 1980 MHz (lock target): **72–74%** — severely below
|
||||||
|
- vs 1755 MHz (nvidia-smi boost): **81–83%**
|
||||||
|
- vs 1095 MHz (base): 130% — above base but far below expected for SXM
|
||||||
|
- Clock/Watt: ~2.1 MHz/W vs HBM2e ~4.6 MHz/W — 2× worse efficiency
|
||||||
|
- Temperature: 38°C → 79–80°C (same rate as HBM2e)
|
||||||
|
- Oscillation: present, similar character to HBM2e but at much lower frequency
|
||||||
|
|
||||||
|
### Diagnosis
|
||||||
|
These restored cards are degraded. A healthy H100 SXM in a designed server
|
||||||
|
(DGX H100, HGX H100) should sustain ~1800–1900 MHz at 700W (~91–96% of 1980).
|
||||||
|
The 72–74% result is a clear signal of silicon or VRM degradation from the
|
||||||
|
refurbishment process.
|
||||||
|
|
||||||
|
### Clock pattern note
|
||||||
|
Images 8/9 (previously marked as "HBM3 restored") are now confirmed identical
|
||||||
|
to images 19/20. Both sample sets show same degraded pattern — same batch.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Baseline matrix (filled where data available)
|
||||||
|
|
||||||
|
| GPU model | Config | Avg clock steady | vs boost | Clock/Watt | Notes |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H100 PCIe HBM2e | designed server | 1580–1620 MHz | 91–92% | ~4.6 MHz/W | 10 samples, healthy |
|
||||||
|
| H100 SXM HBM3 restored | 700W full | 1420–1460 MHz | 72–74% of 1980 | ~2.1 MHz/W | 4 samples confirmed, degraded |
|
||||||
|
| H100 SXM HBM3 healthy | designed | ~1800–1900 MHz est. | ~91–96% est. | ~2.7 MHz/W est. | need real baseline |
|
||||||
|
| H200 NVL | designed | TBD | TBD | TBD | need baseline |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## H100 official spec (from NVIDIA datasheet)
|
||||||
|
|
||||||
|
Source: NVIDIA H100 Tensor Core GPU Datasheet (image 23, 2026-04-06).
|
||||||
|
All TOPS marked * are with structural sparsity enabled. Divide by 2 for dense.
|
||||||
|
|
||||||
|
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H100 80GB PCIe | 756 TFLOPS | 378 TFLOPS | 1,513 TFLOPS | 350W | HBM2e |
|
||||||
|
| H100 NVL 94GB PCIe | 990 TFLOPS | 495 TFLOPS | 1,980 TFLOPS | 400W | HBM3 |
|
||||||
|
| H100 80GB SXM (BQQV) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM3 |
|
||||||
|
| H100 94GB SXM (BUBB) | 989 TFLOPS | 494 TFLOPS | — | 700W | HBM2e |
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- SXM boards do NOT list FP8 peak in this table (field empty)
|
||||||
|
- fp8_e5m2 is unsupported on H100 PCIe HBM2e — confirmed in our tests
|
||||||
|
- Tensor Cores: PCIe = 456, SXM = 528 (16% more on SXM)
|
||||||
|
|
||||||
|
## Observed efficiency (H100 80GB PCIe, throttled server)
|
||||||
|
|
||||||
|
From the report in this session (power+thermal throttle throughout steady):
|
||||||
|
|
||||||
|
| Precision | Measured | Spec (dense) | % of spec |
|
||||||
|
|---|---|---|---|
|
||||||
|
| fp16_tensor | 329 TOPS | 756 TFLOPS | 44% |
|
||||||
|
| fp32_tf32 | 115 TOPS | 378 TFLOPS | 30% |
|
||||||
|
| fp8_e4m3 | 505 TOPS | 1,513 TFLOPS | 33% |
|
||||||
|
|
||||||
|
33–44% of spec is expected given sustained power+thermal throttle (avg clock
|
||||||
|
1384 MHz vs boost 1755 MHz = 79%). The GPU is computing correctly for its
|
||||||
|
actual frequency — the low TOPS comes from throttle, not silicon defect.
|
||||||
|
|
||||||
|
## H200 official spec (from NVIDIA datasheet, image 24, 2026-04-06)
|
||||||
|
|
||||||
|
Format: without sparsity / with sparsity.
|
||||||
|
|
||||||
|
| Model | FP16 Tensor (dense) | TF32 (dense) | FP8 (dense) | TDP | Memory |
|
||||||
|
|---|---|---|---|---|---|
|
||||||
|
| H200 NVL PCIe | 836 TFLOPS | 418 TFLOPS | 1,570 TFLOPS | 600W | HBM3e 141GB |
|
||||||
|
| H200 SXM | 990 TFLOPS | 495 TFLOPS | 1,979 TFLOPS | 700W | HBM3e 141GB |
|
||||||
|
|
||||||
|
## Observed efficiency (H200 NVL PCIe, throttled non-designed server)
|
||||||
|
|
||||||
|
Avg clock 1635 MHz (62% of boost ~2619 MHz). Entire steady in thermal throttle.
|
||||||
|
|
||||||
|
| Precision | Measured | Spec (dense) | % of spec |
|
||||||
|
|---|---|---|---|
|
||||||
|
| fp16_tensor | 340 TOPS | 836 TFLOPS | 41% |
|
||||||
|
| fp32_tf32 | 120 TOPS | 418 TFLOPS | 29% |
|
||||||
|
| fp8_e4m3 | 529 TOPS | 1,570 TFLOPS | 34% |
|
||||||
|
|
||||||
|
Comparable to H100 PCIe efficiency (33–44%) despite different architecture —
|
||||||
|
both are throttle-limited. Confirms that % of spec is not a quality signal,
|
||||||
|
it reflects the thermal environment. tops_per_sm_per_ghz is the right metric.
|
||||||
|
|
||||||
|
## Real-world GEMM efficiency reference (2026-04-06, web research)
|
||||||
|
|
||||||
|
Sources: SemiAnalysis MI300X vs H100 vs H200 training benchmark; cuBLAS optimization
|
||||||
|
worklog (hamzaelshafie.bearblog.dev); Lambda AI H100 performance analysis.
|
||||||
|
|
||||||
|
### What healthy systems actually achieve:
|
||||||
|
- H100 SXM in designed server: **~720 TFLOPS FP16 = ~73% of spec**
|
||||||
|
- cuBLAS large square GEMM (8192³): up to **~83% flop utilization**
|
||||||
|
- H200 NVL PCIe: no public data, extrapolating ~73% → ~610 TFLOPS FP16
|
||||||
|
|
||||||
|
### Our results vs expectation:
|
||||||
|
| GPU | Our FP16 | Expected (73%) | Our % of spec | Gap |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| H100 PCIe HBM2e | 329 TOPS | ~552 TFLOPS | 44% | ~1.7× below |
|
||||||
|
| H200 NVL PCIe | 340 TOPS | ~610 TFLOPS | 41% | ~1.8× below |
|
||||||
|
|
||||||
|
Our results are roughly **half** of what a healthy system achieves even under throttle.
|
||||||
|
This is NOT normal — 33–44% is not the industry baseline.
|
||||||
|
|
||||||
|
### Likely causes of the gap (in order of probability):
|
||||||
|
1. **Thermal throttle** — confirmed, sw_thermal covers entire steady window
|
||||||
|
2. **Power limit below TDP** — GPU may be software-limited below 350W/600W.
|
||||||
|
Previous user may have set a lower limit via nvidia-smi -pl and it was not
|
||||||
|
reset. Our normalization sets clock locks but does NOT reset power limit.
|
||||||
|
Key check: `nvidia-smi -q | grep "Power Limit"` — default vs enforced.
|
||||||
|
3. **Matrix size** — ruled out. bee-gpu-burn uses 4096×4096×4096 for fp16,
|
||||||
|
8192×8192×4096 for fp8. These are large enough for peak tensor utilization.
|
||||||
|
|
||||||
|
### Power limit gap analysis (H100 PCIe):
|
||||||
|
- Avg clock 1384 MHz = 79% of boost 1755 MHz
|
||||||
|
- Expected TOPS at 79% clock: 756 × 0.79 ≈ 597 TFLOPS
|
||||||
|
- Actually measured: 329 TOPS = 55% of that estimate
|
||||||
|
- Remaining gap after accounting for clock throttle: ~45%
|
||||||
|
- Most likely explanation: enforced power limit < 350W TDP, further reducing
|
||||||
|
sustainable clock beyond what sw_thermal alone would cause.
|
||||||
|
|
||||||
|
### Action item:
|
||||||
|
Add `power.limit` (enforced) AND `power.default_limit` to queryBenchmarkGPUInfo
|
||||||
|
so result.json shows if the card was pre-configured with a non-default limit.
|
||||||
|
If enforced < default × 0.95 → add finding "GPU power limit is below default TDP".
|
||||||
|
|
||||||
|
### CPU/RAM impact on GPU FLOPS:
|
||||||
|
None. Pure on-GPU GEMM is fully compute-bound once data is in VRAM.
|
||||||
|
CPU core count and host RAM are irrelevant.
|
||||||
|
|
||||||
|
## Compute efficiency metric (proposed, no hardcode)
|
||||||
|
|
||||||
|
Instead of comparing TOPS to a hardcoded spec, compute:
|
||||||
|
tops_per_sm_per_ghz = measured_tops / (sm_count × avg_clock_ghz)
|
||||||
|
|
||||||
|
This is model-agnostic. A GPU computing correctly at its actual frequency
|
||||||
|
will show a consistent tops_per_sm_per_ghz regardless of throttle level.
|
||||||
|
A GPU with degraded silicon will show low tops_per_sm_per_ghz even at
|
||||||
|
normal clocks.
|
||||||
|
|
||||||
|
SM count is queryable: nvidia-smi --query-gpu=attribute.multiprocessor_count
|
||||||
|
(needs to be added to queryBenchmarkGPUInfo).
|
||||||
|
|
||||||
|
Reference values to establish after baseline runs:
|
||||||
|
- H100 PCIe fp16_tensor: TBD tops/SM/GHz
|
||||||
|
- H100 SXM fp16_tensor: TBD tops/SM/GHz
|
||||||
|
|
||||||
|
## Proposed threshold changes (pending more data)
|
||||||
|
|
||||||
|
1. **`low_sm_clock_vs_target`**: raise threshold from 90% to 85% based on observed
|
||||||
|
91–92% on healthy HBM2e. Or remove entirely — sw_power/sw_thermal already
|
||||||
|
capture the root cause.
|
||||||
|
|
||||||
|
2. **`variance_too_high`** (StabilityScore < 85): healthy HBM2e WILL oscillate
|
||||||
|
under power cap. Consider suppressing this flag when power is flat and usage
|
||||||
|
is 100% (oscillation is expected). Or lower threshold to 70.
|
||||||
|
|
||||||
|
3. **New signal: MHz/Watt efficiency**: if base_graphics_clock_mhz is available,
|
||||||
|
ratio avg_clock / power_w could identify degraded silicon (HBM3 restored S1
|
||||||
|
would have been caught by this).
|
||||||
|
|
||||||
|
Decision deferred until baseline on SXM designed servers collected.
|
||||||
@@ -606,6 +606,20 @@ struct prepared_profile {
|
|||||||
};
|
};
|
||||||
|
|
||||||
static const struct profile_desc k_profiles[] = {
|
static const struct profile_desc k_profiles[] = {
|
||||||
|
{
|
||||||
|
"fp64",
|
||||||
|
"fp64",
|
||||||
|
80,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
0,
|
||||||
|
8,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUDA_R_64F,
|
||||||
|
CUBLAS_COMPUTE_64F,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"fp32_tf32",
|
"fp32_tf32",
|
||||||
"fp32",
|
"fp32",
|
||||||
|
|||||||
Reference in New Issue
Block a user