Enhance benchmark: server power via IPMI, efficiency metrics, FP64, power limit check

- Sample server power (IPMI dcmi) during baseline+steady phases in parallel;
  compute delta vs GPU-reported sum; flag ratio < 0.75 as unreliable reporting
- Collect base_graphics_clock_mhz, multiprocessor_count, default_power_limit_w
  from nvidia-smi alongside existing GPU info
- Add tops_per_sm_per_ghz efficiency metric (model-agnostic silicon quality signal)
- Flag when enforced power limit is below default TDP by >5%
- Add fp64 profile to bee-gpu-burn worker (CUDA_R_64F, CUBLAS_COMPUTE_64F, min cc 8.0)
- Improve Executive Summary: overall pass count, FAILED GPU finding
- Throttle counters now shown as % of steady window instead of raw microseconds
- bible-local: clock calibration research, H100/H200 spec, real-world GEMM baselines

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-06 22:26:52 +03:00
parent f5d175f488
commit d973231f37
5 changed files with 551 additions and 18 deletions

View File

@@ -27,14 +27,17 @@ type benchmarkProfileSpec struct {
}
type benchmarkGPUInfo struct {
Index int
UUID string
Name string
BusID string
VBIOS string
PowerLimitW float64
MaxGraphicsClockMHz float64
MaxMemoryClockMHz float64
Index int
UUID string
Name string
BusID string
VBIOS string
PowerLimitW float64
DefaultPowerLimitW float64
MaxGraphicsClockMHz float64
MaxMemoryClockMHz float64
BaseGraphicsClockMHz float64
MultiprocessorCount int
}
type benchmarkBurnProfile struct {
@@ -111,6 +114,11 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
// Server power characterization state — populated during per-GPU phases.
var serverIdleW, serverLoadedWSum float64
var serverIdleOK, serverLoadedOK bool
var serverLoadedSamples int
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
if infoErr != nil {
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
@@ -146,7 +154,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.BusID = info.BusID
gpuResult.VBIOS = info.VBIOS
gpuResult.PowerLimitW = info.PowerLimitW
gpuResult.MultiprocessorCount = info.MultiprocessorCount
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
}
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
@@ -161,6 +172,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
// Sample server idle power once (first GPU only — server state is global).
if !serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
}
}
warmupCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(spec.WarmupSec),
@@ -184,7 +204,50 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
"--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
// Sample server power via IPMI in parallel with the steady phase.
// We collect readings every 5s and average them.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// First sample after a short warmup delay.
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
serverLoadedWSum += loadedW
serverLoadedSamples++
serverLoadedOK = true
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
}
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
afterThrottle, _ := queryThrottleCounters(idx)
if steadyErr != nil {
@@ -232,6 +295,17 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
}
}
// Compute server power characterization from accumulated IPMI samples.
var gpuReportedSumW float64
for _, gpu := range result.GPUs {
gpuReportedSumW += gpu.Steady.AvgPowerW
}
var serverLoadedW float64
if serverLoadedSamples > 0 {
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
}
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
result.Findings = buildBenchmarkFindings(result)
result.OverallStatus = benchmarkOverallStatus(result)
@@ -290,7 +364,7 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
args := []string{
"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory",
"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
"--format=csv,noheader,nounits",
}
if len(gpuIndices) > 0 {
@@ -311,14 +385,14 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
for _, row := range rows {
if len(row) < 8 {
if len(row) < 9 {
continue
}
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
if err != nil {
continue
}
infoByIndex[idx] = benchmarkGPUInfo{
info := benchmarkGPUInfo{
Index: idx,
UUID: strings.TrimSpace(row[1]),
Name: strings.TrimSpace(row[2]),
@@ -328,6 +402,16 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
}
if len(row) >= 9 {
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
}
if len(row) >= 10 {
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
}
if len(row) >= 11 {
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
}
infoByIndex[idx] = info
}
return infoByIndex, nil
}
@@ -551,6 +635,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
}
category := "other"
switch {
case strings.HasPrefix(name, "fp64"):
category = "fp64"
case strings.HasPrefix(name, "fp32"):
category = "fp32_tf32"
case strings.HasPrefix(name, "fp16"):
@@ -627,6 +713,9 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
score.CompositeScore = compositeBenchmarkScore(score)
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
}
return score
}
@@ -798,10 +887,30 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
var findings []string
passed := 0
for _, gpu := range result.GPUs {
if gpu.Status == "OK" {
passed++
}
}
total := len(result.GPUs)
if total > 0 {
if passed == total {
findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
} else {
findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
}
}
if result.Normalization.Status != "full" {
findings = append(findings, "Environment normalization was partial; compare results with caution.")
}
for _, gpu := range result.GPUs {
if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
continue
}
if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
continue
@@ -825,10 +934,24 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
if gpu.Backend == "driver-ptx" {
findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
}
if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
findings = append(findings, fmt.Sprintf(
"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
))
}
}
if result.Interconnect != nil && result.Interconnect.Supported {
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
}
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
if sp.ReportingRatio < 0.75 {
findings = append(findings, fmt.Sprintf(
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
))
}
}
return dedupeStrings(findings)
}
@@ -1007,3 +1130,76 @@ func maxInt(a, b int) int {
}
return b
}
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
func queryIPMIServerPowerW() (float64, error) {
out, err := satExecCommand("ipmitool", "dcmi", "power", "reading").Output()
if err != nil {
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
}
for _, line := range strings.Split(string(out), "\n") {
if strings.Contains(line, "Current Power") {
parts := strings.SplitN(line, ":", 2)
if len(parts) == 2 {
val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts"))
val = strings.TrimSpace(val)
w, err := strconv.ParseFloat(val, 64)
if err == nil && w > 0 {
return w, nil
}
}
}
}
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
}
// sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for
// durationSec seconds. Returns the mean of all successful samples.
// Returns 0, false if IPMI is unavailable.
func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
if durationSec <= 0 {
return 0, false
}
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
var samples []float64
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
if time.Now().After(deadline) {
break
}
select {
case <-ctx.Done():
break
case <-time.After(2 * time.Second):
}
}
if len(samples) == 0 {
return 0, false
}
var sum float64
for _, w := range samples {
sum += w
}
return sum / float64(len(samples)), true
}
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
// IPMI samples plus the GPU-reported average power during steady state.
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
sp := &BenchmarkServerPower{Available: ipmiAvailable}
if !ipmiAvailable {
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
return sp
}
sp.IdleW = idleW
sp.LoadedW = loadedW
sp.DeltaW = loadedW - idleW
sp.GPUReportedSumW = gpuReportedSumW
if gpuReportedSumW > 0 && sp.DeltaW > 0 {
sp.ReportingRatio = sp.DeltaW / gpuReportedSumW
}
return sp
}