Enhance benchmark: server power via IPMI, efficiency metrics, FP64, power limit check
- Sample server power (IPMI dcmi) during baseline+steady phases in parallel; compute delta vs GPU-reported sum; flag ratio < 0.75 as unreliable reporting - Collect base_graphics_clock_mhz, multiprocessor_count, default_power_limit_w from nvidia-smi alongside existing GPU info - Add tops_per_sm_per_ghz efficiency metric (model-agnostic silicon quality signal) - Flag when enforced power limit is below default TDP by >5% - Add fp64 profile to bee-gpu-burn worker (CUDA_R_64F, CUBLAS_COMPUTE_64F, min cc 8.0) - Improve Executive Summary: overall pass count, FAILED GPU finding - Throttle counters now shown as % of steady window instead of raw microseconds - bible-local: clock calibration research, H100/H200 spec, real-world GEMM baselines Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -56,6 +56,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
|
||||
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
|
||||
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
|
||||
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
|
||||
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
|
||||
@@ -77,13 +80,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
}
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, " Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
|
||||
gpu.Throttle.SWPowerCapUS,
|
||||
gpu.Throttle.SWThermalSlowdownUS,
|
||||
gpu.Throttle.SyncBoostUS,
|
||||
gpu.Throttle.HWThermalSlowdownUS,
|
||||
gpu.Throttle.HWPowerBrakeSlowdownUS,
|
||||
)
|
||||
fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
|
||||
if len(gpu.Notes) > 0 {
|
||||
fmt.Fprintf(&b, " Notes:\n")
|
||||
for _, note := range gpu.Notes {
|
||||
@@ -121,6 +118,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
}
|
||||
}
|
||||
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
fmt.Fprintf(&b, "Server Power (IPMI)\n")
|
||||
fmt.Fprintf(&b, "-------------------\n")
|
||||
if !sp.Available {
|
||||
fmt.Fprintf(&b, "Unavailable\n")
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW)
|
||||
if sp.ReportingRatio > 0 {
|
||||
fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
|
||||
}
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, " Note: %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "Methodology\n")
|
||||
fmt.Fprintf(&b, "-----------\n")
|
||||
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
||||
@@ -175,6 +192,42 @@ func stripANSIEscapeSequences(raw string) string {
|
||||
return ansiEscapePattern.ReplaceAllString(raw, "")
|
||||
}
|
||||
|
||||
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||
// duration is unknown (0), raw seconds are shown instead.
|
||||
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||||
type counter struct {
|
||||
label string
|
||||
us uint64
|
||||
}
|
||||
counters := []counter{
|
||||
{"sw_power", t.SWPowerCapUS},
|
||||
{"sw_thermal", t.SWThermalSlowdownUS},
|
||||
{"sync_boost", t.SyncBoostUS},
|
||||
{"hw_thermal", t.HWThermalSlowdownUS},
|
||||
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||||
}
|
||||
var parts []string
|
||||
for _, c := range counters {
|
||||
if c.us == 0 {
|
||||
continue
|
||||
}
|
||||
sec := float64(c.us) / 1e6
|
||||
if steadyDurationSec > 0 {
|
||||
pct := sec / steadyDurationSec * 100
|
||||
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||||
} else if sec < 1 {
|
||||
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||||
}
|
||||
}
|
||||
if len(parts) == 0 {
|
||||
return "none"
|
||||
}
|
||||
return strings.Join(parts, " ")
|
||||
}
|
||||
|
||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||
|
||||
Reference in New Issue
Block a user