Enhance benchmark: server power via IPMI, efficiency metrics, FP64, power limit check
- Sample server power (IPMI dcmi) during baseline+steady phases in parallel; compute delta vs GPU-reported sum; flag ratio < 0.75 as unreliable reporting - Collect base_graphics_clock_mhz, multiprocessor_count, default_power_limit_w from nvidia-smi alongside existing GPU info - Add tops_per_sm_per_ghz efficiency metric (model-agnostic silicon quality signal) - Flag when enforced power limit is below default TDP by >5% - Add fp64 profile to bee-gpu-burn worker (CUDA_R_64F, CUBLAS_COMPUTE_64F, min cc 8.0) - Improve Executive Summary: overall pass count, FAILED GPU finding - Throttle counters now shown as % of steady window instead of raw microseconds - bible-local: clock calibration research, H100/H200 spec, real-world GEMM baselines Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -27,14 +27,17 @@ type benchmarkProfileSpec struct {
|
||||
}
|
||||
|
||||
type benchmarkGPUInfo struct {
|
||||
Index int
|
||||
UUID string
|
||||
Name string
|
||||
BusID string
|
||||
VBIOS string
|
||||
PowerLimitW float64
|
||||
MaxGraphicsClockMHz float64
|
||||
MaxMemoryClockMHz float64
|
||||
Index int
|
||||
UUID string
|
||||
Name string
|
||||
BusID string
|
||||
VBIOS string
|
||||
PowerLimitW float64
|
||||
DefaultPowerLimitW float64
|
||||
MaxGraphicsClockMHz float64
|
||||
MaxMemoryClockMHz float64
|
||||
BaseGraphicsClockMHz float64
|
||||
MultiprocessorCount int
|
||||
}
|
||||
|
||||
type benchmarkBurnProfile struct {
|
||||
@@ -111,6 +114,11 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
|
||||
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
|
||||
|
||||
// Server power characterization state — populated during per-GPU phases.
|
||||
var serverIdleW, serverLoadedWSum float64
|
||||
var serverIdleOK, serverLoadedOK bool
|
||||
var serverLoadedSamples int
|
||||
|
||||
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||
if infoErr != nil {
|
||||
result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
|
||||
@@ -146,7 +154,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
gpuResult.BusID = info.BusID
|
||||
gpuResult.VBIOS = info.VBIOS
|
||||
gpuResult.PowerLimitW = info.PowerLimitW
|
||||
gpuResult.MultiprocessorCount = info.MultiprocessorCount
|
||||
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
|
||||
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
|
||||
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||
}
|
||||
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||
@@ -161,6 +172,15 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
|
||||
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
|
||||
|
||||
// Sample server idle power once (first GPU only — server state is global).
|
||||
if !serverIdleOK {
|
||||
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
|
||||
serverIdleW = w
|
||||
serverIdleOK = true
|
||||
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||
}
|
||||
}
|
||||
|
||||
warmupCmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(spec.WarmupSec),
|
||||
@@ -184,7 +204,50 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
"--devices", strconv.Itoa(idx),
|
||||
}
|
||||
logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
|
||||
|
||||
// Sample server power via IPMI in parallel with the steady phase.
|
||||
// We collect readings every 5s and average them.
|
||||
ipmiStopCh := make(chan struct{})
|
||||
ipmiResultCh := make(chan float64, 1)
|
||||
go func() {
|
||||
defer close(ipmiResultCh)
|
||||
var samples []float64
|
||||
ticker := time.NewTicker(5 * time.Second)
|
||||
defer ticker.Stop()
|
||||
// First sample after a short warmup delay.
|
||||
select {
|
||||
case <-ipmiStopCh:
|
||||
return
|
||||
case <-time.After(15 * time.Second):
|
||||
}
|
||||
for {
|
||||
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||
samples = append(samples, w)
|
||||
}
|
||||
select {
|
||||
case <-ipmiStopCh:
|
||||
if len(samples) > 0 {
|
||||
var sum float64
|
||||
for _, w := range samples {
|
||||
sum += w
|
||||
}
|
||||
ipmiResultCh <- sum / float64(len(samples))
|
||||
}
|
||||
return
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
|
||||
close(ipmiStopCh)
|
||||
if loadedW, ok := <-ipmiResultCh; ok {
|
||||
serverLoadedWSum += loadedW
|
||||
serverLoadedSamples++
|
||||
serverLoadedOK = true
|
||||
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
|
||||
}
|
||||
|
||||
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
|
||||
afterThrottle, _ := queryThrottleCounters(idx)
|
||||
if steadyErr != nil {
|
||||
@@ -232,6 +295,17 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
}
|
||||
}
|
||||
|
||||
// Compute server power characterization from accumulated IPMI samples.
|
||||
var gpuReportedSumW float64
|
||||
for _, gpu := range result.GPUs {
|
||||
gpuReportedSumW += gpu.Steady.AvgPowerW
|
||||
}
|
||||
var serverLoadedW float64
|
||||
if serverLoadedSamples > 0 {
|
||||
serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
|
||||
}
|
||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
|
||||
|
||||
result.Findings = buildBenchmarkFindings(result)
|
||||
result.OverallStatus = benchmarkOverallStatus(result)
|
||||
|
||||
@@ -290,7 +364,7 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
||||
|
||||
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
||||
args := []string{
|
||||
"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory",
|
||||
"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
@@ -311,14 +385,14 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
||||
|
||||
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
||||
for _, row := range rows {
|
||||
if len(row) < 8 {
|
||||
if len(row) < 9 {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
infoByIndex[idx] = benchmarkGPUInfo{
|
||||
info := benchmarkGPUInfo{
|
||||
Index: idx,
|
||||
UUID: strings.TrimSpace(row[1]),
|
||||
Name: strings.TrimSpace(row[2]),
|
||||
@@ -328,6 +402,16 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
||||
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
|
||||
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
|
||||
}
|
||||
if len(row) >= 9 {
|
||||
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
||||
}
|
||||
if len(row) >= 10 {
|
||||
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
||||
}
|
||||
if len(row) >= 11 {
|
||||
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
||||
}
|
||||
infoByIndex[idx] = info
|
||||
}
|
||||
return infoByIndex, nil
|
||||
}
|
||||
@@ -551,6 +635,8 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
|
||||
}
|
||||
category := "other"
|
||||
switch {
|
||||
case strings.HasPrefix(name, "fp64"):
|
||||
category = "fp64"
|
||||
case strings.HasPrefix(name, "fp32"):
|
||||
category = "fp32_tf32"
|
||||
case strings.HasPrefix(name, "fp16"):
|
||||
@@ -627,6 +713,9 @@ func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
|
||||
score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
|
||||
score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
|
||||
score.CompositeScore = compositeBenchmarkScore(score)
|
||||
if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
|
||||
score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
|
||||
}
|
||||
return score
|
||||
}
|
||||
|
||||
@@ -798,10 +887,30 @@ func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
|
||||
|
||||
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
||||
var findings []string
|
||||
|
||||
passed := 0
|
||||
for _, gpu := range result.GPUs {
|
||||
if gpu.Status == "OK" {
|
||||
passed++
|
||||
}
|
||||
}
|
||||
total := len(result.GPUs)
|
||||
if total > 0 {
|
||||
if passed == total {
|
||||
findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
|
||||
} else {
|
||||
findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
|
||||
}
|
||||
}
|
||||
|
||||
if result.Normalization.Status != "full" {
|
||||
findings = append(findings, "Environment normalization was partial; compare results with caution.")
|
||||
}
|
||||
for _, gpu := range result.GPUs {
|
||||
if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
|
||||
findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
|
||||
continue
|
||||
}
|
||||
if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
|
||||
findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
|
||||
continue
|
||||
@@ -825,10 +934,24 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
||||
if gpu.Backend == "driver-ptx" {
|
||||
findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
|
||||
}
|
||||
if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
|
||||
gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
|
||||
))
|
||||
}
|
||||
}
|
||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
||||
}
|
||||
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
||||
if sp.ReportingRatio < 0.75 {
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption.",
|
||||
sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
|
||||
))
|
||||
}
|
||||
}
|
||||
return dedupeStrings(findings)
|
||||
}
|
||||
|
||||
@@ -1007,3 +1130,76 @@ func maxInt(a, b int) int {
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
|
||||
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
|
||||
func queryIPMIServerPowerW() (float64, error) {
|
||||
out, err := satExecCommand("ipmitool", "dcmi", "power", "reading").Output()
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
|
||||
}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if strings.Contains(line, "Current Power") {
|
||||
parts := strings.SplitN(line, ":", 2)
|
||||
if len(parts) == 2 {
|
||||
val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts"))
|
||||
val = strings.TrimSpace(val)
|
||||
w, err := strconv.ParseFloat(val, 64)
|
||||
if err == nil && w > 0 {
|
||||
return w, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
|
||||
}
|
||||
|
||||
// sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for
|
||||
// durationSec seconds. Returns the mean of all successful samples.
|
||||
// Returns 0, false if IPMI is unavailable.
|
||||
func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) {
|
||||
if durationSec <= 0 {
|
||||
return 0, false
|
||||
}
|
||||
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
||||
var samples []float64
|
||||
for {
|
||||
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||
samples = append(samples, w)
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
break
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
break
|
||||
case <-time.After(2 * time.Second):
|
||||
}
|
||||
}
|
||||
if len(samples) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
var sum float64
|
||||
for _, w := range samples {
|
||||
sum += w
|
||||
}
|
||||
return sum / float64(len(samples)), true
|
||||
}
|
||||
|
||||
// characterizeServerPower computes BenchmarkServerPower from idle and loaded
|
||||
// IPMI samples plus the GPU-reported average power during steady state.
|
||||
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower {
|
||||
sp := &BenchmarkServerPower{Available: ipmiAvailable}
|
||||
if !ipmiAvailable {
|
||||
sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped")
|
||||
return sp
|
||||
}
|
||||
sp.IdleW = idleW
|
||||
sp.LoadedW = loadedW
|
||||
sp.DeltaW = loadedW - idleW
|
||||
sp.GPUReportedSumW = gpuReportedSumW
|
||||
if gpuReportedSumW > 0 && sp.DeltaW > 0 {
|
||||
sp.ReportingRatio = sp.DeltaW / gpuReportedSumW
|
||||
}
|
||||
return sp
|
||||
}
|
||||
|
||||
@@ -56,6 +56,9 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
|
||||
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
|
||||
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
|
||||
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
|
||||
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
|
||||
@@ -77,13 +80,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
}
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, " Throttle counters (us): sw_power=%d sw_thermal=%d sync_boost=%d hw_thermal=%d hw_power_brake=%d\n",
|
||||
gpu.Throttle.SWPowerCapUS,
|
||||
gpu.Throttle.SWThermalSlowdownUS,
|
||||
gpu.Throttle.SyncBoostUS,
|
||||
gpu.Throttle.HWThermalSlowdownUS,
|
||||
gpu.Throttle.HWPowerBrakeSlowdownUS,
|
||||
)
|
||||
fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
|
||||
if len(gpu.Notes) > 0 {
|
||||
fmt.Fprintf(&b, " Notes:\n")
|
||||
for _, note := range gpu.Notes {
|
||||
@@ -121,6 +118,26 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
}
|
||||
}
|
||||
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
fmt.Fprintf(&b, "Server Power (IPMI)\n")
|
||||
fmt.Fprintf(&b, "-------------------\n")
|
||||
if !sp.Available {
|
||||
fmt.Fprintf(&b, "Unavailable\n")
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW)
|
||||
if sp.ReportingRatio > 0 {
|
||||
fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
|
||||
}
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, " Note: %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "Methodology\n")
|
||||
fmt.Fprintf(&b, "-----------\n")
|
||||
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
||||
@@ -175,6 +192,42 @@ func stripANSIEscapeSequences(raw string) string {
|
||||
return ansiEscapePattern.ReplaceAllString(raw, "")
|
||||
}
|
||||
|
||||
// formatThrottleLine renders throttle counters as human-readable percentages of
|
||||
// the steady-state window. Only non-zero counters are shown. When the steady
|
||||
// duration is unknown (0), raw seconds are shown instead.
|
||||
func formatThrottleLine(t BenchmarkThrottleCounters, steadyDurationSec float64) string {
|
||||
type counter struct {
|
||||
label string
|
||||
us uint64
|
||||
}
|
||||
counters := []counter{
|
||||
{"sw_power", t.SWPowerCapUS},
|
||||
{"sw_thermal", t.SWThermalSlowdownUS},
|
||||
{"sync_boost", t.SyncBoostUS},
|
||||
{"hw_thermal", t.HWThermalSlowdownUS},
|
||||
{"hw_power_brake", t.HWPowerBrakeSlowdownUS},
|
||||
}
|
||||
var parts []string
|
||||
for _, c := range counters {
|
||||
if c.us == 0 {
|
||||
continue
|
||||
}
|
||||
sec := float64(c.us) / 1e6
|
||||
if steadyDurationSec > 0 {
|
||||
pct := sec / steadyDurationSec * 100
|
||||
parts = append(parts, fmt.Sprintf("%s=%.1f%% (%.0fs)", c.label, pct, sec))
|
||||
} else if sec < 1 {
|
||||
parts = append(parts, fmt.Sprintf("%s=%.0fms", c.label, sec*1000))
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("%s=%.1fs", c.label, sec))
|
||||
}
|
||||
}
|
||||
if len(parts) == 0 {
|
||||
return "none"
|
||||
}
|
||||
return strings.Join(parts, " ")
|
||||
}
|
||||
|
||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||
|
||||
@@ -28,6 +28,7 @@ type NvidiaBenchmarkResult struct {
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkNormalization struct {
|
||||
@@ -56,7 +57,10 @@ type BenchmarkGPUResult struct {
|
||||
Backend string `json:"backend,omitempty"`
|
||||
Status string `json:"status"`
|
||||
PowerLimitW float64 `json:"power_limit_w,omitempty"`
|
||||
MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
|
||||
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
|
||||
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
|
||||
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
|
||||
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
|
||||
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
|
||||
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
|
||||
@@ -117,6 +121,24 @@ type BenchmarkScorecard struct {
|
||||
StabilityScore float64 `json:"stability_score"`
|
||||
InterconnectScore float64 `json:"interconnect_score"`
|
||||
CompositeScore float64 `json:"composite_score"`
|
||||
// TOPSPerSMPerGHz is compute efficiency independent of clock speed and SM count.
|
||||
// Comparable across throttle levels and GPU generations. Low value at normal
|
||||
// clocks indicates silicon degradation.
|
||||
TOPSPerSMPerGHz float64 `json:"tops_per_sm_per_ghz,omitempty"`
|
||||
}
|
||||
|
||||
// BenchmarkServerPower captures server-side power via IPMI alongside GPU-reported
|
||||
// power. The reporting_ratio (delta / gpu_reported_sum) near 1.0 means GPU power
|
||||
// telemetry is accurate; a ratio well below 1.0 (e.g. 0.5) means the GPU is
|
||||
// over-reporting its power consumption.
|
||||
type BenchmarkServerPower struct {
|
||||
Available bool `json:"available"`
|
||||
IdleW float64 `json:"idle_w,omitempty"`
|
||||
LoadedW float64 `json:"loaded_w,omitempty"`
|
||||
DeltaW float64 `json:"delta_w,omitempty"`
|
||||
GPUReportedSumW float64 `json:"gpu_reported_sum_w,omitempty"`
|
||||
ReportingRatio float64 `json:"reporting_ratio,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
type BenchmarkInterconnectResult struct {
|
||||
|
||||
Reference in New Issue
Block a user