Record host CPU/RAM config in benchmark results; check CPU load

- BenchmarkHostConfig captures CPU model, sockets, cores, threads, and
  total RAM from /proc/cpuinfo and /proc/meminfo at benchmark start.
- BenchmarkCPULoad samples host CPU utilisation every 10 s throughout
  the GPU steady-state phase (sequential and parallel paths).
- Summarises avg/max/p95 and classifies status as ok / high / unstable.
- Adds a finding when CPU load is elevated (avg >20% or max >40%) or
  erratic (stddev >12%), with a plain-English description in the report.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-12 20:02:04 +03:00
parent 098e19f760
commit 9e3dcf9b4d
2 changed files with 219 additions and 0 deletions

View File

@@ -109,6 +109,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
BenchmarkProfile: spec.Name,
ParallelGPUs: opts.ParallelGPUs,
SelectedGPUIndices: append([]int(nil), selected...),
HostConfig: readBenchmarkHostConfig(),
Normalization: BenchmarkNormalization{
Status: "full",
},
@@ -152,6 +153,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
}
}()
// Start background CPU load sampler — samples every 10s during GPU phases.
cpuStopCh := make(chan struct{})
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
if opts.ParallelGPUs {
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
} else {
@@ -310,6 +315,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
}
}
// Stop CPU load sampler and attach results.
close(cpuStopCh)
if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
result.CPULoad = summarizeCPULoad(cpuSamples)
if result.CPULoad != nil && result.CPULoad.Status != "ok" {
logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s",
result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
}
}
// Compute server power characterization from accumulated IPMI samples.
var gpuReportedSumW float64
for _, gpu := range result.GPUs {
@@ -1079,6 +1094,20 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
if result.Interconnect != nil && result.Interconnect.Supported {
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
}
if cl := result.CPULoad; cl != nil {
switch cl.Status {
case "high":
findings = append(findings, fmt.Sprintf(
"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
cl.AvgPct, cl.MaxPct,
))
case "unstable":
findings = append(findings, fmt.Sprintf(
"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
cl.AvgPct, cl.P95Pct,
))
}
}
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
if sp.ReportingRatio < 0.75 {
findings = append(findings, fmt.Sprintf(
@@ -1571,3 +1600,168 @@ func runNvidiaBenchmarkParallel(
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
}
}
// readBenchmarkHostConfig reads static CPU and memory configuration from
// /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable.
func readBenchmarkHostConfig() *BenchmarkHostConfig {
cfg := &BenchmarkHostConfig{}
populated := false
// Parse /proc/cpuinfo for CPU model, sockets, cores, threads.
if data, err := os.ReadFile("/proc/cpuinfo"); err == nil {
socketIDs := map[string]struct{}{}
coresPerSocket := map[string]int{}
var modelName string
threads := 0
for _, line := range strings.Split(string(data), "\n") {
kv := strings.SplitN(line, ":", 2)
if len(kv) != 2 {
continue
}
key := strings.TrimSpace(kv[0])
val := strings.TrimSpace(kv[1])
switch key {
case "processor":
threads++
case "model name":
if modelName == "" {
modelName = val
}
case "physical id":
socketIDs[val] = struct{}{}
case "cpu cores":
// Overwrite per-socket core count (last wins per socket, but all
// entries for the same socket report the same value).
if physLine := ""; physLine == "" {
// We accumulate below by treating cpu cores as a per-thread
// field; sum by socket requires a two-pass approach. Use the
// simpler approximation: totalCores = threads / (threads per core).
_ = val
}
}
}
// Second pass: per-socket core count.
var curSocket string
for _, line := range strings.Split(string(data), "\n") {
kv := strings.SplitN(line, ":", 2)
if len(kv) != 2 {
continue
}
key := strings.TrimSpace(kv[0])
val := strings.TrimSpace(kv[1])
switch key {
case "physical id":
curSocket = val
case "cpu cores":
if curSocket != "" {
if _, seen := coresPerSocket[curSocket]; !seen {
v, _ := strconv.Atoi(val)
coresPerSocket[curSocket] = v
}
}
}
}
totalCores := 0
for _, c := range coresPerSocket {
totalCores += c
}
cfg.CPUModel = modelName
cfg.CPUSockets = len(socketIDs)
if cfg.CPUSockets == 0 && threads > 0 {
cfg.CPUSockets = 1
}
cfg.CPUCores = totalCores
cfg.CPUThreads = threads
if modelName != "" || threads > 0 {
populated = true
}
}
// Parse /proc/meminfo for total physical RAM.
if data, err := os.ReadFile("/proc/meminfo"); err == nil {
for _, line := range strings.Split(string(data), "\n") {
if strings.HasPrefix(line, "MemTotal:") {
fields := strings.Fields(line)
if len(fields) >= 2 {
kb, _ := strconv.ParseUint(fields[1], 10, 64)
cfg.MemTotalGiB = float64(kb) / (1024 * 1024)
populated = true
}
break
}
}
}
if !populated {
return nil
}
return cfg
}
// startCPULoadSampler starts a goroutine that samples host CPU load every
// intervalSec seconds until stopCh is closed, then sends the collected
// samples on the returned channel.
func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
ch := make(chan []float64, 1)
go func() {
var samples []float64
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
defer ticker.Stop()
for {
select {
case <-stopCh:
ch <- samples
return
case <-ticker.C:
if pct := sampleCPULoadPct(); pct > 0 {
samples = append(samples, pct)
}
}
}
}()
return ch
}
// summarizeCPULoad computes stats over sampled CPU load values and assigns
// a health status.
func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
if len(samples) == 0 {
return nil
}
sorted := append([]float64(nil), samples...)
sort.Float64s(sorted)
var sum float64
for _, v := range sorted {
sum += v
}
avg := sum / float64(len(sorted))
p95 := sorted[int(float64(len(sorted))*0.95)]
max := sorted[len(sorted)-1]
cl := &BenchmarkCPULoad{
AvgPct: math.Round(avg*10) / 10,
MaxPct: math.Round(max*10) / 10,
P95Pct: math.Round(p95*10) / 10,
Samples: len(sorted),
}
// Compute standard deviation to detect instability.
var variance float64
for _, v := range sorted {
d := v - avg
variance += d * d
}
stdDev := math.Sqrt(variance / float64(len(sorted)))
switch {
case avg > 20 || max > 40:
cl.Status = "high"
cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, max)
case stdDev > 12:
cl.Status = "unstable"
cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev)
default:
cl.Status = "ok"
}
return cl
}

View File

@@ -2,6 +2,29 @@ package platform
import "time"
// BenchmarkHostConfig holds static CPU and memory configuration captured at
// benchmark start. Useful for correlating results across runs on different hardware.
type BenchmarkHostConfig struct {
CPUModel string `json:"cpu_model,omitempty"`
CPUSockets int `json:"cpu_sockets,omitempty"`
CPUCores int `json:"cpu_cores,omitempty"`
CPUThreads int `json:"cpu_threads,omitempty"`
MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
}
// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
// steady-state phase. High or unstable CPU load during a GPU benchmark may
// indicate a competing workload or a CPU-bound driver bottleneck.
type BenchmarkCPULoad struct {
AvgPct float64 `json:"avg_pct"`
MaxPct float64 `json:"max_pct"`
P95Pct float64 `json:"p95_pct"`
Samples int `json:"samples"`
// Status is "ok", "high", or "unstable".
Status string `json:"status"`
Note string `json:"note,omitempty"`
}
const (
NvidiaBenchmarkProfileStandard = "standard"
NvidiaBenchmarkProfileStability = "stability"
@@ -30,6 +53,8 @@ type NvidiaBenchmarkResult struct {
Findings []string `json:"findings,omitempty"`
Warnings []string `json:"warnings,omitempty"`
Normalization BenchmarkNormalization `json:"normalization"`
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
GPUs []BenchmarkGPUResult `json:"gpus"`
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`