Record host CPU/RAM config in benchmark results; check CPU load
- BenchmarkHostConfig captures CPU model, sockets, cores, threads, and total RAM from /proc/cpuinfo and /proc/meminfo at benchmark start.
- BenchmarkCPULoad samples host CPU utilisation every 10 s throughout the GPU steady-state phase (sequential and parallel paths).
- Summarises avg/max/p95 and classifies status as ok / high / unstable.
- Adds a finding when CPU load is elevated (avg >20% or max >40%) or erratic (stddev >12%), with a plain-English description in the report.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -109,6 +109,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
BenchmarkProfile: spec.Name,
|
||||
ParallelGPUs: opts.ParallelGPUs,
|
||||
SelectedGPUIndices: append([]int(nil), selected...),
|
||||
HostConfig: readBenchmarkHostConfig(),
|
||||
Normalization: BenchmarkNormalization{
|
||||
Status: "full",
|
||||
},
|
||||
@@ -152,6 +153,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
}
|
||||
}()
|
||||
|
||||
// Start background CPU load sampler — samples every 10s during GPU phases.
|
||||
cpuStopCh := make(chan struct{})
|
||||
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
|
||||
|
||||
if opts.ParallelGPUs {
|
||||
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||
} else {
|
||||
@@ -310,6 +315,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
}
|
||||
}
|
||||
|
||||
// Stop CPU load sampler and attach results.
|
||||
close(cpuStopCh)
|
||||
if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
|
||||
result.CPULoad = summarizeCPULoad(cpuSamples)
|
||||
if result.CPULoad != nil && result.CPULoad.Status != "ok" {
|
||||
logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s",
|
||||
result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
|
||||
}
|
||||
}
|
||||
|
||||
// Compute server power characterization from accumulated IPMI samples.
|
||||
var gpuReportedSumW float64
|
||||
for _, gpu := range result.GPUs {
|
||||
@@ -1079,6 +1094,20 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
||||
}
|
||||
if cl := result.CPULoad; cl != nil {
|
||||
switch cl.Status {
|
||||
case "high":
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
|
||||
cl.AvgPct, cl.MaxPct,
|
||||
))
|
||||
case "unstable":
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
|
||||
cl.AvgPct, cl.P95Pct,
|
||||
))
|
||||
}
|
||||
}
|
||||
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
||||
if sp.ReportingRatio < 0.75 {
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
@@ -1571,3 +1600,168 @@ func runNvidiaBenchmarkParallel(
|
||||
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
|
||||
}
|
||||
}
|
||||
|
||||
// readBenchmarkHostConfig reads static CPU and memory configuration from
|
||||
// /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable.
|
||||
func readBenchmarkHostConfig() *BenchmarkHostConfig {
|
||||
cfg := &BenchmarkHostConfig{}
|
||||
populated := false
|
||||
|
||||
// Parse /proc/cpuinfo for CPU model, sockets, cores, threads.
|
||||
if data, err := os.ReadFile("/proc/cpuinfo"); err == nil {
|
||||
socketIDs := map[string]struct{}{}
|
||||
coresPerSocket := map[string]int{}
|
||||
var modelName string
|
||||
threads := 0
|
||||
for _, line := range strings.Split(string(data), "\n") {
|
||||
kv := strings.SplitN(line, ":", 2)
|
||||
if len(kv) != 2 {
|
||||
continue
|
||||
}
|
||||
key := strings.TrimSpace(kv[0])
|
||||
val := strings.TrimSpace(kv[1])
|
||||
switch key {
|
||||
case "processor":
|
||||
threads++
|
||||
case "model name":
|
||||
if modelName == "" {
|
||||
modelName = val
|
||||
}
|
||||
case "physical id":
|
||||
socketIDs[val] = struct{}{}
|
||||
case "cpu cores":
|
||||
// Overwrite per-socket core count (last wins per socket, but all
|
||||
// entries for the same socket report the same value).
|
||||
if physLine := ""; physLine == "" {
|
||||
// We accumulate below by treating cpu cores as a per-thread
|
||||
// field; sum by socket requires a two-pass approach. Use the
|
||||
// simpler approximation: totalCores = threads / (threads per core).
|
||||
_ = val
|
||||
}
|
||||
}
|
||||
}
|
||||
// Second pass: per-socket core count.
|
||||
var curSocket string
|
||||
for _, line := range strings.Split(string(data), "\n") {
|
||||
kv := strings.SplitN(line, ":", 2)
|
||||
if len(kv) != 2 {
|
||||
continue
|
||||
}
|
||||
key := strings.TrimSpace(kv[0])
|
||||
val := strings.TrimSpace(kv[1])
|
||||
switch key {
|
||||
case "physical id":
|
||||
curSocket = val
|
||||
case "cpu cores":
|
||||
if curSocket != "" {
|
||||
if _, seen := coresPerSocket[curSocket]; !seen {
|
||||
v, _ := strconv.Atoi(val)
|
||||
coresPerSocket[curSocket] = v
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
totalCores := 0
|
||||
for _, c := range coresPerSocket {
|
||||
totalCores += c
|
||||
}
|
||||
cfg.CPUModel = modelName
|
||||
cfg.CPUSockets = len(socketIDs)
|
||||
if cfg.CPUSockets == 0 && threads > 0 {
|
||||
cfg.CPUSockets = 1
|
||||
}
|
||||
cfg.CPUCores = totalCores
|
||||
cfg.CPUThreads = threads
|
||||
if modelName != "" || threads > 0 {
|
||||
populated = true
|
||||
}
|
||||
}
|
||||
|
||||
// Parse /proc/meminfo for total physical RAM.
|
||||
if data, err := os.ReadFile("/proc/meminfo"); err == nil {
|
||||
for _, line := range strings.Split(string(data), "\n") {
|
||||
if strings.HasPrefix(line, "MemTotal:") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 2 {
|
||||
kb, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||
cfg.MemTotalGiB = float64(kb) / (1024 * 1024)
|
||||
populated = true
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !populated {
|
||||
return nil
|
||||
}
|
||||
return cfg
|
||||
}
|
||||
|
||||
// startCPULoadSampler starts a goroutine that samples host CPU load every
|
||||
// intervalSec seconds until stopCh is closed, then sends the collected
|
||||
// samples on the returned channel.
|
||||
func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
|
||||
ch := make(chan []float64, 1)
|
||||
go func() {
|
||||
var samples []float64
|
||||
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
ch <- samples
|
||||
return
|
||||
case <-ticker.C:
|
||||
if pct := sampleCPULoadPct(); pct > 0 {
|
||||
samples = append(samples, pct)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
// summarizeCPULoad computes stats over sampled CPU load values and assigns
|
||||
// a health status.
|
||||
func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
|
||||
if len(samples) == 0 {
|
||||
return nil
|
||||
}
|
||||
sorted := append([]float64(nil), samples...)
|
||||
sort.Float64s(sorted)
|
||||
var sum float64
|
||||
for _, v := range sorted {
|
||||
sum += v
|
||||
}
|
||||
avg := sum / float64(len(sorted))
|
||||
p95 := sorted[int(float64(len(sorted))*0.95)]
|
||||
max := sorted[len(sorted)-1]
|
||||
|
||||
cl := &BenchmarkCPULoad{
|
||||
AvgPct: math.Round(avg*10) / 10,
|
||||
MaxPct: math.Round(max*10) / 10,
|
||||
P95Pct: math.Round(p95*10) / 10,
|
||||
Samples: len(sorted),
|
||||
}
|
||||
|
||||
// Compute standard deviation to detect instability.
|
||||
var variance float64
|
||||
for _, v := range sorted {
|
||||
d := v - avg
|
||||
variance += d * d
|
||||
}
|
||||
stdDev := math.Sqrt(variance / float64(len(sorted)))
|
||||
|
||||
switch {
|
||||
case avg > 20 || max > 40:
|
||||
cl.Status = "high"
|
||||
cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, max)
|
||||
case stdDev > 12:
|
||||
cl.Status = "unstable"
|
||||
cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev)
|
||||
default:
|
||||
cl.Status = "ok"
|
||||
}
|
||||
return cl
|
||||
}
|
||||
|
||||
@@ -2,6 +2,29 @@ package platform
|
||||
|
||||
import "time"
|
||||
|
||||
// BenchmarkHostConfig holds static CPU and memory configuration captured at
// benchmark start. Useful for correlating results across runs on different hardware.
// Populated from /proc/cpuinfo and /proc/meminfo by readBenchmarkHostConfig.
type BenchmarkHostConfig struct {
	// CPUModel is the "model name" string from /proc/cpuinfo (first entry).
	CPUModel string `json:"cpu_model,omitempty"`
	// CPUSockets is the number of distinct "physical id" values seen
	// (reported as 1 when the field is absent but threads were counted).
	CPUSockets int `json:"cpu_sockets,omitempty"`
	// CPUCores is the total physical core count summed across sockets.
	CPUCores int `json:"cpu_cores,omitempty"`
	// CPUThreads is the number of logical processors (hardware threads).
	CPUThreads int `json:"cpu_threads,omitempty"`
	// MemTotalGiB is total physical RAM from the MemTotal line of
	// /proc/meminfo, converted from kB to GiB.
	MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
}
|
||||
|
||||
// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
// steady-state phase. High or unstable CPU load during a GPU benchmark may
// indicate a competing workload or a CPU-bound driver bottleneck.
type BenchmarkCPULoad struct {
	// AvgPct is the mean utilisation across all samples, percent, 1 decimal.
	AvgPct float64 `json:"avg_pct"`
	// MaxPct is the highest sampled utilisation, percent, 1 decimal.
	MaxPct float64 `json:"max_pct"`
	// P95Pct is the 95th-percentile sampled utilisation, percent, 1 decimal.
	P95Pct float64 `json:"p95_pct"`
	// Samples is the number of utilisation readings taken.
	Samples int `json:"samples"`
	// Status is "ok", "high", or "unstable".
	Status string `json:"status"`
	// Note is a human-readable explanation, set only when Status is not "ok".
	Note string `json:"note,omitempty"`
}
|
||||
|
||||
const (
|
||||
NvidiaBenchmarkProfileStandard = "standard"
|
||||
NvidiaBenchmarkProfileStability = "stability"
|
||||
@@ -30,6 +53,8 @@ type NvidiaBenchmarkResult struct {
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
|
||||
Reference in New Issue
Block a user