From 9e3dcf9b4d2cfa0fdffd956117e73037f97ea25a Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sun, 12 Apr 2026 20:02:04 +0300 Subject: [PATCH] Record host CPU/RAM config in benchmark results; check CPU load - BenchmarkHostConfig captures CPU model, sockets, cores, threads, and total RAM from /proc/cpuinfo and /proc/meminfo at benchmark start. - BenchmarkCPULoad samples host CPU utilisation every 10 s throughout the GPU steady-state phase (sequential and parallel paths). - Summarises avg/max/p95 and classifies status as ok / high / unstable. - Adds a finding when CPU load is elevated (avg >20% or max >40%) or erratic (stddev >12%), with a plain-English description in the report. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 194 +++++++++++++++++++++ audit/internal/platform/benchmark_types.go | 25 +++ 2 files changed, 219 insertions(+) diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index b0a426f..8dfbadb 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -109,6 +109,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv BenchmarkProfile: spec.Name, ParallelGPUs: opts.ParallelGPUs, SelectedGPUIndices: append([]int(nil), selected...), + HostConfig: readBenchmarkHostConfig(), Normalization: BenchmarkNormalization{ Status: "full", }, @@ -152,6 +153,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } }() + // Start background CPU load sampler — samples every 10s during GPU phases. 
+	cpuStopCh := make(chan struct{})
+	cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
+
 	if opts.ParallelGPUs {
 		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
 	} else {
@@ -310,6 +315,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
 		}
 	}
 
+	// Stop CPU load sampler and attach results.
+	close(cpuStopCh)
+	if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
+		result.CPULoad = summarizeCPULoad(cpuSamples)
+		if result.CPULoad != nil && result.CPULoad.Status != "ok" {
+			logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s",
+				result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
+		}
+	}
+
 	// Compute server power characterization from accumulated IPMI samples.
 	var gpuReportedSumW float64
 	for _, gpu := range result.GPUs {
@@ -1079,6 +1094,20 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
 	if result.Interconnect != nil && result.Interconnect.Supported {
 		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
 	}
+	if cl := result.CPULoad; cl != nil {
+		switch cl.Status {
+		case "high":
+			findings = append(findings, fmt.Sprintf(
+				"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
+				cl.AvgPct, cl.MaxPct,
+			))
+		case "unstable":
+			findings = append(findings, fmt.Sprintf(
+				"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
+				cl.AvgPct, cl.P95Pct,
+			))
+		}
+	}
 	if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
 		if sp.ReportingRatio < 0.75 {
 			findings = append(findings, fmt.Sprintf(
@@ -1571,3 +1600,162 @@ func runNvidiaBenchmarkParallel(
 		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
 	}
 }
+
+// readBenchmarkHostConfig reads static CPU and memory configuration from
+// /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable.
+func readBenchmarkHostConfig() *BenchmarkHostConfig {
+	cfg := &BenchmarkHostConfig{}
+	populated := false
+
+	// Parse /proc/cpuinfo for CPU model, sockets, cores, threads.
+	if data, err := os.ReadFile("/proc/cpuinfo"); err == nil {
+		socketIDs := map[string]struct{}{}
+		coresPerSocket := map[string]int{}
+		var modelName string
+		threads := 0
+		for _, line := range strings.Split(string(data), "\n") {
+			kv := strings.SplitN(line, ":", 2)
+			if len(kv) != 2 {
+				continue
+			}
+			key := strings.TrimSpace(kv[0])
+			val := strings.TrimSpace(kv[1])
+			switch key {
+			case "processor":
+				threads++
+			case "model name":
+				if modelName == "" {
+					modelName = val
+				}
+			case "physical id":
+				socketIDs[val] = struct{}{}
+			}
+		}
+		// Second pass: per-socket core count.
+		var curSocket string
+		for _, line := range strings.Split(string(data), "\n") {
+			kv := strings.SplitN(line, ":", 2)
+			if len(kv) != 2 {
+				continue
+			}
+			key := strings.TrimSpace(kv[0])
+			val := strings.TrimSpace(kv[1])
+			switch key {
+			case "physical id":
+				curSocket = val
+			case "cpu cores":
+				// Some VMs omit "physical id"; fall back to socket "0".
+				sock := curSocket
+				if sock == "" {
+					sock = "0"
+				}
+				if _, seen := coresPerSocket[sock]; !seen {
+					v, _ := strconv.Atoi(val)
+					coresPerSocket[sock] = v
+				}
+			}
+		}
+		totalCores := 0
+		for _, c := range coresPerSocket {
+			totalCores += c
+		}
+		cfg.CPUModel = modelName
+		cfg.CPUSockets = len(socketIDs)
+		if cfg.CPUSockets == 0 && threads > 0 {
+			cfg.CPUSockets = 1
+		}
+		cfg.CPUCores = totalCores
+		cfg.CPUThreads = threads
+		if modelName != "" || threads > 0 {
+			populated = true
+		}
+	}
+
+	// Parse /proc/meminfo for total physical RAM.
+	if data, err := os.ReadFile("/proc/meminfo"); err == nil {
+		for _, line := range strings.Split(string(data), "\n") {
+			if strings.HasPrefix(line, "MemTotal:") {
+				fields := strings.Fields(line)
+				if len(fields) >= 2 {
+					kb, _ := strconv.ParseUint(fields[1], 10, 64)
+					cfg.MemTotalGiB = float64(kb) / (1024 * 1024)
+					populated = true
+				}
+				break
+			}
+		}
+	}
+
+	if !populated {
+		return nil
+	}
+	return cfg
+}
+
+// startCPULoadSampler starts a goroutine that samples host CPU load every
+// intervalSec seconds until stopCh is closed, then sends the collected
+// samples on the returned channel.
+func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
+	ch := make(chan []float64, 1)
+	go func() {
+		var samples []float64
+		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-stopCh:
+				ch <- samples
+				return
+			case <-ticker.C:
+				if pct := sampleCPULoadPct(); pct > 0 {
+					samples = append(samples, pct)
+				}
+			}
+		}
+	}()
+	return ch
+}
+
+// summarizeCPULoad computes stats over sampled CPU load values and assigns
+// a health status.
+func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
+	if len(samples) == 0 {
+		return nil
+	}
+	sorted := append([]float64(nil), samples...)
+	sort.Float64s(sorted)
+	var sum float64
+	for _, v := range sorted {
+		sum += v
+	}
+	avg := sum / float64(len(sorted))
+	p95 := sorted[int(float64(len(sorted))*0.95)]
+	maxPct := sorted[len(sorted)-1]
+
+	cl := &BenchmarkCPULoad{
+		AvgPct:  math.Round(avg*10) / 10,
+		MaxPct:  math.Round(maxPct*10) / 10,
+		P95Pct:  math.Round(p95*10) / 10,
+		Samples: len(sorted),
+	}
+
+	// Compute standard deviation to detect instability.
+	var variance float64
+	for _, v := range sorted {
+		d := v - avg
+		variance += d * d
+	}
+	stdDev := math.Sqrt(variance / float64(len(sorted)))
+
+	switch {
+	case avg > 20 || maxPct > 40:
+		cl.Status = "high"
+		cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, maxPct)
+	case stdDev > 12:
+		cl.Status = "unstable"
+		cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev)
+	default:
+		cl.Status = "ok"
+	}
+	return cl
+}
diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go
index 63c9c51..b309e0a 100644
--- a/audit/internal/platform/benchmark_types.go
+++ b/audit/internal/platform/benchmark_types.go
@@ -2,6 +2,29 @@ package platform
 
 import "time"
 
+// BenchmarkHostConfig holds static CPU and memory configuration captured at
+// benchmark start. Useful for correlating results across runs on different hardware.
+type BenchmarkHostConfig struct {
+	CPUModel    string  `json:"cpu_model,omitempty"`
+	CPUSockets  int     `json:"cpu_sockets,omitempty"`
+	CPUCores    int     `json:"cpu_cores,omitempty"`
+	CPUThreads  int     `json:"cpu_threads,omitempty"`
+	MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
+}
+
+// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
+// steady-state phase. High or unstable CPU load during a GPU benchmark may
+// indicate a competing workload or a CPU-bound driver bottleneck.
+type BenchmarkCPULoad struct {
+	AvgPct  float64 `json:"avg_pct"`
+	MaxPct  float64 `json:"max_pct"`
+	P95Pct  float64 `json:"p95_pct"`
+	Samples int     `json:"samples"`
+	// Status is "ok", "high", or "unstable".
+	Status string `json:"status"`
+	Note   string `json:"note,omitempty"`
+}
+
 const (
 	NvidiaBenchmarkProfileStandard = "standard"
 	NvidiaBenchmarkProfileStability = "stability"
@@ -30,6 +53,8 @@ type NvidiaBenchmarkResult struct {
 	Findings      []string               `json:"findings,omitempty"`
 	Warnings      []string               `json:"warnings,omitempty"`
 	Normalization BenchmarkNormalization `json:"normalization"`
+	HostConfig    *BenchmarkHostConfig   `json:"host_config,omitempty"`
+	CPULoad       *BenchmarkCPULoad      `json:"cpu_load,omitempty"`
 	GPUs          []BenchmarkGPUResult   `json:"gpus"`
 	Interconnect  *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
 	ServerPower   *BenchmarkServerPower  `json:"server_power,omitempty"`