Record host CPU/RAM config in benchmark results; check CPU load
- BenchmarkHostConfig captures CPU model, sockets, cores, threads, and total RAM from /proc/cpuinfo and /proc/meminfo at benchmark start.
- BenchmarkCPULoad samples host CPU utilisation every 10 s throughout the GPU steady-state phase (sequential and parallel paths).
- Summarises avg/max/p95 and classifies status as ok / high / unstable.
- Adds a finding when CPU load is elevated (avg >20% or max >40%) or erratic (stddev >12%), with a plain-English description in the report.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -109,6 +109,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
BenchmarkProfile: spec.Name,
|
||||
ParallelGPUs: opts.ParallelGPUs,
|
||||
SelectedGPUIndices: append([]int(nil), selected...),
|
||||
HostConfig: readBenchmarkHostConfig(),
|
||||
Normalization: BenchmarkNormalization{
|
||||
Status: "full",
|
||||
},
|
||||
@@ -152,6 +153,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
}
|
||||
}()
|
||||
|
||||
// Start background CPU load sampler — samples every 10s during GPU phases.
|
||||
cpuStopCh := make(chan struct{})
|
||||
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
|
||||
|
||||
if opts.ParallelGPUs {
|
||||
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||
} else {
|
||||
@@ -310,6 +315,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
}
|
||||
}
|
||||
|
||||
// Stop CPU load sampler and attach results.
|
||||
close(cpuStopCh)
|
||||
if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
|
||||
result.CPULoad = summarizeCPULoad(cpuSamples)
|
||||
if result.CPULoad != nil && result.CPULoad.Status != "ok" {
|
||||
logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s",
|
||||
result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
|
||||
}
|
||||
}
|
||||
|
||||
// Compute server power characterization from accumulated IPMI samples.
|
||||
var gpuReportedSumW float64
|
||||
for _, gpu := range result.GPUs {
|
||||
@@ -1079,6 +1094,20 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
||||
}
|
||||
if cl := result.CPULoad; cl != nil {
|
||||
switch cl.Status {
|
||||
case "high":
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
|
||||
cl.AvgPct, cl.MaxPct,
|
||||
))
|
||||
case "unstable":
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
|
||||
cl.AvgPct, cl.P95Pct,
|
||||
))
|
||||
}
|
||||
}
|
||||
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
||||
if sp.ReportingRatio < 0.75 {
|
||||
findings = append(findings, fmt.Sprintf(
|
||||
@@ -1571,3 +1600,168 @@ func runNvidiaBenchmarkParallel(
|
||||
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
|
||||
}
|
||||
}
|
||||
|
||||
// readBenchmarkHostConfig reads static CPU and memory configuration from
|
||||
// /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable.
|
||||
func readBenchmarkHostConfig() *BenchmarkHostConfig {
|
||||
cfg := &BenchmarkHostConfig{}
|
||||
populated := false
|
||||
|
||||
// Parse /proc/cpuinfo for CPU model, sockets, cores, threads.
|
||||
if data, err := os.ReadFile("/proc/cpuinfo"); err == nil {
|
||||
socketIDs := map[string]struct{}{}
|
||||
coresPerSocket := map[string]int{}
|
||||
var modelName string
|
||||
threads := 0
|
||||
for _, line := range strings.Split(string(data), "\n") {
|
||||
kv := strings.SplitN(line, ":", 2)
|
||||
if len(kv) != 2 {
|
||||
continue
|
||||
}
|
||||
key := strings.TrimSpace(kv[0])
|
||||
val := strings.TrimSpace(kv[1])
|
||||
switch key {
|
||||
case "processor":
|
||||
threads++
|
||||
case "model name":
|
||||
if modelName == "" {
|
||||
modelName = val
|
||||
}
|
||||
case "physical id":
|
||||
socketIDs[val] = struct{}{}
|
||||
case "cpu cores":
|
||||
// Overwrite per-socket core count (last wins per socket, but all
|
||||
// entries for the same socket report the same value).
|
||||
if physLine := ""; physLine == "" {
|
||||
// We accumulate below by treating cpu cores as a per-thread
|
||||
// field; sum by socket requires a two-pass approach. Use the
|
||||
// simpler approximation: totalCores = threads / (threads per core).
|
||||
_ = val
|
||||
}
|
||||
}
|
||||
}
|
||||
// Second pass: per-socket core count.
|
||||
var curSocket string
|
||||
for _, line := range strings.Split(string(data), "\n") {
|
||||
kv := strings.SplitN(line, ":", 2)
|
||||
if len(kv) != 2 {
|
||||
continue
|
||||
}
|
||||
key := strings.TrimSpace(kv[0])
|
||||
val := strings.TrimSpace(kv[1])
|
||||
switch key {
|
||||
case "physical id":
|
||||
curSocket = val
|
||||
case "cpu cores":
|
||||
if curSocket != "" {
|
||||
if _, seen := coresPerSocket[curSocket]; !seen {
|
||||
v, _ := strconv.Atoi(val)
|
||||
coresPerSocket[curSocket] = v
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
totalCores := 0
|
||||
for _, c := range coresPerSocket {
|
||||
totalCores += c
|
||||
}
|
||||
cfg.CPUModel = modelName
|
||||
cfg.CPUSockets = len(socketIDs)
|
||||
if cfg.CPUSockets == 0 && threads > 0 {
|
||||
cfg.CPUSockets = 1
|
||||
}
|
||||
cfg.CPUCores = totalCores
|
||||
cfg.CPUThreads = threads
|
||||
if modelName != "" || threads > 0 {
|
||||
populated = true
|
||||
}
|
||||
}
|
||||
|
||||
// Parse /proc/meminfo for total physical RAM.
|
||||
if data, err := os.ReadFile("/proc/meminfo"); err == nil {
|
||||
for _, line := range strings.Split(string(data), "\n") {
|
||||
if strings.HasPrefix(line, "MemTotal:") {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) >= 2 {
|
||||
kb, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||
cfg.MemTotalGiB = float64(kb) / (1024 * 1024)
|
||||
populated = true
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !populated {
|
||||
return nil
|
||||
}
|
||||
return cfg
|
||||
}
|
||||
|
||||
// startCPULoadSampler starts a goroutine that samples host CPU load every
|
||||
// intervalSec seconds until stopCh is closed, then sends the collected
|
||||
// samples on the returned channel.
|
||||
func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
|
||||
ch := make(chan []float64, 1)
|
||||
go func() {
|
||||
var samples []float64
|
||||
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-stopCh:
|
||||
ch <- samples
|
||||
return
|
||||
case <-ticker.C:
|
||||
if pct := sampleCPULoadPct(); pct > 0 {
|
||||
samples = append(samples, pct)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
return ch
|
||||
}
|
||||
|
||||
// summarizeCPULoad computes stats over sampled CPU load values and assigns
|
||||
// a health status.
|
||||
func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
|
||||
if len(samples) == 0 {
|
||||
return nil
|
||||
}
|
||||
sorted := append([]float64(nil), samples...)
|
||||
sort.Float64s(sorted)
|
||||
var sum float64
|
||||
for _, v := range sorted {
|
||||
sum += v
|
||||
}
|
||||
avg := sum / float64(len(sorted))
|
||||
p95 := sorted[int(float64(len(sorted))*0.95)]
|
||||
max := sorted[len(sorted)-1]
|
||||
|
||||
cl := &BenchmarkCPULoad{
|
||||
AvgPct: math.Round(avg*10) / 10,
|
||||
MaxPct: math.Round(max*10) / 10,
|
||||
P95Pct: math.Round(p95*10) / 10,
|
||||
Samples: len(sorted),
|
||||
}
|
||||
|
||||
// Compute standard deviation to detect instability.
|
||||
var variance float64
|
||||
for _, v := range sorted {
|
||||
d := v - avg
|
||||
variance += d * d
|
||||
}
|
||||
stdDev := math.Sqrt(variance / float64(len(sorted)))
|
||||
|
||||
switch {
|
||||
case avg > 20 || max > 40:
|
||||
cl.Status = "high"
|
||||
cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, max)
|
||||
case stdDev > 12:
|
||||
cl.Status = "unstable"
|
||||
cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev)
|
||||
default:
|
||||
cl.Status = "ok"
|
||||
}
|
||||
return cl
|
||||
}
|
||||
|
||||
@@ -2,6 +2,29 @@ package platform
|
||||
|
||||
import "time"
|
||||
|
||||
// BenchmarkHostConfig holds static CPU and memory configuration captured at
// benchmark start. Useful for correlating results across runs on different hardware.
// Populated from /proc/cpuinfo and /proc/meminfo by readBenchmarkHostConfig.
type BenchmarkHostConfig struct {
	// CPUModel is the "model name" string from /proc/cpuinfo (first entry).
	CPUModel string `json:"cpu_model,omitempty"`
	// CPUSockets is the number of distinct "physical id" values seen
	// (reported as 1 when the field is absent but threads were counted).
	CPUSockets int `json:"cpu_sockets,omitempty"`
	// CPUCores is the total physical core count summed across sockets.
	CPUCores int `json:"cpu_cores,omitempty"`
	// CPUThreads is the number of logical processors (hardware threads).
	CPUThreads int `json:"cpu_threads,omitempty"`
	// MemTotalGiB is total physical RAM from the MemTotal line of
	// /proc/meminfo, converted from kB to GiB.
	MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
}
|
||||
|
||||
// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
// steady-state phase. High or unstable CPU load during a GPU benchmark may
// indicate a competing workload or a CPU-bound driver bottleneck.
type BenchmarkCPULoad struct {
	// AvgPct is the mean utilisation across all samples, percent, 1 decimal.
	AvgPct float64 `json:"avg_pct"`
	// MaxPct is the highest sampled utilisation, percent, 1 decimal.
	MaxPct float64 `json:"max_pct"`
	// P95Pct is the 95th-percentile sampled utilisation, percent, 1 decimal.
	P95Pct float64 `json:"p95_pct"`
	// Samples is the number of utilisation readings taken.
	Samples int `json:"samples"`
	// Status is "ok", "high", or "unstable".
	Status string `json:"status"`
	// Note is a human-readable explanation, set only when Status is not "ok".
	Note string `json:"note,omitempty"`
}
|
||||
|
||||
const (
|
||||
NvidiaBenchmarkProfileStandard = "standard"
|
||||
NvidiaBenchmarkProfileStability = "stability"
|
||||
@@ -30,6 +53,8 @@ type NvidiaBenchmarkResult struct {
|
||||
Findings []string `json:"findings,omitempty"`
|
||||
Warnings []string `json:"warnings,omitempty"`
|
||||
Normalization BenchmarkNormalization `json:"normalization"`
|
||||
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||
|
||||
Reference in New Issue
Block a user