Record host CPU/RAM config in benchmark results; check CPU load
- BenchmarkHostConfig captures CPU model, sockets, cores, threads, and total RAM from /proc/cpuinfo and /proc/meminfo at benchmark start. - BenchmarkCPULoad samples host CPU utilisation every 10 s throughout the GPU steady-state phase (sequential and parallel paths). - Summarises avg/max/p95 and classifies status as ok / high / unstable. - Adds a finding when CPU load is elevated (avg >20% or max >40%) or erratic (stddev >12%), with a plain-English description in the report. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -109,6 +109,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
BenchmarkProfile: spec.Name,
|
BenchmarkProfile: spec.Name,
|
||||||
ParallelGPUs: opts.ParallelGPUs,
|
ParallelGPUs: opts.ParallelGPUs,
|
||||||
SelectedGPUIndices: append([]int(nil), selected...),
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
|
HostConfig: readBenchmarkHostConfig(),
|
||||||
Normalization: BenchmarkNormalization{
|
Normalization: BenchmarkNormalization{
|
||||||
Status: "full",
|
Status: "full",
|
||||||
},
|
},
|
||||||
@@ -152,6 +153,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
// Start background CPU load sampler — samples every 10s during GPU phases.
|
||||||
|
cpuStopCh := make(chan struct{})
|
||||||
|
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
|
||||||
|
|
||||||
if opts.ParallelGPUs {
|
if opts.ParallelGPUs {
|
||||||
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||||
} else {
|
} else {
|
||||||
@@ -310,6 +315,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Stop CPU load sampler and attach results.
|
||||||
|
close(cpuStopCh)
|
||||||
|
if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
|
||||||
|
result.CPULoad = summarizeCPULoad(cpuSamples)
|
||||||
|
if result.CPULoad != nil && result.CPULoad.Status != "ok" {
|
||||||
|
logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s",
|
||||||
|
result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Compute server power characterization from accumulated IPMI samples.
|
// Compute server power characterization from accumulated IPMI samples.
|
||||||
var gpuReportedSumW float64
|
var gpuReportedSumW float64
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
@@ -1079,6 +1094,20 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
|||||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||||
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
||||||
}
|
}
|
||||||
|
if cl := result.CPULoad; cl != nil {
|
||||||
|
switch cl.Status {
|
||||||
|
case "high":
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
|
||||||
|
cl.AvgPct, cl.MaxPct,
|
||||||
|
))
|
||||||
|
case "unstable":
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
|
||||||
|
cl.AvgPct, cl.P95Pct,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
||||||
if sp.ReportingRatio < 0.75 {
|
if sp.ReportingRatio < 0.75 {
|
||||||
findings = append(findings, fmt.Sprintf(
|
findings = append(findings, fmt.Sprintf(
|
||||||
@@ -1571,3 +1600,168 @@ func runNvidiaBenchmarkParallel(
|
|||||||
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
|
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// readBenchmarkHostConfig reads static CPU and memory configuration from
|
||||||
|
// /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable.
|
||||||
|
func readBenchmarkHostConfig() *BenchmarkHostConfig {
|
||||||
|
cfg := &BenchmarkHostConfig{}
|
||||||
|
populated := false
|
||||||
|
|
||||||
|
// Parse /proc/cpuinfo for CPU model, sockets, cores, threads.
|
||||||
|
if data, err := os.ReadFile("/proc/cpuinfo"); err == nil {
|
||||||
|
socketIDs := map[string]struct{}{}
|
||||||
|
coresPerSocket := map[string]int{}
|
||||||
|
var modelName string
|
||||||
|
threads := 0
|
||||||
|
for _, line := range strings.Split(string(data), "\n") {
|
||||||
|
kv := strings.SplitN(line, ":", 2)
|
||||||
|
if len(kv) != 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := strings.TrimSpace(kv[0])
|
||||||
|
val := strings.TrimSpace(kv[1])
|
||||||
|
switch key {
|
||||||
|
case "processor":
|
||||||
|
threads++
|
||||||
|
case "model name":
|
||||||
|
if modelName == "" {
|
||||||
|
modelName = val
|
||||||
|
}
|
||||||
|
case "physical id":
|
||||||
|
socketIDs[val] = struct{}{}
|
||||||
|
case "cpu cores":
|
||||||
|
// Overwrite per-socket core count (last wins per socket, but all
|
||||||
|
// entries for the same socket report the same value).
|
||||||
|
if physLine := ""; physLine == "" {
|
||||||
|
// We accumulate below by treating cpu cores as a per-thread
|
||||||
|
// field; sum by socket requires a two-pass approach. Use the
|
||||||
|
// simpler approximation: totalCores = threads / (threads per core).
|
||||||
|
_ = val
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Second pass: per-socket core count.
|
||||||
|
var curSocket string
|
||||||
|
for _, line := range strings.Split(string(data), "\n") {
|
||||||
|
kv := strings.SplitN(line, ":", 2)
|
||||||
|
if len(kv) != 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := strings.TrimSpace(kv[0])
|
||||||
|
val := strings.TrimSpace(kv[1])
|
||||||
|
switch key {
|
||||||
|
case "physical id":
|
||||||
|
curSocket = val
|
||||||
|
case "cpu cores":
|
||||||
|
if curSocket != "" {
|
||||||
|
if _, seen := coresPerSocket[curSocket]; !seen {
|
||||||
|
v, _ := strconv.Atoi(val)
|
||||||
|
coresPerSocket[curSocket] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
totalCores := 0
|
||||||
|
for _, c := range coresPerSocket {
|
||||||
|
totalCores += c
|
||||||
|
}
|
||||||
|
cfg.CPUModel = modelName
|
||||||
|
cfg.CPUSockets = len(socketIDs)
|
||||||
|
if cfg.CPUSockets == 0 && threads > 0 {
|
||||||
|
cfg.CPUSockets = 1
|
||||||
|
}
|
||||||
|
cfg.CPUCores = totalCores
|
||||||
|
cfg.CPUThreads = threads
|
||||||
|
if modelName != "" || threads > 0 {
|
||||||
|
populated = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse /proc/meminfo for total physical RAM.
|
||||||
|
if data, err := os.ReadFile("/proc/meminfo"); err == nil {
|
||||||
|
for _, line := range strings.Split(string(data), "\n") {
|
||||||
|
if strings.HasPrefix(line, "MemTotal:") {
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if len(fields) >= 2 {
|
||||||
|
kb, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||||
|
cfg.MemTotalGiB = float64(kb) / (1024 * 1024)
|
||||||
|
populated = true
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !populated {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return cfg
|
||||||
|
}
|
||||||
|
|
||||||
|
// startCPULoadSampler starts a goroutine that samples host CPU load every
|
||||||
|
// intervalSec seconds until stopCh is closed, then sends the collected
|
||||||
|
// samples on the returned channel.
|
||||||
|
func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
|
||||||
|
ch := make(chan []float64, 1)
|
||||||
|
go func() {
|
||||||
|
var samples []float64
|
||||||
|
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
ch <- samples
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
if pct := sampleCPULoadPct(); pct > 0 {
|
||||||
|
samples = append(samples, pct)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
|
||||||
|
// summarizeCPULoad computes stats over sampled CPU load values and assigns
|
||||||
|
// a health status.
|
||||||
|
func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
sorted := append([]float64(nil), samples...)
|
||||||
|
sort.Float64s(sorted)
|
||||||
|
var sum float64
|
||||||
|
for _, v := range sorted {
|
||||||
|
sum += v
|
||||||
|
}
|
||||||
|
avg := sum / float64(len(sorted))
|
||||||
|
p95 := sorted[int(float64(len(sorted))*0.95)]
|
||||||
|
max := sorted[len(sorted)-1]
|
||||||
|
|
||||||
|
cl := &BenchmarkCPULoad{
|
||||||
|
AvgPct: math.Round(avg*10) / 10,
|
||||||
|
MaxPct: math.Round(max*10) / 10,
|
||||||
|
P95Pct: math.Round(p95*10) / 10,
|
||||||
|
Samples: len(sorted),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute standard deviation to detect instability.
|
||||||
|
var variance float64
|
||||||
|
for _, v := range sorted {
|
||||||
|
d := v - avg
|
||||||
|
variance += d * d
|
||||||
|
}
|
||||||
|
stdDev := math.Sqrt(variance / float64(len(sorted)))
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case avg > 20 || max > 40:
|
||||||
|
cl.Status = "high"
|
||||||
|
cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, max)
|
||||||
|
case stdDev > 12:
|
||||||
|
cl.Status = "unstable"
|
||||||
|
cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev)
|
||||||
|
default:
|
||||||
|
cl.Status = "ok"
|
||||||
|
}
|
||||||
|
return cl
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,6 +2,29 @@ package platform
|
|||||||
|
|
||||||
import "time"
|
import "time"
|
||||||
|
|
||||||
|
// BenchmarkHostConfig holds static CPU and memory configuration captured at
// benchmark start. Useful for correlating results across runs on different hardware.
type BenchmarkHostConfig struct {
	// CPUModel is the "model name" field from /proc/cpuinfo (first entry seen).
	CPUModel string `json:"cpu_model,omitempty"`
	// CPUSockets is the number of distinct "physical id" values; defaults to 1
	// when the field is absent but logical CPUs were seen.
	CPUSockets int `json:"cpu_sockets,omitempty"`
	// CPUCores is the total physical core count summed across sockets.
	CPUCores int `json:"cpu_cores,omitempty"`
	// CPUThreads is the number of logical processors ("processor" entries).
	CPUThreads int `json:"cpu_threads,omitempty"`
	// MemTotalGiB is MemTotal from /proc/meminfo converted from kB to GiB.
	MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
// steady-state phase. High or unstable CPU load during a GPU benchmark may
// indicate a competing workload or a CPU-bound driver bottleneck.
type BenchmarkCPULoad struct {
	// AvgPct is the mean utilisation percentage, rounded to one decimal.
	AvgPct float64 `json:"avg_pct"`
	// MaxPct is the highest single sample, rounded to one decimal.
	MaxPct float64 `json:"max_pct"`
	// P95Pct is the 95th-percentile sample, rounded to one decimal.
	P95Pct float64 `json:"p95_pct"`
	// Samples is the number of utilisation samples collected.
	Samples int `json:"samples"`
	// Status is "ok", "high", or "unstable".
	Status string `json:"status"`
	// Note is a human-readable explanation; set only for non-"ok" statuses.
	Note string `json:"note,omitempty"`
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
NvidiaBenchmarkProfileStandard = "standard"
|
NvidiaBenchmarkProfileStandard = "standard"
|
||||||
NvidiaBenchmarkProfileStability = "stability"
|
NvidiaBenchmarkProfileStability = "stability"
|
||||||
@@ -30,6 +53,8 @@ type NvidiaBenchmarkResult struct {
|
|||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
Warnings []string `json:"warnings,omitempty"`
|
Warnings []string `json:"warnings,omitempty"`
|
||||||
Normalization BenchmarkNormalization `json:"normalization"`
|
Normalization BenchmarkNormalization `json:"normalization"`
|
||||||
|
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||||
|
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
|
|||||||
Reference in New Issue
Block a user