Record host CPU/RAM config in benchmark results; check CPU load
- BenchmarkHostConfig captures CPU model, sockets, cores, threads, and total RAM from /proc/cpuinfo and /proc/meminfo at benchmark start. - BenchmarkCPULoad samples host CPU utilisation every 10 s throughout the GPU steady-state phase (sequential and parallel paths). - Summarises avg/max/p95 and classifies status as ok / high / unstable. - Adds a finding when CPU load is elevated (avg >20% or max >40%) or erratic (stddev >12%), with a plain-English description in the report. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -109,6 +109,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
BenchmarkProfile: spec.Name,
|
BenchmarkProfile: spec.Name,
|
||||||
ParallelGPUs: opts.ParallelGPUs,
|
ParallelGPUs: opts.ParallelGPUs,
|
||||||
SelectedGPUIndices: append([]int(nil), selected...),
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
|
HostConfig: readBenchmarkHostConfig(),
|
||||||
Normalization: BenchmarkNormalization{
|
Normalization: BenchmarkNormalization{
|
||||||
Status: "full",
|
Status: "full",
|
||||||
},
|
},
|
||||||
@@ -152,6 +153,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
// Start background CPU load sampler — samples every 10s during GPU phases.
|
||||||
|
cpuStopCh := make(chan struct{})
|
||||||
|
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
|
||||||
|
|
||||||
if opts.ParallelGPUs {
|
if opts.ParallelGPUs {
|
||||||
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||||
} else {
|
} else {
|
||||||
@@ -310,6 +315,16 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Stop CPU load sampler and attach results.
|
||||||
|
close(cpuStopCh)
|
||||||
|
if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
|
||||||
|
result.CPULoad = summarizeCPULoad(cpuSamples)
|
||||||
|
if result.CPULoad != nil && result.CPULoad.Status != "ok" {
|
||||||
|
logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s",
|
||||||
|
result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Compute server power characterization from accumulated IPMI samples.
|
// Compute server power characterization from accumulated IPMI samples.
|
||||||
var gpuReportedSumW float64
|
var gpuReportedSumW float64
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
@@ -1079,6 +1094,20 @@ func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
|
|||||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||||
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
|
||||||
}
|
}
|
||||||
|
if cl := result.CPULoad; cl != nil {
|
||||||
|
switch cl.Status {
|
||||||
|
case "high":
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
|
||||||
|
cl.AvgPct, cl.MaxPct,
|
||||||
|
))
|
||||||
|
case "unstable":
|
||||||
|
findings = append(findings, fmt.Sprintf(
|
||||||
|
"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
|
||||||
|
cl.AvgPct, cl.P95Pct,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
|
||||||
if sp.ReportingRatio < 0.75 {
|
if sp.ReportingRatio < 0.75 {
|
||||||
findings = append(findings, fmt.Sprintf(
|
findings = append(findings, fmt.Sprintf(
|
||||||
@@ -1571,3 +1600,168 @@ func runNvidiaBenchmarkParallel(
|
|||||||
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
|
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// readBenchmarkHostConfig reads static CPU and memory configuration from
|
||||||
|
// /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable.
|
||||||
|
func readBenchmarkHostConfig() *BenchmarkHostConfig {
|
||||||
|
cfg := &BenchmarkHostConfig{}
|
||||||
|
populated := false
|
||||||
|
|
||||||
|
// Parse /proc/cpuinfo for CPU model, sockets, cores, threads.
|
||||||
|
if data, err := os.ReadFile("/proc/cpuinfo"); err == nil {
|
||||||
|
socketIDs := map[string]struct{}{}
|
||||||
|
coresPerSocket := map[string]int{}
|
||||||
|
var modelName string
|
||||||
|
threads := 0
|
||||||
|
for _, line := range strings.Split(string(data), "\n") {
|
||||||
|
kv := strings.SplitN(line, ":", 2)
|
||||||
|
if len(kv) != 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := strings.TrimSpace(kv[0])
|
||||||
|
val := strings.TrimSpace(kv[1])
|
||||||
|
switch key {
|
||||||
|
case "processor":
|
||||||
|
threads++
|
||||||
|
case "model name":
|
||||||
|
if modelName == "" {
|
||||||
|
modelName = val
|
||||||
|
}
|
||||||
|
case "physical id":
|
||||||
|
socketIDs[val] = struct{}{}
|
||||||
|
case "cpu cores":
|
||||||
|
// Overwrite per-socket core count (last wins per socket, but all
|
||||||
|
// entries for the same socket report the same value).
|
||||||
|
if physLine := ""; physLine == "" {
|
||||||
|
// We accumulate below by treating cpu cores as a per-thread
|
||||||
|
// field; sum by socket requires a two-pass approach. Use the
|
||||||
|
// simpler approximation: totalCores = threads / (threads per core).
|
||||||
|
_ = val
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Second pass: per-socket core count.
|
||||||
|
var curSocket string
|
||||||
|
for _, line := range strings.Split(string(data), "\n") {
|
||||||
|
kv := strings.SplitN(line, ":", 2)
|
||||||
|
if len(kv) != 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := strings.TrimSpace(kv[0])
|
||||||
|
val := strings.TrimSpace(kv[1])
|
||||||
|
switch key {
|
||||||
|
case "physical id":
|
||||||
|
curSocket = val
|
||||||
|
case "cpu cores":
|
||||||
|
if curSocket != "" {
|
||||||
|
if _, seen := coresPerSocket[curSocket]; !seen {
|
||||||
|
v, _ := strconv.Atoi(val)
|
||||||
|
coresPerSocket[curSocket] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
totalCores := 0
|
||||||
|
for _, c := range coresPerSocket {
|
||||||
|
totalCores += c
|
||||||
|
}
|
||||||
|
cfg.CPUModel = modelName
|
||||||
|
cfg.CPUSockets = len(socketIDs)
|
||||||
|
if cfg.CPUSockets == 0 && threads > 0 {
|
||||||
|
cfg.CPUSockets = 1
|
||||||
|
}
|
||||||
|
cfg.CPUCores = totalCores
|
||||||
|
cfg.CPUThreads = threads
|
||||||
|
if modelName != "" || threads > 0 {
|
||||||
|
populated = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse /proc/meminfo for total physical RAM.
|
||||||
|
if data, err := os.ReadFile("/proc/meminfo"); err == nil {
|
||||||
|
for _, line := range strings.Split(string(data), "\n") {
|
||||||
|
if strings.HasPrefix(line, "MemTotal:") {
|
||||||
|
fields := strings.Fields(line)
|
||||||
|
if len(fields) >= 2 {
|
||||||
|
kb, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||||
|
cfg.MemTotalGiB = float64(kb) / (1024 * 1024)
|
||||||
|
populated = true
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !populated {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return cfg
|
||||||
|
}
|
||||||
|
|
||||||
|
// startCPULoadSampler starts a goroutine that samples host CPU load every
|
||||||
|
// intervalSec seconds until stopCh is closed, then sends the collected
|
||||||
|
// samples on the returned channel.
|
||||||
|
func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
|
||||||
|
ch := make(chan []float64, 1)
|
||||||
|
go func() {
|
||||||
|
var samples []float64
|
||||||
|
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-stopCh:
|
||||||
|
ch <- samples
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
if pct := sampleCPULoadPct(); pct > 0 {
|
||||||
|
samples = append(samples, pct)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
return ch
|
||||||
|
}
|
||||||
|
|
||||||
|
// summarizeCPULoad computes stats over sampled CPU load values and assigns
|
||||||
|
// a health status.
|
||||||
|
func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
sorted := append([]float64(nil), samples...)
|
||||||
|
sort.Float64s(sorted)
|
||||||
|
var sum float64
|
||||||
|
for _, v := range sorted {
|
||||||
|
sum += v
|
||||||
|
}
|
||||||
|
avg := sum / float64(len(sorted))
|
||||||
|
p95 := sorted[int(float64(len(sorted))*0.95)]
|
||||||
|
max := sorted[len(sorted)-1]
|
||||||
|
|
||||||
|
cl := &BenchmarkCPULoad{
|
||||||
|
AvgPct: math.Round(avg*10) / 10,
|
||||||
|
MaxPct: math.Round(max*10) / 10,
|
||||||
|
P95Pct: math.Round(p95*10) / 10,
|
||||||
|
Samples: len(sorted),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute standard deviation to detect instability.
|
||||||
|
var variance float64
|
||||||
|
for _, v := range sorted {
|
||||||
|
d := v - avg
|
||||||
|
variance += d * d
|
||||||
|
}
|
||||||
|
stdDev := math.Sqrt(variance / float64(len(sorted)))
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case avg > 20 || max > 40:
|
||||||
|
cl.Status = "high"
|
||||||
|
cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, max)
|
||||||
|
case stdDev > 12:
|
||||||
|
cl.Status = "unstable"
|
||||||
|
cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev)
|
||||||
|
default:
|
||||||
|
cl.Status = "ok"
|
||||||
|
}
|
||||||
|
return cl
|
||||||
|
}
|
||||||
|
|||||||
@@ -2,6 +2,29 @@ package platform
|
|||||||
|
|
||||||
import "time"
|
import "time"
|
||||||
|
|
||||||
|
// BenchmarkHostConfig holds static CPU and memory configuration captured at
// benchmark start. Useful for correlating results across runs on different hardware.
type BenchmarkHostConfig struct {
	// CPUModel is the "model name" field from /proc/cpuinfo (first entry seen).
	CPUModel string `json:"cpu_model,omitempty"`
	// CPUSockets is the number of distinct "physical id" values; defaults to 1
	// when the field is absent but logical CPUs were seen.
	CPUSockets int `json:"cpu_sockets,omitempty"`
	// CPUCores is the total physical core count summed across sockets.
	CPUCores int `json:"cpu_cores,omitempty"`
	// CPUThreads is the number of logical processors ("processor" entries).
	CPUThreads int `json:"cpu_threads,omitempty"`
	// MemTotalGiB is MemTotal from /proc/meminfo converted from kB to GiB.
	MemTotalGiB float64 `json:"mem_total_gib,omitempty"`
}
|
||||||
|
|
||||||
|
// BenchmarkCPULoad summarises host CPU utilisation sampled during the GPU
// steady-state phase. High or unstable CPU load during a GPU benchmark may
// indicate a competing workload or a CPU-bound driver bottleneck.
type BenchmarkCPULoad struct {
	// AvgPct is the mean utilisation percentage, rounded to one decimal.
	AvgPct float64 `json:"avg_pct"`
	// MaxPct is the highest single sample, rounded to one decimal.
	MaxPct float64 `json:"max_pct"`
	// P95Pct is the 95th-percentile sample, rounded to one decimal.
	P95Pct float64 `json:"p95_pct"`
	// Samples is the number of utilisation samples collected.
	Samples int `json:"samples"`
	// Status is "ok", "high", or "unstable".
	Status string `json:"status"`
	// Note is a human-readable explanation; set only for non-"ok" statuses.
	Note string `json:"note,omitempty"`
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
NvidiaBenchmarkProfileStandard = "standard"
|
NvidiaBenchmarkProfileStandard = "standard"
|
||||||
NvidiaBenchmarkProfileStability = "stability"
|
NvidiaBenchmarkProfileStability = "stability"
|
||||||
@@ -30,6 +53,8 @@ type NvidiaBenchmarkResult struct {
|
|||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
Warnings []string `json:"warnings,omitempty"`
|
Warnings []string `json:"warnings,omitempty"`
|
||||||
Normalization BenchmarkNormalization `json:"normalization"`
|
Normalization BenchmarkNormalization `json:"normalization"`
|
||||||
|
HostConfig *BenchmarkHostConfig `json:"host_config,omitempty"`
|
||||||
|
CPULoad *BenchmarkCPULoad `json:"cpu_load,omitempty"`
|
||||||
GPUs []BenchmarkGPUResult `json:"gpus"`
|
GPUs []BenchmarkGPUResult `json:"gpus"`
|
||||||
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
Interconnect *BenchmarkInterconnectResult `json:"interconnect,omitempty"`
|
||||||
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
ServerPower *BenchmarkServerPower `json:"server_power,omitempty"`
|
||||||
|
|||||||
Reference in New Issue
Block a user