package platform

import (
	"context"
	"encoding/csv"
	"encoding/json"
	"fmt"
	"math"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"
)

// benchmarkVersion is written into result.json so downstream consumers can
// detect format changes between benchmark revisions.
const benchmarkVersion = "1"

// benchmarkProfileSpec is the timing envelope of one benchmark profile:
// per-phase durations (seconds) for baseline, warmup, steady, NCCL and
// cooldown windows.
type benchmarkProfileSpec struct {
	Name        string
	BaselineSec int
	WarmupSec   int
	SteadySec   int
	NCCLSec     int
	CooldownSec int
}

// benchmarkGPUInfo is the per-GPU inventory captured from nvidia-smi
// (--query-gpu CSV output, optionally enriched from the verbose -q output).
type benchmarkGPUInfo struct {
	Index                int
	UUID                 string
	Name                 string
	BusID                string
	VBIOS                string
	PowerLimitW          float64
	DefaultPowerLimitW   float64
	MaxGraphicsClockMHz  float64
	MaxMemoryClockMHz    float64
	BaseGraphicsClockMHz float64
	MultiprocessorCount  int
}

// benchmarkBurnProfile accumulates per-precision state while parsing
// bee-gpu-burn log output (READY/SKIPPED/iterations lines).
type benchmarkBurnProfile struct {
	name       string
	category   string
	supported  bool
	lanes      int
	m          uint64
	n          uint64
	k          uint64
	iterations uint64
	notes      string
}

// benchmarkBurnParseResult is the structured result of parsing one
// bee-gpu-burn log (device identity, backend, duration and per-precision
// throughput profiles).
type benchmarkBurnParseResult struct {
	Device             string
	ComputeCapability  string
	Backend            string
	DurationSec        int
	Profiles           []BenchmarkPrecisionResult
	Fallback           bool
}

// benchmarkRestoreAction is a named cleanup step (e.g. resetting clock
// locks) executed in reverse order when the benchmark finishes.
type benchmarkRestoreAction struct {
	name string
	fn   func()
}

// Patterns matched against bee-gpu-burn output lines (after the "[gpu N] "
// prefix is stripped).
var (
	benchmarkReadyPattern      = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
	benchmarkSkippedPattern    = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
)

// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"} func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) { switch spec.Name { case NvidiaBenchmarkProfileStandard: basePhaseSec = 60 mixedPhaseSec = 300 case NvidiaBenchmarkProfileStability: basePhaseSec = 300 mixedPhaseSec = 3600 case NvidiaBenchmarkProfileOvernight: basePhaseSec = 3600 mixedPhaseSec = 14400 default: totalWeight := len(benchmarkPrecisionPhases) + 5 if totalWeight <= 0 { return nil, nil, 0, 0 } basePhaseSec = spec.SteadySec / totalWeight if basePhaseSec <= 0 { basePhaseSec = 1 } mixedPhaseSec = basePhaseSec * 5 } planLabels = make([]string, 0, len(benchmarkPrecisionPhases)+1) planPhases = make([]benchmarkPlannedPhase, 0, len(benchmarkPrecisionPhases)+1) for _, prec := range benchmarkPrecisionPhases { planLabels = append(planLabels, prec) planPhases = append(planPhases, benchmarkPlannedPhase{ PlanLabel: prec, MetricStage: metricStage(prec), DurationSec: basePhaseSec, }) } planLabels = append(planLabels, "mixed") planPhases = append(planPhases, benchmarkPlannedPhase{ PlanLabel: "mixed", MetricStage: metricStage("mixed"), DurationSec: mixedPhaseSec, }) return planLabels, planPhases, basePhaseSec, mixedPhaseSec } func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string { values := make([]string, 0, len(phases)) for _, phase := range phases { values = append(values, strconv.Itoa(phase.DurationSec)) } return strings.Join(values, ",") } func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { if ctx == nil { ctx = context.Background() } if logFunc == nil { logFunc = func(string) {} } if strings.TrimSpace(baseDir) == "" { baseDir = "/var/log/bee-benchmark" } spec := resolveBenchmarkProfile(opts.Profile) opts = 
normalizeNvidiaBenchmarkOptionsForBenchmark(opts) selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices) if err != nil { return "", err } if len(selected) == 0 { return "", fmt.Errorf("no NVIDIA GPUs selected") } ts := time.Now().UTC().Format("20060102-150405") runDir := filepath.Join(baseDir, "gpu-benchmark-"+ts) if err := os.MkdirAll(runDir, 0755); err != nil { return "", fmt.Errorf("mkdir %s: %w", runDir, err) } verboseLog := filepath.Join(runDir, "verbose.log") hostname, _ := os.Hostname() result := NvidiaBenchmarkResult{ BenchmarkVersion: benchmarkVersion, GeneratedAt: time.Now().UTC(), Hostname: hostname, ServerModel: readServerModel(), BenchmarkProfile: spec.Name, ParallelGPUs: opts.ParallelGPUs, RampStep: opts.RampStep, RampTotal: opts.RampTotal, RampRunID: opts.RampRunID, SelectedGPUIndices: append([]int(nil), selected...), HostConfig: readBenchmarkHostConfig(), Normalization: BenchmarkNormalization{ Status: "full", }, } logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected))) var metricRows []GPUMetricRow gpuBurnLog := filepath.Join(runDir, "gpu-burn.log") // Server power characterization state — populated during per-GPU phases. var serverIdleW, serverLoadedWSum float64 var serverIdleOK, serverLoadedOK bool var serverLoadedSamples int // Run nvidia-smi -q first: used both for the log file and as a fallback // source of max clock values when CSV clock fields are unsupported. 
var nvsmiQOut []byte if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil { nvsmiQOut = out _ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644) } infoByIndex, infoErr := queryBenchmarkGPUInfo(selected) if infoErr != nil { result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error()) result.Normalization.Status = "partial" } // Enrich with max clocks from verbose output — covers GPUs where // clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x). enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut) activeApps, err := queryActiveComputeApps(selected) if err == nil && len(activeApps) > 0 { result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark") result.Normalization.Notes = append(result.Normalization.Notes, activeApps...) result.Normalization.Status = "partial" } restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result) defer func() { for i := len(restoreActions) - 1; i >= 0; i-- { restoreActions[i].fn() } }() // Power calibration: run dcgmi targeted_power while sampling nvidia-smi power. // Returns per-GPU p95 power as an honest TDP reference for PowerSustainScore. calibPowerByIndex := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, logFunc) // Start background CPU load sampler — samples every 10s during GPU phases. 
cpuStopCh := make(chan struct{}) cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10) if opts.ParallelGPUs { runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, gpuBurnLog) } else { for _, idx := range selected { gpuResult := BenchmarkGPUResult{ Index: idx, Status: "FAILED", } if info, ok := infoByIndex[idx]; ok { gpuResult.UUID = info.UUID gpuResult.Name = info.Name gpuResult.BusID = info.BusID gpuResult.VBIOS = info.VBIOS gpuResult.PowerLimitW = info.PowerLimitW gpuResult.MultiprocessorCount = info.MultiprocessorCount gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz } if w, ok := calibPowerByIndex[idx]; ok && w > 0 { gpuResult.CalibratedPeakPowerW = w } if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz } baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx}) if err != nil && err != context.Canceled { gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error()) } gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows) appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx)) // Sample server idle power once (first GPU only — server state is global). 
if !serverIdleOK { if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok { serverIdleW = w serverIdleOK = true logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) } } warmupCmd := []string{ "bee-gpu-burn", "--seconds", strconv.Itoa(spec.WarmupSec), "--size-mb", strconv.Itoa(opts.SizeMB), "--devices", strconv.Itoa(idx), } logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec)) warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc) appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx)) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut) if warmupErr != nil { gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error()) result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) continue } // Run synthetic precision phases and the combined steady phase as one // uninterrupted command so the GPU stays hot between windows. 
eccBase, _ := queryECCCounters(idx) planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string { if label == "mixed" { return fmt.Sprintf("gpu-%d-steady", idx) } return fmt.Sprintf("gpu-%d-steady-%s", idx, label) }) planCmd := []string{ "bee-gpu-burn", "--seconds", strconv.Itoa(basePhaseSec), "--size-mb", strconv.Itoa(opts.SizeMB), "--devices", strconv.Itoa(idx), "--precision-plan", strings.Join(planLabels, ","), "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), } logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec)) _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc) for _, phaseSpec := range planPhases { if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage) } appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel]) } for _, prec := range benchmarkPrecisionPhases { stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec) phaseRows := phaseRowsByStage[stageName] if len(phaseRows) == 0 { continue } phase := BenchmarkPrecisionSteadyPhase{ Precision: prec, Steady: summarizeBenchmarkTelemetry(phaseRows), } for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles { if p.Supported { phase.TeraOpsPerSec += p.TeraOpsPerSec phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec } } gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase) } beforeThrottle, _ := queryThrottleCounters(idx) logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec)) // Sample server power via IPMI in parallel with the steady phase. // We collect readings every 5s and average them. 
ipmiStopCh := make(chan struct{}) ipmiResultCh := make(chan float64, 1) go func() { defer close(ipmiResultCh) var samples []float64 ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() // First sample after a short warmup delay. select { case <-ipmiStopCh: return case <-time.After(15 * time.Second): } for { if w, err := queryIPMIServerPowerW(); err == nil { samples = append(samples, w) } select { case <-ipmiStopCh: if len(samples) > 0 { var sum float64 for _, w := range samples { sum += w } ipmiResultCh <- sum / float64(len(samples)) } return case <-ticker.C: } } }() close(ipmiStopCh) if loadedW, ok := <-ipmiResultCh; ok { serverLoadedWSum += loadedW serverLoadedSamples++ serverLoadedOK = true logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW)) } afterThrottle, _ := queryThrottleCounters(idx) if planErr != nil { gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error()) } steadyRows := phaseRowsByStage[fmt.Sprintf("gpu-%d-steady", idx)] parseResult := parseBenchmarkBurnLog(string(phaseLogs["mixed"])) gpuResult.ComputeCapability = parseResult.ComputeCapability gpuResult.Backend = parseResult.Backend gpuResult.PrecisionResults = parseResult.Profiles if parseResult.Fallback { gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable") } gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows) gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle) if eccFinal, err := queryECCCounters(idx); err == nil { gpuResult.ECC = diffECCCounters(eccBase, eccFinal) } if spec.CooldownSec > 0 { cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx}) if err != nil && err != context.Canceled { gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error()) } gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows) appendBenchmarkMetrics(&metricRows, cooldownRows, 
fmt.Sprintf("gpu-%d-cooldown", idx)) } gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) if planErr != nil { gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) } else if parseResult.Fallback { gpuResult.Status = "PARTIAL" } else { gpuResult.Status = "OK" } result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) } } // end sequential path if len(selected) > 1 && opts.RunNCCL { result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc) if result.Interconnect != nil && result.Interconnect.Supported { for i := range result.GPUs { result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores) } } } // Stop CPU load sampler and attach results. close(cpuStopCh) if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 { result.CPULoad = summarizeCPULoad(cpuSamples) if result.CPULoad != nil && result.CPULoad.Status != "ok" { logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s", result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status)) } } // Compute server power characterization from accumulated IPMI samples. var gpuReportedSumW float64 for _, gpu := range result.GPUs { gpuReportedSumW += gpu.Steady.AvgPowerW } var serverLoadedW float64 if serverLoadedSamples > 0 { serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples) } result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK) result.Cooling = summarizeBenchmarkCooling(metricRows) // Apply server-power penalty when IPMI reports the server delta is much // lower than GPU-reported sum: GPU power telemetry is over-stated, making // CalibratedPeakPowerW and PowerSustainScore unreliable. 
// Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0. if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 { factor := sp.ReportingRatio / 0.75 for i := range result.GPUs { result.GPUs[i].Scores.CompositeScore *= factor result.GPUs[i].Notes = append(result.GPUs[i].Notes, fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%", sp.ReportingRatio, factor*100)) } } result.Findings = buildBenchmarkFindings(result) result.OverallStatus = benchmarkOverallStatus(result) writeBenchmarkMetricsFiles(runDir, metricRows) resultJSON, err := json.MarshalIndent(result, "", " ") if err != nil { return "", fmt.Errorf("marshal benchmark result: %w", err) } if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil { return "", fmt.Errorf("write result.json: %w", err) } report := renderBenchmarkReportWithCharts(result) if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil { return "", fmt.Errorf("write report.md: %w", err) } summary := renderBenchmarkSummary(result) if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil { return "", fmt.Errorf("write summary.txt: %w", err) } return runDir, nil } func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions { switch strings.TrimSpace(strings.ToLower(opts.Profile)) { case NvidiaBenchmarkProfileStability: opts.Profile = NvidiaBenchmarkProfileStability case NvidiaBenchmarkProfileOvernight: opts.Profile = NvidiaBenchmarkProfileOvernight default: opts.Profile = NvidiaBenchmarkProfileStandard } if opts.SizeMB < 0 { opts.SizeMB = 0 } opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices) opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices) return opts } func resolveBenchmarkProfile(profile string) benchmarkProfileSpec { switch 
strings.TrimSpace(strings.ToLower(profile)) { case NvidiaBenchmarkProfileStability: return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0} case NvidiaBenchmarkProfileOvernight: return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0} default: return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0} } } // benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try. // Fields are tried in order; the first successful query wins. Extended fields // (attribute.multiprocessor_count, power.default_limit) are not supported on // all driver versions, so we fall back to the base set if the full query fails. // The minimal fallback omits clock fields entirely — clocks.max.* returns // exit status 2 on some GPU generations (e.g. Blackwell); max clocks are // then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks. var benchmarkGPUInfoQueries = []struct { fields string extended bool // whether this query includes optional extended fields minimal bool // clock fields omitted; max clocks must be filled separately }{ { fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit", extended: true, }, { fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics", extended: false, }, { fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit", minimal: true, }, } // enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for // any GPU in infoByIndex where those values are still zero. It parses the // "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ). 
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields // return exit status 2 but the verbose query works fine. func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) { if len(infoByIndex) == 0 || len(nvsmiQ) == 0 { return } // Build bus_id → index map for matching verbose sections to GPU indices. busToBenchIdx := make(map[string]int, len(infoByIndex)) for idx, info := range infoByIndex { if info.BusID != "" { // nvidia-smi -q uses "GPU 00000000:4E:00.0" (8-digit domain), // while --query-gpu returns the same format; normalise to lower. busToBenchIdx[strings.ToLower(strings.TrimSpace(info.BusID))] = idx } } // Split the verbose output into per-GPU sections on "^GPU " lines. gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`) maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`) maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`) defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`) currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`) smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`) sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1) for i, loc := range sectionStarts { busID := strings.ToLower(string(nvsmiQ[loc[2]:loc[3]])) benchIdx, ok := busToBenchIdx[busID] if !ok { // Bus IDs from verbose output may have a different domain prefix; // try suffix match on the slot portion (XX:XX.X). 
for k, v := range busToBenchIdx { if strings.HasSuffix(k, busID) || strings.HasSuffix(busID, k) { benchIdx = v ok = true break } } } if !ok { continue } end := len(nvsmiQ) if i+1 < len(sectionStarts) { end = sectionStarts[i+1][0] } section := nvsmiQ[loc[0]:end] info := infoByIndex[benchIdx] if info.MaxGraphicsClockMHz == 0 { if m := maxGfxRe.FindSubmatch(section); m != nil { if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil { info.MaxGraphicsClockMHz = v } } } if info.MaxMemoryClockMHz == 0 { if m := maxMemRe.FindSubmatch(section); m != nil { if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil { info.MaxMemoryClockMHz = v } } } if info.DefaultPowerLimitW == 0 { if m := defaultPwrRe.FindSubmatch(section); m != nil { if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 { info.DefaultPowerLimitW = v } } } if info.PowerLimitW == 0 { if m := currentPwrRe.FindSubmatch(section); m != nil { if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 { info.PowerLimitW = v } } } if info.MultiprocessorCount == 0 { if m := smCountRe.FindSubmatch(section); m != nil { if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 { info.MultiprocessorCount = v } } } infoByIndex[benchIdx] = info } } func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { var lastErr error for _, q := range benchmarkGPUInfoQueries { args := []string{ "--query-gpu=" + q.fields, "--format=csv,noheader,nounits", } if len(gpuIndices) > 0 { args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...) 
} out, err := satExecCommand("nvidia-smi", args...).Output() if err != nil { lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err) continue } r := csv.NewReader(strings.NewReader(string(out))) r.TrimLeadingSpace = true r.FieldsPerRecord = -1 rows, err := r.ReadAll() if err != nil { lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err) continue } minFields := 6 if !q.minimal { minFields = 9 } infoByIndex := make(map[int]benchmarkGPUInfo, len(rows)) for _, row := range rows { if len(row) < minFields { continue } idx, err := strconv.Atoi(strings.TrimSpace(row[0])) if err != nil { continue } info := benchmarkGPUInfo{ Index: idx, UUID: strings.TrimSpace(row[1]), Name: strings.TrimSpace(row[2]), BusID: strings.TrimSpace(row[3]), VBIOS: strings.TrimSpace(row[4]), PowerLimitW: parseBenchmarkFloat(row[5]), } if !q.minimal { info.MaxGraphicsClockMHz = parseBenchmarkFloat(row[6]) info.MaxMemoryClockMHz = parseBenchmarkFloat(row[7]) if len(row) >= 9 { info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8]) } if q.extended { if len(row) >= 10 { info.MultiprocessorCount = int(parseBenchmarkFloat(row[9])) } if len(row) >= 11 { info.DefaultPowerLimitW = parseBenchmarkFloat(row[10]) } } } infoByIndex[idx] = info } return infoByIndex, nil } return nil, lastErr } func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction { if os.Geteuid() != 0 { result.Normalization.Status = "partial" result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks") for _, idx := range gpuIndices { result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{ Index: idx, Notes: []string{"normalization skipped: root privileges are required"}, }) } return nil } var restore []benchmarkRestoreAction for _, idx := 
range gpuIndices { rec := BenchmarkNormalizationGPU{Index: idx} if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil { rec.PersistenceMode = "failed" rec.Notes = append(rec.Notes, "failed to enable persistence mode") result.Normalization.Status = "partial" } else { rec.PersistenceMode = "applied" } if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 { target := int(math.Round(info.MaxGraphicsClockMHz)) if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil { rec.GPUClockLockStatus = "failed" rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out))) result.Normalization.Status = "partial" } else { rec.GPUClockLockStatus = "applied" rec.GPUClockLockMHz = float64(target) idxCopy := idx restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() { _, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil) }}) } } else { rec.GPUClockLockStatus = "skipped" rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0") result.Normalization.Status = "partial" } if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 { target := int(math.Round(info.MaxMemoryClockMHz)) out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil) switch { case err == nil: rec.MemoryClockLockStatus = "applied" rec.MemoryClockLockMHz = float64(target) idxCopy := idx restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() { 
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil) }}) case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"): rec.MemoryClockLockStatus = "unsupported" rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path") result.Normalization.Status = "partial" default: rec.MemoryClockLockStatus = "failed" rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out))) result.Normalization.Status = "partial" } } result.Normalization.GPUs = append(result.Normalization.GPUs, rec) } return restore } func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) { if durationSec <= 0 { return nil, nil } deadline := time.Now().Add(time.Duration(durationSec) * time.Second) var rows []GPUMetricRow start := time.Now() for { if ctx.Err() != nil { return rows, ctx.Err() } samples, err := sampleBenchmarkTelemetry(gpuIndices) if err == nil { elapsed := time.Since(start).Seconds() for i := range samples { samples[i].ElapsedSec = elapsed } rows = append(rows, samples...) 
} if time.Now().After(deadline) { break } select { case <-ctx.Done(): return rows, ctx.Err() case <-time.After(time.Second): } } return rows, nil } func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, logFunc func(string)) ([]byte, []GPUMetricRow, error) { stopCh := make(chan struct{}) doneCh := make(chan struct{}) var metricRows []GPUMetricRow start := time.Now() go func() { defer close(doneCh) ticker := time.NewTicker(time.Second) defer ticker.Stop() for { select { case <-stopCh: return case <-ticker.C: samples, err := sampleBenchmarkTelemetry(gpuIndices) if err != nil { continue } elapsed := time.Since(start).Seconds() for i := range samples { samples[i].ElapsedSec = elapsed } metricRows = append(metricRows, samples...) } } }() out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc) close(stopCh) <-doneCh return out, metricRows, err } type benchmarkPlannedPhase struct { PlanLabel string MetricStage string DurationSec int } func runBenchmarkPlannedCommandWithMetrics( ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, phases []benchmarkPlannedPhase, logFunc func(string), ) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) { out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc) return out, splitBenchmarkRowsByPlannedPhase(rows, phases), splitBenchmarkLogByPlannedPhase(out), err } func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow { out := make(map[string][]GPUMetricRow, len(phases)) if len(rows) == 0 || len(phases) == 0 { return out } for _, row := range rows { idx := len(phases) - 1 var elapsed float64 for i, phase := range phases { durationSec := phase.DurationSec if durationSec <= 0 { durationSec = 1 } elapsed += float64(durationSec) if row.ElapsedSec < elapsed { idx = i break } } out[phases[idx].MetricStage] 
= append(out[phases[idx].MetricStage], row) } return out } func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte { out := make(map[string][]byte) var current string for _, line := range strings.Split(strings.ReplaceAll(string(raw), "\r\n", "\n"), "\n") { trimmed := strings.TrimSpace(stripBenchmarkPrefix(line)) switch { case strings.HasPrefix(trimmed, "phase_begin="): current = strings.TrimSpace(strings.TrimPrefix(trimmed, "phase_begin=")) case strings.HasPrefix(trimmed, "phase_end="): current = "" case current != "": out[current] = append(out[current], []byte(line+"\n")...) } } return out } type benchmarkCoolingSample struct { AvgFanRPM float64 AvgFanDutyCyclePct float64 FanDutyCycleAvailable bool } func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) { samples, err := sampleGPUMetrics(gpuIndices) if err != nil { return nil, err } fanSample := sampleBenchmarkCoolingSample() for i := range samples { samples[i].FanAvgRPM = fanSample.AvgFanRPM samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable } return samples, nil } func sampleBenchmarkCoolingSample() benchmarkCoolingSample { fans, _ := sampleFanSpeeds() avgRPM, _, _ := fanRPMStats(fans) dutyPct, dutyAvailable := sampleFanDutyCyclePct() return benchmarkCoolingSample{ AvgFanRPM: avgRPM, AvgFanDutyCyclePct: dutyPct, FanDutyCycleAvailable: dutyAvailable, } } func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset float64) []GPUMetricRow { if len(rows) == 0 { return nil } out := make([]GPUMetricRow, len(rows)) for i, row := range rows { row.Stage = stage row.ElapsedSec += offset out[i] = row } return out } func benchmarkMetricOffset(rows []GPUMetricRow) float64 { if len(rows) == 0 { return 0 } var maxElapsed float64 for _, row := range rows { if row.ElapsedSec > maxElapsed { maxElapsed = row.ElapsedSec } } return maxElapsed } func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, 
stage string) { annotated := annotateBenchmarkMetricRows(rows, stage, benchmarkMetricOffset(*allRows)) *allRows = append(*allRows, annotated...) } func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) { if len(rows) == 0 { return } _ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), rows) _ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), rows) } func appendBenchmarkStageLog(path, source, stage string, raw []byte) { if path == "" || len(raw) == 0 { return } f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) if err != nil { return } defer f.Close() header := fmt.Sprintf("\n========== %s | stage=%s ==========\n", source, stage) _, _ = f.WriteString(header) if len(raw) > 0 { _, _ = f.Write(raw) if raw[len(raw)-1] != '\n' { _, _ = f.WriteString("\n") } } } func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult { result := benchmarkBurnParseResult{} lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") profiles := make(map[string]*benchmarkBurnProfile) for _, line := range lines { line = stripBenchmarkPrefix(strings.TrimSpace(line)) if line == "" { continue } switch { case strings.HasPrefix(line, "device="): result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device=")) case strings.HasPrefix(line, "compute_capability="): result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability=")) case strings.HasPrefix(line, "backend="): result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend=")) result.Fallback = result.Backend == "driver-ptx" case strings.HasPrefix(line, "duration_s="): result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s="))) default: if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 { profile := ensureBenchmarkProfile(profiles, m[1]) profile.supported = true profile.lanes++ profile.m, _ = strconv.ParseUint(m[3], 10, 64) profile.n, _ = strconv.ParseUint(m[4], 10, 64) 
				profile.k, _ = strconv.ParseUint(m[5], 10, 64)
				continue
			}
			// SKIPPED line: precision is unsupported on this GPU; keep the reason.
			if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				profile.supported = false
				profile.notes = strings.TrimSpace(m[2])
				continue
			}
			// Iteration-count line: sum iterations across all lanes of a kernel.
			if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				iters, _ := strconv.ParseUint(m[2], 10, 64)
				profile.iterations += iters
			}
		}
	}
	// Emit profiles in deterministic (sorted-by-name) order.
	keys := make([]string, 0, len(profiles))
	for key := range profiles {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		profile := profiles[key]
		precision := BenchmarkPrecisionResult{
			Name:       profile.name,
			Category:   profile.category,
			Supported:  profile.supported,
			Lanes:      profile.lanes,
			M:          profile.m,
			N:          profile.n,
			K:          profile.k,
			Iterations: profile.iterations,
			Notes:      profile.notes,
		}
		w := precisionWeight(profile.category)
		precision.Weight = w
		// TOPS = 2*M*N*K ops per GEMM × iterations / wall time. Guard every
		// operand so a partial log cannot produce a bogus or infinite rate.
		if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 {
			precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12
			precision.WeightedTeraOpsPerSec = precision.TeraOpsPerSec * w
		}
		result.Profiles = append(result.Profiles, precision)
	}
	return result
}

// ensureBenchmarkProfile returns the profile registered under name, creating
// it on first sight. The precision category is derived from the kernel-name
// prefix; note "fp32" must be checked before "fp8"/"fp4" would never collide,
// but "fp16"/"fp64" ordering here is prefix-disjoint so the switch is safe.
// New profiles default to supported=true until a SKIPPED line says otherwise.
func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile {
	if profile, ok := profiles[name]; ok {
		return profile
	}
	category := "other"
	switch {
	case strings.HasPrefix(name, "fp64"):
		category = "fp64"
	case strings.HasPrefix(name, "fp32"):
		category = "fp32_tf32"
	case strings.HasPrefix(name, "fp16"):
		category = "fp16_bf16"
	case strings.HasPrefix(name, "int8"):
		category = "int8"
	case strings.HasPrefix(name, "fp8"):
		category = "fp8"
	case strings.HasPrefix(name, "fp4"):
		category = "fp4"
	}
	profile := &benchmarkBurnProfile{name: name, category: category, supported: true}
	profiles[name] = profile
	return profile
}

// precisionWeight returns the fp32-equivalence factor for a precision category.
// Each factor represents how much "real" numeric work one operation of that
// type performs relative to fp32 (single precision = 1.0 baseline):
//
//	fp64 = 2.0   — double precision, 2× more bits per operand
//	fp32 = 1.0   — single precision baseline
//	fp16 = 0.5   — half precision
//	int8 = 0.25  — quarter precision
//	fp8  = 0.25  — quarter precision
//	fp4  = 0.125 — eighth precision
//
// Multiplying raw TOPS by the weight gives fp32-equivalent TOPS, enabling
// cross-precision comparison on the same numeric scale. Unknown categories
// default to the fp32 weight of 1.0.
func precisionWeight(category string) float64 {
	switch category {
	case "fp64":
		return 2.0
	case "fp32_tf32":
		return 1.0
	case "fp16_bf16":
		return 0.5
	case "int8":
		return 0.25
	case "fp8":
		return 0.25
	case "fp4":
		return 0.125
	default:
		return 1.0
	}
}

// stripBenchmarkPrefix removes a leading "[gpu N] " tag from a multi-GPU log
// line. Lines without the tag are returned unchanged, so the function is a
// no-op on already-stripped input.
func stripBenchmarkPrefix(line string) string {
	if strings.HasPrefix(line, "[gpu ") {
		if idx := strings.Index(line, "] "); idx >= 0 {
			return line[idx+2:]
		}
	}
	return line
}

// summarizeBenchmarkTelemetry reduces a series of metric rows to averages,
// 95th percentiles, coefficients of variation, and clock drift. DurationSec
// is taken from the last row's elapsed time (rows are assumed to be in
// chronological order — TODO confirm against the sampler).
func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary {
	summary := BenchmarkTelemetrySummary{}
	if len(rows) == 0 {
		return summary
	}
	temps := make([]float64, 0, len(rows))
	powers := make([]float64, 0, len(rows))
	clocks := make([]float64, 0, len(rows))
	memClocks := make([]float64, 0, len(rows))
	usages := make([]float64, 0, len(rows))
	memUsages := make([]float64, 0, len(rows))
	summary.DurationSec = rows[len(rows)-1].ElapsedSec
	summary.Samples = len(rows)
	for _, row := range rows {
		temps = append(temps, row.TempC)
		powers = append(powers, row.PowerW)
		clocks = append(clocks, row.ClockMHz)
		memClocks = append(memClocks, row.MemClockMHz)
		usages = append(usages, row.UsagePct)
		memUsages = append(memUsages, row.MemUsagePct)
	}
	summary.AvgTempC = benchmarkMean(temps)
	summary.P95TempC = benchmarkPercentile(temps, 95)
	summary.AvgPowerW = benchmarkMean(powers)
	summary.P95PowerW = benchmarkPercentile(powers, 95)
	summary.AvgGraphicsClockMHz =
		benchmarkMean(clocks)
	summary.P95GraphicsClockMHz = benchmarkPercentile(clocks, 95)
	summary.AvgMemoryClockMHz = benchmarkMean(memClocks)
	summary.P95MemoryClockMHz = benchmarkPercentile(memClocks, 95)
	summary.AvgUsagePct = benchmarkMean(usages)
	summary.AvgMemUsagePct = benchmarkMean(memUsages)
	summary.ClockCVPct = benchmarkCV(clocks)
	summary.PowerCVPct = benchmarkCV(powers)
	summary.TempCVPct = benchmarkCV(temps)
	summary.ClockDriftPct = benchmarkClockDrift(clocks)
	return summary
}

// summarizeBenchmarkCooling aggregates fan telemetry from metric rows.
// Returns nil when no fan data at all was observed. Rows with zero RPM are
// excluded from the RPM average; duty-cycle stats are only produced when at
// least one row reported an available duty-cycle sensor.
func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary {
	if len(rows) == 0 {
		return nil
	}
	var rpmValues []float64
	var dutyValues []float64
	for _, row := range rows {
		if row.FanAvgRPM > 0 {
			rpmValues = append(rpmValues, row.FanAvgRPM)
		}
		if row.FanDutyCycleAvailable {
			dutyValues = append(dutyValues, row.FanDutyCyclePct)
		}
	}
	if len(rpmValues) == 0 && len(dutyValues) == 0 {
		return nil
	}
	summary := &BenchmarkCoolingSummary{
		Available: true,
		AvgFanRPM: benchmarkMean(rpmValues),
	}
	if len(dutyValues) > 0 {
		summary.FanDutyCycleAvailable = true
		summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues)
		summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95)
	} else {
		summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected")
	}
	return summary
}

// scoreBenchmarkGPUResult derives the full scorecard for one GPU from its
// precision throughput, throttle counters, telemetry variance, and power
// calibration data.
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
	score := BenchmarkScorecard{}
	// SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases.
	// Each precision ran alone with full GPU dedicated — peak capability.
	for _, p := range gpu.PrecisionSteady {
		score.SyntheticScore += p.WeightedTeraOpsPerSec
	}
	// MixedScore: sum of fp32-equivalent TOPS from the combined phase.
	// All precisions compete simultaneously — closer to real inference workloads.
	for _, p := range gpu.PrecisionResults {
		if p.Supported {
			score.MixedScore += p.WeightedTeraOpsPerSec
		}
	}
	// MixedEfficiency = MixedScore / SyntheticScore.
	// Measures how well the GPU sustains throughput under concurrent mixed load.
	// A healthy GPU scores ~0.8–0.95; severe degradation suggests bandwidth
	// contention or scheduler inefficiency.
	if score.SyntheticScore > 0 && score.MixedScore > 0 {
		score.MixedEfficiency = score.MixedScore / score.SyntheticScore
	}
	// ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3).
	// SyntheticScore is the primary signal; MixedEfficiency adds up to +30%
	// bonus for GPUs that handle mixed-precision concurrency well.
	// Falls back to MixedScore alone when per-precision data is absent.
	switch {
	case score.SyntheticScore > 0:
		score.ComputeScore = score.SyntheticScore * (1 + score.MixedEfficiency*0.3)
	case score.MixedScore > 0:
		score.ComputeScore = score.MixedScore
	}
	// PowerSustainScore: measures how close the GPU came to its rated TDP under
	// a full-spectrum load (dcgmi targeted_power). 100 = exactly at rated TDP.
	// Penalty applied symmetrically for both under- and over-TDP deviations:
	//   score = max(0, 100 − |measured − rated| / rated × 100)
	// Under-TDP → power delivery / cooling issue.
	// Over-TDP → power limit not properly enforced / power regulation fault.
	// Falls back to 0 if calibration was not performed (dcgmi unavailable).
	{
		ref := gpu.DefaultPowerLimitW
		if ref <= 0 {
			ref = gpu.PowerLimitW
		}
		if gpu.CalibratedPeakPowerW > 0 && ref > 0 {
			deviationPct := math.Abs(gpu.CalibratedPeakPowerW-ref) / ref * 100
			score.PowerSustainScore = clampScore(100 - deviationPct)
		}
	}
	// ThermalSustainScore: fraction of the steady window NOT spent in
	// HW/SW thermal slowdown (counters are microseconds).
	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
	score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
	// StabilityScore: prefer per-precision steady phases where each window runs a
	// single kernel type so PowerCVPct is a genuine stability signal (not a
	// workload-mix artifact). Fall back to combined steady using clock-only metrics
	// when per-precision data is absent (older results, short profiles).
	if len(gpu.PrecisionSteady) > 0 {
		var sum float64
		for _, p := range gpu.PrecisionSteady {
			sum += clampScore(100 - (p.Steady.ClockCVPct*4 + p.Steady.PowerCVPct*2 + p.Steady.ClockDriftPct*2))
		}
		score.StabilityScore = sum / float64(len(gpu.PrecisionSteady))
	} else {
		score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.ClockDriftPct*2))
	}
	score.CompositeScore = compositeBenchmarkScore(score)
	// TOPSPerSMPerGHz: architecture-normalized throughput for cross-model
	// comparison (compute score per SM per GHz of average graphics clock).
	if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 {
		score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0)
	}
	return score
}

// compositeBenchmarkScore combines the sub-scores into a single number by
// scaling ComputeScore with a quality multiplier in [0.35, 1.10].
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
	// Weights after introducing calibrated power reference:
	//   base      0.35 — floor so a GPU that fails all sustain checks still scores
	//   thermal   0.25 — heaviest: throttle counters are the most reliable signal
	//   stability 0.25 — clock/power variance matters for reproducibility
	//   power     0.15 — GPU reaches rated TDP under targeted_power? lower weight
	//                    because calibration may be absent (dcgmi not installed)
	//   NCCL bonus 0.10 — interconnect health
	//   cap 1.10
	quality := 0.35 + 0.15*(score.PowerSustainScore/100.0) + 0.25*(score.ThermalSustainScore/100.0) + 0.25*(score.StabilityScore/100.0)
	if score.InterconnectScore > 0 {
		quality += 0.10
	}
	if quality > 1.10 {
		quality = 1.10
	}
	return score.ComputeScore * quality
}

// detectBenchmarkDegradationReasons returns machine-readable reason tags for
// any degradation observed on a GPU. Thresholds: ≥5% of the steady window
// power-capped, ≥1% thermal- or sync-boost-limited, average SM clock below
// 90% of the requested lock, stability score below 85, partial environment
// normalization, or any ECC errors.
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
	var reasons []string
	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
	if float64(gpu.Throttle.SWPowerCapUS)/runtimeUS >= 0.05 {
		reasons = append(reasons, "power_capped")
	}
	if float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS >= 0.01 {
		reasons = append(reasons, "thermal_limited")
	}
	if float64(gpu.Throttle.SyncBoostUS)/runtimeUS >= 0.01 {
		reasons = append(reasons, "sync_boost_limited")
	}
	if gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90 {
		reasons = append(reasons, "low_sm_clock_vs_target")
	}
	if gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85 {
		reasons = append(reasons, "variance_too_high")
	}
	if normalizationStatus != "full" {
		reasons = append(reasons, "normalization_partial")
	}
	if gpu.ECC.Uncorrected > 0 {
		reasons = append(reasons, "ecc_uncorrected_errors")
	}
	if gpu.ECC.Corrected > 0 {
		reasons = append(reasons, "ecc_corrected_errors")
	}
	return dedupeStrings(reasons)
}

// runBenchmarkInterconnect runs the NCCL all_reduce_perf test across the
// selected GPUs and parses bandwidth numbers from its output. Status starts
// as UNSUPPORTED and is upgraded to OK only on success; on failure the
// command output is recorded in Notes.
func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult {
	result := &BenchmarkInterconnectResult{
		Status:             "UNSUPPORTED",
		Attempted:          true,
		SelectedGPUIndices: append([]int(nil), gpuIndices...),
	}
	// 512M→4G messages, doubling each step; iteration count scales with the
	// profile's NCCL budget but never drops below 20.
	cmd := []string{
		"all_reduce_perf",
		"-b", "512M",
		"-e", "4G",
		"-f", "2",
		"-g", strconv.Itoa(len(gpuIndices)),
		"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
	}
	// PCI_BUS_ID ordering keeps CUDA indices aligned with nvidia-smi indices.
	env := []string{
		"CUDA_DEVICE_ORDER=PCI_BUS_ID",
		"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
	}
	logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
	out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
	// Persist raw output regardless of outcome; write errors are best-effort.
	_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
	if err != nil {
		result.Notes = append(result.Notes, strings.TrimSpace(string(out)))
		return result
	}
	avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out))
	result.Status = "OK"
	result.Supported = true
	result.AvgAlgBWGBps = avgAlg
	result.MaxAlgBWGBps = maxAlg
	result.AvgBusBWGBps = avgBus
	result.MaxBusBWGBps = maxBus
	return result
}

// parseNCCLAllReduceOutput extracts algorithm and bus bandwidth figures from
// all_reduce_perf's table output. For each data row it scans for the first
// triplet of consecutive numeric fields whose first value (time) is positive,
// and takes the following two fields as algbw/busbw. Returns zeros when no
// row parses.
func parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) {
	lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
	var algs []float64
	var buses []float64
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		fields := strings.Fields(line)
		// Data rows carry at least size/count/type/redop/root/time/algbw/busbw.
		if len(fields) < 8 {
			continue
		}
		for i := 0; i+2 < len(fields); i++ {
			timeVal, err1 := strconv.ParseFloat(fields[i], 64)
			algVal, err2 := strconv.ParseFloat(fields[i+1], 64)
			busVal, err3 := strconv.ParseFloat(fields[i+2], 64)
			if err1 == nil && err2 == nil && err3 == nil && timeVal > 0 {
				algs = append(algs, algVal)
				buses = append(buses, busVal)
				break
			}
		}
	}
	if len(algs) == 0 {
		return 0, 0, 0, 0
	}
	return benchmarkMean(algs), benchmarkMax(algs), benchmarkMean(buses), benchmarkMax(buses)
}

// queryThrottleCounters reads the cumulative clock-event (throttle) counters
// for one GPU via nvidia-smi. Values are cumulative since driver load; use
// diffThrottleCounters to get per-window deltas.
func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) {
	out, err := satExecCommand(
		"nvidia-smi",
		"--id="+strconv.Itoa(gpuIndex),
		"--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return BenchmarkThrottleCounters{}, err
	}
	fields := strings.Split(strings.TrimSpace(string(out)), ",")
	if len(fields) < 5 {
		return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out)))
	}
	return BenchmarkThrottleCounters{
		SWPowerCapUS:           parseBenchmarkUint64(fields[0]),
		SWThermalSlowdownUS:    parseBenchmarkUint64(fields[1]),
		SyncBoostUS:            parseBenchmarkUint64(fields[2]),
		HWThermalSlowdownUS:    parseBenchmarkUint64(fields[3]),
		HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]),
	}, nil
}

// diffThrottleCounters returns after−before per counter, clamped at zero so a
// counter reset (e.g. driver reload) cannot produce a huge bogus delta.
func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters {
	return BenchmarkThrottleCounters{
		SWPowerCapUS:           saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS),
		SWThermalSlowdownUS:    saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS),
		SyncBoostUS:            saturatingSub(after.SyncBoostUS, before.SyncBoostUS),
		HWThermalSlowdownUS:    saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS),
		HWPowerBrakeSlowdownUS: saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS),
	}
}

// queryECCCounters reads volatile ECC error totals for one GPU. When the
// fields are present but non-numeric (ECC disabled), zero counters and a nil
// error are returned deliberately.
func queryECCCounters(gpuIndex int) (BenchmarkECCCounters, error) {
	out, err := satExecCommand(
		"nvidia-smi",
		"--id="+strconv.Itoa(gpuIndex),
		"--query-gpu=ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return BenchmarkECCCounters{}, err
	}
	fields := strings.Split(strings.TrimSpace(string(out)), ",")
	if len(fields) < 2 {
		return BenchmarkECCCounters{}, fmt.Errorf("unexpected ECC counter columns: %q", strings.TrimSpace(string(out)))
	}
	corrected, err1 := strconv.ParseUint(strings.TrimSpace(fields[0]), 10, 64)
	uncorrected, err2 := strconv.ParseUint(strings.TrimSpace(fields[1]), 10, 64)
	if err1 != nil || err2 != nil {
		// ECC may be disabled on this GPU — return zero counters silently.
		return BenchmarkECCCounters{}, nil
	}
	return BenchmarkECCCounters{Corrected: corrected, Uncorrected: uncorrected}, nil
}

// diffECCCounters returns after−before ECC counters, clamped at zero.
func diffECCCounters(before, after BenchmarkECCCounters) BenchmarkECCCounters {
	return BenchmarkECCCounters{
		Corrected:   saturatingSub(after.Corrected, before.Corrected),
		Uncorrected: saturatingSub(after.Uncorrected, before.Uncorrected),
	}
}

// queryActiveComputeApps lists compute processes currently running on the
// given GPUs (all GPUs when the slice is empty), one CSV line per process:
// gpu_uuid,pid,process_name. Blank lines are dropped.
func queryActiveComputeApps(gpuIndices []int) ([]string, error) {
	args := []string{
		"--query-compute-apps=gpu_uuid,pid,process_name",
		"--format=csv,noheader,nounits",
	}
	if len(gpuIndices) > 0 {
		args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
	}
	out, err := satExecCommand("nvidia-smi", args...).Output()
	if err != nil {
		return nil, err
	}
	var lines []string
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		lines = append(lines, line)
	}
	return lines, nil
}

// finalizeBenchmarkGPUResult fills defaults before reporting: an empty status
// becomes "OK" and a missing composite score is recomputed from the scorecard.
func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
	if gpu.Status == "" {
		gpu.Status = "OK"
	}
	if gpu.Scores.CompositeScore == 0 {
		gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores)
	}
	return gpu
}

// buildBenchmarkFindings converts a benchmark result into human-readable
// finding strings: pass/fail tally, per-GPU degradation explanations, power
// anomalies, interconnect bandwidth, and host-side caveats. The returned list
// is deduplicated.
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
	var findings []string
	passed := 0
	for _, gpu := range result.GPUs {
		if gpu.Status == "OK" {
			passed++
		}
	}
	total := len(result.GPUs)
	if total > 0 {
		if passed == total {
			findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total))
		} else {
			findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total))
		}
	}
	if result.Normalization.Status != "full" {
		findings = append(findings, "Environment normalization was partial; compare results with caution.")
	}
	for _, gpu := range result.GPUs {
		if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 {
			findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index))
			continue
		}
		if len(gpu.DegradationReasons) == 0 &&
			gpu.Status == "OK" {
			findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
			continue
		}
		// Translate each machine-readable reason tag into a sentence.
		for _, reason := range gpu.DegradationReasons {
			switch reason {
			case "power_capped":
				findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
			case "thermal_limited":
				findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
			case "sync_boost_limited":
				findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
			case "low_sm_clock_vs_target":
				findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index))
			case "variance_too_high":
				findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
			case "normalization_partial":
				findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
			case "ecc_uncorrected_errors":
				findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected))
			case "ecc_corrected_errors":
				findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected))
			}
		}
		if gpu.Backend == "driver-ptx" {
			findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
		}
		// A power limit set >5% below default artificially depresses results.
		if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 {
			findings = append(findings, fmt.Sprintf(
				"GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.",
				gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100,
			))
		}
		// Flag significant TDP deviation (over or under) from calibration.
		if gpu.CalibratedPeakPowerW > 0 {
			ref := gpu.DefaultPowerLimitW
			if ref <= 0 {
				ref = gpu.PowerLimitW
			}
			if ref > 0 {
				// Signed deviation: negative = under-TDP, positive = over-TDP.
				// Asymmetric thresholds: >10% under or >5% over is reported.
				deviationPct := (gpu.CalibratedPeakPowerW - ref) / ref * 100
				switch {
				case deviationPct < -10:
					findings = append(findings, fmt.Sprintf(
						"GPU %d reached only %.0f W (%.0f%% of rated %.0f W) under targeted_power. Check power delivery or cooling.",
						gpu.Index, gpu.CalibratedPeakPowerW, gpu.CalibratedPeakPowerW/ref*100, ref,
					))
				case deviationPct > 5:
					findings = append(findings, fmt.Sprintf(
						"GPU %d exceeded rated TDP: %.0f W measured vs %.0f W rated (+%.0f%%). Power limit may not be enforced correctly.",
						gpu.Index, gpu.CalibratedPeakPowerW, ref, deviationPct,
					))
				}
			}
		}
	}
	if result.Interconnect != nil && result.Interconnect.Supported {
		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
	}
	if cl := result.CPULoad; cl != nil {
		switch cl.Status {
		case "high":
			findings = append(findings, fmt.Sprintf(
				"Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). A competing CPU workload may skew GPU results.",
				cl.AvgPct, cl.MaxPct,
			))
		case "unstable":
			findings = append(findings, fmt.Sprintf(
				"Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.",
				cl.AvgPct, cl.P95Pct,
			))
		}
	}
	// Cross-check GPU-reported power against the server-level IPMI delta.
	if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 {
		if sp.ReportingRatio < 0.75 {
			findings = append(findings, fmt.Sprintf(
				"GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.",
				sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio,
			))
		} else if sp.ReportingRatio > 1.25 {
			findings = append(findings, fmt.Sprintf(
				"Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.",
				sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100,
			))
		}
	}
	return dedupeStrings(findings)
}

// benchmarkOverallStatus folds per-GPU statuses into one run status:
// FAILED when no GPU passed, PARTIAL when any GPU was PARTIAL/UNSUPPORTED or
// normalization was incomplete, otherwise OK.
func benchmarkOverallStatus(result NvidiaBenchmarkResult) string {
	if len(result.GPUs) == 0 {
		return "FAILED"
	}
	hasOK := false
	hasPartial := result.Normalization.Status != "full"
	for _, gpu := range result.GPUs {
		switch gpu.Status {
		case "OK":
			hasOK = true
		case "PARTIAL", "UNSUPPORTED":
			hasPartial = true
		}
	}
	if !hasOK {
		return "FAILED"
	}
	if hasPartial {
		return "PARTIAL"
	}
	return "OK"
}

// findBenchmarkNormalization returns a pointer to the normalization entry for
// GPU idx, or nil when absent. The pointer aliases the slice element.
func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU {
	for i := range items {
		if items[i].Index == idx {
			return &items[i]
		}
	}
	return nil
}

// classifySATErrorStatus maps a failed SAT command to UNSUPPORTED (tool or
// feature missing) or FAILED (real error), via the shared classifier.
func classifySATErrorStatus(out []byte, err error) string {
	status, _ := classifySATResult("benchmark", out, err)
	if status == "UNSUPPORTED" {
		return "UNSUPPORTED"
	}
	return "FAILED"
}

// parseBenchmarkFloat parses an nvidia-smi float field, treating empty,
// "N/A", and "[Not Supported]" values (and any parse failure) as 0.
func parseBenchmarkFloat(raw string) float64 {
	raw = strings.TrimSpace(raw)
	if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") {
		return 0
	}
	value, _ := strconv.ParseFloat(raw, 64)
	return value
}

// parseBenchmarkUint64 parses an nvidia-smi integer field with the same
// "missing means zero" semantics as parseBenchmarkFloat.
func parseBenchmarkUint64(raw string) uint64 {
	raw = strings.TrimSpace(raw)
	if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") {
		return 0
	}
	value, _ := strconv.ParseUint(raw, 10, 64)
	return value
}

// benchmarkMean returns the arithmetic mean, or 0 for an empty slice.
func benchmarkMean(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	var sum float64
	for _, value := range values {
		sum += value
	}
	return sum / float64(len(values))
}

// benchmarkPercentile returns the p-th percentile using linear interpolation
// between closest ranks. The input is copied, so the caller's slice order is
// preserved. Returns 0 for an empty slice.
func benchmarkPercentile(values []float64, p float64) float64 {
	if len(values) == 0 {
		return 0
	}
	copyValues := append([]float64(nil), values...)
	sort.Float64s(copyValues)
	if len(copyValues) == 1 {
		return copyValues[0]
	}
	rank := (p / 100.0) * float64(len(copyValues)-1)
	lower := int(math.Floor(rank))
	upper := int(math.Ceil(rank))
	if lower == upper {
		return copyValues[lower]
	}
	frac := rank - float64(lower)
	return copyValues[lower] + (copyValues[upper]-copyValues[lower])*frac
}

// benchmarkCV returns the coefficient of variation as a percentage
// (population standard deviation / mean × 100), or 0 for empty input or a
// zero mean.
func benchmarkCV(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	mean := benchmarkMean(values)
	if mean == 0 {
		return 0
	}
	var variance float64
	for _, value := range values {
		diff := value - mean
		variance += diff * diff
	}
	variance /= float64(len(values))
	return math.Sqrt(variance) / mean * 100
}

// benchmarkClockDrift compares the mean of the first quarter of samples with
// the mean of the last quarter and returns the downward drift as a percent of
// the head mean. Upward drift (tail >= head) and short series (<4 samples)
// return 0 — only sagging clocks are a problem signal.
func benchmarkClockDrift(values []float64) float64 {
	if len(values) < 4 {
		return 0
	}
	window := len(values) / 4
	if window < 1 {
		window = 1
	}
	head := benchmarkMean(values[:window])
	tail := benchmarkMean(values[len(values)-window:])
	if head <= 0 || tail >= head {
		return 0
	}
	return ((head - tail) / head) * 100
}

// benchmarkMax returns the maximum value, or 0 for an empty slice.
func benchmarkMax(values []float64) float64 {
	var max float64
	for i, value := range values {
		if i == 0 || value > max {
			max = value
		}
	}
	return max
}

// clampScore limits a score to the [0, 100] range.
func clampScore(value float64) float64 {
	switch {
	case value < 0:
		return 0
	case value > 100:
		return 100
	default:
		return value
	}
}

// dedupeStrings trims, drops empties, and removes duplicates while keeping
// first-seen order. Returns nil for an empty input.
func dedupeStrings(values []string) []string {
	if len(values) == 0 {
		return nil
	}
	seen := make(map[string]struct{}, len(values))
	out := make([]string, 0, len(values))
	for _, value := range values {
		value = strings.TrimSpace(value)
		if value == "" {
			continue
		}
		if _, ok := seen[value]; ok {
			continue
		}
		seen[value] = struct{}{}
		out = append(out, value)
	}
	return out
}

// saturatingSub returns after−before, clamped at zero to absorb counter
// resets between readings.
func saturatingSub(after, before uint64) uint64 {
	if after <= before {
		return 0
	}
	return after - before
}

// maxInt returns the larger of two ints.
func maxInt(a, b int) int {
	if a > b {
		return a
	}
	return b
}

// queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi.
// Returns 0 and an error if IPMI is unavailable or the output cannot be parsed.
func queryIPMIServerPowerW() (float64, error) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() cmd := exec.CommandContext(ctx, "ipmitool", "dcmi", "power", "reading") out, err := cmd.Output() if err != nil { return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err) } if w := parseDCMIPowerReading(string(out)); w > 0 { return w, nil } return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output") } // sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for // durationSec seconds. Returns the mean of all successful samples. // Returns 0, false if IPMI is unavailable. func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) { if durationSec <= 0 { return 0, false } deadline := time.Now().Add(time.Duration(durationSec) * time.Second) var samples []float64 loop: for { if w, err := queryIPMIServerPowerW(); err == nil { samples = append(samples, w) } if time.Now().After(deadline) { break } select { case <-ctx.Done(): break loop case <-time.After(2 * time.Second): } } if len(samples) == 0 { return 0, false } var sum float64 for _, w := range samples { sum += w } return sum / float64(len(samples)), true } // characterizeServerPower computes BenchmarkServerPower from idle and loaded // IPMI samples plus the GPU-reported average power during steady state. func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower { sp := &BenchmarkServerPower{Available: ipmiAvailable} if !ipmiAvailable { sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped") return sp } sp.IdleW = idleW sp.LoadedW = loadedW sp.DeltaW = loadedW - idleW sp.GPUReportedSumW = gpuReportedSumW if gpuReportedSumW > 0 && sp.DeltaW > 0 { sp.ReportingRatio = sp.DeltaW / gpuReportedSumW } return sp } // readServerModel returns the DMI system product name (e.g. "SuperMicro SYS-421GE-TNRT"). 
// Returns empty string if unavailable (non-Linux or missing DMI entry). func readServerModel() string { data, err := os.ReadFile("/sys/class/dmi/id/product_name") if err != nil { return "" } return strings.TrimSpace(string(data)) } // filterRowsByGPU returns only the metric rows for a specific GPU index. func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow { var out []GPUMetricRow for _, r := range rows { if r.GPUIndex == gpuIndex { out = append(out, r) } } return out } // parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix // and returns a per-GPU parse result map. func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult { gpuLines := make(map[int][]string) for _, line := range strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") { line = strings.TrimSpace(line) if !strings.HasPrefix(line, "[gpu ") { continue } end := strings.Index(line, "] ") if end < 0 { continue } gpuIdx, err := strconv.Atoi(strings.TrimSpace(line[5:end])) if err != nil { continue } gpuLines[gpuIdx] = append(gpuLines[gpuIdx], line[end+2:]) } results := make(map[int]benchmarkBurnParseResult, len(gpuLines)) for gpuIdx, lines := range gpuLines { // Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog // calls stripBenchmarkPrefix which is a no-op on already-stripped lines. results[gpuIdx] = parseBenchmarkBurnLog(strings.Join(lines, "\n")) } return results } // runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs // simultaneously using a single bee-gpu-burn invocation per phase. 
// The server* pointer parameters are caller-owned accumulators for IPMI power
// characterization shared across benchmark passes; allMetricRows accumulates
// every telemetry row collected during this run.
func runNvidiaBenchmarkParallel(
	ctx context.Context,
	verboseLog, runDir string,
	selected []int,
	infoByIndex map[int]benchmarkGPUInfo,
	opts NvidiaBenchmarkOptions,
	spec benchmarkProfileSpec,
	logFunc func(string),
	result *NvidiaBenchmarkResult,
	calibPowerByIndex map[int]float64,
	serverIdleW *float64,
	serverLoadedWSum *float64,
	serverIdleOK *bool,
	serverLoadedOK *bool,
	serverLoadedSamples *int,
	allMetricRows *[]GPUMetricRow,
	gpuBurnLog string,
) {
	allDevices := joinIndexList(selected)
	// Build per-GPU result stubs. Status starts as FAILED and is upgraded at
	// the end once the run completes; hardware identity and clock-lock data
	// are copied in up front so they survive even a failed run.
	gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
	for _, idx := range selected {
		r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
		if info, ok := infoByIndex[idx]; ok {
			r.UUID = info.UUID
			r.Name = info.Name
			r.BusID = info.BusID
			r.VBIOS = info.VBIOS
			r.PowerLimitW = info.PowerLimitW
			r.MultiprocessorCount = info.MultiprocessorCount
			r.DefaultPowerLimitW = info.DefaultPowerLimitW
			r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
			r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
			r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
		}
		// Calibrated peak power (from the dcgmi targeted_power pre-pass) takes
		// effect only when a positive reading was captured.
		if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
			r.CalibratedPeakPowerW = w
		}
		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
			r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
			r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
		}
		gpuResults[idx] = r
	}
	// Baseline: sample all GPUs together. Cancellation is tolerated — partial
	// baseline rows are still summarized below.
	baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
	if err != nil && err != context.Canceled {
		for _, idx := range selected {
			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
		}
	}
	for _, idx := range selected {
		perGPU := filterRowsByGPU(baselineRows, idx)
		gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
	}
	appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline")
	// Sample server idle power once.
	// Idle power is captured at most once per overall run (the flag is shared
	// across invocations via the caller-owned pointer).
	if !*serverIdleOK {
		if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
			*serverIdleW = w
			*serverIdleOK = true
			logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
		}
	}
	// Warmup: all GPUs simultaneously.
	warmupCmd := []string{
		"bee-gpu-burn",
		"--seconds", strconv.Itoa(spec.WarmupSec),
		"--size-mb", strconv.Itoa(opts.SizeMB),
		"--devices", allDevices,
	}
	logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
	warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc)
	appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup")
	appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut)
	// A warmup failure is noted per GPU but does not abort the run; the
	// steady phases below still execute.
	if warmupErr != nil {
		for _, idx := range selected {
			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
		}
	}
	// Run synthetic precision phases and the combined steady phase as one
	// uninterrupted command so the GPUs stay hot between windows.
	// Snapshot ECC counters before the load so end-of-run deltas can be
	// attributed to the benchmark. Query errors leave a zero-value baseline.
	eccBase := make(map[int]BenchmarkECCCounters, len(selected))
	for _, idx := range selected {
		eccBase[idx], _ = queryECCCounters(idx)
	}
	// The "mixed" plan label maps to metric stage "steady"; per-precision
	// labels map to "gpu-all-steady-<precision>".
	planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, func(label string) string {
		if label == "mixed" {
			return "steady"
		}
		return "gpu-all-steady-" + label
	})
	planCmd := []string{
		"bee-gpu-burn",
		"--seconds", strconv.Itoa(basePhaseSec),
		"--size-mb", strconv.Itoa(opts.SizeMB),
		"--devices", allDevices,
		"--precision-plan", strings.Join(planLabels, ","),
		"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
	}
	logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(benchmarkPrecisionPhases), basePhaseSec, mixedPhaseSec))
	_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc)
	// Persist telemetry and per-phase burn logs for every planned phase.
	// Note phaseRowsByStage is keyed by MetricStage while phaseLogs is keyed
	// by PlanLabel.
	for _, phaseSpec := range planPhases {
		if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
			appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage)
		}
		appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
	}
	// Per-precision steady summaries: telemetry plus summed throughput across
	// the supported profiles reported by bee-gpu-burn for that precision.
	for _, prec := range benchmarkPrecisionPhases {
		phaseLogName := "gpu-all-steady-" + prec
		phaseRows := phaseRowsByStage[phaseLogName]
		if len(phaseRows) == 0 {
			continue
		}
		parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec]))
		for _, idx := range selected {
			perGPU := filterRowsByGPU(phaseRows, idx)
			if len(perGPU) == 0 {
				continue
			}
			phase := BenchmarkPrecisionSteadyPhase{
				Precision: prec,
				Steady:    summarizeBenchmarkTelemetry(perGPU),
			}
			if pr, ok := parseByGPU[idx]; ok {
				for _, p := range pr.Profiles {
					if p.Supported {
						phase.TeraOpsPerSec += p.TeraOpsPerSec
						phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
					}
				}
			}
			gpuResults[idx].PrecisionSteady = append(gpuResults[idx].PrecisionSteady, phase)
		}
	}
	// Snapshot throttle counters before steady.
	beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
	for _, idx := range selected {
		beforeThrottle[idx], _ = queryThrottleCounters(idx)
	}
	logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec))
	// Sample server power via IPMI in parallel with steady phase.
	//
	// NOTE(review): ipmiStopCh is closed immediately after the goroutine is
	// launched (see close(ipmiStopCh) below), so the goroutine's first select
	// takes the stop case before the 15 s warm-up wait and returns without
	// collecting a single sample. The receive from ipmiResultCh then observes
	// the channel closed (ok == false), so serverLoadedWSum is never updated.
	// This looks like a leftover from a previous structure where a blocking
	// steady command ran between the goroutine launch and the close — the
	// steady load now happens earlier inside the precision-plan command.
	// Confirm intent and either remove this goroutine or move the sampling to
	// run alongside the precision plan.
	ipmiStopCh := make(chan struct{})
	ipmiResultCh := make(chan float64, 1)
	go func() {
		defer close(ipmiResultCh)
		var samples []float64
		ticker := time.NewTicker(5 * time.Second)
		defer ticker.Stop()
		// Skip the first 15 s so readings reflect fully-loaded power.
		select {
		case <-ipmiStopCh:
			return
		case <-time.After(15 * time.Second):
		}
		for {
			if w, err := queryIPMIServerPowerW(); err == nil {
				samples = append(samples, w)
			}
			select {
			case <-ipmiStopCh:
				// Deliver the mean only if at least one sample was taken;
				// otherwise the deferred close signals "no result".
				if len(samples) > 0 {
					var sum float64
					for _, w := range samples {
						sum += w
					}
					ipmiResultCh <- sum / float64(len(samples))
				}
				return
			case <-ticker.C:
			}
		}
	}()
	close(ipmiStopCh)
	if loadedW, ok := <-ipmiResultCh; ok {
		*serverLoadedWSum += loadedW
		(*serverLoadedSamples)++
		*serverLoadedOK = true
		logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
	}
	afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
	for _, idx := range selected {
		afterThrottle[idx], _ = queryThrottleCounters(idx)
	}
	// The combined steady window is keyed "steady" / "mixed" (see the
	// metric-stage mapping passed to buildBenchmarkSteadyPlan).
	steadyRows := phaseRowsByStage["steady"]
	parseResults := parseBenchmarkBurnLogByGPU(string(phaseLogs["mixed"]))
	for _, idx := range selected {
		perGPU := filterRowsByGPU(steadyRows, idx)
		gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
		gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
		if eccFinal, err := queryECCCounters(idx); err == nil {
			gpuResults[idx].ECC = diffECCCounters(eccBase[idx], eccFinal)
		}
		if pr, ok := parseResults[idx]; ok {
			gpuResults[idx].ComputeCapability = pr.ComputeCapability
			gpuResults[idx].Backend = pr.Backend
			gpuResults[idx].PrecisionResults = pr.Profiles
			if pr.Fallback {
				gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
			}
		}
		if planErr != nil {
			gpuResults[idx].Notes = append(gpuResults[idx].Notes, "precision plan failed: "+planErr.Error())
		}
	}
	// Cooldown: all GPUs together.
	if spec.CooldownSec > 0 {
		cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
		if err != nil && err != context.Canceled {
			for _, idx := range selected {
				gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
			}
		}
		for _, idx := range selected {
			perGPU := filterRowsByGPU(cooldownRows, idx)
			gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
		}
		appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
	}
	// Score and finalize each GPU. Status precedence: plan error > PTX
	// fallback (PARTIAL) > OK.
	for _, idx := range selected {
		r := gpuResults[idx]
		r.Scores = scoreBenchmarkGPUResult(*r)
		r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
		pr := parseResults[idx]
		switch {
		case planErr != nil:
			r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
		case pr.Fallback:
			r.Status = "PARTIAL"
		default:
			r.Status = "OK"
		}
		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
	}
}

// readBenchmarkHostConfig reads static CPU and memory configuration from
// /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable.
func readBenchmarkHostConfig() *BenchmarkHostConfig {
	cfg := &BenchmarkHostConfig{}
	populated := false
	// Parse /proc/cpuinfo for CPU model, sockets, cores, threads.
if data, err := os.ReadFile("/proc/cpuinfo"); err == nil { socketIDs := map[string]struct{}{} coresPerSocket := map[string]int{} var modelName string threads := 0 for _, line := range strings.Split(string(data), "\n") { kv := strings.SplitN(line, ":", 2) if len(kv) != 2 { continue } key := strings.TrimSpace(kv[0]) val := strings.TrimSpace(kv[1]) switch key { case "processor": threads++ case "model name": if modelName == "" { modelName = val } case "physical id": socketIDs[val] = struct{}{} case "cpu cores": // Overwrite per-socket core count (last wins per socket, but all // entries for the same socket report the same value). if physLine := ""; physLine == "" { // We accumulate below by treating cpu cores as a per-thread // field; sum by socket requires a two-pass approach. Use the // simpler approximation: totalCores = threads / (threads per core). _ = val } } } // Second pass: per-socket core count. var curSocket string for _, line := range strings.Split(string(data), "\n") { kv := strings.SplitN(line, ":", 2) if len(kv) != 2 { continue } key := strings.TrimSpace(kv[0]) val := strings.TrimSpace(kv[1]) switch key { case "physical id": curSocket = val case "cpu cores": if curSocket != "" { if _, seen := coresPerSocket[curSocket]; !seen { v, _ := strconv.Atoi(val) coresPerSocket[curSocket] = v } } } } totalCores := 0 for _, c := range coresPerSocket { totalCores += c } cfg.CPUModel = modelName cfg.CPUSockets = len(socketIDs) if cfg.CPUSockets == 0 && threads > 0 { cfg.CPUSockets = 1 } cfg.CPUCores = totalCores cfg.CPUThreads = threads if modelName != "" || threads > 0 { populated = true } } // Parse /proc/meminfo for total physical RAM. 
	// MemTotal is reported in kB; convert to GiB. Parse errors leave 0.
	if data, err := os.ReadFile("/proc/meminfo"); err == nil {
		for _, line := range strings.Split(string(data), "\n") {
			if strings.HasPrefix(line, "MemTotal:") {
				fields := strings.Fields(line)
				if len(fields) >= 2 {
					kb, _ := strconv.ParseUint(fields[1], 10, 64)
					cfg.MemTotalGiB = float64(kb) / (1024 * 1024)
					populated = true
				}
				break
			}
		}
	}
	if !populated {
		return nil
	}
	return cfg
}

// startCPULoadSampler starts a goroutine that samples host CPU load every
// intervalSec seconds until stopCh is closed, then sends the collected
// samples on the returned channel.
//
// NOTE(review): time.NewTicker panics for a non-positive duration — confirm
// all callers pass intervalSec >= 1.
func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 {
	// Buffered so the goroutine can deliver its result and exit even if the
	// caller has not started receiving yet.
	ch := make(chan []float64, 1)
	go func() {
		var samples []float64
		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				ch <- samples
				return
			case <-ticker.C:
				// Zero/negative readings are treated as failed samples.
				if pct := sampleCPULoadPct(); pct > 0 {
					samples = append(samples, pct)
				}
			}
		}
	}()
	return ch
}

// summarizeCPULoad computes stats over sampled CPU load values and assigns
// a health status. Returns nil when no samples were collected.
func summarizeCPULoad(samples []float64) *BenchmarkCPULoad {
	if len(samples) == 0 {
		return nil
	}
	// Work on a sorted copy so the caller's slice is left untouched.
	sorted := append([]float64(nil), samples...)
	sort.Float64s(sorted)
	var sum float64
	for _, v := range sorted {
		sum += v
	}
	avg := sum / float64(len(sorted))
	// Nearest-rank p95 via truncation; index is always < len(sorted).
	p95 := sorted[int(float64(len(sorted))*0.95)]
	max := sorted[len(sorted)-1]
	cl := &BenchmarkCPULoad{
		AvgPct:  math.Round(avg*10) / 10,
		MaxPct:  math.Round(max*10) / 10,
		P95Pct:  math.Round(p95*10) / 10,
		Samples: len(sorted),
	}
	// Compute standard deviation to detect instability.
	// Population standard deviation over all samples.
	var variance float64
	for _, v := range sorted {
		d := v - avg
		variance += d * d
	}
	stdDev := math.Sqrt(variance / float64(len(sorted)))
	// Thresholds: avg > 20% or max > 40% => "high"; stddev > 12 => "unstable".
	switch {
	case avg > 20 || max > 40:
		cl.Status = "high"
		cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, max)
	case stdDev > 12:
		cl.Status = "unstable"
		cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev)
	default:
		cl.Status = "ok"
	}
	return cl
}

// runBenchmarkPowerCalibration runs a short dcgmi targeted_power test while
// collecting nvidia-smi power samples in parallel. It returns a map from GPU
// index to p95 observed power (watts), which is used as the reference for
// PowerSustainScore instead of the hardware default limit.
//
// If dcgmi is unavailable or the run fails the function returns an empty map
// and the caller falls back to DefaultPowerLimitW. The calibration is skipped
// gracefully — it must never block or fail the main benchmark.
func runBenchmarkPowerCalibration(
	ctx context.Context,
	verboseLog, runDir string,
	gpuIndices []int,
	logFunc func(string),
) map[int]float64 {
	const calibDurationSec = 120
	// dcgmi must be present.
	if _, err := exec.LookPath("dcgmi"); err != nil {
		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
		return map[int]float64{}
	}
	logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
	cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
	out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, logFunc)
	// Best-effort persistence of the raw log; write errors are ignored by
	// design (calibration must never fail the benchmark). NOTE(review): the
	// same log name is also passed to runBenchmarkCommandWithMetrics above —
	// verify this WriteFile is not a duplicate of what that helper writes.
	_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
	if err != nil {
		logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))
		return map[int]float64{}
	}
	// Group rows by GPU index and compute p95 power for each.
	result := make(map[int]float64, len(gpuIndices))
	for _, idx := range gpuIndices {
		perGPU := filterRowsByGPU(rows, idx)
		if len(perGPU) == 0 {
			continue
		}
		// Keep only positive power readings; zero/negative means the sample
		// failed or the GPU was idle-reported.
		powers := make([]float64, 0, len(perGPU))
		for _, r := range perGPU {
			if r.PowerW > 0 {
				powers = append(powers, r.PowerW)
			}
		}
		if len(powers) == 0 {
			continue
		}
		// GPUs with no usable samples are simply absent from the map; the
		// caller falls back to DefaultPowerLimitW for those.
		p95 := benchmarkPercentile(powers, 95)
		if p95 > 0 {
			result[idx] = p95
			logFunc(fmt.Sprintf("power calibration: GPU %d p95=%.0f W (%d samples)", idx, p95, len(powers)))
		}
	}
	return result
}