package platform

import (
	"context"
	"encoding/csv"
	"encoding/json"
	"errors"
	"fmt"
	"math"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"
)

// benchmarkVersion is embedded in result.json so downstream consumers can
// detect format changes between benchmark runs.
const benchmarkVersion = "2"

// benchmarkProfileSpec holds the per-profile stage durations (seconds)
// resolved from the requested profile name by resolveBenchmarkProfile.
type benchmarkProfileSpec struct {
	Name        string
	BaselineSec int
	WarmupSec   int
	SteadySec   int
	NCCLSec     int
	CooldownSec int
}

// benchmarkGPUInfo is the static inventory record for one GPU, populated from
// nvidia-smi --query-gpu CSV output (queryBenchmarkGPUInfo) and optionally
// backfilled from nvidia-smi -q (enrichGPUInfoWithMaxClocks).
type benchmarkGPUInfo struct {
	Index                int
	UUID                 string
	Name                 string
	BusID                string
	VBIOS                string
	PowerLimitW          float64
	DefaultPowerLimitW   float64
	MaxGraphicsClockMHz  float64
	MaxMemoryClockMHz    float64
	BaseGraphicsClockMHz float64
	MultiprocessorCount  int
}

// benchmarkPowerCalibrationResult captures the outcome of a per-GPU power
// calibration pass: telemetry summary, applied power limit, attempt count,
// and derating/cooling findings.
type benchmarkPowerCalibrationResult struct {
	Summary            BenchmarkTelemetrySummary
	AppliedPowerLimitW float64
	Attempts           int
	Derated            bool
	Completed          bool
	Notes              []string
	// CoolingWarning is set when the GPU throttled thermally with a clock drop
	// ≥20% while server fans were below 100% duty cycle — a signal that the
	// cooling system may not be correctly configured for full GPU load.
	CoolingWarning string
}

// benchmarkBurnProfile describes one precision profile reported by the
// bee-gpu-burn tool (GEMM dimensions, lane count, iteration count).
// NOTE(review): the parser filling this type is not in view — field semantics
// inferred from names; confirm against parseBenchmarkBurnLog.
type benchmarkBurnProfile struct {
	name       string
	category   string
	supported  bool
	lanes      int
	m          uint64
	n          uint64
	k          uint64
	iterations uint64
	notes      string
}

// benchmarkBurnParseResult is the parsed form of a bee-gpu-burn log: device
// identity, backend, and per-precision throughput profiles. Fallback is true
// when the tool reported the driver PTX fallback path.
type benchmarkBurnParseResult struct {
	Device            string
	ComputeCapability string
	Backend           string
	DurationSec       int
	Profiles          []BenchmarkPrecisionResult
	Fallback          bool
}

// benchmarkRestoreAction is a named undo step (e.g. a clock unlock) recorded
// while normalizing GPU state; actions are replayed in reverse order on exit.
type benchmarkRestoreAction struct {
	name string
	fn   func()
}

var (
	// READY lines emitted by bee-gpu-burn: "<profile>[<lane>]=READY dim=MxNxK".
	benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
	// SKIPPED lines: "<profile>[lane]=SKIPPED <reason>" (lane is optional).
	benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
	// Iteration counters: "<profile>_iterations=<n>".
	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
)

// benchmarkPrecisionPhases lists the precision categories run as individual
// steady-state windows before the combined steady pass. Order is from lowest
// to highest power draw so thermal ramp-up is gradual.
var benchmarkPrecisionPhases = []string{"int8", "fp8", "fp16", "fp32", "fp64", "fp4"}

// computeCapabilityCode converts a "major.minor" CUDA compute-capability
// string into a single comparable integer (e.g. "9.0" -> 90, "8.6" -> 86).
// Empty or unparseable input yields 0.
func computeCapabilityCode(raw string) int {
	raw = strings.TrimSpace(raw)
	if raw == "" {
		return 0
	}
	parts := strings.SplitN(raw, ".", 2)
	major, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
	minor := 0
	if len(parts) > 1 {
		minor, _ = strconv.Atoi(strings.TrimSpace(parts[1]))
	}
	return major*10 + minor
}

// benchmarkSupportedPrecisions filters benchmarkPrecisionPhases for the given
// compute capability: fp4 is dropped for any known capability below 100.
// An unknown capability (code 0) keeps fp4 in the plan.
func benchmarkSupportedPrecisions(computeCapability string) []string {
	cc := computeCapabilityCode(computeCapability)
	out := make([]string, 0, len(benchmarkPrecisionPhases))
	for _, prec := range benchmarkPrecisionPhases {
		if prec == "fp4" && cc > 0 && cc < 100 {
			continue
		}
		out = append(out, prec)
	}
	return out
}

// buildBenchmarkSteadyPlan computes one steady window per precision plus the
// final combined ("mixed") window. Named profiles use fixed durations; any
// other profile derives them from spec.SteadySec with the mixed window
// weighted 5x a single precision window. metricStage maps a plan label
// ("fp16", "mixed", ...) to the metric stage name used for telemetry
// attribution.
func buildBenchmarkSteadyPlan(spec benchmarkProfileSpec, precisions []string, metricStage func(string) string) (planLabels []string, planPhases []benchmarkPlannedPhase, basePhaseSec int, mixedPhaseSec int) {
	if len(precisions) == 0 {
		precisions = append([]string(nil), benchmarkPrecisionPhases...)
	}
	switch spec.Name {
	case NvidiaBenchmarkProfileStandard:
		basePhaseSec = 60
		mixedPhaseSec = 300
	case NvidiaBenchmarkProfileStability:
		basePhaseSec = 300
		mixedPhaseSec = 3600
	case NvidiaBenchmarkProfileOvernight:
		basePhaseSec = 3600
		mixedPhaseSec = 14400
	default:
		// One share per precision window plus 5 shares for the mixed window.
		// NOTE(review): precisions is non-empty here, so totalWeight > 0 and
		// the guard below is effectively dead code.
		totalWeight := len(precisions) + 5
		if totalWeight <= 0 {
			return nil, nil, 0, 0
		}
		basePhaseSec = spec.SteadySec / totalWeight
		if basePhaseSec <= 0 {
			basePhaseSec = 1
		}
		mixedPhaseSec = basePhaseSec * 5
	}
	planLabels = make([]string, 0, len(precisions)+1)
	planPhases = make([]benchmarkPlannedPhase, 0, len(precisions)+1)
	for _, prec := range precisions {
		planLabels = append(planLabels, prec)
		planPhases = append(planPhases, benchmarkPlannedPhase{
			PlanLabel:   prec,
			MetricStage: metricStage(prec),
			DurationSec: basePhaseSec,
		})
	}
	planLabels = append(planLabels, "mixed")
	planPhases = append(planPhases, benchmarkPlannedPhase{
		PlanLabel:   "mixed",
		MetricStage: metricStage("mixed"),
		DurationSec: mixedPhaseSec,
	})
	return planLabels,
planPhases, basePhaseSec, mixedPhaseSec
}

// benchmarkPlanDurationsCSV renders the per-phase durations as a
// comma-separated list for the bee-gpu-burn --precision-plan-seconds flag.
func benchmarkPlanDurationsCSV(phases []benchmarkPlannedPhase) string {
	values := make([]string, 0, len(phases))
	for _, phase := range phases {
		values = append(values, strconv.Itoa(phase.DurationSec))
	}
	return strings.Join(values, ",")
}

// benchmarkPlannedPhaseStatus classifies one phase's log output into a
// (status, note) pair: "OK", "UNSUPPORTED" (phase cannot run on this
// GPU/driver path) or "FAILED". Matching is case-insensitive.
func benchmarkPlannedPhaseStatus(raw []byte) (string, string) {
	text := strings.ToLower(strings.TrimSpace(string(raw)))
	switch {
	case text == "":
		return "FAILED", "phase produced no output"
	case strings.Contains(text, "phase_error="):
		if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") || strings.Contains(text, "cublaslt_profiles=unsupported") {
			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
		}
		return "FAILED", "precision phase failed"
	case strings.Contains(text, "status=failed"):
		if strings.Contains(text, "unsupported") || strings.Contains(text, "not supported") {
			return "UNSUPPORTED", "precision phase unsupported on this GPU/userspace path"
		}
		return "FAILED", "precision phase failed"
	default:
		return "OK", ""
	}
}

// benchmarkCalibrationThrottleReason reports which throttle counter advanced
// between two snapshots, checked in priority order: hardware thermal,
// software thermal, hardware power brake. Empty string means no throttling.
func benchmarkCalibrationThrottleReason(before, after BenchmarkThrottleCounters) string {
	diff := diffThrottleCounters(before, after)
	switch {
	case diff.HWThermalSlowdownUS > 0:
		return "hw_thermal"
	case diff.SWThermalSlowdownUS > 0:
		return "sw_thermal"
	case diff.HWPowerBrakeSlowdownUS > 0:
		return "hw_power_brake"
	default:
		return ""
	}
}

// setBenchmarkPowerLimit applies a power limit (watts) to one GPU via
// nvidia-smi -pl. The command output is folded into the returned error on
// failure.
func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, powerLimitW int) error {
	if powerLimitW <= 0 {
		return fmt.Errorf("invalid power limit %d", powerLimitW)
	}
	out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("gpu-%d-set-power-limit-%dw", gpuIndex, powerLimitW), []string{
		"nvidia-smi", "-i", strconv.Itoa(gpuIndex), "-pl", strconv.Itoa(powerLimitW),
	}, nil, nil)
	if err != nil {
		return fmt.Errorf("set power limit gpu=%d limit=%dw: %w (%s)", gpuIndex, powerLimitW, err, strings.TrimSpace(string(out)))
	}
	return nil
}

// RunNvidiaBenchmark executes the full NVIDIA GPU benchmark: inventory,
// state normalization, per-GPU baseline/warmup/steady phases (sequential or
// parallel), an optional multi-GPU scalability ramp and NCCL interconnect
// test, and finally writes result.json, report.md and summary.txt into a
// timestamped directory beneath baseDir. It returns that run directory path.
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/perf"
	}
	spec := resolveBenchmarkProfile(opts.Profile)
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "perf-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	hostname, _ := os.Hostname()
	result := NvidiaBenchmarkResult{
		BenchmarkVersion:   benchmarkVersion,
		GeneratedAt:        time.Now().UTC(),
		Hostname:           hostname,
		ServerModel:        readServerModel(),
		BenchmarkProfile:   spec.Name,
		ParallelGPUs:       opts.ParallelGPUs,
		RampStep:           opts.RampStep,
		RampTotal:          opts.RampTotal,
		RampRunID:          opts.RampRunID,
		SelectedGPUIndices: append([]int(nil), selected...),
		HostConfig:         readBenchmarkHostConfig(),
		Normalization: BenchmarkNormalization{
			Status: "full",
		},
	}
	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
	var metricRows []GPUMetricRow
	metricTimelineSec := 0.0
	gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
	// Server power characterization state — populated during per-GPU phases.
	var serverIdleW, serverLoadedWSum float64
	var serverIdleOK, serverLoadedOK bool
	var serverLoadedSamples int
	// Run nvidia-smi -q first: used both for the log file and as a fallback
	// source of max clock values when CSV clock fields are unsupported.
var nvsmiQOut []byte
	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
		nvsmiQOut = out
		// Best-effort: keep the raw -q dump alongside the run artifacts.
		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
	}
	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
		result.Normalization.Status = "partial"
	}
	// Enrich with max clocks from verbose output — covers GPUs where
	// clocks.max.* CSV fields are unsupported (e.g. Blackwell / driver 98.x).
	enrichGPUInfoWithMaxClocks(infoByIndex, nvsmiQOut)
	// Pre-existing GPU compute work would skew results; record it and mark the
	// run as only partially normalized.
	activeApps, err := queryActiveComputeApps(selected)
	if err == nil && len(activeApps) > 0 {
		result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark")
		result.Normalization.Notes = append(result.Normalization.Notes, activeApps...)
		result.Normalization.Status = "partial"
	}
	restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result)
	defer func() {
		// Undo clock locks etc. in reverse order of application.
		for i := len(restoreActions) - 1; i >= 0; i-- {
			restoreActions[i].fn()
		}
	}()
	// No power calibration before performance benchmark — GPUs run at their
	// default power limits. PowerSustainScore is derived from steady-state power
	// observed during the benchmark itself.
	calibByIndex := make(map[int]benchmarkPowerCalibrationResult)
	// Start background CPU load sampler — samples every 10s during GPU phases.
cpuStopCh := make(chan struct{})
	cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
	if opts.ParallelGPUs {
		// Parallel path: one combined run across all selected GPUs; results and
		// accumulated power/metric state are written through the pointers.
		runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, &metricTimelineSec, gpuBurnLog)
	} else {
		// Sequential path: each GPU gets its own baseline/warmup/steady cycle.
		for _, idx := range selected {
			gpuResult := BenchmarkGPUResult{
				Index:  idx,
				Status: "FAILED",
			}
			if info, ok := infoByIndex[idx]; ok {
				gpuResult.UUID = info.UUID
				gpuResult.Name = info.Name
				gpuResult.BusID = info.BusID
				gpuResult.VBIOS = info.VBIOS
				gpuResult.PowerLimitW = info.PowerLimitW
				gpuResult.MultiprocessorCount = info.MultiprocessorCount
				gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
				gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
				gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
				gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
			}
			// NOTE(review): calibByIndex is created empty above and nothing on
			// the sequential path populates it, so this copy is currently a
			// no-op — confirm whether it is kept for parallel-path symmetry.
			if calib, ok := calibByIndex[idx]; ok {
				gpuResult.CalibratedPeakPowerW = calib.Summary.P95PowerW
				gpuResult.CalibratedPeakTempC = calib.Summary.P95TempC
				gpuResult.PowerCalibrationTries = calib.Attempts
				gpuResult.PowerLimitDerated = calib.Derated
				gpuResult.Notes = append(gpuResult.Notes, calib.Notes...)
if calib.CoolingWarning != "" {
					gpuResult.CoolingWarning = calib.CoolingWarning
				}
			}
			if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
				gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
				gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
			}
			// Idle baseline telemetry before any load is applied.
			baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
			if err != nil && err != context.Canceled {
				gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
			}
			gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
			appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx), &metricTimelineSec, float64(spec.BaselineSec))
			// Sample server idle power once (first GPU only — server state is global).
			if !serverIdleOK {
				if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
					serverIdleW = w
					serverIdleOK = true
					logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
				}
			}
			warmupCmd := []string{
				"bee-gpu-burn",
				"--seconds", strconv.Itoa(spec.WarmupSec),
				"--size-mb", strconv.Itoa(opts.SizeMB),
				"--devices", strconv.Itoa(idx),
			}
			logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
			warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc)
			appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx), &metricTimelineSec, float64(spec.WarmupSec))
			appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut)
			if warmupErr != nil {
				// Warmup failure aborts this GPU; record it and move on.
				gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
				result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
				continue
			}
			warmupParse := parseBenchmarkBurnLog(string(warmupOut))
			if gpuResult.ComputeCapability == "" {
				gpuResult.ComputeCapability = warmupParse.ComputeCapability
			}
			// Run synthetic precision phases and the combined steady phase as one
			// uninterrupted command so the GPU stays hot between windows.
			eccBase, _ := queryECCCounters(idx)
			supportedPrecisions := benchmarkSupportedPrecisions(gpuResult.ComputeCapability)
			planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string {
				if label == "mixed" {
					return fmt.Sprintf("gpu-%d-steady", idx)
				}
				return fmt.Sprintf("gpu-%d-steady-%s", idx, label)
			})
			planCmd := []string{
				"bee-gpu-burn",
				"--seconds", strconv.Itoa(basePhaseSec),
				"--size-mb", strconv.Itoa(opts.SizeMB),
				"--devices", strconv.Itoa(idx),
				"--precision-plan", strings.Join(planLabels, ","),
				"--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases),
			}
			logFunc(fmt.Sprintf("GPU %d: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", idx, len(supportedPrecisions), basePhaseSec, mixedPhaseSec))
			_, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-precision-plan.log", idx), planCmd, nil, []int{idx}, planPhases, logFunc)
			// Attribute telemetry rows and tool logs to their plan phases.
			for _, phaseSpec := range planPhases {
				if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 {
					appendBenchmarkMetrics(&metricRows, rows, phaseSpec.MetricStage, &metricTimelineSec, float64(phaseSpec.DurationSec))
				}
				appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel])
			}
			// Build one BenchmarkPrecisionSteadyPhase per precision window.
			for _, prec := range supportedPrecisions {
				stageName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
				phaseRows := phaseRowsByStage[stageName]
				phase := BenchmarkPrecisionSteadyPhase{
					Precision: prec,
					Status:    "OK",
					Steady:    summarizeBenchmarkTelemetry(phaseRows),
				}
				if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" {
					phase.Status = status
					phase.Notes = note
					gpuResult.PrecisionFailures = append(gpuResult.PrecisionFailures, prec+":"+status)
				}
				// Sum throughput over all supported profiles in this phase.
				for _, p := range parseBenchmarkBurnLog(string(phaseLogs[prec])).Profiles {
					if p.Supported {
						phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
					}
				}
				gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase)
			}
			// NOTE(review): the precision-plan command above has already run to
			// completion at this point, so this "before" throttle snapshot does
			// not bracket the steady workload — the before/after diff below will
			// only cover the (near-instant) section between here and
			// afterThrottle. Confirm whether this block was meant to run
			// concurrently with the planned command.
			beforeThrottle, _ := queryThrottleCounters(idx)
			logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, mixedPhaseSec))
			// Sample server power via IPMI in parallel with the steady phase.
			// We collect readings every 5s and average them.
			ipmiStopCh := make(chan struct{})
			ipmiResultCh := make(chan float64, 1)
			go func() {
				defer close(ipmiResultCh)
				var samples []float64
				ticker := time.NewTicker(5 * time.Second)
				defer ticker.Stop()
				// First sample after a short warmup delay.
				select {
				case <-ipmiStopCh:
					return
				case <-time.After(15 * time.Second):
				}
				for {
					if w, err := queryIPMIServerPowerW(); err == nil {
						samples = append(samples, w)
					}
					select {
					case <-ipmiStopCh:
						if len(samples) > 0 {
							var sum float64
							for _, w := range samples {
								sum += w
							}
							ipmiResultCh <- sum / float64(len(samples))
						}
						return
					case <-ticker.C:
					}
				}
			}()
			// NOTE(review): ipmiStopCh is closed immediately after the sampler
			// goroutine starts; the goroutine's 15 s warmup select will take the
			// closed-channel branch and exit with zero samples, so serverLoaded*
			// will effectively never be populated here. Looks like a leftover
			// from a refactor where the steady command ran between these lines.
			close(ipmiStopCh)
			if loadedW, ok := <-ipmiResultCh; ok {
				serverLoadedWSum += loadedW
				serverLoadedSamples++
				serverLoadedOK = true
				logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
			}
			afterThrottle, _ := queryThrottleCounters(idx)
			if planErr != nil {
				gpuResult.Notes = append(gpuResult.Notes, "precision plan failed: "+planErr.Error())
			}
			// The "mixed" phase carries the combined steady result.
			steadyRows := phaseRowsByStage[fmt.Sprintf("gpu-%d-steady", idx)]
			parseResult := parseBenchmarkBurnLog(string(phaseLogs["mixed"]))
			gpuResult.ComputeCapability = parseResult.ComputeCapability
			gpuResult.Backend = parseResult.Backend
			gpuResult.PrecisionResults = parseResult.Profiles
			if parseResult.Fallback {
				gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
			}
			gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
			gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
			if eccFinal, err := queryECCCounters(idx); err == nil {
				gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
			}
if spec.CooldownSec > 0 {
				cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
				if err != nil && err != context.Canceled {
					gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
				}
				gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
				appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx), &metricTimelineSec, float64(spec.CooldownSec))
			}
			gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
			gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
			// Status ladder: plan error > precision failures > PTX fallback > OK.
			if planErr != nil {
				gpuResult.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr)
			} else if len(gpuResult.PrecisionFailures) > 0 {
				gpuResult.Status = "PARTIAL"
			} else if parseResult.Fallback {
				gpuResult.Status = "PARTIAL"
			} else {
				gpuResult.Status = "OK"
			}
			result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
		}
	} // end sequential path
	// Performance scalability ramp-up: run parallel benchmarks for k=2..N GPUs
	// and compute scalability relative to the best single-GPU result.
	// Only runs in sequential mode (each GPU was tested individually above) and
	// when there are at least 2 GPUs.
	if !opts.ParallelGPUs && len(selected) >= 2 {
		// Find the best single-card SyntheticScore as the 1-GPU baseline.
		var bestTOPS float64
		for _, g := range result.GPUs {
			if g.Scores.SyntheticScore > bestTOPS {
				bestTOPS = g.Scores.SyntheticScore
			}
		}
		if bestTOPS > 0 {
			var rampSteps []NvidiaPerformanceRampStep
			var scalabilityPcts []float64
			for k := 2; k <= len(selected); k++ {
				subset := append([]int(nil), selected[:k]...)
rampDir := filepath.Join(runDir, fmt.Sprintf("ramp-%02d", k))
				_ = os.MkdirAll(rampDir, 0755)
				logFunc(fmt.Sprintf("performance ramp: step %d/%d — running %d GPUs in parallel", k, len(selected), k))
				// Each ramp step gets its own throwaway result/power/metric state
				// so it does not pollute the main run's accumulators.
				var rampResult NvidiaBenchmarkResult
				var rampIdleW, rampLoadedWSum float64
				var rampIdleOK, rampLoadedOK bool
				var rampLoadedSamples int
				var rampMetricRows []GPUMetricRow
				var rampTimelineSec float64
				emptyCalib := make(map[int]benchmarkPowerCalibrationResult)
				runNvidiaBenchmarkParallel(ctx, verboseLog, rampDir, subset, infoByIndex, opts, spec, logFunc, &rampResult, emptyCalib, &rampIdleW, &rampLoadedWSum, &rampIdleOK, &rampLoadedOK, &rampLoadedSamples, &rampMetricRows, &rampTimelineSec, "")
				var totalSynth, totalMixed float64
				for _, g := range rampResult.GPUs {
					totalSynth += g.Scores.SyntheticScore
					totalMixed += g.Scores.MixedScore
				}
				// Scalability = achieved aggregate TOPS vs. k * best single GPU.
				scalPct := totalSynth / (float64(k) * bestTOPS) * 100
				scalabilityPcts = append(scalabilityPcts, scalPct)
				stepStatus := "OK"
				if len(rampResult.GPUs) < k {
					stepStatus = "PARTIAL"
				}
				rampSteps = append(rampSteps, NvidiaPerformanceRampStep{
					StepIndex:          k,
					GPUIndices:         subset,
					TotalSyntheticTOPS: totalSynth,
					TotalMixedTOPS:     totalMixed,
					ScalabilityPct:     scalPct,
					Status:             stepStatus,
				})
			}
			result.PerformanceRampSteps = rampSteps
			// NOTE(review): the mean of scalability percentages is stored in a
			// field named PlatformPowerScore — name and content appear
			// mismatched; confirm this is intentional.
			result.PlatformPowerScore = benchmarkMean(scalabilityPcts)
			if len(scalabilityPcts) > 0 {
				// ScalabilityScore is the final (all-GPU) ramp step.
				result.ScalabilityScore = scalabilityPcts[len(scalabilityPcts)-1]
			}
		}
	}
	if len(selected) > 1 && opts.RunNCCL {
		result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
		if result.Interconnect != nil && result.Interconnect.Supported {
			// Fold the interconnect bandwidth into every GPU's composite score.
			for i := range result.GPUs {
				result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps
				result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores)
			}
		}
	}
	// Stop CPU load sampler and attach results.
close(cpuStopCh)
	if cpuSamples := <-cpuSamplesCh; len(cpuSamples) > 0 {
		result.CPULoad = summarizeCPULoad(cpuSamples)
		if result.CPULoad != nil && result.CPULoad.Status != "ok" {
			logFunc(fmt.Sprintf("host CPU load during benchmark: avg=%.1f%% max=%.1f%% status=%s", result.CPULoad.AvgPct, result.CPULoad.MaxPct, result.CPULoad.Status))
		}
	}
	// Compute server power characterization from accumulated IPMI samples.
	var gpuReportedSumW float64
	for _, gpu := range result.GPUs {
		gpuReportedSumW += gpu.Steady.AvgPowerW
	}
	var serverLoadedW float64
	if serverLoadedSamples > 0 {
		serverLoadedW = serverLoadedWSum / float64(serverLoadedSamples)
	}
	result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, gpuReportedSumW, serverIdleOK && serverLoadedOK)
	result.Cooling = summarizeBenchmarkCooling(metricRows)
	// Apply server-power penalty when IPMI reports the server delta is much
	// lower than GPU-reported sum: GPU power telemetry is over-stated, making
	// CalibratedPeakPowerW and PowerSustainScore unreliable.
	// Penalty factor scales from 1.0 (ratio ≥ 0.75, no penalty) down to 0.
if sp := result.ServerPower; sp != nil && sp.Available && sp.ReportingRatio > 0 && sp.ReportingRatio < 0.75 {
		factor := sp.ReportingRatio / 0.75
		for i := range result.GPUs {
			result.GPUs[i].Scores.CompositeScore *= factor
			result.GPUs[i].Notes = append(result.GPUs[i].Notes, fmt.Sprintf("server-power penalty applied (reporting_ratio=%.2f < 0.75): composite score reduced to %.1f%%", sp.ReportingRatio, factor*100))
		}
	}
	result.Findings = buildBenchmarkFindings(result)
	result.OverallStatus = benchmarkOverallStatus(result)
	// Persist artifacts: metrics files, machine-readable JSON, human report.
	writeBenchmarkMetricsFiles(runDir, metricRows)
	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return "", fmt.Errorf("marshal benchmark result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return "", fmt.Errorf("write result.json: %w", err)
	}
	report := renderBenchmarkReportWithCharts(result)
	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
		return "", fmt.Errorf("write report.md: %w", err)
	}
	summary := renderBenchmarkSummary(result)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
		return "", fmt.Errorf("write summary.txt: %w", err)
	}
	return runDir, nil
}

// normalizeNvidiaBenchmarkOptionsForBenchmark canonicalizes the requested
// options: unknown profiles fall back to the standard profile, a negative
// SizeMB is clamped to 0 (tool default), and GPU index lists are deduplicated
// and sorted.
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
	switch strings.TrimSpace(strings.ToLower(opts.Profile)) {
	case NvidiaBenchmarkProfileStability:
		opts.Profile = NvidiaBenchmarkProfileStability
	case NvidiaBenchmarkProfileOvernight:
		opts.Profile = NvidiaBenchmarkProfileOvernight
	default:
		opts.Profile = NvidiaBenchmarkProfileStandard
	}
	if opts.SizeMB < 0 {
		opts.SizeMB = 0
	}
	opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
	opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
	return opts
}

// resolveBenchmarkProfile maps a profile name (case-insensitive) to its stage
// durations; anything unrecognized resolves to the standard profile.
// NOTE(review): every profile sets CooldownSec: 0, so the cooldown stage in
// RunNvidiaBenchmark never runs with these specs — confirm intended.
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
	switch strings.TrimSpace(strings.ToLower(profile)) {
	case NvidiaBenchmarkProfileStability:
		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 120, SteadySec: 3600, NCCLSec: 300, CooldownSec: 0}
	case NvidiaBenchmarkProfileOvernight:
		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 180, SteadySec: 27000, NCCLSec: 600, CooldownSec: 0}
	default:
		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 45, SteadySec: 480, NCCLSec: 180, CooldownSec: 0}
	}
}

// benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try.
// Fields are tried in order; the first successful query wins. Extended fields
// (attribute.multiprocessor_count, power.default_limit) are not supported on
// all driver versions, so we fall back to the base set if the full query fails.
// The minimal fallback omits clock fields entirely — clocks.max.* returns
// exit status 2 on some GPU generations (e.g. Blackwell); max clocks are
// then recovered from nvidia-smi -q via enrichGPUInfoWithMaxClocks.
var benchmarkGPUInfoQueries = []struct {
	fields   string
	extended bool // whether this query includes optional extended fields
	minimal  bool // clock fields omitted; max clocks must be filled separately
}{
	{
		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
		extended: true,
	},
	{
		fields:   "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
		extended: false,
	},
	{
		fields:  "index,uuid,name,pci.bus_id,vbios_version,power.limit",
		minimal: true,
	},
}

// enrichGPUInfoWithMaxClocks fills MaxGraphicsClockMHz / MaxMemoryClockMHz for
// any GPU in infoByIndex where those values are still zero. It parses the
// "Max Clocks" section of nvidia-smi -q output (already available as nvsmiQ).
// This is the fallback for GPUs (e.g. Blackwell) where clocks.max.* CSV fields
// return exit status 2 but the verbose query works fine.
func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []byte) {
	if len(infoByIndex) == 0 || len(nvsmiQ) == 0 {
		return
	}
	// Build bus_id → index map for matching verbose sections to GPU indices.
	busToBenchIdx := make(map[string]int, len(infoByIndex))
	for idx, info := range infoByIndex {
		if info.BusID != "" {
			// nvidia-smi -q uses "GPU 00000000:4E:00.0" (8-digit domain),
			// while --query-gpu returns the same format; normalise to lower.
			busToBenchIdx[strings.ToLower(strings.TrimSpace(info.BusID))] = idx
		}
	}
	// Split the verbose output into per-GPU sections on "^GPU " lines.
	gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
	maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
	maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
	defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
	currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
	smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
	sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
	for i, loc := range sectionStarts {
		busID := strings.ToLower(string(nvsmiQ[loc[2]:loc[3]]))
		benchIdx, ok := busToBenchIdx[busID]
		if !ok {
			// Bus IDs from verbose output may have a different domain prefix;
			// try suffix match on the slot portion (XX:XX.X).
for k, v := range busToBenchIdx {
				if strings.HasSuffix(k, busID) || strings.HasSuffix(busID, k) {
					benchIdx = v
					ok = true
					break
				}
			}
		}
		if !ok {
			continue
		}
		// Section spans from this "GPU ..." header to the next (or EOF).
		end := len(nvsmiQ)
		if i+1 < len(sectionStarts) {
			end = sectionStarts[i+1][0]
		}
		section := nvsmiQ[loc[0]:end]
		// Only backfill fields that are still zero from the CSV query.
		info := infoByIndex[benchIdx]
		if info.MaxGraphicsClockMHz == 0 {
			if m := maxGfxRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
					info.MaxGraphicsClockMHz = v
				}
			}
		}
		if info.MaxMemoryClockMHz == 0 {
			if m := maxMemRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil {
					info.MaxMemoryClockMHz = v
				}
			}
		}
		if info.DefaultPowerLimitW == 0 {
			if m := defaultPwrRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
					info.DefaultPowerLimitW = v
				}
			}
		}
		if info.PowerLimitW == 0 {
			if m := currentPwrRe.FindSubmatch(section); m != nil {
				if v, err := strconv.ParseFloat(string(m[1]), 64); err == nil && v > 0 {
					info.PowerLimitW = v
				}
			}
		}
		if info.MultiprocessorCount == 0 {
			if m := smCountRe.FindSubmatch(section); m != nil {
				if v, err := strconv.Atoi(string(m[1])); err == nil && v > 0 {
					info.MultiprocessorCount = v
				}
			}
		}
		infoByIndex[benchIdx] = info
	}
}

// queryBenchmarkGPUInfo queries nvidia-smi for the GPU inventory, trying each
// field set in benchmarkGPUInfoQueries until one succeeds (see the comment on
// that variable for why there are fallbacks). It returns a map keyed by GPU
// index, or the last error if every query fails.
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
	var lastErr error
	for _, q := range benchmarkGPUInfoQueries {
		args := []string{
			"--query-gpu=" + q.fields,
			"--format=csv,noheader,nounits",
		}
		if len(gpuIndices) > 0 {
			args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
}
		out, err := satExecCommand("nvidia-smi", args...).Output()
		if err != nil {
			// Keep the first 40 chars of the field list for a readable error.
			lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err)
			continue
		}
		r := csv.NewReader(strings.NewReader(string(out)))
		r.TrimLeadingSpace = true
		r.FieldsPerRecord = -1
		rows, err := r.ReadAll()
		if err != nil {
			lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
			continue
		}
		// Minimal query yields 6 fields; the clocked variants at least 9.
		minFields := 6
		if !q.minimal {
			minFields = 9
		}
		infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
		for _, row := range rows {
			if len(row) < minFields {
				continue
			}
			idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
			if err != nil {
				continue
			}
			info := benchmarkGPUInfo{
				Index:       idx,
				UUID:        strings.TrimSpace(row[1]),
				Name:        strings.TrimSpace(row[2]),
				BusID:       strings.TrimSpace(row[3]),
				VBIOS:       strings.TrimSpace(row[4]),
				PowerLimitW: parseBenchmarkFloat(row[5]),
			}
			if !q.minimal {
				info.MaxGraphicsClockMHz = parseBenchmarkFloat(row[6])
				info.MaxMemoryClockMHz = parseBenchmarkFloat(row[7])
				if len(row) >= 9 {
					info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
				}
				if q.extended {
					if len(row) >= 10 {
						info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
					}
					if len(row) >= 11 {
						info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
					}
				}
			}
			infoByIndex[idx] = info
		}
		return infoByIndex, nil
	}
	return nil, lastErr
}

// applyBenchmarkNormalization puts the selected GPUs into a reproducible
// state (persistence mode on, graphics/memory clocks locked to their max) and
// returns the restore actions needed to undo the locks. Any step that fails
// downgrades result.Normalization.Status to "partial". Without root the whole
// normalization is skipped.
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
	if os.Geteuid() != 0 {
		result.Normalization.Status = "partial"
		result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks")
		for _, idx := range gpuIndices {
			result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{
				Index: idx,
				Notes: []string{"normalization skipped: root privileges are required"},
			})
		}
		return nil
	}
	var restore []benchmarkRestoreAction
	for _, idx :=
range gpuIndices {
		rec := BenchmarkNormalizationGPU{Index: idx}
		// Persistence mode keeps the driver loaded between tool invocations.
		if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil {
			rec.PersistenceMode = "failed"
			rec.Notes = append(rec.Notes, "failed to enable persistence mode")
			result.Normalization.Status = "partial"
		} else {
			rec.PersistenceMode = "applied"
		}
		// Lock the graphics clock to the inventory's max; register an -rgc
		// restore action on success.
		if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 {
			target := int(math.Round(info.MaxGraphicsClockMHz))
			if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil {
				rec.GPUClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			} else {
				rec.GPUClockLockStatus = "applied"
				rec.GPUClockLockMHz = float64(target)
				// Copy idx so the restore closure does not capture the loop
				// variable (pre-Go-1.22 loop-capture semantics).
				idxCopy := idx
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() {
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
				}})
			}
		} else {
			rec.GPUClockLockStatus = "skipped"
			rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
			result.Normalization.Status = "partial"
		}
		// Same for the memory clock (-lmc / -rmc), with an extra "unsupported"
		// outcome for GPUs/drivers that reject memory clock locking.
		if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
			target := int(math.Round(info.MaxMemoryClockMHz))
			out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil)
			switch {
			case err == nil:
				rec.MemoryClockLockStatus = "applied"
				rec.MemoryClockLockMHz = float64(target)
				idxCopy := idx
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() {
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil)
				}})
			case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"):
				rec.MemoryClockLockStatus = "unsupported"
				rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path")
				result.Normalization.Status = "partial"
			default:
				rec.MemoryClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			}
		}
		result.Normalization.GPUs = append(result.Normalization.GPUs, rec)
	}
	return restore
}

// collectBenchmarkSamples polls GPU telemetry roughly once per second for
// durationSec seconds and returns the collected rows, each stamped with its
// elapsed time since sampling began. It returns early (with partial rows and
// ctx.Err()) if the context is cancelled; durationSec <= 0 yields nil, nil.
func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) {
	if durationSec <= 0 {
		return nil, nil
	}
	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
	var rows []GPUMetricRow
	start := time.Now()
	for {
		if ctx.Err() != nil {
			return rows, ctx.Err()
		}
		// Sampling errors are tolerated: a failed poll just skips this tick.
		samples, err := sampleBenchmarkTelemetry(gpuIndices)
		if err == nil {
			elapsed := time.Since(start).Seconds()
			for i := range samples {
				samples[i].ElapsedSec = elapsed
			}
			rows = append(rows, samples...)
} if time.Now().After(deadline) { break } select { case <-ctx.Done(): return rows, ctx.Err() case <-time.After(time.Second): } } return rows, nil } func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, logFunc func(string)) ([]byte, []GPUMetricRow, error) { stopCh := make(chan struct{}) doneCh := make(chan struct{}) var metricRows []GPUMetricRow start := time.Now() go func() { defer close(doneCh) ticker := time.NewTicker(time.Second) defer ticker.Stop() for { select { case <-stopCh: return case <-ticker.C: samples, err := sampleBenchmarkTelemetry(gpuIndices) if err != nil { continue } elapsed := time.Since(start).Seconds() for i := range samples { samples[i].ElapsedSec = elapsed } metricRows = append(metricRows, samples...) } } }() out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc) close(stopCh) <-doneCh return out, metricRows, err } type benchmarkPlannedPhase struct { PlanLabel string MetricStage string DurationSec int } func runBenchmarkPlannedCommandWithMetrics( ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, phases []benchmarkPlannedPhase, logFunc func(string), ) ([]byte, map[string][]GPUMetricRow, map[string][]byte, error) { out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, name, cmd, env, gpuIndices, logFunc) return out, splitBenchmarkRowsByPlannedPhase(rows, phases), splitBenchmarkLogByPlannedPhase(out), err } func splitBenchmarkRowsByPlannedPhase(rows []GPUMetricRow, phases []benchmarkPlannedPhase) map[string][]GPUMetricRow { out := make(map[string][]GPUMetricRow, len(phases)) if len(rows) == 0 || len(phases) == 0 { return out } for _, row := range rows { idx := len(phases) - 1 var elapsed float64 for i, phase := range phases { durationSec := phase.DurationSec if durationSec <= 0 { durationSec = 1 } elapsed += float64(durationSec) if row.ElapsedSec < elapsed { idx = i break } } out[phases[idx].MetricStage] 
= append(out[phases[idx].MetricStage], row) } return out } func splitBenchmarkLogByPlannedPhase(raw []byte) map[string][]byte { out := make(map[string][]byte) var current string for _, line := range strings.Split(strings.ReplaceAll(string(raw), "\r\n", "\n"), "\n") { trimmed := strings.TrimSpace(stripBenchmarkPrefix(line)) switch { case strings.HasPrefix(trimmed, "phase_begin="): current = strings.TrimSpace(strings.TrimPrefix(trimmed, "phase_begin=")) case strings.HasPrefix(trimmed, "phase_end="): current = "" case current != "": out[current] = append(out[current], []byte(line+"\n")...) } } return out } type benchmarkCoolingSample struct { AvgFanRPM float64 AvgFanDutyCyclePct float64 FanDutyCycleAvailable bool } func sampleBenchmarkTelemetry(gpuIndices []int) ([]GPUMetricRow, error) { samples, err := sampleGPUMetrics(gpuIndices) if err != nil { return nil, err } fanSample := sampleBenchmarkCoolingSample() for i := range samples { samples[i].FanAvgRPM = fanSample.AvgFanRPM samples[i].FanDutyCyclePct = fanSample.AvgFanDutyCyclePct samples[i].FanDutyCycleAvailable = fanSample.FanDutyCycleAvailable } return samples, nil } func sampleBenchmarkCoolingSample() benchmarkCoolingSample { fans, _ := sampleFanSpeeds() avgRPM, _, _ := fanRPMStats(fans) dutyPct, dutyAvailable := sampleFanDutyCyclePct() return benchmarkCoolingSample{ AvgFanRPM: avgRPM, AvgFanDutyCyclePct: dutyPct, FanDutyCycleAvailable: dutyAvailable, } } func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset, durationSec float64) []GPUMetricRow { if len(rows) == 0 { return nil } stageEnd := offset + durationSec if stageEnd <= offset { stageEnd = offset for _, row := range rows { if row.ElapsedSec+offset > stageEnd { stageEnd = row.ElapsedSec + offset } } } out := make([]GPUMetricRow, len(rows)) for i, row := range rows { row.Stage = stage row.ElapsedSec += offset row.StageStartSec = offset row.StageEndSec = stageEnd out[i] = row } return out } func appendBenchmarkMetrics(allRows 
*[]GPUMetricRow, rows []GPUMetricRow, stage string, cursor *float64, durationSec float64) { annotated := annotateBenchmarkMetricRows(rows, stage, *cursor, durationSec) *allRows = append(*allRows, annotated...) *cursor += durationSec } func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) { if len(rows) == 0 { return } _ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), rows) _ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), rows) } func appendBenchmarkStageLog(path, source, stage string, raw []byte) { if path == "" || len(raw) == 0 { return } f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) if err != nil { return } defer f.Close() header := fmt.Sprintf("\n========== %s | stage=%s ==========\n", source, stage) _, _ = f.WriteString(header) if len(raw) > 0 { _, _ = f.Write(raw) if raw[len(raw)-1] != '\n' { _, _ = f.WriteString("\n") } } } func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult { result := benchmarkBurnParseResult{} lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") profiles := make(map[string]*benchmarkBurnProfile) for _, line := range lines { line = stripBenchmarkPrefix(strings.TrimSpace(line)) if line == "" { continue } switch { case strings.HasPrefix(line, "device="): result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device=")) case strings.HasPrefix(line, "compute_capability="): result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability=")) case strings.HasPrefix(line, "backend="): result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend=")) result.Fallback = result.Backend == "driver-ptx" case strings.HasPrefix(line, "duration_s="): result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s="))) default: if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 { profile := ensureBenchmarkProfile(profiles, m[1]) profile.supported = true profile.lanes++ profile.m, 
_ = strconv.ParseUint(m[3], 10, 64) profile.n, _ = strconv.ParseUint(m[4], 10, 64) profile.k, _ = strconv.ParseUint(m[5], 10, 64) continue } if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 { profile := ensureBenchmarkProfile(profiles, m[1]) profile.supported = false profile.notes = strings.TrimSpace(m[2]) continue } if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 { profile := ensureBenchmarkProfile(profiles, m[1]) iters, _ := strconv.ParseUint(m[2], 10, 64) profile.iterations += iters } } } keys := make([]string, 0, len(profiles)) for key := range profiles { keys = append(keys, key) } sort.Strings(keys) for _, key := range keys { profile := profiles[key] precision := BenchmarkPrecisionResult{ Name: profile.name, Category: profile.category, Supported: profile.supported, Lanes: profile.lanes, M: profile.m, N: profile.n, K: profile.k, Iterations: profile.iterations, Notes: profile.notes, } w := precisionWeight(profile.category) precision.Weight = w if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 { precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12 precision.WeightedTeraOpsPerSec = precision.TeraOpsPerSec * w } result.Profiles = append(result.Profiles, precision) } return result } func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile { if profile, ok := profiles[name]; ok { return profile } category := "other" switch { case strings.HasPrefix(name, "fp64"): category = "fp64" case strings.HasPrefix(name, "fp32"): category = "fp32_tf32" case strings.HasPrefix(name, "fp16"): category = "fp16_bf16" case strings.HasPrefix(name, "int8"): category = "int8" case strings.HasPrefix(name, "fp8"): category = "fp8" case strings.HasPrefix(name, "fp4"): category = "fp4" } profile := 
&benchmarkBurnProfile{name: name, category: category, supported: true} profiles[name] = profile return profile } // precisionWeight returns the fp32-equivalence factor for a precision category. // Each factor represents how much "real" numeric work one operation of that // type performs relative to fp32 (single precision = 1.0 baseline): // // fp64 = 2.0 — double precision, 2× more bits per operand // fp32 = 1.0 — single precision baseline // fp16 = 0.5 — half precision // int8 = 0.25 — quarter precision // fp8 = 0.25 — quarter precision // fp4 = 0.125 — eighth precision // // Multiplying raw TOPS by the weight gives fp32-equivalent TOPS, enabling // cross-precision comparison on the same numeric scale. func precisionWeight(category string) float64 { switch category { case "fp64": return 2.0 case "fp32_tf32": return 1.0 case "fp16_bf16": return 0.5 case "int8": return 0.25 case "fp8": return 0.25 case "fp4": return 0.125 default: return 1.0 } } func stripBenchmarkPrefix(line string) string { if strings.HasPrefix(line, "[gpu ") { if idx := strings.Index(line, "] "); idx >= 0 { return line[idx+2:] } } return line } func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary { summary := BenchmarkTelemetrySummary{} if len(rows) == 0 { return summary } temps := make([]float64, 0, len(rows)) powers := make([]float64, 0, len(rows)) clocks := make([]float64, 0, len(rows)) memClocks := make([]float64, 0, len(rows)) usages := make([]float64, 0, len(rows)) memUsages := make([]float64, 0, len(rows)) summary.DurationSec = rows[len(rows)-1].ElapsedSec summary.Samples = len(rows) for _, row := range rows { temps = append(temps, row.TempC) powers = append(powers, row.PowerW) clocks = append(clocks, row.ClockMHz) memClocks = append(memClocks, row.MemClockMHz) usages = append(usages, row.UsagePct) memUsages = append(memUsages, row.MemUsagePct) } summary.AvgTempC = benchmarkMean(temps) summary.P95TempC = benchmarkPercentile(temps, 95) summary.AvgPowerW = 
benchmarkMean(powers) summary.P95PowerW = benchmarkPercentile(powers, 95) summary.AvgGraphicsClockMHz = benchmarkMean(clocks) summary.P95GraphicsClockMHz = benchmarkPercentile(clocks, 95) summary.AvgMemoryClockMHz = benchmarkMean(memClocks) summary.P95MemoryClockMHz = benchmarkPercentile(memClocks, 95) summary.AvgUsagePct = benchmarkMean(usages) summary.AvgMemUsagePct = benchmarkMean(memUsages) summary.ClockCVPct = benchmarkCV(clocks) summary.PowerCVPct = benchmarkCV(powers) summary.TempCVPct = benchmarkCV(temps) summary.ClockDriftPct = benchmarkClockDrift(clocks) return summary } func summarizeBenchmarkCooling(rows []GPUMetricRow) *BenchmarkCoolingSummary { if len(rows) == 0 { return nil } var rpmValues []float64 var dutyValues []float64 for _, row := range rows { if row.FanAvgRPM > 0 { rpmValues = append(rpmValues, row.FanAvgRPM) } if row.FanDutyCycleAvailable { dutyValues = append(dutyValues, row.FanDutyCyclePct) } } if len(rpmValues) == 0 && len(dutyValues) == 0 { return nil } summary := &BenchmarkCoolingSummary{ Available: true, AvgFanRPM: benchmarkMean(rpmValues), } if len(dutyValues) > 0 { summary.FanDutyCycleAvailable = true summary.AvgFanDutyCyclePct = benchmarkMean(dutyValues) summary.P95FanDutyCyclePct = benchmarkPercentile(dutyValues, 95) } else { summary.Notes = append(summary.Notes, "fan duty cycle unavailable on this host; RPM-only fan telemetry was collected") } return summary } func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard { score := BenchmarkScorecard{} // SyntheticScore: sum of fp32-equivalent TOPS from per-precision phases. // Each precision ran alone with full GPU dedicated — peak capability. for _, p := range gpu.PrecisionSteady { score.SyntheticScore += p.WeightedTeraOpsPerSec } // MixedScore: sum of fp32-equivalent TOPS from the combined phase. // All precisions compete simultaneously — closer to real inference workloads. 
for _, p := range gpu.PrecisionResults { if p.Supported { score.MixedScore += p.WeightedTeraOpsPerSec } } // MixedEfficiency = MixedScore / SyntheticScore. // Measures how well the GPU sustains throughput under concurrent mixed load. // A healthy GPU scores ~0.8–0.95; severe degradation suggests bandwidth // contention or scheduler inefficiency. if score.SyntheticScore > 0 && score.MixedScore > 0 { score.MixedEfficiency = score.MixedScore / score.SyntheticScore } // ComputeScore = SyntheticScore × (1 + MixedEfficiency × 0.3). // SyntheticScore is the primary signal; MixedEfficiency adds up to +30% // bonus for GPUs that handle mixed-precision concurrency well. // Falls back to MixedScore alone when per-precision data is absent. switch { case score.SyntheticScore > 0: score.ComputeScore = score.SyntheticScore * (1 + score.MixedEfficiency*0.3) case score.MixedScore > 0: score.ComputeScore = score.MixedScore } // PowerSustainScore: how stable is GPU power draw during the benchmark? // High variance means the workload is bursting or the power delivery is // unstable. Score = max(0, 100 − PowerCVPct × 3). // At 10% CV → score 70; at 33%+ CV → score 0. // Uses per-precision windows when available (each runs a single kernel, // so CV reflects genuine power regulation, not workload switching). if len(gpu.PrecisionSteady) > 0 { var sum float64 for _, p := range gpu.PrecisionSteady { sum += clampScore(100 - p.Steady.PowerCVPct*3) } score.PowerSustainScore = sum / float64(len(gpu.PrecisionSteady)) } else if gpu.Steady.PowerCVPct > 0 { score.PowerSustainScore = clampScore(100 - gpu.Steady.PowerCVPct*3) } // ThermalSustainScore: how stable is GPU temperature during the benchmark? // High variance means cooling is inconsistent (fan bursts, liquid flow // instability, or frequent transitions in and out of throttle). // Score = max(0, 100 − TempCVPct × 3). 
if gpu.Steady.TempCVPct > 0 { score.ThermalSustainScore = clampScore(100 - gpu.Steady.TempCVPct*3) } else { // TempCV not recorded — fall back to 100 (no penalty). score.ThermalSustainScore = 100 } // StabilityScore: what fraction of the benchmark did the GPU spend throttling? // Counts both thermal (HW+SW) and power-cap throttle events. // Score = max(0, 100 − throttle_ratio × 100). // 1% throttle → score 99; 10% throttle → score 90; 100% → score 0. runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) throttleUS := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) + float64(gpu.Throttle.SWPowerCapUS) score.StabilityScore = clampScore(100 - throttleUS/runtimeUS*100) score.CompositeScore = compositeBenchmarkScore(score) if gpu.MultiprocessorCount > 0 && gpu.Steady.AvgGraphicsClockMHz > 0 && score.ComputeScore > 0 { score.TOPSPerSMPerGHz = score.ComputeScore / float64(gpu.MultiprocessorCount) / (gpu.Steady.AvgGraphicsClockMHz / 1000.0) } return score } func compositeBenchmarkScore(score BenchmarkScorecard) float64 { // quality_factor weights: // base 0.35 — floor so a GPU that fails all sustain checks still scores // StabilityScore 0.35 — throttle time: heaviest, direct signal of GPU not keeping up // PowerSustainScore 0.15 — power variance: unstable draw hints at regulation issues // ThermalSustainScore 0.15 — temp variance: unstable cooling hints at airflow issues // cap 1.00 quality := 0.35 + 0.35*(score.StabilityScore/100.0) + 0.15*(score.PowerSustainScore/100.0) + 0.15*(score.ThermalSustainScore/100.0) if quality > 1.00 { quality = 1.00 } return score.ComputeScore * quality } func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string { var reasons []string runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6) if float64(gpu.Throttle.SWPowerCapUS)/runtimeUS >= 0.05 { reasons = append(reasons, "power_capped") } if float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS >= 
0.01 { reasons = append(reasons, "thermal_limited") } if float64(gpu.Throttle.SyncBoostUS)/runtimeUS >= 0.01 { reasons = append(reasons, "sync_boost_limited") } if gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90 { reasons = append(reasons, "low_sm_clock_vs_target") } if gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85 { reasons = append(reasons, "variance_too_high") } if normalizationStatus != "full" { reasons = append(reasons, "normalization_partial") } if gpu.PowerLimitDerated { reasons = append(reasons, "power_limit_derated") } if gpu.ECC.Uncorrected > 0 { reasons = append(reasons, "ecc_uncorrected_errors") } if gpu.ECC.Corrected > 0 { reasons = append(reasons, "ecc_corrected_errors") } return dedupeStrings(reasons) } func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult { result := &BenchmarkInterconnectResult{ Status: "UNSUPPORTED", Attempted: true, SelectedGPUIndices: append([]int(nil), gpuIndices...), } cmd := []string{ "all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2", "-g", strconv.Itoa(len(gpuIndices)), "--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)), } env := []string{ "CUDA_DEVICE_ORDER=PCI_BUS_ID", "CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices), } logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices))) out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc) _ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644) if err != nil { result.Notes = append(result.Notes, strings.TrimSpace(string(out))) return result } avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out)) result.Status = "OK" result.Supported = true result.AvgAlgBWGBps = avgAlg result.MaxAlgBWGBps = maxAlg result.AvgBusBWGBps = avgBus result.MaxBusBWGBps = maxBus return result } func 
parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) { lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") var algs []float64 var buses []float64 for _, line := range lines { line = strings.TrimSpace(line) if line == "" || strings.HasPrefix(line, "#") { continue } fields := strings.Fields(line) if len(fields) < 8 { continue } for i := 0; i+2 < len(fields); i++ { timeVal, err1 := strconv.ParseFloat(fields[i], 64) algVal, err2 := strconv.ParseFloat(fields[i+1], 64) busVal, err3 := strconv.ParseFloat(fields[i+2], 64) if err1 == nil && err2 == nil && err3 == nil && timeVal > 0 { algs = append(algs, algVal) buses = append(buses, busVal) break } } } if len(algs) == 0 { return 0, 0, 0, 0 } return benchmarkMean(algs), benchmarkMax(algs), benchmarkMean(buses), benchmarkMax(buses) } func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) { out, err := satExecCommand( "nvidia-smi", "--id="+strconv.Itoa(gpuIndex), "--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown", "--format=csv,noheader,nounits", ).Output() if err != nil { return BenchmarkThrottleCounters{}, err } fields := strings.Split(strings.TrimSpace(string(out)), ",") if len(fields) < 5 { return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out))) } return BenchmarkThrottleCounters{ SWPowerCapUS: parseBenchmarkUint64(fields[0]), SWThermalSlowdownUS: parseBenchmarkUint64(fields[1]), SyncBoostUS: parseBenchmarkUint64(fields[2]), HWThermalSlowdownUS: parseBenchmarkUint64(fields[3]), HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]), }, nil } func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters { return BenchmarkThrottleCounters{ SWPowerCapUS: 
saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS), SWThermalSlowdownUS: saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS), SyncBoostUS: saturatingSub(after.SyncBoostUS, before.SyncBoostUS), HWThermalSlowdownUS: saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS), HWPowerBrakeSlowdownUS: saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS), } } func queryECCCounters(gpuIndex int) (BenchmarkECCCounters, error) { out, err := satExecCommand( "nvidia-smi", "--id="+strconv.Itoa(gpuIndex), "--query-gpu=ecc.errors.corrected.volatile.total,ecc.errors.uncorrected.volatile.total", "--format=csv,noheader,nounits", ).Output() if err != nil { return BenchmarkECCCounters{}, err } fields := strings.Split(strings.TrimSpace(string(out)), ",") if len(fields) < 2 { return BenchmarkECCCounters{}, fmt.Errorf("unexpected ECC counter columns: %q", strings.TrimSpace(string(out))) } corrected, err1 := strconv.ParseUint(strings.TrimSpace(fields[0]), 10, 64) uncorrected, err2 := strconv.ParseUint(strings.TrimSpace(fields[1]), 10, 64) if err1 != nil || err2 != nil { // ECC may be disabled on this GPU — return zero counters silently. return BenchmarkECCCounters{}, nil } return BenchmarkECCCounters{Corrected: corrected, Uncorrected: uncorrected}, nil } func diffECCCounters(before, after BenchmarkECCCounters) BenchmarkECCCounters { return BenchmarkECCCounters{ Corrected: saturatingSub(after.Corrected, before.Corrected), Uncorrected: saturatingSub(after.Uncorrected, before.Uncorrected), } } func queryActiveComputeApps(gpuIndices []int) ([]string, error) { args := []string{ "--query-compute-apps=gpu_uuid,pid,process_name", "--format=csv,noheader,nounits", } if len(gpuIndices) > 0 { args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...) 
} out, err := satExecCommand("nvidia-smi", args...).Output() if err != nil { return nil, err } var lines []string for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { line = strings.TrimSpace(line) if line == "" { continue } lines = append(lines, line) } return lines, nil } func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult { if gpu.Status == "" { gpu.Status = "OK" } if gpu.Scores.CompositeScore == 0 { gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores) } return gpu } func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string { var findings []string passed := 0 for _, gpu := range result.GPUs { if gpu.Status == "OK" { passed++ } } total := len(result.GPUs) if total > 0 { if passed == total { findings = append(findings, fmt.Sprintf("All %d GPU(s) passed the benchmark.", total)) } else { findings = append(findings, fmt.Sprintf("%d of %d GPU(s) passed the benchmark.", passed, total)) } } if result.Normalization.Status != "full" { findings = append(findings, "Environment normalization was partial; compare results with caution.") } for _, gpu := range result.GPUs { if gpu.Status == "FAILED" && len(gpu.DegradationReasons) == 0 { findings = append(findings, fmt.Sprintf("GPU %d failed the benchmark (check verbose.log for details).", gpu.Index)) continue } if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" { findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index)) continue } for _, reason := range gpu.DegradationReasons { switch reason { case "power_capped": findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index)) case "thermal_limited": msg := fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index) if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 98 && gpu.Steady.ClockDriftPct >= 20 { msg += fmt.Sprintf( 
" Fans peaked at %.0f%% duty cycle (not at maximum) while clocks dropped %.0f%% — possible cooling misconfiguration; rerun the benchmark with fan speed manually fixed at 100%%.", result.Cooling.P95FanDutyCyclePct, gpu.Steady.ClockDriftPct, ) } findings = append(findings, msg) case "sync_boost_limited": findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index)) case "low_sm_clock_vs_target": findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index)) case "variance_too_high": findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index)) case "normalization_partial": findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index)) case "power_limit_derated": findings = append(findings, fmt.Sprintf("GPU %d could not sustain targeted_power in this server at the default limit; benchmark ran derated at %.0f W.", gpu.Index, gpu.PowerLimitW)) case "ecc_uncorrected_errors": findings = append(findings, fmt.Sprintf("GPU %d reported %d uncorrected ECC error(s) — possible hardware fault.", gpu.Index, gpu.ECC.Uncorrected)) case "ecc_corrected_errors": findings = append(findings, fmt.Sprintf("GPU %d reported %d corrected ECC error(s) — possible DRAM degradation.", gpu.Index, gpu.ECC.Corrected)) } } if gpu.CoolingWarning != "" { findings = append(findings, fmt.Sprintf( "GPU %d: %s. 
Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.", gpu.Index, gpu.CoolingWarning, )) } if len(gpu.PrecisionFailures) > 0 { findings = append(findings, fmt.Sprintf("GPU %d had incomplete precision coverage: %s.", gpu.Index, strings.Join(gpu.PrecisionFailures, ", "))) } if gpu.Backend == "driver-ptx" { findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index)) } if gpu.DefaultPowerLimitW > 0 && gpu.PowerLimitW > 0 && gpu.PowerLimitW < gpu.DefaultPowerLimitW*0.95 { findings = append(findings, fmt.Sprintf( "GPU %d power limit %.0f W is below default %.0f W (%.0f%%). Performance may be artificially reduced.", gpu.Index, gpu.PowerLimitW, gpu.DefaultPowerLimitW, gpu.PowerLimitW/gpu.DefaultPowerLimitW*100, )) } // Flag significant TDP deviation (over or under) from calibration. if gpu.CalibratedPeakPowerW > 0 { ref := gpu.DefaultPowerLimitW if ref <= 0 { ref = gpu.PowerLimitW } if ref > 0 { deviationPct := (gpu.CalibratedPeakPowerW - ref) / ref * 100 switch { case deviationPct < -10: findings = append(findings, fmt.Sprintf( "GPU %d reached only %.0f W (%.0f%% of rated %.0f W) under targeted_power. Check power delivery or cooling.", gpu.Index, gpu.CalibratedPeakPowerW, gpu.CalibratedPeakPowerW/ref*100, ref, )) case deviationPct > 5: findings = append(findings, fmt.Sprintf( "GPU %d exceeded rated TDP: %.0f W measured vs %.0f W rated (+%.0f%%). Power limit may not be enforced correctly.", gpu.Index, gpu.CalibratedPeakPowerW, ref, deviationPct, )) } } } } if result.Interconnect != nil && result.Interconnect.Supported { findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps)) } if cl := result.CPULoad; cl != nil { switch cl.Status { case "high": findings = append(findings, fmt.Sprintf( "Host CPU load was elevated during the benchmark (avg %.1f%%, max %.1f%%). 
A competing CPU workload may skew GPU results.", cl.AvgPct, cl.MaxPct, )) case "unstable": findings = append(findings, fmt.Sprintf( "Host CPU load was erratic during the benchmark (avg %.1f%%, p95 %.1f%%). Results may be less reproducible.", cl.AvgPct, cl.P95Pct, )) } } if sp := result.ServerPower; sp != nil && sp.Available && sp.GPUReportedSumW > 0 { if sp.ReportingRatio < 0.75 { findings = append(findings, fmt.Sprintf( "GPU power reporting may be unreliable: server delta %.0f W vs GPU-reported %.0f W (ratio %.2f). GPU telemetry likely over-reports actual consumption. Composite scores have been penalized accordingly.", sp.DeltaW, sp.GPUReportedSumW, sp.ReportingRatio, )) } else if sp.ReportingRatio > 1.25 { findings = append(findings, fmt.Sprintf( "Server power delta %.0f W exceeds GPU-reported sum %.0f W by %.0f%%. Other components (CPU, NVMe, networking) may be drawing substantial power under GPU load.", sp.DeltaW, sp.GPUReportedSumW, (sp.ReportingRatio-1)*100, )) } } return dedupeStrings(findings) } func benchmarkOverallStatus(result NvidiaBenchmarkResult) string { if len(result.GPUs) == 0 { return "FAILED" } hasOK := false hasPartial := result.Normalization.Status != "full" for _, gpu := range result.GPUs { switch gpu.Status { case "OK": hasOK = true case "PARTIAL", "UNSUPPORTED": hasPartial = true } } if !hasOK { return "FAILED" } if hasPartial { return "PARTIAL" } return "OK" } func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU { for i := range items { if items[i].Index == idx { return &items[i] } } return nil } func classifySATErrorStatus(out []byte, err error) string { status, _ := classifySATResult("benchmark", out, err) if status == "UNSUPPORTED" { return "UNSUPPORTED" } return "FAILED" } func parseBenchmarkFloat(raw string) float64 { raw = strings.TrimSpace(raw) if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") { return 0 } value, _ := strconv.ParseFloat(raw, 
64) return value } func parseBenchmarkUint64(raw string) uint64 { raw = strings.TrimSpace(raw) if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") { return 0 } value, _ := strconv.ParseUint(raw, 10, 64) return value } func benchmarkMean(values []float64) float64 { if len(values) == 0 { return 0 } var sum float64 for _, value := range values { sum += value } return sum / float64(len(values)) } func benchmarkPercentile(values []float64, p float64) float64 { if len(values) == 0 { return 0 } copyValues := append([]float64(nil), values...) sort.Float64s(copyValues) if len(copyValues) == 1 { return copyValues[0] } rank := (p / 100.0) * float64(len(copyValues)-1) lower := int(math.Floor(rank)) upper := int(math.Ceil(rank)) if lower == upper { return copyValues[lower] } frac := rank - float64(lower) return copyValues[lower] + (copyValues[upper]-copyValues[lower])*frac } func benchmarkCV(values []float64) float64 { if len(values) == 0 { return 0 } mean := benchmarkMean(values) if mean == 0 { return 0 } var variance float64 for _, value := range values { diff := value - mean variance += diff * diff } variance /= float64(len(values)) return math.Sqrt(variance) / mean * 100 } func benchmarkClockDrift(values []float64) float64 { if len(values) < 4 { return 0 } window := len(values) / 4 if window < 1 { window = 1 } head := benchmarkMean(values[:window]) tail := benchmarkMean(values[len(values)-window:]) if head <= 0 || tail >= head { return 0 } return ((head - tail) / head) * 100 } func benchmarkMax(values []float64) float64 { var max float64 for i, value := range values { if i == 0 || value > max { max = value } } return max } func clampScore(value float64) float64 { switch { case value < 0: return 0 case value > 100: return 100 default: return value } } func dedupeStrings(values []string) []string { if len(values) == 0 { return nil } seen := make(map[string]struct{}, len(values)) out := make([]string, 0, len(values)) for _, value := range 
values { value = strings.TrimSpace(value) if value == "" { continue } if _, ok := seen[value]; ok { continue } seen[value] = struct{}{} out = append(out, value) } return out } func saturatingSub(after, before uint64) uint64 { if after <= before { return 0 } return after - before } func maxInt(a, b int) int { if a > b { return a } return b } // queryIPMIServerPowerW reads the current server power draw via ipmitool dcmi. // Returns 0 and an error if IPMI is unavailable or the output cannot be parsed. func queryIPMIServerPowerW() (float64, error) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() cmd := exec.CommandContext(ctx, "ipmitool", "dcmi", "power", "reading") out, err := cmd.Output() if err != nil { return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err) } if w := parseDCMIPowerReading(string(out)); w > 0 { return w, nil } return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output") } // sampleIPMIPowerSeries collects IPMI power readings every 2 seconds for // durationSec seconds. Returns the mean of all successful samples. // Returns 0, false if IPMI is unavailable. func sampleIPMIPowerSeries(ctx context.Context, durationSec int) (meanW float64, ok bool) { if durationSec <= 0 { return 0, false } deadline := time.Now().Add(time.Duration(durationSec) * time.Second) var samples []float64 loop: for { if w, err := queryIPMIServerPowerW(); err == nil { samples = append(samples, w) } if time.Now().After(deadline) { break } select { case <-ctx.Done(): break loop case <-time.After(2 * time.Second): } } if len(samples) == 0 { return 0, false } var sum float64 for _, w := range samples { sum += w } return sum / float64(len(samples)), true } // characterizeServerPower computes BenchmarkServerPower from idle and loaded // IPMI samples plus the GPU-reported average power during steady state. 
func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvailable bool) *BenchmarkServerPower { sp := &BenchmarkServerPower{Available: ipmiAvailable} if !ipmiAvailable { sp.Notes = append(sp.Notes, "IPMI power reading unavailable; server-side power characterization skipped") return sp } sp.IdleW = idleW sp.LoadedW = loadedW sp.DeltaW = loadedW - idleW sp.GPUReportedSumW = gpuReportedSumW if gpuReportedSumW > 0 && sp.DeltaW > 0 { sp.ReportingRatio = sp.DeltaW / gpuReportedSumW } return sp } // readServerModel returns the DMI system product name (e.g. "SuperMicro SYS-421GE-TNRT"). // Returns empty string if unavailable (non-Linux or missing DMI entry). func readServerModel() string { data, err := os.ReadFile("/sys/class/dmi/id/product_name") if err != nil { return "" } return strings.TrimSpace(string(data)) } // filterRowsByGPU returns only the metric rows for a specific GPU index. func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow { var out []GPUMetricRow for _, r := range rows { if r.GPUIndex == gpuIndex { out = append(out, r) } } return out } // parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix // and returns a per-GPU parse result map. func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult { gpuLines := make(map[int][]string) for _, line := range strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") { line = strings.TrimSpace(line) if !strings.HasPrefix(line, "[gpu ") { continue } end := strings.Index(line, "] ") if end < 0 { continue } gpuIdx, err := strconv.Atoi(strings.TrimSpace(line[5:end])) if err != nil { continue } gpuLines[gpuIdx] = append(gpuLines[gpuIdx], line[end+2:]) } results := make(map[int]benchmarkBurnParseResult, len(gpuLines)) for gpuIdx, lines := range gpuLines { // Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog // calls stripBenchmarkPrefix which is a no-op on already-stripped lines. 
results[gpuIdx] = parseBenchmarkBurnLog(strings.Join(lines, "\n")) } return results } // runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs // simultaneously using a single bee-gpu-burn invocation per phase. func runNvidiaBenchmarkParallel( ctx context.Context, verboseLog, runDir string, selected []int, infoByIndex map[int]benchmarkGPUInfo, opts NvidiaBenchmarkOptions, spec benchmarkProfileSpec, logFunc func(string), result *NvidiaBenchmarkResult, calibByIndex map[int]benchmarkPowerCalibrationResult, serverIdleW *float64, serverLoadedWSum *float64, serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int, allMetricRows *[]GPUMetricRow, metricTimelineSec *float64, gpuBurnLog string, ) { allDevices := joinIndexList(selected) // Build per-GPU result stubs. gpuResults := make(map[int]*BenchmarkGPUResult, len(selected)) for _, idx := range selected { r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"} if info, ok := infoByIndex[idx]; ok { r.UUID = info.UUID r.Name = info.Name r.BusID = info.BusID r.VBIOS = info.VBIOS r.PowerLimitW = info.PowerLimitW r.MultiprocessorCount = info.MultiprocessorCount r.DefaultPowerLimitW = info.DefaultPowerLimitW r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz r.MaxMemoryClockMHz = info.MaxMemoryClockMHz } if calib, ok := calibByIndex[idx]; ok { r.CalibratedPeakPowerW = calib.Summary.P95PowerW r.CalibratedPeakTempC = calib.Summary.P95TempC r.PowerCalibrationTries = calib.Attempts r.PowerLimitDerated = calib.Derated r.Notes = append(r.Notes, calib.Notes...) if calib.CoolingWarning != "" { r.CoolingWarning = calib.CoolingWarning } } if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { r.LockedGraphicsClockMHz = norm.GPUClockLockMHz r.LockedMemoryClockMHz = norm.MemoryClockLockMHz } gpuResults[idx] = r } // Baseline: sample all GPUs together. 
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected) if err != nil && err != context.Canceled { for _, idx := range selected { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error()) } } for _, idx := range selected { perGPU := filterRowsByGPU(baselineRows, idx) gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU) } appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline", metricTimelineSec, float64(spec.BaselineSec)) // Sample server idle power once. if !*serverIdleOK { if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok { *serverIdleW = w *serverIdleOK = true logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) } } // Warmup: all GPUs simultaneously. warmupCmd := []string{ "bee-gpu-burn", "--seconds", strconv.Itoa(spec.WarmupSec), "--size-mb", strconv.Itoa(opts.SizeMB), "--devices", allDevices, } logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec)) warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc) appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup", metricTimelineSec, float64(spec.WarmupSec)) appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut) if warmupErr != nil { for _, idx := range selected { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error()) } } warmupParseByGPU := parseBenchmarkBurnLogByGPU(string(warmupOut)) supportedPrecisions := append([]string(nil), benchmarkPrecisionPhases...) 
for _, idx := range selected { if pr, ok := warmupParseByGPU[idx]; ok && pr.ComputeCapability != "" { if gpuResults[idx].ComputeCapability == "" { gpuResults[idx].ComputeCapability = pr.ComputeCapability } if ccPrecisions := benchmarkSupportedPrecisions(pr.ComputeCapability); len(ccPrecisions) < len(supportedPrecisions) { supportedPrecisions = ccPrecisions } } } // Run synthetic precision phases and the combined steady phase as one // uninterrupted command so the GPUs stay hot between windows. eccBase := make(map[int]BenchmarkECCCounters, len(selected)) for _, idx := range selected { eccBase[idx], _ = queryECCCounters(idx) } planLabels, planPhases, basePhaseSec, mixedPhaseSec := buildBenchmarkSteadyPlan(spec, supportedPrecisions, func(label string) string { if label == "mixed" { return "steady" } return "gpu-all-steady-" + label }) planCmd := []string{ "bee-gpu-burn", "--seconds", strconv.Itoa(basePhaseSec), "--size-mb", strconv.Itoa(opts.SizeMB), "--devices", allDevices, "--precision-plan", strings.Join(planLabels, ","), "--precision-plan-seconds", benchmarkPlanDurationsCSV(planPhases), } logFunc(fmt.Sprintf("GPUs %s: uninterrupted precision plan (%d precision phases x %ds, mixed %ds)", allDevices, len(supportedPrecisions), basePhaseSec, mixedPhaseSec)) _, phaseRowsByStage, phaseLogs, planErr := runBenchmarkPlannedCommandWithMetrics(ctx, verboseLog, "gpu-all-precision-plan.log", planCmd, nil, selected, planPhases, logFunc) for _, phaseSpec := range planPhases { if rows := phaseRowsByStage[phaseSpec.MetricStage]; len(rows) > 0 { appendBenchmarkMetrics(allMetricRows, rows, phaseSpec.MetricStage, metricTimelineSec, float64(phaseSpec.DurationSec)) } appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseSpec.MetricStage, phaseLogs[phaseSpec.PlanLabel]) } for _, prec := range supportedPrecisions { phaseLogName := "gpu-all-steady-" + prec phaseRows := phaseRowsByStage[phaseLogName] parseByGPU := parseBenchmarkBurnLogByGPU(string(phaseLogs[prec])) for _, idx := range 
selected { perGPU := filterRowsByGPU(phaseRows, idx) phase := BenchmarkPrecisionSteadyPhase{ Precision: prec, Status: "OK", Steady: summarizeBenchmarkTelemetry(perGPU), } if status, note := benchmarkPlannedPhaseStatus(phaseLogs[prec]); status != "OK" { phase.Status = status phase.Notes = note gpuResults[idx].PrecisionFailures = append(gpuResults[idx].PrecisionFailures, prec+":"+status) } if pr, ok := parseByGPU[idx]; ok { for _, p := range pr.Profiles { if p.Supported { phase.TeraOpsPerSec += p.TeraOpsPerSec phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec } } } gpuResults[idx].PrecisionSteady = append(gpuResults[idx].PrecisionSteady, phase) } } // Snapshot throttle counters before steady. beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected)) for _, idx := range selected { beforeThrottle[idx], _ = queryThrottleCounters(idx) } logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (combined, %ds)", allDevices, mixedPhaseSec)) // Sample server power via IPMI in parallel with steady phase. 
ipmiStopCh := make(chan struct{}) ipmiResultCh := make(chan float64, 1) go func() { defer close(ipmiResultCh) var samples []float64 ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() select { case <-ipmiStopCh: return case <-time.After(15 * time.Second): } for { if w, err := queryIPMIServerPowerW(); err == nil { samples = append(samples, w) } select { case <-ipmiStopCh: if len(samples) > 0 { var sum float64 for _, w := range samples { sum += w } ipmiResultCh <- sum / float64(len(samples)) } return case <-ticker.C: } } }() close(ipmiStopCh) if loadedW, ok := <-ipmiResultCh; ok { *serverLoadedWSum += loadedW (*serverLoadedSamples)++ *serverLoadedOK = true logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW)) } afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected)) for _, idx := range selected { afterThrottle[idx], _ = queryThrottleCounters(idx) } steadyRows := phaseRowsByStage["steady"] parseResults := parseBenchmarkBurnLogByGPU(string(phaseLogs["mixed"])) for _, idx := range selected { perGPU := filterRowsByGPU(steadyRows, idx) gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx]) if eccFinal, err := queryECCCounters(idx); err == nil { gpuResults[idx].ECC = diffECCCounters(eccBase[idx], eccFinal) } if pr, ok := parseResults[idx]; ok { gpuResults[idx].ComputeCapability = pr.ComputeCapability gpuResults[idx].Backend = pr.Backend gpuResults[idx].PrecisionResults = pr.Profiles if pr.Fallback { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable") } } if planErr != nil { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "precision plan failed: "+planErr.Error()) } } // Cooldown: all GPUs together. 
if spec.CooldownSec > 0 { cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected) if err != nil && err != context.Canceled { for _, idx := range selected { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error()) } } for _, idx := range selected { perGPU := filterRowsByGPU(cooldownRows, idx) gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU) } appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown", metricTimelineSec, float64(spec.CooldownSec)) } // Score and finalize each GPU. for _, idx := range selected { r := gpuResults[idx] r.Scores = scoreBenchmarkGPUResult(*r) r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status) pr := parseResults[idx] switch { case planErr != nil: r.Status = classifySATErrorStatus(phaseLogs["mixed"], planErr) case len(r.PrecisionFailures) > 0: r.Status = "PARTIAL" case pr.Fallback: r.Status = "PARTIAL" default: r.Status = "OK" } result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r)) } } // readBenchmarkHostConfig reads static CPU and memory configuration from // /proc/cpuinfo and /proc/meminfo. Returns nil if neither source is readable. func readBenchmarkHostConfig() *BenchmarkHostConfig { cfg := &BenchmarkHostConfig{} populated := false // Parse /proc/cpuinfo for CPU model, sockets, cores, threads. 
if data, err := os.ReadFile("/proc/cpuinfo"); err == nil { socketIDs := map[string]struct{}{} coresPerSocket := map[string]int{} var modelName string threads := 0 for _, line := range strings.Split(string(data), "\n") { kv := strings.SplitN(line, ":", 2) if len(kv) != 2 { continue } key := strings.TrimSpace(kv[0]) val := strings.TrimSpace(kv[1]) switch key { case "processor": threads++ case "model name": if modelName == "" { modelName = val } case "physical id": socketIDs[val] = struct{}{} case "cpu cores": // Overwrite per-socket core count (last wins per socket, but all // entries for the same socket report the same value). if physLine := ""; physLine == "" { // We accumulate below by treating cpu cores as a per-thread // field; sum by socket requires a two-pass approach. Use the // simpler approximation: totalCores = threads / (threads per core). _ = val } } } // Second pass: per-socket core count. var curSocket string for _, line := range strings.Split(string(data), "\n") { kv := strings.SplitN(line, ":", 2) if len(kv) != 2 { continue } key := strings.TrimSpace(kv[0]) val := strings.TrimSpace(kv[1]) switch key { case "physical id": curSocket = val case "cpu cores": if curSocket != "" { if _, seen := coresPerSocket[curSocket]; !seen { v, _ := strconv.Atoi(val) coresPerSocket[curSocket] = v } } } } totalCores := 0 for _, c := range coresPerSocket { totalCores += c } cfg.CPUModel = modelName cfg.CPUSockets = len(socketIDs) if cfg.CPUSockets == 0 && threads > 0 { cfg.CPUSockets = 1 } cfg.CPUCores = totalCores cfg.CPUThreads = threads if modelName != "" || threads > 0 { populated = true } } // Parse /proc/meminfo for total physical RAM. 
if data, err := os.ReadFile("/proc/meminfo"); err == nil { for _, line := range strings.Split(string(data), "\n") { if strings.HasPrefix(line, "MemTotal:") { fields := strings.Fields(line) if len(fields) >= 2 { kb, _ := strconv.ParseUint(fields[1], 10, 64) cfg.MemTotalGiB = float64(kb) / (1024 * 1024) populated = true } break } } } if !populated { return nil } return cfg } // startCPULoadSampler starts a goroutine that samples host CPU load every // intervalSec seconds until stopCh is closed, then sends the collected // samples on the returned channel. func startCPULoadSampler(stopCh <-chan struct{}, intervalSec int) <-chan []float64 { ch := make(chan []float64, 1) go func() { var samples []float64 ticker := time.NewTicker(time.Duration(intervalSec) * time.Second) defer ticker.Stop() for { select { case <-stopCh: ch <- samples return case <-ticker.C: if pct := sampleCPULoadPct(); pct > 0 { samples = append(samples, pct) } } } }() return ch } // summarizeCPULoad computes stats over sampled CPU load values and assigns // a health status. func summarizeCPULoad(samples []float64) *BenchmarkCPULoad { if len(samples) == 0 { return nil } sorted := append([]float64(nil), samples...) sort.Float64s(sorted) var sum float64 for _, v := range sorted { sum += v } avg := sum / float64(len(sorted)) p95 := sorted[int(float64(len(sorted))*0.95)] max := sorted[len(sorted)-1] cl := &BenchmarkCPULoad{ AvgPct: math.Round(avg*10) / 10, MaxPct: math.Round(max*10) / 10, P95Pct: math.Round(p95*10) / 10, Samples: len(sorted), } // Compute standard deviation to detect instability. 
var variance float64 for _, v := range sorted { d := v - avg variance += d * d } stdDev := math.Sqrt(variance / float64(len(sorted))) switch { case avg > 20 || max > 40: cl.Status = "high" cl.Note = fmt.Sprintf("avg %.1f%% max %.1f%% — elevated host CPU load may interfere with GPU benchmark results", avg, max) case stdDev > 12: cl.Status = "unstable" cl.Note = fmt.Sprintf("avg %.1f%% stddev %.1f%% — host CPU load was erratic during the benchmark", avg, stdDev) default: cl.Status = "ok" } return cl } // runBenchmarkPowerCalibration runs targeted_power per GPU and actively watches // throttle counters. If a GPU starts throttling, the current targeted_power run // is canceled immediately, the power limit is reduced, and a fresh full cycle // is started again from the beginning. The selected reduced power limit stays // active for the main benchmark and is restored by the caller afterwards. func runBenchmarkPowerCalibration( ctx context.Context, verboseLog, runDir string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, logFunc func(string), fixedLimits map[int]int, ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) { const calibDurationSec = 120 const maxDerateW = 150 // calibSearchTolerance is the binary-search convergence threshold in watts. // When hi-lo ≤ this, the highest verified-stable limit (lo) is used. const calibSearchTolerance = 10 // dcgmResourceBusyMaxDelaySec caps the exponential back-off when DCGM // returns DCGM_ST_IN_USE (exit 222). The sequence is 1 s, 2 s, 4 s, … // doubling each retry until it would exceed the cap, at which point the // next busy response fails the calibration immediately. 
	const dcgmResourceBusyMaxDelaySec = 300
	// Without dcgmi there is no way to run targeted_power; skip quietly.
	if _, err := exec.LookPath("dcgmi"); err != nil {
		logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
		return map[int]benchmarkPowerCalibrationResult{}, nil
	}
	// Stale workers from a previous run would hold the GPUs; kill them first.
	if killed := KillTestWorkers(); len(killed) > 0 {
		for _, p := range killed {
			logFunc(fmt.Sprintf("power calibration pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
		}
	}
	// Changing power limits requires root; without it we only observe.
	canDerate := os.Geteuid() == 0
	if !canDerate {
		logFunc("power calibration: root privileges unavailable, adaptive power-limit derating disabled")
	}
	// NOTE(review): calibrationAttemptResult appears unused in this function
	// (sharedAttemptResult below is the one actually used) — candidate for removal.
	type calibrationAttemptResult struct {
		out  []byte
		rows []GPUMetricRow
		err  error
	}
	// gpuCalibState holds per-GPU binary search state during parallel calibration.
	type gpuCalibState struct {
		idx            int
		info           benchmarkGPUInfo
		originalLimitW int
		appliedLimitW  int
		minLimitW      int
		lo             int // highest verified-stable limit (assumed: minLimitW)
		hi             int // lowest verified-unstable limit (exclusive sentinel above start)
		calib          benchmarkPowerCalibrationResult
		converged      bool
	}
	results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
	var restore []benchmarkRestoreAction
	// Initialise per-GPU state.
	states := make([]*gpuCalibState, 0, len(gpuIndices))
	for _, idx := range gpuIndices {
		info := infoByIndex[idx]
		// Fall back to the default limit when the current limit is unreadable.
		originalLimitW := int(math.Round(info.PowerLimitW))
		if originalLimitW <= 0 {
			originalLimitW = int(math.Round(info.DefaultPowerLimitW))
		}
		defaultLimitW := int(math.Round(info.DefaultPowerLimitW))
		if defaultLimitW <= 0 {
			defaultLimitW = originalLimitW
		}
		appliedLimitW := originalLimitW
		if appliedLimitW <= 0 {
			appliedLimitW = defaultLimitW
		}
		// The derate floor: at most maxDerateW below the default limit, and
		// never below 70% of the default.
		minLimitW := appliedLimitW
		switch {
		case defaultLimitW > 0:
			minLimitW = defaultLimitW - maxDerateW
			floorByRatio := int(math.Round(float64(defaultLimitW) * 0.70))
			if minLimitW < floorByRatio {
				minLimitW = floorByRatio
			}
		case appliedLimitW > 0:
			minLimitW = appliedLimitW - maxDerateW
		}
		if minLimitW < calibSearchTolerance {
			minLimitW = calibSearchTolerance
		}
		s := &gpuCalibState{
			idx:            idx,
			info:           info,
			originalLimitW: originalLimitW,
			appliedLimitW:  appliedLimitW,
			minLimitW:      minLimitW,
			lo:             minLimitW,
			hi:             appliedLimitW + 1, // not yet tested, not yet confirmed unstable
			calib:          benchmarkPowerCalibrationResult{AppliedPowerLimitW: float64(appliedLimitW)},
		}
		if fixedLimits != nil {
			if fixedW, ok := fixedLimits[idx]; ok {
				// This GPU's limit was established in a prior ramp step and must
				// remain unchanged. Apply it immediately and skip the binary search.
				if canDerate && fixedW > 0 {
					_ = setBenchmarkPowerLimit(ctx, verboseLog, idx, fixedW)
				}
				s.appliedLimitW = fixedW
				s.calib.AppliedPowerLimitW = float64(fixedW)
				s.calib.Completed = true
				s.converged = true
				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("fixed limit: %d W (held from prior ramp step)", fixedW))
			}
		}
		states = append(states, s)
		// Register a restore action so the caller can put the original limit
		// back after the whole benchmark finishes.
		if canDerate && originalLimitW > 0 {
			idxCopy := idx
			orig := originalLimitW
			restore = append(restore, benchmarkRestoreAction{
				name: fmt.Sprintf("gpu-%d-restore-power-limit", idxCopy),
				fn: func() {
					_ = setBenchmarkPowerLimit(context.Background(), verboseLog, idxCopy, orig)
				},
			})
		}
	}
	// Shared DCGM resource-busy back-off state (single diagnostic session).
	busyRetries := 0
	busyDelaySec := 1
	sharedAttempt := 0
	type sharedAttemptResult struct {
		out  []byte
		rows []GPUMetricRow
		err  error
	}
calibDone:
	for {
		// Collect non-converged GPUs.
		var active []*gpuCalibState
		for _, s := range states {
			if !s.converged {
				active = append(active, s)
			}
		}
		if len(active) == 0 || ctx.Err() != nil {
			break
		}
		sharedAttempt++
		for _, s := range active {
			s.calib.Attempts++
			logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power attempt %d at %d W for %ds", s.idx, s.calib.Attempts, s.appliedLimitW, calibDurationSec))
		}
		// Snapshot throttle counters for all active GPUs before the run.
		beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(active))
		for _, s := range active {
			beforeThrottle[s.idx], _ = queryThrottleCounters(s.idx)
		}
		// Run targeted_power for ALL gpuIndices simultaneously so every card
		// is under load during calibration — this reflects real server thermals.
		logName := fmt.Sprintf("power-calibration-attempt-%d.log", sharedAttempt)
		cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
		attemptCtx, cancelAttempt := context.WithCancel(ctx)
		doneCh := make(chan sharedAttemptResult, 1)
		go func() {
			out, rows, err := runBenchmarkCommandWithMetrics(attemptCtx, verboseLog, logName, cmd, nil, gpuIndices, logFunc)
			doneCh <- sharedAttemptResult{out: out, rows: rows, err: err}
		}()
		// Watch throttle counters once per second while the diag runs.
		ticker := time.NewTicker(time.Second)
		throttleReasons := make(map[int]string, len(active))
		var ar sharedAttemptResult
	attemptLoop:
		for {
			select {
			case ar = <-doneCh:
				break attemptLoop
			case <-ticker.C:
				// Poll throttle counters for each active GPU independently.
				for _, s := range active {
					if throttleReasons[s.idx] != "" {
						continue // already detected for this GPU
					}
					after, err := queryThrottleCounters(s.idx)
					if err != nil {
						continue
					}
					// Record throttle but do NOT cancel — let dcgmi finish so
					// nv-hostengine releases the slot cleanly before the next attempt.
					if reason := benchmarkCalibrationThrottleReason(beforeThrottle[s.idx], after); reason != "" {
						throttleReasons[s.idx] = reason
						logFunc(fmt.Sprintf("power calibration: GPU %d detected %s throttle at %d W, waiting for run to finish", s.idx, reason, s.appliedLimitW))
					}
				}
			case <-ctx.Done():
				// Outer cancellation: abort the diag and wait for the goroutine.
				cancelAttempt()
				ar = <-doneCh
				break attemptLoop
			}
		}
		ticker.Stop()
		cancelAttempt()
		// Best-effort persistence of the raw diag output for later inspection.
		_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
		// Resource busy: retry with exponential back-off (shared — one DCGM session).
		if ar.err != nil && isDCGMResourceBusy(ar.err) {
			if busyDelaySec > dcgmResourceBusyMaxDelaySec {
				for _, s := range active {
					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("DCGM resource busy after %d retries, giving up", busyRetries))
					s.converged = true
				}
				logFunc(fmt.Sprintf("power calibration: DCGM resource persistently busy after %d retries, stopping", busyRetries))
				break calibDone
			}
			busyRetries++
			// Undo attempt counter: busy retries don't count as real attempts.
			for _, s := range active {
				s.calib.Attempts--
			}
			logFunc(fmt.Sprintf("power calibration: DCGM resource busy (attempt %d), retrying in %ds", sharedAttempt, busyDelaySec))
			select {
			case <-ctx.Done():
				break calibDone
			case <-time.After(time.Duration(busyDelaySec) * time.Second):
			}
			// Double the delay; going past the cap arms the give-up branch above.
			next := busyDelaySec * 2
			if next > dcgmResourceBusyMaxDelaySec {
				next = dcgmResourceBusyMaxDelaySec + 1
			}
			busyDelaySec = next
			sharedAttempt-- // retry same logical attempt number
			continue
		}
		busyRetries = 0
		busyDelaySec = 1
		// Per-GPU analysis and binary search update.
		for _, s := range active {
			perGPU := filterRowsByGPU(ar.rows, s.idx)
			summary := summarizeBenchmarkTelemetry(perGPU)
			throttle := throttleReasons[s.idx]
			// Cooling warning: thermal throttle with fans not at maximum.
			if strings.Contains(throttle, "thermal") && s.calib.CoolingWarning == "" {
				clocks := make([]float64, 0, len(perGPU))
				var fanDutyValues []float64
				fanDutyAvail := false
				for _, r := range perGPU {
					if r.ClockMHz > 0 {
						clocks = append(clocks, r.ClockMHz)
					}
					if r.FanDutyCycleAvailable {
						fanDutyAvail = true
						fanDutyValues = append(fanDutyValues, r.FanDutyCyclePct)
					}
				}
				dropPct := benchmarkClockDrift(clocks)
				p95FanDuty := benchmarkPercentile(fanDutyValues, 95)
				// A ≥20% clock drop while fans stayed below ~full duty suggests
				// the chassis cooling is not configured for full GPU load.
				if dropPct >= 20 && fanDutyAvail && p95FanDuty < 98 {
					s.calib.CoolingWarning = fmt.Sprintf(
						"thermal throttle (%s) caused a %.0f%% clock drop while fans were at %.0f%% duty cycle — server cooling may not be configured for full GPU load",
						throttle, dropPct, p95FanDuty,
					)
					logFunc(fmt.Sprintf("power calibration: GPU %d cooling warning: %s", s.idx, s.calib.CoolingWarning))
				}
			}
			if throttle == "" && ar.err == nil && summary.P95PowerW > 0 {
				// Stable at current limit — update lo and binary-search upward.
				s.calib.Summary = summary
				s.calib.Completed = true
				s.calib.AppliedPowerLimitW = float64(s.appliedLimitW)
				logFunc(fmt.Sprintf("power calibration: GPU %d stable at %d W, p95=%.0f W p95_temp=%.1f C (%d samples)", s.idx, s.appliedLimitW, summary.P95PowerW, summary.P95TempC, summary.Samples))
				s.lo = s.appliedLimitW
				if canDerate && s.hi-s.lo > calibSearchTolerance {
					next := roundTo5W((s.lo + s.hi) / 2)
					if next > s.lo && next < s.hi {
						if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err == nil {
							s.appliedLimitW = next
							s.calib.AppliedPowerLimitW = float64(next)
							s.calib.Completed = false // keep searching
							s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: stable at %d W, trying %d W (lo=%d hi=%d)", s.lo, next, s.lo, s.hi))
							logFunc(fmt.Sprintf("power calibration: GPU %d binary search up: stable at %d W, trying %d W", s.idx, s.lo, next))
							continue // next GPU in active list
						}
					}
				}
				s.converged = true
				continue
			}
			// Failed or throttled — log and binary-search downward.
			switch {
			case throttle != "":
				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d: %s throttle at %d W", s.calib.Attempts, throttle, s.appliedLimitW))
				logFunc(fmt.Sprintf("power calibration: GPU %d throttled (%s) at %d W, reducing power limit", s.idx, throttle, s.appliedLimitW))
			case ar.err != nil:
				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d failed at %d W: %v", s.calib.Attempts, s.appliedLimitW, ar.err))
				logFunc(fmt.Sprintf("power calibration: GPU %d targeted_power failed at %d W: %v", s.idx, s.appliedLimitW, ar.err))
			default:
				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("targeted_power attempt %d at %d W: no valid power telemetry", s.calib.Attempts, s.appliedLimitW))
				logFunc(fmt.Sprintf("power calibration: GPU %d attempt %d at %d W: no valid telemetry", s.idx, s.calib.Attempts, s.appliedLimitW))
			}
			if !canDerate || s.appliedLimitW <= 0 {
				s.converged = true
				continue
			}
			s.hi = s.appliedLimitW
			if s.hi-s.lo <= calibSearchTolerance {
				if s.lo > s.minLimitW {
					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search converged: using %d W (lo=%d hi=%d)", s.lo, s.lo, s.hi))
					if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, s.lo); err == nil {
						s.appliedLimitW = s.lo
						s.calib.AppliedPowerLimitW = float64(s.lo)
						s.calib.Derated = s.lo < s.originalLimitW
						// Summary was captured when we last verified stability at s.lo,
						// so the result is valid — mark as completed even though we
						// converged from the failure path (tried higher, failed, fell back).
						s.calib.Completed = true
					}
				} else {
					s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
				}
				s.converged = true
				continue
			}
			// Midpoint of the unresolved range, snapped to a 5 W boundary and
			// nudged back inside (lo, hi) if rounding pushed it out.
			next := roundTo5W((s.lo + s.hi) / 2)
			if next <= s.lo {
				next = s.lo + calibSearchTolerance
			}
			if next >= s.hi {
				next = (s.lo + s.hi) / 2
			}
			if next < s.minLimitW {
				s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
				s.converged = true
				continue
			}
			if err := setBenchmarkPowerLimit(ctx, verboseLog, s.idx, next); err != nil {
				s.calib.Notes = append(s.calib.Notes, "failed to set power limit: "+err.Error())
				logFunc(fmt.Sprintf("power calibration: GPU %d failed to set power limit %d W: %v", s.idx, next, err))
				s.converged = true
				continue
			}
			s.appliedLimitW = next
			s.calib.AppliedPowerLimitW = float64(next)
			s.calib.Derated = next < s.originalLimitW
			// Keep the shared info map in sync so later phases see the limit
			// actually in force.
			s.info.PowerLimitW = float64(next)
			infoByIndex[s.idx] = s.info
			s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("binary search: trying %d W (lo=%d hi=%d)", next, s.lo, s.hi))
			logFunc(fmt.Sprintf("power calibration: GPU %d binary search: trying %d W (lo=%d hi=%d)", s.idx, next, s.lo, s.hi))
		}
	}
	// Only report GPUs that actually did something.
	for _, s := range states {
		if s.calib.Completed || s.calib.Attempts > 0 || len(s.calib.Notes) > 0 {
			results[s.idx] = s.calib
		}
	}
	return results, restore
}

// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
// meaning nv-hostengine still holds the diagnostic slot from a prior run.
func isDCGMResourceBusy(err error) bool {
	var exitErr *exec.ExitError
	return errors.As(err, &exitErr) && exitErr.ExitCode() == 222
}

// roundTo5W rounds w to the nearest 5 W boundary.
func roundTo5W(w int) int {
	return ((w + 2) / 5) * 5
}

// powerBenchDurationSec maps a benchmark profile name to the targeted_power
// duration used by the power bench; unknown profiles get the 120 s default.
func powerBenchDurationSec(profile string) int {
	switch strings.TrimSpace(strings.ToLower(profile)) {
	case NvidiaBenchmarkProfileStability:
		return 300
	case NvidiaBenchmarkProfileOvernight:
		return 600
	default:
		return 120
	}
}

// cloneBenchmarkGPUInfoMap returns a shallow copy of src so callers can
// mutate per-run state without affecting the shared map.
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
	out := make(map[int]benchmarkGPUInfo, len(src))
	for k, v := range src {
		out[k] = v
	}
	return out
}

// renderPowerBenchReport renders a human-readable Markdown report for a
// completed power bench run.
func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
	var b strings.Builder
	b.WriteString("# Bee Bench Power Report\n\n")
	fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
	fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
	fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
	fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
	fmt.Fprintf(&b, "**Platform max TDP:** %.0f W \n\n", result.PlatformMaxTDPW)
	if len(result.Findings) > 0 {
		b.WriteString("## Summary\n\n")
		for _, finding := range result.Findings {
			fmt.Fprintf(&b, "- %s\n", finding)
		}
		b.WriteString("\n")
	}
	if len(result.RecommendedSlotOrder) > 0 {
		b.WriteString("## Recommended Slot Order\n\n")
		fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
	}
	if len(result.RampSteps) > 0 {
		b.WriteString("## Ramp Sequence\n\n")
		b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n")
		b.WriteString("|------|---------|--------------|----------------|---------|--------|\n")
		for _, step := range result.RampSteps {
			derated := "-"
			if step.Derated {
				derated = "⚠ yes"
			}
			fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n", step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status)
		}
		b.WriteString("\n")
	}
	b.WriteString("## Per-Slot Results\n\n")
	b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n")
	b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n")
	for _, gpu := range result.GPUs {
		stableLimit := "-"
		if gpu.StablePowerLimitW > 0 {
			if gpu.Derated {
				stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW)
			} else {
				stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
			}
		}
		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n", gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
	}
	b.WriteString("\n")
	// Per-GPU notes section.
	for _, gpu := range result.GPUs {
		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
		for _, note := range gpu.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		b.WriteString("\n")
	}
	return b.String()
}

// renderPowerBenchSummary renders a flat key=value summary of the power bench
// result, suitable for machine parsing.
func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
	var b strings.Builder
	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
	fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
	fmt.Fprintf(&b, "platform_max_tdp_w=%.0f\n", result.PlatformMaxTDPW)
	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
	if len(result.RecommendedSlotOrder) > 0 {
		fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
	}
	for _, step := range result.RampSteps {
		fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
		fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
		fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
		fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
	}
	for _, gpu := range result.GPUs {
		if gpu.StablePowerLimitW > 0 {
			fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
		}
	}
	return b.String()
}

// RunNvidiaPowerBench runs the standalone power benchmark: it calibrates each
// selected GPU's stable power limit and writes the results under baseDir.
// Returns an error when GPU selection or run-directory setup fails.
func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/power"
	}
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "power-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		return "", infoErr
	}
	hostname, _ := os.Hostname()
	result := NvidiaPowerBenchResult{
		BenchmarkVersion:   benchmarkVersion,
		GeneratedAt:        time.Now().UTC(),
		Hostname:           hostname,
		ServerModel:        readServerModel(),
		BenchmarkProfile:   opts.Profile,
		SelectedGPUIndices: append([]int(nil), selected...),
		OverallStatus:      "OK",
	}
	// NOTE(review): durationSec is computed but immediately discarded — either
	// wire it into the calibration below or drop the call.
	durationSec := powerBenchDurationSec(opts.Profile)
	_ = durationSec
	// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
	// establish a true single-card power baseline unaffected by neighbour heat.
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected)) var allRestoreActions []benchmarkRestoreAction for _, idx := range selected { singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx)) _ = os.MkdirAll(singleDir, 0755) singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex) logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx)) c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil) allRestoreActions = append(allRestoreActions, restore...) if r, ok := c[idx]; ok { calibByIndex[idx] = r } } defer func() { for i := len(allRestoreActions) - 1; i >= 0; i-- { allRestoreActions[i].fn() } }() gpus := make([]NvidiaPowerBenchGPU, 0, len(selected)) for _, idx := range selected { info := infoByIndex[idx] calib := calibByIndex[idx] status := "OK" if !calib.Completed { status = "FAILED" result.OverallStatus = "PARTIAL" } else if calib.Derated { status = "PARTIAL" if result.OverallStatus == "OK" { result.OverallStatus = "PARTIAL" } } gpus = append(gpus, NvidiaPowerBenchGPU{ Index: idx, Name: info.Name, BusID: info.BusID, DefaultPowerLimitW: info.DefaultPowerLimitW, AppliedPowerLimitW: calib.AppliedPowerLimitW, MaxObservedPowerW: calib.Summary.P95PowerW, MaxObservedTempC: calib.Summary.P95TempC, CalibrationAttempts: calib.Attempts, Derated: calib.Derated, Status: status, Notes: append([]string(nil), calib.Notes...), CoolingWarning: calib.CoolingWarning, }) } sort.Slice(gpus, func(i, j int) bool { if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW { return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW } if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW { return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW } if gpus[i].Derated != gpus[j].Derated { return !gpus[i].Derated } return gpus[i].Index < gpus[j].Index }) result.GPUs = gpus result.RecommendedSlotOrder = make([]int, 0, len(gpus)) for _, gpu := range gpus { result.RecommendedSlotOrder = 
append(result.RecommendedSlotOrder, gpu.Index) } if len(result.RecommendedSlotOrder) > 0 { result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder))) } for _, gpu := range gpus { if gpu.Derated { result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW)) } if gpu.CoolingWarning != "" { result.Findings = append(result.Findings, fmt.Sprintf( "GPU %d: %s. Operator action: rerun the benchmark with fan speed manually fixed at 100%% to confirm actual thermal headroom.", gpu.Index, gpu.CoolingWarning, )) } } singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus)) for _, gpu := range gpus { singleByIndex[gpu.Index] = gpu } // Phase 2: cumulative thermal ramp. // Each step introduces one new GPU into an environment where all previously // calibrated GPUs are already running at their fixed stable limits. The new // GPU's stable TDP is searched via binary search (targeted_power) under real // multi-GPU thermal load. Once found, its limit is fixed permanently for all // subsequent steps. This ensures each GPU's limit reflects actual sustained // power in the final full-system thermal state. // // stableLimits accumulates GPU index → fixed stable limit (W) across steps. stableLimits := make(map[int]int, len(result.RecommendedSlotOrder)) // Step 1: reuse single-card calibration result directly. 
if len(result.RecommendedSlotOrder) > 0 { firstIdx := result.RecommendedSlotOrder[0] firstCalib := calibByIndex[firstIdx] stableLimits[firstIdx] = int(math.Round(firstCalib.AppliedPowerLimitW)) ramp := NvidiaPowerBenchStep{ StepIndex: 1, GPUIndices: []int{firstIdx}, NewGPUIndex: firstIdx, NewGPUStableLimitW: firstCalib.AppliedPowerLimitW, TotalObservedPowerW: firstCalib.Summary.P95PowerW, AvgObservedPowerW: firstCalib.Summary.P95PowerW, Derated: firstCalib.Derated, Status: "OK", } if !firstCalib.Completed { ramp.Status = "FAILED" ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx)) result.OverallStatus = "PARTIAL" } else if firstCalib.Derated { ramp.Status = "PARTIAL" if result.OverallStatus == "OK" { result.OverallStatus = "PARTIAL" } result.Findings = append(result.Findings, fmt.Sprintf("Ramp step 1 (GPU %d) required derating to %.0f W.", firstIdx, firstCalib.AppliedPowerLimitW)) } result.RampSteps = append(result.RampSteps, ramp) logFunc(fmt.Sprintf("power ramp: step 1/%d — reused single-card calibration for GPU %d, stable limit %.0f W", len(result.RecommendedSlotOrder), firstIdx, firstCalib.AppliedPowerLimitW)) } // Steps 2..N: each step fixes previously calibrated GPUs and searches only // the new GPU's stable limit in the combined thermal environment. for stepNum := 1; stepNum < len(result.RecommendedSlotOrder); stepNum++ { step := stepNum + 1 subset := append([]int(nil), result.RecommendedSlotOrder[:step]...) newGPUIdx := result.RecommendedSlotOrder[stepNum] stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step)) _ = os.MkdirAll(stepDir, 0755) // All previously calibrated GPUs are fixed at their stable limits. 
fixedForStep := make(map[int]int, len(stableLimits)) for k, v := range stableLimits { fixedForStep[k] = v } logFunc(fmt.Sprintf("power ramp: step %d/%d — calibrating GPU %d with %d fixed GPU(s)", step, len(result.RecommendedSlotOrder), newGPUIdx, len(fixedForStep))) stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, fixedForStep) // Accumulate restore actions; they all run in the outer defer. allRestoreActions = append(allRestoreActions, stepRestore...) ramp := NvidiaPowerBenchStep{ StepIndex: step, GPUIndices: subset, NewGPUIndex: newGPUIdx, Status: "OK", } // Total observed power = sum of p95 across all GPUs in this step. for _, idx := range subset { if c, ok := stepCalib[idx]; ok { ramp.TotalObservedPowerW += c.Summary.P95PowerW } } if len(subset) > 0 { ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset)) } // Determine stable limit for the new GPU. if c, ok := stepCalib[newGPUIdx]; ok && c.Completed { stableLimits[newGPUIdx] = int(math.Round(c.AppliedPowerLimitW)) ramp.NewGPUStableLimitW = c.AppliedPowerLimitW ramp.Derated = c.Derated if c.Derated { ramp.Status = "PARTIAL" if result.OverallStatus == "OK" { result.OverallStatus = "PARTIAL" } result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW)) } } else { // Calibration failed — fall back to single-card limit. 
fb := calibByIndex[newGPUIdx] stableLimits[newGPUIdx] = int(math.Round(fb.AppliedPowerLimitW)) ramp.NewGPUStableLimitW = fb.AppliedPowerLimitW ramp.Status = "FAILED" ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d; using single-card limit %.0f W", newGPUIdx, step, fb.AppliedPowerLimitW)) result.OverallStatus = "PARTIAL" } result.RampSteps = append(result.RampSteps, ramp) } // Populate StablePowerLimitW on each GPU entry from the accumulated stable limits. for i := range result.GPUs { if lim, ok := stableLimits[result.GPUs[i].Index]; ok { result.GPUs[i].StablePowerLimitW = float64(lim) } } // PlatformMaxTDPW = sum of all stable limits — the actual sustained power // budget of this server with all GPUs running simultaneously without throttling. for _, lim := range stableLimits { result.PlatformMaxTDPW += float64(lim) } resultJSON, err := json.MarshalIndent(result, "", " ") if err != nil { return "", fmt.Errorf("marshal power result: %w", err) } if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil { return "", fmt.Errorf("write result.json: %w", err) } if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil { return "", fmt.Errorf("write report.md: %w", err) } if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil { return "", fmt.Errorf("write summary.txt: %w", err) } return runDir, nil }