// Package platform: NVIDIA GPU benchmark orchestration. This file runs a
// multi-phase per-GPU benchmark (baseline, warmup, steady compute, cooldown,
// optional NCCL interconnect), normalizes the environment (persistence mode,
// clock locks), scores the results, and packs all artifacts into a tar.gz.
package platform

import (
	"context"
	"encoding/csv"
	"encoding/json"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"
)

// benchmarkVersion is embedded in every result so downstream consumers can
// detect schema/scoring changes between runs.
const benchmarkVersion = "1"

// benchmarkProfileSpec describes the per-phase durations (in seconds) of one
// named benchmark profile.
type benchmarkProfileSpec struct {
	Name        string
	BaselineSec int // idle telemetry sampling before any load
	WarmupSec   int // burn-in before the measured steady window
	SteadySec   int // measured steady-state compute window
	NCCLSec     int // budget for the optional multi-GPU all_reduce phase
	CooldownSec int // idle telemetry sampling after load
}

// benchmarkGPUInfo is one row of static GPU inventory as reported by
// `nvidia-smi --query-gpu=...`.
type benchmarkGPUInfo struct {
	Index               int
	UUID                string
	Name                string
	BusID               string
	VBIOS               string
	PowerLimitW         float64
	MaxGraphicsClockMHz float64
	MaxMemoryClockMHz   float64
}

// benchmarkBurnProfile accumulates parsed state for one precision profile
// (e.g. fp16 lanes) while scanning a bee-gpu-burn log.
type benchmarkBurnProfile struct {
	name       string
	category   string // coarse bucket: fp32_tf32 / fp16_bf16 / fp8 / fp4 / other
	supported  bool
	lanes      int    // number of READY lanes observed for this profile
	m          uint64 // GEMM dimensions from the READY line (last one wins)
	n          uint64
	k          uint64
	iterations uint64 // summed across "<name>_iterations=" lines
	notes      string
}

// benchmarkBurnParseResult is the structured outcome of parsing a
// bee-gpu-burn log for a single GPU.
type benchmarkBurnParseResult struct {
	Device            string
	ComputeCapability string
	Backend           string
	DurationSec       int
	Profiles          []BenchmarkPrecisionResult
	Fallback          bool // true when backend == "driver-ptx" (non-comparable throughput)
}

// benchmarkRestoreAction is a named cleanup (e.g. clock unlock) registered
// during normalization and executed in reverse order on exit.
type benchmarkRestoreAction struct {
	name string
	fn   func()
}

// Patterns for bee-gpu-burn log lines, e.g.
//   fp16[0]=READY dim=8192x8192x8192
//   fp8=SKIPPED not supported on this arch
//   fp16_iterations=1234
var (
	benchmarkReadyPattern      = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
	benchmarkSkippedPattern    = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
)

// RunNvidiaBenchmark executes the full benchmark pipeline for the selected
// GPUs, writes all artifacts (logs, metrics, result.json, report.txt,
// summary.txt) under a timestamped run directory beneath baseDir, and returns
// the path of the resulting tar.gz archive.
//
// ctx cancels in-flight phases; nil ctx and nil logFunc are tolerated.
// Sampling/phase failures are recorded as notes/warnings rather than aborting
// the whole run; only setup and artifact-writing errors are returned.
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-benchmark"
	}
	spec := resolveBenchmarkProfile(opts.Profile)
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "gpu-benchmark-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	hostname, _ := os.Hostname()
	result := NvidiaBenchmarkResult{
		BenchmarkVersion:   benchmarkVersion,
		GeneratedAt:        time.Now().UTC(),
		Hostname:           hostname,
		BenchmarkProfile:   spec.Name,
		SelectedGPUIndices: append([]int(nil), selected...),
		Normalization: BenchmarkNormalization{
			// Start optimistic; any normalization/inventory problem downgrades
			// this to "partial", which taints comparability downstream.
			Status: "full",
		},
	}
	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
		result.Normalization.Status = "partial"
	}
	// Best-effort capture of the full driver state for the archive.
	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
	}
	// Pre-existing compute workloads would contend for the GPU and skew scores.
	activeApps, err := queryActiveComputeApps(selected)
	if err == nil && len(activeApps) > 0 {
		result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark")
		result.Normalization.Notes = append(result.Normalization.Notes, activeApps...)
		result.Normalization.Status = "partial"
	}
	restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result)
	// Undo clock locks etc. in reverse registration order, even on error paths.
	defer func() {
		for i := len(restoreActions) - 1; i >= 0; i-- {
			restoreActions[i].fn()
		}
	}()
	for _, idx := range selected {
		gpuResult := BenchmarkGPUResult{
			Index:  idx,
			Status: "FAILED", // overwritten once phases complete
		}
		if info, ok := infoByIndex[idx]; ok {
			gpuResult.UUID = info.UUID
			gpuResult.Name = info.Name
			gpuResult.BusID = info.BusID
			gpuResult.VBIOS = info.VBIOS
			gpuResult.PowerLimitW = info.PowerLimitW
			gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
			gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
		}
		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
			gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
			gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
		}
		// Phase 1: idle baseline telemetry.
		baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
		if err != nil && err != context.Canceled {
			gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
		}
		gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
		// Phase 2: warmup burn. A warmup failure skips the rest of this GPU.
		warmupCmd := []string{
			"bee-gpu-burn",
			"--seconds", strconv.Itoa(spec.WarmupSec),
			"--size-mb", strconv.Itoa(opts.SizeMB),
			"--devices", strconv.Itoa(idx),
		}
		logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
		warmupOut, _, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-warmup", idx), logFunc)
		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-warmup.log", idx)), warmupOut, 0644)
		if warmupErr != nil {
			gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
			result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
			continue
		}
		// Phase 3: steady compute, bracketed by throttle-counter snapshots so
		// only throttling incurred during the window is attributed to it.
		beforeThrottle, _ := queryThrottleCounters(idx)
		steadyCmd := []string{
			"bee-gpu-burn",
			"--seconds", strconv.Itoa(spec.SteadySec),
			"--size-mb", strconv.Itoa(opts.SizeMB),
			"--devices", strconv.Itoa(idx),
		}
		logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
		steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
		afterThrottle, _ := queryThrottleCounters(idx)
		if steadyErr != nil {
			gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
		}
		// Even a failed steady run may have produced parseable partial output.
		parseResult := parseBenchmarkBurnLog(string(steadyOut))
		gpuResult.ComputeCapability = parseResult.ComputeCapability
		gpuResult.Backend = parseResult.Backend
		gpuResult.PrecisionResults = parseResult.Profiles
		if parseResult.Fallback {
			gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
		}
		gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
		gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
		// Phase 4: cooldown telemetry.
		cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
		if err != nil && err != context.Canceled {
			gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
		}
		gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), cooldownRows)
		gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
		gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
		if steadyErr != nil {
			gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
		} else if parseResult.Fallback {
			gpuResult.Status = "PARTIAL"
		} else {
			gpuResult.Status = "OK"
		}
		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
	}
	// Optional multi-GPU interconnect phase; its bandwidth feeds back into
	// each GPU's composite score.
	if len(selected) > 1 && opts.RunNCCL {
		result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
		if result.Interconnect != nil && result.Interconnect.Supported {
			for i := range result.GPUs {
				result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps
				result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores)
			}
		}
	}
	result.Findings = buildBenchmarkFindings(result)
	result.OverallStatus = benchmarkOverallStatus(result)
	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return "", fmt.Errorf("marshal benchmark result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return "", fmt.Errorf("write result.json: %w", err)
	}
	report := renderBenchmarkReport(result)
	if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
		return "", fmt.Errorf("write report.txt: %w", err)
	}
	summary := renderBenchmarkSummary(result)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
		return "", fmt.Errorf("write summary.txt: %w", err)
	}
	archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz")
	if err := createTarGz(archive, runDir); err != nil {
		return "", fmt.Errorf("pack benchmark archive: %w", err)
	}
	return archive, nil
}

// normalizeNvidiaBenchmarkOptionsForBenchmark canonicalizes user-supplied
// options: maps the profile string to a known constant (defaulting to
// standard), clamps negative sizes to 0, and dedupes/sorts GPU index lists.
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
	switch strings.TrimSpace(strings.ToLower(opts.Profile)) {
	case NvidiaBenchmarkProfileStability:
		opts.Profile = NvidiaBenchmarkProfileStability
	case NvidiaBenchmarkProfileOvernight:
		opts.Profile = NvidiaBenchmarkProfileOvernight
	default:
		opts.Profile = NvidiaBenchmarkProfileStandard
	}
	if opts.SizeMB < 0 {
		opts.SizeMB = 0
	}
	opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
	opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
	return opts
}

// resolveBenchmarkProfile maps a profile name (case/whitespace-insensitive)
// to its phase durations; unknown names fall back to the standard profile.
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
	switch strings.TrimSpace(strings.ToLower(profile)) {
	case NvidiaBenchmarkProfileStability:
		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}
	case NvidiaBenchmarkProfileOvernight:
		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}
	default:
		return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}
	}
}

// queryBenchmarkGPUInfo reads static inventory for the given GPUs via
// nvidia-smi CSV output and returns it keyed by GPU index. Malformed or
// short rows are skipped silently rather than failing the whole query.
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
	args := []string{
		"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory",
		"--format=csv,noheader,nounits",
	}
	if len(gpuIndices) > 0 {
		args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
	}
	out, err := satExecCommand("nvidia-smi", args...).Output()
	if err != nil {
		return nil, fmt.Errorf("nvidia-smi gpu info: %w", err)
	}
	r := csv.NewReader(strings.NewReader(string(out)))
	r.TrimLeadingSpace = true
	// Field counts can vary across driver versions; validate per-row instead.
	r.FieldsPerRecord = -1
	rows, err := r.ReadAll()
	if err != nil {
		return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err)
	}
	infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
	for _, row := range rows {
		if len(row) < 8 {
			continue
		}
		idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
		if err != nil {
			continue
		}
		infoByIndex[idx] = benchmarkGPUInfo{
			Index:               idx,
			UUID:                strings.TrimSpace(row[1]),
			Name:                strings.TrimSpace(row[2]),
			BusID:               strings.TrimSpace(row[3]),
			VBIOS:               strings.TrimSpace(row[4]),
			PowerLimitW:         parseBenchmarkFloat(row[5]),
			MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
			MaxMemoryClockMHz:   parseBenchmarkFloat(row[7]),
		}
	}
	return infoByIndex, nil
}

// applyBenchmarkNormalization puts each GPU into a comparable state:
// persistence mode on, graphics/memory clocks locked at their reported
// maximums. It records per-GPU outcomes into result.Normalization and
// returns restore actions (clock unlocks) for the caller to defer.
// Requires root; without it, normalization is skipped and marked "partial".
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
	if os.Geteuid() != 0 {
		result.Normalization.Status = "partial"
		result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks")
		for _, idx := range gpuIndices {
			result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{
				Index: idx,
				Notes: []string{"normalization skipped: root privileges are required"},
			})
		}
		return nil
	}
	var restore []benchmarkRestoreAction
	for _, idx := range gpuIndices {
		rec := BenchmarkNormalizationGPU{Index: idx}
		// Persistence mode keeps the driver loaded between tool invocations.
		if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil {
			rec.PersistenceMode = "failed"
			rec.Notes = append(rec.Notes, "failed to enable persistence mode")
			result.Normalization.Status = "partial"
		} else {
			rec.PersistenceMode = "applied"
		}
		// Lock graphics clock (-lgc) at the reported max; register -rgc restore.
		if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 {
			target := int(math.Round(info.MaxGraphicsClockMHz))
			if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil {
				rec.GPUClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			} else {
				rec.GPUClockLockStatus = "applied"
				rec.GPUClockLockMHz = float64(target)
				idxCopy := idx // capture per-iteration value for the closure
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() {
					// Background context: restore must run even if ctx is done.
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
				}})
			}
		}
		// Lock memory clock (-lmc); not all GPUs/drivers support it.
		if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
			target := int(math.Round(info.MaxMemoryClockMHz))
			out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil)
			switch {
			case err == nil:
				rec.MemoryClockLockStatus = "applied"
				rec.MemoryClockLockMHz = float64(target)
				idxCopy := idx
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() {
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil)
				}})
			case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"):
				rec.MemoryClockLockStatus = "unsupported"
				rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path")
				result.Normalization.Status = "partial"
			default:
				rec.MemoryClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			}
		}
		result.Normalization.GPUs = append(result.Normalization.GPUs, rec)
	}
	return restore
}

// collectBenchmarkSamples polls GPU telemetry roughly once per second for
// durationSec seconds, stamping each sample with elapsed wall time. It
// returns whatever rows were gathered (possibly alongside ctx.Err() on
// cancellation); individual sampling errors are skipped.
func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) {
	if durationSec <= 0 {
		return nil, nil
	}
	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
	var rows []GPUMetricRow
	start := time.Now()
	for {
		if ctx.Err() != nil {
			return rows, ctx.Err()
		}
		samples, err := sampleGPUMetrics(gpuIndices)
		if err == nil {
			elapsed := time.Since(start).Seconds()
			for i := range samples {
				samples[i].ElapsedSec = elapsed
			}
			rows = append(rows, samples...)
		}
		if time.Now().After(deadline) {
			break
		}
		select {
		case <-ctx.Done():
			return rows, ctx.Err()
		case <-time.After(time.Second):
		}
	}
	return rows, nil
}

// runBenchmarkCommandWithMetrics runs cmd while a background goroutine
// samples GPU telemetry once per second. The goroutine is stopped and
// joined before metricRows is read, then the metrics artifacts are written.
// Returns the command output, the collected samples, and the command error.
func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir, baseName string, logFunc func(string)) ([]byte, []GPUMetricRow, error) {
	stopCh := make(chan struct{})
	doneCh := make(chan struct{})
	var metricRows []GPUMetricRow
	start := time.Now()
	go func() {
		defer close(doneCh)
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				return
			case <-ticker.C:
				samples, err := sampleGPUMetrics(gpuIndices)
				if err != nil {
					continue // transient sampling failure; keep polling
				}
				elapsed := time.Since(start).Seconds()
				for i := range samples {
					samples[i].ElapsedSec = elapsed
				}
				metricRows = append(metricRows, samples...)
			}
		}
	}()
	out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
	// Stop the sampler and wait for it before touching metricRows (the
	// channel close/receive establishes the happens-before edge).
	close(stopCh)
	<-doneCh
	writeBenchmarkMetricsFiles(runDir, baseName, metricRows)
	return out, metricRows, err
}

// writeBenchmarkMetricsFiles writes CSV, HTML, and terminal-chart renderings
// of the sampled telemetry into runDir. No-op for empty input; write errors
// are deliberately ignored (metrics artifacts are best-effort).
func writeBenchmarkMetricsFiles(runDir, baseName string, rows []GPUMetricRow) {
	if len(rows) == 0 {
		return
	}
	_ = WriteGPUMetricsCSV(filepath.Join(runDir, baseName+"-metrics.csv"), rows)
	_ = WriteGPUMetricsHTML(filepath.Join(runDir, baseName+"-metrics.html"), rows)
	chart := RenderGPUTerminalChart(rows)
	_ = os.WriteFile(filepath.Join(runDir, baseName+"-metrics-term.txt"), []byte(chart), 0644)
}

// parseBenchmarkBurnLog extracts device info, backend, duration, and
// per-precision profile results from a bee-gpu-burn log. TeraOpsPerSec is
// derived as 2*M*N*K*iterations / duration (GEMM FLOP count) when all
// inputs are positive.
func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
	result := benchmarkBurnParseResult{}
	lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
	profiles := make(map[string]*benchmarkBurnProfile)
	for _, line := range lines {
		line = stripBenchmarkPrefix(strings.TrimSpace(line))
		if line == "" {
			continue
		}
		switch {
		case strings.HasPrefix(line, "device="):
			result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device="))
		case strings.HasPrefix(line, "compute_capability="):
			result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability="))
		case strings.HasPrefix(line, "backend="):
			result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend="))
			// PTX fallback means throughput numbers are not comparable.
			result.Fallback = result.Backend == "driver-ptx"
		case strings.HasPrefix(line, "duration_s="):
			result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s=")))
		default:
			if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				profile.supported = true
				profile.lanes++
				profile.m, _ = strconv.ParseUint(m[3], 10, 64)
				profile.n, _ = strconv.ParseUint(m[4], 10, 64)
				profile.k, _ = strconv.ParseUint(m[5], 10, 64)
				continue
			}
			if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				profile.supported = false
				profile.notes = strings.TrimSpace(m[2])
				continue
			}
			if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				iters, _ := strconv.ParseUint(m[2], 10, 64)
				profile.iterations += iters
			}
		}
	}
	// Sort profile names for deterministic output ordering.
	keys := make([]string, 0, len(profiles))
	for key := range profiles {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		profile := profiles[key]
		precision := BenchmarkPrecisionResult{
			Name:       profile.name,
			Category:   profile.category,
			Supported:  profile.supported,
			Lanes:      profile.lanes,
			M:          profile.m,
			N:          profile.n,
			K:          profile.k,
			Iterations: profile.iterations,
			Notes:      profile.notes,
		}
		if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 {
			precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12
		}
		result.Profiles = append(result.Profiles, precision)
	}
	return result
}

// ensureBenchmarkProfile returns the existing accumulator for name or
// creates one, inferring its category bucket from the name prefix.
func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile {
	if profile, ok := profiles[name]; ok {
		return profile
	}
	category := "other"
	switch {
	case strings.HasPrefix(name, "fp32"):
		category = "fp32_tf32"
	case strings.HasPrefix(name, "fp16"):
		category = "fp16_bf16"
	case strings.HasPrefix(name, "fp8"):
		category = "fp8"
	case strings.HasPrefix(name, "fp4"):
		category = "fp4"
	}
	profile := &benchmarkBurnProfile{name: name, category: category, supported: true}
	profiles[name] = profile
	return profile
}

// stripBenchmarkPrefix removes a leading "[gpu N] " tag from a burn-log
// line, if present, so the remainder can be pattern-matched.
func stripBenchmarkPrefix(line string) string {
	if strings.HasPrefix(line, "[gpu ") {
		if idx := strings.Index(line, "] "); idx >= 0 {
			return line[idx+2:]
		}
	}
	return line
}

// summarizeBenchmarkTelemetry reduces sampled telemetry rows to aggregate
// statistics: means, p95s, coefficients of variation, and clock drift.
// DurationSec is taken from the last row's elapsed stamp.
func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary {
	summary := BenchmarkTelemetrySummary{}
	if len(rows) == 0 {
		return summary
	}
	temps := make([]float64, 0, len(rows))
	powers := make([]float64, 0, len(rows))
	clocks := make([]float64, 0, len(rows))
	memClocks := make([]float64, 0, len(rows))
	usages := make([]float64, 0, len(rows))
	memUsages := make([]float64, 0, len(rows))
	summary.DurationSec = rows[len(rows)-1].ElapsedSec
	summary.Samples = len(rows)
	for _, row := range rows {
		temps = append(temps, row.TempC)
		powers = append(powers, row.PowerW)
		clocks = append(clocks, row.ClockMHz)
		memClocks = append(memClocks, row.MemClockMHz)
		usages = append(usages, row.UsagePct)
		memUsages = append(memUsages, row.MemUsagePct)
	}
	summary.AvgTempC = benchmarkMean(temps)
	summary.P95TempC = benchmarkPercentile(temps, 95)
	summary.AvgPowerW = benchmarkMean(powers)
	summary.P95PowerW = benchmarkPercentile(powers, 95)
	summary.AvgGraphicsClockMHz = benchmarkMean(clocks)
	summary.P95GraphicsClockMHz = benchmarkPercentile(clocks, 95)
	summary.AvgMemoryClockMHz = benchmarkMean(memClocks)
	summary.P95MemoryClockMHz = benchmarkPercentile(memClocks, 95)
	summary.AvgUsagePct = benchmarkMean(usages)
	summary.AvgMemUsagePct = benchmarkMean(memUsages)
	summary.ClockCVPct = benchmarkCV(clocks)
	summary.PowerCVPct = benchmarkCV(powers)
	summary.TempCVPct = benchmarkCV(temps)
	summary.ClockDriftPct = benchmarkClockDrift(clocks)
	return summary
}

// scoreBenchmarkGPUResult derives a scorecard from a GPU's results:
// ComputeScore sums supported-precision throughput; PowerSustainScore is
// steady power as % of the power limit (capped at 100); ThermalSustainScore
// penalizes time spent in thermal slowdown; StabilityScore penalizes
// clock/power variance and drift (weights 4/2/2 appear to be tuned
// empirically). CompositeScore combines them.
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
	score := BenchmarkScorecard{}
	for _, precision := range gpu.PrecisionResults {
		if precision.Supported {
			score.ComputeScore += precision.TeraOpsPerSec
		}
	}
	if gpu.PowerLimitW > 0 {
		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/gpu.PowerLimitW)*100)
	}
	// Throttle counters are in microseconds; floor at 1 to avoid div-by-zero.
	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
	score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
	score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
	score.CompositeScore = compositeBenchmarkScore(score)
	return score
}

// compositeBenchmarkScore scales ComputeScore by a quality multiplier built
// from the sustain/stability sub-scores (0.40 base + up to 0.20 each), with
// a flat 0.10 bonus when an interconnect score exists, capped at 1.10.
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
	quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
	if score.InterconnectScore > 0 {
		quality += 0.10
	}
	if quality > 1.10 {
		quality = 1.10
	}
	return score.ComputeScore * quality
}

// detectBenchmarkDegradationReasons classifies why a GPU underperformed:
// throttle-counter ratios over the steady window (>=5% power cap, >=1%
// thermal or sync-boost), average SM clock below 90% of the lock target,
// stability score under 85, or partial normalization.
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
	var reasons []string
	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
	if float64(gpu.Throttle.SWPowerCapUS)/runtimeUS >= 0.05 {
		reasons = append(reasons, "power_capped")
	}
	if float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS >= 0.01 {
		reasons = append(reasons, "thermal_limited")
	}
	if float64(gpu.Throttle.SyncBoostUS)/runtimeUS >= 0.01 {
		reasons = append(reasons, "sync_boost_limited")
	}
	if gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90 {
		reasons = append(reasons, "low_sm_clock_vs_target")
	}
	if gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85 {
		reasons = append(reasons, "variance_too_high")
	}
	if normalizationStatus != "full" {
		reasons = append(reasons, "normalization_partial")
	}
	return dedupeStrings(reasons)
}

// runBenchmarkInterconnect runs NCCL all_reduce_perf across the selected
// GPUs and parses algorithm/bus bandwidth from its output. On command
// failure the result stays UNSUPPORTED with the command output as a note.
func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult {
	result := &BenchmarkInterconnectResult{
		Status:             "UNSUPPORTED",
		Attempted:          true,
		SelectedGPUIndices: append([]int(nil), gpuIndices...),
	}
	cmd := []string{
		"all_reduce_perf",
		"-b", "512M",
		"-e", "4G",
		"-f", "2",
		"-g", strconv.Itoa(len(gpuIndices)),
		"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
	}
	// Restrict NCCL to the selected devices only.
	env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
	logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
	out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
	_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
	if err != nil {
		result.Notes = append(result.Notes, strings.TrimSpace(string(out)))
		return result
	}
	avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out))
	result.Status = "OK"
	result.Supported = true
	result.AvgAlgBWGBps = avgAlg
	result.MaxAlgBWGBps = maxAlg
	result.AvgBusBWGBps = avgBus
	result.MaxBusBWGBps = maxBus
	return result
}

// parseNCCLAllReduceOutput scans nccl-tests table rows for the
// (time, algbw, busbw) triple by looking for three consecutive numeric
// fields with a positive time, and returns mean/max of each bandwidth.
// NOTE(review): the column offset is not fixed here; this heuristic assumes
// the first positive-number triple in a row is time/algbw/busbw — verify
// against the nccl-tests output format in use.
func parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) {
	lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
	var algs []float64
	var buses []float64
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 8 {
			continue
		}
		for i := 0; i+2 < len(fields); i++ {
			timeVal, err1 := strconv.ParseFloat(fields[i], 64)
			algVal, err2 := strconv.ParseFloat(fields[i+1], 64)
			busVal, err3 := strconv.ParseFloat(fields[i+2], 64)
			if err1 == nil && err2 == nil && err3 == nil && timeVal > 0 {
				algs = append(algs, algVal)
				buses = append(buses, busVal)
				break
			}
		}
	}
	if len(algs) == 0 {
		return 0, 0, 0, 0
	}
	return benchmarkMean(algs), benchmarkMax(algs), benchmarkMean(buses), benchmarkMax(buses)
}

// queryThrottleCounters reads the cumulative clock-event-reason counters
// (microseconds) for one GPU via nvidia-smi CSV output.
func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) {
	out, err := satExecCommand(
		"nvidia-smi",
		"--id="+strconv.Itoa(gpuIndex),
		"--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return BenchmarkThrottleCounters{}, err
	}
	fields := strings.Split(strings.TrimSpace(string(out)), ",")
	if len(fields) < 5 {
		return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out)))
	}
	return BenchmarkThrottleCounters{
		SWPowerCapUS:           parseBenchmarkUint64(fields[0]),
		SWThermalSlowdownUS:    parseBenchmarkUint64(fields[1]),
		SyncBoostUS:            parseBenchmarkUint64(fields[2]),
		HWThermalSlowdownUS:    parseBenchmarkUint64(fields[3]),
		HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]),
	}, nil
}

// diffThrottleCounters returns after-before per counter, saturating at zero
// so a counter reset between snapshots cannot produce underflow.
func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters {
	return BenchmarkThrottleCounters{
		SWPowerCapUS:           saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS),
		SWThermalSlowdownUS:    saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS),
		SyncBoostUS:            saturatingSub(after.SyncBoostUS, before.SyncBoostUS),
		HWThermalSlowdownUS:    saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS),
		HWPowerBrakeSlowdownUS: saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS),
	}
}

// queryActiveComputeApps lists running compute processes (gpu_uuid, pid,
// process_name) on the given GPUs, one CSV line per process.
func queryActiveComputeApps(gpuIndices []int) ([]string, error) {
	args := []string{
		"--query-compute-apps=gpu_uuid,pid,process_name",
		"--format=csv,noheader,nounits",
	}
	if len(gpuIndices) > 0 {
		args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
	}
	out, err := satExecCommand("nvidia-smi", args...).Output()
	if err != nil {
		return nil, err
	}
	var lines []string
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		lines = append(lines, line)
	}
	return lines, nil
}

// finalizeBenchmarkGPUResult fills defaults before a GPU result is appended:
// empty status becomes "OK", and a zero composite score is (re)computed.
func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
	if gpu.Status == "" {
		gpu.Status = "OK"
	}
	if gpu.Scores.CompositeScore == 0 {
		gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores)
	}
	return gpu
}

// buildBenchmarkFindings turns machine-readable statuses and degradation
// reasons into human-readable finding strings, deduplicated.
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
	var findings []string
	if result.Normalization.Status != "full" {
		findings = append(findings, "Environment normalization was partial; compare results with caution.")
	}
	for _, gpu := range result.GPUs {
		if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
			findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
			continue
		}
		for _, reason := range gpu.DegradationReasons {
			switch reason {
			case "power_capped":
				findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
			case "thermal_limited":
				findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
			case "sync_boost_limited":
				findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
			case "low_sm_clock_vs_target":
				findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index))
			case "variance_too_high":
				findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
			case "normalization_partial":
				findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
			}
		}
		if gpu.Backend == "driver-ptx" {
			findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
		}
	}
	if result.Interconnect != nil && result.Interconnect.Supported {
		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
	}
	return dedupeStrings(findings)
}

// benchmarkOverallStatus aggregates per-GPU statuses: FAILED when no GPU is
// OK, PARTIAL when any GPU is PARTIAL/UNSUPPORTED or normalization was not
// full, otherwise OK.
func benchmarkOverallStatus(result NvidiaBenchmarkResult) string {
	if len(result.GPUs) == 0 {
		return "FAILED"
	}
	hasOK := false
	hasPartial := result.Normalization.Status != "full"
	for _, gpu := range result.GPUs {
		switch gpu.Status {
		case "OK":
			hasOK = true
		case "PARTIAL", "UNSUPPORTED":
			hasPartial = true
		}
	}
	if !hasOK {
		return "FAILED"
	}
	if hasPartial {
		return "PARTIAL"
	}
	return "OK"
}

// findBenchmarkNormalization returns a pointer into items for the record
// with the given GPU index, or nil if absent.
func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU {
	for i := range items {
		if items[i].Index == idx {
			return &items[i]
		}
	}
	return nil
}

// classifySATErrorStatus maps a failed command to either UNSUPPORTED (per
// classifySATResult) or FAILED for everything else.
func classifySATErrorStatus(out []byte, err error) string {
	status, _ := classifySATResult("benchmark", out, err)
	if status == "UNSUPPORTED" {
		return "UNSUPPORTED"
	}
	return "FAILED"
}

// parseBenchmarkFloat parses an nvidia-smi numeric field, treating empty,
// "N/A", and "[Not Supported]" (and any parse failure) as 0.
func parseBenchmarkFloat(raw string) float64 {
	raw = strings.TrimSpace(raw)
	if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") {
		return 0
	}
	value, _ := strconv.ParseFloat(raw, 64)
	return value
}

// parseBenchmarkUint64 is the unsigned-integer analog of
// parseBenchmarkFloat, with the same 0 fallback for sentinel values.
func parseBenchmarkUint64(raw string) uint64 {
	raw = strings.TrimSpace(raw)
	if raw == "" || strings.EqualFold(raw, "n/a") || strings.EqualFold(raw, "[not supported]") {
		return 0
	}
	value, _ := strconv.ParseUint(raw, 10, 64)
	return value
}

// benchmarkMean returns the arithmetic mean, or 0 for an empty slice.
func benchmarkMean(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	var sum float64
	for _, value := range values {
		sum += value
	}
	return sum / float64(len(values))
}

// benchmarkPercentile returns the p-th percentile using linear
// interpolation between closest ranks; 0 for empty input. The input slice
// is copied before sorting, so the caller's order is preserved.
func benchmarkPercentile(values []float64, p float64) float64 {
	if len(values) == 0 {
		return 0
	}
	copyValues := append([]float64(nil), values...)
	sort.Float64s(copyValues)
	if len(copyValues) == 1 {
		return copyValues[0]
	}
	rank := (p / 100.0) * float64(len(copyValues)-1)
	lower := int(math.Floor(rank))
	upper := int(math.Ceil(rank))
	if lower == upper {
		return copyValues[lower]
	}
	frac := rank - float64(lower)
	return copyValues[lower] + (copyValues[upper]-copyValues[lower])*frac
}

// benchmarkCV returns the coefficient of variation (population stddev /
// mean) as a percentage; 0 for empty input or zero mean.
func benchmarkCV(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	mean := benchmarkMean(values)
	if mean == 0 {
		return 0
	}
	var variance float64
	for _, value := range values {
		diff := value - mean
		variance += diff * diff
	}
	variance /= float64(len(values))
	return math.Sqrt(variance) / mean * 100
}

// benchmarkClockDrift compares the mean of the first quarter of samples to
// the mean of the last quarter and returns the downward drift as a
// percentage of the head mean. Upward or flat trends return 0; fewer than
// 4 samples return 0.
func benchmarkClockDrift(values []float64) float64 {
	if len(values) < 4 {
		return 0
	}
	window := len(values) / 4
	if window < 1 {
		window = 1
	}
	head := benchmarkMean(values[:window])
	tail := benchmarkMean(values[len(values)-window:])
	if head <= 0 || tail >= head {
		return 0
	}
	return ((head - tail) / head) * 100
}

// benchmarkMax returns the maximum value, or 0 for an empty slice.
func benchmarkMax(values []float64) float64 {
	var max float64
	for i, value := range values {
		if i == 0 || value > max {
			max = value
		}
	}
	return max
}

// clampScore clamps a score into [0, 100].
func clampScore(value float64) float64 {
	switch {
	case value < 0:
		return 0
	case value > 100:
		return 100
	default:
		return value
	}
}

// dedupeStrings trims, drops empties, and removes duplicates while
// preserving first-seen order; returns nil for empty input.
func dedupeStrings(values []string) []string {
	if len(values) == 0 {
		return nil
	}
	seen := make(map[string]struct{}, len(values))
	out := make([]string, 0, len(values))
	for _, value := range values {
		value = strings.TrimSpace(value)
		if value == "" {
			continue
		}
		if _, ok := seen[value]; ok {
			continue
		}
		seen[value] = struct{}{}
		out = append(out, value)
	}
	return out
}

// saturatingSub returns after-before, clamped at 0 on underflow.
func saturatingSub(after, before uint64) uint64 {
	if after <= before {
		return 0
	}
	return after - before
}

// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if a > b {
		return a
	}
	return b
}