// Package platform: NVIDIA GPU benchmark pipeline (bee-gpu-burn runner,
// telemetry sampling, NCCL interconnect measurement, scoring and reporting).
package platform
|
|
|
|
import (
|
|
"context"
|
|
"encoding/csv"
|
|
"encoding/json"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// benchmarkVersion is the schema version stamped into result.json so that
// consumers of archived results can detect format changes.
const benchmarkVersion = "1"
|
|
|
|
// benchmarkProfileSpec describes the phase durations (seconds) of one named
// benchmark profile: idle baseline sampling, warmup burn, steady compute,
// NCCL interconnect run, and cooldown sampling.
type benchmarkProfileSpec struct {
	Name        string
	BaselineSec int
	WarmupSec   int
	SteadySec   int
	NCCLSec     int
	CooldownSec int
}
|
|
|
|
// benchmarkGPUInfo holds static per-GPU inventory queried from
// `nvidia-smi --query-gpu=...` before the benchmark starts.
type benchmarkGPUInfo struct {
	Index               int
	UUID                string
	Name                string
	BusID               string
	VBIOS               string
	PowerLimitW         float64
	MaxGraphicsClockMHz float64 // lock target for nvidia-smi -lgc
	MaxMemoryClockMHz   float64 // lock target for nvidia-smi -lmc
}
|
|
|
|
// benchmarkBurnProfile accumulates per-precision state while parsing
// bee-gpu-burn output (READY / SKIPPED / *_iterations lines).
type benchmarkBurnProfile struct {
	name       string
	category   string // coarse precision family, e.g. "fp32_tf32"
	supported  bool   // false once a SKIPPED line is seen
	lanes      int    // number of READY lanes observed
	m          uint64 // GEMM dimensions from the last READY line
	n          uint64
	k          uint64
	iterations uint64 // summed across lanes
	notes      string // skip reason when SKIPPED
}
|
|
|
|
// benchmarkBurnParseResult is the structured view of one bee-gpu-burn log.
type benchmarkBurnParseResult struct {
	Device            string
	ComputeCapability string
	Backend           string
	DurationSec       int
	Profiles          []BenchmarkPrecisionResult
	// Fallback is true when the backend reported "driver-ptx"; tensor
	// throughput numbers are then not comparable to native-backend runs.
	Fallback bool
}
|
|
|
|
// benchmarkRestoreAction is a named cleanup step (e.g. resetting a clock
// lock) that is executed in reverse registration order after the run.
type benchmarkRestoreAction struct {
	name string
	fn   func()
}
|
|
|
|
var (
	// benchmarkReadyPattern matches bee-gpu-burn lane-ready lines such as
	// "fp16_tensor[0]=READY dim=4096x4096x4096".
	benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
	// benchmarkSkippedPattern matches "name=SKIPPED reason" (optional lane index).
	benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
	// benchmarkIterationsPattern matches "name_iterations=N" summary lines.
	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
)
|
|
|
|
// RunNvidiaBenchmark runs the full NVIDIA GPU benchmark pipeline: it
// normalizes options, selects GPUs, applies clock/persistence normalization
// (undone on exit), runs baseline/warmup/steady/cooldown phases per GPU and
// an optional multi-GPU NCCL phase, scores the results, and writes
// result.json, report.txt, summary.txt plus raw logs into a timestamped run
// directory that is finally packed into a .tar.gz. Returns the archive path.
//
// ctx and logFunc may be nil; an empty baseDir defaults to
// /var/log/bee-benchmark.
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	// Defensive defaults so callers can pass nil/empty values.
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-benchmark"
	}
	spec := resolveBenchmarkProfile(opts.Profile)
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)

	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}

	// One timestamped directory per run; the same stamp names the archive.
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "gpu-benchmark-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")

	hostname, _ := os.Hostname() // best effort; empty hostname is acceptable
	result := NvidiaBenchmarkResult{
		BenchmarkVersion:   benchmarkVersion,
		GeneratedAt:        time.Now().UTC(),
		Hostname:           hostname,
		BenchmarkProfile:   spec.Name,
		SelectedGPUIndices: append([]int(nil), selected...), // defensive copy
		Normalization: BenchmarkNormalization{
			Status: "full", // downgraded to "partial" as issues are found
		},
	}

	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))

	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
		result.Normalization.Status = "partial"
	}

	// Capture a full nvidia-smi -q dump for offline analysis (best effort).
	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
	}

	// Competing compute workloads would skew the measurements; record them.
	activeApps, err := queryActiveComputeApps(selected)
	if err == nil && len(activeApps) > 0 {
		result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark")
		result.Normalization.Notes = append(result.Normalization.Notes, activeApps...)
		result.Normalization.Status = "partial"
	}

	// Lock clocks / enable persistence; undo everything (reverse order) on exit.
	restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result)
	defer func() {
		for i := len(restoreActions) - 1; i >= 0; i-- {
			restoreActions[i].fn()
		}
	}()

	for _, idx := range selected {
		gpuResult := BenchmarkGPUResult{
			Index:  idx,
			Status: "FAILED", // pessimistic default, upgraded below
		}
		if info, ok := infoByIndex[idx]; ok {
			gpuResult.UUID = info.UUID
			gpuResult.Name = info.Name
			gpuResult.BusID = info.BusID
			gpuResult.VBIOS = info.VBIOS
			gpuResult.PowerLimitW = info.PowerLimitW
			gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
			gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
		}
		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
			gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
			gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
		}

		// Phase 1: idle baseline telemetry.
		baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
		if err != nil && err != context.Canceled {
			gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
		}
		gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)

		// Phase 2: warmup burn so the steady phase starts thermally settled.
		warmupCmd := []string{
			"bee-gpu-burn",
			"--seconds", strconv.Itoa(spec.WarmupSec),
			"--size-mb", strconv.Itoa(opts.SizeMB),
			"--devices", strconv.Itoa(idx),
		}
		logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
		warmupOut, _, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-warmup", idx), logFunc)
		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-warmup.log", idx)), warmupOut, 0644)
		if warmupErr != nil {
			// Without a successful warmup the remaining phases are skipped.
			gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
			result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
			continue
		}

		// Phase 3: steady compute, with throttle counters snapshotted
		// immediately before and after so the delta covers only this phase.
		beforeThrottle, _ := queryThrottleCounters(idx)
		steadyCmd := []string{
			"bee-gpu-burn",
			"--seconds", strconv.Itoa(spec.SteadySec),
			"--size-mb", strconv.Itoa(opts.SizeMB),
			"--devices", strconv.Itoa(idx),
		}
		logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
		steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
		afterThrottle, _ := queryThrottleCounters(idx)
		if steadyErr != nil {
			gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
		}

		// Parse the burn log even on error: partial output is still useful.
		parseResult := parseBenchmarkBurnLog(string(steadyOut))
		gpuResult.ComputeCapability = parseResult.ComputeCapability
		gpuResult.Backend = parseResult.Backend
		gpuResult.PrecisionResults = parseResult.Profiles
		if parseResult.Fallback {
			gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
		}

		gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
		gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)

		// Phase 4: cooldown telemetry.
		cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
		if err != nil && err != context.Canceled {
			gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
		}
		gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), cooldownRows)

		gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
		gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
		if steadyErr != nil {
			gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
		} else if parseResult.Fallback {
			gpuResult.Status = "PARTIAL"
		} else {
			gpuResult.Status = "OK"
		}

		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
	}

	// Optional multi-GPU NCCL all_reduce phase; on success its bandwidth
	// feeds back into each GPU's composite score.
	if len(selected) > 1 && opts.RunNCCL {
		result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
		if result.Interconnect != nil && result.Interconnect.Supported {
			for i := range result.GPUs {
				result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps
				result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores)
			}
		}
	}

	result.Findings = buildBenchmarkFindings(result)
	result.OverallStatus = benchmarkOverallStatus(result)

	// Persist machine-readable and human-readable outputs.
	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return "", fmt.Errorf("marshal benchmark result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return "", fmt.Errorf("write result.json: %w", err)
	}

	report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
	if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
		return "", fmt.Errorf("write report.txt: %w", err)
	}

	summary := renderBenchmarkSummary(result)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
		return "", fmt.Errorf("write summary.txt: %w", err)
	}

	archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz")
	if err := createTarGz(archive, runDir); err != nil {
		return "", fmt.Errorf("pack benchmark archive: %w", err)
	}
	return archive, nil
}
|
|
|
|
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
|
|
switch strings.TrimSpace(strings.ToLower(opts.Profile)) {
|
|
case NvidiaBenchmarkProfileStability:
|
|
opts.Profile = NvidiaBenchmarkProfileStability
|
|
case NvidiaBenchmarkProfileOvernight:
|
|
opts.Profile = NvidiaBenchmarkProfileOvernight
|
|
default:
|
|
opts.Profile = NvidiaBenchmarkProfileStandard
|
|
}
|
|
if opts.SizeMB < 0 {
|
|
opts.SizeMB = 0
|
|
}
|
|
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
|
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
|
return opts
|
|
}
|
|
|
|
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
|
switch strings.TrimSpace(strings.ToLower(profile)) {
|
|
case NvidiaBenchmarkProfileStability:
|
|
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}
|
|
case NvidiaBenchmarkProfileOvernight:
|
|
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}
|
|
default:
|
|
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}
|
|
}
|
|
}
|
|
|
|
// queryBenchmarkGPUInfo fetches static inventory (UUID, name, bus ID, VBIOS,
// power limit, max clocks) for the given GPUs via nvidia-smi CSV output,
// keyed by GPU index. An empty index list queries all GPUs. Rows that are
// too short or carry an unparsable index are skipped silently.
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
	args := []string{
		"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory",
		"--format=csv,noheader,nounits",
	}
	if len(gpuIndices) > 0 {
		// Restrict the query to the selected GPUs only.
		args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
	}
	out, err := satExecCommand("nvidia-smi", args...).Output()
	if err != nil {
		return nil, fmt.Errorf("nvidia-smi gpu info: %w", err)
	}

	r := csv.NewReader(strings.NewReader(string(out)))
	r.TrimLeadingSpace = true
	r.FieldsPerRecord = -1 // tolerate ragged rows; length is validated per row
	rows, err := r.ReadAll()
	if err != nil {
		return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err)
	}

	infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
	for _, row := range rows {
		if len(row) < 8 {
			continue
		}
		idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
		if err != nil {
			continue
		}
		infoByIndex[idx] = benchmarkGPUInfo{
			Index:  idx,
			UUID:   strings.TrimSpace(row[1]),
			Name:   strings.TrimSpace(row[2]),
			BusID:  strings.TrimSpace(row[3]),
			VBIOS:  strings.TrimSpace(row[4]),
			// Numeric fields map "N/A"/"[Not Supported]" to 0.
			PowerLimitW:         parseBenchmarkFloat(row[5]),
			MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
			MaxMemoryClockMHz:   parseBenchmarkFloat(row[7]),
		}
	}
	return infoByIndex, nil
}
|
|
|
|
// applyBenchmarkNormalization prepares the selected GPUs for reproducible
// measurements: it enables persistence mode and locks graphics/memory clocks
// to the advertised maximums. Outcomes are recorded per GPU in
// result.Normalization (Status is downgraded to "partial" on any failure),
// and the returned restore actions undo the clock locks; the caller runs
// them in reverse order. Without root privileges nothing is changed and nil
// is returned.
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
	// nvidia-smi -pm/-lgc/-lmc require root; record the skip and bail out.
	if os.Geteuid() != 0 {
		result.Normalization.Status = "partial"
		result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks")
		for _, idx := range gpuIndices {
			result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{
				Index: idx,
				Notes: []string{"normalization skipped: root privileges are required"},
			})
		}
		return nil
	}

	var restore []benchmarkRestoreAction
	for _, idx := range gpuIndices {
		rec := BenchmarkNormalizationGPU{Index: idx}
		// Enable persistence mode (best effort; failure only downgrades status).
		if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil {
			rec.PersistenceMode = "failed"
			rec.Notes = append(rec.Notes, "failed to enable persistence mode")
			result.Normalization.Status = "partial"
		} else {
			rec.PersistenceMode = "applied"
		}

		// Lock the graphics clock (-lgc) to the reported maximum and queue a
		// reset (-rgc) so the lock never outlives the benchmark.
		if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 {
			target := int(math.Round(info.MaxGraphicsClockMHz))
			if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil {
				rec.GPUClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			} else {
				rec.GPUClockLockStatus = "applied"
				rec.GPUClockLockMHz = float64(target)
				idxCopy := idx // capture the per-iteration value for the closure
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() {
					// Fresh context: restore must run even if ctx was canceled.
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
				}})
			}
		}

		// Lock the memory clock (-lmc); some GPU/driver combinations refuse
		// ("deferred" / "not supported"), which is recorded as unsupported
		// rather than failed.
		if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
			target := int(math.Round(info.MaxMemoryClockMHz))
			out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil)
			switch {
			case err == nil:
				rec.MemoryClockLockStatus = "applied"
				rec.MemoryClockLockMHz = float64(target)
				idxCopy := idx
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() {
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil)
				}})
			case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"):
				rec.MemoryClockLockStatus = "unsupported"
				rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path")
				result.Normalization.Status = "partial"
			default:
				rec.MemoryClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			}
		}

		result.Normalization.GPUs = append(result.Normalization.GPUs, rec)
	}
	return restore
}
|
|
|
|
// collectBenchmarkSamples polls GPU telemetry roughly once per second for
// durationSec seconds and returns the rows with ElapsedSec set relative to
// the start of sampling. On context cancellation it returns the rows
// gathered so far together with ctx.Err(). Individual sampling failures are
// ignored (that tick simply contributes no rows).
func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) {
	if durationSec <= 0 {
		return nil, nil
	}
	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
	var rows []GPUMetricRow
	start := time.Now()
	for {
		if ctx.Err() != nil {
			return rows, ctx.Err()
		}
		samples, err := sampleGPUMetrics(gpuIndices)
		if err == nil {
			elapsed := time.Since(start).Seconds()
			for i := range samples {
				samples[i].ElapsedSec = elapsed
			}
			rows = append(rows, samples...)
		}
		if time.Now().After(deadline) {
			break
		}
		// Sleep one second between samples, aborting early on cancellation.
		select {
		case <-ctx.Done():
			return rows, ctx.Err()
		case <-time.After(time.Second):
		}
	}
	return rows, nil
}
|
|
|
|
// runBenchmarkCommandWithMetrics runs cmd while a background goroutine
// samples GPU telemetry once per second. When the command finishes the
// sampler is stopped and joined, the metrics are written to runDir under
// baseName (CSV/HTML/terminal chart), and the command output, the collected
// rows and the command error are returned.
func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir, baseName string, logFunc func(string)) ([]byte, []GPUMetricRow, error) {
	stopCh := make(chan struct{}) // closed to ask the sampler to stop
	doneCh := make(chan struct{}) // closed when the sampler has exited
	var metricRows []GPUMetricRow
	start := time.Now()

	go func() {
		defer close(doneCh)
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				return
			case <-ticker.C:
				samples, err := sampleGPUMetrics(gpuIndices)
				if err != nil {
					continue // transient sampling failure: skip this tick
				}
				elapsed := time.Since(start).Seconds()
				for i := range samples {
					samples[i].ElapsedSec = elapsed
				}
				metricRows = append(metricRows, samples...)
			}
		}
	}()

	out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
	// Stop the sampler and wait for it; the close/receive handshake also
	// makes metricRows safe to read from this goroutine afterwards.
	close(stopCh)
	<-doneCh

	writeBenchmarkMetricsFiles(runDir, baseName, metricRows)
	return out, metricRows, err
}
|
|
|
|
func writeBenchmarkMetricsFiles(runDir, baseName string, rows []GPUMetricRow) {
|
|
if len(rows) == 0 {
|
|
return
|
|
}
|
|
_ = WriteGPUMetricsCSV(filepath.Join(runDir, baseName+"-metrics.csv"), rows)
|
|
_ = WriteGPUMetricsHTML(filepath.Join(runDir, baseName+"-metrics.html"), rows)
|
|
chart := RenderGPUTerminalChart(rows)
|
|
_ = os.WriteFile(filepath.Join(runDir, baseName+"-metrics-term.txt"), []byte(chart), 0644)
|
|
}
|
|
|
|
// parseBenchmarkBurnLog parses bee-gpu-burn output into structured
// per-precision results. It recognizes key=value header lines (device,
// compute_capability, backend, duration_s) and the READY / SKIPPED /
// *_iterations lines matched by the benchmark*Pattern regexps. Throughput
// is derived as 2*M*N*K*iterations / duration (in tera-ops/s) for profiles
// with complete data. Profiles are emitted in sorted name order.
func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
	result := benchmarkBurnParseResult{}
	lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
	profiles := make(map[string]*benchmarkBurnProfile)
	for _, line := range lines {
		// Drop the optional "[gpu N] " prefix before matching.
		line = stripBenchmarkPrefix(strings.TrimSpace(line))
		if line == "" {
			continue
		}
		switch {
		case strings.HasPrefix(line, "device="):
			result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device="))
		case strings.HasPrefix(line, "compute_capability="):
			result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability="))
		case strings.HasPrefix(line, "backend="):
			result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend="))
			// "driver-ptx" marks the JIT fallback path; flag it so scores
			// are not compared against native-backend runs.
			result.Fallback = result.Backend == "driver-ptx"
		case strings.HasPrefix(line, "duration_s="):
			result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s=")))
		default:
			if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				profile.supported = true
				profile.lanes++
				// The last READY line seen wins for the GEMM dimensions.
				profile.m, _ = strconv.ParseUint(m[3], 10, 64)
				profile.n, _ = strconv.ParseUint(m[4], 10, 64)
				profile.k, _ = strconv.ParseUint(m[5], 10, 64)
				continue
			}
			if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				profile.supported = false
				profile.notes = strings.TrimSpace(m[2])
				continue
			}
			if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				iters, _ := strconv.ParseUint(m[2], 10, 64)
				profile.iterations += iters // summed across lanes
			}
		}
	}

	// Sort keys for deterministic output regardless of map iteration order.
	keys := make([]string, 0, len(profiles))
	for key := range profiles {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		profile := profiles[key]
		precision := BenchmarkPrecisionResult{
			Name:       profile.name,
			Category:   profile.category,
			Supported:  profile.supported,
			Lanes:      profile.lanes,
			M:          profile.m,
			N:          profile.n,
			K:          profile.k,
			Iterations: profile.iterations,
			Notes:      profile.notes,
		}
		if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 {
			// GEMM cost is 2*M*N*K ops per iteration; scale to tera-ops/s.
			precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12
		}
		result.Profiles = append(result.Profiles, precision)
	}
	return result
}
|
|
|
|
func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile {
|
|
if profile, ok := profiles[name]; ok {
|
|
return profile
|
|
}
|
|
category := "other"
|
|
switch {
|
|
case strings.HasPrefix(name, "fp32"):
|
|
category = "fp32_tf32"
|
|
case strings.HasPrefix(name, "fp16"):
|
|
category = "fp16_bf16"
|
|
case strings.HasPrefix(name, "fp8"):
|
|
category = "fp8"
|
|
case strings.HasPrefix(name, "fp4"):
|
|
category = "fp4"
|
|
}
|
|
profile := &benchmarkBurnProfile{name: name, category: category, supported: true}
|
|
profiles[name] = profile
|
|
return profile
|
|
}
|
|
|
|
// stripBenchmarkPrefix removes a leading "[gpu N] " tag from a bee-gpu-burn
// log line. Lines without the tag (or without the closing "] " separator)
// are returned unchanged.
func stripBenchmarkPrefix(line string) string {
	if !strings.HasPrefix(line, "[gpu ") {
		return line
	}
	if _, rest, found := strings.Cut(line, "] "); found {
		return rest
	}
	return line
}
|
|
|
|
// summarizeBenchmarkTelemetry reduces a telemetry series to a summary:
// means and 95th percentiles for temperature/power/clocks/usage,
// coefficients of variation, and clock drift. DurationSec is taken from
// the last row's ElapsedSec. Returns the zero summary for an empty series.
func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary {
	summary := BenchmarkTelemetrySummary{}
	if len(rows) == 0 {
		return summary
	}
	// Column-wise extraction so each statistic helper gets a flat series.
	temps := make([]float64, 0, len(rows))
	powers := make([]float64, 0, len(rows))
	clocks := make([]float64, 0, len(rows))
	memClocks := make([]float64, 0, len(rows))
	usages := make([]float64, 0, len(rows))
	memUsages := make([]float64, 0, len(rows))
	summary.DurationSec = rows[len(rows)-1].ElapsedSec
	summary.Samples = len(rows)
	for _, row := range rows {
		temps = append(temps, row.TempC)
		powers = append(powers, row.PowerW)
		clocks = append(clocks, row.ClockMHz)
		memClocks = append(memClocks, row.MemClockMHz)
		usages = append(usages, row.UsagePct)
		memUsages = append(memUsages, row.MemUsagePct)
	}
	summary.AvgTempC = benchmarkMean(temps)
	summary.P95TempC = benchmarkPercentile(temps, 95)
	summary.AvgPowerW = benchmarkMean(powers)
	summary.P95PowerW = benchmarkPercentile(powers, 95)
	summary.AvgGraphicsClockMHz = benchmarkMean(clocks)
	summary.P95GraphicsClockMHz = benchmarkPercentile(clocks, 95)
	summary.AvgMemoryClockMHz = benchmarkMean(memClocks)
	summary.P95MemoryClockMHz = benchmarkPercentile(memClocks, 95)
	summary.AvgUsagePct = benchmarkMean(usages)
	summary.AvgMemUsagePct = benchmarkMean(memUsages)
	summary.ClockCVPct = benchmarkCV(clocks)
	summary.PowerCVPct = benchmarkCV(powers)
	summary.TempCVPct = benchmarkCV(temps)
	summary.ClockDriftPct = benchmarkClockDrift(clocks)
	return summary
}
|
|
|
|
// scoreBenchmarkGPUResult computes the per-GPU scorecard:
//   - ComputeScore: sum of TOPS across all supported precisions;
//   - PowerSustainScore: steady avg power as % of power limit, capped at 100;
//   - ThermalSustainScore: 100 minus the fraction of steady time spent in
//     HW/SW thermal slowdown (clamped via clampScore);
//   - StabilityScore: 100 penalized by clock/power variation and drift;
//   - CompositeScore: compute score scaled by the quality factor.
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
	score := BenchmarkScorecard{}
	for _, precision := range gpu.PrecisionResults {
		if precision.Supported {
			score.ComputeScore += precision.TeraOpsPerSec
		}
	}
	if gpu.PowerLimitW > 0 {
		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/gpu.PowerLimitW)*100)
	}
	// Throttle counters are in microseconds; compare against the steady
	// runtime in µs, floored at 1 to avoid division by zero.
	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
	score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
	score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
	score.CompositeScore = compositeBenchmarkScore(score)
	return score
}
|
|
|
|
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
|
|
quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
|
|
if score.InterconnectScore > 0 {
|
|
quality += 0.10
|
|
}
|
|
if quality > 1.10 {
|
|
quality = 1.10
|
|
}
|
|
return score.ComputeScore * quality
|
|
}
|
|
|
|
// detectBenchmarkDegradationReasons returns machine-readable tags for
// conditions that degraded the steady-state run: throttle-counter ratios
// above fixed thresholds (5% power cap, 1% thermal / sync boost), an
// average SM clock more than 10% below the lock target, a stability score
// below 85, or incomplete normalization. Duplicates are removed.
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
	var reasons []string
	// Counters are microseconds spent throttled; normalize by the steady
	// runtime in µs (floored at 1 to avoid division by zero).
	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
	if float64(gpu.Throttle.SWPowerCapUS)/runtimeUS >= 0.05 {
		reasons = append(reasons, "power_capped")
	}
	if float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS >= 0.01 {
		reasons = append(reasons, "thermal_limited")
	}
	if float64(gpu.Throttle.SyncBoostUS)/runtimeUS >= 0.01 {
		reasons = append(reasons, "sync_boost_limited")
	}
	if gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90 {
		reasons = append(reasons, "low_sm_clock_vs_target")
	}
	// StabilityScore == 0 means "not computed", so it is excluded.
	if gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85 {
		reasons = append(reasons, "variance_too_high")
	}
	if normalizationStatus != "full" {
		reasons = append(reasons, "normalization_partial")
	}
	return dedupeStrings(reasons)
}
|
|
|
|
// runBenchmarkInterconnect runs nccl-tests all_reduce_perf across the
// selected GPUs (message sizes 512M to 4G, doubling each step) and parses
// algorithm/bus bandwidths from its output. On command failure the result
// keeps status UNSUPPORTED with the trimmed output recorded as a note.
func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult {
	result := &BenchmarkInterconnectResult{
		Status:             "UNSUPPORTED",
		Attempted:          true,
		SelectedGPUIndices: append([]int(nil), gpuIndices...), // defensive copy
	}
	cmd := []string{
		"all_reduce_perf",
		"-b", "512M",
		"-e", "4G",
		"-f", "2", // double the message size each step
		"-g", strconv.Itoa(len(gpuIndices)),
		// Iteration count scales with the profile's NCCL budget, min 20.
		"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
	}
	// Pin device ordering and visibility so NCCL sees exactly the selection.
	env := []string{
		"CUDA_DEVICE_ORDER=PCI_BUS_ID",
		"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
	}
	logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
	out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
	_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
	if err != nil {
		result.Notes = append(result.Notes, strings.TrimSpace(string(out)))
		return result
	}
	avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out))
	result.Status = "OK"
	result.Supported = true
	result.AvgAlgBWGBps = avgAlg
	result.MaxAlgBWGBps = maxAlg
	result.AvgBusBWGBps = avgBus
	result.MaxBusBWGBps = maxBus
	return result
}
|
|
|
|
// parseNCCLAllReduceOutput extracts algorithm and bus bandwidth values from
// nccl-tests all_reduce_perf table rows and returns their mean and max.
// Comment lines ('#') and short lines are skipped. Within each row it takes
// the first run of three consecutive numeric fields whose first value is
// positive — intended to be the (time, algbw, busbw) triple — which
// tolerates column-layout differences between nccl-tests versions. Returns
// all zeros when nothing parsed.
func parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) {
	lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
	var algs []float64
	var buses []float64
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 8 {
			continue // data rows have many columns; anything shorter is noise
		}
		for i := 0; i+2 < len(fields); i++ {
			timeVal, err1 := strconv.ParseFloat(fields[i], 64)
			algVal, err2 := strconv.ParseFloat(fields[i+1], 64)
			busVal, err3 := strconv.ParseFloat(fields[i+2], 64)
			if err1 == nil && err2 == nil && err3 == nil && timeVal > 0 {
				algs = append(algs, algVal)
				buses = append(buses, busVal)
				break // one triple per row
			}
		}
	}
	if len(algs) == 0 {
		return 0, 0, 0, 0
	}
	return benchmarkMean(algs), benchmarkMax(algs), benchmarkMean(buses), benchmarkMax(buses)
}
|
|
|
|
// queryThrottleCounters reads the cumulative clocks-event (throttle)
// counters for one GPU via nvidia-smi. The values are cumulative; callers
// diff two snapshots (see diffThrottleCounters) to measure a window.
// Counters reported as N/A or unsupported parse as zero.
func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) {
	out, err := satExecCommand(
		"nvidia-smi",
		"--id="+strconv.Itoa(gpuIndex),
		"--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return BenchmarkThrottleCounters{}, err
	}
	// Single CSV row of numeric fields; a plain comma split suffices.
	fields := strings.Split(strings.TrimSpace(string(out)), ",")
	if len(fields) < 5 {
		return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out)))
	}
	return BenchmarkThrottleCounters{
		SWPowerCapUS:           parseBenchmarkUint64(fields[0]),
		SWThermalSlowdownUS:    parseBenchmarkUint64(fields[1]),
		SyncBoostUS:            parseBenchmarkUint64(fields[2]),
		HWThermalSlowdownUS:    parseBenchmarkUint64(fields[3]),
		HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]),
	}, nil
}
|
|
|
|
func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters {
|
|
return BenchmarkThrottleCounters{
|
|
SWPowerCapUS: saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS),
|
|
SWThermalSlowdownUS: saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS),
|
|
SyncBoostUS: saturatingSub(after.SyncBoostUS, before.SyncBoostUS),
|
|
HWThermalSlowdownUS: saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS),
|
|
HWPowerBrakeSlowdownUS: saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS),
|
|
}
|
|
}
|
|
|
|
func queryActiveComputeApps(gpuIndices []int) ([]string, error) {
|
|
args := []string{
|
|
"--query-compute-apps=gpu_uuid,pid,process_name",
|
|
"--format=csv,noheader,nounits",
|
|
}
|
|
if len(gpuIndices) > 0 {
|
|
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
|
|
}
|
|
out, err := satExecCommand("nvidia-smi", args...).Output()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var lines []string
|
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
continue
|
|
}
|
|
lines = append(lines, line)
|
|
}
|
|
return lines, nil
|
|
}
|
|
|
|
func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
|
|
if gpu.Status == "" {
|
|
gpu.Status = "OK"
|
|
}
|
|
if gpu.Scores.CompositeScore == 0 {
|
|
gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores)
|
|
}
|
|
return gpu
|
|
}
|
|
|
|
// buildBenchmarkFindings converts the structured result into human-readable
// finding sentences: a caution when normalization was partial, one sentence
// per degradation reason per GPU (or a positive sentence for clean OK
// GPUs), a note for PTX-fallback runs, and the interconnect bandwidth when
// measured. Duplicate sentences are removed.
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
	var findings []string
	if result.Normalization.Status != "full" {
		findings = append(findings, "Environment normalization was partial; compare results with caution.")
	}
	for _, gpu := range result.GPUs {
		if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
			findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
			continue
		}
		// Map each machine-readable tag to its explanatory sentence;
		// unknown tags are silently skipped.
		for _, reason := range gpu.DegradationReasons {
			switch reason {
			case "power_capped":
				findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
			case "thermal_limited":
				findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
			case "sync_boost_limited":
				findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
			case "low_sm_clock_vs_target":
				findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index))
			case "variance_too_high":
				findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
			case "normalization_partial":
				findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
			}
		}
		if gpu.Backend == "driver-ptx" {
			findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
		}
	}
	if result.Interconnect != nil && result.Interconnect.Supported {
		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
	}
	return dedupeStrings(findings)
}
|
|
|
|
func benchmarkOverallStatus(result NvidiaBenchmarkResult) string {
|
|
if len(result.GPUs) == 0 {
|
|
return "FAILED"
|
|
}
|
|
hasOK := false
|
|
hasPartial := result.Normalization.Status != "full"
|
|
for _, gpu := range result.GPUs {
|
|
switch gpu.Status {
|
|
case "OK":
|
|
hasOK = true
|
|
case "PARTIAL", "UNSUPPORTED":
|
|
hasPartial = true
|
|
}
|
|
}
|
|
if !hasOK {
|
|
return "FAILED"
|
|
}
|
|
if hasPartial {
|
|
return "PARTIAL"
|
|
}
|
|
return "OK"
|
|
}
|
|
|
|
func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU {
|
|
for i := range items {
|
|
if items[i].Index == idx {
|
|
return &items[i]
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func classifySATErrorStatus(out []byte, err error) string {
|
|
status, _ := classifySATResult("benchmark", out, err)
|
|
if status == "UNSUPPORTED" {
|
|
return "UNSUPPORTED"
|
|
}
|
|
return "FAILED"
|
|
}
|
|
|
|
// parseBenchmarkFloat converts an nvidia-smi style field to a float64.
// Blank input and the "N/A" / "[Not Supported]" sentinels map to 0; parse
// errors are deliberately ignored (best effort), so unparsable text also
// yields ParseFloat's zero result.
func parseBenchmarkFloat(raw string) float64 {
	trimmed := strings.TrimSpace(raw)
	switch {
	case trimmed == "",
		strings.EqualFold(trimmed, "n/a"),
		strings.EqualFold(trimmed, "[not supported]"):
		return 0
	}
	// Keep ParseFloat's value even on error to match the original
	// best-effort semantics (e.g. out-of-range input still returns ±Inf).
	v, _ := strconv.ParseFloat(trimmed, 64)
	return v
}
|
|
|
|
// parseBenchmarkUint64 converts an nvidia-smi style field to a uint64.
// Blank input and the "N/A" / "[Not Supported]" sentinels map to 0; parse
// errors are deliberately ignored (best effort), mirroring parseBenchmarkFloat.
func parseBenchmarkUint64(raw string) uint64 {
	trimmed := strings.TrimSpace(raw)
	switch {
	case trimmed == "",
		strings.EqualFold(trimmed, "n/a"),
		strings.EqualFold(trimmed, "[not supported]"):
		return 0
	}
	// Keep ParseUint's value even on error to preserve the original
	// best-effort semantics (out-of-range input returns MaxUint64).
	v, _ := strconv.ParseUint(trimmed, 10, 64)
	return v
}
|
|
|
|
// benchmarkMean returns the arithmetic mean of values, or 0 for an empty slice.
func benchmarkMean(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	total := 0.0
	for _, v := range values {
		total += v
	}
	return total / float64(len(values))
}
|
|
|
|
// benchmarkPercentile returns the p-th percentile of values using linear
// interpolation between closest ranks (the "exclusive rank" scheme over
// len-1 intervals). The input slice is not modified; an empty slice yields 0.
// p is clamped to [0, 100].
func benchmarkPercentile(values []float64, p float64) float64 {
	if len(values) == 0 {
		return 0
	}
	// Clamp p so out-of-range requests cannot index past the slice bounds:
	// previously p > 100 made `upper` exceed len-1 and panicked, and p < 0
	// produced a negative rank.
	if p < 0 {
		p = 0
	} else if p > 100 {
		p = 100
	}
	sorted := append([]float64(nil), values...) // copy so callers' order is preserved
	sort.Float64s(sorted)
	if len(sorted) == 1 {
		return sorted[0]
	}
	rank := (p / 100.0) * float64(len(sorted)-1)
	lower := int(math.Floor(rank))
	upper := int(math.Ceil(rank))
	if lower == upper {
		return sorted[lower]
	}
	// Interpolate between the two bracketing order statistics.
	frac := rank - float64(lower)
	return sorted[lower] + (sorted[upper]-sorted[lower])*frac
}
|
|
|
|
// benchmarkCV returns the coefficient of variation of values as a percentage:
// population standard deviation divided by the mean, times 100. It returns 0
// for an empty slice or when the mean is 0 (division guard).
func benchmarkCV(values []float64) float64 {
	n := float64(len(values))
	if n == 0 {
		return 0
	}
	var sum float64
	for _, v := range values {
		sum += v
	}
	mean := sum / n
	if mean == 0 {
		return 0
	}
	var sumSq float64
	for _, v := range values {
		d := v - mean
		sumSq += d * d
	}
	// Population (not sample) standard deviation, as in the original.
	return math.Sqrt(sumSq/n) / mean * 100
}
|
|
|
|
// benchmarkClockDrift measures clock decay over a run as a percentage: it
// compares the mean of the first quarter of samples against the mean of the
// last quarter. It returns 0 when there are fewer than 4 samples, when the
// head mean is non-positive, or when clocks did not drop (tail >= head).
func benchmarkClockDrift(values []float64) float64 {
	if len(values) < 4 {
		return 0
	}
	// len >= 4 guarantees window >= 1.
	window := len(values) / 4
	mean := func(vs []float64) float64 {
		var s float64
		for _, v := range vs {
			s += v
		}
		return s / float64(len(vs))
	}
	head := mean(values[:window])
	tail := mean(values[len(values)-window:])
	if head <= 0 || tail >= head {
		return 0
	}
	return (head - tail) / head * 100
}
|
|
|
|
// benchmarkMax returns the largest element of values, or 0 for an empty slice.
func benchmarkMax(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	best := values[0]
	for _, v := range values[1:] {
		if v > best {
			best = v
		}
	}
	return best
}
|
|
|
|
// clampScore limits a score to the inclusive range [0, 100].
func clampScore(value float64) float64 {
	if value < 0 {
		return 0
	}
	if value > 100 {
		return 100
	}
	return value
}
|
|
|
|
// dedupeStrings trims each entry, drops blanks, and returns the remaining
// values in first-seen order with duplicates removed. It returns nil when the
// input slice is empty.
func dedupeStrings(values []string) []string {
	if len(values) == 0 {
		return nil
	}
	seen := make(map[string]struct{}, len(values))
	result := make([]string, 0, len(values))
	for _, raw := range values {
		trimmed := strings.TrimSpace(raw)
		if _, dup := seen[trimmed]; trimmed == "" || dup {
			continue
		}
		seen[trimmed] = struct{}{}
		result = append(result, trimmed)
	}
	return result
}
|
|
|
|
// saturatingSub returns after-before, clamped to 0 when the counter did not
// increase (guards against unsigned wrap-around if a counter reset between
// samples).
func saturatingSub(after, before uint64) uint64 {
	if after > before {
		return after - before
	}
	return 0
}
|
|
|
|
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b > a {
		return b
	}
	return a
}
|