// Package platform: NVIDIA GPU benchmark pipeline (bee-gpu-burn runner,
// telemetry sampling, NCCL interconnect measurement, scoring and reporting).
package platform
|
|
|
|
import (
|
|
"context"
|
|
"encoding/csv"
|
|
"encoding/json"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// benchmarkVersion is the schema version stamped into result.json so that
// consumers of archived results can detect format changes.
const benchmarkVersion = "1"
|
|
|
|
// benchmarkProfileSpec describes the phase durations (seconds) of one named
// benchmark profile: idle baseline sampling, warmup burn, steady compute,
// NCCL interconnect run, and cooldown sampling.
type benchmarkProfileSpec struct {
	Name        string
	BaselineSec int
	WarmupSec   int
	SteadySec   int
	NCCLSec     int
	CooldownSec int
}
|
|
|
|
// benchmarkGPUInfo holds static per-GPU inventory queried from
// `nvidia-smi --query-gpu=...` before the benchmark starts.
type benchmarkGPUInfo struct {
	Index               int
	UUID                string
	Name                string
	BusID               string
	VBIOS               string
	PowerLimitW         float64
	MaxGraphicsClockMHz float64 // lock target for nvidia-smi -lgc
	MaxMemoryClockMHz   float64 // lock target for nvidia-smi -lmc
}
|
|
|
|
// benchmarkBurnProfile accumulates per-precision state while parsing
// bee-gpu-burn output (READY / SKIPPED / *_iterations lines).
type benchmarkBurnProfile struct {
	name       string
	category   string // coarse precision family, e.g. "fp32_tf32"
	supported  bool   // false once a SKIPPED line is seen
	lanes      int    // number of READY lanes observed
	m          uint64 // GEMM dimensions from the last READY line
	n          uint64
	k          uint64
	iterations uint64 // summed across lanes
	notes      string // skip reason when SKIPPED
}
|
|
|
|
// benchmarkBurnParseResult is the structured view of one bee-gpu-burn log.
type benchmarkBurnParseResult struct {
	Device            string
	ComputeCapability string
	Backend           string
	DurationSec       int
	Profiles          []BenchmarkPrecisionResult
	// Fallback is true when the backend reported "driver-ptx"; tensor
	// throughput numbers are then not comparable to native-backend runs.
	Fallback bool
}
|
|
|
|
// benchmarkRestoreAction is a named cleanup step (e.g. resetting a clock
// lock) that is executed in reverse registration order after the run.
type benchmarkRestoreAction struct {
	name string
	fn   func()
}
|
|
|
|
var (
	// benchmarkReadyPattern matches bee-gpu-burn lane-ready lines such as
	// "fp16_tensor[0]=READY dim=4096x4096x4096".
	benchmarkReadyPattern = regexp.MustCompile(`^([a-z0-9_]+)\[(\d+)\]=READY dim=(\d+)x(\d+)x(\d+)\b`)
	// benchmarkSkippedPattern matches "name=SKIPPED reason" (optional lane index).
	benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
	// benchmarkIterationsPattern matches "name_iterations=N" summary lines.
	benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
)
|
|
|
|
// RunNvidiaBenchmark runs the full NVIDIA GPU benchmark pipeline: it
// normalizes options, selects GPUs, applies clock/persistence normalization
// (undone on exit), runs baseline/warmup/steady/cooldown phases per GPU and
// an optional multi-GPU NCCL phase, scores the results, and writes
// result.json, report.txt, summary.txt plus raw logs into a timestamped run
// directory that is finally packed into a .tar.gz. Returns the archive path.
//
// ctx and logFunc may be nil; an empty baseDir defaults to
// /var/log/bee-benchmark.
func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	// Defensive defaults so callers can pass nil/empty values.
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-benchmark"
	}
	spec := resolveBenchmarkProfile(opts.Profile)
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)

	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}

	// One timestamped directory per run; the same stamp names the archive.
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "gpu-benchmark-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")

	hostname, _ := os.Hostname() // best effort; empty hostname is acceptable
	result := NvidiaBenchmarkResult{
		BenchmarkVersion:   benchmarkVersion,
		GeneratedAt:        time.Now().UTC(),
		Hostname:           hostname,
		BenchmarkProfile:   spec.Name,
		SelectedGPUIndices: append([]int(nil), selected...), // defensive copy
		Normalization: BenchmarkNormalization{
			Status: "full", // downgraded to "partial" as issues are found
		},
	}

	logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))

	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		result.Warnings = append(result.Warnings, "gpu inventory query failed: "+infoErr.Error())
		result.Normalization.Status = "partial"
	}

	// Capture a full nvidia-smi -q dump for offline analysis (best effort).
	if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
		_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
	}

	// Competing compute workloads would skew the measurements; record them.
	activeApps, err := queryActiveComputeApps(selected)
	if err == nil && len(activeApps) > 0 {
		result.Warnings = append(result.Warnings, "active GPU compute processes detected before benchmark")
		result.Normalization.Notes = append(result.Normalization.Notes, activeApps...)
		result.Normalization.Status = "partial"
	}

	// Lock clocks / enable persistence; undo everything (reverse order) on exit.
	restoreActions := applyBenchmarkNormalization(ctx, verboseLog, selected, infoByIndex, &result)
	defer func() {
		for i := len(restoreActions) - 1; i >= 0; i-- {
			restoreActions[i].fn()
		}
	}()

	for _, idx := range selected {
		gpuResult := BenchmarkGPUResult{
			Index:  idx,
			Status: "FAILED", // pessimistic default, upgraded below
		}
		if info, ok := infoByIndex[idx]; ok {
			gpuResult.UUID = info.UUID
			gpuResult.Name = info.Name
			gpuResult.BusID = info.BusID
			gpuResult.VBIOS = info.VBIOS
			gpuResult.PowerLimitW = info.PowerLimitW
			gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
			gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
		}
		if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
			gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
			gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
		}

		// Phase 1: idle baseline telemetry.
		baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
		if err != nil && err != context.Canceled {
			gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
		}
		gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)

		// Phase 2: warmup burn so the steady phase starts thermally settled.
		warmupCmd := []string{
			"bee-gpu-burn",
			"--seconds", strconv.Itoa(spec.WarmupSec),
			"--size-mb", strconv.Itoa(opts.SizeMB),
			"--devices", strconv.Itoa(idx),
		}
		logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
		warmupOut, _, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-warmup", idx), logFunc)
		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-warmup.log", idx)), warmupOut, 0644)
		if warmupErr != nil {
			// Without a successful warmup the remaining phases are skipped.
			gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
			result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
			continue
		}

		// Phase 3: steady compute, with throttle counters snapshotted
		// immediately before and after so the delta covers only this phase.
		beforeThrottle, _ := queryThrottleCounters(idx)
		steadyCmd := []string{
			"bee-gpu-burn",
			"--seconds", strconv.Itoa(spec.SteadySec),
			"--size-mb", strconv.Itoa(opts.SizeMB),
			"--devices", strconv.Itoa(idx),
		}
		logFunc(fmt.Sprintf("GPU %d: steady compute (%ds)", idx, spec.SteadySec))
		steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc)
		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
		afterThrottle, _ := queryThrottleCounters(idx)
		if steadyErr != nil {
			gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
		}

		// Parse the burn log even on error: partial output is still useful.
		parseResult := parseBenchmarkBurnLog(string(steadyOut))
		gpuResult.ComputeCapability = parseResult.ComputeCapability
		gpuResult.Backend = parseResult.Backend
		gpuResult.PrecisionResults = parseResult.Profiles
		if parseResult.Fallback {
			gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
		}

		gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
		gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)

		// Phase 4: cooldown telemetry.
		cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
		if err != nil && err != context.Canceled {
			gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
		}
		gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
		writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), cooldownRows)

		gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
		gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
		if steadyErr != nil {
			gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
		} else if parseResult.Fallback {
			gpuResult.Status = "PARTIAL"
		} else {
			gpuResult.Status = "OK"
		}

		result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
	}

	// Optional multi-GPU NCCL all_reduce phase; on success its bandwidth
	// feeds back into each GPU's composite score.
	if len(selected) > 1 && opts.RunNCCL {
		result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
		if result.Interconnect != nil && result.Interconnect.Supported {
			for i := range result.GPUs {
				result.GPUs[i].Scores.InterconnectScore = result.Interconnect.MaxBusBWGBps
				result.GPUs[i].Scores.CompositeScore = compositeBenchmarkScore(result.GPUs[i].Scores)
			}
		}
	}

	result.Findings = buildBenchmarkFindings(result)
	result.OverallStatus = benchmarkOverallStatus(result)

	// Persist machine-readable and human-readable outputs.
	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return "", fmt.Errorf("marshal benchmark result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return "", fmt.Errorf("write result.json: %w", err)
	}

	report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
	if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
		return "", fmt.Errorf("write report.txt: %w", err)
	}

	summary := renderBenchmarkSummary(result)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
		return "", fmt.Errorf("write summary.txt: %w", err)
	}

	archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz")
	if err := createTarGz(archive, runDir); err != nil {
		return "", fmt.Errorf("pack benchmark archive: %w", err)
	}
	return archive, nil
}
|
|
|
|
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
|
|
switch strings.TrimSpace(strings.ToLower(opts.Profile)) {
|
|
case NvidiaBenchmarkProfileStability:
|
|
opts.Profile = NvidiaBenchmarkProfileStability
|
|
case NvidiaBenchmarkProfileOvernight:
|
|
opts.Profile = NvidiaBenchmarkProfileOvernight
|
|
default:
|
|
opts.Profile = NvidiaBenchmarkProfileStandard
|
|
}
|
|
if opts.SizeMB < 0 {
|
|
opts.SizeMB = 0
|
|
}
|
|
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
|
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
|
return opts
|
|
}
|
|
|
|
func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
|
switch strings.TrimSpace(strings.ToLower(profile)) {
|
|
case NvidiaBenchmarkProfileStability:
|
|
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStability, BaselineSec: 30, WarmupSec: 300, SteadySec: 3600, NCCLSec: 300, CooldownSec: 300}
|
|
case NvidiaBenchmarkProfileOvernight:
|
|
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileOvernight, BaselineSec: 60, WarmupSec: 600, SteadySec: 27000, NCCLSec: 600, CooldownSec: 300}
|
|
default:
|
|
return benchmarkProfileSpec{Name: NvidiaBenchmarkProfileStandard, BaselineSec: 15, WarmupSec: 120, SteadySec: 480, NCCLSec: 180, CooldownSec: 120}
|
|
}
|
|
}
|
|
|
|
// queryBenchmarkGPUInfo fetches static inventory (UUID, name, bus ID, VBIOS,
// power limit, max clocks) for the given GPUs via nvidia-smi CSV output,
// keyed by GPU index. An empty index list queries all GPUs. Rows that are
// too short or carry an unparsable index are skipped silently.
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
	args := []string{
		"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory",
		"--format=csv,noheader,nounits",
	}
	if len(gpuIndices) > 0 {
		// Restrict the query to the selected GPUs only.
		args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
	}
	out, err := satExecCommand("nvidia-smi", args...).Output()
	if err != nil {
		return nil, fmt.Errorf("nvidia-smi gpu info: %w", err)
	}

	r := csv.NewReader(strings.NewReader(string(out)))
	r.TrimLeadingSpace = true
	r.FieldsPerRecord = -1 // tolerate ragged rows; length is validated per row
	rows, err := r.ReadAll()
	if err != nil {
		return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err)
	}

	infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
	for _, row := range rows {
		if len(row) < 8 {
			continue
		}
		idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
		if err != nil {
			continue
		}
		infoByIndex[idx] = benchmarkGPUInfo{
			Index:  idx,
			UUID:   strings.TrimSpace(row[1]),
			Name:   strings.TrimSpace(row[2]),
			BusID:  strings.TrimSpace(row[3]),
			VBIOS:  strings.TrimSpace(row[4]),
			// Numeric fields map "N/A"/"[Not Supported]" to 0.
			PowerLimitW:         parseBenchmarkFloat(row[5]),
			MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
			MaxMemoryClockMHz:   parseBenchmarkFloat(row[7]),
		}
	}
	return infoByIndex, nil
}
|
|
|
|
// applyBenchmarkNormalization prepares the selected GPUs for reproducible
// measurements: it enables persistence mode and locks graphics/memory clocks
// to the advertised maximums. Outcomes are recorded per GPU in
// result.Normalization (Status is downgraded to "partial" on any failure),
// and the returned restore actions undo the clock locks; the caller runs
// them in reverse order. Without root privileges nothing is changed and nil
// is returned.
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
	// nvidia-smi -pm/-lgc/-lmc require root; record the skip and bail out.
	if os.Geteuid() != 0 {
		result.Normalization.Status = "partial"
		result.Normalization.Notes = append(result.Normalization.Notes, "benchmark normalization skipped: root privileges are required for persistence mode and clock locks")
		for _, idx := range gpuIndices {
			result.Normalization.GPUs = append(result.Normalization.GPUs, BenchmarkNormalizationGPU{
				Index: idx,
				Notes: []string{"normalization skipped: root privileges are required"},
			})
		}
		return nil
	}

	var restore []benchmarkRestoreAction
	for _, idx := range gpuIndices {
		rec := BenchmarkNormalizationGPU{Index: idx}
		// Enable persistence mode (best effort; failure only downgrades status).
		if _, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-pm", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-pm", "1"}, nil, nil); err != nil {
			rec.PersistenceMode = "failed"
			rec.Notes = append(rec.Notes, "failed to enable persistence mode")
			result.Normalization.Status = "partial"
		} else {
			rec.PersistenceMode = "applied"
		}

		// Lock the graphics clock (-lgc) to the reported maximum and queue a
		// reset (-rgc) so the lock never outlives the benchmark.
		if info, ok := infoByIndex[idx]; ok && info.MaxGraphicsClockMHz > 0 {
			target := int(math.Round(info.MaxGraphicsClockMHz))
			if out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lgc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lgc", strconv.Itoa(target)}, nil, nil); err != nil {
				rec.GPUClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "graphics clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			} else {
				rec.GPUClockLockStatus = "applied"
				rec.GPUClockLockMHz = float64(target)
				idxCopy := idx // capture the per-iteration value for the closure
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rgc", idxCopy), fn: func() {
					// Fresh context: restore must run even if ctx was canceled.
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
				}})
			}
		}

		// Lock the memory clock (-lmc); some GPU/driver combinations refuse
		// ("deferred" / "not supported"), which is recorded as unsupported
		// rather than failed.
		if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
			target := int(math.Round(info.MaxMemoryClockMHz))
			out, err := runSATCommandCtx(ctx, verboseLog, fmt.Sprintf("normalize-gpu-%d-lmc", idx), []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-lmc", strconv.Itoa(target)}, nil, nil)
			switch {
			case err == nil:
				rec.MemoryClockLockStatus = "applied"
				rec.MemoryClockLockMHz = float64(target)
				idxCopy := idx
				restore = append(restore, benchmarkRestoreAction{name: fmt.Sprintf("gpu-%d-rmc", idxCopy), fn: func() {
					_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rmc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rmc"}, nil, nil)
				}})
			case strings.Contains(strings.ToLower(string(out)), "deferred") || strings.Contains(strings.ToLower(string(out)), "not supported"):
				rec.MemoryClockLockStatus = "unsupported"
				rec.Notes = append(rec.Notes, "memory clock lock unsupported on this GPU/driver path")
				result.Normalization.Status = "partial"
			default:
				rec.MemoryClockLockStatus = "failed"
				rec.Notes = append(rec.Notes, "memory clock lock failed: "+strings.TrimSpace(string(out)))
				result.Normalization.Status = "partial"
			}
		}

		result.Normalization.GPUs = append(result.Normalization.GPUs, rec)
	}
	return restore
}
|
|
|
|
// collectBenchmarkSamples polls GPU telemetry roughly once per second for
// durationSec seconds and returns the rows with ElapsedSec set relative to
// the start of sampling. On context cancellation it returns the rows
// gathered so far together with ctx.Err(). Individual sampling failures are
// ignored (that tick simply contributes no rows).
func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []int) ([]GPUMetricRow, error) {
	if durationSec <= 0 {
		return nil, nil
	}
	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
	var rows []GPUMetricRow
	start := time.Now()
	for {
		if ctx.Err() != nil {
			return rows, ctx.Err()
		}
		samples, err := sampleGPUMetrics(gpuIndices)
		if err == nil {
			elapsed := time.Since(start).Seconds()
			for i := range samples {
				samples[i].ElapsedSec = elapsed
			}
			rows = append(rows, samples...)
		}
		if time.Now().After(deadline) {
			break
		}
		// Sleep one second between samples, aborting early on cancellation.
		select {
		case <-ctx.Done():
			return rows, ctx.Err()
		case <-time.After(time.Second):
		}
	}
	return rows, nil
}
|
|
|
|
// runBenchmarkCommandWithMetrics runs cmd while a background goroutine
// samples GPU telemetry once per second. When the command finishes the
// sampler is stopped and joined, the metrics are written to runDir under
// baseName (CSV/HTML/terminal chart), and the command output, the collected
// rows and the command error are returned.
func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir, baseName string, logFunc func(string)) ([]byte, []GPUMetricRow, error) {
	stopCh := make(chan struct{}) // closed to ask the sampler to stop
	doneCh := make(chan struct{}) // closed when the sampler has exited
	var metricRows []GPUMetricRow
	start := time.Now()

	go func() {
		defer close(doneCh)
		ticker := time.NewTicker(time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				return
			case <-ticker.C:
				samples, err := sampleGPUMetrics(gpuIndices)
				if err != nil {
					continue // transient sampling failure: skip this tick
				}
				elapsed := time.Since(start).Seconds()
				for i := range samples {
					samples[i].ElapsedSec = elapsed
				}
				metricRows = append(metricRows, samples...)
			}
		}
	}()

	out, err := runSATCommandCtx(ctx, verboseLog, name, cmd, env, logFunc)
	// Stop the sampler and wait for it; the close/receive handshake also
	// makes metricRows safe to read from this goroutine afterwards.
	close(stopCh)
	<-doneCh

	writeBenchmarkMetricsFiles(runDir, baseName, metricRows)
	return out, metricRows, err
}
|
|
|
|
func writeBenchmarkMetricsFiles(runDir, baseName string, rows []GPUMetricRow) {
|
|
if len(rows) == 0 {
|
|
return
|
|
}
|
|
_ = WriteGPUMetricsCSV(filepath.Join(runDir, baseName+"-metrics.csv"), rows)
|
|
_ = WriteGPUMetricsHTML(filepath.Join(runDir, baseName+"-metrics.html"), rows)
|
|
chart := RenderGPUTerminalChart(rows)
|
|
_ = os.WriteFile(filepath.Join(runDir, baseName+"-metrics-term.txt"), []byte(chart), 0644)
|
|
}
|
|
|
|
// parseBenchmarkBurnLog parses bee-gpu-burn output into structured
// per-precision results. It recognizes key=value header lines (device,
// compute_capability, backend, duration_s) and the READY / SKIPPED /
// *_iterations lines matched by the benchmark*Pattern regexps. Throughput
// is derived as 2*M*N*K*iterations / duration (in tera-ops/s) for profiles
// with complete data. Profiles are emitted in sorted name order.
func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
	result := benchmarkBurnParseResult{}
	lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
	profiles := make(map[string]*benchmarkBurnProfile)
	for _, line := range lines {
		// Drop the optional "[gpu N] " prefix before matching.
		line = stripBenchmarkPrefix(strings.TrimSpace(line))
		if line == "" {
			continue
		}
		switch {
		case strings.HasPrefix(line, "device="):
			result.Device = strings.TrimSpace(strings.TrimPrefix(line, "device="))
		case strings.HasPrefix(line, "compute_capability="):
			result.ComputeCapability = strings.TrimSpace(strings.TrimPrefix(line, "compute_capability="))
		case strings.HasPrefix(line, "backend="):
			result.Backend = strings.TrimSpace(strings.TrimPrefix(line, "backend="))
			// "driver-ptx" marks the JIT fallback path; flag it so scores
			// are not compared against native-backend runs.
			result.Fallback = result.Backend == "driver-ptx"
		case strings.HasPrefix(line, "duration_s="):
			result.DurationSec, _ = strconv.Atoi(strings.TrimSpace(strings.TrimPrefix(line, "duration_s=")))
		default:
			if m := benchmarkReadyPattern.FindStringSubmatch(line); len(m) == 6 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				profile.supported = true
				profile.lanes++
				// The last READY line seen wins for the GEMM dimensions.
				profile.m, _ = strconv.ParseUint(m[3], 10, 64)
				profile.n, _ = strconv.ParseUint(m[4], 10, 64)
				profile.k, _ = strconv.ParseUint(m[5], 10, 64)
				continue
			}
			if m := benchmarkSkippedPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				profile.supported = false
				profile.notes = strings.TrimSpace(m[2])
				continue
			}
			if m := benchmarkIterationsPattern.FindStringSubmatch(line); len(m) == 3 {
				profile := ensureBenchmarkProfile(profiles, m[1])
				iters, _ := strconv.ParseUint(m[2], 10, 64)
				profile.iterations += iters // summed across lanes
			}
		}
	}

	// Sort keys for deterministic output regardless of map iteration order.
	keys := make([]string, 0, len(profiles))
	for key := range profiles {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		profile := profiles[key]
		precision := BenchmarkPrecisionResult{
			Name:       profile.name,
			Category:   profile.category,
			Supported:  profile.supported,
			Lanes:      profile.lanes,
			M:          profile.m,
			N:          profile.n,
			K:          profile.k,
			Iterations: profile.iterations,
			Notes:      profile.notes,
		}
		if profile.supported && result.DurationSec > 0 && profile.m > 0 && profile.n > 0 && profile.k > 0 && profile.iterations > 0 {
			// GEMM cost is 2*M*N*K ops per iteration; scale to tera-ops/s.
			precision.TeraOpsPerSec = (2.0 * float64(profile.m) * float64(profile.n) * float64(profile.k) * float64(profile.iterations)) / float64(result.DurationSec) / 1e12
		}
		result.Profiles = append(result.Profiles, precision)
	}
	return result
}
|
|
|
|
func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name string) *benchmarkBurnProfile {
|
|
if profile, ok := profiles[name]; ok {
|
|
return profile
|
|
}
|
|
category := "other"
|
|
switch {
|
|
case strings.HasPrefix(name, "fp32"):
|
|
category = "fp32_tf32"
|
|
case strings.HasPrefix(name, "fp16"):
|
|
category = "fp16_bf16"
|
|
case strings.HasPrefix(name, "fp8"):
|
|
category = "fp8"
|
|
case strings.HasPrefix(name, "fp4"):
|
|
category = "fp4"
|
|
}
|
|
profile := &benchmarkBurnProfile{name: name, category: category, supported: true}
|
|
profiles[name] = profile
|
|
return profile
|
|
}
|
|
|
|
// stripBenchmarkPrefix removes a leading "[gpu N] " tag from a bee-gpu-burn
// log line. Lines without the tag (or without the closing "] " separator)
// are returned unchanged.
func stripBenchmarkPrefix(line string) string {
	if !strings.HasPrefix(line, "[gpu ") {
		return line
	}
	if _, rest, found := strings.Cut(line, "] "); found {
		return rest
	}
	return line
}
|
|
|
|
// summarizeBenchmarkTelemetry reduces a telemetry series to a summary:
// means and 95th percentiles for temperature/power/clocks/usage,
// coefficients of variation, and clock drift. DurationSec is taken from
// the last row's ElapsedSec. Returns the zero summary for an empty series.
func summarizeBenchmarkTelemetry(rows []GPUMetricRow) BenchmarkTelemetrySummary {
	summary := BenchmarkTelemetrySummary{}
	if len(rows) == 0 {
		return summary
	}
	// Column-wise extraction so each statistic helper gets a flat series.
	temps := make([]float64, 0, len(rows))
	powers := make([]float64, 0, len(rows))
	clocks := make([]float64, 0, len(rows))
	memClocks := make([]float64, 0, len(rows))
	usages := make([]float64, 0, len(rows))
	memUsages := make([]float64, 0, len(rows))
	summary.DurationSec = rows[len(rows)-1].ElapsedSec
	summary.Samples = len(rows)
	for _, row := range rows {
		temps = append(temps, row.TempC)
		powers = append(powers, row.PowerW)
		clocks = append(clocks, row.ClockMHz)
		memClocks = append(memClocks, row.MemClockMHz)
		usages = append(usages, row.UsagePct)
		memUsages = append(memUsages, row.MemUsagePct)
	}
	summary.AvgTempC = benchmarkMean(temps)
	summary.P95TempC = benchmarkPercentile(temps, 95)
	summary.AvgPowerW = benchmarkMean(powers)
	summary.P95PowerW = benchmarkPercentile(powers, 95)
	summary.AvgGraphicsClockMHz = benchmarkMean(clocks)
	summary.P95GraphicsClockMHz = benchmarkPercentile(clocks, 95)
	summary.AvgMemoryClockMHz = benchmarkMean(memClocks)
	summary.P95MemoryClockMHz = benchmarkPercentile(memClocks, 95)
	summary.AvgUsagePct = benchmarkMean(usages)
	summary.AvgMemUsagePct = benchmarkMean(memUsages)
	summary.ClockCVPct = benchmarkCV(clocks)
	summary.PowerCVPct = benchmarkCV(powers)
	summary.TempCVPct = benchmarkCV(temps)
	summary.ClockDriftPct = benchmarkClockDrift(clocks)
	return summary
}
|
|
|
|
// scoreBenchmarkGPUResult computes the per-GPU scorecard:
//   - ComputeScore: sum of TOPS across all supported precisions;
//   - PowerSustainScore: steady avg power as % of power limit, capped at 100;
//   - ThermalSustainScore: 100 minus the fraction of steady time spent in
//     HW/SW thermal slowdown (clamped via clampScore);
//   - StabilityScore: 100 penalized by clock/power variation and drift;
//   - CompositeScore: compute score scaled by the quality factor.
func scoreBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkScorecard {
	score := BenchmarkScorecard{}
	for _, precision := range gpu.PrecisionResults {
		if precision.Supported {
			score.ComputeScore += precision.TeraOpsPerSec
		}
	}
	if gpu.PowerLimitW > 0 {
		score.PowerSustainScore = math.Min(100, (gpu.Steady.AvgPowerW/gpu.PowerLimitW)*100)
	}
	// Throttle counters are in microseconds; compare against the steady
	// runtime in µs, floored at 1 to avoid division by zero.
	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
	thermalRatio := float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS) / runtimeUS
	score.ThermalSustainScore = clampScore(100 - thermalRatio*100)
	score.StabilityScore = clampScore(100 - (gpu.Steady.ClockCVPct*4 + gpu.Steady.PowerCVPct*2 + gpu.Steady.ClockDriftPct*2))
	score.CompositeScore = compositeBenchmarkScore(score)
	return score
}
|
|
|
|
func compositeBenchmarkScore(score BenchmarkScorecard) float64 {
|
|
quality := 0.40 + 0.20*(score.PowerSustainScore/100.0) + 0.20*(score.ThermalSustainScore/100.0) + 0.20*(score.StabilityScore/100.0)
|
|
if score.InterconnectScore > 0 {
|
|
quality += 0.10
|
|
}
|
|
if quality > 1.10 {
|
|
quality = 1.10
|
|
}
|
|
return score.ComputeScore * quality
|
|
}
|
|
|
|
// detectBenchmarkDegradationReasons returns machine-readable tags for
// conditions that degraded the steady-state run: throttle-counter ratios
// above fixed thresholds (5% power cap, 1% thermal / sync boost), an
// average SM clock more than 10% below the lock target, a stability score
// below 85, or incomplete normalization. Duplicates are removed.
func detectBenchmarkDegradationReasons(gpu BenchmarkGPUResult, normalizationStatus string) []string {
	var reasons []string
	// Counters are microseconds spent throttled; normalize by the steady
	// runtime in µs (floored at 1 to avoid division by zero).
	runtimeUS := math.Max(1, gpu.Steady.DurationSec*1e6)
	if float64(gpu.Throttle.SWPowerCapUS)/runtimeUS >= 0.05 {
		reasons = append(reasons, "power_capped")
	}
	if float64(gpu.Throttle.HWThermalSlowdownUS+gpu.Throttle.SWThermalSlowdownUS)/runtimeUS >= 0.01 {
		reasons = append(reasons, "thermal_limited")
	}
	if float64(gpu.Throttle.SyncBoostUS)/runtimeUS >= 0.01 {
		reasons = append(reasons, "sync_boost_limited")
	}
	if gpu.LockedGraphicsClockMHz > 0 && gpu.Steady.AvgGraphicsClockMHz < gpu.LockedGraphicsClockMHz*0.90 {
		reasons = append(reasons, "low_sm_clock_vs_target")
	}
	// StabilityScore == 0 means "not computed", so it is excluded.
	if gpu.Scores.StabilityScore > 0 && gpu.Scores.StabilityScore < 85 {
		reasons = append(reasons, "variance_too_high")
	}
	if normalizationStatus != "full" {
		reasons = append(reasons, "normalization_partial")
	}
	return dedupeStrings(reasons)
}
|
|
|
|
// runBenchmarkInterconnect runs nccl-tests all_reduce_perf across the
// selected GPUs (message sizes 512M to 4G, doubling each step) and parses
// algorithm/bus bandwidths from its output. On command failure the result
// keeps status UNSUPPORTED with the trimmed output recorded as a note.
func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gpuIndices []int, spec benchmarkProfileSpec, logFunc func(string)) *BenchmarkInterconnectResult {
	result := &BenchmarkInterconnectResult{
		Status:             "UNSUPPORTED",
		Attempted:          true,
		SelectedGPUIndices: append([]int(nil), gpuIndices...), // defensive copy
	}
	cmd := []string{
		"all_reduce_perf",
		"-b", "512M",
		"-e", "4G",
		"-f", "2", // double the message size each step
		"-g", strconv.Itoa(len(gpuIndices)),
		// Iteration count scales with the profile's NCCL budget, min 20.
		"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
	}
	// Pin device ordering and visibility so NCCL sees exactly the selection.
	env := []string{
		"CUDA_DEVICE_ORDER=PCI_BUS_ID",
		"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
	}
	logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
	out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
	_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
	if err != nil {
		result.Notes = append(result.Notes, strings.TrimSpace(string(out)))
		return result
	}
	avgAlg, maxAlg, avgBus, maxBus := parseNCCLAllReduceOutput(string(out))
	result.Status = "OK"
	result.Supported = true
	result.AvgAlgBWGBps = avgAlg
	result.MaxAlgBWGBps = maxAlg
	result.AvgBusBWGBps = avgBus
	result.MaxBusBWGBps = maxBus
	return result
}
|
|
|
|
// parseNCCLAllReduceOutput extracts algorithm and bus bandwidth values from
// nccl-tests all_reduce_perf table rows and returns their mean and max.
// Comment lines ('#') and short lines are skipped. Within each row it takes
// the first run of three consecutive numeric fields whose first value is
// positive — intended to be the (time, algbw, busbw) triple — which
// tolerates column-layout differences between nccl-tests versions. Returns
// all zeros when nothing parsed.
func parseNCCLAllReduceOutput(raw string) (avgAlg, maxAlg, avgBus, maxBus float64) {
	lines := strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n")
	var algs []float64
	var buses []float64
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		fields := strings.Fields(line)
		if len(fields) < 8 {
			continue // data rows have many columns; anything shorter is noise
		}
		for i := 0; i+2 < len(fields); i++ {
			timeVal, err1 := strconv.ParseFloat(fields[i], 64)
			algVal, err2 := strconv.ParseFloat(fields[i+1], 64)
			busVal, err3 := strconv.ParseFloat(fields[i+2], 64)
			if err1 == nil && err2 == nil && err3 == nil && timeVal > 0 {
				algs = append(algs, algVal)
				buses = append(buses, busVal)
				break // one triple per row
			}
		}
	}
	if len(algs) == 0 {
		return 0, 0, 0, 0
	}
	return benchmarkMean(algs), benchmarkMax(algs), benchmarkMean(buses), benchmarkMax(buses)
}
|
|
|
|
// queryThrottleCounters reads the cumulative clocks-event (throttle)
// counters for one GPU via nvidia-smi. The values are cumulative; callers
// diff two snapshots (see diffThrottleCounters) to measure a window.
// Counters reported as N/A or unsupported parse as zero.
func queryThrottleCounters(gpuIndex int) (BenchmarkThrottleCounters, error) {
	out, err := satExecCommand(
		"nvidia-smi",
		"--id="+strconv.Itoa(gpuIndex),
		"--query-gpu=clocks_event_reasons_counters.sw_power_cap,clocks_event_reasons_counters.sw_thermal_slowdown,clocks_event_reasons_counters.sync_boost,clocks_event_reasons_counters.hw_thermal_slowdown,clocks_event_reasons_counters.hw_power_brake_slowdown",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return BenchmarkThrottleCounters{}, err
	}
	// Single CSV row of numeric fields; a plain comma split suffices.
	fields := strings.Split(strings.TrimSpace(string(out)), ",")
	if len(fields) < 5 {
		return BenchmarkThrottleCounters{}, fmt.Errorf("unexpected throttle counter columns: %q", strings.TrimSpace(string(out)))
	}
	return BenchmarkThrottleCounters{
		SWPowerCapUS:           parseBenchmarkUint64(fields[0]),
		SWThermalSlowdownUS:    parseBenchmarkUint64(fields[1]),
		SyncBoostUS:            parseBenchmarkUint64(fields[2]),
		HWThermalSlowdownUS:    parseBenchmarkUint64(fields[3]),
		HWPowerBrakeSlowdownUS: parseBenchmarkUint64(fields[4]),
	}, nil
}
|
|
|
|
func diffThrottleCounters(before, after BenchmarkThrottleCounters) BenchmarkThrottleCounters {
|
|
return BenchmarkThrottleCounters{
|
|
SWPowerCapUS: saturatingSub(after.SWPowerCapUS, before.SWPowerCapUS),
|
|
SWThermalSlowdownUS: saturatingSub(after.SWThermalSlowdownUS, before.SWThermalSlowdownUS),
|
|
SyncBoostUS: saturatingSub(after.SyncBoostUS, before.SyncBoostUS),
|
|
HWThermalSlowdownUS: saturatingSub(after.HWThermalSlowdownUS, before.HWThermalSlowdownUS),
|
|
HWPowerBrakeSlowdownUS: saturatingSub(after.HWPowerBrakeSlowdownUS, before.HWPowerBrakeSlowdownUS),
|
|
}
|
|
}
|
|
|
|
func queryActiveComputeApps(gpuIndices []int) ([]string, error) {
|
|
args := []string{
|
|
"--query-compute-apps=gpu_uuid,pid,process_name",
|
|
"--format=csv,noheader,nounits",
|
|
}
|
|
if len(gpuIndices) > 0 {
|
|
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
|
|
}
|
|
out, err := satExecCommand("nvidia-smi", args...).Output()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var lines []string
|
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
continue
|
|
}
|
|
lines = append(lines, line)
|
|
}
|
|
return lines, nil
|
|
}
|
|
|
|
func finalizeBenchmarkGPUResult(gpu BenchmarkGPUResult) BenchmarkGPUResult {
|
|
if gpu.Status == "" {
|
|
gpu.Status = "OK"
|
|
}
|
|
if gpu.Scores.CompositeScore == 0 {
|
|
gpu.Scores.CompositeScore = compositeBenchmarkScore(gpu.Scores)
|
|
}
|
|
return gpu
|
|
}
|
|
|
|
// buildBenchmarkFindings converts the structured result into human-readable
// finding sentences: a caution when normalization was partial, one sentence
// per degradation reason per GPU (or a positive sentence for clean OK
// GPUs), a note for PTX-fallback runs, and the interconnect bandwidth when
// measured. Duplicate sentences are removed.
func buildBenchmarkFindings(result NvidiaBenchmarkResult) []string {
	var findings []string
	if result.Normalization.Status != "full" {
		findings = append(findings, "Environment normalization was partial; compare results with caution.")
	}
	for _, gpu := range result.GPUs {
		if len(gpu.DegradationReasons) == 0 && gpu.Status == "OK" {
			findings = append(findings, fmt.Sprintf("GPU %d held clocks without observable throttle counters during steady state.", gpu.Index))
			continue
		}
		// Map each machine-readable tag to its explanatory sentence;
		// unknown tags are silently skipped.
		for _, reason := range gpu.DegradationReasons {
			switch reason {
			case "power_capped":
				findings = append(findings, fmt.Sprintf("GPU %d spent measurable time under SW power cap.", gpu.Index))
			case "thermal_limited":
				findings = append(findings, fmt.Sprintf("GPU %d reported thermal slowdown during steady state.", gpu.Index))
			case "sync_boost_limited":
				findings = append(findings, fmt.Sprintf("GPU %d was limited by sync boost behaviour.", gpu.Index))
			case "low_sm_clock_vs_target":
				findings = append(findings, fmt.Sprintf("GPU %d average SM clock stayed below the requested lock target.", gpu.Index))
			case "variance_too_high":
				findings = append(findings, fmt.Sprintf("GPU %d showed unstable clocks/power over the benchmark window.", gpu.Index))
			case "normalization_partial":
				findings = append(findings, fmt.Sprintf("GPU %d ran without full benchmark normalization.", gpu.Index))
			}
		}
		if gpu.Backend == "driver-ptx" {
			findings = append(findings, fmt.Sprintf("GPU %d used driver PTX fallback; tensor score is intentionally degraded.", gpu.Index))
		}
	}
	if result.Interconnect != nil && result.Interconnect.Supported {
		findings = append(findings, fmt.Sprintf("Multi-GPU all_reduce max bus bandwidth: %.1f GB/s.", result.Interconnect.MaxBusBWGBps))
	}
	return dedupeStrings(findings)
}
|
|
|
|
func benchmarkOverallStatus(result NvidiaBenchmarkResult) string {
|
|
if len(result.GPUs) == 0 {
|
|
return "FAILED"
|
|
}
|
|
hasOK := false
|
|
hasPartial := result.Normalization.Status != "full"
|
|
for _, gpu := range result.GPUs {
|
|
switch gpu.Status {
|
|
case "OK":
|
|
hasOK = true
|
|
case "PARTIAL", "UNSUPPORTED":
|
|
hasPartial = true
|
|
}
|
|
}
|
|
if !hasOK {
|
|
return "FAILED"
|
|
}
|
|
if hasPartial {
|
|
return "PARTIAL"
|
|
}
|
|
return "OK"
|
|
}
|
|
|
|
func findBenchmarkNormalization(items []BenchmarkNormalizationGPU, idx int) *BenchmarkNormalizationGPU {
|
|
for i := range items {
|
|
if items[i].Index == idx {
|
|
return &items[i]
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func classifySATErrorStatus(out []byte, err error) string {
|
|
status, _ := classifySATResult("benchmark", out, err)
|
|
if status == "UNSUPPORTED" {
|
|
return "UNSUPPORTED"
|
|
}
|
|
return "FAILED"
|
|
}
|
|
|
|
// parseBenchmarkFloat converts an nvidia-smi style field to a float64.
// Blank input and the "N/A" / "[Not Supported]" sentinels map to 0; parse
// errors are deliberately ignored (best effort), so unparsable text also
// yields ParseFloat's zero result.
func parseBenchmarkFloat(raw string) float64 {
	trimmed := strings.TrimSpace(raw)
	switch {
	case trimmed == "",
		strings.EqualFold(trimmed, "n/a"),
		strings.EqualFold(trimmed, "[not supported]"):
		return 0
	}
	// Keep ParseFloat's value even on error to match the original
	// best-effort semantics (e.g. out-of-range input still returns ±Inf).
	v, _ := strconv.ParseFloat(trimmed, 64)
	return v
}
|
|
|
|
// parseBenchmarkUint64 converts an nvidia-smi style field to a uint64.
// Blank input and the "N/A" / "[Not Supported]" sentinels map to 0; parse
// errors are deliberately ignored (best effort), mirroring parseBenchmarkFloat.
func parseBenchmarkUint64(raw string) uint64 {
	trimmed := strings.TrimSpace(raw)
	switch {
	case trimmed == "",
		strings.EqualFold(trimmed, "n/a"),
		strings.EqualFold(trimmed, "[not supported]"):
		return 0
	}
	// Keep ParseUint's value even on error to preserve the original
	// best-effort semantics (out-of-range input returns MaxUint64).
	v, _ := strconv.ParseUint(trimmed, 10, 64)
	return v
}
|
|
|
|
// benchmarkMean returns the arithmetic mean of values, or 0 for an empty slice.
func benchmarkMean(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	total := 0.0
	for _, v := range values {
		total += v
	}
	return total / float64(len(values))
}
|
|
|
|
// benchmarkPercentile returns the p-th percentile of values using linear
// interpolation between closest ranks (the "exclusive rank" scheme over
// len-1 intervals). The input slice is not modified; an empty slice yields 0.
// p is clamped to [0, 100].
func benchmarkPercentile(values []float64, p float64) float64 {
	if len(values) == 0 {
		return 0
	}
	// Clamp p so out-of-range requests cannot index past the slice bounds:
	// previously p > 100 made `upper` exceed len-1 and panicked, and p < 0
	// produced a negative rank.
	if p < 0 {
		p = 0
	} else if p > 100 {
		p = 100
	}
	sorted := append([]float64(nil), values...) // copy so callers' order is preserved
	sort.Float64s(sorted)
	if len(sorted) == 1 {
		return sorted[0]
	}
	rank := (p / 100.0) * float64(len(sorted)-1)
	lower := int(math.Floor(rank))
	upper := int(math.Ceil(rank))
	if lower == upper {
		return sorted[lower]
	}
	// Interpolate between the two bracketing order statistics.
	frac := rank - float64(lower)
	return sorted[lower] + (sorted[upper]-sorted[lower])*frac
}
|
|
|
|
// benchmarkCV returns the coefficient of variation of values as a percentage:
// population standard deviation divided by the mean, times 100. It returns 0
// for an empty slice or when the mean is 0 (division guard).
func benchmarkCV(values []float64) float64 {
	n := float64(len(values))
	if n == 0 {
		return 0
	}
	var sum float64
	for _, v := range values {
		sum += v
	}
	mean := sum / n
	if mean == 0 {
		return 0
	}
	var sumSq float64
	for _, v := range values {
		d := v - mean
		sumSq += d * d
	}
	// Population (not sample) standard deviation, as in the original.
	return math.Sqrt(sumSq/n) / mean * 100
}
|
|
|
|
// benchmarkClockDrift measures clock decay over a run as a percentage: it
// compares the mean of the first quarter of samples against the mean of the
// last quarter. It returns 0 when there are fewer than 4 samples, when the
// head mean is non-positive, or when clocks did not drop (tail >= head).
func benchmarkClockDrift(values []float64) float64 {
	if len(values) < 4 {
		return 0
	}
	// len >= 4 guarantees window >= 1.
	window := len(values) / 4
	mean := func(vs []float64) float64 {
		var s float64
		for _, v := range vs {
			s += v
		}
		return s / float64(len(vs))
	}
	head := mean(values[:window])
	tail := mean(values[len(values)-window:])
	if head <= 0 || tail >= head {
		return 0
	}
	return (head - tail) / head * 100
}
|
|
|
|
// benchmarkMax returns the largest element of values, or 0 for an empty slice.
func benchmarkMax(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}
	best := values[0]
	for _, v := range values[1:] {
		if v > best {
			best = v
		}
	}
	return best
}
|
|
|
|
// clampScore limits a score to the inclusive range [0, 100].
func clampScore(value float64) float64 {
	if value < 0 {
		return 0
	}
	if value > 100 {
		return 100
	}
	return value
}
|
|
|
|
// dedupeStrings trims each entry, drops blanks, and returns the remaining
// values in first-seen order with duplicates removed. It returns nil when the
// input slice is empty.
func dedupeStrings(values []string) []string {
	if len(values) == 0 {
		return nil
	}
	seen := make(map[string]struct{}, len(values))
	result := make([]string, 0, len(values))
	for _, raw := range values {
		trimmed := strings.TrimSpace(raw)
		if _, dup := seen[trimmed]; trimmed == "" || dup {
			continue
		}
		seen[trimmed] = struct{}{}
		result = append(result, trimmed)
	}
	return result
}
|
|
|
|
// saturatingSub returns after-before, clamped to 0 when the counter did not
// increase (guards against unsigned wrap-around if a counter reset between
// samples).
func saturatingSub(after, before uint64) uint64 {
	if after > before {
		return after - before
	}
	return 0
}
|
|
|
|
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b > a {
		return b
	}
	return a
}
|