Compare commits

..

3 Commits
v7.14 ... v7.18

9 changed files with 511 additions and 556 deletions

View File

@@ -125,6 +125,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
} }
logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected))) logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected)))
var metricRows []GPUMetricRow
gpuBurnLog := filepath.Join(runDir, "gpu-burn.log")
// Server power characterization state — populated during per-GPU phases. // Server power characterization state — populated during per-GPU phases.
var serverIdleW, serverLoadedWSum float64 var serverIdleW, serverLoadedWSum float64
@@ -171,199 +173,202 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10) cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10)
if opts.ParallelGPUs { if opts.ParallelGPUs {
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples) runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, gpuBurnLog)
} else { } else {
for _, idx := range selected { for _, idx := range selected {
gpuResult := BenchmarkGPUResult{ gpuResult := BenchmarkGPUResult{
Index: idx, Index: idx,
Status: "FAILED", Status: "FAILED",
} }
if info, ok := infoByIndex[idx]; ok { if info, ok := infoByIndex[idx]; ok {
gpuResult.UUID = info.UUID gpuResult.UUID = info.UUID
gpuResult.Name = info.Name gpuResult.Name = info.Name
gpuResult.BusID = info.BusID gpuResult.BusID = info.BusID
gpuResult.VBIOS = info.VBIOS gpuResult.VBIOS = info.VBIOS
gpuResult.PowerLimitW = info.PowerLimitW gpuResult.PowerLimitW = info.PowerLimitW
gpuResult.MultiprocessorCount = info.MultiprocessorCount gpuResult.MultiprocessorCount = info.MultiprocessorCount
gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW
gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz
} }
if w, ok := calibPowerByIndex[idx]; ok && w > 0 { if w, ok := calibPowerByIndex[idx]; ok && w > 0 {
gpuResult.CalibratedPeakPowerW = w gpuResult.CalibratedPeakPowerW = w
} }
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz
gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz
}
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
}
gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows)
// Sample server idle power once (first GPU only — server state is global).
if !serverIdleOK {
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
serverIdleW = w
serverIdleOK = true
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
} }
}
warmupCmd := []string{ baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx})
"bee-gpu-burn", if err != nil && err != context.Canceled {
"--seconds", strconv.Itoa(spec.WarmupSec), gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error())
"--size-mb", strconv.Itoa(opts.SizeMB), }
"--devices", strconv.Itoa(idx), gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows)
} appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx))
logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
warmupOut, _, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-warmup", idx), logFunc)
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-warmup.log", idx)), warmupOut, 0644)
if warmupErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
continue
}
// ── Per-precision stability phases ──────────────────────────────────────── // Sample server idle power once (first GPU only — server state is global).
// Run each precision category alone so PowerCVPct reflects genuine GPU if !serverIdleOK {
// power stability, not kernel-mix variance. if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
// Time budget: each phase gets steadySec/numPhases, minimum 60 s. serverIdleW = w
// SteadySec is split equally across all precision phases + 1 combined slot. serverIdleOK = true
// Skipped phases (unsupported precision) are simply omitted; combined is fixed. logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
totalSlots := len(benchmarkPrecisionPhases) + 1 }
perPhaseSec := spec.SteadySec / totalSlots }
if perPhaseSec < 60 {
perPhaseSec = 60 warmupCmd := []string{
} "bee-gpu-burn",
eccBase, _ := queryECCCounters(idx) "--seconds", strconv.Itoa(spec.WarmupSec),
for _, prec := range benchmarkPrecisionPhases { "--size-mb", strconv.Itoa(opts.SizeMB),
phaseCmd := []string{ "--devices", strconv.Itoa(idx),
}
logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut)
if warmupErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error())
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
continue
}
// ── Per-precision stability phases ────────────────────────────────────────
// Run each precision category alone so PowerCVPct reflects genuine GPU
// power stability, not kernel-mix variance.
// Time budget: each phase gets steadySec/numPhases, minimum 60 s.
// SteadySec is split equally across all precision phases + 1 combined slot.
// Skipped phases (unsupported precision) are simply omitted; combined is fixed.
totalSlots := len(benchmarkPrecisionPhases) + 1
perPhaseSec := spec.SteadySec / totalSlots
if perPhaseSec < 60 {
perPhaseSec = 60
}
eccBase, _ := queryECCCounters(idx)
for _, prec := range benchmarkPrecisionPhases {
phaseCmd := []string{
"bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx),
"--precision", prec,
}
logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec))
phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
eccBefore, _ := queryECCCounters(idx)
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, phaseRows, phaseLogName)
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
eccAfter, _ := queryECCCounters(idx)
if phaseErr != nil || len(phaseRows) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase{
Precision: prec,
Steady: summarizeBenchmarkTelemetry(phaseRows),
ECC: diffECCCounters(eccBefore, eccAfter),
}
for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles {
if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
}
}
gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase)
}
beforeThrottle, _ := queryThrottleCounters(idx)
steadyCmd := []string{
"bee-gpu-burn", "bee-gpu-burn",
"--seconds", strconv.Itoa(perPhaseSec), "--seconds", strconv.Itoa(perPhaseSec),
"--size-mb", strconv.Itoa(opts.SizeMB), "--size-mb", strconv.Itoa(opts.SizeMB),
"--devices", strconv.Itoa(idx), "--devices", strconv.Itoa(idx),
"--precision", prec,
} }
logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec)) logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec))
phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec)
eccBefore, _ := queryECCCounters(idx)
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, runDir, phaseLogName, logFunc)
eccAfter, _ := queryECCCounters(idx)
if phaseErr != nil || len(phaseRows) == 0 {
continue
}
phase := BenchmarkPrecisionSteadyPhase{
Precision: prec,
Steady: summarizeBenchmarkTelemetry(phaseRows),
ECC: diffECCCounters(eccBefore, eccAfter),
}
for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles {
if p.Supported {
phase.TeraOpsPerSec += p.TeraOpsPerSec
phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec
}
}
gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase)
}
beforeThrottle, _ := queryThrottleCounters(idx) // Sample server power via IPMI in parallel with the steady phase.
steadyCmd := []string{ // We collect readings every 5s and average them.
"bee-gpu-burn", ipmiStopCh := make(chan struct{})
"--seconds", strconv.Itoa(perPhaseSec), ipmiResultCh := make(chan float64, 1)
"--size-mb", strconv.Itoa(opts.SizeMB), go func() {
"--devices", strconv.Itoa(idx), defer close(ipmiResultCh)
} var samples []float64
logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec)) ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// Sample server power via IPMI in parallel with the steady phase. // First sample after a short warmup delay.
// We collect readings every 5s and average them.
ipmiStopCh := make(chan struct{})
ipmiResultCh := make(chan float64, 1)
go func() {
defer close(ipmiResultCh)
var samples []float64
ticker := time.NewTicker(5 * time.Second)
defer ticker.Stop()
// First sample after a short warmup delay.
select {
case <-ipmiStopCh:
return
case <-time.After(15 * time.Second):
}
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select { select {
case <-ipmiStopCh: case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return return
case <-ticker.C: case <-time.After(15 * time.Second):
} }
for {
if w, err := queryIPMIServerPowerW(); err == nil {
samples = append(samples, w)
}
select {
case <-ipmiStopCh:
if len(samples) > 0 {
var sum float64
for _, w := range samples {
sum += w
}
ipmiResultCh <- sum / float64(len(samples))
}
return
case <-ticker.C:
}
}
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, logFunc)
appendBenchmarkMetrics(&metricRows, steadyRows, fmt.Sprintf("gpu-%d-steady", idx))
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-steady", idx), steadyOut)
close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok {
serverLoadedWSum += loadedW
serverLoadedSamples++
serverLoadedOK = true
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW))
}
afterThrottle, _ := queryThrottleCounters(idx)
if steadyErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
} }
}()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc) parseResult := parseBenchmarkBurnLog(string(steadyOut))
close(ipmiStopCh) gpuResult.ComputeCapability = parseResult.ComputeCapability
if loadedW, ok := <-ipmiResultCh; ok { gpuResult.Backend = parseResult.Backend
serverLoadedWSum += loadedW gpuResult.PrecisionResults = parseResult.Profiles
serverLoadedSamples++ if parseResult.Fallback {
serverLoadedOK = true gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW)) }
gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
if eccFinal, err := queryECCCounters(idx); err == nil {
gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
}
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
}
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx))
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if steadyErr != nil {
gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
} else if parseResult.Fallback {
gpuResult.Status = "PARTIAL"
} else {
gpuResult.Status = "OK"
}
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
} }
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644)
afterThrottle, _ := queryThrottleCounters(idx)
if steadyErr != nil {
gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error())
}
parseResult := parseBenchmarkBurnLog(string(steadyOut))
gpuResult.ComputeCapability = parseResult.ComputeCapability
gpuResult.Backend = parseResult.Backend
gpuResult.PrecisionResults = parseResult.Profiles
if parseResult.Fallback {
gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
}
gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows)
gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle)
if eccFinal, err := queryECCCounters(idx); err == nil {
gpuResult.ECC = diffECCCounters(eccBase, eccFinal)
}
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx})
if err != nil && err != context.Canceled {
gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error())
}
gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), cooldownRows)
gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult)
gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status)
if steadyErr != nil {
gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr)
} else if parseResult.Fallback {
gpuResult.Status = "PARTIAL"
} else {
gpuResult.Status = "OK"
}
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
}
} // end sequential path } // end sequential path
if len(selected) > 1 && opts.RunNCCL { if len(selected) > 1 && opts.RunNCCL {
@@ -413,6 +418,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
result.Findings = buildBenchmarkFindings(result) result.Findings = buildBenchmarkFindings(result)
result.OverallStatus = benchmarkOverallStatus(result) result.OverallStatus = benchmarkOverallStatus(result)
writeBenchmarkMetricsFiles(runDir, metricRows)
resultJSON, err := json.MarshalIndent(result, "", " ") resultJSON, err := json.MarshalIndent(result, "", " ")
if err != nil { if err != nil {
@@ -422,7 +428,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
return "", fmt.Errorf("write result.json: %w", err) return "", fmt.Errorf("write result.json: %w", err)
} }
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected)) report := renderBenchmarkReportWithCharts(result)
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil { if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
return "", fmt.Errorf("write report.md: %w", err) return "", fmt.Errorf("write report.md: %w", err)
} }
@@ -511,11 +517,11 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b
// Split the verbose output into per-GPU sections on "^GPU " lines. // Split the verbose output into per-GPU sections on "^GPU " lines.
gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`) gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`)
maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`) maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`)
maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`) maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`)
defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`) defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`)
currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`) currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`)
smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`) smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`)
sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1) sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1)
for i, loc := range sectionStarts { for i, loc := range sectionStarts {
@@ -651,7 +657,6 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
return nil, lastErr return nil, lastErr
} }
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction { func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
if os.Geteuid() != 0 { if os.Geteuid() != 0 {
result.Normalization.Status = "partial" result.Normalization.Status = "partial"
@@ -754,7 +759,7 @@ func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices []
return rows, nil return rows, nil
} }
func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir, baseName string, logFunc func(string)) ([]byte, []GPUMetricRow, error) { func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, logFunc func(string)) ([]byte, []GPUMetricRow, error) {
stopCh := make(chan struct{}) stopCh := make(chan struct{})
doneCh := make(chan struct{}) doneCh := make(chan struct{})
var metricRows []GPUMetricRow var metricRows []GPUMetricRow
@@ -786,18 +791,65 @@ func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string
close(stopCh) close(stopCh)
<-doneCh <-doneCh
writeBenchmarkMetricsFiles(runDir, baseName, metricRows)
return out, metricRows, err return out, metricRows, err
} }
func writeBenchmarkMetricsFiles(runDir, baseName string, rows []GPUMetricRow) { func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset float64) []GPUMetricRow {
if len(rows) == 0 {
return nil
}
out := make([]GPUMetricRow, len(rows))
for i, row := range rows {
row.Stage = stage
row.ElapsedSec += offset
out[i] = row
}
return out
}
// benchmarkMetricOffset returns the largest ElapsedSec value found in rows.
// It returns 0 when rows is empty or when no row has a positive ElapsedSec.
// appendBenchmarkMetrics uses the result as the time offset for the next
// stage's rows.
func benchmarkMetricOffset(rows []GPUMetricRow) float64 {
	latest := 0.0
	for i := range rows {
		// Strict comparison keeps the running value when elapsed is equal,
		// negative, or NaN.
		if elapsed := rows[i].ElapsedSec; elapsed > latest {
			latest = elapsed
		}
	}
	return latest
}
// appendBenchmarkMetrics tags rows with stage and appends them to *allRows.
// The rows are shifted by the largest ElapsedSec already present in *allRows
// (as computed by benchmarkMetricOffset) before being appended. The caller's
// rows slice is passed to annotateBenchmarkMetricRows, which returns the
// annotated rows that get appended.
func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string) {
	offset := benchmarkMetricOffset(*allRows)
	staged := annotateBenchmarkMetricRows(rows, stage, offset)
	*allRows = append(*allRows, staged...)
}
func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) {
if len(rows) == 0 { if len(rows) == 0 {
return return
} }
_ = WriteGPUMetricsCSV(filepath.Join(runDir, baseName+"-metrics.csv"), rows) _ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), rows)
_ = WriteGPUMetricsHTML(filepath.Join(runDir, baseName+"-metrics.html"), rows) _ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), rows)
chart := RenderGPUTerminalChart(rows) }
_ = os.WriteFile(filepath.Join(runDir, baseName+"-metrics-term.txt"), []byte(chart), 0644)
// appendBenchmarkStageLog appends one stage's raw command output to the log
// file at path, creating the file if it does not exist.
//
// Each entry is preceded by a banner line of the form
//
//	========== <source> | stage=<stage> ==========
//
// and is terminated with a newline if raw does not already end with one.
//
// The call is a no-op when path is empty or raw is empty. The function returns
// nothing and discards open and write errors, so logging is best-effort.
func appendBenchmarkStageLog(path, source, stage string, raw []byte) {
	if path == "" || len(raw) == 0 {
		return
	}
	f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		return
	}
	defer f.Close()

	// Write errors are intentionally ignored (best-effort logging).
	_, _ = f.WriteString(fmt.Sprintf("\n========== %s | stage=%s ==========\n", source, stage))
	// raw is known to be non-empty here, so indexing its last byte is safe.
	_, _ = f.Write(raw)
	if raw[len(raw)-1] != '\n' {
		_, _ = f.WriteString("\n")
	}
}
func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult { func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult {
@@ -897,11 +949,13 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri
// precisionWeight returns the fp32-equivalence factor for a precision category. // precisionWeight returns the fp32-equivalence factor for a precision category.
// Each factor represents how much "real" numeric work one operation of that // Each factor represents how much "real" numeric work one operation of that
// type performs relative to fp32 (single precision = 1.0 baseline): // type performs relative to fp32 (single precision = 1.0 baseline):
// fp64 = 2.0 — double precision, 2× more bits per operand //
// fp32 = 1.0 — single precision baseline // fp64 = 2.0 — double precision, 2× more bits per operand
// fp16 = 0.5 — half precision // fp32 = 1.0 — single precision baseline
// fp8 = 0.25 — quarter precision // fp16 = 0.5 — half precision
// fp4 = 0.125 — eighth precision // fp8 = 0.25 — quarter precision
// fp4 = 0.125 — eighth precision
//
// Multiplying raw TOPS by the weight gives fp32-equivalent TOPS, enabling // Multiplying raw TOPS by the weight gives fp32-equivalent TOPS, enabling
// cross-precision comparison on the same numeric scale. // cross-precision comparison on the same numeric scale.
func precisionWeight(category string) float64 { func precisionWeight(category string) float64 {
@@ -1670,6 +1724,8 @@ func runNvidiaBenchmarkParallel(
calibPowerByIndex map[int]float64, calibPowerByIndex map[int]float64,
serverIdleW *float64, serverLoadedWSum *float64, serverIdleW *float64, serverLoadedWSum *float64,
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int, serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
allMetricRows *[]GPUMetricRow,
gpuBurnLog string,
) { ) {
allDevices := joinIndexList(selected) allDevices := joinIndexList(selected)
@@ -1709,8 +1765,8 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected { for _, idx := range selected {
perGPU := filterRowsByGPU(baselineRows, idx) perGPU := filterRowsByGPU(baselineRows, idx)
gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU)
} }
appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline")
// Sample server idle power once. // Sample server idle power once.
if !*serverIdleOK { if !*serverIdleOK {
@@ -1729,11 +1785,9 @@ func runNvidiaBenchmarkParallel(
"--devices", allDevices, "--devices", allDevices,
} }
logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec)) logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc) warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644) appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup")
for _, idx := range selected { appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx))
}
if warmupErr != nil { if warmupErr != nil {
for _, idx := range selected { for _, idx := range selected {
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error()) gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
@@ -1764,7 +1818,9 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected { for _, idx := range selected {
eccBeforePhase[idx], _ = queryECCCounters(idx) eccBeforePhase[idx], _ = queryECCCounters(idx)
} }
phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, runDir, phaseLogName, logFunc) phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, phaseRows, phaseLogName)
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut)
eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected)) eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected))
for _, idx := range selected { for _, idx := range selected {
eccAfterPhase[idx], _ = queryECCCounters(idx) eccAfterPhase[idx], _ = queryECCCounters(idx)
@@ -1842,7 +1898,9 @@ func runNvidiaBenchmarkParallel(
} }
}() }()
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc) steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, logFunc)
appendBenchmarkMetrics(allMetricRows, steadyRows, "steady")
appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "steady", steadyOut)
close(ipmiStopCh) close(ipmiStopCh)
if loadedW, ok := <-ipmiResultCh; ok { if loadedW, ok := <-ipmiResultCh; ok {
*serverLoadedWSum += loadedW *serverLoadedWSum += loadedW
@@ -1850,8 +1908,6 @@ func runNvidiaBenchmarkParallel(
*serverLoadedOK = true *serverLoadedOK = true
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW)) logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
} }
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644)
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected)) afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
for _, idx := range selected { for _, idx := range selected {
afterThrottle[idx], _ = queryThrottleCounters(idx) afterThrottle[idx], _ = queryThrottleCounters(idx)
@@ -1861,7 +1917,6 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected { for _, idx := range selected {
perGPU := filterRowsByGPU(steadyRows, idx) perGPU := filterRowsByGPU(steadyRows, idx)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx]) gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
if eccFinal, err := queryECCCounters(idx); err == nil { if eccFinal, err := queryECCCounters(idx); err == nil {
@@ -1891,8 +1946,8 @@ func runNvidiaBenchmarkParallel(
for _, idx := range selected { for _, idx := range selected {
perGPU := filterRowsByGPU(cooldownRows, idx) perGPU := filterRowsByGPU(cooldownRows, idx)
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU)
} }
appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown")
// Score and finalize each GPU. // Score and finalize each GPU.
for _, idx := range selected { for _, idx := range selected {
@@ -2102,7 +2157,7 @@ func runBenchmarkPowerCalibration(
logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices))) logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices)))
cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices) cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices)
out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, runDir, "power-calibration", logFunc) out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, logFunc)
_ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644) _ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644)
if err != nil { if err != nil {
logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err)) logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err))

View File

@@ -2,25 +2,15 @@ package platform
import ( import (
"fmt" "fmt"
"os"
"path/filepath"
"regexp"
"strings" "strings"
"time" "time"
) )
func renderBenchmarkReport(result NvidiaBenchmarkResult) string { func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
return renderBenchmarkReportWithCharts(result, nil) return renderBenchmarkReportWithCharts(result)
} }
type benchmarkReportChart struct { func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
Title string
Content string
}
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
var b strings.Builder var b strings.Builder
// ── Header ──────────────────────────────────────────────────────────────── // ── Header ────────────────────────────────────────────────────────────────
@@ -91,8 +81,12 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
b.WriteString("\n") b.WriteString("\n")
} }
// ── Scoring methodology ─────────────────────────────────────────────────── // ── Methodology ───────────────────────────────────────────────────────────
b.WriteString("## Scoring Methodology\n\n") b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline -> warmup -> steady-state -> interconnect -> cooldown phases.\n", result.BenchmarkProfile)
b.WriteString("- Single-GPU compute score comes from `bee-gpu-burn` on the cuBLASLt path when available.\n")
b.WriteString("- Thermal and power limits are inferred from NVIDIA clock-event counters plus sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for the run.\n\n")
b.WriteString("**Compute score** is derived from two phases:\n\n") b.WriteString("**Compute score** is derived from two phases:\n\n")
b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ") b.WriteString("- **Synthetic** — each precision type (fp8, fp16, fp32, fp64, fp4) runs alone for a dedicated window. ")
b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ") b.WriteString("Measures peak throughput with the full GPU dedicated to one kernel type. ")
@@ -213,7 +207,6 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct) gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct)
} }
// ECC summary // ECC summary
if !gpu.ECC.IsZero() { if !gpu.ECC.IsZero() {
fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n", fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n",
@@ -297,61 +290,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
} }
} }
// ── Terminal charts (steady-state only) ───────────────────────────────────
if len(charts) > 0 {
b.WriteString("## Steady-State Charts\n\n")
for _, chart := range charts {
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
if content == "" {
continue
}
fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
}
}
// ── Methodology ───────────────────────────────────────────────────────────
b.WriteString("## Methodology\n\n")
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
// ── Raw files ───────────────────────────────────────────────────────────── // ── Raw files ─────────────────────────────────────────────────────────────
b.WriteString("## Raw Files\n\n") b.WriteString("## Raw Files\n\n")
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n") b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n") b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n")
b.WriteString("- `gpu-*-warmup.log`\n")
b.WriteString("- `gpu-*-steady.log`\n")
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
if result.Interconnect != nil { if result.Interconnect != nil {
b.WriteString("- `nccl-all-reduce.log`\n") b.WriteString("- `nccl-all-reduce.log`\n")
} }
return b.String() return b.String()
} }
// loadBenchmarkReportCharts collects the steady-state terminal chart of every
// GPU listed in gpuIndices from runDir. Baseline and cooldown charts are
// deliberately not loaded because they are not useful for human review.
//
// A GPU whose chart file cannot be read, or whose file is empty, is silently
// left out; the result is nil when no chart could be loaded.
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
	var loaded []benchmarkReportChart
	for _, gpu := range gpuIndices {
		fileName := fmt.Sprintf("gpu-%d-steady-metrics-term.txt", gpu)
		data, readErr := os.ReadFile(filepath.Join(runDir, fileName))
		if readErr == nil && len(data) > 0 {
			loaded = append(loaded, benchmarkReportChart{
				Title:   fmt.Sprintf("GPU %d — Steady State", gpu),
				Content: string(data),
			})
		}
	}
	return loaded
}
// stripANSIEscapeSequences returns raw with every match of ansiEscapePattern
// removed, leaving all other text untouched.
func stripANSIEscapeSequences(raw string) string {
	// The replacement is empty, so the literal variant yields the same result
	// as the template-expanding one.
	cleaned := ansiEscapePattern.ReplaceAllLiteralString(raw, "")
	return cleaned
}
// formatThrottleLine renders throttle counters as human-readable percentages of // formatThrottleLine renders throttle counters as human-readable percentages of
// the steady-state window. Only non-zero counters are shown. When the steady // the steady-state window. Only non-zero counters are shown. When the steady
// duration is unknown (0), raw seconds are shown instead. // duration is unknown (0), raw seconds are shown instead.

View File

@@ -147,36 +147,27 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
} }
} }
func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) { func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) {
t.Parallel() t.Parallel()
report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{ report := renderBenchmarkReport(NvidiaBenchmarkResult{
BenchmarkProfile: NvidiaBenchmarkProfileStandard, BenchmarkProfile: NvidiaBenchmarkProfileStandard,
OverallStatus: "OK", OverallStatus: "OK",
SelectedGPUIndices: []int{0}, SelectedGPUIndices: []int{0},
Normalization: BenchmarkNormalization{ Normalization: BenchmarkNormalization{
Status: "full", Status: "full",
}, },
}, []benchmarkReportChart{
{
Title: "GPU 0 Steady State",
Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
},
}) })
for _, needle := range []string{ for _, needle := range []string{
"Steady-State Charts", "gpu-metrics.csv",
"GPU 0 Steady State", "gpu-metrics.html",
"GPU 0 chart", "gpu-burn.log",
"42┤───",
} { } {
if !strings.Contains(report, needle) { if !strings.Contains(report, needle) {
t.Fatalf("report missing %q\n%s", needle, report) t.Fatalf("report missing %q\n%s", needle, report)
} }
} }
if strings.Contains(report, "\x1b[31m") {
t.Fatalf("report should not contain ANSI escapes\n%s", report)
}
} }
func TestEnrichGPUInfoWithMaxClocks(t *testing.T) { func TestEnrichGPUInfoWithMaxClocks(t *testing.T) {

View File

@@ -43,7 +43,6 @@ type NvidiaBenchmarkOptions struct {
RampRunID string // shared identifier across all steps of the same ramp-up run RampRunID string // shared identifier across all steps of the same ramp-up run
} }
type NvidiaBenchmarkResult struct { type NvidiaBenchmarkResult struct {
BenchmarkVersion string `json:"benchmark_version"` BenchmarkVersion string `json:"benchmark_version"`
GeneratedAt time.Time `json:"generated_at"` GeneratedAt time.Time `json:"generated_at"`
@@ -84,38 +83,38 @@ type BenchmarkNormalizationGPU struct {
} }
type BenchmarkGPUResult struct { type BenchmarkGPUResult struct {
Index int `json:"index"` Index int `json:"index"`
UUID string `json:"uuid,omitempty"` UUID string `json:"uuid,omitempty"`
Name string `json:"name,omitempty"` Name string `json:"name,omitempty"`
BusID string `json:"bus_id,omitempty"` BusID string `json:"bus_id,omitempty"`
VBIOS string `json:"vbios,omitempty"` VBIOS string `json:"vbios,omitempty"`
ComputeCapability string `json:"compute_capability,omitempty"` ComputeCapability string `json:"compute_capability,omitempty"`
Backend string `json:"backend,omitempty"` Backend string `json:"backend,omitempty"`
Status string `json:"status"` Status string `json:"status"`
PowerLimitW float64 `json:"power_limit_w,omitempty"` PowerLimitW float64 `json:"power_limit_w,omitempty"`
MultiprocessorCount int `json:"multiprocessor_count,omitempty"` MultiprocessorCount int `json:"multiprocessor_count,omitempty"`
DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
// CalibratedPeakPowerW is the p95 power measured during a short // CalibratedPeakPowerW is the p95 power measured during a short
// dcgmi targeted_power calibration run before the main benchmark. // dcgmi targeted_power calibration run before the main benchmark.
// Used as the reference denominator for PowerSustainScore instead of // Used as the reference denominator for PowerSustainScore instead of
// the hardware default limit, which bee-gpu-burn cannot reach. // the hardware default limit, which bee-gpu-burn cannot reach.
CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"` CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"`
MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"` MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"`
BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"` BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"`
MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"` MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"`
LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"` LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"`
LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"` LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"`
Baseline BenchmarkTelemetrySummary `json:"baseline"` Baseline BenchmarkTelemetrySummary `json:"baseline"`
Steady BenchmarkTelemetrySummary `json:"steady"` Steady BenchmarkTelemetrySummary `json:"steady"`
PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"` PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"`
Cooldown BenchmarkTelemetrySummary `json:"cooldown"` Cooldown BenchmarkTelemetrySummary `json:"cooldown"`
Throttle BenchmarkThrottleCounters `json:"throttle_counters"` Throttle BenchmarkThrottleCounters `json:"throttle_counters"`
// ECC error delta accumulated over the full benchmark (all phases combined). // ECC error delta accumulated over the full benchmark (all phases combined).
ECC BenchmarkECCCounters `json:"ecc,omitempty"` ECC BenchmarkECCCounters `json:"ecc,omitempty"`
PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"` PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"`
Scores BenchmarkScorecard `json:"scores"` Scores BenchmarkScorecard `json:"scores"`
DegradationReasons []string `json:"degradation_reasons,omitempty"` DegradationReasons []string `json:"degradation_reasons,omitempty"`
Notes []string `json:"notes,omitempty"` Notes []string `json:"notes,omitempty"`
} }
type BenchmarkTelemetrySummary struct { type BenchmarkTelemetrySummary struct {
@@ -170,19 +169,19 @@ type BenchmarkPrecisionResult struct {
// Weight is the fp32-equivalence factor for this precision category. // Weight is the fp32-equivalence factor for this precision category.
// fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125. // fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125.
// WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput. // WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput.
Weight float64 `json:"weight,omitempty"` Weight float64 `json:"weight,omitempty"`
WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"` WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"`
Notes string `json:"notes,omitempty"` Notes string `json:"notes,omitempty"`
} }
type BenchmarkScorecard struct { type BenchmarkScorecard struct {
ComputeScore float64 `json:"compute_score"` ComputeScore float64 `json:"compute_score"`
// SyntheticScore is the sum of fp32-equivalent TOPS from per-precision // SyntheticScore is the sum of fp32-equivalent TOPS from per-precision
// steady phases (each precision ran alone, full GPU dedicated). // steady phases (each precision ran alone, full GPU dedicated).
SyntheticScore float64 `json:"synthetic_score,omitempty"` SyntheticScore float64 `json:"synthetic_score,omitempty"`
// MixedScore is the sum of fp32-equivalent TOPS from the combined phase // MixedScore is the sum of fp32-equivalent TOPS from the combined phase
// (all precisions competing simultaneously — closer to real workloads). // (all precisions competing simultaneously — closer to real workloads).
MixedScore float64 `json:"mixed_score,omitempty"` MixedScore float64 `json:"mixed_score,omitempty"`
// MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU // MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU
// sustains throughput under concurrent mixed-precision load. // sustains throughput under concurrent mixed-precision load.
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"` MixedEfficiency float64 `json:"mixed_efficiency,omitempty"`
@@ -220,7 +219,7 @@ type BenchmarkPrecisionSteadyPhase struct {
// ECC errors accumulated during this precision phase only. // ECC errors accumulated during this precision phase only.
// Non-zero corrected = stress-induced DRAM errors for this kernel type. // Non-zero corrected = stress-induced DRAM errors for this kernel type.
// Any uncorrected = serious fault triggered by this precision workload. // Any uncorrected = serious fault triggered by this precision workload.
ECC BenchmarkECCCounters `json:"ecc,omitempty"` ECC BenchmarkECCCounters `json:"ecc,omitempty"`
} }
type BenchmarkInterconnectResult struct { type BenchmarkInterconnectResult struct {

View File

@@ -13,6 +13,7 @@ import (
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test. // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
type GPUMetricRow struct { type GPUMetricRow struct {
Stage string `json:"stage,omitempty"`
ElapsedSec float64 `json:"elapsed_sec"` ElapsedSec float64 `json:"elapsed_sec"`
GPUIndex int `json:"index"` GPUIndex int `json:"index"`
TempC float64 `json:"temp_c"` TempC float64 `json:"temp_c"`
@@ -141,14 +142,20 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
// WriteGPUMetricsCSV writes collected rows as a CSV file. // WriteGPUMetricsCSV writes collected rows as a CSV file.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
var b bytes.Buffer var b bytes.Buffer
b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n") b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n")
for _, r := range rows { for _, r := range rows {
fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n", fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n",
r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz) strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz)
} }
return os.WriteFile(path, b.Bytes(), 0644) return os.WriteFile(path, b.Bytes(), 0644)
} }
// gpuMetricStageSpan is one contiguous run of telemetry rows that share the
// same stage name, expressed as an interval on the elapsed-time axis.
type gpuMetricStageSpan struct {
	// Name is the stage label shown in the legend and on the chart.
	Name string
	// Start and End bound the span on the ElapsedSec axis of the metric rows.
	Start, End float64
}
// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU. // WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error { func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
// Group by GPU index preserving order. // Group by GPU index preserving order.
@@ -163,9 +170,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r) gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r)
} }
stageSpans := buildGPUMetricStageSpans(rows)
stageColorByName := make(map[string]string, len(stageSpans))
for i, span := range stageSpans {
stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)]
}
var legend strings.Builder
if len(stageSpans) > 0 {
legend.WriteString(`<div class="stage-legend">`)
for _, span := range stageSpans {
fmt.Fprintf(&legend, `<span class="stage-chip"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
stageColorByName[span.Name], gpuHTMLEscape(span.Name))
}
legend.WriteString(`</div>`)
}
var svgs strings.Builder var svgs strings.Builder
for _, gpuIdx := range order { for _, gpuIdx := range order {
svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx)) svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
svgs.WriteString("\n") svgs.WriteString("\n")
} }
@@ -175,21 +198,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
<meta charset="utf-8"> <meta charset="utf-8">
<title>GPU Stress Test Metrics</title> <title>GPU Stress Test Metrics</title>
<style> <style>
body { font-family: sans-serif; background: #f0f0f0; margin: 0; padding: 20px; } :root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6)}
h1 { text-align: center; color: #333; margin: 0 0 8px; } *{box-sizing:border-box}
p { text-align: center; color: #888; font-size: 13px; margin: 0 0 24px; } body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);margin:0}
.page{padding:24px}
.card{background:var(--surface);border:1px solid var(--border);border-radius:4px;box-shadow:0 1px 2px rgba(34,36,38,.15);overflow:hidden}
.card-head{padding:11px 16px;background:var(--surface-2);border-bottom:1px solid var(--border);font-weight:700;font-size:13px}
.card-body{padding:16px}
h1{font-size:22px;margin:0 0 6px}
p{color:var(--muted);font-size:13px;margin:0 0 16px}
.stage-legend{display:flex;flex-wrap:wrap;gap:10px;margin:0 0 16px}
.stage-chip{display:inline-flex;align-items:center;gap:8px;padding:4px 10px;border-radius:999px;background:var(--surface-2);border:1px solid var(--border-lite);font-size:12px}
.stage-swatch{display:inline-block;width:12px;height:12px;border-radius:999px}
.chart-block{margin-top:16px}
</style> </style>
</head><body> </head><body>
<div class="page">
<div class="card">
<div class="card-head">GPU Stress Test Metrics</div>
<div class="card-body">
<h1>GPU Stress Test Metrics</h1> <h1>GPU Stress Test Metrics</h1>
<p>Generated %s</p> <p>Generated %s</p>
%s %s
</body></html>`, ts, svgs.String()) <div class="chart-block">%s</div>
</div>
</div>
</div>
</body></html>`, ts, legend.String(), svgs.String())
return os.WriteFile(path, []byte(html), 0644) return os.WriteFile(path, []byte(html), 0644)
} }
// drawGPUChartSVG generates a self-contained SVG chart for one GPU. // drawGPUChartSVG generates a self-contained SVG chart for one GPU.
func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string { func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
// Layout // Layout
const W, H = 960, 520 const W, H = 960, 520
const plotX1 = 120 // usage axis / chart left border const plotX1 = 120 // usage axis / chart left border
@@ -284,6 +325,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
} }
b.WriteString("</g>\n") b.WriteString("</g>\n")
// Stage backgrounds
for _, span := range stageSpans {
x1 := xv(span.Start)
x2 := xv(span.End)
if x2 < x1 {
x1, x2 = x2, x1
}
if x2-x1 < 1 {
x2 = x1 + 1
}
color := stageColorByName[span.Name]
fmt.Fprintf(&b, `<rect x="%.1f" y="%d" width="%.1f" height="%d" fill="%s" fill-opacity="0.18"/>`+"\n",
x1, plotY1, x2-x1, PH, color)
fmt.Fprintf(&b, `<text x="%.1f" y="%d" font-family="sans-serif" font-size="10" fill="#444" text-anchor="middle">%s</text>`+"\n",
x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
}
// Chart border // Chart border
fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+ fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d"`+
` fill="none" stroke="#333" stroke-width="1"/>`+"\n", ` fill="none" stroke="#333" stroke-width="1"/>`+"\n",
@@ -382,221 +440,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
return b.String() return b.String()
} }
// Terminal chart rendering constants.
const (
	// ansiAmber switches the terminal foreground to colour 214 of the
	// 256-colour palette; ansiReset restores the default attributes.
	ansiAmber = "\033[38;5;214m"
	ansiReset = "\033[0m"

	// termChartWidth and termChartHeight are the width and height passed to
	// renderLineChart for each terminal chart.
	termChartWidth  = 70
	termChartHeight = 12
)
// RenderGPUTerminalChart renders ANSI-coloured line charts (asciigraph-style)
// for every GPU present in rows. It is used in SAT stress-test logs.
//
// GPUs appear in the order in which their index is first seen in rows. Each
// GPU gets a heading followed by four charts (temperature, usage, power and
// clock). Trailing newlines are trimmed from the returned text.
func RenderGPUTerminalChart(rows []GPUMetricRow) string {
	// Bucket the samples per GPU while remembering first-seen order.
	var gpuOrder []int
	samplesByGPU := make(map[int][]GPUMetricRow)
	for _, sample := range rows {
		if _, known := samplesByGPU[sample.GPUIndex]; !known {
			gpuOrder = append(gpuOrder, sample.GPUIndex)
		}
		samplesByGPU[sample.GPUIndex] = append(samplesByGPU[sample.GPUIndex], sample)
	}

	// One chart per metric; every series is drawn in the same colour.
	series := []struct {
		caption string
		color   string
		pick    func(GPUMetricRow) float64
	}{
		{caption: "Temperature (°C)", color: ansiAmber, pick: func(r GPUMetricRow) float64 { return r.TempC }},
		{caption: "GPU Usage (%)", color: ansiAmber, pick: func(r GPUMetricRow) float64 { return r.UsagePct }},
		{caption: "Power (W)", color: ansiAmber, pick: func(r GPUMetricRow) float64 { return r.PowerW }},
		{caption: "Clock (MHz)", color: ansiAmber, pick: func(r GPUMetricRow) float64 { return r.ClockMHz }},
	}

	var out strings.Builder
	for _, gpu := range gpuOrder {
		samples := samplesByGPU[gpu]
		if len(samples) == 0 {
			continue
		}
		// Duration covered by this GPU's samples.
		span := samples[len(samples)-1].ElapsedSec - samples[0].ElapsedSec
		fmt.Fprintf(&out, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpu, span)
		for i := range series {
			values := extractGPUField(samples, series[i].pick)
			chart := renderLineChart(values, series[i].color, series[i].caption, termChartHeight, termChartWidth)
			out.WriteString(chart)
			out.WriteRune('\n')
		}
	}
	return strings.TrimRight(out.String(), "\n")
}
// renderLineChart draws a single time-series line chart using box-drawing characters.
// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption.
//
// vals is the series to plot, color is the escape sequence written before each
// plot row (ansiReset is written after it), caption is centred under the
// chart, and height/width bound the plot area in character cells. The chart
// has height+1 rows and at most width columns.
//
// When vals is empty, or the requested plot area is degenerate (width < 1 or
// height < 0), only the caption line is returned instead of indexing an empty
// grid.
func renderLineChart(vals []float64, color, caption string, height, width int) string {
	if len(vals) == 0 || width < 1 || height < 0 {
		return caption + "\n"
	}

	mn, mx := gpuMinMax(vals)
	if mn == mx {
		// Flat series: widen the range so the scaling below never divides by zero.
		mx = mn + 1
	}

	// Use the smaller of width or len(vals) to avoid stretching sparse data.
	w := width
	if len(vals) < w {
		w = len(vals)
	}
	data := gpuDownsample(vals, w)

	// row[i] = display row index: 0 = top = max value, height = bottom = min value.
	row := make([]int, w)
	for i, v := range data {
		r := int(math.Round((mx - v) / (mx - mn) * float64(height)))
		if r < 0 {
			r = 0
		}
		if r > height {
			r = height
		}
		row[i] = r
	}

	// Fill the character grid.
	grid := make([][]rune, height+1)
	for i := range grid {
		grid[i] = make([]rune, w)
		for j := range grid[i] {
			grid[i][j] = ' '
		}
	}
	for x := 0; x < w; x++ {
		r := row[x]
		if x == 0 {
			grid[r][0] = '─'
			continue
		}
		p := row[x-1]
		switch {
		case r == p:
			grid[r][x] = '─'
		case r < p: // value went up (row index decreased toward top)
			grid[r][x] = '╭'
			grid[p][x] = '╯'
			for y := r + 1; y < p; y++ {
				grid[y][x] = '│'
			}
		default: // r > p, value went down
			grid[p][x] = '╮'
			grid[r][x] = '╰'
			for y := p + 1; y < r; y++ {
				grid[y][x] = '│'
			}
		}
	}

	// Y axis tick labels.
	ticks := gpuNiceTicks(mn, mx, height/2)
	tickAtRow := make(map[int]string)
	labelWidth := 4
	for _, t := range ticks {
		r := int(math.Round((mx - t) / (mx - mn) * float64(height)))
		if r < 0 || r > height {
			continue
		}
		s := gpuFormatTick(t)
		tickAtRow[r] = s
		if len(s) > labelWidth {
			labelWidth = len(s)
		}
	}

	var b strings.Builder
	for r := 0; r <= height; r++ {
		label := tickAtRow[r]
		fmt.Fprintf(&b, "%*s", labelWidth, label)
		switch {
		case label != "":
			b.WriteRune('┤')
		case r == height:
			b.WriteRune('┼')
		default:
			b.WriteRune('│')
		}
		b.WriteString(color)
		b.WriteString(string(grid[r]))
		b.WriteString(ansiReset)
		b.WriteRune('\n')
	}

	// Bottom axis.
	b.WriteString(strings.Repeat(" ", labelWidth))
	b.WriteRune('└')
	b.WriteString(strings.Repeat("─", w))
	b.WriteRune('\n')

	// Caption centered under the chart. Width is measured in runes, not
	// bytes, so captions containing multi-byte characters (e.g. "°") are not
	// shifted to the left.
	if caption != "" {
		total := labelWidth + 1 + w
		if pad := (total - len([]rune(caption))) / 2; pad > 0 {
			b.WriteString(strings.Repeat(" ", pad))
		}
		b.WriteString(caption)
		b.WriteRune('\n')
	}
	return b.String()
}
// extractGPUField applies fn to every row, in order, and returns the results
// as a slice with one value per row.
func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 {
	values := make([]float64, 0, len(rows))
	for _, row := range rows {
		values = append(values, fn(row))
	}
	return values
}
// gpuDownsample resamples vals to exactly w points.
//
// When len(vals) >= w the input is split into w consecutive buckets and each
// output point is the mean of its bucket. When len(vals) < w the input is
// stretched by nearest-neighbour lookup. An empty input yields w zeros.
//
// A non-positive w yields an empty, non-nil slice; previously w == 0 with
// non-empty input indexed result[-1] and a negative w panicked in make.
func gpuDownsample(vals []float64, w int) []float64 {
	if w <= 0 {
		return []float64{}
	}
	n := len(vals)
	result := make([]float64, w)
	if n == 0 {
		return result
	}
	if n >= w {
		counts := make([]int, w)
		for i, v := range vals {
			bucket := i * w / n
			if bucket >= w {
				bucket = w - 1
			}
			result[bucket] += v
			counts[bucket]++
		}
		for i := range result {
			if counts[i] > 0 {
				result[i] /= float64(counts[i])
			}
		}
		return result
	}
	// Nearest-neighbour upsample. Here 1 <= n < w, so w-1 >= 1 and the
	// division below is safe.
	for i := range result {
		src := i * (n - 1) / (w - 1)
		if src >= n {
			src = n - 1
		}
		result[i] = vals[src]
	}
	return result
}
func gpuMinMax(vals []float64) (float64, float64) { func gpuMinMax(vals []float64) (float64, float64) {
if len(vals) == 0 { if len(vals) == 0 {
return 0, 1 return 0, 1
@@ -641,3 +484,46 @@ func gpuFormatTick(v float64) string {
} }
return strconv.FormatFloat(v, 'f', 1, 64) return strconv.FormatFloat(v, 'f', 1, 64)
} }
// gpuMetricStagePalette lists the hex colours assigned to benchmark stages in
// the HTML metrics report. Callers index it modulo its length, so the colours
// repeat when there are more stages than entries.
var gpuMetricStagePalette = []string{
	"#d95c5c", "#2185d0", "#21ba45", "#f2c037",
	"#6435c9", "#00b5ad", "#a5673f",
}
// buildGPUMetricStageSpans folds rows, in order, into consecutive stage spans.
//
// A new span starts whenever the trimmed stage name differs from that of the
// previous row; rows without a stage name are grouped under "run". Each span
// runs from the ElapsedSec of its first row to that of its last row, and any
// span whose end does not exceed its start is stretched to one unit past the
// start so it keeps a non-zero width.
func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan {
	var spans []gpuMetricStageSpan
	for i := range rows {
		stage := strings.TrimSpace(rows[i].Stage)
		if stage == "" {
			stage = "run"
		}
		at := rows[i].ElapsedSec
		if last := len(spans) - 1; last >= 0 && spans[last].Name == stage {
			// Same stage as the previous row: extend the open span.
			spans[last].End = at
		} else {
			spans = append(spans, gpuMetricStageSpan{Name: stage, Start: at, End: at})
		}
	}
	for i := range spans {
		if span := &spans[i]; span.End <= span.Start {
			span.End = span.Start + 1
		}
	}
	return spans
}
// gpuHTMLReplacer rewrites the five characters that are significant in HTML
// text and attribute values (&, <, >, double quote, single quote) into their
// entity forms. It is built once at package scope.
var gpuHTMLReplacer = func() *strings.Replacer {
	pairs := []string{
		"&", "&amp;",
		"<", "&lt;",
		">", "&gt;",
		`"`, "&quot;",
		"'", "&#39;",
	}
	return strings.NewReplacer(pairs...)
}()

// gpuHTMLEscape returns s with HTML-significant characters replaced by
// entities so it can be embedded in generated HTML/SVG markup.
func gpuHTMLEscape(s string) string {
	escaped := gpuHTMLReplacer.Replace(s)
	return escaped
}

View File

@@ -0,0 +1,65 @@
package platform
import (
"os"
"path/filepath"
"strings"
"testing"
)
// TestWriteGPUMetricsCSVIncludesStageColumn writes a single sample and checks
// that the CSV contains the stage header column and the quoted stage value at
// the start of the data row.
func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
	t.Parallel()

	csvPath := filepath.Join(t.TempDir(), "gpu-metrics.csv")
	sample := GPUMetricRow{
		Stage:       "warmup",
		ElapsedSec:  1,
		GPUIndex:    0,
		TempC:       71,
		UsagePct:    99,
		MemUsagePct: 80,
		PowerW:      420,
		ClockMHz:    1800,
		MemClockMHz: 1200,
	}
	if err := WriteGPUMetricsCSV(csvPath, []GPUMetricRow{sample}); err != nil {
		t.Fatalf("WriteGPUMetricsCSV: %v", err)
	}

	data, err := os.ReadFile(csvPath)
	if err != nil {
		t.Fatalf("ReadFile: %v", err)
	}
	got := string(data)

	wanted := [...]string{
		"stage,elapsed_sec,gpu_index",
		`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
	}
	for i := range wanted {
		if strings.Contains(got, wanted[i]) {
			continue
		}
		t.Fatalf("csv missing %q\n%s", wanted[i], got)
	}
}
// TestWriteGPUMetricsHTMLShowsStageLegendAndLabels writes samples covering two
// stages and checks that the HTML report contains the stage legend, both
// stage names and the page title.
func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
	t.Parallel()

	htmlPath := filepath.Join(t.TempDir(), "gpu-metrics.html")
	samples := []GPUMetricRow{
		{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
		{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
		{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
		{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
	}
	if err := WriteGPUMetricsHTML(htmlPath, samples); err != nil {
		t.Fatalf("WriteGPUMetricsHTML: %v", err)
	}

	data, err := os.ReadFile(htmlPath)
	if err != nil {
		t.Fatalf("ReadFile: %v", err)
	}
	got := string(data)

	wanted := [...]string{
		"stage-legend",
		"baseline",
		"steady-fp16",
		"GPU Stress Test Metrics",
	}
	for i := range wanted {
		if strings.Contains(got, wanted[i]) {
			continue
		}
		t.Fatalf("html missing %q\n%s", wanted[i], got)
	}
}

View File

@@ -108,15 +108,15 @@ type nvidiaGPUHealth struct {
} }
type nvidiaGPUStatusFile struct { type nvidiaGPUStatusFile struct {
Index int Index int
Name string Name string
RunStatus string RunStatus string
Reason string Reason string
Health string Health string
HealthRaw string HealthRaw string
Observed bool Observed bool
Selected bool Selected bool
FailingJob string FailingJob string
} }
// AMDGPUInfo holds basic info about an AMD GPU from rocm-smi. // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi.
@@ -410,13 +410,13 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode( return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}}, satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
satJob{ satJob{
name: "03-dcgmproftester.log", name: "03-dcgmproftester.log",
cmd: profCmd, cmd: profCmd,
env: profEnv, env: profEnv,
collectGPU: true, collectGPU: true,
gpuIndices: selected, gpuIndices: selected,
}, },
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
), logFunc) ), logFunc)
} }
@@ -1382,8 +1382,6 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd
if len(metricRows) > 0 { if len(metricRows) > 0 {
_ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows) _ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows)
_ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows) _ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows)
chart := RenderGPUTerminalChart(metricRows)
_ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644)
} }
return out, err return out, err

View File

@@ -33,7 +33,6 @@ typedef void *CUstream;
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76 #define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
#define MAX_STRESS_STREAMS 16 #define MAX_STRESS_STREAMS 16
#define MAX_CUBLAS_PROFILES 5
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u) #define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u) #define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
@@ -689,6 +688,8 @@ static const struct profile_desc k_profiles[] = {
#endif #endif
}; };
#define PROFILE_COUNT ((int)(sizeof(k_profiles) / sizeof(k_profiles[0])))
static int load_cublaslt(struct cublaslt_api *api) { static int load_cublaslt(struct cublaslt_api *api) {
memset(api, 0, sizeof(*api)); memset(api, 0, sizeof(*api));
api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL); api->lib = dlopen("libcublasLt.so.13", RTLD_NOW | RTLD_LOCAL);
@@ -1124,7 +1125,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
const char *precision_filter, const char *precision_filter,
struct stress_report *report) { struct stress_report *report) {
struct cublaslt_api cublas; struct cublaslt_api cublas;
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES]; struct prepared_profile prepared[MAX_STRESS_STREAMS * PROFILE_COUNT];
cublasLtHandle_t handle = NULL; cublasLtHandle_t handle = NULL;
CUcontext ctx = NULL; CUcontext ctx = NULL;
CUstream streams[MAX_STRESS_STREAMS] = {0}; CUstream streams[MAX_STRESS_STREAMS] = {0};
@@ -1134,7 +1135,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
int active = 0; int active = 0;
int mp_count = 0; int mp_count = 0;
int stream_count = 1; int stream_count = 1;
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0])); int profile_count = PROFILE_COUNT;
int prepared_count = 0; int prepared_count = 0;
size_t requested_budget = 0; size_t requested_budget = 0;
size_t total_budget = 0; size_t total_budget = 0;

View File

@@ -874,8 +874,20 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}" CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
GPU_STRESS_NEED_BUILD=1 GPU_STRESS_NEED_BUILD=1
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then if [ -f "$GPU_BURN_WORKER_BIN" ]; then
GPU_STRESS_NEED_BUILD=0 GPU_STRESS_NEED_BUILD=0
for dep in \
"${BUILDER_DIR}/bee-gpu-stress.c" \
"${BUILDER_DIR}/VERSIONS"; do
if [ "$dep" -nt "$GPU_BURN_WORKER_BIN" ]; then
GPU_STRESS_NEED_BUILD=1
break
fi
done
if [ "$GPU_STRESS_NEED_BUILD" = "0" ] && \
find "${CUBLAS_CACHE}/include" "${CUBLAS_CACHE}/lib" -type f -newer "$GPU_BURN_WORKER_BIN" | grep -q .; then
GPU_STRESS_NEED_BUILD=1
fi
fi fi
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then