From 457ea1cf0452ede774c5bf292a29907e731e6b3a Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Mon, 13 Apr 2026 21:38:28 +0300 Subject: [PATCH] Unify benchmark exports and drop ASCII charts --- audit/internal/platform/benchmark.go | 467 +++++++++++--------- audit/internal/platform/benchmark_report.go | 55 +-- audit/internal/platform/benchmark_test.go | 19 +- audit/internal/platform/benchmark_types.go | 57 ++- audit/internal/platform/gpu_metrics.go | 334 +++++--------- audit/internal/platform/gpu_metrics_test.go | 65 +++ audit/internal/platform/sat.go | 34 +- 7 files changed, 488 insertions(+), 543 deletions(-) create mode 100644 audit/internal/platform/gpu_metrics_test.go diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 014a4f9..e94fefa 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -125,6 +125,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv } logFunc(fmt.Sprintf("NVIDIA benchmark profile=%s gpus=%s", spec.Name, joinIndexList(selected))) + var metricRows []GPUMetricRow + gpuBurnLog := filepath.Join(runDir, "gpu-burn.log") // Server power characterization state — populated during per-GPU phases. 
var serverIdleW, serverLoadedWSum float64 @@ -171,199 +173,202 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv cpuSamplesCh := startCPULoadSampler(cpuStopCh, 10) if opts.ParallelGPUs { - runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples) + runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, calibPowerByIndex, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples, &metricRows, gpuBurnLog) } else { - for _, idx := range selected { - gpuResult := BenchmarkGPUResult{ - Index: idx, - Status: "FAILED", - } - if info, ok := infoByIndex[idx]; ok { - gpuResult.UUID = info.UUID - gpuResult.Name = info.Name - gpuResult.BusID = info.BusID - gpuResult.VBIOS = info.VBIOS - gpuResult.PowerLimitW = info.PowerLimitW - gpuResult.MultiprocessorCount = info.MultiprocessorCount - gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW - gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz - gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz - gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz - } - if w, ok := calibPowerByIndex[idx]; ok && w > 0 { - gpuResult.CalibratedPeakPowerW = w - } - if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { - gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz - gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz - } - - baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx}) - if err != nil && err != context.Canceled { - gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error()) - } - gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows) - writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), baselineRows) - - // Sample server idle power once 
(first GPU only — server state is global). - if !serverIdleOK { - if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok { - serverIdleW = w - serverIdleOK = true - logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) + for _, idx := range selected { + gpuResult := BenchmarkGPUResult{ + Index: idx, + Status: "FAILED", + } + if info, ok := infoByIndex[idx]; ok { + gpuResult.UUID = info.UUID + gpuResult.Name = info.Name + gpuResult.BusID = info.BusID + gpuResult.VBIOS = info.VBIOS + gpuResult.PowerLimitW = info.PowerLimitW + gpuResult.MultiprocessorCount = info.MultiprocessorCount + gpuResult.DefaultPowerLimitW = info.DefaultPowerLimitW + gpuResult.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz + gpuResult.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz + gpuResult.MaxMemoryClockMHz = info.MaxMemoryClockMHz + } + if w, ok := calibPowerByIndex[idx]; ok && w > 0 { + gpuResult.CalibratedPeakPowerW = w + } + if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil { + gpuResult.LockedGraphicsClockMHz = norm.GPUClockLockMHz + gpuResult.LockedMemoryClockMHz = norm.MemoryClockLockMHz } - } - warmupCmd := []string{ - "bee-gpu-burn", - "--seconds", strconv.Itoa(spec.WarmupSec), - "--size-mb", strconv.Itoa(opts.SizeMB), - "--devices", strconv.Itoa(idx), - } - logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec)) - warmupOut, _, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-warmup", idx), logFunc) - _ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-warmup.log", idx)), warmupOut, 0644) - if warmupErr != nil { - gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error()) - result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) - continue - } + baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, []int{idx}) + if err != nil && err != context.Canceled 
{ + gpuResult.Notes = append(gpuResult.Notes, "baseline sampling failed: "+err.Error()) + } + gpuResult.Baseline = summarizeBenchmarkTelemetry(baselineRows) + appendBenchmarkMetrics(&metricRows, baselineRows, fmt.Sprintf("gpu-%d-baseline", idx)) - // ── Per-precision stability phases ──────────────────────────────────────── - // Run each precision category alone so PowerCVPct reflects genuine GPU - // power stability, not kernel-mix variance. - // Time budget: each phase gets steadySec/numPhases, minimum 60 s. - // SteadySec is split equally across all precision phases + 1 combined slot. - // Skipped phases (unsupported precision) are simply omitted; combined is fixed. - totalSlots := len(benchmarkPrecisionPhases) + 1 - perPhaseSec := spec.SteadySec / totalSlots - if perPhaseSec < 60 { - perPhaseSec = 60 - } - eccBase, _ := queryECCCounters(idx) - for _, prec := range benchmarkPrecisionPhases { - phaseCmd := []string{ + // Sample server idle power once (first GPU only — server state is global). 
+ if !serverIdleOK { + if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok { + serverIdleW = w + serverIdleOK = true + logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w)) + } + } + + warmupCmd := []string{ + "bee-gpu-burn", + "--seconds", strconv.Itoa(spec.WarmupSec), + "--size-mb", strconv.Itoa(opts.SizeMB), + "--devices", strconv.Itoa(idx), + } + logFunc(fmt.Sprintf("GPU %d: warmup (%ds)", idx, spec.WarmupSec)) + warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-warmup.log", idx), warmupCmd, nil, []int{idx}, logFunc) + appendBenchmarkMetrics(&metricRows, warmupRows, fmt.Sprintf("gpu-%d-warmup", idx)) + appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-warmup", idx), warmupOut) + if warmupErr != nil { + gpuResult.Notes = append(gpuResult.Notes, "warmup failed: "+warmupErr.Error()) + result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) + continue + } + + // ── Per-precision stability phases ──────────────────────────────────────── + // Run each precision category alone so PowerCVPct reflects genuine GPU + // power stability, not kernel-mix variance. + // Time budget: each phase gets steadySec/numPhases, minimum 60 s. + // SteadySec is split equally across all precision phases + 1 combined slot. + // Skipped phases (unsupported precision) are simply omitted; combined is fixed. 
+ totalSlots := len(benchmarkPrecisionPhases) + 1 + perPhaseSec := spec.SteadySec / totalSlots + if perPhaseSec < 60 { + perPhaseSec = 60 + } + eccBase, _ := queryECCCounters(idx) + for _, prec := range benchmarkPrecisionPhases { + phaseCmd := []string{ + "bee-gpu-burn", + "--seconds", strconv.Itoa(perPhaseSec), + "--size-mb", strconv.Itoa(opts.SizeMB), + "--devices", strconv.Itoa(idx), + "--precision", prec, + } + logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec)) + phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec) + eccBefore, _ := queryECCCounters(idx) + phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, logFunc) + appendBenchmarkMetrics(&metricRows, phaseRows, phaseLogName) + appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut) + eccAfter, _ := queryECCCounters(idx) + if phaseErr != nil || len(phaseRows) == 0 { + continue + } + phase := BenchmarkPrecisionSteadyPhase{ + Precision: prec, + Steady: summarizeBenchmarkTelemetry(phaseRows), + ECC: diffECCCounters(eccBefore, eccAfter), + } + for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles { + if p.Supported { + phase.TeraOpsPerSec += p.TeraOpsPerSec + phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec + } + } + gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase) + } + + beforeThrottle, _ := queryThrottleCounters(idx) + steadyCmd := []string{ "bee-gpu-burn", "--seconds", strconv.Itoa(perPhaseSec), "--size-mb", strconv.Itoa(opts.SizeMB), "--devices", strconv.Itoa(idx), - "--precision", prec, } - logFunc(fmt.Sprintf("GPU %d: %s stability phase (%ds)", idx, prec, perPhaseSec)) - phaseLogName := fmt.Sprintf("gpu-%d-steady-%s", idx, prec) - eccBefore, _ := queryECCCounters(idx) - phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, []int{idx}, runDir, phaseLogName, logFunc) - 
eccAfter, _ := queryECCCounters(idx) - if phaseErr != nil || len(phaseRows) == 0 { - continue - } - phase := BenchmarkPrecisionSteadyPhase{ - Precision: prec, - Steady: summarizeBenchmarkTelemetry(phaseRows), - ECC: diffECCCounters(eccBefore, eccAfter), - } - for _, p := range parseBenchmarkBurnLog(string(phaseOut)).Profiles { - if p.Supported { - phase.TeraOpsPerSec += p.TeraOpsPerSec - phase.WeightedTeraOpsPerSec += p.WeightedTeraOpsPerSec - } - } - gpuResult.PrecisionSteady = append(gpuResult.PrecisionSteady, phase) - } + logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec)) - beforeThrottle, _ := queryThrottleCounters(idx) - steadyCmd := []string{ - "bee-gpu-burn", - "--seconds", strconv.Itoa(perPhaseSec), - "--size-mb", strconv.Itoa(opts.SizeMB), - "--devices", strconv.Itoa(idx), - } - logFunc(fmt.Sprintf("GPU %d: steady compute (combined, %ds)", idx, perPhaseSec)) - - // Sample server power via IPMI in parallel with the steady phase. - // We collect readings every 5s and average them. - ipmiStopCh := make(chan struct{}) - ipmiResultCh := make(chan float64, 1) - go func() { - defer close(ipmiResultCh) - var samples []float64 - ticker := time.NewTicker(5 * time.Second) - defer ticker.Stop() - // First sample after a short warmup delay. - select { - case <-ipmiStopCh: - return - case <-time.After(15 * time.Second): - } - for { - if w, err := queryIPMIServerPowerW(); err == nil { - samples = append(samples, w) - } + // Sample server power via IPMI in parallel with the steady phase. + // We collect readings every 5s and average them. + ipmiStopCh := make(chan struct{}) + ipmiResultCh := make(chan float64, 1) + go func() { + defer close(ipmiResultCh) + var samples []float64 + ticker := time.NewTicker(5 * time.Second) + defer ticker.Stop() + // First sample after a short warmup delay. 
select { case <-ipmiStopCh: - if len(samples) > 0 { - var sum float64 - for _, w := range samples { - sum += w - } - ipmiResultCh <- sum / float64(len(samples)) - } return - case <-ticker.C: + case <-time.After(15 * time.Second): } + for { + if w, err := queryIPMIServerPowerW(); err == nil { + samples = append(samples, w) + } + select { + case <-ipmiStopCh: + if len(samples) > 0 { + var sum float64 + for _, w := range samples { + sum += w + } + ipmiResultCh <- sum / float64(len(samples)) + } + return + case <-ticker.C: + } + } + }() + + steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, logFunc) + appendBenchmarkMetrics(&metricRows, steadyRows, fmt.Sprintf("gpu-%d-steady", idx)) + appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", fmt.Sprintf("gpu-%d-steady", idx), steadyOut) + close(ipmiStopCh) + if loadedW, ok := <-ipmiResultCh; ok { + serverLoadedWSum += loadedW + serverLoadedSamples++ + serverLoadedOK = true + logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW)) + } + afterThrottle, _ := queryThrottleCounters(idx) + if steadyErr != nil { + gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error()) } - }() - steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, fmt.Sprintf("gpu-%d-steady.log", idx), steadyCmd, nil, []int{idx}, runDir, fmt.Sprintf("gpu-%d-steady", idx), logFunc) - close(ipmiStopCh) - if loadedW, ok := <-ipmiResultCh; ok { - serverLoadedWSum += loadedW - serverLoadedSamples++ - serverLoadedOK = true - logFunc(fmt.Sprintf("GPU %d: server loaded power (IPMI): %.0f W", idx, loadedW)) + parseResult := parseBenchmarkBurnLog(string(steadyOut)) + gpuResult.ComputeCapability = parseResult.ComputeCapability + gpuResult.Backend = parseResult.Backend + gpuResult.PrecisionResults = parseResult.Profiles + if parseResult.Fallback { + gpuResult.Notes = append(gpuResult.Notes, 
"benchmark used driver PTX fallback; tensor throughput score is not comparable") + } + + gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows) + gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle) + if eccFinal, err := queryECCCounters(idx); err == nil { + gpuResult.ECC = diffECCCounters(eccBase, eccFinal) + } + + cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx}) + if err != nil && err != context.Canceled { + gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error()) + } + gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows) + appendBenchmarkMetrics(&metricRows, cooldownRows, fmt.Sprintf("gpu-%d-cooldown", idx)) + + gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) + gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) + if steadyErr != nil { + gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr) + } else if parseResult.Fallback { + gpuResult.Status = "PARTIAL" + } else { + gpuResult.Status = "OK" + } + + result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) } - _ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady.log", idx)), steadyOut, 0644) - afterThrottle, _ := queryThrottleCounters(idx) - if steadyErr != nil { - gpuResult.Notes = append(gpuResult.Notes, "steady compute failed: "+steadyErr.Error()) - } - - parseResult := parseBenchmarkBurnLog(string(steadyOut)) - gpuResult.ComputeCapability = parseResult.ComputeCapability - gpuResult.Backend = parseResult.Backend - gpuResult.PrecisionResults = parseResult.Profiles - if parseResult.Fallback { - gpuResult.Notes = append(gpuResult.Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable") - } - - gpuResult.Steady = summarizeBenchmarkTelemetry(steadyRows) - gpuResult.Throttle = diffThrottleCounters(beforeThrottle, afterThrottle) - if eccFinal, err := queryECCCounters(idx); err == nil { - 
gpuResult.ECC = diffECCCounters(eccBase, eccFinal) - } - - cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, []int{idx}) - if err != nil && err != context.Canceled { - gpuResult.Notes = append(gpuResult.Notes, "cooldown sampling failed: "+err.Error()) - } - gpuResult.Cooldown = summarizeBenchmarkTelemetry(cooldownRows) - writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), cooldownRows) - - gpuResult.Scores = scoreBenchmarkGPUResult(gpuResult) - gpuResult.DegradationReasons = detectBenchmarkDegradationReasons(gpuResult, result.Normalization.Status) - if steadyErr != nil { - gpuResult.Status = classifySATErrorStatus(steadyOut, steadyErr) - } else if parseResult.Fallback { - gpuResult.Status = "PARTIAL" - } else { - gpuResult.Status = "OK" - } - - result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult)) - } - } // end sequential path if len(selected) > 1 && opts.RunNCCL { @@ -413,6 +418,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv result.Findings = buildBenchmarkFindings(result) result.OverallStatus = benchmarkOverallStatus(result) + writeBenchmarkMetricsFiles(runDir, metricRows) resultJSON, err := json.MarshalIndent(result, "", " ") if err != nil { @@ -422,7 +428,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv return "", fmt.Errorf("write result.json: %w", err) } - report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected)) + report := renderBenchmarkReportWithCharts(result) if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil { return "", fmt.Errorf("write report.md: %w", err) } @@ -511,11 +517,11 @@ func enrichGPUInfoWithMaxClocks(infoByIndex map[int]benchmarkGPUInfo, nvsmiQ []b // Split the verbose output into per-GPU sections on "^GPU " lines. 
gpuSectionRe := regexp.MustCompile(`(?m)^GPU\s+([\dA-Fa-f:\.]+)`) - maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`) - maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`) - defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`) - currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`) - smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`) + maxGfxRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Graphics\s*:\s*(\d+)\s*MHz`) + maxMemRe := regexp.MustCompile(`(?i)Max Clocks[\s\S]*?Memory\s*:\s*(\d+)\s*MHz`) + defaultPwrRe := regexp.MustCompile(`(?i)Default Power Limit\s*:\s*([0-9.]+)\s*W`) + currentPwrRe := regexp.MustCompile(`(?i)Current Power Limit\s*:\s*([0-9.]+)\s*W`) + smCountRe := regexp.MustCompile(`(?i)Multiprocessor Count\s*:\s*(\d+)`) sectionStarts := gpuSectionRe.FindAllSubmatchIndex(nvsmiQ, -1) for i, loc := range sectionStarts { @@ -651,7 +657,6 @@ func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) { return nil, lastErr } - func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction { if os.Geteuid() != 0 { result.Normalization.Status = "partial" @@ -754,7 +759,7 @@ func collectBenchmarkSamples(ctx context.Context, durationSec int, gpuIndices [] return rows, nil } -func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, runDir, baseName string, logFunc func(string)) ([]byte, []GPUMetricRow, error) { +func runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd []string, env []string, gpuIndices []int, logFunc func(string)) ([]byte, []GPUMetricRow, error) { stopCh := make(chan struct{}) doneCh := make(chan struct{}) var metricRows []GPUMetricRow @@ -786,18 +791,65 @@ func 
runBenchmarkCommandWithMetrics(ctx context.Context, verboseLog, name string close(stopCh) <-doneCh - writeBenchmarkMetricsFiles(runDir, baseName, metricRows) return out, metricRows, err } -func writeBenchmarkMetricsFiles(runDir, baseName string, rows []GPUMetricRow) { +func annotateBenchmarkMetricRows(rows []GPUMetricRow, stage string, offset float64) []GPUMetricRow { + if len(rows) == 0 { + return nil + } + out := make([]GPUMetricRow, len(rows)) + for i, row := range rows { + row.Stage = stage + row.ElapsedSec += offset + out[i] = row + } + return out +} + +func benchmarkMetricOffset(rows []GPUMetricRow) float64 { + if len(rows) == 0 { + return 0 + } + var maxElapsed float64 + for _, row := range rows { + if row.ElapsedSec > maxElapsed { + maxElapsed = row.ElapsedSec + } + } + return maxElapsed +} + +func appendBenchmarkMetrics(allRows *[]GPUMetricRow, rows []GPUMetricRow, stage string) { + annotated := annotateBenchmarkMetricRows(rows, stage, benchmarkMetricOffset(*allRows)) + *allRows = append(*allRows, annotated...) 
+} + +func writeBenchmarkMetricsFiles(runDir string, rows []GPUMetricRow) { if len(rows) == 0 { return } - _ = WriteGPUMetricsCSV(filepath.Join(runDir, baseName+"-metrics.csv"), rows) - _ = WriteGPUMetricsHTML(filepath.Join(runDir, baseName+"-metrics.html"), rows) - chart := RenderGPUTerminalChart(rows) - _ = os.WriteFile(filepath.Join(runDir, baseName+"-metrics-term.txt"), []byte(chart), 0644) + _ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), rows) + _ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), rows) +} + +func appendBenchmarkStageLog(path, source, stage string, raw []byte) { + if path == "" || len(raw) == 0 { + return + } + f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644) + if err != nil { + return + } + defer f.Close() + header := fmt.Sprintf("\n========== %s | stage=%s ==========\n", source, stage) + _, _ = f.WriteString(header) + if len(raw) > 0 { + _, _ = f.Write(raw) + if raw[len(raw)-1] != '\n' { + _, _ = f.WriteString("\n") + } + } } func parseBenchmarkBurnLog(raw string) benchmarkBurnParseResult { @@ -897,11 +949,13 @@ func ensureBenchmarkProfile(profiles map[string]*benchmarkBurnProfile, name stri // precisionWeight returns the fp32-equivalence factor for a precision category. // Each factor represents how much "real" numeric work one operation of that // type performs relative to fp32 (single precision = 1.0 baseline): -// fp64 = 2.0 — double precision, 2× more bits per operand -// fp32 = 1.0 — single precision baseline -// fp16 = 0.5 — half precision -// fp8 = 0.25 — quarter precision -// fp4 = 0.125 — eighth precision +// +// fp64 = 2.0 — double precision, 2× more bits per operand +// fp32 = 1.0 — single precision baseline +// fp16 = 0.5 — half precision +// fp8 = 0.25 — quarter precision +// fp4 = 0.125 — eighth precision +// // Multiplying raw TOPS by the weight gives fp32-equivalent TOPS, enabling // cross-precision comparison on the same numeric scale. 
func precisionWeight(category string) float64 { @@ -1670,6 +1724,8 @@ func runNvidiaBenchmarkParallel( calibPowerByIndex map[int]float64, serverIdleW *float64, serverLoadedWSum *float64, serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int, + allMetricRows *[]GPUMetricRow, + gpuBurnLog string, ) { allDevices := joinIndexList(selected) @@ -1709,8 +1765,8 @@ func runNvidiaBenchmarkParallel( for _, idx := range selected { perGPU := filterRowsByGPU(baselineRows, idx) gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU) - writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU) } + appendBenchmarkMetrics(allMetricRows, baselineRows, "baseline") // Sample server idle power once. if !*serverIdleOK { @@ -1729,11 +1785,9 @@ func runNvidiaBenchmarkParallel( "--devices", allDevices, } logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec)) - warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc) - _ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644) - for _, idx := range selected { - writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx)) - } + warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, logFunc) + appendBenchmarkMetrics(allMetricRows, warmupRows, "warmup") + appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "warmup", warmupOut) if warmupErr != nil { for _, idx := range selected { gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error()) @@ -1764,7 +1818,9 @@ func runNvidiaBenchmarkParallel( for _, idx := range selected { eccBeforePhase[idx], _ = queryECCCounters(idx) } - phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, 
selected, runDir, phaseLogName, logFunc) + phaseOut, phaseRows, phaseErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, phaseLogName+".log", phaseCmd, nil, selected, logFunc) + appendBenchmarkMetrics(allMetricRows, phaseRows, phaseLogName) + appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", phaseLogName, phaseOut) eccAfterPhase := make(map[int]BenchmarkECCCounters, len(selected)) for _, idx := range selected { eccAfterPhase[idx], _ = queryECCCounters(idx) @@ -1842,7 +1898,9 @@ func runNvidiaBenchmarkParallel( } }() - steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc) + steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, logFunc) + appendBenchmarkMetrics(allMetricRows, steadyRows, "steady") + appendBenchmarkStageLog(gpuBurnLog, "bee-gpu-burn", "steady", steadyOut) close(ipmiStopCh) if loadedW, ok := <-ipmiResultCh; ok { *serverLoadedWSum += loadedW @@ -1850,8 +1908,6 @@ func runNvidiaBenchmarkParallel( *serverLoadedOK = true logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW)) } - _ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644) - afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected)) for _, idx := range selected { afterThrottle[idx], _ = queryThrottleCounters(idx) @@ -1861,7 +1917,6 @@ func runNvidiaBenchmarkParallel( for _, idx := range selected { perGPU := filterRowsByGPU(steadyRows, idx) - writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU) gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU) gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx]) if eccFinal, err := queryECCCounters(idx); err == nil { @@ -1891,8 +1946,8 @@ func runNvidiaBenchmarkParallel( for _, idx := range selected { perGPU := 
filterRowsByGPU(cooldownRows, idx) gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU) - writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU) } + appendBenchmarkMetrics(allMetricRows, cooldownRows, "cooldown") // Score and finalize each GPU. for _, idx := range selected { @@ -2102,7 +2157,7 @@ func runBenchmarkPowerCalibration( logFunc(fmt.Sprintf("power calibration: running dcgmi targeted_power for %ds on GPUs %s", calibDurationSec, joinIndexList(gpuIndices))) cmd := nvidiaDCGMNamedDiagCommand("targeted_power", calibDurationSec, gpuIndices) - out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, runDir, "power-calibration", logFunc) + out, rows, err := runBenchmarkCommandWithMetrics(ctx, verboseLog, "power-calibration.log", cmd, nil, gpuIndices, logFunc) _ = os.WriteFile(filepath.Join(runDir, "power-calibration.log"), out, 0644) if err != nil { logFunc(fmt.Sprintf("power calibration: dcgmi targeted_power failed (%v), skipping", err)) diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index efcbac9..614846b 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -2,25 +2,15 @@ package platform import ( "fmt" - "os" - "path/filepath" - "regexp" "strings" "time" ) func renderBenchmarkReport(result NvidiaBenchmarkResult) string { - return renderBenchmarkReportWithCharts(result, nil) + return renderBenchmarkReportWithCharts(result) } -type benchmarkReportChart struct { - Title string - Content string -} - -var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`) - -func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string { +func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { var b strings.Builder // ── Header ──────────────────────────────────────────────────────────────── @@ -213,7 +203,6 @@ 
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc gpu.Steady.ClockCVPct, gpu.Steady.PowerCVPct, gpu.Steady.ClockDriftPct) } - // ECC summary if !gpu.ECC.IsZero() { fmt.Fprintf(&b, "**ECC errors (total):** corrected=%d uncorrected=%d\n\n", @@ -297,18 +286,6 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc } } - // ── Terminal charts (steady-state only) ─────────────────────────────────── - if len(charts) > 0 { - b.WriteString("## Steady-State Charts\n\n") - for _, chart := range charts { - content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content)) - if content == "" { - continue - } - fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content) - } - } - // ── Methodology ─────────────────────────────────────────────────────────── b.WriteString("## Methodology\n\n") fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile) @@ -319,39 +296,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc // ── Raw files ───────────────────────────────────────────────────────────── b.WriteString("## Raw Files\n\n") b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n") - b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n") - b.WriteString("- `gpu-*-warmup.log`\n") - b.WriteString("- `gpu-*-steady.log`\n") - b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n") - b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n") + b.WriteString("- `gpu-metrics.csv`\n- `gpu-metrics.html`\n- `gpu-burn.log`\n") if result.Interconnect != nil { b.WriteString("- `nccl-all-reduce.log`\n") } return b.String() } -// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and -// cooldown charts are not useful for human review). 
-func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart { - var charts []benchmarkReportChart - for _, idx := range gpuIndices { - path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx)) - raw, err := os.ReadFile(path) - if err != nil || len(raw) == 0 { - continue - } - charts = append(charts, benchmarkReportChart{ - Title: fmt.Sprintf("GPU %d — Steady State", idx), - Content: string(raw), - }) - } - return charts -} - -func stripANSIEscapeSequences(raw string) string { - return ansiEscapePattern.ReplaceAllString(raw, "") -} - // formatThrottleLine renders throttle counters as human-readable percentages of // the steady-state window. Only non-zero counters are shown. When the steady // duration is unknown (0), raw seconds are shown instead. diff --git a/audit/internal/platform/benchmark_test.go b/audit/internal/platform/benchmark_test.go index b8cb5f4..c57058f 100644 --- a/audit/internal/platform/benchmark_test.go +++ b/audit/internal/platform/benchmark_test.go @@ -147,36 +147,27 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) { } } -func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) { +func TestRenderBenchmarkReportListsUnifiedArtifacts(t *testing.T) { t.Parallel() - report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{ + report := renderBenchmarkReport(NvidiaBenchmarkResult{ BenchmarkProfile: NvidiaBenchmarkProfileStandard, OverallStatus: "OK", SelectedGPUIndices: []int{0}, Normalization: BenchmarkNormalization{ Status: "full", }, - }, []benchmarkReportChart{ - { - Title: "GPU 0 Steady State", - Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───", - }, }) for _, needle := range []string{ - "Steady-State Charts", - "GPU 0 Steady State", - "GPU 0 chart", - "42┤───", + "gpu-metrics.csv", + "gpu-metrics.html", + "gpu-burn.log", } { if !strings.Contains(report, needle) { t.Fatalf("report missing %q\n%s", needle, report) } } - if strings.Contains(report, 
"\x1b[31m") { - t.Fatalf("report should not contain ANSI escapes\n%s", report) - } } func TestEnrichGPUInfoWithMaxClocks(t *testing.T) { diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index 1500ea2..b716fcc 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -43,7 +43,6 @@ type NvidiaBenchmarkOptions struct { RampRunID string // shared identifier across all steps of the same ramp-up run } - type NvidiaBenchmarkResult struct { BenchmarkVersion string `json:"benchmark_version"` GeneratedAt time.Time `json:"generated_at"` @@ -84,38 +83,38 @@ type BenchmarkNormalizationGPU struct { } type BenchmarkGPUResult struct { - Index int `json:"index"` - UUID string `json:"uuid,omitempty"` - Name string `json:"name,omitempty"` - BusID string `json:"bus_id,omitempty"` - VBIOS string `json:"vbios,omitempty"` - ComputeCapability string `json:"compute_capability,omitempty"` - Backend string `json:"backend,omitempty"` - Status string `json:"status"` - PowerLimitW float64 `json:"power_limit_w,omitempty"` - MultiprocessorCount int `json:"multiprocessor_count,omitempty"` - DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` + Index int `json:"index"` + UUID string `json:"uuid,omitempty"` + Name string `json:"name,omitempty"` + BusID string `json:"bus_id,omitempty"` + VBIOS string `json:"vbios,omitempty"` + ComputeCapability string `json:"compute_capability,omitempty"` + Backend string `json:"backend,omitempty"` + Status string `json:"status"` + PowerLimitW float64 `json:"power_limit_w,omitempty"` + MultiprocessorCount int `json:"multiprocessor_count,omitempty"` + DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` // CalibratedPeakPowerW is the p95 power measured during a short // dcgmi targeted_power calibration run before the main benchmark. 
// Used as the reference denominator for PowerSustainScore instead of // the hardware default limit, which bee-gpu-burn cannot reach. - CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"` - MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"` - BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"` - MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"` - LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"` - LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"` + CalibratedPeakPowerW float64 `json:"calibrated_peak_power_w,omitempty"` + MaxGraphicsClockMHz float64 `json:"max_graphics_clock_mhz,omitempty"` + BaseGraphicsClockMHz float64 `json:"base_graphics_clock_mhz,omitempty"` + MaxMemoryClockMHz float64 `json:"max_memory_clock_mhz,omitempty"` + LockedGraphicsClockMHz float64 `json:"locked_graphics_clock_mhz,omitempty"` + LockedMemoryClockMHz float64 `json:"locked_memory_clock_mhz,omitempty"` Baseline BenchmarkTelemetrySummary `json:"baseline"` Steady BenchmarkTelemetrySummary `json:"steady"` PrecisionSteady []BenchmarkPrecisionSteadyPhase `json:"precision_steady,omitempty"` Cooldown BenchmarkTelemetrySummary `json:"cooldown"` Throttle BenchmarkThrottleCounters `json:"throttle_counters"` // ECC error delta accumulated over the full benchmark (all phases combined). 
- ECC BenchmarkECCCounters `json:"ecc,omitempty"` - PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"` - Scores BenchmarkScorecard `json:"scores"` - DegradationReasons []string `json:"degradation_reasons,omitempty"` - Notes []string `json:"notes,omitempty"` + ECC BenchmarkECCCounters `json:"ecc,omitempty"` + PrecisionResults []BenchmarkPrecisionResult `json:"precision_results,omitempty"` + Scores BenchmarkScorecard `json:"scores"` + DegradationReasons []string `json:"degradation_reasons,omitempty"` + Notes []string `json:"notes,omitempty"` } type BenchmarkTelemetrySummary struct { @@ -170,19 +169,19 @@ type BenchmarkPrecisionResult struct { // Weight is the fp32-equivalence factor for this precision category. // fp32 = 1.0 (baseline), fp64 = 2.0, fp16 = 0.5, fp8 = 0.25, fp4 = 0.125. // WeightedTOPS = TeraOpsPerSec * Weight gives fp32-equivalent throughput. - Weight float64 `json:"weight,omitempty"` + Weight float64 `json:"weight,omitempty"` WeightedTeraOpsPerSec float64 `json:"weighted_teraops_per_sec,omitempty"` - Notes string `json:"notes,omitempty"` + Notes string `json:"notes,omitempty"` } type BenchmarkScorecard struct { - ComputeScore float64 `json:"compute_score"` + ComputeScore float64 `json:"compute_score"` // SyntheticScore is the sum of fp32-equivalent TOPS from per-precision // steady phases (each precision ran alone, full GPU dedicated). - SyntheticScore float64 `json:"synthetic_score,omitempty"` + SyntheticScore float64 `json:"synthetic_score,omitempty"` // MixedScore is the sum of fp32-equivalent TOPS from the combined phase // (all precisions competing simultaneously — closer to real workloads). - MixedScore float64 `json:"mixed_score,omitempty"` + MixedScore float64 `json:"mixed_score,omitempty"` // MixedEfficiency = MixedScore / SyntheticScore. Measures how well the GPU // sustains throughput under concurrent mixed-precision load. 
MixedEfficiency float64 `json:"mixed_efficiency,omitempty"` @@ -220,7 +219,7 @@ type BenchmarkPrecisionSteadyPhase struct { // ECC errors accumulated during this precision phase only. // Non-zero corrected = stress-induced DRAM errors for this kernel type. // Any uncorrected = serious fault triggered by this precision workload. - ECC BenchmarkECCCounters `json:"ecc,omitempty"` + ECC BenchmarkECCCounters `json:"ecc,omitempty"` } type BenchmarkInterconnectResult struct { diff --git a/audit/internal/platform/gpu_metrics.go b/audit/internal/platform/gpu_metrics.go index b22f438..5bc0f4b 100644 --- a/audit/internal/platform/gpu_metrics.go +++ b/audit/internal/platform/gpu_metrics.go @@ -13,6 +13,7 @@ import ( // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test. type GPUMetricRow struct { + Stage string `json:"stage,omitempty"` ElapsedSec float64 `json:"elapsed_sec"` GPUIndex int `json:"index"` TempC float64 `json:"temp_c"` @@ -141,14 +142,20 @@ func sampleAMDGPUMetrics() ([]GPUMetricRow, error) { // WriteGPUMetricsCSV writes collected rows as a CSV file. 
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error { var b bytes.Buffer - b.WriteString("elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n") + b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz\n") for _, r := range rows { - fmt.Fprintf(&b, "%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n", - r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz) + fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f\n", + strconv.Quote(strings.TrimSpace(r.Stage)), r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW, r.ClockMHz, r.MemClockMHz) } return os.WriteFile(path, b.Bytes(), 0644) } +type gpuMetricStageSpan struct { + Name string + Start float64 + End float64 +} + // WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU. func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error { // Group by GPU index preserving order. @@ -163,9 +170,25 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error { gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r) } + stageSpans := buildGPUMetricStageSpans(rows) + stageColorByName := make(map[string]string, len(stageSpans)) + for i, span := range stageSpans { + stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)] + } + + var legend strings.Builder + if len(stageSpans) > 0 { + legend.WriteString(`
<div class="stage-legend">`)
+		for _, span := range stageSpans {
+			fmt.Fprintf(&legend, `<span class="stage-item"><span class="stage-swatch" style="background:%s"></span>%s</span>`,
+				stageColorByName[span.Name], gpuHTMLEscape(span.Name))
+		}
+		legend.WriteString(`</div>`)
+	}
+
 	var svgs strings.Builder
 	for _, gpuIdx := range order {
-		svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx))
+		svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName))
 		svgs.WriteString("\n")
 	}
 
@@ -175,21 +198,39 @@ func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error {
 <title>GPU Stress Test Metrics</title>
+<style>
+.stage-legend { margin: 12px 0; font: 14px sans-serif; }
+.stage-legend .stage-item { margin-right: 16px; }
+.stage-legend .stage-swatch { display: inline-block; width: 12px; height: 12px; margin-right: 4px; }
+</style>
 </head>
 <body>
 <h1>GPU Stress Test Metrics</h1>
 <p>Generated %s</p>
+%s
 %s
-`, ts, svgs.String())
+</body>
+</html>
+`, ts, legend.String(), svgs.String())
 	return os.WriteFile(path, []byte(html), 0644)
 }
 
 // drawGPUChartSVG generates a self-contained SVG chart for one GPU.
-func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
+func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string {
 	// Layout
 	const W, H = 960, 520
 	const plotX1 = 120 // usage axis / chart left border
@@ -284,6 +325,23 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	}
 	b.WriteString("\n")
 
+	// Stage backgrounds
+	for _, span := range stageSpans {
+		x1 := xv(span.Start)
+		x2 := xv(span.End)
+		if x2 < x1 {
+			x1, x2 = x2, x1
+		}
+		if x2-x1 < 1 {
+			x2 = x1 + 1
+		}
+		color := stageColorByName[span.Name]
+		fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d" fill="%s" fill-opacity="0.12"/>`+"\n",
+			x1, plotY1, x2-x1, PH, color)
+		fmt.Fprintf(&b, `<text x="%d" y="%d" text-anchor="middle" font-size="11" fill="#555">%s</text>`+"\n",
+			x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name))
+	}
+
 	// Chart border
 	fmt.Fprintf(&b, `<rect x="%d" y="%d" width="%d" height="%d" fill="none" stroke="#333"/>`+"\n",
@@ -382,221 +440,6 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
 	return b.String()
 }
 
-const (
-	ansiAmber = "\033[38;5;214m"
-	ansiReset = "\033[0m"
-)
-
-const (
-	termChartWidth  = 70
-	termChartHeight = 12
-)
-
-// RenderGPUTerminalChart returns ANSI line charts (asciigraph-style) per GPU.
-// Used in SAT stress-test logs.
-func RenderGPUTerminalChart(rows []GPUMetricRow) string { - seen := make(map[int]bool) - var order []int - gpuMap := make(map[int][]GPUMetricRow) - for _, r := range rows { - if !seen[r.GPUIndex] { - seen[r.GPUIndex] = true - order = append(order, r.GPUIndex) - } - gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r) - } - - type seriesDef struct { - caption string - color string - fn func(GPUMetricRow) float64 - } - defs := []seriesDef{ - {"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }}, - {"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }}, - {"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }}, - {"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }}, - } - - var b strings.Builder - for _, gpuIdx := range order { - gr := gpuMap[gpuIdx] - if len(gr) == 0 { - continue - } - tMax := gr[len(gr)-1].ElapsedSec - gr[0].ElapsedSec - fmt.Fprintf(&b, "GPU %d — Stress Test Metrics (%.0f seconds)\n\n", gpuIdx, tMax) - for _, d := range defs { - b.WriteString(renderLineChart(extractGPUField(gr, d.fn), d.color, d.caption, - termChartHeight, termChartWidth)) - b.WriteRune('\n') - } - } - - return strings.TrimRight(b.String(), "\n") -} - -// renderLineChart draws a single time-series line chart using box-drawing characters. -// Produces output in the style of asciigraph: ╭─╮ │ ╰─╯ with a Y axis and caption. -func renderLineChart(vals []float64, color, caption string, height, width int) string { - if len(vals) == 0 { - return caption + "\n" - } - - mn, mx := gpuMinMax(vals) - if mn == mx { - mx = mn + 1 - } - - // Use the smaller of width or len(vals) to avoid stretching sparse data. - w := width - if len(vals) < w { - w = len(vals) - } - data := gpuDownsample(vals, w) - - // row[i] = display row index: 0 = top = max value, height = bottom = min value. 
- row := make([]int, w) - for i, v := range data { - r := int(math.Round((mx - v) / (mx - mn) * float64(height))) - if r < 0 { - r = 0 - } - if r > height { - r = height - } - row[i] = r - } - - // Fill the character grid. - grid := make([][]rune, height+1) - for i := range grid { - grid[i] = make([]rune, w) - for j := range grid[i] { - grid[i][j] = ' ' - } - } - for x := 0; x < w; x++ { - r := row[x] - if x == 0 { - grid[r][0] = '─' - continue - } - p := row[x-1] - switch { - case r == p: - grid[r][x] = '─' - case r < p: // value went up (row index decreased toward top) - grid[r][x] = '╭' - grid[p][x] = '╯' - for y := r + 1; y < p; y++ { - grid[y][x] = '│' - } - default: // r > p, value went down - grid[p][x] = '╮' - grid[r][x] = '╰' - for y := p + 1; y < r; y++ { - grid[y][x] = '│' - } - } - } - - // Y axis tick labels. - ticks := gpuNiceTicks(mn, mx, height/2) - tickAtRow := make(map[int]string) - labelWidth := 4 - for _, t := range ticks { - r := int(math.Round((mx - t) / (mx - mn) * float64(height))) - if r < 0 || r > height { - continue - } - s := gpuFormatTick(t) - tickAtRow[r] = s - if len(s) > labelWidth { - labelWidth = len(s) - } - } - - var b strings.Builder - for r := 0; r <= height; r++ { - label := tickAtRow[r] - fmt.Fprintf(&b, "%*s", labelWidth, label) - switch { - case label != "": - b.WriteRune('┤') - case r == height: - b.WriteRune('┼') - default: - b.WriteRune('│') - } - b.WriteString(color) - b.WriteString(string(grid[r])) - b.WriteString(ansiReset) - b.WriteRune('\n') - } - - // Bottom axis. - b.WriteString(strings.Repeat(" ", labelWidth)) - b.WriteRune('└') - b.WriteString(strings.Repeat("─", w)) - b.WriteRune('\n') - - // Caption centered under the chart. 
- if caption != "" { - total := labelWidth + 1 + w - if pad := (total - len(caption)) / 2; pad > 0 { - b.WriteString(strings.Repeat(" ", pad)) - } - b.WriteString(caption) - b.WriteRune('\n') - } - - return b.String() -} - -func extractGPUField(rows []GPUMetricRow, fn func(GPUMetricRow) float64) []float64 { - v := make([]float64, len(rows)) - for i, r := range rows { - v[i] = fn(r) - } - return v -} - -// gpuDownsample averages vals into w buckets (or nearest-neighbor upsamples if len(vals) < w). -func gpuDownsample(vals []float64, w int) []float64 { - n := len(vals) - if n == 0 { - return make([]float64, w) - } - result := make([]float64, w) - if n >= w { - counts := make([]int, w) - for i, v := range vals { - bucket := i * w / n - if bucket >= w { - bucket = w - 1 - } - result[bucket] += v - counts[bucket]++ - } - for i := range result { - if counts[i] > 0 { - result[i] /= float64(counts[i]) - } - } - } else { - // Nearest-neighbour upsample. - for i := range result { - src := i * (n - 1) / (w - 1) - if src >= n { - src = n - 1 - } - result[i] = vals[src] - } - } - return result -} - func gpuMinMax(vals []float64) (float64, float64) { if len(vals) == 0 { return 0, 1 @@ -641,3 +484,46 @@ func gpuFormatTick(v float64) string { } return strconv.FormatFloat(v, 'f', 1, 64) } + +var gpuMetricStagePalette = []string{ + "#d95c5c", + "#2185d0", + "#21ba45", + "#f2c037", + "#6435c9", + "#00b5ad", + "#a5673f", +} + +func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan { + var spans []gpuMetricStageSpan + for _, row := range rows { + name := strings.TrimSpace(row.Stage) + if name == "" { + name = "run" + } + if len(spans) == 0 || spans[len(spans)-1].Name != name { + spans = append(spans, gpuMetricStageSpan{Name: name, Start: row.ElapsedSec, End: row.ElapsedSec}) + continue + } + spans[len(spans)-1].End = row.ElapsedSec + } + for i := range spans { + if spans[i].End <= spans[i].Start { + spans[i].End = spans[i].Start + 1 + } + } + return spans +} + +var 
gpuHTMLReplacer = strings.NewReplacer(
+	"&", "&amp;",
+	"<", "&lt;",
+	">", "&gt;",
+	`"`, "&#34;",
+	"'", "&#39;",
+)
+
+func gpuHTMLEscape(s string) string {
+	return gpuHTMLReplacer.Replace(s)
+}
diff --git a/audit/internal/platform/gpu_metrics_test.go b/audit/internal/platform/gpu_metrics_test.go
new file mode 100644
index 0000000..f4590f4
--- /dev/null
+++ b/audit/internal/platform/gpu_metrics_test.go
@@ -0,0 +1,65 @@
+package platform
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+func TestWriteGPUMetricsCSVIncludesStageColumn(t *testing.T) {
+	t.Parallel()
+
+	dir := t.TempDir()
+	path := filepath.Join(dir, "gpu-metrics.csv")
+	rows := []GPUMetricRow{
+		{Stage: "warmup", ElapsedSec: 1, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 80, PowerW: 420, ClockMHz: 1800, MemClockMHz: 1200},
+	}
+	if err := WriteGPUMetricsCSV(path, rows); err != nil {
+		t.Fatalf("WriteGPUMetricsCSV: %v", err)
+	}
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("ReadFile: %v", err)
+	}
+	text := string(raw)
+	for _, needle := range []string{
+		"stage,elapsed_sec,gpu_index",
+		`"warmup",1.0,0,71.0,99.0,80.0,420.0,1800,1200`,
+	} {
+		if !strings.Contains(text, needle) {
+			t.Fatalf("csv missing %q\n%s", needle, text)
+		}
+	}
+}
+
+func TestWriteGPUMetricsHTMLShowsStageLegendAndLabels(t *testing.T) {
+	t.Parallel()
+
+	dir := t.TempDir()
+	path := filepath.Join(dir, "gpu-metrics.html")
+	rows := []GPUMetricRow{
+		{Stage: "baseline", ElapsedSec: 1, GPUIndex: 0, TempC: 50, UsagePct: 10, MemUsagePct: 5, PowerW: 100, ClockMHz: 500, MemClockMHz: 400},
+		{Stage: "baseline", ElapsedSec: 2, GPUIndex: 0, TempC: 51, UsagePct: 11, MemUsagePct: 5, PowerW: 101, ClockMHz: 510, MemClockMHz: 400},
+		{Stage: "steady-fp16", ElapsedSec: 3, GPUIndex: 0, TempC: 70, UsagePct: 98, MemUsagePct: 75, PowerW: 390, ClockMHz: 1700, MemClockMHz: 1100},
+		{Stage: "steady-fp16", ElapsedSec: 4, GPUIndex: 0, TempC: 71, UsagePct: 99, MemUsagePct: 76, PowerW: 395, ClockMHz: 1710, MemClockMHz: 1110},
+	}
+ if err := WriteGPUMetricsHTML(path, rows); err != nil { + t.Fatalf("WriteGPUMetricsHTML: %v", err) + } + raw, err := os.ReadFile(path) + if err != nil { + t.Fatalf("ReadFile: %v", err) + } + text := string(raw) + for _, needle := range []string{ + "stage-legend", + "baseline", + "steady-fp16", + "GPU Stress Test Metrics", + } { + if !strings.Contains(text, needle) { + t.Fatalf("html missing %q\n%s", needle, text) + } + } +} diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 6702c92..41ebb89 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -108,15 +108,15 @@ type nvidiaGPUHealth struct { } type nvidiaGPUStatusFile struct { - Index int - Name string - RunStatus string - Reason string - Health string - HealthRaw string - Observed bool - Selected bool - FailingJob string + Index int + Name string + RunStatus string + Reason string + Health string + HealthRaw string + Observed bool + Selected bool + FailingJob string } // AMDGPUInfo holds basic info about an AMD GPU from rocm-smi. 
@@ -410,13 +410,13 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode( satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}}, - satJob{ - name: "03-dcgmproftester.log", - cmd: profCmd, - env: profEnv, - collectGPU: true, - gpuIndices: selected, - }, + satJob{ + name: "03-dcgmproftester.log", + cmd: profCmd, + env: profEnv, + collectGPU: true, + gpuIndices: selected, + }, satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, ), logFunc) } @@ -1382,8 +1382,6 @@ func runSATCommandWithMetrics(ctx context.Context, verboseLog, name string, cmd if len(metricRows) > 0 { _ = WriteGPUMetricsCSV(filepath.Join(runDir, "gpu-metrics.csv"), metricRows) _ = WriteGPUMetricsHTML(filepath.Join(runDir, "gpu-metrics.html"), metricRows) - chart := RenderGPUTerminalChart(metricRows) - _ = os.WriteFile(filepath.Join(runDir, "gpu-metrics-term.txt"), []byte(chart), 0644) } return out, err