From f8cd9a73761ef09f53363a89f4c8e2f99e80b2ec Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sun, 19 Apr 2026 18:04:12 +0300 Subject: [PATCH] Rework Power Fit report: 90 min stability, aligned tables, PSU/fan sections - Increase stability profile duration from 33 min to 90 min by wiring powerBenchDurationSec() into runBenchmarkPowerCalibration (was discarded) - Collect per-step PSU slot readings, fan RPM/duty, and per-GPU telemetry in ramp loop; add matching fields to NvidiaPowerBenchStep/NvidiaPowerBenchGPU - Rewrite renderPowerBenchReport: replace Per-Slot Results with Single GPU section, rework Ramp Sequence rows=runs/cols=GPUs, add PSU Performance section (conditional on IPMI data), add transposed Single vs All-GPU comparison table in per-GPU sections - Add fmtMDTable helper (benchmark_table.go) and apply to all tables in both power and performance reports so columns align in plain-text view Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/benchmark.go | 446 ++++++++++++++++---- audit/internal/platform/benchmark_report.go | 338 ++++++++------- audit/internal/platform/benchmark_table.go | 75 ++++ audit/internal/platform/benchmark_types.go | 14 +- 4 files changed, 647 insertions(+), 226 deletions(-) create mode 100644 audit/internal/platform/benchmark_table.go diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 613f60f..7a1b991 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -3055,8 +3055,12 @@ func runBenchmarkPowerCalibration( infoByIndex map[int]benchmarkGPUInfo, logFunc func(string), seedLimits map[int]int, + durationSec int, ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) { - const calibDurationSec = 120 + calibDurationSec := durationSec + if calibDurationSec <= 0 { + calibDurationSec = 120 + } const maxDerateW = 150 // calibSearchTolerance is the binary-search convergence threshold in watts. 
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used. @@ -3436,6 +3440,18 @@ func roundTo5W(w int) int { return ((w + 2) / 5) * 5 } +// meanFanRPM returns the average RPM across a set of fan readings. +func meanFanRPM(fans []FanReading) float64 { + if len(fans) == 0 { + return 0 + } + var sum float64 + for _, f := range fans { + sum += f.RPM + } + return sum / float64(len(fans)) +} + func powerBenchDurationSec(profile string) int { switch strings.TrimSpace(strings.ToLower(profile)) { case NvidiaBenchmarkProfileStability: @@ -3475,30 +3491,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { // Server power comparison table. if sp := result.ServerPower; sp != nil { b.WriteString("## Server vs GPU Power Comparison\n\n") - b.WriteString("| Metric | Source | Value |\n") - b.WriteString("|--------|--------|-------|\n") - fmt.Fprintf(&b, "| GPU stable limits sum | nvidia-smi | %.0f W |\n", result.PlatformMaxTDPW) - fmt.Fprintf(&b, "| GPU actual power sum (p95, last step) | nvidia-smi | %.0f W |\n", sp.GPUReportedSumW) + var spRows [][]string + spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)}) + spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)}) if sp.GPUSlotTotalW > 0 { - fmt.Fprintf(&b, "| GPU PCIe slot power (at peak load) | IPMI SDR | %.0f W |\n", sp.GPUSlotTotalW) + spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)}) } if sp.Available { - fmt.Fprintf(&b, "| Server idle power | IPMI DCMI | %.0f W |\n", sp.IdleW) - fmt.Fprintf(&b, "| Server loaded power | IPMI DCMI | %.0f W |\n", sp.LoadedW) - fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | IPMI DCMI | %.0f W |\n", sp.DeltaW) + spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)}) + spRows = append(spRows, 
[]string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)}) + spRows = append(spRows, []string{"Server Δ power (loaded − idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)}) } if sp.PSUInputLoadedW > 0 { - fmt.Fprintf(&b, "| PSU AC input (idle) | IPMI SDR | %.0f W |\n", sp.PSUInputIdleW) - fmt.Fprintf(&b, "| PSU AC input (loaded) | IPMI SDR | %.0f W |\n", sp.PSUInputLoadedW) + spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)}) + spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)}) psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW - fmt.Fprintf(&b, "| PSU AC input Δ (loaded − idle) | IPMI SDR | %.0f W |\n", psuDelta) + spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)}) } if sp.PSUOutputLoadedW > 0 { - fmt.Fprintf(&b, "| PSU DC output (idle) | IPMI SDR | %.0f W |\n", sp.PSUOutputIdleW) - fmt.Fprintf(&b, "| PSU DC output (loaded) | IPMI SDR | %.0f W |\n", sp.PSUOutputLoadedW) + spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)}) + spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)}) if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 { psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100 - fmt.Fprintf(&b, "| PSU conversion efficiency (idle) | IPMI SDR | %.1f%% |\n", psuEff) + spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)}) } } if sp.Available { @@ -3516,7 +3531,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { default: ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power" } - fmt.Fprintf(&b, "| Reporting ratio (DCMI Δ / GPU actual) | IPMI DCMI | %.2f — %s |\n", ratio, ratioNote) + spRows = append(spRows, []string{"Reporting ratio (DCMI Δ 
/ GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)}) if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 { psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW sdrRatio := psuDelta / sp.GPUReportedSumW @@ -3529,11 +3544,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { default: sdrNote = "✗ significant discrepancy" } - fmt.Fprintf(&b, "| Reporting ratio (SDR PSU Δ / GPU actual) | IPMI SDR | %.2f — %s |\n", sdrRatio, sdrNote) + spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)}) } } else { - b.WriteString("| IPMI availability | — | not available — IPMI not supported or ipmitool not found |\n") + spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"}) } + b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows)) for _, note := range sp.Notes { fmt.Fprintf(&b, "\n> %s\n", note) } @@ -3541,10 +3557,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 { b.WriteString("## PSU Load Distribution\n\n") - b.WriteString("| Slot | AC Input (idle) | AC Input (loaded) | DC Output (idle) | DC Output (loaded) | Load Δ | Status |\n") - b.WriteString("|------|-----------------|-------------------|------------------|--------------------|--------|--------|\n") - // collect all slot keys slotSet := map[string]struct{}{} for k := range sp.PSUSlotReadingsIdle { slotSet[k] = struct{}{} @@ -3558,17 +3571,18 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } sort.Strings(slots) + fmtW := func(v *float64) string { + if v == nil { + return "—" + } + return fmt.Sprintf("%.0f W", *v) + } + + var psuDistRows [][]string for _, slot := range slots { idle := sp.PSUSlotReadingsIdle[slot] loaded := sp.PSUSlotReadingsLoaded[slot] - fmtW := func(v *float64) string { - if v == nil { - return 
"—" - } - return fmt.Sprintf("%.0f W", *v) - } - var deltaStr string if idle.InputW != nil && loaded.InputW != nil { deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW) @@ -3584,13 +3598,14 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { status = "—" } - fmt.Fprintf(&b, "| %s | %s | %s | %s | %s | %s | %s |\n", + psuDistRows = append(psuDistRows, []string{ slot, fmtW(idle.InputW), fmtW(loaded.InputW), fmtW(idle.OutputW), fmtW(loaded.OutputW), deltaStr, status, - ) + }) } + b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows)) b.WriteString("\n") } } @@ -3602,28 +3617,194 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } b.WriteString("\n") } - if len(result.RecommendedSlotOrder) > 0 { - b.WriteString("## Recommended Slot Order\n\n") - fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder)) - } - if len(result.RampSteps) > 0 { - b.WriteString("## Ramp Sequence\n\n") - b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n") - b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n") - for _, step := range result.RampSteps { - derated := "-" - if step.Derated { - derated = "⚠ yes" + // ── Single GPU section ─────────────────────────────────────────────────── + b.WriteString("## Single GPU\n\n") + { + var sgRows [][]string + for _, gpu := range result.GPUs { + clk := "—" + mem := "—" + temp := "—" + pwr := "—" + if gpu.Telemetry != nil { + clk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz) + mem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz) + temp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC) + pwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW) } - serverDelta := "-" - if step.ServerDeltaW > 0 { - serverDelta 
= fmt.Sprintf("%.0f W", step.ServerDeltaW) + serverDelta := "—" + if gpu.ServerDeltaW > 0 { + serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW) } - fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n", - step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status) + fan := "—" + if gpu.AvgFanRPM > 0 { + if gpu.AvgFanDutyCyclePct > 0 { + fan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct) + } else { + fan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM) + } + } + sgRows = append(sgRows, []string{ + fmt.Sprintf("GPU %d", gpu.Index), + fmt.Sprintf("%s (%s)", clk, mem), + temp, + pwr, + serverDelta, + fan, + }) } + b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows)) b.WriteString("\n") } + if len(result.RecommendedSlotOrder) > 0 { + fmt.Fprintf(&b, "Recommended slot order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder)) + } + + // ── Ramp Sequence ──────────────────────────────────────────────────────── + // Rows = run number; Cols = per-GPU power (from step telemetry) + aggregates. + if len(result.RampSteps) > 0 { + b.WriteString("## Ramp Sequence\n\n") + + // Collect all GPU indices that appear across all steps (ordered by first appearance). + allGPUIndices := make([]int, 0, len(result.GPUs)) + seen := map[int]bool{} + for _, step := range result.RampSteps { + for _, idx := range step.GPUIndices { + if !seen[idx] { + seen[idx] = true + allGPUIndices = append(allGPUIndices, idx) + } + } + } + + var idleW float64 + if result.ServerPower != nil { + idleW = result.ServerPower.IdleW + } + + // Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff. 
+ headers := []string{"Run"} + for _, idx := range allGPUIndices { + headers = append(headers, fmt.Sprintf("GPU %d W", idx)) + } + headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.") + + var rampRows [][]string + for _, step := range result.RampSteps { + row := []string{fmt.Sprintf("%d", step.StepIndex)} + for _, idx := range allGPUIndices { + inStep := false + for _, si := range step.GPUIndices { + if si == idx { + inStep = true + break + } + } + if !inStep { + row = append(row, "—") + continue + } + gpuPwr := "—" + if t, ok := step.PerGPUTelemetry[idx]; ok && t != nil && t.AvgPowerW > 0 { + gpuPwr = fmt.Sprintf("%.0f", t.AvgPowerW) + } + row = append(row, gpuPwr) + } + // Server wall W + serverWall := "—" + if step.ServerLoadedW > 0 { + serverWall = fmt.Sprintf("%.0f", step.ServerLoadedW) + } + // Per GPU wall W = ServerDeltaW / len(GPUIndices) + perGPUWall := "—" + if step.ServerDeltaW > 0 && len(step.GPUIndices) > 0 { + perGPUWall = fmt.Sprintf("%.0f", step.ServerDeltaW/float64(len(step.GPUIndices))) + } + // Platform eff. = (ServerLoadedW − idleW) / TotalObservedPowerW + platEff := "—" + if step.TotalObservedPowerW > 0 { + eff := step.ServerDeltaW / step.TotalObservedPowerW + if idleW > 0 && step.ServerLoadedW > 0 { + eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW + } + platEff = fmt.Sprintf("%.2f", eff) + } + row = append(row, serverWall, perGPUWall, platEff) + rampRows = append(rampRows, row) + } + b.WriteString(fmtMDTable(headers, rampRows)) + b.WriteString("\n") + } + + // ── PSU Performance ─────────────────────────────────────────────────────── + { + // Collect all PSU slot keys from any ramp step. 
+ psuSlotSet := map[string]struct{}{} + for _, step := range result.RampSteps { + for k := range step.PSUSlotReadings { + psuSlotSet[k] = struct{}{} + } + } + if len(psuSlotSet) > 0 { + b.WriteString("## PSU Performance\n\n") + psuSlots := make([]string, 0, len(psuSlotSet)) + for k := range psuSlotSet { + psuSlots = append(psuSlots, k) + } + sort.Strings(psuSlots) + + var idleW float64 + if result.ServerPower != nil { + idleW = result.ServerPower.IdleW + } + + psuHeaders := []string{"Run"} + for _, slot := range psuSlots { + psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot)) + } + psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)") + + var psuRows [][]string + for _, step := range result.RampSteps { + row := []string{fmt.Sprintf("%d", step.StepIndex)} + var psuTotal float64 + for _, slot := range psuSlots { + sp, ok := step.PSUSlotReadings[slot] + if !ok || sp.InputW == nil { + row = append(row, "—") + continue + } + row = append(row, fmt.Sprintf("%.0f", *sp.InputW)) + psuTotal += *sp.InputW + } + totalStr := "—" + if psuTotal > 0 { + totalStr = fmt.Sprintf("%.0f", psuTotal) + } + platEff := "—" + if step.TotalObservedPowerW > 0 { + eff := step.ServerDeltaW / step.TotalObservedPowerW + if idleW > 0 && step.ServerLoadedW > 0 { + eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW + } + platEff = fmt.Sprintf("%.2f", eff) + } + fan := "—" + if step.AvgFanRPM > 0 { + if step.AvgFanDutyCyclePct > 0 { + fan = fmt.Sprintf("%.0f (%.0f%%)", step.AvgFanRPM, step.AvgFanDutyCyclePct) + } else { + fan = fmt.Sprintf("%.0f", step.AvgFanRPM) + } + } + row = append(row, totalStr, platEff, fan) + psuRows = append(psuRows, row) + } + b.WriteString(fmtMDTable(psuHeaders, psuRows)) + b.WriteString("\n") + } + } + // ── PSU Issues ──────────────────────────────────────────────────────────── if len(result.PSUIssues) > 0 { b.WriteString("## PSU Issues\n\n") @@ -3646,8 +3827,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) 
string { totalDefault += gpu.DefaultPowerLimitW totalStable += stable } - b.WriteString("| GPU | Default TDP | Single-card limit | Stable limit | Realization | Derated |\n") - b.WriteString("|-----|-------------|-------------------|--------------|-------------|----------|\n") + var pdRows [][]string for _, gpu := range result.GPUs { stable := gpu.StablePowerLimitW if stable <= 0 { @@ -3661,15 +3841,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { if gpu.Derated { derated = "⚠ yes" } - fmt.Fprintf(&b, "| GPU %d | %.0f W | %.0f W | %.0f W | %s | %s |\n", - gpu.Index, gpu.DefaultPowerLimitW, gpu.AppliedPowerLimitW, stable, realization, derated) + pdRows = append(pdRows, []string{ + fmt.Sprintf("GPU %d", gpu.Index), + fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW), + fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW), + fmt.Sprintf("%.0f W", stable), + realization, + derated, + }) } platformReal := "-" if totalDefault > 0 && totalStable > 0 { platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100) } - fmt.Fprintf(&b, "| **Platform** | **%.0f W** | — | **%.0f W** | **%s** | |\n\n", - totalDefault, totalStable, platformReal) + pdRows = append(pdRows, []string{ + "**Platform**", + fmt.Sprintf("**%.0f W**", totalDefault), + "—", + fmt.Sprintf("**%.0f W**", totalStable), + fmt.Sprintf("**%s**", platformReal), + "", + }) + b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows)) + b.WriteString("\n") // Balance across GPUs — only meaningful with 2+ GPUs. if len(result.GPUs) > 1 { @@ -3710,9 +3904,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { // Ramp scalability table — power efficiency of adding each GPU. 
if len(result.RampSteps) > 1 { b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n") - b.WriteString("| Step | GPUs | Cumulative stable TDP | Incremental | Efficiency vs GPU 1 |\n") - b.WriteString("|------|------|-----------------------|-------------|---------------------|\n") - // First GPU stable TDP as the reference unit for efficiency. var firstStable float64 if len(result.GPUs) > 0 { firstStable = result.GPUs[0].StablePowerLimitW @@ -3721,6 +3912,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } } var prevCumulative float64 + var scalRows [][]string for _, step := range result.RampSteps { var cumulative float64 for _, gpuIdx := range step.GPUIndices { @@ -3740,40 +3932,104 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { if step.StepIndex > 1 && firstStable > 0 { efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100) } - fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %s |\n", - step.StepIndex, joinIndexList(step.GPUIndices), cumulative, incremental, efficiency) + scalRows = append(scalRows, []string{ + fmt.Sprintf("%d", step.StepIndex), + joinIndexList(step.GPUIndices), + fmt.Sprintf("%.0f W", cumulative), + fmt.Sprintf("%.0f W", incremental), + efficiency, + }) prevCumulative = cumulative } + b.WriteString(fmtMDTable([]string{"Step", "GPUs", "Cumulative stable TDP", "Incremental", "Efficiency vs GPU 1"}, scalRows)) b.WriteString("\n") } } - b.WriteString("## Per-Slot Results\n\n") - b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n") - b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n") - for _, gpu := range result.GPUs { - stableLimit := "-" - if gpu.StablePowerLimitW > 0 { - if gpu.Derated { - stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW) - } else { - stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW) - } - } - serverDelta := "-" - if gpu.ServerDeltaW > 
0 { - serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW) - } - fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n", - gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts) + // ── Per-GPU sections ────────────────────────────────────────────────────── + var lastStep *NvidiaPowerBenchStep + if n := len(result.RampSteps); n > 0 { + lastStep = &result.RampSteps[n-1] } - b.WriteString("\n") for _, gpu := range result.GPUs { fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name) + + // Transposed comparison table: Single Run vs All GPU Run. + singleClk := "—" + singleMem := "—" + singleTemp := "—" + singlePwr := "—" + singleWall := "—" + singleFan := "—" + if gpu.Telemetry != nil { + singleClk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz) + singleMem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz) + singleTemp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC) + singlePwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW) + } + if gpu.ServerDeltaW > 0 { + singleWall = fmt.Sprintf("%.0f W", gpu.ServerDeltaW) + } + if gpu.AvgFanRPM > 0 { + if gpu.AvgFanDutyCyclePct > 0 { + singleFan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct) + } else { + singleFan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM) + } + } + + allClk := "—" + allMem := "—" + allTemp := "—" + allPwr := "—" + allWall := "—" + allFan := "—" + if lastStep != nil { + if t, ok := lastStep.PerGPUTelemetry[gpu.Index]; ok && t != nil { + allClk = fmt.Sprintf("%.0f", t.AvgGraphicsClockMHz) + allMem = fmt.Sprintf("%.0f", t.AvgMemoryClockMHz) + allTemp = fmt.Sprintf("%.1f", t.AvgTempC) + allPwr = fmt.Sprintf("%.0f W", t.AvgPowerW) + } + if lastStep.ServerDeltaW > 0 && len(lastStep.GPUIndices) > 0 { + allWall = fmt.Sprintf("%.0f W", lastStep.ServerDeltaW/float64(len(lastStep.GPUIndices))) + } + if lastStep.AvgFanRPM > 0 { + if lastStep.AvgFanDutyCyclePct > 0 { + allFan = fmt.Sprintf("%.0f RPM 
(%.0f%%)", lastStep.AvgFanRPM, lastStep.AvgFanDutyCyclePct) + } else { + allFan = fmt.Sprintf("%.0f RPM", lastStep.AvgFanRPM) + } + } + } + + tableHeaders := []string{"", "Single Run"} + if lastStep != nil { + tableHeaders = append(tableHeaders, "All GPU Run") + } + compRows := [][]string{ + {"Clock MHz (Mem MHz)", fmt.Sprintf("%s (%s)", singleClk, singleMem)}, + {"Avg Temp °C", singleTemp}, + {"Power W", singlePwr}, + {"Per GPU wall W", singleWall}, + {"Fan RPM (duty%)", singleFan}, + } + if lastStep != nil { + compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem)) + compRows[1] = append(compRows[1], allTemp) + compRows[2] = append(compRows[2], allPwr) + compRows[3] = append(compRows[3], allWall) + compRows[4] = append(compRows[4], allFan) + } + b.WriteString(fmtMDTable(tableHeaders, compRows)) + b.WriteString("\n") + for _, note := range gpu.Notes { fmt.Fprintf(&b, "- %s\n", note) } - b.WriteString("\n") + if len(gpu.Notes) > 0 { + b.WriteString("\n") + } } return b.String() } @@ -3860,7 +4116,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N OverallStatus: "OK", } durationSec := powerBenchDurationSec(opts.Profile) - _ = durationSec // Sample IPMI idle power before any GPU load. 
var serverIdleW float64 @@ -3894,7 +4149,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N ipmiSingleDone <- w } }() - c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil) + c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec) appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0) ipmiSingleCancel() if w, ok := <-ipmiSingleDone; ok { @@ -3947,6 +4202,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N t := summarizeBenchmarkTelemetry(calib.MetricRows) gpu.Telemetry = &t } + if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 { + gpu.AvgFanRPM = meanFanRPM(fans) + if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok { + gpu.AvgFanDutyCyclePct = duty + } + } gpus = append(gpus, gpu) } sort.Slice(gpus, func(i, j int) bool { @@ -4077,7 +4338,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N ipmiStepDone <- w } }() - stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep) + stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec) appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0) ipmiStepCancel() var stepIPMILoadedW float64 @@ -4159,6 +4420,29 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N } } + // Per-step PSU slot snapshot. + sdrStep := sampleIPMISDRPowerSensors() + if len(sdrStep.PSUSlots) > 0 { + ramp.PSUSlotReadings = sdrStep.PSUSlots + } + + // Fan state at end of ramp step. 
+ if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 { + ramp.AvgFanRPM = meanFanRPM(fans) + if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok { + ramp.AvgFanDutyCyclePct = duty + } + } + + // Per-GPU telemetry from this ramp step's calibration. + ramp.PerGPUTelemetry = make(map[int]*BenchmarkTelemetrySummary, len(subset)) + for _, gpuIdx := range subset { + if c, ok := stepCalib[gpuIdx]; ok { + s := c.Summary + ramp.PerGPUTelemetry[gpuIdx] = &s + } + } + result.RampSteps = append(result.RampSteps, ramp) } diff --git a/audit/internal/platform/benchmark_report.go b/audit/internal/platform/benchmark_report.go index 646d3b1..142e63a 100644 --- a/audit/internal/platform/benchmark_report.go +++ b/audit/internal/platform/benchmark_report.go @@ -89,136 +89,159 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { // Perspective 1: Compatibility — hard stops b.WriteString("### 1. Compatibility\n\n") - b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n") - b.WriteString("|-----|------------------|----------------------|------------|--------|\n") - for _, gpu := range result.GPUs { - thermalThrottle := "-" - if gpu.Scores.ThermalThrottlePct > 0 { - thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct) + { + var rows [][]string + for _, gpu := range result.GPUs { + thermalThrottle := "-" + if gpu.Scores.ThermalThrottlePct > 0 { + thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct) + } + fanAtThrottle := "-" + if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 { + fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct) + } + ecc := "-" + if gpu.ECC.Uncorrected > 0 { + ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected) + } + compatStatus := "✓ OK" + if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && 
result.Cooling.P95FanDutyCyclePct < 95) { + compatStatus = "⛔ HARD STOP" + } + rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus}) } - fanAtThrottle := "-" - if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 { - fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct) - } - ecc := "-" - if gpu.ECC.Uncorrected > 0 { - ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected) - } - compatStatus := "✓ OK" - if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) { - compatStatus = "⛔ HARD STOP" - } - fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n", - gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus) + b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows)) + b.WriteString("\n") } - b.WriteString("\n") // Perspective 2: Thermal headroom b.WriteString("### 2. 
Thermal Headroom\n\n") - b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n") - b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n") - for _, gpu := range result.GPUs { - shutdownTemp := gpu.ShutdownTempC - if shutdownTemp <= 0 { - shutdownTemp = 90 + { + var rows [][]string + for _, gpu := range result.GPUs { + shutdownTemp := gpu.ShutdownTempC + if shutdownTemp <= 0 { + shutdownTemp = 90 + } + slowdownTemp := gpu.SlowdownTempC + if slowdownTemp <= 0 { + slowdownTemp = 80 + } + headroom := gpu.Scores.TempHeadroomC + thermalStatus := "✓ OK" + switch { + case headroom < 10: + thermalStatus = "⛔ CRITICAL" + case gpu.Steady.P95TempC >= slowdownTemp: + thermalStatus = "⚠ WARNING" + } + throttlePct := "-" + if gpu.Scores.ThermalThrottlePct > 0 { + throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct) + } + rows = append(rows, []string{ + fmt.Sprintf("GPU %d", gpu.Index), + fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC), + fmt.Sprintf("%.0f°C", slowdownTemp), + fmt.Sprintf("%.0f°C", shutdownTemp), + fmt.Sprintf("%.1f°C", headroom), + throttlePct, + thermalStatus, + }) } - slowdownTemp := gpu.SlowdownTempC - if slowdownTemp <= 0 { - slowdownTemp = 80 - } - headroom := gpu.Scores.TempHeadroomC - thermalStatus := "✓ OK" - switch { - case headroom < 10: - thermalStatus = "⛔ CRITICAL" - case gpu.Steady.P95TempC >= slowdownTemp: - thermalStatus = "⚠ WARNING" - } - throttlePct := "-" - if gpu.Scores.ThermalThrottlePct > 0 { - throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct) - } - fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n", - gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus) + b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows)) + b.WriteString("\n") } - b.WriteString("\n") 
// Perspective 3: Power delivery b.WriteString("### 3. Power Delivery\n\n") - b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n") - b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n") - for _, gpu := range result.GPUs { - powerCap := "-" - if gpu.Scores.PowerCapThrottlePct > 0 { - powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct) + { + var rows [][]string + for _, gpu := range result.GPUs { + powerCap := "-" + if gpu.Scores.PowerCapThrottlePct > 0 { + powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct) + } + fanDuty := "-" + if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable { + fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct) + } + powerStatus := "✓ OK" + if gpu.Scores.PowerCapThrottlePct > 5 { + powerStatus = "⚠ POWER LIMITED" + } + rows = append(rows, []string{ + fmt.Sprintf("GPU %d", gpu.Index), + powerCap, + fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore), + fanDuty, + powerStatus, + }) } - fanDuty := "-" - if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable { - fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct) - } - powerStatus := "✓ OK" - if gpu.Scores.PowerCapThrottlePct > 5 { - powerStatus = "⚠ POWER LIMITED" - } - fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n", - gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus) + b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows)) + b.WriteString("\n") } - b.WriteString("\n") // Perspective 4: Performance b.WriteString("### 4. Performance\n\n") - b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. 
| TOPS/SM/GHz |\n") - b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n") - for _, gpu := range result.GPUs { - synthetic := "-" - if gpu.Scores.SyntheticScore > 0 { - synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore) + { + var rows [][]string + for _, gpu := range result.GPUs { + synthetic := "-" + if gpu.Scores.SyntheticScore > 0 { + synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore) + } + mixed := "-" + if gpu.Scores.MixedScore > 0 { + mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore) + } + mixedEff := "-" + if gpu.Scores.MixedEfficiency > 0 { + mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100) + } + topsPerSM := "-" + if gpu.Scores.TOPSPerSMPerGHz > 0 { + topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz) + } + rows = append(rows, []string{ + fmt.Sprintf("GPU %d", gpu.Index), + fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore), + synthetic, mixed, mixedEff, topsPerSM, + }) } - mixed := "-" - if gpu.Scores.MixedScore > 0 { - mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore) + b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows)) + if len(result.PerformanceRampSteps) > 0 { + fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore) } - mixedEff := "-" - if gpu.Scores.MixedEfficiency > 0 { - mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100) - } - topsPerSM := "-" - if gpu.Scores.TOPSPerSMPerGHz > 0 { - topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz) - } - fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n", - gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM) + b.WriteString("\n") } - if len(result.PerformanceRampSteps) > 0 { - fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore) - } - b.WriteString("\n") // Perspective 5: Anomaly flags b.WriteString("### 5. 
Anomalies\n\n") - b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n") - b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n") - for _, gpu := range result.GPUs { - eccCorr := "-" - if gpu.ECC.Corrected > 0 { - eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected) + { + var rows [][]string + for _, gpu := range result.GPUs { + eccCorr := "-" + if gpu.ECC.Corrected > 0 { + eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected) + } + syncBoost := "-" + if gpu.Scores.SyncBoostThrottlePct > 0 { + syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct) + } + powerVar := "OK" + if gpu.Scores.PowerSustainScore < 70 { + powerVar = "⚠ unstable" + } + thermalVar := "OK" + if gpu.Scores.ThermalSustainScore < 70 { + thermalVar = "⚠ unstable" + } + rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar}) } - syncBoost := "-" - if gpu.Scores.SyncBoostThrottlePct > 0 { - syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct) - } - powerVar := "OK" - if gpu.Scores.PowerSustainScore < 70 { - powerVar = "⚠ unstable" - } - thermalVar := "OK" - if gpu.Scores.ThermalSustainScore < 70 { - thermalVar = "⚠ unstable" - } - fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n", - gpu.Index, eccCorr, syncBoost, powerVar, thermalVar) + b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows)) + b.WriteString("\n") } - b.WriteString("\n") // ── Per GPU detail ──────────────────────────────────────────────────────── b.WriteString("## Per-GPU Details\n\n") @@ -263,12 +286,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { // Steady-state telemetry if benchmarkTelemetryAvailable(gpu.Steady) { fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec)) - b.WriteString("| | Avg | P95 |\n|---|---|---|\n") - 
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW) - fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC) - fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz) - fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz) - fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct) + b.WriteString(fmtMDTable( + []string{"", "Avg", "P95"}, + [][]string{ + {"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)}, + {"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)}, + {"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)}, + {"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)}, + {"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"}, + }, + )) b.WriteString("\n") } else { b.WriteString("**Steady-state telemetry:** unavailable\n\n") @@ -277,7 +304,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { // Per-precision stability phases. 
if len(gpu.PrecisionSteady) > 0 { b.WriteString("**Per-precision stability:**\n\n") - b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n") + var precRows [][]string for _, p := range gpu.PrecisionSteady { eccCorr := "—" eccUncorr := "—" @@ -289,10 +316,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { if strings.TrimSpace(status) == "" { status = "OK" } - fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n", - p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct, - eccCorr, eccUncorr) + precRows = append(precRows, []string{ + p.Precision, status, + fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct), + fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct), + fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct), + eccCorr, eccUncorr, + }) } + b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows)) b.WriteString("\n") } else { // Legacy: show combined-window variance. 
@@ -315,16 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { // Precision results if len(gpu.PrecisionResults) > 0 { b.WriteString("**Precision results:**\n\n") - b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n") + var presRows [][]string for _, p := range gpu.PrecisionResults { if p.Supported { - weightStr := fmt.Sprintf("×%.3g", p.Weight) - fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n", - p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations) + presRows = append(presRows, []string{ + p.Name, + fmt.Sprintf("%.2f", p.TeraOpsPerSec), + fmt.Sprintf("×%.3g", p.Weight), + fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec), + fmt.Sprintf("%d", p.Lanes), + fmt.Sprintf("%d", p.Iterations), + }) } else { - fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name) + presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"}) } } + b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows)) b.WriteString("\n") } @@ -346,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { b.WriteString("## Interconnect (NCCL)\n\n") fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status) if result.Interconnect.Supported { - b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n") - fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps) - fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps) + b.WriteString(fmtMDTable( + []string{"Metric", "Avg", "Max"}, + [][]string{ + {"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)}, + {"Bus BW", 
fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)}, + }, + )) b.WriteString("\n") } for _, note := range result.Interconnect.Notes { @@ -365,14 +407,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { if !sp.Available { b.WriteString("IPMI power measurement unavailable.\n\n") } else { - b.WriteString("| | Value |\n|---|---|\n") - fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW) - fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW) - fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW) - fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW) - if sp.ReportingRatio > 0 { - fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio) + spRows := [][]string{ + {"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)}, + {"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)}, + {"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)}, + {"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)}, } + if sp.ReportingRatio > 0 { + spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)}) + } + b.WriteString(fmtMDTable([]string{"", "Value"}, spRows)) b.WriteString("\n") } for _, note := range sp.Notes { @@ -397,15 +441,19 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { if cooling := result.Cooling; cooling != nil { b.WriteString("## Cooling\n\n") if cooling.Available { - b.WriteString("| Metric | Value |\n|--------|-------|\n") - fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM) + dutyAvg, dutyP95 := "N/A", "N/A" if cooling.FanDutyCycleAvailable { - fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct) - fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct) - } else { - 
b.WriteString("| Average fan duty cycle | N/A |\n") - b.WriteString("| P95 fan duty cycle | N/A |\n") + dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct) + dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct) } + b.WriteString(fmtMDTable( + []string{"Metric", "Value"}, + [][]string{ + {"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)}, + {"Average fan duty cycle", dutyAvg}, + {"P95 fan duty cycle", dutyP95}, + }, + )) b.WriteString("\n") } else { b.WriteString("Cooling telemetry unavailable.\n\n") @@ -422,12 +470,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string { if len(result.PerformanceRampSteps) > 0 { b.WriteString("## Platform Scalability (Performance Ramp)\n\n") fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore) - b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n") - b.WriteString("|--------|-------------|----------------------|-------------|\n") + var scalRows [][]string for _, step := range result.PerformanceRampSteps { - fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n", - step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct) + scalRows = append(scalRows, []string{ + fmt.Sprintf("%d", step.StepIndex), + joinIndexList(step.GPUIndices), + fmt.Sprintf("%.2f", step.TotalSyntheticTOPS), + fmt.Sprintf("%.1f%%", step.ScalabilityPct), + }) } + b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows)) b.WriteString("\n") } diff --git a/audit/internal/platform/benchmark_table.go b/audit/internal/platform/benchmark_table.go new file mode 100644 index 0000000..8e57f73 --- /dev/null +++ b/audit/internal/platform/benchmark_table.go @@ -0,0 +1,75 @@ +package platform + +import ( + "strings" +) + +// fmtMDTable renders a markdown table with column widths padded so the table +// is readable as plain text without a markdown renderer. 
+// +// headers contains the column header strings. +// rows contains data rows. Rows shorter than headers are padded with empty cells; +// extra cells beyond the header count are ignored. +func fmtMDTable(headers []string, rows [][]string) string { + ncols := len(headers) + if ncols == 0 { + return "" + } + + // Compute max width per column. + widths := make([]int, ncols) + for i, h := range headers { + if len(h) > widths[i] { + widths[i] = len(h) + } + } + for _, row := range rows { + for i := 0; i < ncols; i++ { + cell := "" + if i < len(row) { + cell = row[i] + } + if len(cell) > widths[i] { + widths[i] = len(cell) + } + } + } + + var b strings.Builder + + // Header row. + b.WriteByte('|') + for i, h := range headers { + b.WriteByte(' ') + b.WriteString(h) + b.WriteString(strings.Repeat(" ", widths[i]-len(h))) + b.WriteString(" |") + } + b.WriteByte('\n') + + // Separator row. + b.WriteByte('|') + for i := range headers { + b.WriteString(strings.Repeat("-", widths[i]+2)) + b.WriteByte('|') + } + b.WriteByte('\n') + + // Data rows. 
+ for _, row := range rows { + b.WriteByte('|') + for i := 0; i < ncols; i++ { + cell := "" + if i < len(row) { + cell = row[i] + } + b.WriteByte(' ') + b.WriteString(cell) + b.WriteString(strings.Repeat(" ", widths[i]-len(cell))) + b.WriteString(" |") + } + b.WriteByte('\n') + } + + return b.String() +} diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index 0b60bcb..d0c83ef 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -52,7 +52,7 @@ const ( // - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s // - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000) // - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s -// - BenchmarkEstimatedPowerStabilitySec: xFusion v8.17/v8.22 ramp 1-8: 1977-2002 s +// - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts) const ( // Performance Benchmark (bee-gpu-burn). // Duration is per full ramp-up run (ramp 1→N) or per single parallel run. @@ -64,7 +64,7 @@ const ( // Power / Thermal Fit (dcgmi targeted_power binary-search calibration). // Duration is for the full ramp-up run; individual steps vary with convergence speed. BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s - BenchmarkEstimatedPowerStabilitySec = 2000 // ~33 min; stability profile converges faster (longer steady → faster convergence) + BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts BenchmarkEstimatedPowerOvernightSec = 3 * 3600 ) @@ -408,6 +408,9 @@ type NvidiaPowerBenchGPU struct { // Telemetry holds the aggregated stats from the final converged calibration // attempt for this GPU (temperature, power, fan, clock percentiles). 
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"` + // Fan state sampled at the end of single-card calibration. + AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"` + AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"` } type NvidiaPowerBenchStep struct { @@ -426,6 +429,13 @@ type NvidiaPowerBenchStep struct { // ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle. ServerLoadedW float64 `json:"server_loaded_w,omitempty"` ServerDeltaW float64 `json:"server_delta_w,omitempty"` + // PSU slot readings sampled at end of this ramp step. + PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"` + // Fan state at end of this ramp step. + AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"` + AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"` + // Per-GPU telemetry from this step's calibration, keyed by GPU index. + PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"` } // NvidiaPerformanceRampStep holds per-step performance data for the