Rework Power Fit report: 90 min stability, aligned tables, PSU/fan sections

- Increase stability profile duration from 33 min to 90 min by wiring
  powerBenchDurationSec() into runBenchmarkPowerCalibration (was discarded)
- Collect per-step PSU slot readings, fan RPM/duty, and per-GPU telemetry
  in ramp loop; add matching fields to NvidiaPowerBenchStep/NvidiaPowerBenchGPU
- Rewrite renderPowerBenchReport: replace Per-Slot Results with Single GPU
  section, rework Ramp Sequence rows=runs/cols=GPUs, add PSU Performance
  section (conditional on IPMI data), add transposed Single vs All-GPU
  comparison table in per-GPU sections
- Add fmtMDTable helper (benchmark_table.go) and apply to all tables in
  both power and performance reports so columns align in plain-text view

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-19 18:04:12 +03:00
parent d52ec67f8f
commit f8cd9a7376
4 changed files with 647 additions and 226 deletions

View File

@@ -3055,8 +3055,12 @@ func runBenchmarkPowerCalibration(
infoByIndex map[int]benchmarkGPUInfo, infoByIndex map[int]benchmarkGPUInfo,
logFunc func(string), logFunc func(string),
seedLimits map[int]int, seedLimits map[int]int,
durationSec int,
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) { ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
const calibDurationSec = 120 calibDurationSec := durationSec
if calibDurationSec <= 0 {
calibDurationSec = 120
}
const maxDerateW = 150 const maxDerateW = 150
// calibSearchTolerance is the binary-search convergence threshold in watts. // calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used. // When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
@@ -3436,6 +3440,18 @@ func roundTo5W(w int) int {
return ((w + 2) / 5) * 5 return ((w + 2) / 5) * 5
} }
// meanFanRPM returns the arithmetic mean RPM across the given fan
// readings, or 0 when no readings are supplied.
func meanFanRPM(fans []FanReading) float64 {
	n := len(fans)
	if n == 0 {
		return 0
	}
	var total float64
	for i := range fans {
		total += fans[i].RPM
	}
	return total / float64(n)
}
func powerBenchDurationSec(profile string) int { func powerBenchDurationSec(profile string) int {
switch strings.TrimSpace(strings.ToLower(profile)) { switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability: case NvidiaBenchmarkProfileStability:
@@ -3475,30 +3491,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
// Server power comparison table. // Server power comparison table.
if sp := result.ServerPower; sp != nil { if sp := result.ServerPower; sp != nil {
b.WriteString("## Server vs GPU Power Comparison\n\n") b.WriteString("## Server vs GPU Power Comparison\n\n")
b.WriteString("| Metric | Source | Value |\n") var spRows [][]string
b.WriteString("|--------|--------|-------|\n") spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)})
fmt.Fprintf(&b, "| GPU stable limits sum | nvidia-smi | %.0f W |\n", result.PlatformMaxTDPW) spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
fmt.Fprintf(&b, "| GPU actual power sum (p95, last step) | nvidia-smi | %.0f W |\n", sp.GPUReportedSumW)
if sp.GPUSlotTotalW > 0 { if sp.GPUSlotTotalW > 0 {
fmt.Fprintf(&b, "| GPU PCIe slot power (at peak load) | IPMI SDR | %.0f W |\n", sp.GPUSlotTotalW) spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
} }
if sp.Available { if sp.Available {
fmt.Fprintf(&b, "| Server idle power | IPMI DCMI | %.0f W |\n", sp.IdleW) spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)})
fmt.Fprintf(&b, "| Server loaded power | IPMI DCMI | %.0f W |\n", sp.LoadedW) spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)})
fmt.Fprintf(&b, "| Server Δ power (loaded idle) | IPMI DCMI | %.0f W |\n", sp.DeltaW) spRows = append(spRows, []string{"Server Δ power (loaded idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)})
} }
if sp.PSUInputLoadedW > 0 { if sp.PSUInputLoadedW > 0 {
fmt.Fprintf(&b, "| PSU AC input (idle) | IPMI SDR | %.0f W |\n", sp.PSUInputIdleW) spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
fmt.Fprintf(&b, "| PSU AC input (loaded) | IPMI SDR | %.0f W |\n", sp.PSUInputLoadedW) spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
fmt.Fprintf(&b, "| PSU AC input Δ (loaded idle) | IPMI SDR | %.0f W |\n", psuDelta) spRows = append(spRows, []string{"PSU AC input Δ (loaded idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)})
} }
if sp.PSUOutputLoadedW > 0 { if sp.PSUOutputLoadedW > 0 {
fmt.Fprintf(&b, "| PSU DC output (idle) | IPMI SDR | %.0f W |\n", sp.PSUOutputIdleW) spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
fmt.Fprintf(&b, "| PSU DC output (loaded) | IPMI SDR | %.0f W |\n", sp.PSUOutputLoadedW) spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 { if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100 psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
fmt.Fprintf(&b, "| PSU conversion efficiency (idle) | IPMI SDR | %.1f%% |\n", psuEff) spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
} }
} }
if sp.Available { if sp.Available {
@@ -3516,7 +3531,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
default: default:
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power" ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
} }
fmt.Fprintf(&b, "| Reporting ratio (DCMI Δ / GPU actual) | IPMI DCMI | %.2f — %s |\n", ratio, ratioNote) spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 { if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
sdrRatio := psuDelta / sp.GPUReportedSumW sdrRatio := psuDelta / sp.GPUReportedSumW
@@ -3529,11 +3544,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
default: default:
sdrNote = "✗ significant discrepancy" sdrNote = "✗ significant discrepancy"
} }
fmt.Fprintf(&b, "| Reporting ratio (SDR PSU Δ / GPU actual) | IPMI SDR | %.2f — %s |\n", sdrRatio, sdrNote) spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
} }
} else { } else {
b.WriteString("| IPMI availability | — | not available — IPMI not supported or ipmitool not found |\n") spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
} }
b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
for _, note := range sp.Notes { for _, note := range sp.Notes {
fmt.Fprintf(&b, "\n> %s\n", note) fmt.Fprintf(&b, "\n> %s\n", note)
} }
@@ -3541,10 +3557,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 { if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 {
b.WriteString("## PSU Load Distribution\n\n") b.WriteString("## PSU Load Distribution\n\n")
b.WriteString("| Slot | AC Input (idle) | AC Input (loaded) | DC Output (idle) | DC Output (loaded) | Load Δ | Status |\n")
b.WriteString("|------|-----------------|-------------------|------------------|--------------------|--------|--------|\n")
// collect all slot keys
slotSet := map[string]struct{}{} slotSet := map[string]struct{}{}
for k := range sp.PSUSlotReadingsIdle { for k := range sp.PSUSlotReadingsIdle {
slotSet[k] = struct{}{} slotSet[k] = struct{}{}
@@ -3558,17 +3571,18 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
} }
sort.Strings(slots) sort.Strings(slots)
fmtW := func(v *float64) string {
if v == nil {
return "—"
}
return fmt.Sprintf("%.0f W", *v)
}
var psuDistRows [][]string
for _, slot := range slots { for _, slot := range slots {
idle := sp.PSUSlotReadingsIdle[slot] idle := sp.PSUSlotReadingsIdle[slot]
loaded := sp.PSUSlotReadingsLoaded[slot] loaded := sp.PSUSlotReadingsLoaded[slot]
fmtW := func(v *float64) string {
if v == nil {
return "—"
}
return fmt.Sprintf("%.0f W", *v)
}
var deltaStr string var deltaStr string
if idle.InputW != nil && loaded.InputW != nil { if idle.InputW != nil && loaded.InputW != nil {
deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW) deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW)
@@ -3584,13 +3598,14 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
status = "—" status = "—"
} }
fmt.Fprintf(&b, "| %s | %s | %s | %s | %s | %s | %s |\n", psuDistRows = append(psuDistRows, []string{
slot, slot,
fmtW(idle.InputW), fmtW(loaded.InputW), fmtW(idle.InputW), fmtW(loaded.InputW),
fmtW(idle.OutputW), fmtW(loaded.OutputW), fmtW(idle.OutputW), fmtW(loaded.OutputW),
deltaStr, status, deltaStr, status,
) })
} }
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
b.WriteString("\n") b.WriteString("\n")
} }
} }
@@ -3602,28 +3617,194 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
} }
b.WriteString("\n") b.WriteString("\n")
} }
if len(result.RecommendedSlotOrder) > 0 { // ── Single GPU section ───────────────────────────────────────────────────
b.WriteString("## Recommended Slot Order\n\n") b.WriteString("## Single GPU\n\n")
fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder)) {
} var sgRows [][]string
if len(result.RampSteps) > 0 { for _, gpu := range result.GPUs {
b.WriteString("## Ramp Sequence\n\n") clk := "—"
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n") mem := "—"
b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n") temp := "—"
for _, step := range result.RampSteps { pwr := "—"
derated := "-" if gpu.Telemetry != nil {
if step.Derated { clk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz)
derated = "⚠ yes" mem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz)
temp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC)
pwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW)
} }
serverDelta := "-" serverDelta := ""
if step.ServerDeltaW > 0 { if gpu.ServerDeltaW > 0 {
serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW) serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
} }
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n", fan := "—"
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status) if gpu.AvgFanRPM > 0 {
if gpu.AvgFanDutyCyclePct > 0 {
fan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct)
} else {
fan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM)
}
}
sgRows = append(sgRows, []string{
fmt.Sprintf("GPU %d", gpu.Index),
fmt.Sprintf("%s (%s)", clk, mem),
temp,
pwr,
serverDelta,
fan,
})
} }
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
b.WriteString("\n") b.WriteString("\n")
} }
if len(result.RecommendedSlotOrder) > 0 {
fmt.Fprintf(&b, "Recommended slot order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
}
// ── Ramp Sequence ────────────────────────────────────────────────────────
// Rows = run number; Cols = per-GPU power (from step telemetry) + aggregates.
if len(result.RampSteps) > 0 {
b.WriteString("## Ramp Sequence\n\n")
// Collect all GPU indices that appear across all steps (ordered by first appearance).
allGPUIndices := make([]int, 0, len(result.GPUs))
seen := map[int]bool{}
for _, step := range result.RampSteps {
for _, idx := range step.GPUIndices {
if !seen[idx] {
seen[idx] = true
allGPUIndices = append(allGPUIndices, idx)
}
}
}
var idleW float64
if result.ServerPower != nil {
idleW = result.ServerPower.IdleW
}
// Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff.
headers := []string{"Run"}
for _, idx := range allGPUIndices {
headers = append(headers, fmt.Sprintf("GPU %d W", idx))
}
headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.")
var rampRows [][]string
for _, step := range result.RampSteps {
row := []string{fmt.Sprintf("%d", step.StepIndex)}
for _, idx := range allGPUIndices {
inStep := false
for _, si := range step.GPUIndices {
if si == idx {
inStep = true
break
}
}
if !inStep {
row = append(row, "—")
continue
}
gpuPwr := "—"
if t, ok := step.PerGPUTelemetry[idx]; ok && t != nil && t.AvgPowerW > 0 {
gpuPwr = fmt.Sprintf("%.0f", t.AvgPowerW)
}
row = append(row, gpuPwr)
}
// Server wall W
serverWall := "—"
if step.ServerLoadedW > 0 {
serverWall = fmt.Sprintf("%.0f", step.ServerLoadedW)
}
// Per GPU wall W = ServerDeltaW / len(GPUIndices)
perGPUWall := "—"
if step.ServerDeltaW > 0 && len(step.GPUIndices) > 0 {
perGPUWall = fmt.Sprintf("%.0f", step.ServerDeltaW/float64(len(step.GPUIndices)))
}
			// Platform eff. = (ServerLoadedW − idleW) / TotalObservedPowerW
platEff := "—"
if step.TotalObservedPowerW > 0 {
eff := step.ServerDeltaW / step.TotalObservedPowerW
if idleW > 0 && step.ServerLoadedW > 0 {
eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW
}
platEff = fmt.Sprintf("%.2f", eff)
}
row = append(row, serverWall, perGPUWall, platEff)
rampRows = append(rampRows, row)
}
b.WriteString(fmtMDTable(headers, rampRows))
b.WriteString("\n")
}
// ── PSU Performance ───────────────────────────────────────────────────────
{
// Collect all PSU slot keys from any ramp step.
psuSlotSet := map[string]struct{}{}
for _, step := range result.RampSteps {
for k := range step.PSUSlotReadings {
psuSlotSet[k] = struct{}{}
}
}
if len(psuSlotSet) > 0 {
b.WriteString("## PSU Performance\n\n")
psuSlots := make([]string, 0, len(psuSlotSet))
for k := range psuSlotSet {
psuSlots = append(psuSlots, k)
}
sort.Strings(psuSlots)
var idleW float64
if result.ServerPower != nil {
idleW = result.ServerPower.IdleW
}
psuHeaders := []string{"Run"}
for _, slot := range psuSlots {
psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
}
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
var psuRows [][]string
for _, step := range result.RampSteps {
row := []string{fmt.Sprintf("%d", step.StepIndex)}
var psuTotal float64
for _, slot := range psuSlots {
sp, ok := step.PSUSlotReadings[slot]
if !ok || sp.InputW == nil {
row = append(row, "—")
continue
}
row = append(row, fmt.Sprintf("%.0f", *sp.InputW))
psuTotal += *sp.InputW
}
totalStr := "—"
if psuTotal > 0 {
totalStr = fmt.Sprintf("%.0f", psuTotal)
}
platEff := "—"
if step.TotalObservedPowerW > 0 {
eff := step.ServerDeltaW / step.TotalObservedPowerW
if idleW > 0 && step.ServerLoadedW > 0 {
eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW
}
platEff = fmt.Sprintf("%.2f", eff)
}
fan := "—"
if step.AvgFanRPM > 0 {
if step.AvgFanDutyCyclePct > 0 {
fan = fmt.Sprintf("%.0f (%.0f%%)", step.AvgFanRPM, step.AvgFanDutyCyclePct)
} else {
fan = fmt.Sprintf("%.0f", step.AvgFanRPM)
}
}
row = append(row, totalStr, platEff, fan)
psuRows = append(psuRows, row)
}
b.WriteString(fmtMDTable(psuHeaders, psuRows))
b.WriteString("\n")
}
}
// ── PSU Issues ──────────────────────────────────────────────────────────── // ── PSU Issues ────────────────────────────────────────────────────────────
if len(result.PSUIssues) > 0 { if len(result.PSUIssues) > 0 {
b.WriteString("## PSU Issues\n\n") b.WriteString("## PSU Issues\n\n")
@@ -3646,8 +3827,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
totalDefault += gpu.DefaultPowerLimitW totalDefault += gpu.DefaultPowerLimitW
totalStable += stable totalStable += stable
} }
b.WriteString("| GPU | Default TDP | Single-card limit | Stable limit | Realization | Derated |\n") var pdRows [][]string
b.WriteString("|-----|-------------|-------------------|--------------|-------------|----------|\n")
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
stable := gpu.StablePowerLimitW stable := gpu.StablePowerLimitW
if stable <= 0 { if stable <= 0 {
@@ -3661,15 +3841,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
if gpu.Derated { if gpu.Derated {
derated = "⚠ yes" derated = "⚠ yes"
} }
fmt.Fprintf(&b, "| GPU %d | %.0f W | %.0f W | %.0f W | %s | %s |\n", pdRows = append(pdRows, []string{
gpu.Index, gpu.DefaultPowerLimitW, gpu.AppliedPowerLimitW, stable, realization, derated) fmt.Sprintf("GPU %d", gpu.Index),
fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
fmt.Sprintf("%.0f W", stable),
realization,
derated,
})
} }
platformReal := "-" platformReal := "-"
if totalDefault > 0 && totalStable > 0 { if totalDefault > 0 && totalStable > 0 {
platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100) platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100)
} }
fmt.Fprintf(&b, "| **Platform** | **%.0f W** | — | **%.0f W** | **%s** | |\n\n", pdRows = append(pdRows, []string{
totalDefault, totalStable, platformReal) "**Platform**",
fmt.Sprintf("**%.0f W**", totalDefault),
"—",
fmt.Sprintf("**%.0f W**", totalStable),
fmt.Sprintf("**%s**", platformReal),
"",
})
b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
b.WriteString("\n")
// Balance across GPUs — only meaningful with 2+ GPUs. // Balance across GPUs — only meaningful with 2+ GPUs.
if len(result.GPUs) > 1 { if len(result.GPUs) > 1 {
@@ -3710,9 +3904,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
// Ramp scalability table — power efficiency of adding each GPU. // Ramp scalability table — power efficiency of adding each GPU.
if len(result.RampSteps) > 1 { if len(result.RampSteps) > 1 {
b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n") b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n")
b.WriteString("| Step | GPUs | Cumulative stable TDP | Incremental | Efficiency vs GPU 1 |\n")
b.WriteString("|------|------|-----------------------|-------------|---------------------|\n")
// First GPU stable TDP as the reference unit for efficiency.
var firstStable float64 var firstStable float64
if len(result.GPUs) > 0 { if len(result.GPUs) > 0 {
firstStable = result.GPUs[0].StablePowerLimitW firstStable = result.GPUs[0].StablePowerLimitW
@@ -3721,6 +3912,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
} }
} }
var prevCumulative float64 var prevCumulative float64
var scalRows [][]string
for _, step := range result.RampSteps { for _, step := range result.RampSteps {
var cumulative float64 var cumulative float64
for _, gpuIdx := range step.GPUIndices { for _, gpuIdx := range step.GPUIndices {
@@ -3740,40 +3932,104 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
if step.StepIndex > 1 && firstStable > 0 { if step.StepIndex > 1 && firstStable > 0 {
efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100) efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100)
} }
fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %s |\n", scalRows = append(scalRows, []string{
step.StepIndex, joinIndexList(step.GPUIndices), cumulative, incremental, efficiency) fmt.Sprintf("%d", step.StepIndex),
joinIndexList(step.GPUIndices),
fmt.Sprintf("%.0f W", cumulative),
fmt.Sprintf("%.0f W", incremental),
efficiency,
})
prevCumulative = cumulative prevCumulative = cumulative
} }
b.WriteString(fmtMDTable([]string{"Step", "GPUs", "Cumulative stable TDP", "Incremental", "Efficiency vs GPU 1"}, scalRows))
b.WriteString("\n") b.WriteString("\n")
} }
} }
b.WriteString("## Per-Slot Results\n\n") // ── Per-GPU sections ──────────────────────────────────────────────────────
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n") var lastStep *NvidiaPowerBenchStep
b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n") if n := len(result.RampSteps); n > 0 {
for _, gpu := range result.GPUs { lastStep = &result.RampSteps[n-1]
stableLimit := "-"
if gpu.StablePowerLimitW > 0 {
if gpu.Derated {
stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW)
} else {
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
}
}
serverDelta := "-"
if gpu.ServerDeltaW > 0 {
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
}
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
} }
b.WriteString("\n")
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name) fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
// Transposed comparison table: Single Run vs All GPU Run.
singleClk := "—"
singleMem := "—"
singleTemp := "—"
singlePwr := "—"
singleWall := "—"
singleFan := "—"
if gpu.Telemetry != nil {
singleClk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz)
singleMem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz)
singleTemp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC)
singlePwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW)
}
if gpu.ServerDeltaW > 0 {
singleWall = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
}
if gpu.AvgFanRPM > 0 {
if gpu.AvgFanDutyCyclePct > 0 {
singleFan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct)
} else {
singleFan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM)
}
}
allClk := "—"
allMem := "—"
allTemp := "—"
allPwr := "—"
allWall := "—"
allFan := "—"
if lastStep != nil {
if t, ok := lastStep.PerGPUTelemetry[gpu.Index]; ok && t != nil {
allClk = fmt.Sprintf("%.0f", t.AvgGraphicsClockMHz)
allMem = fmt.Sprintf("%.0f", t.AvgMemoryClockMHz)
allTemp = fmt.Sprintf("%.1f", t.AvgTempC)
allPwr = fmt.Sprintf("%.0f W", t.AvgPowerW)
}
if lastStep.ServerDeltaW > 0 && len(lastStep.GPUIndices) > 0 {
allWall = fmt.Sprintf("%.0f W", lastStep.ServerDeltaW/float64(len(lastStep.GPUIndices)))
}
if lastStep.AvgFanRPM > 0 {
if lastStep.AvgFanDutyCyclePct > 0 {
allFan = fmt.Sprintf("%.0f RPM (%.0f%%)", lastStep.AvgFanRPM, lastStep.AvgFanDutyCyclePct)
} else {
allFan = fmt.Sprintf("%.0f RPM", lastStep.AvgFanRPM)
}
}
}
tableHeaders := []string{"", "Single Run"}
if lastStep != nil {
tableHeaders = append(tableHeaders, "All GPU Run")
}
compRows := [][]string{
{"Clock MHz (Mem MHz)", fmt.Sprintf("%s (%s)", singleClk, singleMem)},
{"Avg Temp °C", singleTemp},
{"Power W", singlePwr},
{"Per GPU wall W", singleWall},
{"Fan RPM (duty%)", singleFan},
}
if lastStep != nil {
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
compRows[1] = append(compRows[1], allTemp)
compRows[2] = append(compRows[2], allPwr)
compRows[3] = append(compRows[3], allWall)
compRows[4] = append(compRows[4], allFan)
}
b.WriteString(fmtMDTable(tableHeaders, compRows))
b.WriteString("\n")
for _, note := range gpu.Notes { for _, note := range gpu.Notes {
fmt.Fprintf(&b, "- %s\n", note) fmt.Fprintf(&b, "- %s\n", note)
} }
b.WriteString("\n") if len(gpu.Notes) > 0 {
b.WriteString("\n")
}
} }
return b.String() return b.String()
} }
@@ -3860,7 +4116,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
OverallStatus: "OK", OverallStatus: "OK",
} }
durationSec := powerBenchDurationSec(opts.Profile) durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec
// Sample IPMI idle power before any GPU load. // Sample IPMI idle power before any GPU load.
var serverIdleW float64 var serverIdleW float64
@@ -3894,7 +4149,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ipmiSingleDone <- w ipmiSingleDone <- w
} }
}() }()
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil) c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0) appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
ipmiSingleCancel() ipmiSingleCancel()
if w, ok := <-ipmiSingleDone; ok { if w, ok := <-ipmiSingleDone; ok {
@@ -3947,6 +4202,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
t := summarizeBenchmarkTelemetry(calib.MetricRows) t := summarizeBenchmarkTelemetry(calib.MetricRows)
gpu.Telemetry = &t gpu.Telemetry = &t
} }
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
gpu.AvgFanRPM = meanFanRPM(fans)
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
gpu.AvgFanDutyCyclePct = duty
}
}
gpus = append(gpus, gpu) gpus = append(gpus, gpu)
} }
sort.Slice(gpus, func(i, j int) bool { sort.Slice(gpus, func(i, j int) bool {
@@ -4077,7 +4338,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ipmiStepDone <- w ipmiStepDone <- w
} }
}() }()
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep) stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0) appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
ipmiStepCancel() ipmiStepCancel()
var stepIPMILoadedW float64 var stepIPMILoadedW float64
@@ -4159,6 +4420,29 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
} }
} }
// Per-step PSU slot snapshot.
sdrStep := sampleIPMISDRPowerSensors()
if len(sdrStep.PSUSlots) > 0 {
ramp.PSUSlotReadings = sdrStep.PSUSlots
}
// Fan state at end of ramp step.
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
ramp.AvgFanRPM = meanFanRPM(fans)
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
ramp.AvgFanDutyCyclePct = duty
}
}
// Per-GPU telemetry from this ramp step's calibration.
ramp.PerGPUTelemetry = make(map[int]*BenchmarkTelemetrySummary, len(subset))
for _, gpuIdx := range subset {
if c, ok := stepCalib[gpuIdx]; ok {
s := c.Summary
ramp.PerGPUTelemetry[gpuIdx] = &s
}
}
result.RampSteps = append(result.RampSteps, ramp) result.RampSteps = append(result.RampSteps, ramp)
} }

View File

@@ -89,136 +89,159 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// Perspective 1: Compatibility — hard stops // Perspective 1: Compatibility — hard stops
b.WriteString("### 1. Compatibility\n\n") b.WriteString("### 1. Compatibility\n\n")
b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n") {
b.WriteString("|-----|------------------|----------------------|------------|--------|\n") var rows [][]string
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
thermalThrottle := "-" thermalThrottle := "-"
if gpu.Scores.ThermalThrottlePct > 0 { if gpu.Scores.ThermalThrottlePct > 0 {
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct) thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
}
fanAtThrottle := "-"
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
}
ecc := "-"
if gpu.ECC.Uncorrected > 0 {
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
}
compatStatus := "✓ OK"
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
compatStatus = "⛔ HARD STOP"
}
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
} }
fanAtThrottle := "-" b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 { b.WriteString("\n")
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
}
ecc := "-"
if gpu.ECC.Uncorrected > 0 {
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
}
compatStatus := "✓ OK"
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
compatStatus = "⛔ HARD STOP"
}
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus)
} }
b.WriteString("\n")
// Perspective 2: Thermal headroom // Perspective 2: Thermal headroom
b.WriteString("### 2. Thermal Headroom\n\n") b.WriteString("### 2. Thermal Headroom\n\n")
b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n") {
b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n") var rows [][]string
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
shutdownTemp := gpu.ShutdownTempC shutdownTemp := gpu.ShutdownTempC
if shutdownTemp <= 0 { if shutdownTemp <= 0 {
shutdownTemp = 90 shutdownTemp = 90
}
slowdownTemp := gpu.SlowdownTempC
if slowdownTemp <= 0 {
slowdownTemp = 80
}
headroom := gpu.Scores.TempHeadroomC
thermalStatus := "✓ OK"
switch {
case headroom < 10:
thermalStatus = "⛔ CRITICAL"
case gpu.Steady.P95TempC >= slowdownTemp:
thermalStatus = "⚠ WARNING"
}
throttlePct := "-"
if gpu.Scores.ThermalThrottlePct > 0 {
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
}
rows = append(rows, []string{
fmt.Sprintf("GPU %d", gpu.Index),
fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
fmt.Sprintf("%.0f°C", slowdownTemp),
fmt.Sprintf("%.0f°C", shutdownTemp),
fmt.Sprintf("%.1f°C", headroom),
throttlePct,
thermalStatus,
})
} }
slowdownTemp := gpu.SlowdownTempC b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
if slowdownTemp <= 0 { b.WriteString("\n")
slowdownTemp = 80
}
headroom := gpu.Scores.TempHeadroomC
thermalStatus := "✓ OK"
switch {
case headroom < 10:
thermalStatus = "⛔ CRITICAL"
case gpu.Steady.P95TempC >= slowdownTemp:
thermalStatus = "⚠ WARNING"
}
throttlePct := "-"
if gpu.Scores.ThermalThrottlePct > 0 {
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
}
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
} }
b.WriteString("\n")
// Perspective 3: Power delivery // Perspective 3: Power delivery
b.WriteString("### 3. Power Delivery\n\n") b.WriteString("### 3. Power Delivery\n\n")
b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n") {
b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n") var rows [][]string
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
powerCap := "-" powerCap := "-"
if gpu.Scores.PowerCapThrottlePct > 0 { if gpu.Scores.PowerCapThrottlePct > 0 {
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct) powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
}
fanDuty := "-"
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
}
powerStatus := "✓ OK"
if gpu.Scores.PowerCapThrottlePct > 5 {
powerStatus = "⚠ POWER LIMITED"
}
rows = append(rows, []string{
fmt.Sprintf("GPU %d", gpu.Index),
powerCap,
fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
fanDuty,
powerStatus,
})
} }
fanDuty := "-" b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable { b.WriteString("\n")
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
}
powerStatus := "✓ OK"
if gpu.Scores.PowerCapThrottlePct > 5 {
powerStatus = "⚠ POWER LIMITED"
}
fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n",
gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus)
} }
b.WriteString("\n")
// Perspective 4: Performance // Perspective 4: Performance
b.WriteString("### 4. Performance\n\n") b.WriteString("### 4. Performance\n\n")
b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n") {
b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n") var rows [][]string
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
synthetic := "-" synthetic := "-"
if gpu.Scores.SyntheticScore > 0 { if gpu.Scores.SyntheticScore > 0 {
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore) synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
}
mixed := "-"
if gpu.Scores.MixedScore > 0 {
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
}
mixedEff := "-"
if gpu.Scores.MixedEfficiency > 0 {
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
}
topsPerSM := "-"
if gpu.Scores.TOPSPerSMPerGHz > 0 {
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
}
rows = append(rows, []string{
fmt.Sprintf("GPU %d", gpu.Index),
fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
synthetic, mixed, mixedEff, topsPerSM,
})
} }
mixed := "-" b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
if gpu.Scores.MixedScore > 0 { if len(result.PerformanceRampSteps) > 0 {
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore) fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
} }
mixedEff := "-" b.WriteString("\n")
if gpu.Scores.MixedEfficiency > 0 {
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
}
topsPerSM := "-"
if gpu.Scores.TOPSPerSMPerGHz > 0 {
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
}
fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n",
gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
} }
if len(result.PerformanceRampSteps) > 0 {
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
}
b.WriteString("\n")
// Perspective 5: Anomaly flags // Perspective 5: Anomaly flags
b.WriteString("### 5. Anomalies\n\n") b.WriteString("### 5. Anomalies\n\n")
b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n") {
b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n") var rows [][]string
for _, gpu := range result.GPUs { for _, gpu := range result.GPUs {
eccCorr := "-" eccCorr := "-"
if gpu.ECC.Corrected > 0 { if gpu.ECC.Corrected > 0 {
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected) eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
}
syncBoost := "-"
if gpu.Scores.SyncBoostThrottlePct > 0 {
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
}
powerVar := "OK"
if gpu.Scores.PowerSustainScore < 70 {
powerVar = "⚠ unstable"
}
thermalVar := "OK"
if gpu.Scores.ThermalSustainScore < 70 {
thermalVar = "⚠ unstable"
}
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
} }
syncBoost := "-" b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
if gpu.Scores.SyncBoostThrottlePct > 0 { b.WriteString("\n")
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
}
powerVar := "OK"
if gpu.Scores.PowerSustainScore < 70 {
powerVar = "⚠ unstable"
}
thermalVar := "OK"
if gpu.Scores.ThermalSustainScore < 70 {
thermalVar = "⚠ unstable"
}
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
} }
b.WriteString("\n")
// ── Per GPU detail ──────────────────────────────────────────────────────── // ── Per GPU detail ────────────────────────────────────────────────────────
b.WriteString("## Per-GPU Details\n\n") b.WriteString("## Per-GPU Details\n\n")
@@ -263,12 +286,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// Steady-state telemetry // Steady-state telemetry
if benchmarkTelemetryAvailable(gpu.Steady) { if benchmarkTelemetryAvailable(gpu.Steady) {
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec)) fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
b.WriteString("| | Avg | P95 |\n|---|---|---|\n") b.WriteString(fmtMDTable(
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW) []string{"", "Avg", "P95"},
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC) [][]string{
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz) {"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz) {"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct) {"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
},
))
b.WriteString("\n") b.WriteString("\n")
} else { } else {
b.WriteString("**Steady-state telemetry:** unavailable\n\n") b.WriteString("**Steady-state telemetry:** unavailable\n\n")
@@ -277,7 +304,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// Per-precision stability phases. // Per-precision stability phases.
if len(gpu.PrecisionSteady) > 0 { if len(gpu.PrecisionSteady) > 0 {
b.WriteString("**Per-precision stability:**\n\n") b.WriteString("**Per-precision stability:**\n\n")
b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n") var precRows [][]string
for _, p := range gpu.PrecisionSteady { for _, p := range gpu.PrecisionSteady {
eccCorr := "—" eccCorr := "—"
eccUncorr := "—" eccUncorr := "—"
@@ -289,10 +316,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
if strings.TrimSpace(status) == "" { if strings.TrimSpace(status) == "" {
status = "OK" status = "OK"
} }
fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n", precRows = append(precRows, []string{
p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct, p.Precision, status,
eccCorr, eccUncorr) fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
eccCorr, eccUncorr,
})
} }
b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
b.WriteString("\n") b.WriteString("\n")
} else { } else {
// Legacy: show combined-window variance. // Legacy: show combined-window variance.
@@ -315,16 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
// Precision results // Precision results
if len(gpu.PrecisionResults) > 0 { if len(gpu.PrecisionResults) > 0 {
b.WriteString("**Precision results:**\n\n") b.WriteString("**Precision results:**\n\n")
b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n") var presRows [][]string
for _, p := range gpu.PrecisionResults { for _, p := range gpu.PrecisionResults {
if p.Supported { if p.Supported {
weightStr := fmt.Sprintf("×%.3g", p.Weight) presRows = append(presRows, []string{
fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n", p.Name,
p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations) fmt.Sprintf("%.2f", p.TeraOpsPerSec),
fmt.Sprintf("×%.3g", p.Weight),
fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
fmt.Sprintf("%d", p.Lanes),
fmt.Sprintf("%d", p.Iterations),
})
} else { } else {
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name) presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
} }
} }
b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
b.WriteString("\n") b.WriteString("\n")
} }
@@ -346,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
b.WriteString("## Interconnect (NCCL)\n\n") b.WriteString("## Interconnect (NCCL)\n\n")
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status) fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
if result.Interconnect.Supported { if result.Interconnect.Supported {
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n") b.WriteString(fmtMDTable(
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps) []string{"Metric", "Avg", "Max"},
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps) [][]string{
{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
},
))
b.WriteString("\n") b.WriteString("\n")
} }
for _, note := range result.Interconnect.Notes { for _, note := range result.Interconnect.Notes {
@@ -365,14 +407,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
if !sp.Available { if !sp.Available {
b.WriteString("IPMI power measurement unavailable.\n\n") b.WriteString("IPMI power measurement unavailable.\n\n")
} else { } else {
b.WriteString("| | Value |\n|---|---|\n") spRows := [][]string{
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW) {"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW) {"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
fmt.Fprintf(&b, "| Server delta (load idle) | %.0f W |\n", sp.DeltaW) {"Server delta (load idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW) {"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
if sp.ReportingRatio > 0 {
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
} }
if sp.ReportingRatio > 0 {
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
}
b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
b.WriteString("\n") b.WriteString("\n")
} }
for _, note := range sp.Notes { for _, note := range sp.Notes {
@@ -397,15 +441,19 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
if cooling := result.Cooling; cooling != nil { if cooling := result.Cooling; cooling != nil {
b.WriteString("## Cooling\n\n") b.WriteString("## Cooling\n\n")
if cooling.Available { if cooling.Available {
b.WriteString("| Metric | Value |\n|--------|-------|\n") dutyAvg, dutyP95 := "N/A", "N/A"
fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
if cooling.FanDutyCycleAvailable { if cooling.FanDutyCycleAvailable {
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct) dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct) dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
} else {
b.WriteString("| Average fan duty cycle | N/A |\n")
b.WriteString("| P95 fan duty cycle | N/A |\n")
} }
b.WriteString(fmtMDTable(
[]string{"Metric", "Value"},
[][]string{
{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
{"Average fan duty cycle", dutyAvg},
{"P95 fan duty cycle", dutyP95},
},
))
b.WriteString("\n") b.WriteString("\n")
} else { } else {
b.WriteString("Cooling telemetry unavailable.\n\n") b.WriteString("Cooling telemetry unavailable.\n\n")
@@ -422,12 +470,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
if len(result.PerformanceRampSteps) > 0 { if len(result.PerformanceRampSteps) > 0 {
b.WriteString("## Platform Scalability (Performance Ramp)\n\n") b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore) fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n") var scalRows [][]string
b.WriteString("|--------|-------------|----------------------|-------------|\n")
for _, step := range result.PerformanceRampSteps { for _, step := range result.PerformanceRampSteps {
fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n", scalRows = append(scalRows, []string{
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct) fmt.Sprintf("%d", step.StepIndex),
joinIndexList(step.GPUIndices),
fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
fmt.Sprintf("%.1f%%", step.ScalabilityPct),
})
} }
b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
b.WriteString("\n") b.WriteString("\n")
} }

View File

@@ -0,0 +1,75 @@
package platform
import (
	"strings"
	"unicode/utf8"
)
// fmtMDTable renders a markdown table with column widths padded so the table
// is readable as plain text without a markdown renderer.
//
// headers contains the column header strings.
// rows contains data rows; each row must have the same number of cells as headers.
// Cells with fewer entries than headers are treated as empty.
//
// Column widths are measured in runes, not bytes, so cells containing
// multi-byte UTF-8 (e.g. "°C", "✓ OK", "⛔", "—" used throughout the
// benchmark reports) still line up when viewed in a terminal.
func fmtMDTable(headers []string, rows [][]string) string {
	ncols := len(headers)
	if ncols == 0 {
		return ""
	}
	// cellAt returns the i-th cell of row, treating short rows as padded
	// with empty cells.
	cellAt := func(row []string, i int) string {
		if i < len(row) {
			return row[i]
		}
		return ""
	}
	// Compute the maximum display width (in runes) per column.
	widths := make([]int, ncols)
	for i, h := range headers {
		widths[i] = utf8.RuneCountInString(h)
	}
	for _, row := range rows {
		for i := 0; i < ncols; i++ {
			if w := utf8.RuneCountInString(cellAt(row, i)); w > widths[i] {
				widths[i] = w
			}
		}
	}
	var b strings.Builder
	// writeRow emits one padded table row; cells yields the i-th cell text.
	writeRow := func(cells func(i int) string) {
		b.WriteByte('|')
		for i := 0; i < ncols; i++ {
			c := cells(i)
			b.WriteByte(' ')
			b.WriteString(c)
			b.WriteString(strings.Repeat(" ", widths[i]-utf8.RuneCountInString(c)))
			b.WriteString(" |")
		}
		b.WriteByte('\n')
	}
	// Header row.
	writeRow(func(i int) string { return headers[i] })
	// Separator row: dashes spanning each padded column.
	b.WriteByte('|')
	for i := range headers {
		b.WriteString(strings.Repeat("-", widths[i]+2))
		b.WriteByte('|')
	}
	b.WriteByte('\n')
	// Data rows.
	for _, row := range rows {
		row := row
		writeRow(func(i int) string { return cellAt(row, i) })
	}
	return b.String()
}

View File

@@ -52,7 +52,7 @@ const (
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s // - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000) // - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s // - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
// - BenchmarkEstimatedPowerStabilitySec: xFusion v8.17/v8.22 ramp 1-8: 1977-2002 s // - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
const ( const (
// Performance Benchmark (bee-gpu-burn). // Performance Benchmark (bee-gpu-burn).
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run. // Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
@@ -64,7 +64,7 @@ const (
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration). // Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
// Duration is for the full ramp-up run; individual steps vary with convergence speed. // Duration is for the full ramp-up run; individual steps vary with convergence speed.
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
BenchmarkEstimatedPowerStabilitySec = 2000 // ~33 min; stability profile converges faster (longer steady → faster convergence) BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
BenchmarkEstimatedPowerOvernightSec = 3 * 3600 BenchmarkEstimatedPowerOvernightSec = 3 * 3600
) )
@@ -408,6 +408,9 @@ type NvidiaPowerBenchGPU struct {
// Telemetry holds the aggregated stats from the final converged calibration // Telemetry holds the aggregated stats from the final converged calibration
// attempt for this GPU (temperature, power, fan, clock percentiles). // attempt for this GPU (temperature, power, fan, clock percentiles).
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"` Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
// Fan state sampled at the end of single-card calibration.
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
} }
type NvidiaPowerBenchStep struct { type NvidiaPowerBenchStep struct {
@@ -426,6 +429,13 @@ type NvidiaPowerBenchStep struct {
// ramp step's calibration run. ServerDeltaW = ServerLoadedW idle. // ramp step's calibration run. ServerDeltaW = ServerLoadedW idle.
ServerLoadedW float64 `json:"server_loaded_w,omitempty"` ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
ServerDeltaW float64 `json:"server_delta_w,omitempty"` ServerDeltaW float64 `json:"server_delta_w,omitempty"`
// PSU slot readings sampled at end of this ramp step.
PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
// Fan state at end of this ramp step.
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
// Per-GPU telemetry from this step's calibration, keyed by GPU index.
PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
} }
// NvidiaPerformanceRampStep holds per-step performance data for the // NvidiaPerformanceRampStep holds per-step performance data for the