Rework Power Fit report: 90 min stability, aligned tables, PSU/fan sections
- Increase stability profile duration from 33 min to 90 min by wiring powerBenchDurationSec() into runBenchmarkPowerCalibration (was discarded) - Collect per-step PSU slot readings, fan RPM/duty, and per-GPU telemetry in ramp loop; add matching fields to NvidiaPowerBenchStep/NvidiaPowerBenchGPU - Rewrite renderPowerBenchReport: replace Per-Slot Results with Single GPU section, rework Ramp Sequence rows=runs/cols=GPUs, add PSU Performance section (conditional on IPMI data), add transposed Single vs All-GPU comparison table in per-GPU sections - Add fmtMDTable helper (benchmark_table.go) and apply to all tables in both power and performance reports so columns align in plain-text view Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3055,8 +3055,12 @@ func runBenchmarkPowerCalibration(
|
||||
infoByIndex map[int]benchmarkGPUInfo,
|
||||
logFunc func(string),
|
||||
seedLimits map[int]int,
|
||||
durationSec int,
|
||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
|
||||
const calibDurationSec = 120
|
||||
calibDurationSec := durationSec
|
||||
if calibDurationSec <= 0 {
|
||||
calibDurationSec = 120
|
||||
}
|
||||
const maxDerateW = 150
|
||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
||||
@@ -3436,6 +3440,18 @@ func roundTo5W(w int) int {
|
||||
return ((w + 2) / 5) * 5
|
||||
}
|
||||
|
||||
// meanFanRPM returns the average RPM across a set of fan readings.
|
||||
func meanFanRPM(fans []FanReading) float64 {
|
||||
if len(fans) == 0 {
|
||||
return 0
|
||||
}
|
||||
var sum float64
|
||||
for _, f := range fans {
|
||||
sum += f.RPM
|
||||
}
|
||||
return sum / float64(len(fans))
|
||||
}
|
||||
|
||||
func powerBenchDurationSec(profile string) int {
|
||||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||||
case NvidiaBenchmarkProfileStability:
|
||||
@@ -3475,30 +3491,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
// Server power comparison table.
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
b.WriteString("## Server vs GPU Power Comparison\n\n")
|
||||
b.WriteString("| Metric | Source | Value |\n")
|
||||
b.WriteString("|--------|--------|-------|\n")
|
||||
fmt.Fprintf(&b, "| GPU stable limits sum | nvidia-smi | %.0f W |\n", result.PlatformMaxTDPW)
|
||||
fmt.Fprintf(&b, "| GPU actual power sum (p95, last step) | nvidia-smi | %.0f W |\n", sp.GPUReportedSumW)
|
||||
var spRows [][]string
|
||||
spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)})
|
||||
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
|
||||
if sp.GPUSlotTotalW > 0 {
|
||||
fmt.Fprintf(&b, "| GPU PCIe slot power (at peak load) | IPMI SDR | %.0f W |\n", sp.GPUSlotTotalW)
|
||||
spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
|
||||
}
|
||||
if sp.Available {
|
||||
fmt.Fprintf(&b, "| Server idle power | IPMI DCMI | %.0f W |\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, "| Server loaded power | IPMI DCMI | %.0f W |\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | IPMI DCMI | %.0f W |\n", sp.DeltaW)
|
||||
spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)})
|
||||
spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)})
|
||||
spRows = append(spRows, []string{"Server Δ power (loaded − idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)})
|
||||
}
|
||||
if sp.PSUInputLoadedW > 0 {
|
||||
fmt.Fprintf(&b, "| PSU AC input (idle) | IPMI SDR | %.0f W |\n", sp.PSUInputIdleW)
|
||||
fmt.Fprintf(&b, "| PSU AC input (loaded) | IPMI SDR | %.0f W |\n", sp.PSUInputLoadedW)
|
||||
spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
|
||||
spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
|
||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||
fmt.Fprintf(&b, "| PSU AC input Δ (loaded − idle) | IPMI SDR | %.0f W |\n", psuDelta)
|
||||
spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)})
|
||||
}
|
||||
if sp.PSUOutputLoadedW > 0 {
|
||||
fmt.Fprintf(&b, "| PSU DC output (idle) | IPMI SDR | %.0f W |\n", sp.PSUOutputIdleW)
|
||||
fmt.Fprintf(&b, "| PSU DC output (loaded) | IPMI SDR | %.0f W |\n", sp.PSUOutputLoadedW)
|
||||
spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
|
||||
spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
|
||||
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
|
||||
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
|
||||
fmt.Fprintf(&b, "| PSU conversion efficiency (idle) | IPMI SDR | %.1f%% |\n", psuEff)
|
||||
spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
|
||||
}
|
||||
}
|
||||
if sp.Available {
|
||||
@@ -3516,7 +3531,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
default:
|
||||
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
|
||||
}
|
||||
fmt.Fprintf(&b, "| Reporting ratio (DCMI Δ / GPU actual) | IPMI DCMI | %.2f — %s |\n", ratio, ratioNote)
|
||||
spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
|
||||
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
|
||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||
sdrRatio := psuDelta / sp.GPUReportedSumW
|
||||
@@ -3529,11 +3544,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
default:
|
||||
sdrNote = "✗ significant discrepancy"
|
||||
}
|
||||
fmt.Fprintf(&b, "| Reporting ratio (SDR PSU Δ / GPU actual) | IPMI SDR | %.2f — %s |\n", sdrRatio, sdrNote)
|
||||
spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
|
||||
}
|
||||
} else {
|
||||
b.WriteString("| IPMI availability | — | not available — IPMI not supported or ipmitool not found |\n")
|
||||
spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, "\n> %s\n", note)
|
||||
}
|
||||
@@ -3541,10 +3557,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
|
||||
if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 {
|
||||
b.WriteString("## PSU Load Distribution\n\n")
|
||||
b.WriteString("| Slot | AC Input (idle) | AC Input (loaded) | DC Output (idle) | DC Output (loaded) | Load Δ | Status |\n")
|
||||
b.WriteString("|------|-----------------|-------------------|------------------|--------------------|--------|--------|\n")
|
||||
|
||||
// collect all slot keys
|
||||
slotSet := map[string]struct{}{}
|
||||
for k := range sp.PSUSlotReadingsIdle {
|
||||
slotSet[k] = struct{}{}
|
||||
@@ -3558,17 +3571,18 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
sort.Strings(slots)
|
||||
|
||||
fmtW := func(v *float64) string {
|
||||
if v == nil {
|
||||
return "—"
|
||||
}
|
||||
return fmt.Sprintf("%.0f W", *v)
|
||||
}
|
||||
|
||||
var psuDistRows [][]string
|
||||
for _, slot := range slots {
|
||||
idle := sp.PSUSlotReadingsIdle[slot]
|
||||
loaded := sp.PSUSlotReadingsLoaded[slot]
|
||||
|
||||
fmtW := func(v *float64) string {
|
||||
if v == nil {
|
||||
return "—"
|
||||
}
|
||||
return fmt.Sprintf("%.0f W", *v)
|
||||
}
|
||||
|
||||
var deltaStr string
|
||||
if idle.InputW != nil && loaded.InputW != nil {
|
||||
deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW)
|
||||
@@ -3584,13 +3598,14 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
status = "—"
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "| %s | %s | %s | %s | %s | %s | %s |\n",
|
||||
psuDistRows = append(psuDistRows, []string{
|
||||
slot,
|
||||
fmtW(idle.InputW), fmtW(loaded.InputW),
|
||||
fmtW(idle.OutputW), fmtW(loaded.OutputW),
|
||||
deltaStr, status,
|
||||
)
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
@@ -3602,28 +3617,194 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
if len(result.RecommendedSlotOrder) > 0 {
|
||||
b.WriteString("## Recommended Slot Order\n\n")
|
||||
fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
|
||||
}
|
||||
if len(result.RampSteps) > 0 {
|
||||
b.WriteString("## Ramp Sequence\n\n")
|
||||
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
|
||||
b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
|
||||
for _, step := range result.RampSteps {
|
||||
derated := "-"
|
||||
if step.Derated {
|
||||
derated = "⚠ yes"
|
||||
// ── Single GPU section ───────────────────────────────────────────────────
|
||||
b.WriteString("## Single GPU\n\n")
|
||||
{
|
||||
var sgRows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
clk := "—"
|
||||
mem := "—"
|
||||
temp := "—"
|
||||
pwr := "—"
|
||||
if gpu.Telemetry != nil {
|
||||
clk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz)
|
||||
mem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz)
|
||||
temp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC)
|
||||
pwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW)
|
||||
}
|
||||
serverDelta := "-"
|
||||
if step.ServerDeltaW > 0 {
|
||||
serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
|
||||
serverDelta := "—"
|
||||
if gpu.ServerDeltaW > 0 {
|
||||
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
|
||||
}
|
||||
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
|
||||
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
|
||||
fan := "—"
|
||||
if gpu.AvgFanRPM > 0 {
|
||||
if gpu.AvgFanDutyCyclePct > 0 {
|
||||
fan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct)
|
||||
} else {
|
||||
fan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM)
|
||||
}
|
||||
}
|
||||
sgRows = append(sgRows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("%s (%s)", clk, mem),
|
||||
temp,
|
||||
pwr,
|
||||
serverDelta,
|
||||
fan,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
if len(result.RecommendedSlotOrder) > 0 {
|
||||
fmt.Fprintf(&b, "Recommended slot order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
|
||||
}
|
||||
|
||||
// ── Ramp Sequence ────────────────────────────────────────────────────────
|
||||
// Rows = run number; Cols = per-GPU power (from step telemetry) + aggregates.
|
||||
if len(result.RampSteps) > 0 {
|
||||
b.WriteString("## Ramp Sequence\n\n")
|
||||
|
||||
// Collect all GPU indices that appear across all steps (ordered by first appearance).
|
||||
allGPUIndices := make([]int, 0, len(result.GPUs))
|
||||
seen := map[int]bool{}
|
||||
for _, step := range result.RampSteps {
|
||||
for _, idx := range step.GPUIndices {
|
||||
if !seen[idx] {
|
||||
seen[idx] = true
|
||||
allGPUIndices = append(allGPUIndices, idx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var idleW float64
|
||||
if result.ServerPower != nil {
|
||||
idleW = result.ServerPower.IdleW
|
||||
}
|
||||
|
||||
// Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff.
|
||||
headers := []string{"Run"}
|
||||
for _, idx := range allGPUIndices {
|
||||
headers = append(headers, fmt.Sprintf("GPU %d W", idx))
|
||||
}
|
||||
headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.")
|
||||
|
||||
var rampRows [][]string
|
||||
for _, step := range result.RampSteps {
|
||||
row := []string{fmt.Sprintf("%d", step.StepIndex)}
|
||||
for _, idx := range allGPUIndices {
|
||||
inStep := false
|
||||
for _, si := range step.GPUIndices {
|
||||
if si == idx {
|
||||
inStep = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !inStep {
|
||||
row = append(row, "—")
|
||||
continue
|
||||
}
|
||||
gpuPwr := "—"
|
||||
if t, ok := step.PerGPUTelemetry[idx]; ok && t != nil && t.AvgPowerW > 0 {
|
||||
gpuPwr = fmt.Sprintf("%.0f", t.AvgPowerW)
|
||||
}
|
||||
row = append(row, gpuPwr)
|
||||
}
|
||||
// Server wall W
|
||||
serverWall := "—"
|
||||
if step.ServerLoadedW > 0 {
|
||||
serverWall = fmt.Sprintf("%.0f", step.ServerLoadedW)
|
||||
}
|
||||
// Per GPU wall W = ServerDeltaW / len(GPUIndices)
|
||||
perGPUWall := "—"
|
||||
if step.ServerDeltaW > 0 && len(step.GPUIndices) > 0 {
|
||||
perGPUWall = fmt.Sprintf("%.0f", step.ServerDeltaW/float64(len(step.GPUIndices)))
|
||||
}
|
||||
// Platform eff. = (ServerLoadedW − idleW) / TotalObservedPowerW
|
||||
platEff := "—"
|
||||
if step.TotalObservedPowerW > 0 {
|
||||
eff := step.ServerDeltaW / step.TotalObservedPowerW
|
||||
if idleW > 0 && step.ServerLoadedW > 0 {
|
||||
eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW
|
||||
}
|
||||
platEff = fmt.Sprintf("%.2f", eff)
|
||||
}
|
||||
row = append(row, serverWall, perGPUWall, platEff)
|
||||
rampRows = append(rampRows, row)
|
||||
}
|
||||
b.WriteString(fmtMDTable(headers, rampRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// ── PSU Performance ───────────────────────────────────────────────────────
|
||||
{
|
||||
// Collect all PSU slot keys from any ramp step.
|
||||
psuSlotSet := map[string]struct{}{}
|
||||
for _, step := range result.RampSteps {
|
||||
for k := range step.PSUSlotReadings {
|
||||
psuSlotSet[k] = struct{}{}
|
||||
}
|
||||
}
|
||||
if len(psuSlotSet) > 0 {
|
||||
b.WriteString("## PSU Performance\n\n")
|
||||
psuSlots := make([]string, 0, len(psuSlotSet))
|
||||
for k := range psuSlotSet {
|
||||
psuSlots = append(psuSlots, k)
|
||||
}
|
||||
sort.Strings(psuSlots)
|
||||
|
||||
var idleW float64
|
||||
if result.ServerPower != nil {
|
||||
idleW = result.ServerPower.IdleW
|
||||
}
|
||||
|
||||
psuHeaders := []string{"Run"}
|
||||
for _, slot := range psuSlots {
|
||||
psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
|
||||
}
|
||||
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
|
||||
|
||||
var psuRows [][]string
|
||||
for _, step := range result.RampSteps {
|
||||
row := []string{fmt.Sprintf("%d", step.StepIndex)}
|
||||
var psuTotal float64
|
||||
for _, slot := range psuSlots {
|
||||
sp, ok := step.PSUSlotReadings[slot]
|
||||
if !ok || sp.InputW == nil {
|
||||
row = append(row, "—")
|
||||
continue
|
||||
}
|
||||
row = append(row, fmt.Sprintf("%.0f", *sp.InputW))
|
||||
psuTotal += *sp.InputW
|
||||
}
|
||||
totalStr := "—"
|
||||
if psuTotal > 0 {
|
||||
totalStr = fmt.Sprintf("%.0f", psuTotal)
|
||||
}
|
||||
platEff := "—"
|
||||
if step.TotalObservedPowerW > 0 {
|
||||
eff := step.ServerDeltaW / step.TotalObservedPowerW
|
||||
if idleW > 0 && step.ServerLoadedW > 0 {
|
||||
eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW
|
||||
}
|
||||
platEff = fmt.Sprintf("%.2f", eff)
|
||||
}
|
||||
fan := "—"
|
||||
if step.AvgFanRPM > 0 {
|
||||
if step.AvgFanDutyCyclePct > 0 {
|
||||
fan = fmt.Sprintf("%.0f (%.0f%%)", step.AvgFanRPM, step.AvgFanDutyCyclePct)
|
||||
} else {
|
||||
fan = fmt.Sprintf("%.0f", step.AvgFanRPM)
|
||||
}
|
||||
}
|
||||
row = append(row, totalStr, platEff, fan)
|
||||
psuRows = append(psuRows, row)
|
||||
}
|
||||
b.WriteString(fmtMDTable(psuHeaders, psuRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||||
if len(result.PSUIssues) > 0 {
|
||||
b.WriteString("## PSU Issues\n\n")
|
||||
@@ -3646,8 +3827,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
totalDefault += gpu.DefaultPowerLimitW
|
||||
totalStable += stable
|
||||
}
|
||||
b.WriteString("| GPU | Default TDP | Single-card limit | Stable limit | Realization | Derated |\n")
|
||||
b.WriteString("|-----|-------------|-------------------|--------------|-------------|----------|\n")
|
||||
var pdRows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
stable := gpu.StablePowerLimitW
|
||||
if stable <= 0 {
|
||||
@@ -3661,15 +3841,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
if gpu.Derated {
|
||||
derated = "⚠ yes"
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %.0f W | %.0f W | %.0f W | %s | %s |\n",
|
||||
gpu.Index, gpu.DefaultPowerLimitW, gpu.AppliedPowerLimitW, stable, realization, derated)
|
||||
pdRows = append(pdRows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
|
||||
fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
|
||||
fmt.Sprintf("%.0f W", stable),
|
||||
realization,
|
||||
derated,
|
||||
})
|
||||
}
|
||||
platformReal := "-"
|
||||
if totalDefault > 0 && totalStable > 0 {
|
||||
platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100)
|
||||
}
|
||||
fmt.Fprintf(&b, "| **Platform** | **%.0f W** | — | **%.0f W** | **%s** | |\n\n",
|
||||
totalDefault, totalStable, platformReal)
|
||||
pdRows = append(pdRows, []string{
|
||||
"**Platform**",
|
||||
fmt.Sprintf("**%.0f W**", totalDefault),
|
||||
"—",
|
||||
fmt.Sprintf("**%.0f W**", totalStable),
|
||||
fmt.Sprintf("**%s**", platformReal),
|
||||
"",
|
||||
})
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
|
||||
b.WriteString("\n")
|
||||
|
||||
// Balance across GPUs — only meaningful with 2+ GPUs.
|
||||
if len(result.GPUs) > 1 {
|
||||
@@ -3710,9 +3904,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
// Ramp scalability table — power efficiency of adding each GPU.
|
||||
if len(result.RampSteps) > 1 {
|
||||
b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n")
|
||||
b.WriteString("| Step | GPUs | Cumulative stable TDP | Incremental | Efficiency vs GPU 1 |\n")
|
||||
b.WriteString("|------|------|-----------------------|-------------|---------------------|\n")
|
||||
// First GPU stable TDP as the reference unit for efficiency.
|
||||
var firstStable float64
|
||||
if len(result.GPUs) > 0 {
|
||||
firstStable = result.GPUs[0].StablePowerLimitW
|
||||
@@ -3721,6 +3912,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
}
|
||||
var prevCumulative float64
|
||||
var scalRows [][]string
|
||||
for _, step := range result.RampSteps {
|
||||
var cumulative float64
|
||||
for _, gpuIdx := range step.GPUIndices {
|
||||
@@ -3740,40 +3932,104 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
if step.StepIndex > 1 && firstStable > 0 {
|
||||
efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100)
|
||||
}
|
||||
fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %s |\n",
|
||||
step.StepIndex, joinIndexList(step.GPUIndices), cumulative, incremental, efficiency)
|
||||
scalRows = append(scalRows, []string{
|
||||
fmt.Sprintf("%d", step.StepIndex),
|
||||
joinIndexList(step.GPUIndices),
|
||||
fmt.Sprintf("%.0f W", cumulative),
|
||||
fmt.Sprintf("%.0f W", incremental),
|
||||
efficiency,
|
||||
})
|
||||
prevCumulative = cumulative
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Step", "GPUs", "Cumulative stable TDP", "Incremental", "Efficiency vs GPU 1"}, scalRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
b.WriteString("## Per-Slot Results\n\n")
|
||||
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
|
||||
b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
stableLimit := "-"
|
||||
if gpu.StablePowerLimitW > 0 {
|
||||
if gpu.Derated {
|
||||
stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW)
|
||||
} else {
|
||||
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
|
||||
}
|
||||
}
|
||||
serverDelta := "-"
|
||||
if gpu.ServerDeltaW > 0 {
|
||||
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
|
||||
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
|
||||
// ── Per-GPU sections ──────────────────────────────────────────────────────
|
||||
var lastStep *NvidiaPowerBenchStep
|
||||
if n := len(result.RampSteps); n > 0 {
|
||||
lastStep = &result.RampSteps[n-1]
|
||||
}
|
||||
b.WriteString("\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
|
||||
|
||||
// Transposed comparison table: Single Run vs All GPU Run.
|
||||
singleClk := "—"
|
||||
singleMem := "—"
|
||||
singleTemp := "—"
|
||||
singlePwr := "—"
|
||||
singleWall := "—"
|
||||
singleFan := "—"
|
||||
if gpu.Telemetry != nil {
|
||||
singleClk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz)
|
||||
singleMem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz)
|
||||
singleTemp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC)
|
||||
singlePwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW)
|
||||
}
|
||||
if gpu.ServerDeltaW > 0 {
|
||||
singleWall = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
|
||||
}
|
||||
if gpu.AvgFanRPM > 0 {
|
||||
if gpu.AvgFanDutyCyclePct > 0 {
|
||||
singleFan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct)
|
||||
} else {
|
||||
singleFan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM)
|
||||
}
|
||||
}
|
||||
|
||||
allClk := "—"
|
||||
allMem := "—"
|
||||
allTemp := "—"
|
||||
allPwr := "—"
|
||||
allWall := "—"
|
||||
allFan := "—"
|
||||
if lastStep != nil {
|
||||
if t, ok := lastStep.PerGPUTelemetry[gpu.Index]; ok && t != nil {
|
||||
allClk = fmt.Sprintf("%.0f", t.AvgGraphicsClockMHz)
|
||||
allMem = fmt.Sprintf("%.0f", t.AvgMemoryClockMHz)
|
||||
allTemp = fmt.Sprintf("%.1f", t.AvgTempC)
|
||||
allPwr = fmt.Sprintf("%.0f W", t.AvgPowerW)
|
||||
}
|
||||
if lastStep.ServerDeltaW > 0 && len(lastStep.GPUIndices) > 0 {
|
||||
allWall = fmt.Sprintf("%.0f W", lastStep.ServerDeltaW/float64(len(lastStep.GPUIndices)))
|
||||
}
|
||||
if lastStep.AvgFanRPM > 0 {
|
||||
if lastStep.AvgFanDutyCyclePct > 0 {
|
||||
allFan = fmt.Sprintf("%.0f RPM (%.0f%%)", lastStep.AvgFanRPM, lastStep.AvgFanDutyCyclePct)
|
||||
} else {
|
||||
allFan = fmt.Sprintf("%.0f RPM", lastStep.AvgFanRPM)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tableHeaders := []string{"", "Single Run"}
|
||||
if lastStep != nil {
|
||||
tableHeaders = append(tableHeaders, "All GPU Run")
|
||||
}
|
||||
compRows := [][]string{
|
||||
{"Clock MHz (Mem MHz)", fmt.Sprintf("%s (%s)", singleClk, singleMem)},
|
||||
{"Avg Temp °C", singleTemp},
|
||||
{"Power W", singlePwr},
|
||||
{"Per GPU wall W", singleWall},
|
||||
{"Fan RPM (duty%)", singleFan},
|
||||
}
|
||||
if lastStep != nil {
|
||||
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
|
||||
compRows[1] = append(compRows[1], allTemp)
|
||||
compRows[2] = append(compRows[2], allPwr)
|
||||
compRows[3] = append(compRows[3], allWall)
|
||||
compRows[4] = append(compRows[4], allFan)
|
||||
}
|
||||
b.WriteString(fmtMDTable(tableHeaders, compRows))
|
||||
b.WriteString("\n")
|
||||
|
||||
for _, note := range gpu.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
if len(gpu.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
@@ -3860,7 +4116,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
OverallStatus: "OK",
|
||||
}
|
||||
durationSec := powerBenchDurationSec(opts.Profile)
|
||||
_ = durationSec
|
||||
|
||||
// Sample IPMI idle power before any GPU load.
|
||||
var serverIdleW float64
|
||||
@@ -3894,7 +4149,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
ipmiSingleDone <- w
|
||||
}
|
||||
}()
|
||||
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
|
||||
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
||||
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||
ipmiSingleCancel()
|
||||
if w, ok := <-ipmiSingleDone; ok {
|
||||
@@ -3947,6 +4202,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
t := summarizeBenchmarkTelemetry(calib.MetricRows)
|
||||
gpu.Telemetry = &t
|
||||
}
|
||||
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
||||
gpu.AvgFanRPM = meanFanRPM(fans)
|
||||
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
|
||||
gpu.AvgFanDutyCyclePct = duty
|
||||
}
|
||||
}
|
||||
gpus = append(gpus, gpu)
|
||||
}
|
||||
sort.Slice(gpus, func(i, j int) bool {
|
||||
@@ -4077,7 +4338,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
ipmiStepDone <- w
|
||||
}
|
||||
}()
|
||||
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
|
||||
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
||||
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||
ipmiStepCancel()
|
||||
var stepIPMILoadedW float64
|
||||
@@ -4159,6 +4420,29 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
}
|
||||
}
|
||||
|
||||
// Per-step PSU slot snapshot.
|
||||
sdrStep := sampleIPMISDRPowerSensors()
|
||||
if len(sdrStep.PSUSlots) > 0 {
|
||||
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
||||
}
|
||||
|
||||
// Fan state at end of ramp step.
|
||||
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
||||
ramp.AvgFanRPM = meanFanRPM(fans)
|
||||
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
|
||||
ramp.AvgFanDutyCyclePct = duty
|
||||
}
|
||||
}
|
||||
|
||||
// Per-GPU telemetry from this ramp step's calibration.
|
||||
ramp.PerGPUTelemetry = make(map[int]*BenchmarkTelemetrySummary, len(subset))
|
||||
for _, gpuIdx := range subset {
|
||||
if c, ok := stepCalib[gpuIdx]; ok {
|
||||
s := c.Summary
|
||||
ramp.PerGPUTelemetry[gpuIdx] = &s
|
||||
}
|
||||
}
|
||||
|
||||
result.RampSteps = append(result.RampSteps, ramp)
|
||||
}
|
||||
|
||||
|
||||
@@ -89,136 +89,159 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
|
||||
// Perspective 1: Compatibility — hard stops
|
||||
b.WriteString("### 1. Compatibility\n\n")
|
||||
b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n")
|
||||
b.WriteString("|-----|------------------|----------------------|------------|--------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
thermalThrottle := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
thermalThrottle := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
fanAtThrottle := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||||
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
ecc := "-"
|
||||
if gpu.ECC.Uncorrected > 0 {
|
||||
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||||
}
|
||||
compatStatus := "✓ OK"
|
||||
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||||
compatStatus = "⛔ HARD STOP"
|
||||
}
|
||||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
|
||||
}
|
||||
fanAtThrottle := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||||
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
ecc := "-"
|
||||
if gpu.ECC.Uncorrected > 0 {
|
||||
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||||
}
|
||||
compatStatus := "✓ OK"
|
||||
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||||
compatStatus = "⛔ HARD STOP"
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
||||
gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus)
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 2: Thermal headroom
|
||||
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||
b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
|
||||
b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
shutdownTemp := gpu.ShutdownTempC
|
||||
if shutdownTemp <= 0 {
|
||||
shutdownTemp = 90
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
shutdownTemp := gpu.ShutdownTempC
|
||||
if shutdownTemp <= 0 {
|
||||
shutdownTemp = 90
|
||||
}
|
||||
slowdownTemp := gpu.SlowdownTempC
|
||||
if slowdownTemp <= 0 {
|
||||
slowdownTemp = 80
|
||||
}
|
||||
headroom := gpu.Scores.TempHeadroomC
|
||||
thermalStatus := "✓ OK"
|
||||
switch {
|
||||
case headroom < 10:
|
||||
thermalStatus = "⛔ CRITICAL"
|
||||
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||
thermalStatus = "⚠ WARNING"
|
||||
}
|
||||
throttlePct := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
|
||||
fmt.Sprintf("%.0f°C", slowdownTemp),
|
||||
fmt.Sprintf("%.0f°C", shutdownTemp),
|
||||
fmt.Sprintf("%.1f°C", headroom),
|
||||
throttlePct,
|
||||
thermalStatus,
|
||||
})
|
||||
}
|
||||
slowdownTemp := gpu.SlowdownTempC
|
||||
if slowdownTemp <= 0 {
|
||||
slowdownTemp = 80
|
||||
}
|
||||
headroom := gpu.Scores.TempHeadroomC
|
||||
thermalStatus := "✓ OK"
|
||||
switch {
|
||||
case headroom < 10:
|
||||
thermalStatus = "⛔ CRITICAL"
|
||||
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||
thermalStatus = "⚠ WARNING"
|
||||
}
|
||||
throttlePct := "-"
|
||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
|
||||
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 3: Power delivery
|
||||
b.WriteString("### 3. Power Delivery\n\n")
|
||||
b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n")
|
||||
b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
powerCap := "-"
|
||||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
powerCap := "-"
|
||||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||||
}
|
||||
fanDuty := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||||
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
powerStatus := "✓ OK"
|
||||
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||||
powerStatus = "⚠ POWER LIMITED"
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
powerCap,
|
||||
fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
|
||||
fanDuty,
|
||||
powerStatus,
|
||||
})
|
||||
}
|
||||
fanDuty := "-"
|
||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||||
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
powerStatus := "✓ OK"
|
||||
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||||
powerStatus = "⚠ POWER LIMITED"
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n",
|
||||
gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus)
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 4: Performance
|
||||
b.WriteString("### 4. Performance\n\n")
|
||||
b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
|
||||
b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
synthetic := "-"
|
||||
if gpu.Scores.SyntheticScore > 0 {
|
||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
synthetic := "-"
|
||||
if gpu.Scores.SyntheticScore > 0 {
|
||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||
}
|
||||
mixed := "-"
|
||||
if gpu.Scores.MixedScore > 0 {
|
||||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||
}
|
||||
mixedEff := "-"
|
||||
if gpu.Scores.MixedEfficiency > 0 {
|
||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||
}
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
rows = append(rows, []string{
|
||||
fmt.Sprintf("GPU %d", gpu.Index),
|
||||
fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
|
||||
synthetic, mixed, mixedEff, topsPerSM,
|
||||
})
|
||||
}
|
||||
mixed := "-"
|
||||
if gpu.Scores.MixedScore > 0 {
|
||||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||
}
|
||||
mixedEff := "-"
|
||||
if gpu.Scores.MixedEfficiency > 0 {
|
||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||
}
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n",
|
||||
gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
|
||||
b.WriteString("\n")
|
||||
}
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Perspective 5: Anomaly flags
|
||||
b.WriteString("### 5. Anomalies\n\n")
|
||||
b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n")
|
||||
b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
eccCorr := "-"
|
||||
if gpu.ECC.Corrected > 0 {
|
||||
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||||
{
|
||||
var rows [][]string
|
||||
for _, gpu := range result.GPUs {
|
||||
eccCorr := "-"
|
||||
if gpu.ECC.Corrected > 0 {
|
||||
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||||
}
|
||||
syncBoost := "-"
|
||||
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||||
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||||
}
|
||||
powerVar := "OK"
|
||||
if gpu.Scores.PowerSustainScore < 70 {
|
||||
powerVar = "⚠ unstable"
|
||||
}
|
||||
thermalVar := "OK"
|
||||
if gpu.Scores.ThermalSustainScore < 70 {
|
||||
thermalVar = "⚠ unstable"
|
||||
}
|
||||
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
|
||||
}
|
||||
syncBoost := "-"
|
||||
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||||
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||||
}
|
||||
powerVar := "OK"
|
||||
if gpu.Scores.PowerSustainScore < 70 {
|
||||
powerVar = "⚠ unstable"
|
||||
}
|
||||
thermalVar := "OK"
|
||||
if gpu.Scores.ThermalSustainScore < 70 {
|
||||
thermalVar = "⚠ unstable"
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
||||
gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
|
||||
b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||
b.WriteString("## Per-GPU Details\n\n")
|
||||
@@ -263,12 +286,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
// Steady-state telemetry
|
||||
if benchmarkTelemetryAvailable(gpu.Steady) {
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"", "Avg", "P95"},
|
||||
[][]string{
|
||||
{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
|
||||
{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
|
||||
{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
|
||||
{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
|
||||
{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
||||
@@ -277,7 +304,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
// Per-precision stability phases.
|
||||
if len(gpu.PrecisionSteady) > 0 {
|
||||
b.WriteString("**Per-precision stability:**\n\n")
|
||||
b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
|
||||
var precRows [][]string
|
||||
for _, p := range gpu.PrecisionSteady {
|
||||
eccCorr := "—"
|
||||
eccUncorr := "—"
|
||||
@@ -289,10 +316,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
if strings.TrimSpace(status) == "" {
|
||||
status = "OK"
|
||||
}
|
||||
fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
|
||||
p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
|
||||
eccCorr, eccUncorr)
|
||||
precRows = append(precRows, []string{
|
||||
p.Precision, status,
|
||||
fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
|
||||
fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
|
||||
fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
|
||||
eccCorr, eccUncorr,
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
// Legacy: show combined-window variance.
|
||||
@@ -315,16 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
// Precision results
|
||||
if len(gpu.PrecisionResults) > 0 {
|
||||
b.WriteString("**Precision results:**\n\n")
|
||||
b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
|
||||
var presRows [][]string
|
||||
for _, p := range gpu.PrecisionResults {
|
||||
if p.Supported {
|
||||
weightStr := fmt.Sprintf("×%.3g", p.Weight)
|
||||
fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
|
||||
p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
|
||||
presRows = append(presRows, []string{
|
||||
p.Name,
|
||||
fmt.Sprintf("%.2f", p.TeraOpsPerSec),
|
||||
fmt.Sprintf("×%.3g", p.Weight),
|
||||
fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
|
||||
fmt.Sprintf("%d", p.Lanes),
|
||||
fmt.Sprintf("%d", p.Iterations),
|
||||
})
|
||||
} else {
|
||||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
|
||||
presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
|
||||
}
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
@@ -346,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||
if result.Interconnect.Supported {
|
||||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
||||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
||||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"Metric", "Avg", "Max"},
|
||||
[][]string{
|
||||
{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
|
||||
{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Interconnect.Notes {
|
||||
@@ -365,14 +407,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
if !sp.Available {
|
||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||
} else {
|
||||
b.WriteString("| | Value |\n|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
||||
if sp.ReportingRatio > 0 {
|
||||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
||||
spRows := [][]string{
|
||||
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||||
{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
|
||||
{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
|
||||
{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
|
||||
}
|
||||
if sp.ReportingRatio > 0 {
|
||||
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
@@ -397,15 +441,19 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
if cooling := result.Cooling; cooling != nil {
|
||||
b.WriteString("## Cooling\n\n")
|
||||
if cooling.Available {
|
||||
b.WriteString("| Metric | Value |\n|--------|-------|\n")
|
||||
fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
|
||||
dutyAvg, dutyP95 := "N/A", "N/A"
|
||||
if cooling.FanDutyCycleAvailable {
|
||||
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
|
||||
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
|
||||
} else {
|
||||
b.WriteString("| Average fan duty cycle | N/A |\n")
|
||||
b.WriteString("| P95 fan duty cycle | N/A |\n")
|
||||
dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
|
||||
dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
|
||||
}
|
||||
b.WriteString(fmtMDTable(
|
||||
[]string{"Metric", "Value"},
|
||||
[][]string{
|
||||
{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
|
||||
{"Average fan duty cycle", dutyAvg},
|
||||
{"P95 fan duty cycle", dutyP95},
|
||||
},
|
||||
))
|
||||
b.WriteString("\n")
|
||||
} else {
|
||||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||||
@@ -422,12 +470,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
||||
if len(result.PerformanceRampSteps) > 0 {
|
||||
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||||
b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n")
|
||||
b.WriteString("|--------|-------------|----------------------|-------------|\n")
|
||||
var scalRows [][]string
|
||||
for _, step := range result.PerformanceRampSteps {
|
||||
fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n",
|
||||
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct)
|
||||
scalRows = append(scalRows, []string{
|
||||
fmt.Sprintf("%d", step.StepIndex),
|
||||
joinIndexList(step.GPUIndices),
|
||||
fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
|
||||
fmt.Sprintf("%.1f%%", step.ScalabilityPct),
|
||||
})
|
||||
}
|
||||
b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
|
||||
75
audit/internal/platform/benchmark_table.go
Normal file
75
audit/internal/platform/benchmark_table.go
Normal file
@@ -0,0 +1,75 @@
|
||||
package platform
|
||||
|
||||
import (
	"strings"
	"unicode/utf8"
)
|
||||
|
||||
// fmtMDTable renders a markdown table whose columns are padded to a common
// width so the table stays readable as plain text, without a markdown
// renderer.
//
// headers supplies the column titles; rows supplies the data cells. Rows
// shorter than headers are padded with empty cells; cells beyond
// len(headers) are ignored. An empty headers slice yields "".
//
// Widths are measured in runes rather than bytes so multi-byte cells such
// as "°C", "✓ OK" or "—" do not skew the padding. (Rune count is still an
// approximation of terminal display width: double-width emoji/CJK glyphs
// may misalign by one column.)
func fmtMDTable(headers []string, rows [][]string) string {
	ncols := len(headers)
	if ncols == 0 {
		return ""
	}

	// Rune count, not len(): byte length over-counts multi-byte UTF-8 and
	// would misalign the symbol-heavy report cells this helper formats.
	width := utf8.RuneCountInString

	// Compute the maximum width of each column across header and data cells.
	widths := make([]int, ncols)
	for i, h := range headers {
		widths[i] = width(h)
	}
	for _, row := range rows {
		for i := 0; i < ncols; i++ {
			if i < len(row) {
				if w := width(row[i]); w > widths[i] {
					widths[i] = w
				}
			}
		}
	}

	// writeCell emits " <cell><padding> |" for one column.
	writeCell := func(b *strings.Builder, s string, w int) {
		b.WriteByte(' ')
		b.WriteString(s)
		b.WriteString(strings.Repeat(" ", w-width(s)))
		b.WriteString(" |")
	}

	var b strings.Builder

	// Header row.
	b.WriteByte('|')
	for i, h := range headers {
		writeCell(&b, h, widths[i])
	}
	b.WriteByte('\n')

	// Separator row: dashes spanning the cell width plus its two pad spaces.
	b.WriteByte('|')
	for i := range headers {
		b.WriteString(strings.Repeat("-", widths[i]+2))
		b.WriteByte('|')
	}
	b.WriteByte('\n')

	// Data rows; missing trailing cells render as empty strings.
	for _, row := range rows {
		b.WriteByte('|')
		for i := 0; i < ncols; i++ {
			cell := ""
			if i < len(row) {
				cell = row[i]
			}
			writeCell(&b, cell, widths[i])
		}
		b.WriteByte('\n')
	}

	return b.String()
}
|
||||
@@ -52,7 +52,7 @@ const (
|
||||
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
|
||||
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
|
||||
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
|
||||
// - BenchmarkEstimatedPowerStabilitySec: xFusion v8.17/v8.22 ramp 1-8: 1977-2002 s
|
||||
// - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
|
||||
const (
|
||||
// Performance Benchmark (bee-gpu-burn).
|
||||
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
|
||||
@@ -64,7 +64,7 @@ const (
|
||||
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
|
||||
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
||||
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
||||
BenchmarkEstimatedPowerStabilitySec = 2000 // ~33 min; stability profile converges faster (longer steady → faster convergence)
|
||||
BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
|
||||
BenchmarkEstimatedPowerOvernightSec = 3 * 3600
|
||||
)
|
||||
|
||||
@@ -408,6 +408,9 @@ type NvidiaPowerBenchGPU struct {
|
||||
// Telemetry holds the aggregated stats from the final converged calibration
|
||||
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||
// Fan state sampled at the end of single-card calibration.
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
}
|
||||
|
||||
type NvidiaPowerBenchStep struct {
|
||||
@@ -426,6 +429,13 @@ type NvidiaPowerBenchStep struct {
|
||||
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||
// PSU slot readings sampled at end of this ramp step.
|
||||
PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
|
||||
// Fan state at end of this ramp step.
|
||||
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||
// Per-GPU telemetry from this step's calibration, keyed by GPU index.
|
||||
PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
|
||||
}
|
||||
|
||||
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||
|
||||
Reference in New Issue
Block a user