Rework Power Fit report: 90 min stability, aligned tables, PSU/fan sections
- Increase stability profile duration from 33 min to 90 min by wiring powerBenchDurationSec() into runBenchmarkPowerCalibration (its result was previously discarded)
- Collect per-step PSU slot readings, fan RPM/duty, and per-GPU telemetry in the ramp loop; add matching fields to NvidiaPowerBenchStep/NvidiaPowerBenchGPU
- Rewrite renderPowerBenchReport: replace Per-Slot Results with a Single GPU section, rework Ramp Sequence to rows=runs/cols=GPUs, add a PSU Performance section (conditional on IPMI data), add a transposed Single vs All-GPU comparison table in the per-GPU sections
- Add fmtMDTable helper (benchmark_table.go) and apply it to all tables in both the power and performance reports so columns align in plain-text view

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
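The fmtMDTable helper named above lives in benchmark_table.go, which is outside this diff; only its call sites appear below. A minimal sketch of a column-aligning implementation, assuming rune-width padding (the body, package name, and padding strategy are guesses; only the `(headers, rows) string` shape is visible from the call sites):

```go
package bench // hypothetical package name; the real helper lives in benchmark_table.go

import (
	"strings"
	"unicode/utf8"
)

// fmtMDTable renders a Markdown table with each column padded to its widest
// cell, so the table reads aligned even in plain-text output.
func fmtMDTable(headers []string, rows [][]string) string {
	widths := make([]int, len(headers))
	for i, h := range headers {
		widths[i] = utf8.RuneCountInString(h)
	}
	for _, row := range rows {
		for i, cell := range row {
			if i < len(widths) && utf8.RuneCountInString(cell) > widths[i] {
				widths[i] = utf8.RuneCountInString(cell)
			}
		}
	}
	pad := func(s string, w int) string {
		return s + strings.Repeat(" ", w-utf8.RuneCountInString(s))
	}
	var b strings.Builder
	writeRow := func(cells []string) {
		b.WriteString("|")
		for i, w := range widths {
			cell := ""
			if i < len(cells) {
				cell = cells[i]
			}
			b.WriteString(" " + pad(cell, w) + " |")
		}
		b.WriteString("\n")
	}
	writeRow(headers)
	b.WriteString("|")
	for _, w := range widths {
		b.WriteString(strings.Repeat("-", w+2) + "|")
	}
	b.WriteString("\n")
	for _, row := range rows {
		writeRow(row)
	}
	return b.String()
}
```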
@@ -3055,8 +3055,12 @@ func runBenchmarkPowerCalibration(
 	infoByIndex map[int]benchmarkGPUInfo,
 	logFunc func(string),
 	seedLimits map[int]int,
+	durationSec int,
 ) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
-	const calibDurationSec = 120
+	calibDurationSec := durationSec
+	if calibDurationSec <= 0 {
+		calibDurationSec = 120
+	}
 	const maxDerateW = 150
 	// calibSearchTolerance is the binary-search convergence threshold in watts.
 	// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
@@ -3436,6 +3440,18 @@ func roundTo5W(w int) int {
 	return ((w + 2) / 5) * 5
 }
 
+// meanFanRPM returns the average RPM across a set of fan readings.
+func meanFanRPM(fans []FanReading) float64 {
+	if len(fans) == 0 {
+		return 0
+	}
+	var sum float64
+	for _, f := range fans {
+		sum += f.RPM
+	}
+	return sum / float64(len(fans))
+}
+
 func powerBenchDurationSec(profile string) int {
 	switch strings.TrimSpace(strings.ToLower(profile)) {
 	case NvidiaBenchmarkProfileStability:
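The hunk above is cut off after the stability case. A plausible completion of powerBenchDurationSec, assuming only what the commit message states (the stability profile now runs 90 minutes) plus the `<= 0` fallback guard added to runBenchmarkPowerCalibration; the default branch is a guess:

```go
func powerBenchDurationSec(profile string) int {
	switch strings.TrimSpace(strings.ToLower(profile)) {
	case NvidiaBenchmarkProfileStability:
		return 90 * 60 // 90-minute soak, up from the previous 33 min
	default:
		return 0 // callers fall back to the 120 s calibration default
	}
}
```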
@@ -3475,30 +3491,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	// Server power comparison table.
 	if sp := result.ServerPower; sp != nil {
 		b.WriteString("## Server vs GPU Power Comparison\n\n")
-		b.WriteString("| Metric | Source | Value |\n")
-		b.WriteString("|--------|--------|-------|\n")
-		fmt.Fprintf(&b, "| GPU stable limits sum | nvidia-smi | %.0f W |\n", result.PlatformMaxTDPW)
-		fmt.Fprintf(&b, "| GPU actual power sum (p95, last step) | nvidia-smi | %.0f W |\n", sp.GPUReportedSumW)
+		var spRows [][]string
+		spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)})
+		spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
 		if sp.GPUSlotTotalW > 0 {
-			fmt.Fprintf(&b, "| GPU PCIe slot power (at peak load) | IPMI SDR | %.0f W |\n", sp.GPUSlotTotalW)
+			spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
 		}
 		if sp.Available {
-			fmt.Fprintf(&b, "| Server idle power | IPMI DCMI | %.0f W |\n", sp.IdleW)
-			fmt.Fprintf(&b, "| Server loaded power | IPMI DCMI | %.0f W |\n", sp.LoadedW)
-			fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | IPMI DCMI | %.0f W |\n", sp.DeltaW)
+			spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)})
+			spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)})
+			spRows = append(spRows, []string{"Server Δ power (loaded − idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)})
 		}
 		if sp.PSUInputLoadedW > 0 {
-			fmt.Fprintf(&b, "| PSU AC input (idle) | IPMI SDR | %.0f W |\n", sp.PSUInputIdleW)
-			fmt.Fprintf(&b, "| PSU AC input (loaded) | IPMI SDR | %.0f W |\n", sp.PSUInputLoadedW)
+			spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
+			spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
 			psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
-			fmt.Fprintf(&b, "| PSU AC input Δ (loaded − idle) | IPMI SDR | %.0f W |\n", psuDelta)
+			spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)})
 		}
 		if sp.PSUOutputLoadedW > 0 {
-			fmt.Fprintf(&b, "| PSU DC output (idle) | IPMI SDR | %.0f W |\n", sp.PSUOutputIdleW)
-			fmt.Fprintf(&b, "| PSU DC output (loaded) | IPMI SDR | %.0f W |\n", sp.PSUOutputLoadedW)
+			spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
+			spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
 			if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
 				psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
-				fmt.Fprintf(&b, "| PSU conversion efficiency (idle) | IPMI SDR | %.1f%% |\n", psuEff)
+				spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
 			}
 		}
 		if sp.Available {
@@ -3516,7 +3531,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			default:
 				ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
 			}
-			fmt.Fprintf(&b, "| Reporting ratio (DCMI Δ / GPU actual) | IPMI DCMI | %.2f — %s |\n", ratio, ratioNote)
+			spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
 			if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
 				psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
 				sdrRatio := psuDelta / sp.GPUReportedSumW
@@ -3529,11 +3544,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 				default:
 					sdrNote = "✗ significant discrepancy"
 				}
-				fmt.Fprintf(&b, "| Reporting ratio (SDR PSU Δ / GPU actual) | IPMI SDR | %.2f — %s |\n", sdrRatio, sdrNote)
+				spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
 			}
 		} else {
-			spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
+			b.WriteString("| IPMI availability | — | not available — IPMI not supported or ipmitool not found |\n")
 		}
+		b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
 		for _, note := range sp.Notes {
 			fmt.Fprintf(&b, "\n> %s\n", note)
 		}
@@ -3541,10 +3557,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 
 		if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 {
 			b.WriteString("## PSU Load Distribution\n\n")
-			b.WriteString("| Slot | AC Input (idle) | AC Input (loaded) | DC Output (idle) | DC Output (loaded) | Load Δ | Status |\n")
-			b.WriteString("|------|-----------------|-------------------|------------------|--------------------|--------|--------|\n")
-
 			// collect all slot keys
 			slotSet := map[string]struct{}{}
 			for k := range sp.PSUSlotReadingsIdle {
 				slotSet[k] = struct{}{}
@@ -3558,17 +3571,18 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			}
 			sort.Strings(slots)
 
+			fmtW := func(v *float64) string {
+				if v == nil {
+					return "—"
+				}
+				return fmt.Sprintf("%.0f W", *v)
+			}
+
+			var psuDistRows [][]string
 			for _, slot := range slots {
 				idle := sp.PSUSlotReadingsIdle[slot]
 				loaded := sp.PSUSlotReadingsLoaded[slot]
 
-				fmtW := func(v *float64) string {
-					if v == nil {
-						return "—"
-					}
-					return fmt.Sprintf("%.0f W", *v)
-				}
-
 				var deltaStr string
 				if idle.InputW != nil && loaded.InputW != nil {
 					deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW)
@@ -3584,13 +3598,14 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 					status = "—"
 				}
 
-				fmt.Fprintf(&b, "| %s | %s | %s | %s | %s | %s | %s |\n",
+				psuDistRows = append(psuDistRows, []string{
 					slot,
 					fmtW(idle.InputW), fmtW(loaded.InputW),
 					fmtW(idle.OutputW), fmtW(loaded.OutputW),
 					deltaStr, status,
-				)
+				})
 			}
+			b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
 			b.WriteString("\n")
 		}
 	}
@@ -3602,28 +3617,194 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 		}
 		b.WriteString("\n")
 	}
-	if len(result.RecommendedSlotOrder) > 0 {
-		b.WriteString("## Recommended Slot Order\n\n")
-		fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
-	}
-	if len(result.RampSteps) > 0 {
-		b.WriteString("## Ramp Sequence\n\n")
-		b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
-		b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
-		for _, step := range result.RampSteps {
-			derated := "-"
-			if step.Derated {
-				derated = "⚠ yes"
+	// ── Single GPU section ───────────────────────────────────────────────────
+	b.WriteString("## Single GPU\n\n")
+	{
+		var sgRows [][]string
+		for _, gpu := range result.GPUs {
+			clk := "—"
+			mem := "—"
+			temp := "—"
+			pwr := "—"
+			if gpu.Telemetry != nil {
+				clk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz)
+				mem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz)
+				temp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC)
+				pwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW)
 			}
-			serverDelta := "-"
-			if step.ServerDeltaW > 0 {
-				serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
+			serverDelta := "—"
+			if gpu.ServerDeltaW > 0 {
+				serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
 			}
-			fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
-				step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
+			fan := "—"
+			if gpu.AvgFanRPM > 0 {
+				if gpu.AvgFanDutyCyclePct > 0 {
+					fan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct)
+				} else {
+					fan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM)
+				}
+			}
+			sgRows = append(sgRows, []string{
+				fmt.Sprintf("GPU %d", gpu.Index),
+				fmt.Sprintf("%s (%s)", clk, mem),
+				temp,
+				pwr,
+				serverDelta,
+				fan,
+			})
 		}
+		b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
+		b.WriteString("\n")
 	}
+	if len(result.RecommendedSlotOrder) > 0 {
+		fmt.Fprintf(&b, "Recommended slot order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
+	}
+
+	// ── Ramp Sequence ────────────────────────────────────────────────────────
+	// Rows = run number; Cols = per-GPU power (from step telemetry) + aggregates.
+	if len(result.RampSteps) > 0 {
+		b.WriteString("## Ramp Sequence\n\n")
+
+		// Collect all GPU indices that appear across all steps (ordered by first appearance).
+		allGPUIndices := make([]int, 0, len(result.GPUs))
+		seen := map[int]bool{}
+		for _, step := range result.RampSteps {
+			for _, idx := range step.GPUIndices {
+				if !seen[idx] {
+					seen[idx] = true
+					allGPUIndices = append(allGPUIndices, idx)
+				}
+			}
+		}
+
+		var idleW float64
+		if result.ServerPower != nil {
+			idleW = result.ServerPower.IdleW
+		}
+
+		// Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff.
+		headers := []string{"Run"}
+		for _, idx := range allGPUIndices {
+			headers = append(headers, fmt.Sprintf("GPU %d W", idx))
+		}
+		headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.")
+
+		var rampRows [][]string
+		for _, step := range result.RampSteps {
+			row := []string{fmt.Sprintf("%d", step.StepIndex)}
+			for _, idx := range allGPUIndices {
+				inStep := false
+				for _, si := range step.GPUIndices {
+					if si == idx {
+						inStep = true
+						break
+					}
+				}
+				if !inStep {
+					row = append(row, "—")
+					continue
+				}
+				gpuPwr := "—"
+				if t, ok := step.PerGPUTelemetry[idx]; ok && t != nil && t.AvgPowerW > 0 {
+					gpuPwr = fmt.Sprintf("%.0f", t.AvgPowerW)
+				}
+				row = append(row, gpuPwr)
+			}
+			// Server wall W
+			serverWall := "—"
+			if step.ServerLoadedW > 0 {
+				serverWall = fmt.Sprintf("%.0f", step.ServerLoadedW)
+			}
+			// Per GPU wall W = ServerDeltaW / len(GPUIndices)
+			perGPUWall := "—"
+			if step.ServerDeltaW > 0 && len(step.GPUIndices) > 0 {
+				perGPUWall = fmt.Sprintf("%.0f", step.ServerDeltaW/float64(len(step.GPUIndices)))
+			}
+			// Platform eff. = (ServerLoadedW − idleW) / TotalObservedPowerW
+			platEff := "—"
+			if step.TotalObservedPowerW > 0 {
+				eff := step.ServerDeltaW / step.TotalObservedPowerW
+				if idleW > 0 && step.ServerLoadedW > 0 {
+					eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW
+				}
+				platEff = fmt.Sprintf("%.2f", eff)
+			}
+			row = append(row, serverWall, perGPUWall, platEff)
+			rampRows = append(rampRows, row)
+		}
+		b.WriteString(fmtMDTable(headers, rampRows))
+		b.WriteString("\n")
+	}
+
+	// ── PSU Performance ───────────────────────────────────────────────────────
+	{
+		// Collect all PSU slot keys from any ramp step.
+		psuSlotSet := map[string]struct{}{}
+		for _, step := range result.RampSteps {
+			for k := range step.PSUSlotReadings {
+				psuSlotSet[k] = struct{}{}
+			}
+		}
+		if len(psuSlotSet) > 0 {
+			b.WriteString("## PSU Performance\n\n")
+			psuSlots := make([]string, 0, len(psuSlotSet))
+			for k := range psuSlotSet {
+				psuSlots = append(psuSlots, k)
+			}
+			sort.Strings(psuSlots)
+
+			var idleW float64
+			if result.ServerPower != nil {
+				idleW = result.ServerPower.IdleW
+			}
+
+			psuHeaders := []string{"Run"}
+			for _, slot := range psuSlots {
+				psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
+			}
+			psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
+
+			var psuRows [][]string
+			for _, step := range result.RampSteps {
+				row := []string{fmt.Sprintf("%d", step.StepIndex)}
+				var psuTotal float64
+				for _, slot := range psuSlots {
+					sp, ok := step.PSUSlotReadings[slot]
+					if !ok || sp.InputW == nil {
+						row = append(row, "—")
+						continue
+					}
+					row = append(row, fmt.Sprintf("%.0f", *sp.InputW))
+					psuTotal += *sp.InputW
+				}
+				totalStr := "—"
+				if psuTotal > 0 {
+					totalStr = fmt.Sprintf("%.0f", psuTotal)
+				}
+				platEff := "—"
+				if step.TotalObservedPowerW > 0 {
+					eff := step.ServerDeltaW / step.TotalObservedPowerW
+					if idleW > 0 && step.ServerLoadedW > 0 {
+						eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW
+					}
+					platEff = fmt.Sprintf("%.2f", eff)
+				}
+				fan := "—"
+				if step.AvgFanRPM > 0 {
+					if step.AvgFanDutyCyclePct > 0 {
+						fan = fmt.Sprintf("%.0f (%.0f%%)", step.AvgFanRPM, step.AvgFanDutyCyclePct)
+					} else {
+						fan = fmt.Sprintf("%.0f", step.AvgFanRPM)
+					}
+				}
+				row = append(row, totalStr, platEff, fan)
+				psuRows = append(psuRows, row)
+			}
+			b.WriteString(fmtMDTable(psuHeaders, psuRows))
+			b.WriteString("\n")
+		}
+	}
 
+	// ── PSU Issues ────────────────────────────────────────────────────────────
 	if len(result.PSUIssues) > 0 {
 		b.WriteString("## PSU Issues\n\n")
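To make the Platform eff. column concrete with illustrative numbers (not taken from any real run): with server idle at 250 W, loaded at 2050 W, and a GPU-reported sum of 1600 W for the step, the table shows (2050 − 250) / 1600 ≈ 1.13, meaning wall draw grew about 13% faster than the GPUs reported; values near 1.00 indicate the GPUs account for essentially all of the added load.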
@@ -3646,8 +3827,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 		totalDefault += gpu.DefaultPowerLimitW
 		totalStable += stable
 	}
-	b.WriteString("| GPU | Default TDP | Single-card limit | Stable limit | Realization | Derated |\n")
-	b.WriteString("|-----|-------------|-------------------|--------------|-------------|----------|\n")
+	var pdRows [][]string
 	for _, gpu := range result.GPUs {
 		stable := gpu.StablePowerLimitW
 		if stable <= 0 {
@@ -3661,15 +3841,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 		if gpu.Derated {
 			derated = "⚠ yes"
 		}
-		fmt.Fprintf(&b, "| GPU %d | %.0f W | %.0f W | %.0f W | %s | %s |\n",
-			gpu.Index, gpu.DefaultPowerLimitW, gpu.AppliedPowerLimitW, stable, realization, derated)
+		pdRows = append(pdRows, []string{
+			fmt.Sprintf("GPU %d", gpu.Index),
+			fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
+			fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
+			fmt.Sprintf("%.0f W", stable),
+			realization,
+			derated,
+		})
 	}
 	platformReal := "-"
 	if totalDefault > 0 && totalStable > 0 {
 		platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100)
 	}
-	fmt.Fprintf(&b, "| **Platform** | **%.0f W** | — | **%.0f W** | **%s** | |\n\n",
-		totalDefault, totalStable, platformReal)
+	pdRows = append(pdRows, []string{
+		"**Platform**",
+		fmt.Sprintf("**%.0f W**", totalDefault),
+		"—",
+		fmt.Sprintf("**%.0f W**", totalStable),
+		fmt.Sprintf("**%s**", platformReal),
+		"",
+	})
+	b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
+	b.WriteString("\n")
 
 	// Balance across GPUs — only meaningful with 2+ GPUs.
 	if len(result.GPUs) > 1 {
@@ -3710,9 +3904,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 	// Ramp scalability table — power efficiency of adding each GPU.
 	if len(result.RampSteps) > 1 {
 		b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n")
-		b.WriteString("| Step | GPUs | Cumulative stable TDP | Incremental | Efficiency vs GPU 1 |\n")
-		b.WriteString("|------|------|-----------------------|-------------|---------------------|\n")
 		// First GPU stable TDP as the reference unit for efficiency.
 		var firstStable float64
 		if len(result.GPUs) > 0 {
 			firstStable = result.GPUs[0].StablePowerLimitW
@@ -3721,6 +3912,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			}
 		}
 		var prevCumulative float64
+		var scalRows [][]string
 		for _, step := range result.RampSteps {
 			var cumulative float64
 			for _, gpuIdx := range step.GPUIndices {
@@ -3740,40 +3932,104 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
 			if step.StepIndex > 1 && firstStable > 0 {
 				efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100)
 			}
-			fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %s |\n",
-				step.StepIndex, joinIndexList(step.GPUIndices), cumulative, incremental, efficiency)
+			scalRows = append(scalRows, []string{
+				fmt.Sprintf("%d", step.StepIndex),
+				joinIndexList(step.GPUIndices),
+				fmt.Sprintf("%.0f W", cumulative),
+				fmt.Sprintf("%.0f W", incremental),
+				efficiency,
+			})
 			prevCumulative = cumulative
 		}
+		b.WriteString(fmtMDTable([]string{"Step", "GPUs", "Cumulative stable TDP", "Incremental", "Efficiency vs GPU 1"}, scalRows))
 		b.WriteString("\n")
 	}
 	}
 
-	b.WriteString("## Per-Slot Results\n\n")
-	b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
-	b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
-	for _, gpu := range result.GPUs {
-		stableLimit := "-"
-		if gpu.StablePowerLimitW > 0 {
-			if gpu.Derated {
-				stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW)
-			} else {
-				stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
-			}
-		}
-		serverDelta := "-"
-		if gpu.ServerDeltaW > 0 {
-			serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
-		}
-		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
-			gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
+	// ── Per-GPU sections ──────────────────────────────────────────────────────
+	var lastStep *NvidiaPowerBenchStep
+	if n := len(result.RampSteps); n > 0 {
+		lastStep = &result.RampSteps[n-1]
 	}
-	b.WriteString("\n")
 	for _, gpu := range result.GPUs {
 		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
 
+		// Transposed comparison table: Single Run vs All GPU Run.
+		singleClk := "—"
+		singleMem := "—"
+		singleTemp := "—"
+		singlePwr := "—"
+		singleWall := "—"
+		singleFan := "—"
+		if gpu.Telemetry != nil {
+			singleClk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz)
+			singleMem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz)
+			singleTemp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC)
+			singlePwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW)
+		}
+		if gpu.ServerDeltaW > 0 {
+			singleWall = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
+		}
+		if gpu.AvgFanRPM > 0 {
+			if gpu.AvgFanDutyCyclePct > 0 {
+				singleFan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct)
+			} else {
+				singleFan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM)
+			}
+		}
+
+		allClk := "—"
+		allMem := "—"
+		allTemp := "—"
+		allPwr := "—"
+		allWall := "—"
+		allFan := "—"
+		if lastStep != nil {
+			if t, ok := lastStep.PerGPUTelemetry[gpu.Index]; ok && t != nil {
+				allClk = fmt.Sprintf("%.0f", t.AvgGraphicsClockMHz)
+				allMem = fmt.Sprintf("%.0f", t.AvgMemoryClockMHz)
+				allTemp = fmt.Sprintf("%.1f", t.AvgTempC)
+				allPwr = fmt.Sprintf("%.0f W", t.AvgPowerW)
+			}
+			if lastStep.ServerDeltaW > 0 && len(lastStep.GPUIndices) > 0 {
+				allWall = fmt.Sprintf("%.0f W", lastStep.ServerDeltaW/float64(len(lastStep.GPUIndices)))
+			}
+			if lastStep.AvgFanRPM > 0 {
+				if lastStep.AvgFanDutyCyclePct > 0 {
+					allFan = fmt.Sprintf("%.0f RPM (%.0f%%)", lastStep.AvgFanRPM, lastStep.AvgFanDutyCyclePct)
+				} else {
+					allFan = fmt.Sprintf("%.0f RPM", lastStep.AvgFanRPM)
+				}
+			}
+		}
+
+		tableHeaders := []string{"", "Single Run"}
+		if lastStep != nil {
+			tableHeaders = append(tableHeaders, "All GPU Run")
+		}
+		compRows := [][]string{
+			{"Clock MHz (Mem MHz)", fmt.Sprintf("%s (%s)", singleClk, singleMem)},
+			{"Avg Temp °C", singleTemp},
+			{"Power W", singlePwr},
+			{"Per GPU wall W", singleWall},
+			{"Fan RPM (duty%)", singleFan},
+		}
+		if lastStep != nil {
+			compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
+			compRows[1] = append(compRows[1], allTemp)
+			compRows[2] = append(compRows[2], allPwr)
+			compRows[3] = append(compRows[3], allWall)
+			compRows[4] = append(compRows[4], allFan)
+		}
+		b.WriteString(fmtMDTable(tableHeaders, compRows))
+		b.WriteString("\n")
+
 		for _, note := range gpu.Notes {
 			fmt.Fprintf(&b, "- %s\n", note)
 		}
-		b.WriteString("\n")
+		if len(gpu.Notes) > 0 {
+			b.WriteString("\n")
+		}
 	}
 	return b.String()
 }
@@ -3860,7 +4116,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 		OverallStatus: "OK",
 	}
 	durationSec := powerBenchDurationSec(opts.Profile)
-	_ = durationSec
 
 	// Sample IPMI idle power before any GPU load.
 	var serverIdleW float64
@@ -3894,7 +4149,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 				ipmiSingleDone <- w
 			}
 		}()
-		c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
+		c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
 		appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
 		ipmiSingleCancel()
 		if w, ok := <-ipmiSingleDone; ok {
@@ -3947,6 +4202,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			t := summarizeBenchmarkTelemetry(calib.MetricRows)
 			gpu.Telemetry = &t
 		}
+		if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
+			gpu.AvgFanRPM = meanFanRPM(fans)
+			if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
+				gpu.AvgFanDutyCyclePct = duty
+			}
+		}
 		gpus = append(gpus, gpu)
 	}
 	sort.Slice(gpus, func(i, j int) bool {
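sampleFanDutyCyclePctFromFans is also outside this diff; the `(float64, bool, error)` shape is inferred from the call sites, and the DutyCyclePct field on FanReading is an assumption (only RPM is confirmed by meanFanRPM above). A sketch consistent with those call sites:

```go
// Hypothetical sketch: averages duty cycle across fans that report one.
// FanReading.DutyCyclePct is an assumed field name.
func sampleFanDutyCyclePctFromFans(fans []FanReading) (float64, bool, error) {
	var sum float64
	var n int
	for _, f := range fans {
		if f.DutyCyclePct > 0 {
			sum += f.DutyCyclePct
			n++
		}
	}
	if n == 0 {
		return 0, false, nil
	}
	return sum / float64(n), true, nil
}
```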
@@ -4077,7 +4338,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 				ipmiStepDone <- w
 			}
 		}()
-		stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
+		stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
 		appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
 		ipmiStepCancel()
 		var stepIPMILoadedW float64
@@ -4159,6 +4420,29 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
 			}
 		}
 
+		// Per-step PSU slot snapshot.
+		sdrStep := sampleIPMISDRPowerSensors()
+		if len(sdrStep.PSUSlots) > 0 {
+			ramp.PSUSlotReadings = sdrStep.PSUSlots
+		}
+
+		// Fan state at end of ramp step.
+		if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
+			ramp.AvgFanRPM = meanFanRPM(fans)
+			if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
+				ramp.AvgFanDutyCyclePct = duty
+			}
+		}
+
+		// Per-GPU telemetry from this ramp step's calibration.
+		ramp.PerGPUTelemetry = make(map[int]*BenchmarkTelemetrySummary, len(subset))
+		for _, gpuIdx := range subset {
+			if c, ok := stepCalib[gpuIdx]; ok {
+				s := c.Summary
+				ramp.PerGPUTelemetry[gpuIdx] = &s
+			}
+		}
+
 		result.RampSteps = append(result.RampSteps, ramp)
 	}
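The matching struct changes mentioned in the commit message are not part of this diff. Working back from the call sites above and in renderPowerBenchReport, the added fields plausibly look like this (field and type names other than BenchmarkTelemetrySummary are guesses; PSUSlotReading in particular is a hypothetical name for the InputW/OutputW pointer pair the report dereferences):

```go
// Assumed additions to NvidiaPowerBenchStep; NvidiaPowerBenchGPU would gain
// the analogous AvgFanRPM/AvgFanDutyCyclePct fields alongside its existing
// ServerDeltaW and Telemetry.
type NvidiaPowerBenchStep struct {
	// ...existing fields (StepIndex, GPUIndices, NewGPUIndex, ServerLoadedW,
	// ServerDeltaW, TotalObservedPowerW, Derated, Status, ...)

	PSUSlotReadings    map[string]PSUSlotReading          // per-PSU-slot snapshot at step end
	AvgFanRPM          float64                            // mean chassis fan RPM at step end
	AvgFanDutyCyclePct float64                            // mean fan duty cycle, 0 when unreported
	PerGPUTelemetry    map[int]*BenchmarkTelemetrySummary // step telemetry keyed by GPU index
}

// PSUSlotReading is a hypothetical name; the report only shows that the value
// type carries optional AC-input and DC-output watt readings.
type PSUSlotReading struct {
	InputW  *float64
	OutputW *float64
}
```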