Rework Power Fit report: 90 min stability, aligned tables, PSU/fan sections

- Increase stability profile duration from 33 min to 90 min by wiring
  powerBenchDurationSec() into runBenchmarkPowerCalibration (its return
  value was previously computed but discarded)
- Collect per-step PSU slot readings, fan RPM/duty, and per-GPU telemetry
  in ramp loop; add matching fields to NvidiaPowerBenchStep/NvidiaPowerBenchGPU
- Rewrite renderPowerBenchReport: replace the Per-Slot Results section
  with a Single GPU section, rework the Ramp Sequence table (rows = runs,
  columns = GPUs), add a PSU Performance section (emitted only when IPMI
  data is present), and add a transposed Single-run vs All-GPU-run
  comparison table inside each per-GPU section
- Add fmtMDTable helper (benchmark_table.go) and apply to all tables in
  both power and performance reports so columns align in plain-text view

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-19 18:04:12 +03:00
parent d52ec67f8f
commit f8cd9a7376
4 changed files with 647 additions and 226 deletions

View File

@@ -3055,8 +3055,12 @@ func runBenchmarkPowerCalibration(
infoByIndex map[int]benchmarkGPUInfo,
logFunc func(string),
seedLimits map[int]int,
durationSec int,
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
const calibDurationSec = 120
calibDurationSec := durationSec
if calibDurationSec <= 0 {
calibDurationSec = 120
}
const maxDerateW = 150
// calibSearchTolerance is the binary-search convergence threshold in watts.
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
@@ -3436,6 +3440,18 @@ func roundTo5W(w int) int {
return ((w + 2) / 5) * 5
}
// meanFanRPM computes the arithmetic mean of the RPM values across the
// given fan readings. It returns 0 when no readings are supplied, so
// callers can safely gate on a positive result.
func meanFanRPM(fans []FanReading) float64 {
	count := len(fans)
	if count == 0 {
		return 0
	}
	total := 0.0
	for i := range fans {
		total += fans[i].RPM
	}
	return total / float64(count)
}
func powerBenchDurationSec(profile string) int {
switch strings.TrimSpace(strings.ToLower(profile)) {
case NvidiaBenchmarkProfileStability:
@@ -3475,30 +3491,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
// Server power comparison table.
if sp := result.ServerPower; sp != nil {
b.WriteString("## Server vs GPU Power Comparison\n\n")
b.WriteString("| Metric | Source | Value |\n")
b.WriteString("|--------|--------|-------|\n")
fmt.Fprintf(&b, "| GPU stable limits sum | nvidia-smi | %.0f W |\n", result.PlatformMaxTDPW)
fmt.Fprintf(&b, "| GPU actual power sum (p95, last step) | nvidia-smi | %.0f W |\n", sp.GPUReportedSumW)
var spRows [][]string
spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)})
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
if sp.GPUSlotTotalW > 0 {
fmt.Fprintf(&b, "| GPU PCIe slot power (at peak load) | IPMI SDR | %.0f W |\n", sp.GPUSlotTotalW)
spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
}
if sp.Available {
fmt.Fprintf(&b, "| Server idle power | IPMI DCMI | %.0f W |\n", sp.IdleW)
fmt.Fprintf(&b, "| Server loaded power | IPMI DCMI | %.0f W |\n", sp.LoadedW)
fmt.Fprintf(&b, "| Server Δ power (loaded idle) | IPMI DCMI | %.0f W |\n", sp.DeltaW)
spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)})
spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)})
spRows = append(spRows, []string{"Server Δ power (loaded idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)})
}
if sp.PSUInputLoadedW > 0 {
fmt.Fprintf(&b, "| PSU AC input (idle) | IPMI SDR | %.0f W |\n", sp.PSUInputIdleW)
fmt.Fprintf(&b, "| PSU AC input (loaded) | IPMI SDR | %.0f W |\n", sp.PSUInputLoadedW)
spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
fmt.Fprintf(&b, "| PSU AC input Δ (loaded idle) | IPMI SDR | %.0f W |\n", psuDelta)
spRows = append(spRows, []string{"PSU AC input Δ (loaded idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)})
}
if sp.PSUOutputLoadedW > 0 {
fmt.Fprintf(&b, "| PSU DC output (idle) | IPMI SDR | %.0f W |\n", sp.PSUOutputIdleW)
fmt.Fprintf(&b, "| PSU DC output (loaded) | IPMI SDR | %.0f W |\n", sp.PSUOutputLoadedW)
spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
fmt.Fprintf(&b, "| PSU conversion efficiency (idle) | IPMI SDR | %.1f%% |\n", psuEff)
spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
}
}
if sp.Available {
@@ -3516,7 +3531,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
default:
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
}
fmt.Fprintf(&b, "| Reporting ratio (DCMI Δ / GPU actual) | IPMI DCMI | %.2f — %s |\n", ratio, ratioNote)
spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
sdrRatio := psuDelta / sp.GPUReportedSumW
@@ -3529,11 +3544,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
default:
sdrNote = "✗ significant discrepancy"
}
fmt.Fprintf(&b, "| Reporting ratio (SDR PSU Δ / GPU actual) | IPMI SDR | %.2f — %s |\n", sdrRatio, sdrNote)
spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
}
} else {
b.WriteString("| IPMI availability | — | not available — IPMI not supported or ipmitool not found |\n")
spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
}
b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
for _, note := range sp.Notes {
fmt.Fprintf(&b, "\n> %s\n", note)
}
@@ -3541,10 +3557,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 {
b.WriteString("## PSU Load Distribution\n\n")
b.WriteString("| Slot | AC Input (idle) | AC Input (loaded) | DC Output (idle) | DC Output (loaded) | Load Δ | Status |\n")
b.WriteString("|------|-----------------|-------------------|------------------|--------------------|--------|--------|\n")
// collect all slot keys
slotSet := map[string]struct{}{}
for k := range sp.PSUSlotReadingsIdle {
slotSet[k] = struct{}{}
@@ -3558,17 +3571,18 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
sort.Strings(slots)
fmtW := func(v *float64) string {
if v == nil {
return "—"
}
return fmt.Sprintf("%.0f W", *v)
}
var psuDistRows [][]string
for _, slot := range slots {
idle := sp.PSUSlotReadingsIdle[slot]
loaded := sp.PSUSlotReadingsLoaded[slot]
fmtW := func(v *float64) string {
if v == nil {
return "—"
}
return fmt.Sprintf("%.0f W", *v)
}
var deltaStr string
if idle.InputW != nil && loaded.InputW != nil {
deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW)
@@ -3584,13 +3598,14 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
status = "—"
}
fmt.Fprintf(&b, "| %s | %s | %s | %s | %s | %s | %s |\n",
psuDistRows = append(psuDistRows, []string{
slot,
fmtW(idle.InputW), fmtW(loaded.InputW),
fmtW(idle.OutputW), fmtW(loaded.OutputW),
deltaStr, status,
)
})
}
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
b.WriteString("\n")
}
}
@@ -3602,28 +3617,194 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
b.WriteString("\n")
}
if len(result.RecommendedSlotOrder) > 0 {
b.WriteString("## Recommended Slot Order\n\n")
fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
}
if len(result.RampSteps) > 0 {
b.WriteString("## Ramp Sequence\n\n")
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
for _, step := range result.RampSteps {
derated := "-"
if step.Derated {
derated = "⚠ yes"
// ── Single GPU section ───────────────────────────────────────────────────
b.WriteString("## Single GPU\n\n")
{
var sgRows [][]string
for _, gpu := range result.GPUs {
clk := "—"
mem := "—"
temp := "—"
pwr := "—"
if gpu.Telemetry != nil {
clk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz)
mem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz)
temp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC)
pwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW)
}
serverDelta := "-"
if step.ServerDeltaW > 0 {
serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
serverDelta := ""
if gpu.ServerDeltaW > 0 {
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
}
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
fan := "—"
if gpu.AvgFanRPM > 0 {
if gpu.AvgFanDutyCyclePct > 0 {
fan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct)
} else {
fan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM)
}
}
sgRows = append(sgRows, []string{
fmt.Sprintf("GPU %d", gpu.Index),
fmt.Sprintf("%s (%s)", clk, mem),
temp,
pwr,
serverDelta,
fan,
})
}
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
b.WriteString("\n")
}
if len(result.RecommendedSlotOrder) > 0 {
fmt.Fprintf(&b, "Recommended slot order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
}
// ── Ramp Sequence ────────────────────────────────────────────────────────
// Rows = run number; Cols = per-GPU power (from step telemetry) + aggregates.
if len(result.RampSteps) > 0 {
b.WriteString("## Ramp Sequence\n\n")
// Collect all GPU indices that appear across all steps (ordered by first appearance).
allGPUIndices := make([]int, 0, len(result.GPUs))
seen := map[int]bool{}
for _, step := range result.RampSteps {
for _, idx := range step.GPUIndices {
if !seen[idx] {
seen[idx] = true
allGPUIndices = append(allGPUIndices, idx)
}
}
}
var idleW float64
if result.ServerPower != nil {
idleW = result.ServerPower.IdleW
}
// Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff.
headers := []string{"Run"}
for _, idx := range allGPUIndices {
headers = append(headers, fmt.Sprintf("GPU %d W", idx))
}
headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.")
var rampRows [][]string
for _, step := range result.RampSteps {
row := []string{fmt.Sprintf("%d", step.StepIndex)}
for _, idx := range allGPUIndices {
inStep := false
for _, si := range step.GPUIndices {
if si == idx {
inStep = true
break
}
}
if !inStep {
row = append(row, "—")
continue
}
gpuPwr := "—"
if t, ok := step.PerGPUTelemetry[idx]; ok && t != nil && t.AvgPowerW > 0 {
gpuPwr = fmt.Sprintf("%.0f", t.AvgPowerW)
}
row = append(row, gpuPwr)
}
// Server wall W
serverWall := "—"
if step.ServerLoadedW > 0 {
serverWall = fmt.Sprintf("%.0f", step.ServerLoadedW)
}
// Per GPU wall W = ServerDeltaW / len(GPUIndices)
perGPUWall := "—"
if step.ServerDeltaW > 0 && len(step.GPUIndices) > 0 {
perGPUWall = fmt.Sprintf("%.0f", step.ServerDeltaW/float64(len(step.GPUIndices)))
}
// Platform eff. = (ServerLoadedW idleW) / TotalObservedPowerW
platEff := "—"
if step.TotalObservedPowerW > 0 {
eff := step.ServerDeltaW / step.TotalObservedPowerW
if idleW > 0 && step.ServerLoadedW > 0 {
eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW
}
platEff = fmt.Sprintf("%.2f", eff)
}
row = append(row, serverWall, perGPUWall, platEff)
rampRows = append(rampRows, row)
}
b.WriteString(fmtMDTable(headers, rampRows))
b.WriteString("\n")
}
// ── PSU Performance ───────────────────────────────────────────────────────
{
// Collect all PSU slot keys from any ramp step.
psuSlotSet := map[string]struct{}{}
for _, step := range result.RampSteps {
for k := range step.PSUSlotReadings {
psuSlotSet[k] = struct{}{}
}
}
if len(psuSlotSet) > 0 {
b.WriteString("## PSU Performance\n\n")
psuSlots := make([]string, 0, len(psuSlotSet))
for k := range psuSlotSet {
psuSlots = append(psuSlots, k)
}
sort.Strings(psuSlots)
var idleW float64
if result.ServerPower != nil {
idleW = result.ServerPower.IdleW
}
psuHeaders := []string{"Run"}
for _, slot := range psuSlots {
psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
}
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
var psuRows [][]string
for _, step := range result.RampSteps {
row := []string{fmt.Sprintf("%d", step.StepIndex)}
var psuTotal float64
for _, slot := range psuSlots {
sp, ok := step.PSUSlotReadings[slot]
if !ok || sp.InputW == nil {
row = append(row, "—")
continue
}
row = append(row, fmt.Sprintf("%.0f", *sp.InputW))
psuTotal += *sp.InputW
}
totalStr := "—"
if psuTotal > 0 {
totalStr = fmt.Sprintf("%.0f", psuTotal)
}
platEff := "—"
if step.TotalObservedPowerW > 0 {
eff := step.ServerDeltaW / step.TotalObservedPowerW
if idleW > 0 && step.ServerLoadedW > 0 {
eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW
}
platEff = fmt.Sprintf("%.2f", eff)
}
fan := "—"
if step.AvgFanRPM > 0 {
if step.AvgFanDutyCyclePct > 0 {
fan = fmt.Sprintf("%.0f (%.0f%%)", step.AvgFanRPM, step.AvgFanDutyCyclePct)
} else {
fan = fmt.Sprintf("%.0f", step.AvgFanRPM)
}
}
row = append(row, totalStr, platEff, fan)
psuRows = append(psuRows, row)
}
b.WriteString(fmtMDTable(psuHeaders, psuRows))
b.WriteString("\n")
}
}
// ── PSU Issues ────────────────────────────────────────────────────────────
if len(result.PSUIssues) > 0 {
b.WriteString("## PSU Issues\n\n")
@@ -3646,8 +3827,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
totalDefault += gpu.DefaultPowerLimitW
totalStable += stable
}
b.WriteString("| GPU | Default TDP | Single-card limit | Stable limit | Realization | Derated |\n")
b.WriteString("|-----|-------------|-------------------|--------------|-------------|----------|\n")
var pdRows [][]string
for _, gpu := range result.GPUs {
stable := gpu.StablePowerLimitW
if stable <= 0 {
@@ -3661,15 +3841,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
if gpu.Derated {
derated = "⚠ yes"
}
fmt.Fprintf(&b, "| GPU %d | %.0f W | %.0f W | %.0f W | %s | %s |\n",
gpu.Index, gpu.DefaultPowerLimitW, gpu.AppliedPowerLimitW, stable, realization, derated)
pdRows = append(pdRows, []string{
fmt.Sprintf("GPU %d", gpu.Index),
fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
fmt.Sprintf("%.0f W", stable),
realization,
derated,
})
}
platformReal := "-"
if totalDefault > 0 && totalStable > 0 {
platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100)
}
fmt.Fprintf(&b, "| **Platform** | **%.0f W** | — | **%.0f W** | **%s** | |\n\n",
totalDefault, totalStable, platformReal)
pdRows = append(pdRows, []string{
"**Platform**",
fmt.Sprintf("**%.0f W**", totalDefault),
"—",
fmt.Sprintf("**%.0f W**", totalStable),
fmt.Sprintf("**%s**", platformReal),
"",
})
b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
b.WriteString("\n")
// Balance across GPUs — only meaningful with 2+ GPUs.
if len(result.GPUs) > 1 {
@@ -3710,9 +3904,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
// Ramp scalability table — power efficiency of adding each GPU.
if len(result.RampSteps) > 1 {
b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n")
b.WriteString("| Step | GPUs | Cumulative stable TDP | Incremental | Efficiency vs GPU 1 |\n")
b.WriteString("|------|------|-----------------------|-------------|---------------------|\n")
// First GPU stable TDP as the reference unit for efficiency.
var firstStable float64
if len(result.GPUs) > 0 {
firstStable = result.GPUs[0].StablePowerLimitW
@@ -3721,6 +3912,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
}
var prevCumulative float64
var scalRows [][]string
for _, step := range result.RampSteps {
var cumulative float64
for _, gpuIdx := range step.GPUIndices {
@@ -3740,40 +3932,104 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
if step.StepIndex > 1 && firstStable > 0 {
efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100)
}
fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %s |\n",
step.StepIndex, joinIndexList(step.GPUIndices), cumulative, incremental, efficiency)
scalRows = append(scalRows, []string{
fmt.Sprintf("%d", step.StepIndex),
joinIndexList(step.GPUIndices),
fmt.Sprintf("%.0f W", cumulative),
fmt.Sprintf("%.0f W", incremental),
efficiency,
})
prevCumulative = cumulative
}
b.WriteString(fmtMDTable([]string{"Step", "GPUs", "Cumulative stable TDP", "Incremental", "Efficiency vs GPU 1"}, scalRows))
b.WriteString("\n")
}
}
b.WriteString("## Per-Slot Results\n\n")
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
for _, gpu := range result.GPUs {
stableLimit := "-"
if gpu.StablePowerLimitW > 0 {
if gpu.Derated {
stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW)
} else {
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
}
}
serverDelta := "-"
if gpu.ServerDeltaW > 0 {
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
}
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
// ── Per-GPU sections ──────────────────────────────────────────────────────
var lastStep *NvidiaPowerBenchStep
if n := len(result.RampSteps); n > 0 {
lastStep = &result.RampSteps[n-1]
}
b.WriteString("\n")
for _, gpu := range result.GPUs {
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
// Transposed comparison table: Single Run vs All GPU Run.
singleClk := "—"
singleMem := "—"
singleTemp := "—"
singlePwr := "—"
singleWall := "—"
singleFan := "—"
if gpu.Telemetry != nil {
singleClk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz)
singleMem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz)
singleTemp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC)
singlePwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW)
}
if gpu.ServerDeltaW > 0 {
singleWall = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
}
if gpu.AvgFanRPM > 0 {
if gpu.AvgFanDutyCyclePct > 0 {
singleFan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct)
} else {
singleFan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM)
}
}
allClk := "—"
allMem := "—"
allTemp := "—"
allPwr := "—"
allWall := "—"
allFan := "—"
if lastStep != nil {
if t, ok := lastStep.PerGPUTelemetry[gpu.Index]; ok && t != nil {
allClk = fmt.Sprintf("%.0f", t.AvgGraphicsClockMHz)
allMem = fmt.Sprintf("%.0f", t.AvgMemoryClockMHz)
allTemp = fmt.Sprintf("%.1f", t.AvgTempC)
allPwr = fmt.Sprintf("%.0f W", t.AvgPowerW)
}
if lastStep.ServerDeltaW > 0 && len(lastStep.GPUIndices) > 0 {
allWall = fmt.Sprintf("%.0f W", lastStep.ServerDeltaW/float64(len(lastStep.GPUIndices)))
}
if lastStep.AvgFanRPM > 0 {
if lastStep.AvgFanDutyCyclePct > 0 {
allFan = fmt.Sprintf("%.0f RPM (%.0f%%)", lastStep.AvgFanRPM, lastStep.AvgFanDutyCyclePct)
} else {
allFan = fmt.Sprintf("%.0f RPM", lastStep.AvgFanRPM)
}
}
}
tableHeaders := []string{"", "Single Run"}
if lastStep != nil {
tableHeaders = append(tableHeaders, "All GPU Run")
}
compRows := [][]string{
{"Clock MHz (Mem MHz)", fmt.Sprintf("%s (%s)", singleClk, singleMem)},
{"Avg Temp °C", singleTemp},
{"Power W", singlePwr},
{"Per GPU wall W", singleWall},
{"Fan RPM (duty%)", singleFan},
}
if lastStep != nil {
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
compRows[1] = append(compRows[1], allTemp)
compRows[2] = append(compRows[2], allPwr)
compRows[3] = append(compRows[3], allWall)
compRows[4] = append(compRows[4], allFan)
}
b.WriteString(fmtMDTable(tableHeaders, compRows))
b.WriteString("\n")
for _, note := range gpu.Notes {
fmt.Fprintf(&b, "- %s\n", note)
}
b.WriteString("\n")
if len(gpu.Notes) > 0 {
b.WriteString("\n")
}
}
return b.String()
}
@@ -3860,7 +4116,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
OverallStatus: "OK",
}
durationSec := powerBenchDurationSec(opts.Profile)
_ = durationSec
// Sample IPMI idle power before any GPU load.
var serverIdleW float64
@@ -3894,7 +4149,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ipmiSingleDone <- w
}
}()
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
ipmiSingleCancel()
if w, ok := <-ipmiSingleDone; ok {
@@ -3947,6 +4202,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
t := summarizeBenchmarkTelemetry(calib.MetricRows)
gpu.Telemetry = &t
}
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
gpu.AvgFanRPM = meanFanRPM(fans)
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
gpu.AvgFanDutyCyclePct = duty
}
}
gpus = append(gpus, gpu)
}
sort.Slice(gpus, func(i, j int) bool {
@@ -4077,7 +4338,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ipmiStepDone <- w
}
}()
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
ipmiStepCancel()
var stepIPMILoadedW float64
@@ -4159,6 +4420,29 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
}
}
// Per-step PSU slot snapshot.
sdrStep := sampleIPMISDRPowerSensors()
if len(sdrStep.PSUSlots) > 0 {
ramp.PSUSlotReadings = sdrStep.PSUSlots
}
// Fan state at end of ramp step.
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
ramp.AvgFanRPM = meanFanRPM(fans)
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
ramp.AvgFanDutyCyclePct = duty
}
}
// Per-GPU telemetry from this ramp step's calibration.
ramp.PerGPUTelemetry = make(map[int]*BenchmarkTelemetrySummary, len(subset))
for _, gpuIdx := range subset {
if c, ok := stepCalib[gpuIdx]; ok {
s := c.Summary
ramp.PerGPUTelemetry[gpuIdx] = &s
}
}
result.RampSteps = append(result.RampSteps, ramp)
}