Rework Power Fit report: 90 min stability, aligned tables, PSU/fan sections
- Increase stability profile duration from 33 min to 90 min by wiring powerBenchDurationSec() into runBenchmarkPowerCalibration (the value was previously discarded)
- Collect per-step PSU slot readings, fan RPM/duty, and per-GPU telemetry in the ramp loop; add matching fields to NvidiaPowerBenchStep/NvidiaPowerBenchGPU
- Rewrite renderPowerBenchReport: replace Per-Slot Results with a Single GPU section, rework Ramp Sequence (rows = runs, columns = GPUs), add a PSU Performance section (conditional on IPMI data), and add a transposed Single vs All-GPU comparison table in the per-GPU sections
- Add fmtMDTable helper (benchmark_table.go) and apply it to all tables in both the power and performance reports so columns align in plain-text view

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3055,8 +3055,12 @@ func runBenchmarkPowerCalibration(
|
|||||||
infoByIndex map[int]benchmarkGPUInfo,
|
infoByIndex map[int]benchmarkGPUInfo,
|
||||||
logFunc func(string),
|
logFunc func(string),
|
||||||
seedLimits map[int]int,
|
seedLimits map[int]int,
|
||||||
|
durationSec int,
|
||||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
|
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
|
||||||
const calibDurationSec = 120
|
calibDurationSec := durationSec
|
||||||
|
if calibDurationSec <= 0 {
|
||||||
|
calibDurationSec = 120
|
||||||
|
}
|
||||||
const maxDerateW = 150
|
const maxDerateW = 150
|
||||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||||
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
// When hi-lo ≤ this, the highest verified-stable limit (lo) is used.
|
||||||
@@ -3436,6 +3440,18 @@ func roundTo5W(w int) int {
|
|||||||
return ((w + 2) / 5) * 5
|
return ((w + 2) / 5) * 5
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// meanFanRPM returns the average RPM across a set of fan readings.
// It sums the RPM field of every reading and divides by the number of
// readings. An empty (or nil) slice yields 0, so the division below never
// runs with a zero denominator.
|
||||||
|
func meanFanRPM(fans []FanReading) float64 {
|
||||||
|
if len(fans) == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
var sum float64
|
||||||
|
for _, f := range fans {
|
||||||
|
sum += f.RPM
|
||||||
|
}
|
||||||
|
return sum / float64(len(fans))
|
||||||
|
}
|
||||||
|
|
||||||
func powerBenchDurationSec(profile string) int {
|
func powerBenchDurationSec(profile string) int {
|
||||||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||||||
case NvidiaBenchmarkProfileStability:
|
case NvidiaBenchmarkProfileStability:
|
||||||
@@ -3475,30 +3491,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
// Server power comparison table.
|
// Server power comparison table.
|
||||||
if sp := result.ServerPower; sp != nil {
|
if sp := result.ServerPower; sp != nil {
|
||||||
b.WriteString("## Server vs GPU Power Comparison\n\n")
|
b.WriteString("## Server vs GPU Power Comparison\n\n")
|
||||||
b.WriteString("| Metric | Source | Value |\n")
|
var spRows [][]string
|
||||||
b.WriteString("|--------|--------|-------|\n")
|
spRows = append(spRows, []string{"GPU stable limits sum", "nvidia-smi", fmt.Sprintf("%.0f W", result.PlatformMaxTDPW)})
|
||||||
fmt.Fprintf(&b, "| GPU stable limits sum | nvidia-smi | %.0f W |\n", result.PlatformMaxTDPW)
|
spRows = append(spRows, []string{"GPU actual power sum (p95, last step)", "nvidia-smi", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)})
|
||||||
fmt.Fprintf(&b, "| GPU actual power sum (p95, last step) | nvidia-smi | %.0f W |\n", sp.GPUReportedSumW)
|
|
||||||
if sp.GPUSlotTotalW > 0 {
|
if sp.GPUSlotTotalW > 0 {
|
||||||
fmt.Fprintf(&b, "| GPU PCIe slot power (at peak load) | IPMI SDR | %.0f W |\n", sp.GPUSlotTotalW)
|
spRows = append(spRows, []string{"GPU PCIe slot power (at peak load)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.GPUSlotTotalW)})
|
||||||
}
|
}
|
||||||
if sp.Available {
|
if sp.Available {
|
||||||
fmt.Fprintf(&b, "| Server idle power | IPMI DCMI | %.0f W |\n", sp.IdleW)
|
spRows = append(spRows, []string{"Server idle power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.IdleW)})
|
||||||
fmt.Fprintf(&b, "| Server loaded power | IPMI DCMI | %.0f W |\n", sp.LoadedW)
|
spRows = append(spRows, []string{"Server loaded power", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.LoadedW)})
|
||||||
fmt.Fprintf(&b, "| Server Δ power (loaded − idle) | IPMI DCMI | %.0f W |\n", sp.DeltaW)
|
spRows = append(spRows, []string{"Server Δ power (loaded − idle)", "IPMI DCMI", fmt.Sprintf("%.0f W", sp.DeltaW)})
|
||||||
}
|
}
|
||||||
if sp.PSUInputLoadedW > 0 {
|
if sp.PSUInputLoadedW > 0 {
|
||||||
fmt.Fprintf(&b, "| PSU AC input (idle) | IPMI SDR | %.0f W |\n", sp.PSUInputIdleW)
|
spRows = append(spRows, []string{"PSU AC input (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputIdleW)})
|
||||||
fmt.Fprintf(&b, "| PSU AC input (loaded) | IPMI SDR | %.0f W |\n", sp.PSUInputLoadedW)
|
spRows = append(spRows, []string{"PSU AC input (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUInputLoadedW)})
|
||||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||||
fmt.Fprintf(&b, "| PSU AC input Δ (loaded − idle) | IPMI SDR | %.0f W |\n", psuDelta)
|
spRows = append(spRows, []string{"PSU AC input Δ (loaded − idle)", "IPMI SDR", fmt.Sprintf("%.0f W", psuDelta)})
|
||||||
}
|
}
|
||||||
if sp.PSUOutputLoadedW > 0 {
|
if sp.PSUOutputLoadedW > 0 {
|
||||||
fmt.Fprintf(&b, "| PSU DC output (idle) | IPMI SDR | %.0f W |\n", sp.PSUOutputIdleW)
|
spRows = append(spRows, []string{"PSU DC output (idle)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputIdleW)})
|
||||||
fmt.Fprintf(&b, "| PSU DC output (loaded) | IPMI SDR | %.0f W |\n", sp.PSUOutputLoadedW)
|
spRows = append(spRows, []string{"PSU DC output (loaded)", "IPMI SDR", fmt.Sprintf("%.0f W", sp.PSUOutputLoadedW)})
|
||||||
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
|
if sp.PSUInputLoadedW > 0 && sp.PSUInputIdleW > 0 {
|
||||||
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
|
psuEff := sp.PSUOutputIdleW / sp.PSUInputIdleW * 100
|
||||||
fmt.Fprintf(&b, "| PSU conversion efficiency (idle) | IPMI SDR | %.1f%% |\n", psuEff)
|
spRows = append(spRows, []string{"PSU conversion efficiency (idle)", "IPMI SDR", fmt.Sprintf("%.1f%%", psuEff)})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if sp.Available {
|
if sp.Available {
|
||||||
@@ -3516,7 +3531,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
default:
|
default:
|
||||||
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
|
ratioNote = "✗ significant discrepancy — GPU over-reports TDP vs wall power"
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| Reporting ratio (DCMI Δ / GPU actual) | IPMI DCMI | %.2f — %s |\n", ratio, ratioNote)
|
spRows = append(spRows, []string{"Reporting ratio (DCMI Δ / GPU actual)", "IPMI DCMI", fmt.Sprintf("%.2f — %s", ratio, ratioNote)})
|
||||||
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
|
if sp.PSUInputLoadedW > 0 && sp.GPUReportedSumW > 0 {
|
||||||
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
psuDelta := sp.PSUInputLoadedW - sp.PSUInputIdleW
|
||||||
sdrRatio := psuDelta / sp.GPUReportedSumW
|
sdrRatio := psuDelta / sp.GPUReportedSumW
|
||||||
@@ -3529,11 +3544,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
default:
|
default:
|
||||||
sdrNote = "✗ significant discrepancy"
|
sdrNote = "✗ significant discrepancy"
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| Reporting ratio (SDR PSU Δ / GPU actual) | IPMI SDR | %.2f — %s |\n", sdrRatio, sdrNote)
|
spRows = append(spRows, []string{"Reporting ratio (SDR PSU Δ / GPU actual)", "IPMI SDR", fmt.Sprintf("%.2f — %s", sdrRatio, sdrNote)})
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
b.WriteString("| IPMI availability | — | not available — IPMI not supported or ipmitool not found |\n")
|
spRows = append(spRows, []string{"IPMI availability", "—", "not available — IPMI not supported or ipmitool not found"})
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"Metric", "Source", "Value"}, spRows))
|
||||||
for _, note := range sp.Notes {
|
for _, note := range sp.Notes {
|
||||||
fmt.Fprintf(&b, "\n> %s\n", note)
|
fmt.Fprintf(&b, "\n> %s\n", note)
|
||||||
}
|
}
|
||||||
@@ -3541,10 +3557,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
|
|
||||||
if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 {
|
if len(sp.PSUSlotReadingsIdle) > 0 || len(sp.PSUSlotReadingsLoaded) > 0 {
|
||||||
b.WriteString("## PSU Load Distribution\n\n")
|
b.WriteString("## PSU Load Distribution\n\n")
|
||||||
b.WriteString("| Slot | AC Input (idle) | AC Input (loaded) | DC Output (idle) | DC Output (loaded) | Load Δ | Status |\n")
|
|
||||||
b.WriteString("|------|-----------------|-------------------|------------------|--------------------|--------|--------|\n")
|
|
||||||
|
|
||||||
// collect all slot keys
|
|
||||||
slotSet := map[string]struct{}{}
|
slotSet := map[string]struct{}{}
|
||||||
for k := range sp.PSUSlotReadingsIdle {
|
for k := range sp.PSUSlotReadingsIdle {
|
||||||
slotSet[k] = struct{}{}
|
slotSet[k] = struct{}{}
|
||||||
@@ -3558,17 +3571,18 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
}
|
}
|
||||||
sort.Strings(slots)
|
sort.Strings(slots)
|
||||||
|
|
||||||
|
fmtW := func(v *float64) string {
|
||||||
|
if v == nil {
|
||||||
|
return "—"
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%.0f W", *v)
|
||||||
|
}
|
||||||
|
|
||||||
|
var psuDistRows [][]string
|
||||||
for _, slot := range slots {
|
for _, slot := range slots {
|
||||||
idle := sp.PSUSlotReadingsIdle[slot]
|
idle := sp.PSUSlotReadingsIdle[slot]
|
||||||
loaded := sp.PSUSlotReadingsLoaded[slot]
|
loaded := sp.PSUSlotReadingsLoaded[slot]
|
||||||
|
|
||||||
fmtW := func(v *float64) string {
|
|
||||||
if v == nil {
|
|
||||||
return "—"
|
|
||||||
}
|
|
||||||
return fmt.Sprintf("%.0f W", *v)
|
|
||||||
}
|
|
||||||
|
|
||||||
var deltaStr string
|
var deltaStr string
|
||||||
if idle.InputW != nil && loaded.InputW != nil {
|
if idle.InputW != nil && loaded.InputW != nil {
|
||||||
deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW)
|
deltaStr = fmt.Sprintf("%+.0f W", *loaded.InputW-*idle.InputW)
|
||||||
@@ -3584,13 +3598,14 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
status = "—"
|
status = "—"
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Fprintf(&b, "| %s | %s | %s | %s | %s | %s | %s |\n",
|
psuDistRows = append(psuDistRows, []string{
|
||||||
slot,
|
slot,
|
||||||
fmtW(idle.InputW), fmtW(loaded.InputW),
|
fmtW(idle.InputW), fmtW(loaded.InputW),
|
||||||
fmtW(idle.OutputW), fmtW(loaded.OutputW),
|
fmtW(idle.OutputW), fmtW(loaded.OutputW),
|
||||||
deltaStr, status,
|
deltaStr, status,
|
||||||
)
|
})
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"Slot", "AC Input (idle)", "AC Input (loaded)", "DC Output (idle)", "DC Output (loaded)", "Load Δ", "Status"}, psuDistRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -3602,28 +3617,194 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
// ── Single GPU section ───────────────────────────────────────────────────
|
||||||
b.WriteString("## Recommended Slot Order\n\n")
|
b.WriteString("## Single GPU\n\n")
|
||||||
fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
|
{
|
||||||
}
|
var sgRows [][]string
|
||||||
if len(result.RampSteps) > 0 {
|
for _, gpu := range result.GPUs {
|
||||||
b.WriteString("## Ramp Sequence\n\n")
|
clk := "—"
|
||||||
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
|
mem := "—"
|
||||||
b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
|
temp := "—"
|
||||||
for _, step := range result.RampSteps {
|
pwr := "—"
|
||||||
derated := "-"
|
if gpu.Telemetry != nil {
|
||||||
if step.Derated {
|
clk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz)
|
||||||
derated = "⚠ yes"
|
mem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz)
|
||||||
|
temp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC)
|
||||||
|
pwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW)
|
||||||
}
|
}
|
||||||
serverDelta := "-"
|
serverDelta := "—"
|
||||||
if step.ServerDeltaW > 0 {
|
if gpu.ServerDeltaW > 0 {
|
||||||
serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
|
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
|
fan := "—"
|
||||||
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
|
if gpu.AvgFanRPM > 0 {
|
||||||
|
if gpu.AvgFanDutyCyclePct > 0 {
|
||||||
|
fan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct)
|
||||||
|
} else {
|
||||||
|
fan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sgRows = append(sgRows, []string{
|
||||||
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
|
fmt.Sprintf("%s (%s)", clk, mem),
|
||||||
|
temp,
|
||||||
|
pwr,
|
||||||
|
serverDelta,
|
||||||
|
fan,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"GPU", "Clock MHz (Mem MHz)", "Avg Temp °C", "Power W", "Server Δ W", "Fan RPM (duty%)"}, sgRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
|
fmt.Fprintf(&b, "Recommended slot order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Ramp Sequence ────────────────────────────────────────────────────────
|
||||||
|
// Rows = run number; Cols = per-GPU power (from step telemetry) + aggregates.
|
||||||
|
if len(result.RampSteps) > 0 {
|
||||||
|
b.WriteString("## Ramp Sequence\n\n")
|
||||||
|
|
||||||
|
// Collect all GPU indices that appear across all steps (ordered by first appearance).
|
||||||
|
allGPUIndices := make([]int, 0, len(result.GPUs))
|
||||||
|
seen := map[int]bool{}
|
||||||
|
for _, step := range result.RampSteps {
|
||||||
|
for _, idx := range step.GPUIndices {
|
||||||
|
if !seen[idx] {
|
||||||
|
seen[idx] = true
|
||||||
|
allGPUIndices = append(allGPUIndices, idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var idleW float64
|
||||||
|
if result.ServerPower != nil {
|
||||||
|
idleW = result.ServerPower.IdleW
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff.
|
||||||
|
headers := []string{"Run"}
|
||||||
|
for _, idx := range allGPUIndices {
|
||||||
|
headers = append(headers, fmt.Sprintf("GPU %d W", idx))
|
||||||
|
}
|
||||||
|
headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.")
|
||||||
|
|
||||||
|
var rampRows [][]string
|
||||||
|
for _, step := range result.RampSteps {
|
||||||
|
row := []string{fmt.Sprintf("%d", step.StepIndex)}
|
||||||
|
for _, idx := range allGPUIndices {
|
||||||
|
inStep := false
|
||||||
|
for _, si := range step.GPUIndices {
|
||||||
|
if si == idx {
|
||||||
|
inStep = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !inStep {
|
||||||
|
row = append(row, "—")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpuPwr := "—"
|
||||||
|
if t, ok := step.PerGPUTelemetry[idx]; ok && t != nil && t.AvgPowerW > 0 {
|
||||||
|
gpuPwr = fmt.Sprintf("%.0f", t.AvgPowerW)
|
||||||
|
}
|
||||||
|
row = append(row, gpuPwr)
|
||||||
|
}
|
||||||
|
// Server wall W
|
||||||
|
serverWall := "—"
|
||||||
|
if step.ServerLoadedW > 0 {
|
||||||
|
serverWall = fmt.Sprintf("%.0f", step.ServerLoadedW)
|
||||||
|
}
|
||||||
|
// Per GPU wall W = ServerDeltaW / len(GPUIndices)
|
||||||
|
perGPUWall := "—"
|
||||||
|
if step.ServerDeltaW > 0 && len(step.GPUIndices) > 0 {
|
||||||
|
perGPUWall = fmt.Sprintf("%.0f", step.ServerDeltaW/float64(len(step.GPUIndices)))
|
||||||
|
}
|
||||||
|
// Platform eff. = (ServerLoadedW − idleW) / TotalObservedPowerW
|
||||||
|
platEff := "—"
|
||||||
|
if step.TotalObservedPowerW > 0 {
|
||||||
|
eff := step.ServerDeltaW / step.TotalObservedPowerW
|
||||||
|
if idleW > 0 && step.ServerLoadedW > 0 {
|
||||||
|
eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW
|
||||||
|
}
|
||||||
|
platEff = fmt.Sprintf("%.2f", eff)
|
||||||
|
}
|
||||||
|
row = append(row, serverWall, perGPUWall, platEff)
|
||||||
|
rampRows = append(rampRows, row)
|
||||||
|
}
|
||||||
|
b.WriteString(fmtMDTable(headers, rampRows))
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── PSU Performance ───────────────────────────────────────────────────────
|
||||||
|
{
|
||||||
|
// Collect all PSU slot keys from any ramp step.
|
||||||
|
psuSlotSet := map[string]struct{}{}
|
||||||
|
for _, step := range result.RampSteps {
|
||||||
|
for k := range step.PSUSlotReadings {
|
||||||
|
psuSlotSet[k] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(psuSlotSet) > 0 {
|
||||||
|
b.WriteString("## PSU Performance\n\n")
|
||||||
|
psuSlots := make([]string, 0, len(psuSlotSet))
|
||||||
|
for k := range psuSlotSet {
|
||||||
|
psuSlots = append(psuSlots, k)
|
||||||
|
}
|
||||||
|
sort.Strings(psuSlots)
|
||||||
|
|
||||||
|
var idleW float64
|
||||||
|
if result.ServerPower != nil {
|
||||||
|
idleW = result.ServerPower.IdleW
|
||||||
|
}
|
||||||
|
|
||||||
|
psuHeaders := []string{"Run"}
|
||||||
|
for _, slot := range psuSlots {
|
||||||
|
psuHeaders = append(psuHeaders, fmt.Sprintf("PSU %s W", slot))
|
||||||
|
}
|
||||||
|
psuHeaders = append(psuHeaders, "PSU Total W", "Platform eff.", "Fan RPM (duty%)")
|
||||||
|
|
||||||
|
var psuRows [][]string
|
||||||
|
for _, step := range result.RampSteps {
|
||||||
|
row := []string{fmt.Sprintf("%d", step.StepIndex)}
|
||||||
|
var psuTotal float64
|
||||||
|
for _, slot := range psuSlots {
|
||||||
|
sp, ok := step.PSUSlotReadings[slot]
|
||||||
|
if !ok || sp.InputW == nil {
|
||||||
|
row = append(row, "—")
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
row = append(row, fmt.Sprintf("%.0f", *sp.InputW))
|
||||||
|
psuTotal += *sp.InputW
|
||||||
|
}
|
||||||
|
totalStr := "—"
|
||||||
|
if psuTotal > 0 {
|
||||||
|
totalStr = fmt.Sprintf("%.0f", psuTotal)
|
||||||
|
}
|
||||||
|
platEff := "—"
|
||||||
|
if step.TotalObservedPowerW > 0 {
|
||||||
|
eff := step.ServerDeltaW / step.TotalObservedPowerW
|
||||||
|
if idleW > 0 && step.ServerLoadedW > 0 {
|
||||||
|
eff = (step.ServerLoadedW - idleW) / step.TotalObservedPowerW
|
||||||
|
}
|
||||||
|
platEff = fmt.Sprintf("%.2f", eff)
|
||||||
|
}
|
||||||
|
fan := "—"
|
||||||
|
if step.AvgFanRPM > 0 {
|
||||||
|
if step.AvgFanDutyCyclePct > 0 {
|
||||||
|
fan = fmt.Sprintf("%.0f (%.0f%%)", step.AvgFanRPM, step.AvgFanDutyCyclePct)
|
||||||
|
} else {
|
||||||
|
fan = fmt.Sprintf("%.0f", step.AvgFanRPM)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
row = append(row, totalStr, platEff, fan)
|
||||||
|
psuRows = append(psuRows, row)
|
||||||
|
}
|
||||||
|
b.WriteString(fmtMDTable(psuHeaders, psuRows))
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── PSU Issues ────────────────────────────────────────────────────────────
|
// ── PSU Issues ────────────────────────────────────────────────────────────
|
||||||
if len(result.PSUIssues) > 0 {
|
if len(result.PSUIssues) > 0 {
|
||||||
b.WriteString("## PSU Issues\n\n")
|
b.WriteString("## PSU Issues\n\n")
|
||||||
@@ -3646,8 +3827,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
totalDefault += gpu.DefaultPowerLimitW
|
totalDefault += gpu.DefaultPowerLimitW
|
||||||
totalStable += stable
|
totalStable += stable
|
||||||
}
|
}
|
||||||
b.WriteString("| GPU | Default TDP | Single-card limit | Stable limit | Realization | Derated |\n")
|
var pdRows [][]string
|
||||||
b.WriteString("|-----|-------------|-------------------|--------------|-------------|----------|\n")
|
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
stable := gpu.StablePowerLimitW
|
stable := gpu.StablePowerLimitW
|
||||||
if stable <= 0 {
|
if stable <= 0 {
|
||||||
@@ -3661,15 +3841,29 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
if gpu.Derated {
|
if gpu.Derated {
|
||||||
derated = "⚠ yes"
|
derated = "⚠ yes"
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| GPU %d | %.0f W | %.0f W | %.0f W | %s | %s |\n",
|
pdRows = append(pdRows, []string{
|
||||||
gpu.Index, gpu.DefaultPowerLimitW, gpu.AppliedPowerLimitW, stable, realization, derated)
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
|
fmt.Sprintf("%.0f W", gpu.DefaultPowerLimitW),
|
||||||
|
fmt.Sprintf("%.0f W", gpu.AppliedPowerLimitW),
|
||||||
|
fmt.Sprintf("%.0f W", stable),
|
||||||
|
realization,
|
||||||
|
derated,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
platformReal := "-"
|
platformReal := "-"
|
||||||
if totalDefault > 0 && totalStable > 0 {
|
if totalDefault > 0 && totalStable > 0 {
|
||||||
platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100)
|
platformReal = fmt.Sprintf("%.1f%%", totalStable/totalDefault*100)
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| **Platform** | **%.0f W** | — | **%.0f W** | **%s** | |\n\n",
|
pdRows = append(pdRows, []string{
|
||||||
totalDefault, totalStable, platformReal)
|
"**Platform**",
|
||||||
|
fmt.Sprintf("**%.0f W**", totalDefault),
|
||||||
|
"—",
|
||||||
|
fmt.Sprintf("**%.0f W**", totalStable),
|
||||||
|
fmt.Sprintf("**%s**", platformReal),
|
||||||
|
"",
|
||||||
|
})
|
||||||
|
b.WriteString(fmtMDTable([]string{"GPU", "Default TDP", "Single-card limit", "Stable limit", "Realization", "Derated"}, pdRows))
|
||||||
|
b.WriteString("\n")
|
||||||
|
|
||||||
// Balance across GPUs — only meaningful with 2+ GPUs.
|
// Balance across GPUs — only meaningful with 2+ GPUs.
|
||||||
if len(result.GPUs) > 1 {
|
if len(result.GPUs) > 1 {
|
||||||
@@ -3710,9 +3904,6 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
// Ramp scalability table — power efficiency of adding each GPU.
|
// Ramp scalability table — power efficiency of adding each GPU.
|
||||||
if len(result.RampSteps) > 1 {
|
if len(result.RampSteps) > 1 {
|
||||||
b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n")
|
b.WriteString("**Ramp power scalability** (stable TDP per step):\n\n")
|
||||||
b.WriteString("| Step | GPUs | Cumulative stable TDP | Incremental | Efficiency vs GPU 1 |\n")
|
|
||||||
b.WriteString("|------|------|-----------------------|-------------|---------------------|\n")
|
|
||||||
// First GPU stable TDP as the reference unit for efficiency.
|
|
||||||
var firstStable float64
|
var firstStable float64
|
||||||
if len(result.GPUs) > 0 {
|
if len(result.GPUs) > 0 {
|
||||||
firstStable = result.GPUs[0].StablePowerLimitW
|
firstStable = result.GPUs[0].StablePowerLimitW
|
||||||
@@ -3721,6 +3912,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
var prevCumulative float64
|
var prevCumulative float64
|
||||||
|
var scalRows [][]string
|
||||||
for _, step := range result.RampSteps {
|
for _, step := range result.RampSteps {
|
||||||
var cumulative float64
|
var cumulative float64
|
||||||
for _, gpuIdx := range step.GPUIndices {
|
for _, gpuIdx := range step.GPUIndices {
|
||||||
@@ -3740,40 +3932,104 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
if step.StepIndex > 1 && firstStable > 0 {
|
if step.StepIndex > 1 && firstStable > 0 {
|
||||||
efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100)
|
efficiency = fmt.Sprintf("%.1f%%", incremental/firstStable*100)
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %s |\n",
|
scalRows = append(scalRows, []string{
|
||||||
step.StepIndex, joinIndexList(step.GPUIndices), cumulative, incremental, efficiency)
|
fmt.Sprintf("%d", step.StepIndex),
|
||||||
|
joinIndexList(step.GPUIndices),
|
||||||
|
fmt.Sprintf("%.0f W", cumulative),
|
||||||
|
fmt.Sprintf("%.0f W", incremental),
|
||||||
|
efficiency,
|
||||||
|
})
|
||||||
prevCumulative = cumulative
|
prevCumulative = cumulative
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"Step", "GPUs", "Cumulative stable TDP", "Incremental", "Efficiency vs GPU 1"}, scalRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
b.WriteString("## Per-Slot Results\n\n")
|
// ── Per-GPU sections ──────────────────────────────────────────────────────
|
||||||
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
|
var lastStep *NvidiaPowerBenchStep
|
||||||
b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
|
if n := len(result.RampSteps); n > 0 {
|
||||||
for _, gpu := range result.GPUs {
|
lastStep = &result.RampSteps[n-1]
|
||||||
stableLimit := "-"
|
|
||||||
if gpu.StablePowerLimitW > 0 {
|
|
||||||
if gpu.Derated {
|
|
||||||
stableLimit = fmt.Sprintf("%.0f W ⚠", gpu.StablePowerLimitW)
|
|
||||||
} else {
|
|
||||||
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
serverDelta := "-"
|
|
||||||
if gpu.ServerDeltaW > 0 {
|
|
||||||
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
|
|
||||||
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
|
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
|
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
|
||||||
|
|
||||||
|
// Transposed comparison table: Single Run vs All GPU Run.
|
||||||
|
singleClk := "—"
|
||||||
|
singleMem := "—"
|
||||||
|
singleTemp := "—"
|
||||||
|
singlePwr := "—"
|
||||||
|
singleWall := "—"
|
||||||
|
singleFan := "—"
|
||||||
|
if gpu.Telemetry != nil {
|
||||||
|
singleClk = fmt.Sprintf("%.0f", gpu.Telemetry.AvgGraphicsClockMHz)
|
||||||
|
singleMem = fmt.Sprintf("%.0f", gpu.Telemetry.AvgMemoryClockMHz)
|
||||||
|
singleTemp = fmt.Sprintf("%.1f", gpu.Telemetry.AvgTempC)
|
||||||
|
singlePwr = fmt.Sprintf("%.0f W", gpu.Telemetry.AvgPowerW)
|
||||||
|
}
|
||||||
|
if gpu.ServerDeltaW > 0 {
|
||||||
|
singleWall = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
|
||||||
|
}
|
||||||
|
if gpu.AvgFanRPM > 0 {
|
||||||
|
if gpu.AvgFanDutyCyclePct > 0 {
|
||||||
|
singleFan = fmt.Sprintf("%.0f RPM (%.0f%%)", gpu.AvgFanRPM, gpu.AvgFanDutyCyclePct)
|
||||||
|
} else {
|
||||||
|
singleFan = fmt.Sprintf("%.0f RPM", gpu.AvgFanRPM)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
allClk := "—"
|
||||||
|
allMem := "—"
|
||||||
|
allTemp := "—"
|
||||||
|
allPwr := "—"
|
||||||
|
allWall := "—"
|
||||||
|
allFan := "—"
|
||||||
|
if lastStep != nil {
|
||||||
|
if t, ok := lastStep.PerGPUTelemetry[gpu.Index]; ok && t != nil {
|
||||||
|
allClk = fmt.Sprintf("%.0f", t.AvgGraphicsClockMHz)
|
||||||
|
allMem = fmt.Sprintf("%.0f", t.AvgMemoryClockMHz)
|
||||||
|
allTemp = fmt.Sprintf("%.1f", t.AvgTempC)
|
||||||
|
allPwr = fmt.Sprintf("%.0f W", t.AvgPowerW)
|
||||||
|
}
|
||||||
|
if lastStep.ServerDeltaW > 0 && len(lastStep.GPUIndices) > 0 {
|
||||||
|
allWall = fmt.Sprintf("%.0f W", lastStep.ServerDeltaW/float64(len(lastStep.GPUIndices)))
|
||||||
|
}
|
||||||
|
if lastStep.AvgFanRPM > 0 {
|
||||||
|
if lastStep.AvgFanDutyCyclePct > 0 {
|
||||||
|
allFan = fmt.Sprintf("%.0f RPM (%.0f%%)", lastStep.AvgFanRPM, lastStep.AvgFanDutyCyclePct)
|
||||||
|
} else {
|
||||||
|
allFan = fmt.Sprintf("%.0f RPM", lastStep.AvgFanRPM)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tableHeaders := []string{"", "Single Run"}
|
||||||
|
if lastStep != nil {
|
||||||
|
tableHeaders = append(tableHeaders, "All GPU Run")
|
||||||
|
}
|
||||||
|
compRows := [][]string{
|
||||||
|
{"Clock MHz (Mem MHz)", fmt.Sprintf("%s (%s)", singleClk, singleMem)},
|
||||||
|
{"Avg Temp °C", singleTemp},
|
||||||
|
{"Power W", singlePwr},
|
||||||
|
{"Per GPU wall W", singleWall},
|
||||||
|
{"Fan RPM (duty%)", singleFan},
|
||||||
|
}
|
||||||
|
if lastStep != nil {
|
||||||
|
compRows[0] = append(compRows[0], fmt.Sprintf("%s (%s)", allClk, allMem))
|
||||||
|
compRows[1] = append(compRows[1], allTemp)
|
||||||
|
compRows[2] = append(compRows[2], allPwr)
|
||||||
|
compRows[3] = append(compRows[3], allWall)
|
||||||
|
compRows[4] = append(compRows[4], allFan)
|
||||||
|
}
|
||||||
|
b.WriteString(fmtMDTable(tableHeaders, compRows))
|
||||||
|
b.WriteString("\n")
|
||||||
|
|
||||||
for _, note := range gpu.Notes {
|
for _, note := range gpu.Notes {
|
||||||
fmt.Fprintf(&b, "- %s\n", note)
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
if len(gpu.Notes) > 0 {
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
@@ -3860,7 +4116,6 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
OverallStatus: "OK",
|
OverallStatus: "OK",
|
||||||
}
|
}
|
||||||
durationSec := powerBenchDurationSec(opts.Profile)
|
durationSec := powerBenchDurationSec(opts.Profile)
|
||||||
_ = durationSec
|
|
||||||
|
|
||||||
// Sample IPMI idle power before any GPU load.
|
// Sample IPMI idle power before any GPU load.
|
||||||
var serverIdleW float64
|
var serverIdleW float64
|
||||||
@@ -3894,7 +4149,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
ipmiSingleDone <- w
|
ipmiSingleDone <- w
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
|
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||||
ipmiSingleCancel()
|
ipmiSingleCancel()
|
||||||
if w, ok := <-ipmiSingleDone; ok {
|
if w, ok := <-ipmiSingleDone; ok {
|
||||||
@@ -3947,6 +4202,12 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
t := summarizeBenchmarkTelemetry(calib.MetricRows)
|
t := summarizeBenchmarkTelemetry(calib.MetricRows)
|
||||||
gpu.Telemetry = &t
|
gpu.Telemetry = &t
|
||||||
}
|
}
|
||||||
|
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
||||||
|
gpu.AvgFanRPM = meanFanRPM(fans)
|
||||||
|
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
|
||||||
|
gpu.AvgFanDutyCyclePct = duty
|
||||||
|
}
|
||||||
|
}
|
||||||
gpus = append(gpus, gpu)
|
gpus = append(gpus, gpu)
|
||||||
}
|
}
|
||||||
sort.Slice(gpus, func(i, j int) bool {
|
sort.Slice(gpus, func(i, j int) bool {
|
||||||
@@ -4077,7 +4338,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
ipmiStepDone <- w
|
ipmiStepDone <- w
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
|
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep, durationSec)
|
||||||
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||||
ipmiStepCancel()
|
ipmiStepCancel()
|
||||||
var stepIPMILoadedW float64
|
var stepIPMILoadedW float64
|
||||||
@@ -4159,6 +4420,29 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Per-step PSU slot snapshot.
|
||||||
|
sdrStep := sampleIPMISDRPowerSensors()
|
||||||
|
if len(sdrStep.PSUSlots) > 0 {
|
||||||
|
ramp.PSUSlotReadings = sdrStep.PSUSlots
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fan state at end of ramp step.
|
||||||
|
if fans, err := sampleFanSpeeds(); err == nil && len(fans) > 0 {
|
||||||
|
ramp.AvgFanRPM = meanFanRPM(fans)
|
||||||
|
if duty, ok, _ := sampleFanDutyCyclePctFromFans(fans); ok {
|
||||||
|
ramp.AvgFanDutyCyclePct = duty
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Per-GPU telemetry from this ramp step's calibration.
|
||||||
|
ramp.PerGPUTelemetry = make(map[int]*BenchmarkTelemetrySummary, len(subset))
|
||||||
|
for _, gpuIdx := range subset {
|
||||||
|
if c, ok := stepCalib[gpuIdx]; ok {
|
||||||
|
s := c.Summary
|
||||||
|
ramp.PerGPUTelemetry[gpuIdx] = &s
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
result.RampSteps = append(result.RampSteps, ramp)
|
result.RampSteps = append(result.RampSteps, ramp)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -89,136 +89,159 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
|
|
||||||
// Perspective 1: Compatibility — hard stops
|
// Perspective 1: Compatibility — hard stops
|
||||||
b.WriteString("### 1. Compatibility\n\n")
|
b.WriteString("### 1. Compatibility\n\n")
|
||||||
b.WriteString("| GPU | Thermal throttle | Fan duty at throttle | ECC uncorr | Status |\n")
|
{
|
||||||
b.WriteString("|-----|------------------|----------------------|------------|--------|\n")
|
var rows [][]string
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
thermalThrottle := "-"
|
thermalThrottle := "-"
|
||||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
thermalThrottle = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||||
|
}
|
||||||
|
fanAtThrottle := "-"
|
||||||
|
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
|
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||||
|
}
|
||||||
|
ecc := "-"
|
||||||
|
if gpu.ECC.Uncorrected > 0 {
|
||||||
|
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
||||||
|
}
|
||||||
|
compatStatus := "✓ OK"
|
||||||
|
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
||||||
|
compatStatus = "⛔ HARD STOP"
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), thermalThrottle, fanAtThrottle, ecc, compatStatus})
|
||||||
}
|
}
|
||||||
fanAtThrottle := "-"
|
b.WriteString(fmtMDTable([]string{"GPU", "Thermal throttle", "Fan duty at throttle", "ECC uncorr", "Status"}, rows))
|
||||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && gpu.Scores.ThermalThrottlePct > 0 {
|
b.WriteString("\n")
|
||||||
fanAtThrottle = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
|
||||||
}
|
|
||||||
ecc := "-"
|
|
||||||
if gpu.ECC.Uncorrected > 0 {
|
|
||||||
ecc = fmt.Sprintf("⛔ %d", gpu.ECC.Uncorrected)
|
|
||||||
}
|
|
||||||
compatStatus := "✓ OK"
|
|
||||||
if gpu.ECC.Uncorrected > 0 || (gpu.Scores.ThermalThrottlePct > 0 && result.Cooling != nil && result.Cooling.FanDutyCycleAvailable && result.Cooling.P95FanDutyCyclePct < 95) {
|
|
||||||
compatStatus = "⛔ HARD STOP"
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
|
||||||
gpu.Index, thermalThrottle, fanAtThrottle, ecc, compatStatus)
|
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
|
||||||
|
|
||||||
// Perspective 2: Thermal headroom
|
// Perspective 2: Thermal headroom
|
||||||
b.WriteString("### 2. Thermal Headroom\n\n")
|
b.WriteString("### 2. Thermal Headroom\n\n")
|
||||||
b.WriteString("| GPU | p95 temp | Slowdown limit | Shutdown limit | Headroom | Thermal throttle | Status |\n")
|
{
|
||||||
b.WriteString("|-----|----------|----------------|----------------|----------|------------------|--------|\n")
|
var rows [][]string
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
shutdownTemp := gpu.ShutdownTempC
|
shutdownTemp := gpu.ShutdownTempC
|
||||||
if shutdownTemp <= 0 {
|
if shutdownTemp <= 0 {
|
||||||
shutdownTemp = 90
|
shutdownTemp = 90
|
||||||
|
}
|
||||||
|
slowdownTemp := gpu.SlowdownTempC
|
||||||
|
if slowdownTemp <= 0 {
|
||||||
|
slowdownTemp = 80
|
||||||
|
}
|
||||||
|
headroom := gpu.Scores.TempHeadroomC
|
||||||
|
thermalStatus := "✓ OK"
|
||||||
|
switch {
|
||||||
|
case headroom < 10:
|
||||||
|
thermalStatus = "⛔ CRITICAL"
|
||||||
|
case gpu.Steady.P95TempC >= slowdownTemp:
|
||||||
|
thermalStatus = "⚠ WARNING"
|
||||||
|
}
|
||||||
|
throttlePct := "-"
|
||||||
|
if gpu.Scores.ThermalThrottlePct > 0 {
|
||||||
|
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{
|
||||||
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
|
fmt.Sprintf("%.1f°C", gpu.Steady.P95TempC),
|
||||||
|
fmt.Sprintf("%.0f°C", slowdownTemp),
|
||||||
|
fmt.Sprintf("%.0f°C", shutdownTemp),
|
||||||
|
fmt.Sprintf("%.1f°C", headroom),
|
||||||
|
throttlePct,
|
||||||
|
thermalStatus,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
slowdownTemp := gpu.SlowdownTempC
|
b.WriteString(fmtMDTable([]string{"GPU", "p95 temp", "Slowdown limit", "Shutdown limit", "Headroom", "Thermal throttle", "Status"}, rows))
|
||||||
if slowdownTemp <= 0 {
|
b.WriteString("\n")
|
||||||
slowdownTemp = 80
|
|
||||||
}
|
|
||||||
headroom := gpu.Scores.TempHeadroomC
|
|
||||||
thermalStatus := "✓ OK"
|
|
||||||
switch {
|
|
||||||
case headroom < 10:
|
|
||||||
thermalStatus = "⛔ CRITICAL"
|
|
||||||
case gpu.Steady.P95TempC >= slowdownTemp:
|
|
||||||
thermalStatus = "⚠ WARNING"
|
|
||||||
}
|
|
||||||
throttlePct := "-"
|
|
||||||
if gpu.Scores.ThermalThrottlePct > 0 {
|
|
||||||
throttlePct = fmt.Sprintf("%.1f%%", gpu.Scores.ThermalThrottlePct)
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "| GPU %d | %.1f°C | %.0f°C | %.0f°C | %.1f°C | %s | %s |\n",
|
|
||||||
gpu.Index, gpu.Steady.P95TempC, slowdownTemp, shutdownTemp, headroom, throttlePct, thermalStatus)
|
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
|
||||||
|
|
||||||
// Perspective 3: Power delivery
|
// Perspective 3: Power delivery
|
||||||
b.WriteString("### 3. Power Delivery\n\n")
|
b.WriteString("### 3. Power Delivery\n\n")
|
||||||
b.WriteString("| GPU | Power cap throttle | Power stability | Fan duty (p95) | Status |\n")
|
{
|
||||||
b.WriteString("|-----|-------------------|-----------------|----------------|--------|\n")
|
var rows [][]string
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
powerCap := "-"
|
powerCap := "-"
|
||||||
if gpu.Scores.PowerCapThrottlePct > 0 {
|
if gpu.Scores.PowerCapThrottlePct > 0 {
|
||||||
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
powerCap = fmt.Sprintf("%.1f%%", gpu.Scores.PowerCapThrottlePct)
|
||||||
|
}
|
||||||
|
fanDuty := "-"
|
||||||
|
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
||||||
|
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
||||||
|
}
|
||||||
|
powerStatus := "✓ OK"
|
||||||
|
if gpu.Scores.PowerCapThrottlePct > 5 {
|
||||||
|
powerStatus = "⚠ POWER LIMITED"
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{
|
||||||
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
|
powerCap,
|
||||||
|
fmt.Sprintf("%.1f", gpu.Scores.PowerSustainScore),
|
||||||
|
fanDuty,
|
||||||
|
powerStatus,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
fanDuty := "-"
|
b.WriteString(fmtMDTable([]string{"GPU", "Power cap throttle", "Power stability", "Fan duty (p95)", "Status"}, rows))
|
||||||
if result.Cooling != nil && result.Cooling.FanDutyCycleAvailable {
|
b.WriteString("\n")
|
||||||
fanDuty = fmt.Sprintf("%.0f%%", result.Cooling.P95FanDutyCyclePct)
|
|
||||||
}
|
|
||||||
powerStatus := "✓ OK"
|
|
||||||
if gpu.Scores.PowerCapThrottlePct > 5 {
|
|
||||||
powerStatus = "⚠ POWER LIMITED"
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "| GPU %d | %s | %.1f | %s | %s |\n",
|
|
||||||
gpu.Index, powerCap, gpu.Scores.PowerSustainScore, fanDuty, powerStatus)
|
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
|
||||||
|
|
||||||
// Perspective 4: Performance
|
// Perspective 4: Performance
|
||||||
b.WriteString("### 4. Performance\n\n")
|
b.WriteString("### 4. Performance\n\n")
|
||||||
b.WriteString("| GPU | Compute TOPS | Synthetic | Mixed | Mixed Eff. | TOPS/SM/GHz |\n")
|
{
|
||||||
b.WriteString("|-----|--------------|-----------|-------|------------|-------------|\n")
|
var rows [][]string
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
synthetic := "-"
|
synthetic := "-"
|
||||||
if gpu.Scores.SyntheticScore > 0 {
|
if gpu.Scores.SyntheticScore > 0 {
|
||||||
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
synthetic = fmt.Sprintf("%.2f", gpu.Scores.SyntheticScore)
|
||||||
|
}
|
||||||
|
mixed := "-"
|
||||||
|
if gpu.Scores.MixedScore > 0 {
|
||||||
|
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
||||||
|
}
|
||||||
|
mixedEff := "-"
|
||||||
|
if gpu.Scores.MixedEfficiency > 0 {
|
||||||
|
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
||||||
|
}
|
||||||
|
topsPerSM := "-"
|
||||||
|
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||||
|
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{
|
||||||
|
fmt.Sprintf("GPU %d", gpu.Index),
|
||||||
|
fmt.Sprintf("**%.2f**", gpu.Scores.CompositeScore),
|
||||||
|
synthetic, mixed, mixedEff, topsPerSM,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
mixed := "-"
|
b.WriteString(fmtMDTable([]string{"GPU", "Compute TOPS", "Synthetic", "Mixed", "Mixed Eff.", "TOPS/SM/GHz"}, rows))
|
||||||
if gpu.Scores.MixedScore > 0 {
|
if len(result.PerformanceRampSteps) > 0 {
|
||||||
mixed = fmt.Sprintf("%.2f", gpu.Scores.MixedScore)
|
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
||||||
}
|
}
|
||||||
mixedEff := "-"
|
b.WriteString("\n")
|
||||||
if gpu.Scores.MixedEfficiency > 0 {
|
|
||||||
mixedEff = fmt.Sprintf("%.1f%%", gpu.Scores.MixedEfficiency*100)
|
|
||||||
}
|
|
||||||
topsPerSM := "-"
|
|
||||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
|
||||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "| GPU %d | **%.2f** | %s | %s | %s | %s |\n",
|
|
||||||
gpu.Index, gpu.Scores.CompositeScore, synthetic, mixed, mixedEff, topsPerSM)
|
|
||||||
}
|
}
|
||||||
if len(result.PerformanceRampSteps) > 0 {
|
|
||||||
fmt.Fprintf(&b, "\n**Platform power score (scalability):** %.1f%%\n", result.PlatformPowerScore)
|
|
||||||
}
|
|
||||||
b.WriteString("\n")
|
|
||||||
|
|
||||||
// Perspective 5: Anomaly flags
|
// Perspective 5: Anomaly flags
|
||||||
b.WriteString("### 5. Anomalies\n\n")
|
b.WriteString("### 5. Anomalies\n\n")
|
||||||
b.WriteString("| GPU | ECC corrected | Sync boost throttle | Power instability | Thermal instability |\n")
|
{
|
||||||
b.WriteString("|-----|---------------|---------------------|-------------------|---------------------|\n")
|
var rows [][]string
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
eccCorr := "-"
|
eccCorr := "-"
|
||||||
if gpu.ECC.Corrected > 0 {
|
if gpu.ECC.Corrected > 0 {
|
||||||
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
eccCorr = fmt.Sprintf("⚠ %d", gpu.ECC.Corrected)
|
||||||
|
}
|
||||||
|
syncBoost := "-"
|
||||||
|
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
||||||
|
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
||||||
|
}
|
||||||
|
powerVar := "OK"
|
||||||
|
if gpu.Scores.PowerSustainScore < 70 {
|
||||||
|
powerVar = "⚠ unstable"
|
||||||
|
}
|
||||||
|
thermalVar := "OK"
|
||||||
|
if gpu.Scores.ThermalSustainScore < 70 {
|
||||||
|
thermalVar = "⚠ unstable"
|
||||||
|
}
|
||||||
|
rows = append(rows, []string{fmt.Sprintf("GPU %d", gpu.Index), eccCorr, syncBoost, powerVar, thermalVar})
|
||||||
}
|
}
|
||||||
syncBoost := "-"
|
b.WriteString(fmtMDTable([]string{"GPU", "ECC corrected", "Sync boost throttle", "Power instability", "Thermal instability"}, rows))
|
||||||
if gpu.Scores.SyncBoostThrottlePct > 0 {
|
b.WriteString("\n")
|
||||||
syncBoost = fmt.Sprintf("%.1f%%", gpu.Scores.SyncBoostThrottlePct)
|
|
||||||
}
|
|
||||||
powerVar := "OK"
|
|
||||||
if gpu.Scores.PowerSustainScore < 70 {
|
|
||||||
powerVar = "⚠ unstable"
|
|
||||||
}
|
|
||||||
thermalVar := "OK"
|
|
||||||
if gpu.Scores.ThermalSustainScore < 70 {
|
|
||||||
thermalVar = "⚠ unstable"
|
|
||||||
}
|
|
||||||
fmt.Fprintf(&b, "| GPU %d | %s | %s | %s | %s |\n",
|
|
||||||
gpu.Index, eccCorr, syncBoost, powerVar, thermalVar)
|
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
|
||||||
|
|
||||||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||||
b.WriteString("## Per-GPU Details\n\n")
|
b.WriteString("## Per-GPU Details\n\n")
|
||||||
@@ -263,12 +286,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
// Steady-state telemetry
|
// Steady-state telemetry
|
||||||
if benchmarkTelemetryAvailable(gpu.Steady) {
|
if benchmarkTelemetryAvailable(gpu.Steady) {
|
||||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
b.WriteString(fmtMDTable(
|
||||||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
[]string{"", "Avg", "P95"},
|
||||||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
[][]string{
|
||||||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
{"Power", fmt.Sprintf("%.1f W", gpu.Steady.AvgPowerW), fmt.Sprintf("%.1f W", gpu.Steady.P95PowerW)},
|
||||||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
{"Temperature", fmt.Sprintf("%.1f °C", gpu.Steady.AvgTempC), fmt.Sprintf("%.1f °C", gpu.Steady.P95TempC)},
|
||||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
{"GPU clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgGraphicsClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95GraphicsClockMHz)},
|
||||||
|
{"Memory clock", fmt.Sprintf("%.0f MHz", gpu.Steady.AvgMemoryClockMHz), fmt.Sprintf("%.0f MHz", gpu.Steady.P95MemoryClockMHz)},
|
||||||
|
{"GPU utilisation", fmt.Sprintf("%.1f %%", gpu.Steady.AvgUsagePct), "—"},
|
||||||
|
},
|
||||||
|
))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
} else {
|
} else {
|
||||||
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
b.WriteString("**Steady-state telemetry:** unavailable\n\n")
|
||||||
@@ -277,7 +304,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
// Per-precision stability phases.
|
// Per-precision stability phases.
|
||||||
if len(gpu.PrecisionSteady) > 0 {
|
if len(gpu.PrecisionSteady) > 0 {
|
||||||
b.WriteString("**Per-precision stability:**\n\n")
|
b.WriteString("**Per-precision stability:**\n\n")
|
||||||
b.WriteString("| Precision | Status | Clock CV | Power CV | Clock Drift | ECC corr | ECC uncorr |\n|-----------|--------|----------|----------|-------------|----------|------------|\n")
|
var precRows [][]string
|
||||||
for _, p := range gpu.PrecisionSteady {
|
for _, p := range gpu.PrecisionSteady {
|
||||||
eccCorr := "—"
|
eccCorr := "—"
|
||||||
eccUncorr := "—"
|
eccUncorr := "—"
|
||||||
@@ -289,10 +316,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if strings.TrimSpace(status) == "" {
|
if strings.TrimSpace(status) == "" {
|
||||||
status = "OK"
|
status = "OK"
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| %s | %s | %.1f%% | %.1f%% | %.1f%% | %s | %s |\n",
|
precRows = append(precRows, []string{
|
||||||
p.Precision, status, p.Steady.ClockCVPct, p.Steady.PowerCVPct, p.Steady.ClockDriftPct,
|
p.Precision, status,
|
||||||
eccCorr, eccUncorr)
|
fmt.Sprintf("%.1f%%", p.Steady.ClockCVPct),
|
||||||
|
fmt.Sprintf("%.1f%%", p.Steady.PowerCVPct),
|
||||||
|
fmt.Sprintf("%.1f%%", p.Steady.ClockDriftPct),
|
||||||
|
eccCorr, eccUncorr,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"Precision", "Status", "Clock CV", "Power CV", "Clock Drift", "ECC corr", "ECC uncorr"}, precRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
} else {
|
} else {
|
||||||
// Legacy: show combined-window variance.
|
// Legacy: show combined-window variance.
|
||||||
@@ -315,16 +347,22 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
// Precision results
|
// Precision results
|
||||||
if len(gpu.PrecisionResults) > 0 {
|
if len(gpu.PrecisionResults) > 0 {
|
||||||
b.WriteString("**Precision results:**\n\n")
|
b.WriteString("**Precision results:**\n\n")
|
||||||
b.WriteString("| Precision | TOPS (raw) | Weight | TOPS (fp32-eq) | Lanes | Iterations |\n|-----------|------------|--------|----------------|-------|------------|\n")
|
var presRows [][]string
|
||||||
for _, p := range gpu.PrecisionResults {
|
for _, p := range gpu.PrecisionResults {
|
||||||
if p.Supported {
|
if p.Supported {
|
||||||
weightStr := fmt.Sprintf("×%.3g", p.Weight)
|
presRows = append(presRows, []string{
|
||||||
fmt.Fprintf(&b, "| %s | %.2f | %s | %.2f | %d | %d |\n",
|
p.Name,
|
||||||
p.Name, p.TeraOpsPerSec, weightStr, p.WeightedTeraOpsPerSec, p.Lanes, p.Iterations)
|
fmt.Sprintf("%.2f", p.TeraOpsPerSec),
|
||||||
|
fmt.Sprintf("×%.3g", p.Weight),
|
||||||
|
fmt.Sprintf("%.2f", p.WeightedTeraOpsPerSec),
|
||||||
|
fmt.Sprintf("%d", p.Lanes),
|
||||||
|
fmt.Sprintf("%d", p.Iterations),
|
||||||
|
})
|
||||||
} else {
|
} else {
|
||||||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — | — | — |\n", p.Name)
|
presRows = append(presRows, []string{p.Name, "— (unsupported)", "—", "—", "—", "—"})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"Precision", "TOPS (raw)", "Weight", "TOPS (fp32-eq)", "Lanes", "Iterations"}, presRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -346,9 +384,13 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
b.WriteString("## Interconnect (NCCL)\n\n")
|
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||||
if result.Interconnect.Supported {
|
if result.Interconnect.Supported {
|
||||||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
b.WriteString(fmtMDTable(
|
||||||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
[]string{"Metric", "Avg", "Max"},
|
||||||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
[][]string{
|
||||||
|
{"Alg BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgAlgBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxAlgBWGBps)},
|
||||||
|
{"Bus BW", fmt.Sprintf("%.1f GB/s", result.Interconnect.AvgBusBWGBps), fmt.Sprintf("%.1f GB/s", result.Interconnect.MaxBusBWGBps)},
|
||||||
|
},
|
||||||
|
))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
for _, note := range result.Interconnect.Notes {
|
for _, note := range result.Interconnect.Notes {
|
||||||
@@ -365,14 +407,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if !sp.Available {
|
if !sp.Available {
|
||||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||||
} else {
|
} else {
|
||||||
b.WriteString("| | Value |\n|---|---|\n")
|
spRows := [][]string{
|
||||||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},
|
||||||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
{"Server under load", fmt.Sprintf("%.0f W", sp.LoadedW)},
|
||||||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
{"Server delta (load − idle)", fmt.Sprintf("%.0f W", sp.DeltaW)},
|
||||||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
{"GPU-reported sum", fmt.Sprintf("%.0f W", sp.GPUReportedSumW)},
|
||||||
if sp.ReportingRatio > 0 {
|
|
||||||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
|
||||||
}
|
}
|
||||||
|
if sp.ReportingRatio > 0 {
|
||||||
|
spRows = append(spRows, []string{"Reporting ratio", fmt.Sprintf("%.2f (1.0 = accurate, <0.75 = GPU over-reports)", sp.ReportingRatio)})
|
||||||
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"", "Value"}, spRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
for _, note := range sp.Notes {
|
for _, note := range sp.Notes {
|
||||||
@@ -397,15 +441,19 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if cooling := result.Cooling; cooling != nil {
|
if cooling := result.Cooling; cooling != nil {
|
||||||
b.WriteString("## Cooling\n\n")
|
b.WriteString("## Cooling\n\n")
|
||||||
if cooling.Available {
|
if cooling.Available {
|
||||||
b.WriteString("| Metric | Value |\n|--------|-------|\n")
|
dutyAvg, dutyP95 := "N/A", "N/A"
|
||||||
fmt.Fprintf(&b, "| Average fan speed | %.0f RPM |\n", cooling.AvgFanRPM)
|
|
||||||
if cooling.FanDutyCycleAvailable {
|
if cooling.FanDutyCycleAvailable {
|
||||||
fmt.Fprintf(&b, "| Average fan duty cycle | %.1f%% |\n", cooling.AvgFanDutyCyclePct)
|
dutyAvg = fmt.Sprintf("%.1f%%", cooling.AvgFanDutyCyclePct)
|
||||||
fmt.Fprintf(&b, "| P95 fan duty cycle | %.1f%% |\n", cooling.P95FanDutyCyclePct)
|
dutyP95 = fmt.Sprintf("%.1f%%", cooling.P95FanDutyCyclePct)
|
||||||
} else {
|
|
||||||
b.WriteString("| Average fan duty cycle | N/A |\n")
|
|
||||||
b.WriteString("| P95 fan duty cycle | N/A |\n")
|
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable(
|
||||||
|
[]string{"Metric", "Value"},
|
||||||
|
[][]string{
|
||||||
|
{"Average fan speed", fmt.Sprintf("%.0f RPM", cooling.AvgFanRPM)},
|
||||||
|
{"Average fan duty cycle", dutyAvg},
|
||||||
|
{"P95 fan duty cycle", dutyP95},
|
||||||
|
},
|
||||||
|
))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
} else {
|
} else {
|
||||||
b.WriteString("Cooling telemetry unavailable.\n\n")
|
b.WriteString("Cooling telemetry unavailable.\n\n")
|
||||||
@@ -422,12 +470,16 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
|
|||||||
if len(result.PerformanceRampSteps) > 0 {
|
if len(result.PerformanceRampSteps) > 0 {
|
||||||
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
b.WriteString("## Platform Scalability (Performance Ramp)\n\n")
|
||||||
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
fmt.Fprintf(&b, "**Platform power score:** %.1f%% \n\n", result.PlatformPowerScore)
|
||||||
b.WriteString("| k GPUs | GPU Indices | Total Synthetic TOPS | Scalability |\n")
|
var scalRows [][]string
|
||||||
b.WriteString("|--------|-------------|----------------------|-------------|\n")
|
|
||||||
for _, step := range result.PerformanceRampSteps {
|
for _, step := range result.PerformanceRampSteps {
|
||||||
fmt.Fprintf(&b, "| %d | %s | %.2f | %.1f%% |\n",
|
scalRows = append(scalRows, []string{
|
||||||
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalSyntheticTOPS, step.ScalabilityPct)
|
fmt.Sprintf("%d", step.StepIndex),
|
||||||
|
joinIndexList(step.GPUIndices),
|
||||||
|
fmt.Sprintf("%.2f", step.TotalSyntheticTOPS),
|
||||||
|
fmt.Sprintf("%.1f%%", step.ScalabilityPct),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
b.WriteString(fmtMDTable([]string{"k GPUs", "GPU Indices", "Total Synthetic TOPS", "Scalability"}, scalRows))
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
75
audit/internal/platform/benchmark_table.go
Normal file
75
audit/internal/platform/benchmark_table.go
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// fmtMDTable renders a markdown table whose columns are padded to a common
// width, so the table is readable as plain text without a markdown renderer.
//
// headers contains the column header strings; an empty headers slice yields "".
// rows contains the data rows. A row with fewer cells than headers is padded
// with empty cells; cells beyond len(headers) are ignored.
//
// Widths are measured in runes rather than bytes so that cells containing
// multi-byte UTF-8 text (for example "°C", "—" or "✓") are padded to the same
// visual column as plain ASCII cells. Double-width glyphs such as emoji still
// count as one column, so such cells may look one position short in some
// terminals.
func fmtMDTable(headers []string, rows [][]string) string {
	ncols := len(headers)
	if ncols == 0 {
		return ""
	}

	// cellAt returns the i-th cell of row, or "" when the row is too short.
	cellAt := func(row []string, i int) string {
		if i < len(row) {
			return row[i]
		}
		return ""
	}
	// textWidth is the number of runes in s; len(s) would count bytes and
	// under-pad every cell that contains non-ASCII characters.
	textWidth := func(s string) int {
		return len([]rune(s))
	}

	// Compute max width per column across the header and all data rows.
	widths := make([]int, ncols)
	for i, h := range headers {
		widths[i] = textWidth(h)
	}
	for _, row := range rows {
		for i := range widths {
			if w := textWidth(cellAt(row, i)); w > widths[i] {
				widths[i] = w
			}
		}
	}

	var b strings.Builder

	// writeRow emits one "| a | b |" line, padding every cell to its column width.
	writeRow := func(row []string) {
		b.WriteByte('|')
		for i, w := range widths {
			cell := cellAt(row, i)
			b.WriteByte(' ')
			b.WriteString(cell)
			b.WriteString(strings.Repeat(" ", w-textWidth(cell)))
			b.WriteString(" |")
		}
		b.WriteByte('\n')
	}

	// Header row.
	writeRow(headers)

	// Separator row: dashes span the cell plus its two padding spaces.
	b.WriteByte('|')
	for _, w := range widths {
		b.WriteString(strings.Repeat("-", w+2))
		b.WriteByte('|')
	}
	b.WriteByte('\n')

	// Data rows.
	for _, row := range rows {
		writeRow(row)
	}

	return b.String()
}
|
||||||
@@ -52,7 +52,7 @@ const (
|
|||||||
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
|
// - BenchmarkEstimatedPerfStabilitySec: xFusion v8.22 ramp 1-8: 5532 s
|
||||||
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
|
// - BenchmarkEstimatedPerfOvernightSec: derived from profile phases (SteadySec=27000)
|
||||||
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
|
// - BenchmarkEstimatedPowerStandardSec: MLT v8.22 ramp 1-4: 2663 s; MSI v8.22 ramp 1-8: 2375 s
|
||||||
// - BenchmarkEstimatedPowerStabilitySec: xFusion v8.17/v8.22 ramp 1-8: 1977-2002 s
|
// - BenchmarkEstimatedPowerStabilitySec: target ~90 min with calibDurationSec=300 (8 GPU × ~2-3 attempts)
|
||||||
const (
|
const (
|
||||||
// Performance Benchmark (bee-gpu-burn).
|
// Performance Benchmark (bee-gpu-burn).
|
||||||
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
|
// Duration is per full ramp-up run (ramp 1→N) or per single parallel run.
|
||||||
@@ -64,7 +64,7 @@ const (
|
|||||||
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
|
// Power / Thermal Fit (dcgmi targeted_power binary-search calibration).
|
||||||
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
// Duration is for the full ramp-up run; individual steps vary with convergence speed.
|
||||||
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
BenchmarkEstimatedPowerStandardSec = 2600 // ~43 min; ramp 1-4: 2663 s, ramp 1-8: 2375 s
|
||||||
BenchmarkEstimatedPowerStabilitySec = 2000 // ~33 min; stability profile converges faster (longer steady → faster convergence)
|
BenchmarkEstimatedPowerStabilitySec = 5400 // ~90 min; calibDurationSec=300 × 8 GPU × ~2-3 attempts
|
||||||
BenchmarkEstimatedPowerOvernightSec = 3 * 3600
|
BenchmarkEstimatedPowerOvernightSec = 3 * 3600
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -408,6 +408,9 @@ type NvidiaPowerBenchGPU struct {
|
|||||||
// Telemetry holds the aggregated stats from the final converged calibration
|
// Telemetry holds the aggregated stats from the final converged calibration
|
||||||
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||||
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||||
|
// Fan state sampled at the end of single-card calibration.
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchStep struct {
|
type NvidiaPowerBenchStep struct {
|
||||||
@@ -426,6 +429,13 @@ type NvidiaPowerBenchStep struct {
|
|||||||
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
|
// PSU slot readings sampled at end of this ramp step.
|
||||||
|
PSUSlotReadings map[string]BenchmarkPSUSlotPower `json:"psu_slot_readings,omitempty"`
|
||||||
|
// Fan state at end of this ramp step.
|
||||||
|
AvgFanRPM float64 `json:"avg_fan_rpm,omitempty"`
|
||||||
|
AvgFanDutyCyclePct float64 `json:"avg_fan_duty_cycle_pct,omitempty"`
|
||||||
|
// Per-GPU telemetry from this step's calibration, keyed by GPU index.
|
||||||
|
PerGPUTelemetry map[int]*BenchmarkTelemetrySummary `json:"per_gpu_telemetry,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// NvidiaPerformanceRampStep holds per-step performance data for the
|
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||||
|
|||||||
Reference in New Issue
Block a user