Capture per-run IPMI power and GPU telemetry in power benchmark

- Sample IPMI loaded_w per single-card calibration and per ramp step instead of averaging over the entire Phase 2; top-level ServerPower uses the final (all-GPU) ramp step value
- Add ServerLoadedW/ServerDeltaW to NvidiaPowerBenchGPU and NvidiaPowerBenchStep so external tooling can compare wall power per phase without re-parsing logs
- Write gpu-metrics.csv/.html inside each single-XX/ and step-XX/ subdir; aggregate all phases into a top-level gpu-metrics.csv/.html
- Write 00-nvidia-smi-q.log at the start of every power run
- Add Telemetry (p95 temp/power/fan/clock) to NvidiaPowerBenchGPU in result.json from the converged calibration attempt
- Power benchmark page: split "Achieved W" into Single-card W and Multi-GPU W (StablePowerLimitW); derate highlight and status color now reflect the final multi-GPU limit vs nominal
- Performance benchmark page: add Status column and per-GPU score color coding (green/yellow/red) based on gpu.Status and OverallStatus

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -59,6 +59,9 @@ type benchmarkPowerCalibrationResult struct {
|
|||||||
// ≥20% while server fans were below 100% duty cycle — a signal that the
|
// ≥20% while server fans were below 100% duty cycle — a signal that the
|
||||||
// cooling system may not be correctly configured for full GPU load.
|
// cooling system may not be correctly configured for full GPU load.
|
||||||
CoolingWarning string
|
CoolingWarning string
|
||||||
|
// MetricRows holds the telemetry rows from the final (converged) attempt
|
||||||
|
// for this GPU. Used to build per-run gpu-metrics.csv.
|
||||||
|
MetricRows []GPUMetricRow
|
||||||
}
|
}
|
||||||
|
|
||||||
type benchmarkBurnProfile struct {
|
type benchmarkBurnProfile struct {
|
||||||
@@ -2781,7 +2784,7 @@ func runBenchmarkPowerCalibration(
|
|||||||
infoByIndex map[int]benchmarkGPUInfo,
|
infoByIndex map[int]benchmarkGPUInfo,
|
||||||
logFunc func(string),
|
logFunc func(string),
|
||||||
seedLimits map[int]int,
|
seedLimits map[int]int,
|
||||||
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) {
|
) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) {
|
||||||
const calibDurationSec = 120
|
const calibDurationSec = 120
|
||||||
const maxDerateW = 150
|
const maxDerateW = 150
|
||||||
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
// calibSearchTolerance is the binary-search convergence threshold in watts.
|
||||||
@@ -2795,7 +2798,7 @@ func runBenchmarkPowerCalibration(
|
|||||||
|
|
||||||
if _, err := exec.LookPath("dcgmi"); err != nil {
|
if _, err := exec.LookPath("dcgmi"); err != nil {
|
||||||
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
logFunc("power calibration: dcgmi not found, skipping (will use default power limit)")
|
||||||
return map[int]benchmarkPowerCalibrationResult{}, nil
|
return map[int]benchmarkPowerCalibrationResult{}, nil, nil
|
||||||
}
|
}
|
||||||
if killed := KillTestWorkers(); len(killed) > 0 {
|
if killed := KillTestWorkers(); len(killed) > 0 {
|
||||||
for _, p := range killed {
|
for _, p := range killed {
|
||||||
@@ -2829,6 +2832,8 @@ func runBenchmarkPowerCalibration(
|
|||||||
|
|
||||||
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
|
results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices))
|
||||||
var restore []benchmarkRestoreAction
|
var restore []benchmarkRestoreAction
|
||||||
|
var allCalibRows []GPUMetricRow // accumulated telemetry across all attempts
|
||||||
|
var calibCursor float64
|
||||||
|
|
||||||
// Initialise per-GPU state.
|
// Initialise per-GPU state.
|
||||||
states := make([]*gpuCalibState, 0, len(gpuIndices))
|
states := make([]*gpuCalibState, 0, len(gpuIndices))
|
||||||
@@ -2981,6 +2986,8 @@ calibDone:
|
|||||||
ticker.Stop()
|
ticker.Stop()
|
||||||
cancelAttempt()
|
cancelAttempt()
|
||||||
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644)
|
||||||
|
// Accumulate telemetry rows with attempt stage label.
|
||||||
|
appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec))
|
||||||
|
|
||||||
// Resource busy: retry with exponential back-off (shared — one DCGM session).
|
// Resource busy: retry with exponential back-off (shared — one DCGM session).
|
||||||
if ar.err != nil && isDCGMResourceBusy(ar.err) {
|
if ar.err != nil && isDCGMResourceBusy(ar.err) {
|
||||||
@@ -3065,6 +3072,7 @@ calibDone:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||||
s.converged = true
|
s.converged = true
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -3103,6 +3111,7 @@ calibDone:
|
|||||||
} else {
|
} else {
|
||||||
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW))
|
||||||
}
|
}
|
||||||
|
s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx)
|
||||||
s.converged = true
|
s.converged = true
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -3140,7 +3149,8 @@ calibDone:
|
|||||||
results[s.idx] = s.calib
|
results[s.idx] = s.calib
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return results, restore
|
writeBenchmarkMetricsFiles(runDir, allCalibRows)
|
||||||
|
return results, restore, allCalibRows
|
||||||
}
|
}
|
||||||
|
|
||||||
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
|
// isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222),
|
||||||
@@ -3230,21 +3240,25 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
}
|
}
|
||||||
if len(result.RampSteps) > 0 {
|
if len(result.RampSteps) > 0 {
|
||||||
b.WriteString("## Ramp Sequence\n\n")
|
b.WriteString("## Ramp Sequence\n\n")
|
||||||
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n")
|
b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n")
|
||||||
b.WriteString("|------|---------|--------------|----------------|---------|--------|\n")
|
b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n")
|
||||||
for _, step := range result.RampSteps {
|
for _, step := range result.RampSteps {
|
||||||
derated := "-"
|
derated := "-"
|
||||||
if step.Derated {
|
if step.Derated {
|
||||||
derated = "⚠ yes"
|
derated = "⚠ yes"
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n",
|
serverDelta := "-"
|
||||||
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status)
|
if step.ServerDeltaW > 0 {
|
||||||
|
serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n",
|
||||||
|
step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
b.WriteString("## Per-Slot Results\n\n")
|
b.WriteString("## Per-Slot Results\n\n")
|
||||||
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n")
|
b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n")
|
||||||
b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n")
|
b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n")
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
stableLimit := "-"
|
stableLimit := "-"
|
||||||
if gpu.StablePowerLimitW > 0 {
|
if gpu.StablePowerLimitW > 0 {
|
||||||
@@ -3254,8 +3268,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
|||||||
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
|
stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n",
|
serverDelta := "-"
|
||||||
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
|
if gpu.ServerDeltaW > 0 {
|
||||||
|
serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n",
|
||||||
|
gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts)
|
||||||
}
|
}
|
||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
@@ -3284,11 +3302,19 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
|
|||||||
fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
|
fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex)
|
||||||
fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
|
fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW)
|
||||||
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
|
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
|
||||||
|
if step.ServerLoadedW > 0 {
|
||||||
|
fmt.Fprintf(&b, "ramp_step_%d_server_loaded_w=%.0f\n", step.StepIndex, step.ServerLoadedW)
|
||||||
|
fmt.Fprintf(&b, "ramp_step_%d_server_delta_w=%.0f\n", step.StepIndex, step.ServerDeltaW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
if gpu.StablePowerLimitW > 0 {
|
if gpu.StablePowerLimitW > 0 {
|
||||||
fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
|
fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW)
|
||||||
}
|
}
|
||||||
|
if gpu.ServerLoadedW > 0 {
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_server_loaded_w=%.0f\n", gpu.Index, gpu.ServerLoadedW)
|
||||||
|
fmt.Fprintf(&b, "gpu_%d_server_delta_w=%.0f\n", gpu.Index, gpu.ServerDeltaW)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if sp := result.ServerPower; sp != nil && sp.Available {
|
if sp := result.ServerPower; sp != nil && sp.Available {
|
||||||
fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
|
fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW)
|
||||||
@@ -3327,6 +3353,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
if infoErr != nil {
|
if infoErr != nil {
|
||||||
return "", infoErr
|
return "", infoErr
|
||||||
}
|
}
|
||||||
|
// Capture full nvidia-smi -q snapshot at the start of the run.
|
||||||
|
if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil {
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644)
|
||||||
|
}
|
||||||
hostname, _ := os.Hostname()
|
hostname, _ := os.Hostname()
|
||||||
result := NvidiaPowerBenchResult{
|
result := NvidiaPowerBenchResult{
|
||||||
BenchmarkVersion: benchmarkVersion,
|
BenchmarkVersion: benchmarkVersion,
|
||||||
@@ -3352,13 +3382,31 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
// Phase 1: calibrate each GPU individually (sequentially, one at a time) to
|
||||||
// establish a true single-card power baseline unaffected by neighbour heat.
|
// establish a true single-card power baseline unaffected by neighbour heat.
|
||||||
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected))
|
||||||
|
singleIPMILoadedW := make(map[int]float64, len(selected))
|
||||||
var allRestoreActions []benchmarkRestoreAction
|
var allRestoreActions []benchmarkRestoreAction
|
||||||
|
// allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv.
|
||||||
|
var allPowerRows []GPUMetricRow
|
||||||
|
var powerCursor float64
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx))
|
||||||
_ = os.MkdirAll(singleDir, 0755)
|
_ = os.MkdirAll(singleDir, 0755)
|
||||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||||
c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
|
ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx)
|
||||||
|
ipmiSingleDone := make(chan float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ipmiSingleDone)
|
||||||
|
if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok {
|
||||||
|
ipmiSingleDone <- w
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil)
|
||||||
|
appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0)
|
||||||
|
ipmiSingleCancel()
|
||||||
|
if w, ok := <-ipmiSingleDone; ok {
|
||||||
|
singleIPMILoadedW[idx] = w
|
||||||
|
logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W", idx, w))
|
||||||
|
}
|
||||||
allRestoreActions = append(allRestoreActions, restore...)
|
allRestoreActions = append(allRestoreActions, restore...)
|
||||||
if r, ok := c[idx]; ok {
|
if r, ok := c[idx]; ok {
|
||||||
calibByIndex[idx] = r
|
calibByIndex[idx] = r
|
||||||
@@ -3383,7 +3431,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.OverallStatus = "PARTIAL"
|
result.OverallStatus = "PARTIAL"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
gpus = append(gpus, NvidiaPowerBenchGPU{
|
gpu := NvidiaPowerBenchGPU{
|
||||||
Index: idx,
|
Index: idx,
|
||||||
Name: info.Name,
|
Name: info.Name,
|
||||||
BusID: info.BusID,
|
BusID: info.BusID,
|
||||||
@@ -3396,7 +3444,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
Status: status,
|
Status: status,
|
||||||
Notes: append([]string(nil), calib.Notes...),
|
Notes: append([]string(nil), calib.Notes...),
|
||||||
CoolingWarning: calib.CoolingWarning,
|
CoolingWarning: calib.CoolingWarning,
|
||||||
})
|
}
|
||||||
|
if w, ok := singleIPMILoadedW[idx]; ok && serverIdleOK && w > 0 {
|
||||||
|
gpu.ServerLoadedW = w
|
||||||
|
gpu.ServerDeltaW = w - serverIdleW
|
||||||
|
}
|
||||||
|
if len(calib.MetricRows) > 0 {
|
||||||
|
t := summarizeBenchmarkTelemetry(calib.MetricRows)
|
||||||
|
gpu.Telemetry = &t
|
||||||
|
}
|
||||||
|
gpus = append(gpus, gpu)
|
||||||
}
|
}
|
||||||
sort.Slice(gpus, func(i, j int) bool {
|
sort.Slice(gpus, func(i, j int) bool {
|
||||||
if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
|
if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
|
||||||
@@ -3445,20 +3502,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
|
// stableLimits accumulates GPU index → fixed stable limit (W) across steps.
|
||||||
stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
|
stableLimits := make(map[int]int, len(result.RecommendedSlotOrder))
|
||||||
|
|
||||||
// Start an IPMI sampling goroutine that runs throughout Phase 2 to capture
|
// serverLoadedW tracks the IPMI server power from the final ramp step
|
||||||
// server-side loaded power while GPUs are under stress. The goroutine is
|
// (all GPUs simultaneously loaded). Earlier steps' values are stored
|
||||||
// cancelled as soon as Phase 2 finishes, and the average is used to compare
|
// per-step in NvidiaPowerBenchStep.ServerLoadedW.
|
||||||
// against PlatformMaxTDPW (GPU-reported stable limits sum).
|
|
||||||
var serverLoadedW float64
|
var serverLoadedW float64
|
||||||
var serverLoadedOK bool
|
var serverLoadedOK bool
|
||||||
ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx)
|
|
||||||
ipmiPhase2Done := make(chan float64, 1)
|
|
||||||
go func() {
|
|
||||||
defer close(ipmiPhase2Done)
|
|
||||||
if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok {
|
|
||||||
ipmiPhase2Done <- w
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
// Step 1: reuse single-card calibration result directly.
|
// Step 1: reuse single-card calibration result directly.
|
||||||
if len(result.RecommendedSlotOrder) > 0 {
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
@@ -3475,6 +3523,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
Derated: firstCalib.Derated,
|
Derated: firstCalib.Derated,
|
||||||
Status: "OK",
|
Status: "OK",
|
||||||
}
|
}
|
||||||
|
if w, ok := singleIPMILoadedW[firstIdx]; ok && serverIdleOK && w > 0 {
|
||||||
|
ramp.ServerLoadedW = w
|
||||||
|
ramp.ServerDeltaW = w - serverIdleW
|
||||||
|
}
|
||||||
if !firstCalib.Completed {
|
if !firstCalib.Completed {
|
||||||
ramp.Status = "FAILED"
|
ramp.Status = "FAILED"
|
||||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
|
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx))
|
||||||
@@ -3523,7 +3575,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx))
|
||||||
|
|
||||||
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
|
ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx)
|
||||||
|
ipmiStepDone := make(chan float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ipmiStepDone)
|
||||||
|
if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok {
|
||||||
|
ipmiStepDone <- w
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep)
|
||||||
|
appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0)
|
||||||
|
ipmiStepCancel()
|
||||||
|
var stepIPMILoadedW float64
|
||||||
|
var stepIPMIOK bool
|
||||||
|
if w, ok := <-ipmiStepDone; ok {
|
||||||
|
stepIPMILoadedW = w
|
||||||
|
stepIPMIOK = true
|
||||||
|
logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w))
|
||||||
|
}
|
||||||
// Accumulate restore actions; they all run in the outer defer.
|
// Accumulate restore actions; they all run in the outer defer.
|
||||||
allRestoreActions = append(allRestoreActions, stepRestore...)
|
allRestoreActions = append(allRestoreActions, stepRestore...)
|
||||||
|
|
||||||
@@ -3586,15 +3655,17 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW))
|
||||||
}
|
}
|
||||||
|
|
||||||
result.RampSteps = append(result.RampSteps, ramp)
|
if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 {
|
||||||
}
|
ramp.ServerLoadedW = stepIPMILoadedW
|
||||||
|
ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW
|
||||||
|
// The last step has all GPUs loaded — use it as the top-level loaded_w.
|
||||||
|
if step == len(result.RecommendedSlotOrder) {
|
||||||
|
serverLoadedW = stepIPMILoadedW
|
||||||
|
serverLoadedOK = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Stop IPMI Phase 2 sampling and collect result.
|
result.RampSteps = append(result.RampSteps, ramp)
|
||||||
ipmiPhase2Cancel()
|
|
||||||
if w, ok := <-ipmiPhase2Done; ok {
|
|
||||||
serverLoadedW = w
|
|
||||||
serverLoadedOK = true
|
|
||||||
logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
|
// Populate StablePowerLimitW on each GPU entry from the accumulated stable limits.
|
||||||
@@ -3624,6 +3695,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
|||||||
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
|
// ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP.
|
||||||
_ = serverIdleOK // used implicitly via characterizeServerPower
|
_ = serverIdleOK // used implicitly via characterizeServerPower
|
||||||
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
|
result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK)
|
||||||
|
// Write top-level gpu-metrics.csv/.html aggregating all phases.
|
||||||
|
writeBenchmarkMetricsFiles(runDir, allPowerRows)
|
||||||
resultJSON, err := json.MarshalIndent(result, "", " ")
|
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("marshal power result: %w", err)
|
return "", fmt.Errorf("marshal power result: %w", err)
|
||||||
|
|||||||
@@ -331,6 +331,13 @@ type NvidiaPowerBenchGPU struct {
|
|||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
// CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow.
|
||||||
CoolingWarning string `json:"cooling_warning,omitempty"`
|
CoolingWarning string `json:"cooling_warning,omitempty"`
|
||||||
|
// ServerLoadedW is the IPMI server power reading captured during this
|
||||||
|
// GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
|
// Telemetry holds the aggregated stats from the final converged calibration
|
||||||
|
// attempt for this GPU (temperature, power, fan, clock percentiles).
|
||||||
|
Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type NvidiaPowerBenchStep struct {
|
type NvidiaPowerBenchStep struct {
|
||||||
@@ -345,6 +352,10 @@ type NvidiaPowerBenchStep struct {
|
|||||||
Derated bool `json:"derated,omitempty"`
|
Derated bool `json:"derated,omitempty"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
|
// ServerLoadedW is the IPMI server power reading captured during this
|
||||||
|
// ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle.
|
||||||
|
ServerLoadedW float64 `json:"server_loaded_w,omitempty"`
|
||||||
|
ServerDeltaW float64 `json:"server_delta_w,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// NvidiaPerformanceRampStep holds per-step performance data for the
|
// NvidiaPerformanceRampStep holds per-step performance data for the
|
||||||
|
|||||||
@@ -2014,9 +2014,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
|
|||||||
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
type benchmarkHistoryRun struct {
|
type benchmarkHistoryRun struct {
|
||||||
generatedAt time.Time
|
generatedAt time.Time
|
||||||
displayTime string
|
displayTime string
|
||||||
gpuScores map[int]float64 // GPU index → composite score
|
gpuScores map[int]float64 // GPU index → composite score
|
||||||
|
gpuStatuses map[int]string // GPU index → status ("OK", "WARNING", "FAILED", …)
|
||||||
|
overallStatus string
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderBenchmark(opts HandlerOptions) string {
|
func renderBenchmark(opts HandlerOptions) string {
|
||||||
@@ -2324,7 +2326,7 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
|
|||||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||||
}
|
}
|
||||||
b.WriteString(`<div style="overflow-x:auto">`)
|
b.WriteString(`<div style="overflow-x:auto">`)
|
||||||
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th>`)
|
b.WriteString(`<table><thead><tr><th>Run</th><th>Time</th><th>Status</th>`)
|
||||||
for i := 0; i <= maxGPUIndex; i++ {
|
for i := 0; i <= maxGPUIndex; i++ {
|
||||||
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
b.WriteString(`<th>GPU ` + strconv.Itoa(i) + `</th>`)
|
||||||
}
|
}
|
||||||
@@ -2333,13 +2335,36 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string,
|
|||||||
b.WriteString(`<tr>`)
|
b.WriteString(`<tr>`)
|
||||||
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||||
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||||
|
overallColor := "var(--ok)"
|
||||||
|
overallLabel := run.overallStatus
|
||||||
|
if overallLabel == "" {
|
||||||
|
overallLabel = "OK"
|
||||||
|
}
|
||||||
|
if overallLabel == "FAILED" {
|
||||||
|
overallColor = "var(--crit-fg,#9f3a38)"
|
||||||
|
} else if overallLabel != "OK" {
|
||||||
|
overallColor = "var(--warn)"
|
||||||
|
}
|
||||||
|
b.WriteString(`<td style="color:` + overallColor + `;font-weight:600">` + html.EscapeString(overallLabel) + `</td>`)
|
||||||
for idx := 0; idx <= maxGPUIndex; idx++ {
|
for idx := 0; idx <= maxGPUIndex; idx++ {
|
||||||
score, ok := run.gpuScores[idx]
|
score, ok := run.gpuScores[idx]
|
||||||
if !ok {
|
if !ok {
|
||||||
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
b.WriteString(`<td>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
gpuStatus := run.gpuStatuses[idx]
|
||||||
|
scoreColor := ""
|
||||||
|
switch gpuStatus {
|
||||||
|
case "FAILED":
|
||||||
|
scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"`
|
||||||
|
case "WARNING", "PARTIAL":
|
||||||
|
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||||
|
case "", "OK":
|
||||||
|
// no override
|
||||||
|
default:
|
||||||
|
scoreColor = ` style="color:var(--warn);font-weight:600"`
|
||||||
|
}
|
||||||
|
b.WriteString(`<td` + scoreColor + `>` + fmt.Sprintf("%.2f", score) + `</td>`)
|
||||||
}
|
}
|
||||||
b.WriteString(`</tr>`)
|
b.WriteString(`</tr>`)
|
||||||
}
|
}
|
||||||
@@ -2373,12 +2398,15 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun)
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
run := benchmarkHistoryRun{
|
run := benchmarkHistoryRun{
|
||||||
generatedAt: result.GeneratedAt,
|
generatedAt: result.GeneratedAt,
|
||||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
gpuScores: make(map[int]float64),
|
gpuScores: make(map[int]float64),
|
||||||
|
gpuStatuses: make(map[int]string),
|
||||||
|
overallStatus: result.OverallStatus,
|
||||||
}
|
}
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore
|
||||||
|
run.gpuStatuses[gpu.Index] = gpu.Status
|
||||||
if gpu.Index > maxGPUIndex {
|
if gpu.Index > maxGPUIndex {
|
||||||
maxGPUIndex = gpu.Index
|
maxGPUIndex = gpu.Index
|
||||||
}
|
}
|
||||||
@@ -2447,31 +2475,45 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
|||||||
|
|
||||||
if len(latest.GPUs) > 0 {
|
if len(latest.GPUs) > 0 {
|
||||||
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
b.WriteString(`<div style="overflow-x:auto"><table><thead><tr>`)
|
||||||
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Achieved W</th><th>P95 Observed W</th><th>Status</th>`)
|
b.WriteString(`<th>GPU</th><th>Model</th><th>Nominal W</th><th>Single-card W</th><th>Multi-GPU W</th><th>P95 Observed W</th><th>Status</th>`)
|
||||||
b.WriteString(`</tr></thead><tbody>`)
|
b.WriteString(`</tr></thead><tbody>`)
|
||||||
for _, gpu := range latest.GPUs {
|
for _, gpu := range latest.GPUs {
|
||||||
derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1)
|
// finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp,
|
||||||
|
// falling back to single-card applied limit if the ramp hasn't run.
|
||||||
|
finalLimitW := gpu.StablePowerLimitW
|
||||||
|
if finalLimitW <= 0 {
|
||||||
|
finalLimitW = gpu.AppliedPowerLimitW
|
||||||
|
}
|
||||||
|
// Derate is relative to nominal (DefaultPowerLimitW), using the final limit.
|
||||||
|
derated := gpu.Derated ||
|
||||||
|
(gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1)
|
||||||
rowStyle := ""
|
rowStyle := ""
|
||||||
achievedStyle := ""
|
finalStyle := ""
|
||||||
if derated {
|
if derated {
|
||||||
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
rowStyle = ` style="background:rgba(255,180,0,0.08)"`
|
||||||
achievedStyle = ` style="color:#e6a000;font-weight:600"`
|
finalStyle = ` style="color:#e6a000;font-weight:600"`
|
||||||
}
|
}
|
||||||
statusLabel := gpu.Status
|
statusLabel := gpu.Status
|
||||||
if statusLabel == "" {
|
if statusLabel == "" {
|
||||||
statusLabel = "OK"
|
statusLabel = "OK"
|
||||||
}
|
}
|
||||||
statusColor := "var(--ok)"
|
statusColor := "var(--ok)"
|
||||||
if statusLabel != "OK" {
|
if statusLabel == "FAILED" {
|
||||||
|
statusColor = "var(--crit-fg,#9f3a38)"
|
||||||
|
} else if statusLabel != "OK" {
|
||||||
statusColor = "var(--warn)"
|
statusColor = "var(--warn)"
|
||||||
}
|
}
|
||||||
nominalStr := "-"
|
nominalStr := "-"
|
||||||
if gpu.DefaultPowerLimitW > 0 {
|
if gpu.DefaultPowerLimitW > 0 {
|
||||||
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW)
|
||||||
}
|
}
|
||||||
achievedStr := "-"
|
singleStr := "-"
|
||||||
if gpu.AppliedPowerLimitW > 0 {
|
if gpu.AppliedPowerLimitW > 0 {
|
||||||
achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
multiStr := "-"
|
||||||
|
if gpu.StablePowerLimitW > 0 {
|
||||||
|
multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW)
|
||||||
}
|
}
|
||||||
p95Str := "-"
|
p95Str := "-"
|
||||||
if gpu.MaxObservedPowerW > 0 {
|
if gpu.MaxObservedPowerW > 0 {
|
||||||
@@ -2481,7 +2523,8 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
|||||||
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
b.WriteString(`<td>` + strconv.Itoa(gpu.Index) + `</td>`)
|
||||||
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
b.WriteString(`<td>` + html.EscapeString(gpu.Name) + `</td>`)
|
||||||
b.WriteString(`<td>` + nominalStr + `</td>`)
|
b.WriteString(`<td>` + nominalStr + `</td>`)
|
||||||
b.WriteString(`<td` + achievedStyle + `>` + achievedStr + `</td>`)
|
b.WriteString(`<td>` + singleStr + `</td>`)
|
||||||
|
b.WriteString(`<td` + finalStyle + `>` + multiStr + `</td>`)
|
||||||
b.WriteString(`<td>` + p95Str + `</td>`)
|
b.WriteString(`<td>` + p95Str + `</td>`)
|
||||||
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
b.WriteString(`<td style="color:` + statusColor + `;font-weight:600">` + html.EscapeString(statusLabel) + `</td>`)
|
||||||
b.WriteString(`</tr>`)
|
b.WriteString(`</tr>`)
|
||||||
|
|||||||
Reference in New Issue
Block a user