diff --git a/audit/internal/platform/benchmark.go b/audit/internal/platform/benchmark.go index 29f7db4..48b863f 100644 --- a/audit/internal/platform/benchmark.go +++ b/audit/internal/platform/benchmark.go @@ -59,6 +59,9 @@ type benchmarkPowerCalibrationResult struct { // ≥20% while server fans were below 100% duty cycle — a signal that the // cooling system may not be correctly configured for full GPU load. CoolingWarning string + // MetricRows holds the telemetry rows from the final (converged) attempt + // for this GPU. Used to build per-run gpu-metrics.csv. + MetricRows []GPUMetricRow } type benchmarkBurnProfile struct { @@ -2781,7 +2784,7 @@ func runBenchmarkPowerCalibration( infoByIndex map[int]benchmarkGPUInfo, logFunc func(string), seedLimits map[int]int, -) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction) { +) (map[int]benchmarkPowerCalibrationResult, []benchmarkRestoreAction, []GPUMetricRow) { const calibDurationSec = 120 const maxDerateW = 150 // calibSearchTolerance is the binary-search convergence threshold in watts. @@ -2795,7 +2798,7 @@ func runBenchmarkPowerCalibration( if _, err := exec.LookPath("dcgmi"); err != nil { logFunc("power calibration: dcgmi not found, skipping (will use default power limit)") - return map[int]benchmarkPowerCalibrationResult{}, nil + return map[int]benchmarkPowerCalibrationResult{}, nil, nil } if killed := KillTestWorkers(); len(killed) > 0 { for _, p := range killed { @@ -2829,6 +2832,8 @@ func runBenchmarkPowerCalibration( results := make(map[int]benchmarkPowerCalibrationResult, len(gpuIndices)) var restore []benchmarkRestoreAction + var allCalibRows []GPUMetricRow // accumulated telemetry across all attempts + var calibCursor float64 // Initialise per-GPU state. states := make([]*gpuCalibState, 0, len(gpuIndices)) @@ -2981,6 +2986,8 @@ calibDone: ticker.Stop() cancelAttempt() _ = os.WriteFile(filepath.Join(runDir, logName), ar.out, 0644) + // Accumulate telemetry rows with attempt stage label. + appendBenchmarkMetrics(&allCalibRows, ar.rows, fmt.Sprintf("attempt-%d", sharedAttempt), &calibCursor, float64(calibDurationSec)) // Resource busy: retry with exponential back-off (shared — one DCGM session). if ar.err != nil && isDCGMResourceBusy(ar.err) { @@ -3065,6 +3072,7 @@ calibDone: } } } + s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx) s.converged = true continue } @@ -3103,6 +3111,7 @@ calibDone: } else { s.calib.Notes = append(s.calib.Notes, fmt.Sprintf("could not find a stable targeted_power limit within %d W of the default", maxDerateW)) } + s.calib.MetricRows = filterRowsByGPU(ar.rows, s.idx) s.converged = true continue } @@ -3140,7 +3149,8 @@ calibDone: results[s.idx] = s.calib } } - return results, restore + writeBenchmarkMetricsFiles(runDir, allCalibRows) + return results, restore, allCalibRows } // isDCGMResourceBusy returns true when dcgmi exits with DCGM_ST_IN_USE (222), @@ -3230,21 +3240,25 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { } if len(result.RampSteps) > 0 { b.WriteString("## Ramp Sequence\n\n") - b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Derated | Status |\n") - b.WriteString("|------|---------|--------------|----------------|---------|--------|\n") + b.WriteString("| Step | New GPU | Stable Limit | Total Observed | Server Δ (IPMI) | Derated | Status |\n") + b.WriteString("|------|---------|--------------|----------------|-----------------|---------|--------|\n") for _, step := range result.RampSteps { derated := "-" if step.Derated { derated = "⚠ yes" } - fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s |\n", - step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, derated, step.Status) + serverDelta := "-" + if step.ServerDeltaW > 0 { + serverDelta = fmt.Sprintf("%.0f W", step.ServerDeltaW) + } + fmt.Fprintf(&b, "| %d | GPU %d | %.0f W | %.0f W | %s | %s | %s |\n", + step.StepIndex, step.NewGPUIndex, step.NewGPUStableLimitW, step.TotalObservedPowerW, serverDelta, derated, step.Status) } b.WriteString("\n") } b.WriteString("## Per-Slot Results\n\n") - b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Temp | Attempts |\n") - b.WriteString("|-----|--------|-------------------|--------------|------|----------|\n") + b.WriteString("| GPU | Status | Single-card Limit | Stable Limit | Server Δ (IPMI) | Temp | Attempts |\n") + b.WriteString("|-----|--------|-------------------|--------------|-----------------|------|----------|\n") for _, gpu := range result.GPUs { stableLimit := "-" if gpu.StablePowerLimitW > 0 { @@ -3254,8 +3268,12 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string { stableLimit = fmt.Sprintf("%.0f W", gpu.StablePowerLimitW) } } - fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %.1f C | %d |\n", - gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, gpu.MaxObservedTempC, gpu.CalibrationAttempts) + serverDelta := "-" + if gpu.ServerDeltaW > 0 { + serverDelta = fmt.Sprintf("%.0f W", gpu.ServerDeltaW) + } + fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %s | %s | %.1f C | %d |\n", + gpu.Index, gpu.Status, gpu.AppliedPowerLimitW, stableLimit, serverDelta, gpu.MaxObservedTempC, gpu.CalibrationAttempts) } b.WriteString("\n") for _, gpu := range result.GPUs { @@ -3284,11 +3302,19 @@ func renderPowerBenchSummary(result NvidiaPowerBenchResult) string { fmt.Fprintf(&b, "ramp_step_%d_new_gpu=%d\n", step.StepIndex, step.NewGPUIndex) fmt.Fprintf(&b, "ramp_step_%d_stable_limit_w=%.0f\n", step.StepIndex, step.NewGPUStableLimitW) fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW) + if step.ServerLoadedW > 0 { + fmt.Fprintf(&b, "ramp_step_%d_server_loaded_w=%.0f\n", step.StepIndex, step.ServerLoadedW) + fmt.Fprintf(&b, "ramp_step_%d_server_delta_w=%.0f\n", step.StepIndex, step.ServerDeltaW) + } } for _, gpu := range result.GPUs { if gpu.StablePowerLimitW > 0 { fmt.Fprintf(&b, "gpu_%d_stable_limit_w=%.0f\n", gpu.Index, gpu.StablePowerLimitW) } + if gpu.ServerLoadedW > 0 { + fmt.Fprintf(&b, "gpu_%d_server_loaded_w=%.0f\n", gpu.Index, gpu.ServerLoadedW) + fmt.Fprintf(&b, "gpu_%d_server_delta_w=%.0f\n", gpu.Index, gpu.ServerDeltaW) + } } if sp := result.ServerPower; sp != nil && sp.Available { fmt.Fprintf(&b, "server_idle_w=%.0f\n", sp.IdleW) @@ -3327,6 +3353,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N if infoErr != nil { return "", infoErr } + // Capture full nvidia-smi -q snapshot at the start of the run. + if out, err := runSATCommandCtx(ctx, verboseLog, "00-nvidia-smi-q.log", []string{"nvidia-smi", "-q"}, nil, nil); err == nil { + _ = os.WriteFile(filepath.Join(runDir, "00-nvidia-smi-q.log"), out, 0644) + } hostname, _ := os.Hostname() result := NvidiaPowerBenchResult{ BenchmarkVersion: benchmarkVersion, @@ -3352,13 +3382,31 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N // Phase 1: calibrate each GPU individually (sequentially, one at a time) to // establish a true single-card power baseline unaffected by neighbour heat. calibByIndex := make(map[int]benchmarkPowerCalibrationResult, len(selected)) + singleIPMILoadedW := make(map[int]float64, len(selected)) var allRestoreActions []benchmarkRestoreAction + // allPowerRows accumulates telemetry from all phases for the top-level gpu-metrics.csv. + var allPowerRows []GPUMetricRow + var powerCursor float64 for _, idx := range selected { singleDir := filepath.Join(runDir, fmt.Sprintf("single-%02d", idx)) _ = os.MkdirAll(singleDir, 0755) singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex) logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx)) - c, restore := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil) + ipmiSingleCtx, ipmiSingleCancel := context.WithCancel(ctx) + ipmiSingleDone := make(chan float64, 1) + go func() { + defer close(ipmiSingleDone) + if w, ok := sampleIPMIPowerSeries(ipmiSingleCtx, 3600); ok { + ipmiSingleDone <- w + } + }() + c, restore, singleRows := runBenchmarkPowerCalibration(ctx, verboseLog, singleDir, []int{idx}, singleInfo, logFunc, nil) + appendBenchmarkMetrics(&allPowerRows, singleRows, fmt.Sprintf("single-gpu-%d", idx), &powerCursor, 0) + ipmiSingleCancel() + if w, ok := <-ipmiSingleDone; ok { + singleIPMILoadedW[idx] = w + logFunc(fmt.Sprintf("power calibration: GPU %d single-card IPMI loaded: %.0f W", idx, w)) + } allRestoreActions = append(allRestoreActions, restore...) if r, ok := c[idx]; ok { calibByIndex[idx] = r @@ -3383,7 +3431,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N result.OverallStatus = "PARTIAL" } } - gpus = append(gpus, NvidiaPowerBenchGPU{ + gpu := NvidiaPowerBenchGPU{ Index: idx, Name: info.Name, BusID: info.BusID, @@ -3396,7 +3444,16 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N Status: status, Notes: append([]string(nil), calib.Notes...), CoolingWarning: calib.CoolingWarning, - }) + } + if w, ok := singleIPMILoadedW[idx]; ok && serverIdleOK && w > 0 { + gpu.ServerLoadedW = w + gpu.ServerDeltaW = w - serverIdleW + } + if len(calib.MetricRows) > 0 { + t := summarizeBenchmarkTelemetry(calib.MetricRows) + gpu.Telemetry = &t + } + gpus = append(gpus, gpu) } sort.Slice(gpus, func(i, j int) bool { if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW { @@ -3445,20 +3502,11 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N // stableLimits accumulates GPU index → fixed stable limit (W) across steps. stableLimits := make(map[int]int, len(result.RecommendedSlotOrder)) - // Start an IPMI sampling goroutine that runs throughout Phase 2 to capture - // server-side loaded power while GPUs are under stress. The goroutine is - // cancelled as soon as Phase 2 finishes, and the average is used to compare - // against PlatformMaxTDPW (GPU-reported stable limits sum). + // serverLoadedW tracks the IPMI server power from the final ramp step + // (all GPUs simultaneously loaded). Earlier steps' values are stored + // per-step in NvidiaPowerBenchStep.ServerLoadedW. var serverLoadedW float64 var serverLoadedOK bool - ipmiPhase2Ctx, ipmiPhase2Cancel := context.WithCancel(ctx) - ipmiPhase2Done := make(chan float64, 1) - go func() { - defer close(ipmiPhase2Done) - if w, ok := sampleIPMIPowerSeries(ipmiPhase2Ctx, 3600); ok { - ipmiPhase2Done <- w - } - }() // Step 1: reuse single-card calibration result directly. if len(result.RecommendedSlotOrder) > 0 { @@ -3475,6 +3523,10 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N Derated: firstCalib.Derated, Status: "OK", } + if w, ok := singleIPMILoadedW[firstIdx]; ok && serverIdleOK && w > 0 { + ramp.ServerLoadedW = w + ramp.ServerDeltaW = w - serverIdleW + } if !firstCalib.Completed { ramp.Status = "FAILED" ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card targeted_power", firstIdx)) @@ -3523,7 +3575,24 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N step, len(result.RecommendedSlotOrder), len(subset), newGPUIdx)) stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex) - stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep) + ipmiStepCtx, ipmiStepCancel := context.WithCancel(ctx) + ipmiStepDone := make(chan float64, 1) + go func() { + defer close(ipmiStepDone) + if w, ok := sampleIPMIPowerSeries(ipmiStepCtx, 3600); ok { + ipmiStepDone <- w + } + }() + stepCalib, stepRestore, stepRows := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc, seedForStep) + appendBenchmarkMetrics(&allPowerRows, stepRows, fmt.Sprintf("ramp-step-%d", step), &powerCursor, 0) + ipmiStepCancel() + var stepIPMILoadedW float64 + var stepIPMIOK bool + if w, ok := <-ipmiStepDone; ok { + stepIPMILoadedW = w + stepIPMIOK = true + logFunc(fmt.Sprintf("power ramp: step %d IPMI loaded: %.0f W", step, w)) + } // Accumulate restore actions; they all run in the outer defer. allRestoreActions = append(allRestoreActions, stepRestore...) @@ -3586,15 +3655,17 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (GPU %d) required derating to %.0f W under combined thermal load.", step, newGPUIdx, c.AppliedPowerLimitW)) } - result.RampSteps = append(result.RampSteps, ramp) - } + if stepIPMIOK && serverIdleOK && stepIPMILoadedW > 0 { + ramp.ServerLoadedW = stepIPMILoadedW + ramp.ServerDeltaW = stepIPMILoadedW - serverIdleW + // The last step has all GPUs loaded — use it as the top-level loaded_w. + if step == len(result.RecommendedSlotOrder) { + serverLoadedW = stepIPMILoadedW + serverLoadedOK = true + } + } - // Stop IPMI Phase 2 sampling and collect result. - ipmiPhase2Cancel() - if w, ok := <-ipmiPhase2Done; ok { - serverLoadedW = w - serverLoadedOK = true - logFunc(fmt.Sprintf("server loaded power (IPMI, Phase 2 avg): %.0f W", w)) + result.RampSteps = append(result.RampSteps, ramp) } // Populate StablePowerLimitW on each GPU entry from the accumulated stable limits. @@ -3624,6 +3695,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N // ~1.0 → GPU telemetry matches wall power; <0.75 → GPU over-reports its TDP. _ = serverIdleOK // used implicitly via characterizeServerPower result.ServerPower = characterizeServerPower(serverIdleW, serverLoadedW, result.PlatformMaxTDPW, serverIdleOK && serverLoadedOK) + // Write top-level gpu-metrics.csv/.html aggregating all phases. + writeBenchmarkMetricsFiles(runDir, allPowerRows) resultJSON, err := json.MarshalIndent(result, "", " ") if err != nil { return "", fmt.Errorf("marshal power result: %w", err) diff --git a/audit/internal/platform/benchmark_types.go b/audit/internal/platform/benchmark_types.go index c78d9d7..902eeb7 100644 --- a/audit/internal/platform/benchmark_types.go +++ b/audit/internal/platform/benchmark_types.go @@ -331,6 +331,13 @@ type NvidiaPowerBenchGPU struct { Notes []string `json:"notes,omitempty"` // CoolingWarning mirrors BenchmarkGPUResult.CoolingWarning for the power workflow. CoolingWarning string `json:"cooling_warning,omitempty"` + // ServerLoadedW is the IPMI server power reading captured during this + // GPU's single-card calibration run. ServerDeltaW = ServerLoadedW − idle. + ServerLoadedW float64 `json:"server_loaded_w,omitempty"` + ServerDeltaW float64 `json:"server_delta_w,omitempty"` + // Telemetry holds the aggregated stats from the final converged calibration + // attempt for this GPU (temperature, power, fan, clock percentiles). + Telemetry *BenchmarkTelemetrySummary `json:"telemetry,omitempty"` } type NvidiaPowerBenchStep struct { @@ -345,6 +352,10 @@ type NvidiaPowerBenchStep struct { Derated bool `json:"derated,omitempty"` Status string `json:"status"` Notes []string `json:"notes,omitempty"` + // ServerLoadedW is the IPMI server power reading captured during this + // ramp step's calibration run. ServerDeltaW = ServerLoadedW − idle. + ServerLoadedW float64 `json:"server_loaded_w,omitempty"` + ServerDeltaW float64 `json:"server_delta_w,omitempty"` } // NvidiaPerformanceRampStep holds per-step performance data for the diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 67942c1..c7720e0 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -2014,9 +2014,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string { // ── Benchmark ───────────────────────────────────────────────────────────────── type benchmarkHistoryRun struct { - generatedAt time.Time - displayTime string - gpuScores map[int]float64 // GPU index → composite score + generatedAt time.Time + displayTime string + gpuScores map[int]float64 // GPU index → composite score + gpuStatuses map[int]string // GPU index → status ("OK", "WARNING", "FAILED", …) + overallStatus string } func renderBenchmark(opts HandlerOptions) string { @@ -2324,7 +2326,7 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, b.WriteString(`

` + html.EscapeString(description) + `

`) } b.WriteString(`
`) - b.WriteString(``) + b.WriteString(`
RunTime
`) for i := 0; i <= maxGPUIndex; i++ { b.WriteString(``) } @@ -2333,13 +2335,36 @@ func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, b.WriteString(``) b.WriteString(``) b.WriteString(``) + overallColor := "var(--ok)" + overallLabel := run.overallStatus + if overallLabel == "" { + overallLabel = "OK" + } + if overallLabel == "FAILED" { + overallColor = "var(--crit-fg,#9f3a38)" + } else if overallLabel != "OK" { + overallColor = "var(--warn)" + } + b.WriteString(``) for idx := 0; idx <= maxGPUIndex; idx++ { score, ok := run.gpuScores[idx] if !ok { b.WriteString(``) continue } - b.WriteString(``) + gpuStatus := run.gpuStatuses[idx] + scoreColor := "" + switch gpuStatus { + case "FAILED": + scoreColor = ` style="color:var(--crit-fg,#9f3a38);font-weight:600"` + case "WARNING", "PARTIAL": + scoreColor = ` style="color:var(--warn);font-weight:600"` + case "", "OK": + // no override + default: + scoreColor = ` style="color:var(--warn);font-weight:600"` + } + b.WriteString(`` + fmt.Sprintf("%.2f", score) + ``) } b.WriteString(``) } @@ -2373,12 +2398,15 @@ func loadBenchmarkHistoryFromPaths(paths []string) (int, []benchmarkHistoryRun) continue } run := benchmarkHistoryRun{ - generatedAt: result.GeneratedAt, - displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"), - gpuScores: make(map[int]float64), + generatedAt: result.GeneratedAt, + displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"), + gpuScores: make(map[int]float64), + gpuStatuses: make(map[int]string), + overallStatus: result.OverallStatus, } for _, gpu := range result.GPUs { run.gpuScores[gpu.Index] = gpu.Scores.CompositeScore + run.gpuStatuses[gpu.Index] = gpu.Status if gpu.Index > maxGPUIndex { maxGPUIndex = gpu.Index } @@ -2447,31 +2475,45 @@ func renderPowerBenchmarkResultsCard(exportDir string) string { if len(latest.GPUs) > 0 { b.WriteString(`
RunTimeStatusGPU ` + strconv.Itoa(i) + `
#` + strconv.Itoa(i+1) + `` + html.EscapeString(run.displayTime) + `` + html.EscapeString(overallLabel) + `-` + fmt.Sprintf("%.2f", score) + `
`) - b.WriteString(``) + b.WriteString(``) b.WriteString(``) for _, gpu := range latest.GPUs { - derated := gpu.Derated || (gpu.DefaultPowerLimitW > 0 && gpu.AppliedPowerLimitW < gpu.DefaultPowerLimitW-1) + // finalLimitW is the definitive TDP: multi-GPU stable limit from the ramp, + // falling back to single-card applied limit if the ramp hasn't run. + finalLimitW := gpu.StablePowerLimitW + if finalLimitW <= 0 { + finalLimitW = gpu.AppliedPowerLimitW + } + // Derate is relative to nominal (DefaultPowerLimitW), using the final limit. + derated := gpu.Derated || + (gpu.DefaultPowerLimitW > 0 && finalLimitW > 0 && finalLimitW < gpu.DefaultPowerLimitW-1) rowStyle := "" - achievedStyle := "" + finalStyle := "" if derated { rowStyle = ` style="background:rgba(255,180,0,0.08)"` - achievedStyle = ` style="color:#e6a000;font-weight:600"` + finalStyle = ` style="color:#e6a000;font-weight:600"` } statusLabel := gpu.Status if statusLabel == "" { statusLabel = "OK" } statusColor := "var(--ok)" - if statusLabel != "OK" { + if statusLabel == "FAILED" { + statusColor = "var(--crit-fg,#9f3a38)" + } else if statusLabel != "OK" { statusColor = "var(--warn)" } nominalStr := "-" if gpu.DefaultPowerLimitW > 0 { nominalStr = fmt.Sprintf("%.0f", gpu.DefaultPowerLimitW) } - achievedStr := "-" + singleStr := "-" if gpu.AppliedPowerLimitW > 0 { - achievedStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW) + singleStr = fmt.Sprintf("%.0f", gpu.AppliedPowerLimitW) + } + multiStr := "-" + if gpu.StablePowerLimitW > 0 { + multiStr = fmt.Sprintf("%.0f", gpu.StablePowerLimitW) } p95Str := "-" if gpu.MaxObservedPowerW > 0 { @@ -2481,7 +2523,8 @@ func renderPowerBenchmarkResultsCard(exportDir string) string { b.WriteString(``) b.WriteString(``) b.WriteString(``) - b.WriteString(`` + achievedStr + ``) + b.WriteString(``) + b.WriteString(`` + multiStr + ``) b.WriteString(``) b.WriteString(``) b.WriteString(``)
GPUModelNominal WAchieved WP95 Observed WStatusGPUModelNominal WSingle-card WMulti-GPU WP95 Observed WStatus
` + strconv.Itoa(gpu.Index) + `` + html.EscapeString(gpu.Name) + `` + nominalStr + `` + singleStr + `` + p95Str + `` + html.EscapeString(statusLabel) + `