Add slot-aware ramp sequence to bee-bench power
This commit is contained in:
@@ -124,6 +124,7 @@ type satRunner interface {
|
|||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
@@ -574,6 +575,13 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
|||||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchPowerDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
|
|||||||
@@ -122,6 +122,7 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
|||||||
type fakeSAT struct {
|
type fakeSAT struct {
|
||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
|
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||||
@@ -154,6 +155,13 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaPowerBenchFn != nil {
|
||||||
|
return f.runNvidiaPowerBenchFn(baseDir, opts)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaTargetedStressFn != nil {
|
if f.runNvidiaTargetedStressFn != nil {
|
||||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||||
|
|||||||
@@ -2603,3 +2603,279 @@ func runBenchmarkPowerCalibration(
|
|||||||
}
|
}
|
||||||
return results, restore
|
return results, restore
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func powerBenchDurationSec(profile string) int {
|
||||||
|
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||||||
|
case NvidiaBenchmarkProfileStability:
|
||||||
|
return 300
|
||||||
|
case NvidiaBenchmarkProfileOvernight:
|
||||||
|
return 600
|
||||||
|
default:
|
||||||
|
return 120
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// occupiedSlots returns every entry of indices that is not equal to current,
// preserving the input order. The returned slice is freshly allocated and is
// never nil, even when nothing is kept.
func occupiedSlots(indices []int, current int) []int {
	others := make([]int, 0, len(indices))
	for i := range indices {
		if indices[i] == current {
			continue
		}
		others = append(others, indices[i])
	}
	return others
}
|
||||||
|
|
||||||
|
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
|
||||||
|
out := make(map[int]benchmarkGPUInfo, len(src))
|
||||||
|
for k, v := range src {
|
||||||
|
out[k] = v
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString("# Bee Bench Power Report\n\n")
|
||||||
|
fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
|
||||||
|
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||||
|
fmt.Fprintf(&b, "**Overall status:** %s \n\n", result.OverallStatus)
|
||||||
|
if len(result.Findings) > 0 {
|
||||||
|
b.WriteString("## Summary\n\n")
|
||||||
|
for _, finding := range result.Findings {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", finding)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
|
b.WriteString("## Recommended Slot Order\n\n")
|
||||||
|
fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
|
||||||
|
}
|
||||||
|
if len(result.RampSteps) > 0 {
|
||||||
|
b.WriteString("## Ramp Sequence\n\n")
|
||||||
|
b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n")
|
||||||
|
b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n")
|
||||||
|
for _, step := range result.RampSteps {
|
||||||
|
fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n",
|
||||||
|
step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
b.WriteString("## Per-Slot Results\n\n")
|
||||||
|
b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n")
|
||||||
|
b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n")
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n",
|
||||||
|
gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
|
||||||
|
if gpu.OccupiedSlotsNote != "" {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
|
||||||
|
}
|
||||||
|
for _, note := range gpu.Notes {
|
||||||
|
fmt.Fprintf(&b, "- %s\n", note)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||||
|
fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
|
||||||
|
fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
|
||||||
|
fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
|
||||||
|
fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
|
||||||
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
|
fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
|
||||||
|
}
|
||||||
|
for _, step := range result.RampSteps {
|
||||||
|
fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
|
||||||
|
fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
if logFunc == nil {
|
||||||
|
logFunc = func(string) {}
|
||||||
|
}
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = "/var/log/bee-bench/power"
|
||||||
|
}
|
||||||
|
opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
|
||||||
|
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
if len(selected) == 0 {
|
||||||
|
return "", fmt.Errorf("no NVIDIA GPUs selected")
|
||||||
|
}
|
||||||
|
ts := time.Now().UTC().Format("20060102-150405")
|
||||||
|
runDir := filepath.Join(baseDir, "power-"+ts)
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||||
|
}
|
||||||
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||||
|
infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
|
||||||
|
if infoErr != nil {
|
||||||
|
return "", infoErr
|
||||||
|
}
|
||||||
|
hostname, _ := os.Hostname()
|
||||||
|
result := NvidiaPowerBenchResult{
|
||||||
|
BenchmarkVersion: benchmarkVersion,
|
||||||
|
GeneratedAt: time.Now().UTC(),
|
||||||
|
Hostname: hostname,
|
||||||
|
ServerModel: readServerModel(),
|
||||||
|
BenchmarkProfile: opts.Profile,
|
||||||
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
|
OverallStatus: "OK",
|
||||||
|
}
|
||||||
|
durationSec := powerBenchDurationSec(opts.Profile)
|
||||||
|
_ = durationSec
|
||||||
|
calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
|
||||||
|
defer func() {
|
||||||
|
for i := len(restoreActions) - 1; i >= 0; i-- {
|
||||||
|
restoreActions[i].fn()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
info := infoByIndex[idx]
|
||||||
|
calib := calibByIndex[idx]
|
||||||
|
status := "OK"
|
||||||
|
if !calib.Completed {
|
||||||
|
status = "FAILED"
|
||||||
|
result.OverallStatus = "PARTIAL"
|
||||||
|
} else if calib.Derated {
|
||||||
|
status = "PARTIAL"
|
||||||
|
if result.OverallStatus == "OK" {
|
||||||
|
result.OverallStatus = "PARTIAL"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
occupied := occupiedSlots(selected, idx)
|
||||||
|
note := ""
|
||||||
|
if len(occupied) > 0 {
|
||||||
|
note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
|
||||||
|
}
|
||||||
|
gpus = append(gpus, NvidiaPowerBenchGPU{
|
||||||
|
Index: idx,
|
||||||
|
Name: info.Name,
|
||||||
|
BusID: info.BusID,
|
||||||
|
DefaultPowerLimitW: info.DefaultPowerLimitW,
|
||||||
|
AppliedPowerLimitW: calib.AppliedPowerLimitW,
|
||||||
|
MaxObservedPowerW: calib.Summary.P95PowerW,
|
||||||
|
MaxObservedTempC: calib.Summary.P95TempC,
|
||||||
|
CalibrationAttempts: calib.Attempts,
|
||||||
|
Derated: calib.Derated,
|
||||||
|
Status: status,
|
||||||
|
OccupiedSlots: occupied,
|
||||||
|
OccupiedSlotsNote: note,
|
||||||
|
Notes: append([]string(nil), calib.Notes...),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
sort.Slice(gpus, func(i, j int) bool {
|
||||||
|
if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
|
||||||
|
return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW
|
||||||
|
}
|
||||||
|
if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW {
|
||||||
|
return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW
|
||||||
|
}
|
||||||
|
if gpus[i].Derated != gpus[j].Derated {
|
||||||
|
return !gpus[i].Derated
|
||||||
|
}
|
||||||
|
return gpus[i].Index < gpus[j].Index
|
||||||
|
})
|
||||||
|
result.GPUs = gpus
|
||||||
|
result.RecommendedSlotOrder = make([]int, 0, len(gpus))
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
|
||||||
|
}
|
||||||
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
|
result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
|
||||||
|
}
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
if gpu.Derated {
|
||||||
|
result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
singleByIndex[gpu.Index] = gpu
|
||||||
|
}
|
||||||
|
for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
|
||||||
|
subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
|
||||||
|
stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
|
||||||
|
_ = os.MkdirAll(stepDir, 0755)
|
||||||
|
stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||||
|
stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
|
||||||
|
for i := len(stepRestore) - 1; i >= 0; i-- {
|
||||||
|
stepRestore[i].fn()
|
||||||
|
}
|
||||||
|
ramp := NvidiaPowerBenchStep{
|
||||||
|
StepIndex: step,
|
||||||
|
GPUIndices: subset,
|
||||||
|
Status: "OK",
|
||||||
|
}
|
||||||
|
var realizationValues []float64
|
||||||
|
for _, idx := range subset {
|
||||||
|
calib := stepCalib[idx]
|
||||||
|
ramp.TotalObservedPowerW += calib.Summary.P95PowerW
|
||||||
|
if calib.Derated {
|
||||||
|
ramp.DeratedGPUCount++
|
||||||
|
ramp.Status = "PARTIAL"
|
||||||
|
}
|
||||||
|
if !calib.Completed {
|
||||||
|
ramp.Status = "FAILED"
|
||||||
|
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 {
|
||||||
|
realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100
|
||||||
|
realizationValues = append(realizationValues, realization)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(subset) > 0 {
|
||||||
|
ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
|
||||||
|
}
|
||||||
|
if len(realizationValues) > 0 {
|
||||||
|
ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues)
|
||||||
|
ramp.MinPowerRealizationPct = realizationValues[0]
|
||||||
|
for _, v := range realizationValues[1:] {
|
||||||
|
if v < ramp.MinPowerRealizationPct {
|
||||||
|
ramp.MinPowerRealizationPct = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 {
|
||||||
|
ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step))
|
||||||
|
if result.OverallStatus == "OK" {
|
||||||
|
result.OverallStatus = "PARTIAL"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ramp.DeratedGPUCount > 0 {
|
||||||
|
result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount))
|
||||||
|
}
|
||||||
|
result.RampSteps = append(result.RampSteps, ramp)
|
||||||
|
}
|
||||||
|
resultJSON, err := json.MarshalIndent(result, "", " ")
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("marshal power result: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
|
||||||
|
return "", fmt.Errorf("write result.json: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil {
|
||||||
|
return "", fmt.Errorf("write report.md: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil {
|
||||||
|
return "", fmt.Errorf("write summary.txt: %w", err)
|
||||||
|
}
|
||||||
|
return runDir, nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -251,3 +251,45 @@ type BenchmarkInterconnectResult struct {
|
|||||||
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
||||||
Notes []string `json:"notes,omitempty"`
|
Notes []string `json:"notes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type NvidiaPowerBenchResult struct {
|
||||||
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
|
RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"`
|
||||||
|
RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"`
|
||||||
|
OverallStatus string `json:"overall_status"`
|
||||||
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
GPUs []NvidiaPowerBenchGPU `json:"gpus"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// NvidiaPowerBenchGPU is the per-GPU entry of a power-bench result. Every
// field except Index and Status is omitted from JSON when it holds its zero
// value.
type NvidiaPowerBenchGPU struct {
	// Identity of the GPU.
	Index int    `json:"index"`
	Name  string `json:"name,omitempty"`
	BusID string `json:"bus_id,omitempty"`

	// Power limits in watts.
	DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"`
	AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"`

	// Observed values; RunNvidiaPowerBench fills these from the calibration
	// summary's P95 power and P95 temperature.
	MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"`
	MaxObservedTempC  float64 `json:"max_observed_temp_c,omitempty"`

	// Calibration outcome.
	CalibrationAttempts int    `json:"calibration_attempts,omitempty"`
	Derated             bool   `json:"derated,omitempty"`
	Status              string `json:"status"`

	// The other selected GPU indices during the measurement, plus the
	// explanatory sentence generated from them.
	OccupiedSlots     []int  `json:"occupied_slots,omitempty"`
	OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"`

	// Free-form notes copied from the calibration result.
	Notes []string `json:"notes,omitempty"`
}
|
||||||
|
|
||||||
|
// NvidiaPowerBenchStep is one step of the ramp sequence: the result of
// running a set of GPUs together. Every field except StepIndex, GPUIndices and
// Status is omitted from JSON when it holds its zero value.
type NvidiaPowerBenchStep struct {
	// Step number and the GPU indices exercised in this step.
	StepIndex  int   `json:"step_index"`
	GPUIndices []int `json:"gpu_indices"`

	// Sum of observed power over the step's GPUs, and that sum divided by the
	// number of GPUs in the step.
	TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"`
	AvgObservedPowerW   float64 `json:"avg_observed_power_w,omitempty"`

	// Observed power as a percentage of the same GPU's first-pass value:
	// the minimum and the mean over the GPUs that completed the step.
	MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"`
	AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"`

	// Number of GPUs reported as derated in this step.
	DeratedGPUCount int `json:"derated_gpu_count,omitempty"`

	// Step outcome and free-form notes.
	Status string   `json:"status"`
	Notes  []string `json:"notes,omitempty"`
}
|
||||||
|
|||||||
@@ -233,6 +233,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
|||||||
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||||
b.WriteString(benchmarkCard)
|
b.WriteString(benchmarkCard)
|
||||||
}
|
}
|
||||||
|
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
|
||||||
|
b.WriteString(powerCard)
|
||||||
|
}
|
||||||
|
|
||||||
if len(report.Charts) > 0 {
|
if len(report.Charts) > 0 {
|
||||||
for _, chart := range report.Charts {
|
for _, chart := range report.Charts {
|
||||||
@@ -273,15 +276,42 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderTaskPowerResultsCard(target, logText string) string {
|
||||||
|
if strings.TrimSpace(target) != "nvidia-bench-power" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
resultPath := taskBenchmarkResultPath(logText)
|
||||||
|
if strings.TrimSpace(resultPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
raw, err := os.ReadFile(resultPath)
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
var result platform.NvidiaPowerBenchResult
|
||||||
|
if err := json.Unmarshal(raw, &result); err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
|
||||||
|
if len(result.RecommendedSlotOrder) > 0 {
|
||||||
|
b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
|
||||||
|
gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
|
||||||
|
}
|
||||||
|
b.WriteString(`</table></div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
func taskBenchmarkResultPath(logText string) string {
|
func taskBenchmarkResultPath(logText string) string {
|
||||||
archivePath := taskArchivePathFromLog(logText)
|
archivePath := taskArchivePathFromLog(logText)
|
||||||
if archivePath == "" {
|
if archivePath == "" {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
if runDir == archivePath {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
return filepath.Join(runDir, "result.json")
|
return filepath.Join(runDir, "result.json")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -650,26 +650,14 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
dur := t.params.Duration
|
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
|
||||||
if dur <= 0 {
|
Profile: t.params.BenchmarkProfile,
|
||||||
switch strings.TrimSpace(strings.ToLower(t.params.BenchmarkProfile)) {
|
GPUIndices: t.params.GPUIndices,
|
||||||
case platform.NvidiaBenchmarkProfileStability:
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
dur = 300
|
RampStep: t.params.RampStep,
|
||||||
case platform.NvidiaBenchmarkProfileOvernight:
|
RampTotal: t.params.RampTotal,
|
||||||
dur = 600
|
RampRunID: t.params.RampRunID,
|
||||||
default:
|
}, j.append)
|
||||||
dur = 120
|
|
||||||
}
|
|
||||||
}
|
|
||||||
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BenchmarkProfile, t.params.RampTotal > 0, t.params.GPUIndices)
|
|
||||||
if planErr != nil {
|
|
||||||
err = planErr
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if t.params.RampTotal > 0 && t.params.RampStep > 0 && dur <= 0 {
|
|
||||||
dur = rampPlan.DurationSec
|
|
||||||
}
|
|
||||||
archive, err = a.RunNvidiaTargetedPowerPack(ctx, app.DefaultBeeBenchPowerDir, dur, t.params.GPUIndices, j.append)
|
|
||||||
case "nvidia-compute":
|
case "nvidia-compute":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
|
|||||||
Reference in New Issue
Block a user