Add slot-aware ramp sequence to bee-bench power
This commit is contained in:
@@ -124,6 +124,7 @@ type satRunner interface {
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaPowerBench(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
@@ -574,6 +575,13 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPowerDir
|
||||
}
|
||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
|
||||
@@ -122,6 +122,7 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaBenchmarkFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaPowerBenchFn func(string, platform.NvidiaBenchmarkOptions) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runNvidiaComputeFn func(string, int, []int) (string, error)
|
||||
runNvidiaPowerFn func(string, int, []int) (string, error)
|
||||
@@ -154,6 +155,13 @@ func (f fakeSAT) RunNvidiaBenchmark(_ context.Context, baseDir string, opts plat
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaPowerBench(_ context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaPowerBenchFn != nil {
|
||||
return f.runNvidiaPowerBenchFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaTargetedStressFn != nil {
|
||||
return f.runNvidiaTargetedStressFn(baseDir, durationSec, gpuIndices)
|
||||
|
||||
@@ -2603,3 +2603,279 @@ func runBenchmarkPowerCalibration(
|
||||
}
|
||||
return results, restore
|
||||
}
|
||||
|
||||
func powerBenchDurationSec(profile string) int {
|
||||
switch strings.TrimSpace(strings.ToLower(profile)) {
|
||||
case NvidiaBenchmarkProfileStability:
|
||||
return 300
|
||||
case NvidiaBenchmarkProfileOvernight:
|
||||
return 600
|
||||
default:
|
||||
return 120
|
||||
}
|
||||
}
|
||||
|
||||
// occupiedSlots returns every index from indices except current, preserving
// input order. It describes which other slots were populated while the GPU
// at current was being measured.
func occupiedSlots(indices []int, current int) []int {
	others := make([]int, 0, len(indices))
	for _, slot := range indices {
		if slot == current {
			continue
		}
		others = append(others, slot)
	}
	return others
}
|
||||
|
||||
func cloneBenchmarkGPUInfoMap(src map[int]benchmarkGPUInfo) map[int]benchmarkGPUInfo {
|
||||
out := make(map[int]benchmarkGPUInfo, len(src))
|
||||
for k, v := range src {
|
||||
out[k] = v
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// renderPowerBenchReport formats a power-bench result as a Markdown report:
// header metadata, a findings summary, the recommended slot order, the ramp
// sequence table, a per-slot results table, and per-GPU note sections.
func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
	var b strings.Builder
	b.WriteString("# Bee Bench Power Report\n\n")
	fmt.Fprintf(&b, "**Benchmark version:** %s \n", result.BenchmarkVersion)
	fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
	fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
	fmt.Fprintf(&b, "**Overall status:** %s \n\n", result.OverallStatus)
	// Findings accumulated during the run become the top-level summary bullets.
	if len(result.Findings) > 0 {
		b.WriteString("## Summary\n\n")
		for _, finding := range result.Findings {
			fmt.Fprintf(&b, "- %s\n", finding)
		}
		b.WriteString("\n")
	}
	if len(result.RecommendedSlotOrder) > 0 {
		b.WriteString("## Recommended Slot Order\n\n")
		fmt.Fprintf(&b, "Populate GPUs in this order for best single-card power realization: `%s`\n\n", joinIndexList(result.RecommendedSlotOrder))
	}
	// One table row per ramp step (each step adds one GPU to the subset).
	if len(result.RampSteps) > 0 {
		b.WriteString("## Ramp Sequence\n\n")
		b.WriteString("| Step | GPUs | Total Power | Avg / GPU | Avg Realization | Min Realization | Derated |\n")
		b.WriteString("|------|------|-------------|-----------|-----------------|-----------------|---------|\n")
		for _, step := range result.RampSteps {
			fmt.Fprintf(&b, "| %d | %s | %.0f W | %.0f W | %.1f%% | %.1f%% | %d |\n",
				step.StepIndex, joinIndexList(step.GPUIndices), step.TotalObservedPowerW, step.AvgObservedPowerW, step.AvgPowerRealizationPct, step.MinPowerRealizationPct, step.DeratedGPUCount)
		}
		b.WriteString("\n")
	}
	// Per-slot overview table, one row per measured GPU.
	b.WriteString("## Per-Slot Results\n\n")
	b.WriteString("| GPU | Status | Max Power | Temp | Applied Limit | Default Limit | Attempts |\n")
	b.WriteString("|-----|--------|-----------|------|---------------|---------------|----------|\n")
	for _, gpu := range result.GPUs {
		fmt.Fprintf(&b, "| GPU %d | %s | %.0f W | %.1f C | %.0f W | %.0f W | %d |\n",
			gpu.Index, gpu.Status, gpu.MaxObservedPowerW, gpu.MaxObservedTempC, gpu.AppliedPowerLimitW, gpu.DefaultPowerLimitW, gpu.CalibrationAttempts)
	}
	b.WriteString("\n")
	// Detail section per GPU: the slot-occupancy caveat (if any) followed by
	// calibration notes collected during the run.
	for _, gpu := range result.GPUs {
		fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, gpu.Name)
		if gpu.OccupiedSlotsNote != "" {
			fmt.Fprintf(&b, "- %s\n", gpu.OccupiedSlotsNote)
		}
		for _, note := range gpu.Notes {
			fmt.Fprintf(&b, "- %s\n", note)
		}
		b.WriteString("\n")
	}
	return b.String()
}
|
||||
|
||||
// renderPowerBenchSummary emits a flat key=value text summary of the run
// (one field per line), suitable for machine parsing from summary.txt.
func renderPowerBenchSummary(result NvidiaPowerBenchResult) string {
	var b strings.Builder
	fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
	fmt.Fprintf(&b, "benchmark_version=%s\n", result.BenchmarkVersion)
	fmt.Fprintf(&b, "benchmark_profile=%s\n", result.BenchmarkProfile)
	fmt.Fprintf(&b, "overall_status=%s\n", result.OverallStatus)
	fmt.Fprintf(&b, "gpu_count=%d\n", len(result.GPUs))
	if len(result.RecommendedSlotOrder) > 0 {
		fmt.Fprintf(&b, "recommended_slot_order=%s\n", joinIndexList(result.RecommendedSlotOrder))
	}
	// One gpus/total-power pair per ramp step, keyed by step index.
	for _, step := range result.RampSteps {
		fmt.Fprintf(&b, "ramp_step_%d_gpus=%s\n", step.StepIndex, joinIndexList(step.GPUIndices))
		fmt.Fprintf(&b, "ramp_step_%d_total_power_w=%.0f\n", step.StepIndex, step.TotalObservedPowerW)
	}
	return b.String()
}
|
||||
|
||||
// RunNvidiaPowerBench executes the slot-aware power benchmark: it calibrates
// every selected GPU individually, ranks the GPUs into a recommended slot
// order by observed power, then re-runs calibration on growing GPU subsets
// (the "ramp sequence") to measure how power realization degrades as more
// slots are populated. Results are written to a timestamped run directory
// (result.json, report.md, summary.txt), whose path is returned.
func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
	// Defensive defaults: a nil context and nil log sink are tolerated.
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/power"
	}
	opts = normalizeNvidiaBenchmarkOptionsForBenchmark(opts)
	// Resolve the effective GPU set from the include/exclude lists.
	selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs selected")
	}
	// One timestamped directory per run holds all artifacts.
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "power-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	infoByIndex, infoErr := queryBenchmarkGPUInfo(selected)
	if infoErr != nil {
		return "", infoErr
	}
	hostname, _ := os.Hostname() // best-effort; empty hostname is acceptable
	result := NvidiaPowerBenchResult{
		BenchmarkVersion: benchmarkVersion,
		GeneratedAt: time.Now().UTC(),
		Hostname: hostname,
		ServerModel: readServerModel(),
		BenchmarkProfile: opts.Profile,
		SelectedGPUIndices: append([]int(nil), selected...),
		OverallStatus: "OK",
	}
	// NOTE(review): durationSec is computed from the profile but never used
	// below — presumably intended to bound the targeted_power runs inside
	// runBenchmarkPowerCalibration; confirm and either wire it through or
	// drop the call.
	durationSec := powerBenchDurationSec(opts.Profile)
	_ = durationSec
	// Phase 1: per-GPU single-card calibration over the full selection.
	calibByIndex, restoreActions := runBenchmarkPowerCalibration(ctx, verboseLog, runDir, selected, infoByIndex, logFunc)
	// Undo power-limit changes in reverse order when the function returns.
	defer func() {
		for i := len(restoreActions) - 1; i >= 0; i-- {
			restoreActions[i].fn()
		}
	}()
	gpus := make([]NvidiaPowerBenchGPU, 0, len(selected))
	for _, idx := range selected {
		info := infoByIndex[idx]
		calib := calibByIndex[idx]
		// Per-GPU status: FAILED if calibration never completed, PARTIAL if
		// it completed only at a reduced (derated) power limit.
		status := "OK"
		if !calib.Completed {
			status = "FAILED"
			result.OverallStatus = "PARTIAL"
		} else if calib.Derated {
			status = "PARTIAL"
			if result.OverallStatus == "OK" {
				result.OverallStatus = "PARTIAL"
			}
		}
		// Record which other slots were populated during this measurement so
		// the report can caveat the slot recommendation.
		occupied := occupiedSlots(selected, idx)
		note := ""
		if len(occupied) > 0 {
			note = fmt.Sprintf("Slot recommendation was measured while slots %s were populated; airflow in a different chassis fill pattern may differ.", joinIndexList(occupied))
		}
		gpus = append(gpus, NvidiaPowerBenchGPU{
			Index: idx,
			Name: info.Name,
			BusID: info.BusID,
			DefaultPowerLimitW: info.DefaultPowerLimitW,
			AppliedPowerLimitW: calib.AppliedPowerLimitW,
			MaxObservedPowerW: calib.Summary.P95PowerW,
			MaxObservedTempC: calib.Summary.P95TempC,
			CalibrationAttempts: calib.Attempts,
			Derated: calib.Derated,
			Status: status,
			OccupiedSlots: occupied,
			OccupiedSlotsNote: note,
			Notes: append([]string(nil), calib.Notes...),
		})
	}
	// Rank GPUs: highest observed power first, then highest applied limit,
	// then non-derated before derated, then by index for a stable order.
	sort.Slice(gpus, func(i, j int) bool {
		if gpus[i].MaxObservedPowerW != gpus[j].MaxObservedPowerW {
			return gpus[i].MaxObservedPowerW > gpus[j].MaxObservedPowerW
		}
		if gpus[i].AppliedPowerLimitW != gpus[j].AppliedPowerLimitW {
			return gpus[i].AppliedPowerLimitW > gpus[j].AppliedPowerLimitW
		}
		if gpus[i].Derated != gpus[j].Derated {
			return !gpus[i].Derated
		}
		return gpus[i].Index < gpus[j].Index
	})
	result.GPUs = gpus
	// The ranked order doubles as the recommended slot-population order.
	result.RecommendedSlotOrder = make([]int, 0, len(gpus))
	for _, gpu := range gpus {
		result.RecommendedSlotOrder = append(result.RecommendedSlotOrder, gpu.Index)
	}
	if len(result.RecommendedSlotOrder) > 0 {
		result.Findings = append(result.Findings, fmt.Sprintf("Recommended slot order for installation based on single-card targeted_power: %s.", joinIndexList(result.RecommendedSlotOrder)))
	}
	for _, gpu := range gpus {
		if gpu.Derated {
			result.Findings = append(result.Findings, fmt.Sprintf("GPU %d required reduced power limit %.0f W to complete targeted_power.", gpu.Index, gpu.AppliedPowerLimitW))
		}
	}
	// Index the single-card baselines for realization-percentage lookups.
	singleByIndex := make(map[int]NvidiaPowerBenchGPU, len(gpus))
	for _, gpu := range gpus {
		singleByIndex[gpu.Index] = gpu
	}
	// Phase 2: ramp sequence — re-calibrate cumulative prefixes of the
	// recommended order (step 1 = best GPU alone, step N = all GPUs).
	for step := 1; step <= len(result.RecommendedSlotOrder); step++ {
		subset := append([]int(nil), result.RecommendedSlotOrder[:step]...)
		stepDir := filepath.Join(runDir, fmt.Sprintf("step-%02d", step))
		_ = os.MkdirAll(stepDir, 0755) // best-effort; calibration logs its own errors
		stepInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
		stepCalib, stepRestore := runBenchmarkPowerCalibration(ctx, verboseLog, stepDir, subset, stepInfo, logFunc)
		// Restore power limits immediately after each step (reverse order),
		// not deferred, so steps do not stack limit changes.
		for i := len(stepRestore) - 1; i >= 0; i-- {
			stepRestore[i].fn()
		}
		ramp := NvidiaPowerBenchStep{
			StepIndex: step,
			GPUIndices: subset,
			Status: "OK",
		}
		// Realization = step power / single-card baseline, per GPU, in %.
		var realizationValues []float64
		for _, idx := range subset {
			calib := stepCalib[idx]
			ramp.TotalObservedPowerW += calib.Summary.P95PowerW
			if calib.Derated {
				ramp.DeratedGPUCount++
				ramp.Status = "PARTIAL"
			}
			if !calib.Completed {
				// FAILED overrides PARTIAL; skip realization for this GPU.
				ramp.Status = "FAILED"
				ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete targeted_power in ramp step %d", idx, step))
				continue
			}
			if single, ok := singleByIndex[idx]; ok && single.MaxObservedPowerW > 0 {
				realization := calib.Summary.P95PowerW / single.MaxObservedPowerW * 100
				realizationValues = append(realizationValues, realization)
			}
		}
		if len(subset) > 0 {
			ramp.AvgObservedPowerW = ramp.TotalObservedPowerW / float64(len(subset))
		}
		if len(realizationValues) > 0 {
			ramp.AvgPowerRealizationPct = benchmarkMean(realizationValues)
			// Minimum realization across the subset's GPUs.
			ramp.MinPowerRealizationPct = realizationValues[0]
			for _, v := range realizationValues[1:] {
				if v < ramp.MinPowerRealizationPct {
					ramp.MinPowerRealizationPct = v
				}
			}
		}
		// Below-90% realization downgrades the overall run to PARTIAL.
		if ramp.MinPowerRealizationPct > 0 && ramp.MinPowerRealizationPct < 90 {
			ramp.Notes = append(ramp.Notes, fmt.Sprintf("Power realization fell to %.1f%% of single-card baseline by step %d.", ramp.MinPowerRealizationPct, step))
			if result.OverallStatus == "OK" {
				result.OverallStatus = "PARTIAL"
			}
		}
		if ramp.DeratedGPUCount > 0 {
			result.Findings = append(result.Findings, fmt.Sprintf("Ramp step %d (%s) needed derating on %d GPU(s).", step, joinIndexList(subset), ramp.DeratedGPUCount))
		}
		result.RampSteps = append(result.RampSteps, ramp)
	}
	// Persist the three artifacts: machine-readable JSON, Markdown report,
	// and the flat key=value summary.
	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return "", fmt.Errorf("marshal power result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return "", fmt.Errorf("write result.json: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderPowerBenchReport(result)), 0644); err != nil {
		return "", fmt.Errorf("write report.md: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderPowerBenchSummary(result)), 0644); err != nil {
		return "", fmt.Errorf("write summary.txt: %w", err)
	}
	return runDir, nil
}
|
||||
|
||||
@@ -251,3 +251,45 @@ type BenchmarkInterconnectResult struct {
|
||||
MaxBusBWGBps float64 `json:"max_busbw_gbps,omitempty"`
|
||||
Notes []string `json:"notes,omitempty"`
|
||||
}
|
||||
|
||||
// NvidiaPowerBenchResult is the top-level payload written to result.json by
// a bee-bench power run: host/profile metadata, the measured GPUs, the
// recommended slot-population order, and the slot-aware ramp sequence.
type NvidiaPowerBenchResult struct {
	BenchmarkVersion string `json:"benchmark_version"` // version string of the benchmark that produced this result
	GeneratedAt time.Time `json:"generated_at"` // UTC time the result was generated
	Hostname string `json:"hostname,omitempty"` // host the run executed on (best-effort)
	ServerModel string `json:"server_model,omitempty"` // chassis/server model, if readable
	BenchmarkProfile string `json:"benchmark_profile"` // profile name the run was invoked with
	SelectedGPUIndices []int `json:"selected_gpu_indices"` // GPUs included after include/exclude resolution
	RecommendedSlotOrder []int `json:"recommended_slot_order,omitempty"` // GPU indices ranked best-first for population
	RampSteps []NvidiaPowerBenchStep `json:"ramp_steps,omitempty"` // cumulative-subset ramp measurements
	OverallStatus string `json:"overall_status"` // "OK" or "PARTIAL"
	Findings []string `json:"findings,omitempty"` // human-readable summary findings
	GPUs []NvidiaPowerBenchGPU `json:"gpus"` // per-GPU single-card results, ranked
}
|
||||
|
||||
// NvidiaPowerBenchGPU holds the single-card calibration result for one GPU,
// including the slot-occupancy context it was measured under.
type NvidiaPowerBenchGPU struct {
	Index int `json:"index"` // nvidia-smi GPU index
	Name string `json:"name,omitempty"` // GPU product name
	BusID string `json:"bus_id,omitempty"` // PCI bus ID
	DefaultPowerLimitW float64 `json:"default_power_limit_w,omitempty"` // factory power limit
	AppliedPowerLimitW float64 `json:"applied_power_limit_w,omitempty"` // limit in effect when calibration completed
	MaxObservedPowerW float64 `json:"max_observed_power_w,omitempty"` // P95 power observed during calibration
	MaxObservedTempC float64 `json:"max_observed_temp_c,omitempty"` // P95 temperature observed during calibration
	CalibrationAttempts int `json:"calibration_attempts,omitempty"` // number of calibration passes needed
	Derated bool `json:"derated,omitempty"` // true if a reduced power limit was required
	Status string `json:"status"` // "OK", "PARTIAL", or "FAILED"
	OccupiedSlots []int `json:"occupied_slots,omitempty"` // other selected GPUs present during the measurement
	OccupiedSlotsNote string `json:"occupied_slots_note,omitempty"` // airflow caveat rendered in the report
	Notes []string `json:"notes,omitempty"` // free-form calibration notes
}
|
||||
|
||||
// NvidiaPowerBenchStep records one step of the ramp sequence, where the
// first StepIndex GPUs of the recommended slot order run together.
type NvidiaPowerBenchStep struct {
	StepIndex int `json:"step_index"` // 1-based step number (== number of GPUs active)
	GPUIndices []int `json:"gpu_indices"` // cumulative GPU subset active in this step
	TotalObservedPowerW float64 `json:"total_observed_power_w,omitempty"` // summed P95 power across the subset
	AvgObservedPowerW float64 `json:"avg_observed_power_w,omitempty"` // total power divided by subset size
	MinPowerRealizationPct float64 `json:"min_power_realization_pct,omitempty"` // worst per-GPU % of single-card baseline
	AvgPowerRealizationPct float64 `json:"avg_power_realization_pct,omitempty"` // mean per-GPU % of single-card baseline
	DeratedGPUCount int `json:"derated_gpu_count,omitempty"` // GPUs that needed a reduced limit in this step
	Status string `json:"status"` // "OK", "PARTIAL", or "FAILED"
	Notes []string `json:"notes,omitempty"` // per-step warnings (failures, realization drops)
}
|
||||
|
||||
@@ -233,6 +233,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
||||
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||
b.WriteString(benchmarkCard)
|
||||
}
|
||||
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
|
||||
b.WriteString(powerCard)
|
||||
}
|
||||
|
||||
if len(report.Charts) > 0 {
|
||||
for _, chart := range report.Charts {
|
||||
@@ -273,15 +276,42 @@ func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
)
|
||||
}
|
||||
|
||||
// renderTaskPowerResultsCard renders the "Power Results" HTML card for a
// completed nvidia-bench-power task. It locates the run's result.json from
// the archive path in the task log. Any failure (wrong target, no archive
// path, unreadable or unparseable result) returns "" so the card is simply
// omitted — deliberately best-effort, not an error path.
func renderTaskPowerResultsCard(target, logText string) string {
	if strings.TrimSpace(target) != "nvidia-bench-power" {
		return ""
	}
	resultPath := taskBenchmarkResultPath(logText)
	if strings.TrimSpace(resultPath) == "" {
		return ""
	}
	raw, err := os.ReadFile(resultPath)
	if err != nil {
		return "" // result file missing or unreadable: skip the card
	}
	var result platform.NvidiaPowerBenchResult
	if err := json.Unmarshal(raw, &result); err != nil {
		return "" // malformed result.json: skip the card
	}
	var b strings.Builder
	b.WriteString(`<div class="card"><div class="card-head">Power Results</div><div class="card-body">`)
	if len(result.RecommendedSlotOrder) > 0 {
		b.WriteString(`<p style="margin-bottom:10px"><strong>Recommended slot order:</strong> ` + html.EscapeString(joinTaskIndices(result.RecommendedSlotOrder)) + `</p>`)
	}
	b.WriteString(`<table><tr><th>GPU</th><th>Status</th><th>Max Power</th><th>Applied Limit</th></tr>`)
	// One row per GPU; status text is HTML-escaped, numbers formatted inline.
	for _, gpu := range result.GPUs {
		fmt.Fprintf(&b, `<tr><td>GPU %d</td><td>%s</td><td>%.0f W</td><td>%.0f W</td></tr>`,
			gpu.Index, html.EscapeString(gpu.Status), gpu.MaxObservedPowerW, gpu.AppliedPowerLimitW)
	}
	b.WriteString(`</table></div></div>`)
	return b.String()
}
|
||||
|
||||
func taskBenchmarkResultPath(logText string) string {
|
||||
archivePath := taskArchivePathFromLog(logText)
|
||||
if archivePath == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
if runDir == archivePath {
|
||||
return ""
|
||||
}
|
||||
return filepath.Join(runDir, "result.json")
|
||||
}
|
||||
|
||||
|
||||
@@ -650,26 +650,14 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if dur <= 0 {
|
||||
switch strings.TrimSpace(strings.ToLower(t.params.BenchmarkProfile)) {
|
||||
case platform.NvidiaBenchmarkProfileStability:
|
||||
dur = 300
|
||||
case platform.NvidiaBenchmarkProfileOvernight:
|
||||
dur = 600
|
||||
default:
|
||||
dur = 120
|
||||
}
|
||||
}
|
||||
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BenchmarkProfile, t.params.RampTotal > 0, t.params.GPUIndices)
|
||||
if planErr != nil {
|
||||
err = planErr
|
||||
break
|
||||
}
|
||||
if t.params.RampTotal > 0 && t.params.RampStep > 0 && dur <= 0 {
|
||||
dur = rampPlan.DurationSec
|
||||
}
|
||||
archive, err = a.RunNvidiaTargetedPowerPack(ctx, app.DefaultBeeBenchPowerDir, dur, t.params.GPUIndices, j.append)
|
||||
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RampStep: t.params.RampStep,
|
||||
RampTotal: t.params.RampTotal,
|
||||
RampRunID: t.params.RampRunID,
|
||||
}, j.append)
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
|
||||
Reference in New Issue
Block a user