736 lines
26 KiB
Go
736 lines
26 KiB
Go
package platform
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// Tunables and defaults for the benchmark power-source autotune procedure.
const (
	// benchmarkPowerAutotuneVersion is the version stamped into saved autotune
	// configs (also applied when a config is saved with a non-positive version).
	benchmarkPowerAutotuneVersion = 1

	// benchmarkPowerAutotuneIdleSec is the length, in seconds, of the idle
	// sampling window.
	benchmarkPowerAutotuneIdleSec = 60

	// benchmarkPowerAutotuneLoadSec is the length, in seconds, of the load
	// stage and of the sampling window that runs alongside it.
	benchmarkPowerAutotuneLoadSec = 90

	// benchmarkPowerAutotuneSampleInterval is the pause, in seconds, between
	// two consecutive telemetry samples.
	benchmarkPowerAutotuneSampleInterval = 3

	// defaultBenchmarkPowerSourceConfigPath is the config location used when
	// no base directory is supplied.
	defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
)
|
|
|
|
func BenchmarkPowerSourceConfigPath(baseDir string) string {
|
|
baseDir = strings.TrimSpace(baseDir)
|
|
if baseDir == "" {
|
|
return defaultBenchmarkPowerSourceConfigPath
|
|
}
|
|
return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
|
|
}
|
|
|
|
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
|
|
raw, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
var cfg BenchmarkPowerAutotuneConfig
|
|
if err := json.Unmarshal(raw, &cfg); err != nil {
|
|
return nil, err
|
|
}
|
|
if strings.TrimSpace(cfg.SelectedSource) == "" {
|
|
return nil, fmt.Errorf("autotune config missing selected_source")
|
|
}
|
|
return &cfg, nil
|
|
}
|
|
|
|
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
|
|
if strings.TrimSpace(path) == "" {
|
|
return fmt.Errorf("empty autotune config path")
|
|
}
|
|
if cfg.Version <= 0 {
|
|
cfg.Version = benchmarkPowerAutotuneVersion
|
|
}
|
|
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
|
return err
|
|
}
|
|
data, err := json.MarshalIndent(cfg, "", " ")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
tmp := path + ".tmp"
|
|
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
|
return err
|
|
}
|
|
return os.Rename(tmp, path)
|
|
}
|
|
|
|
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
|
|
return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
|
|
}
|
|
|
|
// ResetBenchmarkPowerAutotuneConfig deletes the autotune config file at path.
//
// A file that does not exist is treated as already reset and is not an error.
// A blank path is rejected; any other removal failure is returned unchanged.
func ResetBenchmarkPowerAutotuneConfig(path string) error {
	if len(strings.TrimSpace(path)) == 0 {
		return fmt.Errorf("empty autotune config path")
	}
	switch rmErr := os.Remove(path); {
	case rmErr == nil, os.IsNotExist(rmErr):
		return nil
	default:
		return rmErr
	}
}
|
|
|
|
func normalizeBenchmarkPowerSource(source string) string {
|
|
switch strings.TrimSpace(strings.ToLower(source)) {
|
|
case BenchmarkPowerSourceSDRPSUInput:
|
|
return BenchmarkPowerSourceSDRPSUInput
|
|
default:
|
|
return BenchmarkPowerSourceDCMI
|
|
}
|
|
}
|
|
|
|
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
|
|
cfg, err := LoadSystemPowerSourceConfig(exportDir)
|
|
if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
|
|
selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
|
|
return SystemPowerSourceDecision{
|
|
Configured: true,
|
|
SelectedSource: selected,
|
|
EffectiveSource: selected,
|
|
Mode: "autotuned",
|
|
Reason: strings.TrimSpace(cfg.Reason),
|
|
ConfiguredAt: cfg.UpdatedAt,
|
|
}
|
|
}
|
|
|
|
sources := sampleBenchmarkPowerSources()
|
|
if value := sources[BenchmarkPowerSourceSDRPSUInput]; value > 0 {
|
|
return SystemPowerSourceDecision{
|
|
Configured: false,
|
|
EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
|
|
Mode: "fallback",
|
|
Reason: "autotune config not found; using temporary fallback source sdr_psu_input",
|
|
}
|
|
}
|
|
return SystemPowerSourceDecision{
|
|
Configured: false,
|
|
EffectiveSource: BenchmarkPowerSourceDCMI,
|
|
Mode: "fallback",
|
|
Reason: "autotune config not found; using temporary fallback source dcmi",
|
|
}
|
|
}
|
|
|
|
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
|
|
decision := ResolveSystemPowerDecision(exportDir)
|
|
if decision.EffectiveSource != "" {
|
|
if value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource); err == nil && value > 0 {
|
|
return value, decision, nil
|
|
} else if decision.Configured {
|
|
fallback := BenchmarkPowerSourceDCMI
|
|
if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
|
|
fallback = BenchmarkPowerSourceSDRPSUInput
|
|
}
|
|
if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
|
|
decision.Mode = "degraded"
|
|
decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
|
|
decision.EffectiveSource = fallback
|
|
return fallbackValue, decision, nil
|
|
}
|
|
decision.Mode = "degraded"
|
|
decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
|
|
return 0, decision, err
|
|
}
|
|
}
|
|
return 0, decision, fmt.Errorf("system power source unavailable")
|
|
}
|
|
|
|
func queryBenchmarkPowerSourceW(source string) (float64, error) {
|
|
switch normalizeBenchmarkPowerSource(source) {
|
|
case BenchmarkPowerSourceSDRPSUInput:
|
|
sdr := sampleIPMISDRPowerSensors()
|
|
if sdr.PSUInW > 0 {
|
|
return sdr.PSUInW, nil
|
|
}
|
|
return 0, fmt.Errorf("sdr psu input unavailable")
|
|
default:
|
|
return queryIPMIServerPowerW()
|
|
}
|
|
}
|
|
|
|
func sampleBenchmarkPowerSources() map[string]float64 {
|
|
out := map[string]float64{}
|
|
if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
|
|
out[BenchmarkPowerSourceDCMI] = w
|
|
}
|
|
if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
|
|
out[BenchmarkPowerSourceSDRPSUInput] = w
|
|
}
|
|
return out
|
|
}
|
|
|
|
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
|
|
if durationSec <= 0 {
|
|
return 0, false
|
|
}
|
|
samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
|
|
if len(samples) == 0 {
|
|
return 0, false
|
|
}
|
|
return benchmarkMean(samples), true
|
|
}
|
|
|
|
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
|
|
if durationSec <= 0 {
|
|
return nil
|
|
}
|
|
stopCh := make(chan struct{})
|
|
doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
|
|
select {
|
|
case <-ctx.Done():
|
|
case <-time.After(time.Duration(durationSec) * time.Second):
|
|
}
|
|
close(stopCh)
|
|
return <-doneCh
|
|
}
|
|
|
|
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
|
|
if intervalSec <= 0 {
|
|
intervalSec = benchmarkPowerAutotuneSampleInterval
|
|
}
|
|
ch := make(chan []float64, 1)
|
|
go func() {
|
|
defer close(ch)
|
|
var samples []float64
|
|
record := func() {
|
|
if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
|
|
samples = append(samples, w)
|
|
}
|
|
}
|
|
record()
|
|
ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-stopCh:
|
|
ch <- samples
|
|
return
|
|
case <-ticker.C:
|
|
record()
|
|
}
|
|
}
|
|
}()
|
|
return ch
|
|
}
|
|
|
|
// benchmarkPowerAutotuneSample is one telemetry row captured during an autotune
// phase.
type benchmarkPowerAutotuneSample struct {
	// ElapsedSec is the time since the phase's sampling loop started, in seconds.
	ElapsedSec float64
	// GPUAvgUsagePct is the mean utilization over the sampled GPUs; it stays 0
	// when GPU metrics could not be read.
	GPUAvgUsagePct float64
	// CPUUsagePct is the CPU load reported for this sample.
	CPUUsagePct float64
	// GPUSumPowerW is the sum of the per-GPU power readings; it stays 0 when
	// GPU metrics could not be read.
	GPUSumPowerW float64
	// Sources holds the positive server power readings keyed by source name.
	Sources map[string]float64
}
|
|
|
|
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
|
|
if durationSec <= 0 {
|
|
return nil
|
|
}
|
|
var out []benchmarkPowerAutotuneSample
|
|
deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
|
|
start := time.Now()
|
|
for {
|
|
if ctx.Err() != nil {
|
|
return out
|
|
}
|
|
row := benchmarkPowerAutotuneSample{
|
|
ElapsedSec: time.Since(start).Seconds(),
|
|
CPUUsagePct: sampleCPULoadPct(),
|
|
Sources: sampleBenchmarkPowerSources(),
|
|
}
|
|
if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
|
|
var usageSum float64
|
|
for _, gpu := range gpuRows {
|
|
row.GPUSumPowerW += gpu.PowerW
|
|
usageSum += gpu.UsagePct
|
|
}
|
|
row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
|
|
}
|
|
out = append(out, row)
|
|
logBenchmarkPowerAutotuneSample(phase, row, logFunc)
|
|
if time.Now().After(deadline) {
|
|
return out
|
|
}
|
|
select {
|
|
case <-ctx.Done():
|
|
return out
|
|
case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
|
|
}
|
|
}
|
|
}
|
|
|
|
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
|
|
if logFunc == nil {
|
|
return
|
|
}
|
|
var sourceParts []string
|
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
|
if value, ok := sample.Sources[source]; ok && value > 0 {
|
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
|
|
} else {
|
|
sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
|
|
}
|
|
}
|
|
logFunc(fmt.Sprintf(
|
|
"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
|
|
phase,
|
|
sample.ElapsedSec,
|
|
sample.GPUAvgUsagePct,
|
|
sample.GPUSumPowerW,
|
|
sample.CPUUsagePct,
|
|
strings.Join(sourceParts, " "),
|
|
))
|
|
}
|
|
|
|
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
|
|
if logFunc == nil || len(samples) == 0 {
|
|
return
|
|
}
|
|
var gpuUsage []float64
|
|
var cpuUsage []float64
|
|
var gpuPower []float64
|
|
sourceBuckets := map[string][]float64{}
|
|
for _, sample := range samples {
|
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
|
gpuPower = append(gpuPower, sample.GPUSumPowerW)
|
|
for source, value := range sample.Sources {
|
|
if value > 0 {
|
|
sourceBuckets[source] = append(sourceBuckets[source], value)
|
|
}
|
|
}
|
|
}
|
|
var sourceParts []string
|
|
for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
|
|
values := sourceBuckets[source]
|
|
if len(values) == 0 {
|
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
|
|
continue
|
|
}
|
|
sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
|
|
}
|
|
logFunc(fmt.Sprintf(
|
|
"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
|
|
phase,
|
|
len(samples),
|
|
benchmarkMean(gpuUsage),
|
|
benchmarkPercentile(gpuUsage, 95),
|
|
benchmarkMean(gpuPower),
|
|
benchmarkMean(cpuUsage),
|
|
benchmarkPercentile(cpuUsage, 95),
|
|
strings.Join(sourceParts, " "),
|
|
))
|
|
}
|
|
|
|
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
|
|
if logFunc == nil {
|
|
return
|
|
}
|
|
for _, candidate := range candidates {
|
|
if !candidate.Available {
|
|
logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
|
|
continue
|
|
}
|
|
logFunc(fmt.Sprintf(
|
|
"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
|
|
candidate.Source,
|
|
candidate.IdleAvgW,
|
|
candidate.LoadAvgW,
|
|
candidate.DeltaW,
|
|
gpuDelta,
|
|
candidate.RelativeError,
|
|
candidate.Confidence*100,
|
|
map[bool]string{true: " SELECTED", false: ""}[candidate.Source == selectedSource],
|
|
))
|
|
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
|
logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
|
|
}
|
|
}
|
|
}
|
|
|
|
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
|
|
result := &BenchmarkPowerAutotuneValidation{}
|
|
if len(samples) == 0 {
|
|
result.Reason = "no idle telemetry samples collected"
|
|
return result
|
|
}
|
|
var gpuUsage []float64
|
|
var cpuUsage []float64
|
|
for _, sample := range samples {
|
|
gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
|
|
if sample.CPUUsagePct > 0 {
|
|
cpuUsage = append(cpuUsage, sample.CPUUsagePct)
|
|
}
|
|
}
|
|
result.GPUSamples = len(gpuUsage)
|
|
result.CPUSamples = len(cpuUsage)
|
|
result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
|
|
result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
|
|
result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
|
|
result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
|
|
switch {
|
|
case result.GPUAvgUsagePct > 5:
|
|
result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
|
|
case result.GPUP95UsagePct > 10:
|
|
result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
|
|
case result.CPUAvgUsagePct > 20:
|
|
result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
|
|
case result.CPUP95UsagePct > 35:
|
|
result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
|
|
default:
|
|
result.Valid = true
|
|
}
|
|
return result
|
|
}
|
|
|
|
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
|
|
idleBySource := map[string][]float64{}
|
|
loadBySource := map[string][]float64{}
|
|
var idleGPU []float64
|
|
var loadGPU []float64
|
|
for _, sample := range idle {
|
|
idleGPU = append(idleGPU, sample.GPUSumPowerW)
|
|
for source, value := range sample.Sources {
|
|
if value > 0 {
|
|
idleBySource[source] = append(idleBySource[source], value)
|
|
}
|
|
}
|
|
}
|
|
for _, sample := range load {
|
|
loadGPU = append(loadGPU, sample.GPUSumPowerW)
|
|
for source, value := range sample.Sources {
|
|
if value > 0 {
|
|
loadBySource[source] = append(loadBySource[source], value)
|
|
}
|
|
}
|
|
}
|
|
idleGPUAvg := benchmarkMean(idleGPU)
|
|
loadGPUAvg := benchmarkMean(loadGPU)
|
|
gpuDelta := loadGPUAvg - idleGPUAvg
|
|
if gpuDelta <= 0 {
|
|
gpuDelta = loadGPUAvg
|
|
}
|
|
|
|
candidates := []BenchmarkPowerAutotuneCandidate{
|
|
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
|
|
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
|
|
}
|
|
available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
|
|
for _, candidate := range candidates {
|
|
if candidate.Available && candidate.DeltaW > 0 {
|
|
available = append(available, candidate)
|
|
}
|
|
}
|
|
if len(available) == 0 {
|
|
return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
|
|
}
|
|
sort.Slice(available, func(i, j int) bool {
|
|
if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
|
|
if available[i].Source != available[j].Source {
|
|
return available[i].Source == BenchmarkPowerSourceSDRPSUInput
|
|
}
|
|
}
|
|
if available[i].RelativeError != available[j].RelativeError {
|
|
return available[i].RelativeError < available[j].RelativeError
|
|
}
|
|
return available[i].Samples > available[j].Samples
|
|
})
|
|
selected := available[0]
|
|
for idx := range candidates {
|
|
if candidates[idx].Source == selected.Source {
|
|
candidates[idx].Selected = true
|
|
candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
|
|
}
|
|
}
|
|
return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
|
|
}
|
|
|
|
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
|
|
candidate := BenchmarkPowerAutotuneCandidate{
|
|
Source: source,
|
|
Available: len(idle) > 0 && len(load) > 0,
|
|
Samples: minInt(len(idle), len(load)),
|
|
}
|
|
if !candidate.Available {
|
|
return candidate
|
|
}
|
|
candidate.IdleAvgW = benchmarkMean(idle)
|
|
candidate.LoadAvgW = benchmarkMean(load)
|
|
candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
|
|
if gpuDelta > 0 {
|
|
candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
|
|
candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
|
|
}
|
|
return candidate
|
|
}
|
|
|
|
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
|
|
var b strings.Builder
|
|
fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
|
|
fmt.Fprintf(&b, "status=%s\n", result.Status)
|
|
fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
|
|
fmt.Fprintf(&b, "profile=%s\n", result.Profile)
|
|
fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
|
|
fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
|
|
fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
|
|
if result.SelectedSource != "" {
|
|
fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
|
|
}
|
|
if result.IdleValidation != nil {
|
|
fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
|
|
fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
|
|
fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
|
|
fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
|
|
fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
|
|
if result.IdleValidation.Reason != "" {
|
|
fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
|
|
}
|
|
}
|
|
for _, candidate := range result.Candidates {
|
|
fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
|
|
if candidate.Available {
|
|
fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
|
|
fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
|
|
fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
|
|
fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
|
|
}
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
|
|
var b strings.Builder
|
|
b.WriteString("# Bee Bench Power Source Autotune\n\n")
|
|
fmt.Fprintf(&b, "**Status:** %s \n", result.Status)
|
|
fmt.Fprintf(&b, "**Benchmark kind:** %s \n", result.BenchmarkKind)
|
|
fmt.Fprintf(&b, "**Profile:** %s \n", result.Profile)
|
|
fmt.Fprintf(&b, "**Idle window:** %ds \n", result.IdleDurationSec)
|
|
fmt.Fprintf(&b, "**Load window:** %ds \n", result.LoadDurationSec)
|
|
fmt.Fprintf(&b, "**Sample interval:** %ds \n", result.SampleIntervalSec)
|
|
if result.SelectedSource != "" {
|
|
fmt.Fprintf(&b, "**Selected source:** `%s` \n", result.SelectedSource)
|
|
}
|
|
b.WriteString("\n")
|
|
if result.IdleValidation != nil {
|
|
b.WriteString("## Idle Validation\n\n")
|
|
fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
|
|
fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
|
|
fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
|
|
fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
|
|
fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
|
|
if result.IdleValidation.Reason != "" {
|
|
fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
|
|
}
|
|
b.WriteString("\n")
|
|
}
|
|
if len(result.Candidates) > 0 {
|
|
b.WriteString("## Candidates\n\n")
|
|
b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
|
|
b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
|
|
for _, candidate := range result.Candidates {
|
|
if !candidate.Available {
|
|
fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
|
|
continue
|
|
}
|
|
selected := "no"
|
|
if candidate.Selected {
|
|
selected = "yes"
|
|
}
|
|
fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
|
|
candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
|
|
}
|
|
b.WriteString("\n")
|
|
}
|
|
for _, note := range result.Notes {
|
|
fmt.Fprintf(&b, "- %s\n", note)
|
|
}
|
|
return b.String()
|
|
}
|
|
|
|
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
|
|
allDevices := joinIndexList(gpuIndices)
|
|
switch strings.TrimSpace(strings.ToLower(kind)) {
|
|
case "power-fit", "power", "nvidia-bench-power":
|
|
cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
|
|
if err == nil {
|
|
return cmd, "power-fit"
|
|
}
|
|
return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
|
|
default:
|
|
cmd := []string{
|
|
"bee-gpu-burn",
|
|
"--seconds", fmt.Sprintf("%d", durationSec),
|
|
"--devices", allDevices,
|
|
}
|
|
if sizeMB > 0 {
|
|
cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
|
|
}
|
|
return cmd, "performance"
|
|
}
|
|
}
|
|
|
|
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
|
if ctx == nil {
|
|
ctx = context.Background()
|
|
}
|
|
if logFunc == nil {
|
|
logFunc = func(string) {}
|
|
}
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = "/var/log/bee-bench/autotune"
|
|
}
|
|
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
|
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
|
}
|
|
selected, err := resolveNvidiaGPUSelection(nil, nil)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
if len(selected) == 0 {
|
|
return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
|
|
}
|
|
ts := time.Now().UTC().Format("20060102-150405")
|
|
runDir := filepath.Join(baseDir, "autotune-"+ts)
|
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
|
return "", fmt.Errorf("mkdir %s: %w", runDir, err)
|
|
}
|
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
|
hostname, _ := os.Hostname()
|
|
loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
|
|
result := BenchmarkPowerAutotuneResult{
|
|
GeneratedAt: time.Now().UTC(),
|
|
Hostname: hostname,
|
|
ServerModel: readServerModel(),
|
|
BenchmarkKind: normalizedKind,
|
|
Profile: opts.Profile,
|
|
Status: "FAILED",
|
|
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
|
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
|
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
|
}
|
|
|
|
logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
|
|
idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
|
|
logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
|
|
result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
|
|
if result.IdleValidation == nil || !result.IdleValidation.Valid {
|
|
if result.IdleValidation != nil {
|
|
result.IdleValidationError = result.IdleValidation.Reason
|
|
logFunc(result.IdleValidation.Reason)
|
|
}
|
|
result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
|
|
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
|
return "", err
|
|
}
|
|
return runDir, fmt.Errorf("%s", result.IdleValidationError)
|
|
}
|
|
|
|
logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
|
|
loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
|
|
go func() {
|
|
loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(ctx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
|
|
}()
|
|
out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
|
|
_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
|
|
loadSamples := <-loadSamplesCh
|
|
logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
|
|
if runErr != nil {
|
|
result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
|
|
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
|
return "", err
|
|
}
|
|
return runDir, fmt.Errorf("autotune load stage: %w", runErr)
|
|
}
|
|
|
|
selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
|
|
result.Candidates = candidates
|
|
result.GPUPowerIdleW = idleGPUAvg
|
|
result.GPUPowerLoadW = loadGPUAvg
|
|
if chooseErr != nil {
|
|
result.Notes = append(result.Notes, chooseErr.Error())
|
|
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
|
return "", err
|
|
}
|
|
return runDir, chooseErr
|
|
}
|
|
gpuDelta := loadGPUAvg - idleGPUAvg
|
|
if gpuDelta <= 0 {
|
|
gpuDelta = loadGPUAvg
|
|
}
|
|
logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
|
|
result.SelectedSource = selectedSource
|
|
result.Status = "OK"
|
|
var confidence float64
|
|
selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
|
|
for _, candidate := range candidates {
|
|
if candidate.Selected {
|
|
confidence = candidate.Confidence
|
|
if strings.TrimSpace(candidate.SelectionNotes) != "" {
|
|
selectionReason = candidate.SelectionNotes
|
|
}
|
|
break
|
|
}
|
|
}
|
|
cfg := BenchmarkPowerAutotuneConfig{
|
|
Version: benchmarkPowerAutotuneVersion,
|
|
UpdatedAt: time.Now().UTC(),
|
|
SelectedSource: selectedSource,
|
|
BenchmarkKind: normalizedKind,
|
|
Profile: opts.Profile,
|
|
IdleDurationSec: benchmarkPowerAutotuneIdleSec,
|
|
LoadDurationSec: benchmarkPowerAutotuneLoadSec,
|
|
SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
|
|
Confidence: confidence,
|
|
Reason: selectionReason,
|
|
}
|
|
result.Config = &cfg
|
|
configPath := BenchmarkPowerSourceConfigPath(baseDir)
|
|
if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
|
|
result.Status = "FAILED"
|
|
result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
|
|
if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
|
|
return "", writeErr
|
|
}
|
|
return runDir, err
|
|
}
|
|
logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
|
|
result.Notes = append(result.Notes, "saved autotune config to "+configPath)
|
|
if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
|
|
return "", err
|
|
}
|
|
return runDir, nil
|
|
}
|
|
|
|
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
|
|
resultJSON, err := json.MarshalIndent(result, "", " ")
|
|
if err != nil {
|
|
return fmt.Errorf("marshal autotune result: %w", err)
|
|
}
|
|
if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
|
|
return fmt.Errorf("write autotune result.json: %w", err)
|
|
}
|
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
|
|
return fmt.Errorf("write autotune summary.txt: %w", err)
|
|
}
|
|
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
|
|
return fmt.Errorf("write autotune report.md: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
|
|
|
// Blank reference that keeps the os/exec import in use; it is the only use of
// the exec package in the visible portion of this file.
var _ = exec.ErrNotFound
|