// Source file: bee/audit/internal/platform/benchmark_power_autotune.go (736 lines, 26 KiB, Go)

package platform
import (
"context"
"encoding/json"
"fmt"
"math"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
"time"
)
// Tunables for the benchmark power-source autotune procedure.
const (
// benchmarkPowerAutotuneVersion is the version stamped into a config by
// SaveBenchmarkPowerAutotuneConfig when the caller left Version unset.
benchmarkPowerAutotuneVersion = 1
// benchmarkPowerAutotuneIdleSec is the length of the idle sampling window, in seconds.
benchmarkPowerAutotuneIdleSec = 60
// benchmarkPowerAutotuneLoadSec is the length of the full-load sampling window, in seconds.
benchmarkPowerAutotuneLoadSec = 90
// benchmarkPowerAutotuneSampleInterval is the pause between telemetry samples,
// in seconds; it is also the default interval for the single-source sampler.
benchmarkPowerAutotuneSampleInterval = 3
// defaultBenchmarkPowerSourceConfigPath is what BenchmarkPowerSourceConfigPath
// returns when it is given a blank base directory.
defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
)
// BenchmarkPowerSourceConfigPath returns the location of the power-source
// autotune config file.
//
// A baseDir that is blank after trimming whitespace yields
// defaultBenchmarkPowerSourceConfigPath. Otherwise the file name
// "power-source-autotune.json" is joined onto the parent directory of baseDir
// (not baseDir itself).
func BenchmarkPowerSourceConfigPath(baseDir string) string {
	trimmed := strings.TrimSpace(baseDir)
	if trimmed != "" {
		parent := filepath.Dir(trimmed)
		return filepath.Join(parent, "power-source-autotune.json")
	}
	return defaultBenchmarkPowerSourceConfigPath
}
// LoadBenchmarkPowerAutotuneConfig reads and JSON-decodes the autotune config
// stored at path.
//
// Read and decode errors are returned unchanged. A config whose SelectedSource
// is blank after trimming is rejected with an error, so a non-nil result always
// names a source.
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
	data, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, readErr
	}
	cfg := new(BenchmarkPowerAutotuneConfig)
	if decodeErr := json.Unmarshal(data, cfg); decodeErr != nil {
		return nil, decodeErr
	}
	if source := strings.TrimSpace(cfg.SelectedSource); source == "" {
		return nil, fmt.Errorf("autotune config missing selected_source")
	}
	return cfg, nil
}
// SaveBenchmarkPowerAutotuneConfig writes cfg as indented JSON to path.
//
// The parent directory is created if needed. The data is first written to
// "<path>.tmp" and then renamed over path, so readers never observe a
// partially written config. If the write or the rename fails, the temporary
// file is removed (best-effort) instead of being left behind.
//
// A blank path is rejected. A non-positive cfg.Version is replaced with
// benchmarkPowerAutotuneVersion in the saved copy; the caller's value is not
// modified because cfg is passed by value.
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
	if strings.TrimSpace(path) == "" {
		return fmt.Errorf("empty autotune config path")
	}
	if cfg.Version <= 0 {
		cfg.Version = benchmarkPowerAutotuneVersion
	}
	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
		return err
	}
	data, err := json.MarshalIndent(cfg, "", " ")
	if err != nil {
		return err
	}
	tmp := path + ".tmp"
	if err := os.WriteFile(tmp, data, 0644); err != nil {
		// A failed write may still have created a truncated file; the original
		// error is what matters, so the cleanup result is deliberately ignored.
		_ = os.Remove(tmp)
		return err
	}
	if err := os.Rename(tmp, path); err != nil {
		// Do not leave a stale temp file next to the config on failure.
		_ = os.Remove(tmp)
		return err
	}
	return nil
}
// LoadSystemPowerSourceConfig loads the autotune config from the path that
// BenchmarkPowerSourceConfigPath derives from exportDir.
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
	configPath := BenchmarkPowerSourceConfigPath(exportDir)
	return LoadBenchmarkPowerAutotuneConfig(configPath)
}
// ResetBenchmarkPowerAutotuneConfig deletes the autotune config at path.
//
// A blank path is an error. A file that does not exist is treated as already
// reset and reported as success; any other removal error is returned as is.
func ResetBenchmarkPowerAutotuneConfig(path string) error {
	if strings.TrimSpace(path) == "" {
		return fmt.Errorf("empty autotune config path")
	}
	err := os.Remove(path)
	if err == nil || os.IsNotExist(err) {
		return nil
	}
	return err
}
// normalizeBenchmarkPowerSource maps a free-form source name onto one of the
// two known source constants. The input is lower-cased and trimmed; only an
// exact match for BenchmarkPowerSourceSDRPSUInput selects that source, and
// everything else (including an empty string) maps to BenchmarkPowerSourceDCMI.
func normalizeBenchmarkPowerSource(source string) string {
	key := strings.ToLower(source)
	key = strings.TrimSpace(key)
	if key == BenchmarkPowerSourceSDRPSUInput {
		return BenchmarkPowerSourceSDRPSUInput
	}
	return BenchmarkPowerSourceDCMI
}
// ResolveSystemPowerDecision decides which power source should be used for
// system power readings.
//
// If a config can be loaded for exportDir and names a source, the normalized
// source is returned as an "autotuned" decision carrying the config's reason
// and timestamp. Otherwise (any load error, or a blank source) both sources are
// sampled once and a "fallback" decision is returned: the SDR PSU input when it
// produced a positive reading, DCMI in every other case.
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
	if cfg, err := LoadSystemPowerSourceConfig(exportDir); err == nil && cfg != nil {
		if strings.TrimSpace(cfg.SelectedSource) != "" {
			source := normalizeBenchmarkPowerSource(cfg.SelectedSource)
			return SystemPowerSourceDecision{
				Configured:      true,
				SelectedSource:  source,
				EffectiveSource: source,
				Mode:            "autotuned",
				Reason:          strings.TrimSpace(cfg.Reason),
				ConfiguredAt:    cfg.UpdatedAt,
			}
		}
	}

	// No usable config: start from the DCMI fallback and upgrade to the SDR PSU
	// input only when that sensor reports a positive value right now.
	fallback := SystemPowerSourceDecision{
		Configured:      false,
		EffectiveSource: BenchmarkPowerSourceDCMI,
		Mode:            "fallback",
		Reason:          "autotune config not found; using temporary fallback source dcmi",
	}
	readings := sampleBenchmarkPowerSources()
	if readings[BenchmarkPowerSourceSDRPSUInput] > 0 {
		fallback.EffectiveSource = BenchmarkPowerSourceSDRPSUInput
		fallback.Reason = "autotune config not found; using temporary fallback source sdr_psu_input"
	}
	return fallback
}
// SampleSystemPowerResolved takes one system power reading from the source
// chosen by ResolveSystemPowerDecision and returns it with that decision.
//
// When the chosen source fails or yields a non-positive value:
//   - for a decision that came from a saved config, the other source is tried
//     once; on success the decision is returned with Mode "degraded" and
//     EffectiveSource switched to the fallback;
//   - for an unconfigured (fallback) decision no second attempt is made.
//
// A zero reading is always accompanied by a non-nil error. This includes the
// case where the configured source answered without an error but with a
// non-positive value and the fallback also failed.
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
	decision := ResolveSystemPowerDecision(exportDir)
	if decision.EffectiveSource == "" {
		return 0, decision, fmt.Errorf("system power source unavailable")
	}

	value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource)
	if err == nil && value > 0 {
		return value, decision, nil
	}
	if !decision.Configured {
		return 0, decision, fmt.Errorf("system power source unavailable")
	}

	// The configured source is unusable; try whichever source it is not.
	fallback := BenchmarkPowerSourceDCMI
	if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
		fallback = BenchmarkPowerSourceSDRPSUInput
	}
	decision.Mode = "degraded"
	if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
		decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
		decision.EffectiveSource = fallback
		return fallbackValue, decision, nil
	}

	decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
	if err == nil {
		// The source replied successfully but with a non-positive reading;
		// callers must still see a failure rather than (0, nil).
		err = fmt.Errorf("configured source %s returned no power reading", decision.SelectedSource)
	}
	return 0, decision, err
}
// queryBenchmarkPowerSourceW takes a single reading from the named source.
//
// The name is normalized first. The SDR PSU input source returns the PSUInW
// field of sampleIPMISDRPowerSensors, or an error when that value is not
// positive. Every other name is served by queryIPMIServerPowerW, whose result
// is passed through unchanged.
func queryBenchmarkPowerSourceW(source string) (float64, error) {
	if normalizeBenchmarkPowerSource(source) != BenchmarkPowerSourceSDRPSUInput {
		return queryIPMIServerPowerW()
	}
	if watts := sampleIPMISDRPowerSensors().PSUInW; watts > 0 {
		return watts, nil
	}
	return 0, fmt.Errorf("sdr psu input unavailable")
}
// sampleBenchmarkPowerSources reads both known power sources once and returns
// the readings keyed by source name. A source is present in the map only when
// its query succeeded with a positive value, so the result may be empty but is
// never nil.
func sampleBenchmarkPowerSources() map[string]float64 {
	readings := make(map[string]float64)

	dcmiW, dcmiErr := queryIPMIServerPowerW()
	if dcmiErr == nil && dcmiW > 0 {
		readings[BenchmarkPowerSourceDCMI] = dcmiW
	}

	sdrW, sdrErr := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput)
	if sdrErr == nil && sdrW > 0 {
		readings[BenchmarkPowerSourceSDRPSUInput] = sdrW
	}

	return readings
}
// sampleBenchmarkPowerSourceSeries samples one source for durationSec seconds
// and returns the mean of the collected readings. The boolean is false when
// durationSec is not positive or when no reading was collected.
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
	if durationSec > 0 {
		if readings := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec); len(readings) > 0 {
			return benchmarkMean(readings), true
		}
	}
	return 0, false
}
// collectSelectedPowerSourceSamples runs the background sampler for one source
// until durationSec seconds have passed or ctx is done, whichever comes first,
// then stops it and returns whatever readings it gathered. A non-positive
// durationSec returns nil without starting the sampler.
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
	if durationSec <= 0 {
		return nil
	}
	stop := make(chan struct{})
	results := startSelectedPowerSourceSampler(stop, source, intervalSec)

	window := time.NewTimer(time.Duration(durationSec) * time.Second)
	defer window.Stop()
	select {
	case <-window.C:
	case <-ctx.Done():
	}

	// Closing stop asks the sampler to hand over its readings and exit.
	close(stop)
	return <-results
}
// startSelectedPowerSourceSampler starts a goroutine that reads the given
// source immediately and then once per interval, keeping only successful
// positive readings. When stopCh is closed (or receives), the goroutine sends
// the collected readings on the returned channel, closes it and exits.
//
// A non-positive intervalSec falls back to benchmarkPowerAutotuneSampleInterval.
// The returned channel is buffered, so the goroutine can finish even if the
// caller never receives from it.
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
	if intervalSec <= 0 {
		intervalSec = benchmarkPowerAutotuneSampleInterval
	}
	period := time.Duration(intervalSec) * time.Second
	results := make(chan []float64, 1)

	go func() {
		defer close(results)

		var readings []float64
		takeReading := func() {
			w, err := queryBenchmarkPowerSourceW(source)
			if err == nil && w > 0 {
				readings = append(readings, w)
			}
		}

		// First reading is taken right away, before the ticker starts.
		takeReading()
		tick := time.NewTicker(period)
		defer tick.Stop()

		for {
			select {
			case <-tick.C:
				takeReading()
			case <-stopCh:
				results <- readings
				return
			}
		}
	}()

	return results
}
// benchmarkPowerAutotuneSample is one telemetry row captured by
// collectBenchmarkPowerAutotuneSamples.
type benchmarkPowerAutotuneSample struct {
// ElapsedSec is the time since sampling of the phase started, in seconds.
ElapsedSec float64
// GPUAvgUsagePct is the mean UsagePct over the GPU rows obtained for this
// sample; it stays zero when no GPU rows were obtained.
GPUAvgUsagePct float64
// CPUUsagePct is the value returned by sampleCPULoadPct for this sample.
CPUUsagePct float64
// GPUSumPowerW is the sum of PowerW over the GPU rows obtained for this sample.
GPUSumPowerW float64
// Sources holds the readings returned by sampleBenchmarkPowerSources, keyed
// by power source name.
Sources map[string]float64
}
// collectBenchmarkPowerAutotuneSamples gathers telemetry rows for one autotune
// phase. Each row records the elapsed time, CPU load, the power-source readings
// and, when GPU metrics are available, the summed GPU power and mean GPU usage.
// Every row is logged through logBenchmarkPowerAutotuneSample.
//
// Sampling stops once the deadline (durationSec from the start) has passed or
// ctx is done; the rows gathered so far are returned. The deadline is checked
// after a row is taken, so at least one row is collected unless ctx is already
// done. A non-positive durationSec returns nil.
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
	if durationSec <= 0 {
		return nil
	}
	var rows []benchmarkPowerAutotuneSample
	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
	begin := time.Now()

	for ctx.Err() == nil {
		var row benchmarkPowerAutotuneSample
		row.ElapsedSec = time.Since(begin).Seconds()
		row.CPUUsagePct = sampleCPULoadPct()
		row.Sources = sampleBenchmarkPowerSources()

		gpus, err := sampleGPUMetrics(gpuIndices)
		if err == nil && len(gpus) > 0 {
			var utilTotal float64
			for _, gpu := range gpus {
				row.GPUSumPowerW += gpu.PowerW
				utilTotal += gpu.UsagePct
			}
			row.GPUAvgUsagePct = utilTotal / float64(len(gpus))
		}

		rows = append(rows, row)
		logBenchmarkPowerAutotuneSample(phase, row, logFunc)

		if time.Now().After(deadline) {
			break
		}
		// Wait one sample interval, but give up immediately on cancellation.
		select {
		case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
		case <-ctx.Done():
			return rows
		}
	}
	return rows
}
// logBenchmarkPowerAutotuneSample emits one log line describing a telemetry
// row: elapsed time, GPU usage and power, CPU load, and the reading of each
// known power source (DCMI first, then SDR PSU input). A source that is missing
// from the row or not positive is printed as "n/a". A nil logFunc makes this a
// no-op.
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
	if logFunc == nil {
		return
	}
	order := [...]string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput}
	parts := make([]string, 0, len(order))
	for _, name := range order {
		text := fmt.Sprintf("%s=n/a", name)
		if reading, found := sample.Sources[name]; found && reading > 0 {
			text = fmt.Sprintf("%s=%.0fW", name, reading)
		}
		parts = append(parts, text)
	}
	line := fmt.Sprintf(
		"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
		phase,
		sample.ElapsedSec,
		sample.GPUAvgUsagePct,
		sample.GPUSumPowerW,
		sample.CPUUsagePct,
		strings.Join(parts, " "),
	)
	logFunc(line)
}
// logBenchmarkPowerAutotunePhaseSummary emits one log line summarizing a whole
// phase: sample count, mean and p95 GPU usage, mean GPU power, mean and p95 CPU
// load, and the mean of the positive readings of each known power source (or
// "n/a" when a source has none). Nothing is logged when logFunc is nil or there
// are no samples.
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
	if len(samples) == 0 || logFunc == nil {
		return
	}
	count := len(samples)
	gpuUtil := make([]float64, 0, count)
	cpuUtil := make([]float64, 0, count)
	gpuWatts := make([]float64, 0, count)
	perSource := make(map[string][]float64)
	for i := range samples {
		row := &samples[i]
		gpuUtil = append(gpuUtil, row.GPUAvgUsagePct)
		cpuUtil = append(cpuUtil, row.CPUUsagePct)
		gpuWatts = append(gpuWatts, row.GPUSumPowerW)
		for name, watts := range row.Sources {
			if watts > 0 {
				perSource[name] = append(perSource[name], watts)
			}
		}
	}

	order := []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput}
	parts := make([]string, 0, len(order))
	for _, name := range order {
		if readings := perSource[name]; len(readings) > 0 {
			parts = append(parts, fmt.Sprintf("%s_avg=%.0fW", name, benchmarkMean(readings)))
		} else {
			parts = append(parts, fmt.Sprintf("%s_avg=n/a", name))
		}
	}

	gpuMean := benchmarkMean(gpuUtil)
	gpuP95 := benchmarkPercentile(gpuUtil, 95)
	wattsMean := benchmarkMean(gpuWatts)
	cpuMean := benchmarkMean(cpuUtil)
	cpuP95 := benchmarkPercentile(cpuUtil, 95)
	logFunc(fmt.Sprintf(
		"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
		phase,
		count,
		gpuMean,
		gpuP95,
		wattsMean,
		cpuMean,
		cpuP95,
		strings.Join(parts, " "),
	))
}
// logBenchmarkPowerAutotuneSelection logs one line per candidate source. An
// unavailable candidate gets a short "unavailable" line. An available one gets
// its idle/load averages, delta, the GPU delta it was compared against,
// relative error and confidence, with " SELECTED" appended for the chosen
// source, followed by a second line with its selection notes when those are
// not blank. A nil logFunc makes this a no-op.
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
	if logFunc == nil {
		return
	}
	for _, c := range candidates {
		if !c.Available {
			logFunc(fmt.Sprintf("autotune candidate %s unavailable", c.Source))
			continue
		}
		marker := ""
		if c.Source == selectedSource {
			marker = " SELECTED"
		}
		logFunc(fmt.Sprintf(
			"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
			c.Source,
			c.IdleAvgW,
			c.LoadAvgW,
			c.DeltaW,
			gpuDelta,
			c.RelativeError,
			c.Confidence*100,
			marker,
		))
		if notes := strings.TrimSpace(c.SelectionNotes); notes != "" {
			logFunc(fmt.Sprintf("autotune candidate %s reason: %s", c.Source, c.SelectionNotes))
		}
	}
}
// validateBenchmarkPowerAutotuneIdle checks that the idle-phase samples really
// describe an idle machine.
//
// GPU usage is taken from every sample; CPU usage only from samples where it is
// positive. Averages and p95 values are rounded to one decimal. The result is
// valid unless one of these limits is exceeded, checked in this order: average
// GPU > 5%, p95 GPU > 10%, average CPU > 20%, p95 CPU > 35%. The first
// violation is reported in Reason. An empty sample set is invalid.
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
	verdict := &BenchmarkPowerAutotuneValidation{}
	if len(samples) == 0 {
		verdict.Reason = "no idle telemetry samples collected"
		return verdict
	}

	var gpuLoad []float64
	var cpuLoad []float64
	for i := range samples {
		gpuLoad = append(gpuLoad, samples[i].GPUAvgUsagePct)
		if cpu := samples[i].CPUUsagePct; cpu > 0 {
			cpuLoad = append(cpuLoad, cpu)
		}
	}

	roundTenth := func(v float64) float64 { return math.Round(v*10) / 10 }
	verdict.GPUSamples = len(gpuLoad)
	verdict.CPUSamples = len(cpuLoad)
	verdict.GPUAvgUsagePct = roundTenth(benchmarkMean(gpuLoad))
	verdict.GPUP95UsagePct = roundTenth(benchmarkPercentile(gpuLoad, 95))
	verdict.CPUAvgUsagePct = roundTenth(benchmarkMean(cpuLoad))
	verdict.CPUP95UsagePct = roundTenth(benchmarkPercentile(cpuLoad, 95))

	if verdict.GPUAvgUsagePct > 5 {
		verdict.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", verdict.GPUAvgUsagePct)
	} else if verdict.GPUP95UsagePct > 10 {
		verdict.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", verdict.GPUP95UsagePct)
	} else if verdict.CPUAvgUsagePct > 20 {
		verdict.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", verdict.CPUAvgUsagePct)
	} else if verdict.CPUP95UsagePct > 35 {
		verdict.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", verdict.CPUP95UsagePct)
	} else {
		verdict.Valid = true
	}
	return verdict
}
// chooseBenchmarkPowerAutotuneSource picks the power source whose idle-to-load
// delta best matches the GPU-reported power delta.
//
// It returns the selected source name, the candidates for both sources (DCMI
// first, then SDR PSU input; the chosen one has Selected and SelectionNotes
// set), the mean GPU power over the idle samples, the mean GPU power over the
// load samples, and an error when no candidate is usable.
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
idleBySource := map[string][]float64{}
loadBySource := map[string][]float64{}
var idleGPU []float64
var loadGPU []float64
// Bucket positive source readings per source and collect GPU power for each phase.
for _, sample := range idle {
idleGPU = append(idleGPU, sample.GPUSumPowerW)
for source, value := range sample.Sources {
if value > 0 {
idleBySource[source] = append(idleBySource[source], value)
}
}
}
for _, sample := range load {
loadGPU = append(loadGPU, sample.GPUSumPowerW)
for source, value := range sample.Sources {
if value > 0 {
loadBySource[source] = append(loadBySource[source], value)
}
}
}
idleGPUAvg := benchmarkMean(idleGPU)
loadGPUAvg := benchmarkMean(loadGPU)
gpuDelta := loadGPUAvg - idleGPUAvg
// When load is not above idle, the load average itself is used as the reference.
if gpuDelta <= 0 {
gpuDelta = loadGPUAvg
}
candidates := []BenchmarkPowerAutotuneCandidate{
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
}
// Only candidates with readings in both phases and a positive delta can be selected.
available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
for _, candidate := range candidates {
if candidate.Available && candidate.DeltaW > 0 {
available = append(available, candidate)
}
}
if len(available) == 0 {
return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
}
// Ordering: when two different sources have relative errors within 0.10 of
// each other, the SDR PSU input is placed first; otherwise the lower relative
// error wins, and equal errors are broken by the larger sample count.
// NOTE(review): this comparator is not a strict weak ordering in general; it
// appears to be safe only because at most two candidates are built above.
sort.Slice(available, func(i, j int) bool {
if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
if available[i].Source != available[j].Source {
return available[i].Source == BenchmarkPowerSourceSDRPSUInput
}
}
if available[i].RelativeError != available[j].RelativeError {
return available[i].RelativeError < available[j].RelativeError
}
return available[i].Samples > available[j].Samples
})
selected := available[0]
// NOTE(review): the note below always says "closest to GPU delta", even when
// the SDR PSU input won through the 0.10 tolerance rule while DCMI had the
// smaller relative error — confirm whether the wording should reflect that.
for idx := range candidates {
if candidates[idx].Source == selected.Source {
candidates[idx].Selected = true
candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
}
}
return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
}
// buildBenchmarkPowerAutotuneCandidate summarizes one source's idle and load
// readings. The candidate is available only when both slices are non-empty;
// Samples is the smaller of the two lengths. For an available candidate the
// idle/load means and their difference are filled in, and when gpuDelta is
// positive the relative error against gpuDelta and a confidence of
// max(0, 1 - relative error) are computed as well.
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
	var c BenchmarkPowerAutotuneCandidate
	c.Source = source
	c.Available = len(idle) > 0 && len(load) > 0
	c.Samples = minInt(len(idle), len(load))
	if c.Available {
		c.IdleAvgW = benchmarkMean(idle)
		c.LoadAvgW = benchmarkMean(load)
		c.DeltaW = c.LoadAvgW - c.IdleAvgW
		if gpuDelta > 0 {
			c.RelativeError = math.Abs(c.DeltaW-gpuDelta) / gpuDelta
			c.Confidence = math.Max(0, 1-c.RelativeError)
		}
	}
	return c
}
// renderBenchmarkPowerAutotuneSummary renders the result as key=value lines,
// one per line: run metadata, the selected source when set, the idle
// validation figures when present, and per-candidate availability and power
// figures (figures only for available candidates).
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
	var out strings.Builder
	line := func(format string, args ...interface{}) {
		fmt.Fprintf(&out, format, args...)
	}

	line("generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
	line("status=%s\n", result.Status)
	line("benchmark_kind=%s\n", result.BenchmarkKind)
	line("profile=%s\n", result.Profile)
	line("idle_duration_sec=%d\n", result.IdleDurationSec)
	line("load_duration_sec=%d\n", result.LoadDurationSec)
	line("sample_interval_sec=%d\n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		line("selected_source=%s\n", result.SelectedSource)
	}

	if idle := result.IdleValidation; idle != nil {
		line("idle_valid=%t\n", idle.Valid)
		line("idle_gpu_avg_usage_pct=%.1f\n", idle.GPUAvgUsagePct)
		line("idle_gpu_p95_usage_pct=%.1f\n", idle.GPUP95UsagePct)
		line("idle_cpu_avg_usage_pct=%.1f\n", idle.CPUAvgUsagePct)
		line("idle_cpu_p95_usage_pct=%.1f\n", idle.CPUP95UsagePct)
		if idle.Reason != "" {
			line("idle_validation_error=%s\n", idle.Reason)
		}
	}

	for _, c := range result.Candidates {
		line("candidate_%s_available=%t\n", c.Source, c.Available)
		if !c.Available {
			continue
		}
		line("candidate_%s_idle_avg_w=%.0f\n", c.Source, c.IdleAvgW)
		line("candidate_%s_load_avg_w=%.0f\n", c.Source, c.LoadAvgW)
		line("candidate_%s_delta_w=%.0f\n", c.Source, c.DeltaW)
		line("candidate_%s_relative_error=%.3f\n", c.Source, c.RelativeError)
	}
	return out.String()
}
// renderBenchmarkPowerAutotuneReport renders the result as a Markdown report:
// a header with run metadata, an "Idle Validation" section when validation
// data is present, a "Candidates" table when there are candidates (unavailable
// ones are shown with dashes), and finally the notes as a bullet list.
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
	var out strings.Builder
	printf := func(format string, args ...interface{}) {
		fmt.Fprintf(&out, format, args...)
	}

	out.WriteString("# Bee Bench Power Source Autotune\n\n")
	printf("**Status:** %s \n", result.Status)
	printf("**Benchmark kind:** %s \n", result.BenchmarkKind)
	printf("**Profile:** %s \n", result.Profile)
	printf("**Idle window:** %ds \n", result.IdleDurationSec)
	printf("**Load window:** %ds \n", result.LoadDurationSec)
	printf("**Sample interval:** %ds \n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		printf("**Selected source:** `%s` \n", result.SelectedSource)
	}
	out.WriteString("\n")

	if idle := result.IdleValidation; idle != nil {
		out.WriteString("## Idle Validation\n\n")
		printf("- valid: %t\n", idle.Valid)
		printf("- GPU avg usage: %.1f%%\n", idle.GPUAvgUsagePct)
		printf("- GPU p95 usage: %.1f%%\n", idle.GPUP95UsagePct)
		printf("- CPU avg usage: %.1f%%\n", idle.CPUAvgUsagePct)
		printf("- CPU p95 usage: %.1f%%\n", idle.CPUP95UsagePct)
		if idle.Reason != "" {
			printf("- reason: %s\n", idle.Reason)
		}
		out.WriteString("\n")
	}

	if len(result.Candidates) > 0 {
		out.WriteString("## Candidates\n\n")
		out.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
		out.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
		for _, c := range result.Candidates {
			if !c.Available {
				printf("| %s | — | — | — | — | no |\n", c.Source)
				continue
			}
			mark := "yes"
			if !c.Selected {
				mark = "no"
			}
			printf("| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
				c.Source, c.IdleAvgW, c.LoadAvgW, c.DeltaW, c.RelativeError, mark)
		}
		out.WriteString("\n")
	}

	for _, note := range result.Notes {
		printf("- %s\n", note)
	}
	return out.String()
}
// benchmarkAutotuneLoadCommand builds the command line used to load the GPUs
// during autotune and returns it with the normalized benchmark kind.
//
// The kinds "power-fit", "power" and "nvidia-bench-power" (case-insensitive,
// surrounding whitespace ignored) yield kind "power-fit": the command from
// resolveBenchmarkPowerLoadCommand when that succeeds, otherwise the DCGM
// "targeted_power" diagnostic command. Any other kind yields kind
// "performance" and a bee-gpu-burn command for all given devices, with
// --size-mb appended only when sizeMB is positive.
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
	allDevices := joinIndexList(gpuIndices)

	normalized := strings.TrimSpace(strings.ToLower(kind))
	wantsPower := normalized == "power-fit" || normalized == "power" || normalized == "nvidia-bench-power"
	if wantsPower {
		if cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices); err == nil {
			return cmd, "power-fit"
		}
		return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
	}

	burn := []string{
		"bee-gpu-burn",
		"--seconds", fmt.Sprintf("%d", durationSec),
		"--devices", allDevices,
	}
	if sizeMB > 0 {
		burn = append(burn, "--size-mb", fmt.Sprintf("%d", sizeMB))
	}
	return burn, "performance"
}
// RunNvidiaPowerSourceAutotune samples the server power sources during an idle
// window and a full-load window, selects the source whose idle-to-load delta
// best matches the GPU-reported delta, and saves that choice as the autotune
// config.
//
// The run proceeds in these steps:
//  1. Create a timestamped run directory under baseDir (default
//     /var/log/bee-bench/autotune when baseDir is blank).
//  2. Collect idle samples and validate them; a failed validation stops the
//     run before any load is applied.
//  3. Run the load command while sampling in the background. Sampling is
//     stopped as soon as the load command returns, so an early load failure is
//     reported promptly and post-load readings do not dilute the load average.
//  4. Choose the source and save the config to
//     BenchmarkPowerSourceConfigPath(baseDir).
//  5. Write result.json, summary.txt and report.md into the run directory.
//
// The returned string is the run directory. It is returned together with the
// error when a stage fails after the artifacts were written, and is empty when
// setup fails or the artifacts themselves cannot be written. A nil ctx is
// replaced by context.Background and a nil logFunc by a no-op.
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/autotune"
	}
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
	}
	selected, err := resolveNvidiaGPUSelection(nil, nil)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "autotune-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	hostname, _ := os.Hostname() // best-effort: an empty hostname is acceptable in the report
	loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
	// Status starts as FAILED and is flipped to OK only once a source is chosen.
	result := BenchmarkPowerAutotuneResult{
		GeneratedAt:       time.Now().UTC(),
		Hostname:          hostname,
		ServerModel:       readServerModel(),
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		Status:            "FAILED",
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
	}

	// Stage 1: idle window and validation.
	logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
	idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
	logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
	result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
	if result.IdleValidation == nil || !result.IdleValidation.Valid {
		if result.IdleValidation != nil {
			result.IdleValidationError = result.IdleValidation.Reason
			logFunc(result.IdleValidation.Reason)
		}
		result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("%s", result.IdleValidationError)
	}

	// Stage 2: full load with background sampling.
	logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
	// The sampler gets its own cancelable context so it can be stopped the
	// moment the load command exits instead of always running the full window.
	sampleCtx, stopSampling := context.WithCancel(ctx)
	defer stopSampling()
	loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
	go func() {
		loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(sampleCtx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
	}()
	out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
	stopSampling()
	// Best-effort copy of the load output; a failure here must not fail the run.
	_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
	loadSamples := <-loadSamplesCh
	logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
	if runErr != nil {
		result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("autotune load stage: %w", runErr)
	}

	// Stage 3: pick the source.
	selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
	result.Candidates = candidates
	result.GPUPowerIdleW = idleGPUAvg
	result.GPUPowerLoadW = loadGPUAvg
	if chooseErr != nil {
		result.Notes = append(result.Notes, chooseErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, chooseErr
	}
	// Mirrors the reference delta used inside chooseBenchmarkPowerAutotuneSource
	// so the logged gpu_delta matches the one the candidates were scored against.
	gpuDelta := loadGPUAvg - idleGPUAvg
	if gpuDelta <= 0 {
		gpuDelta = loadGPUAvg
	}
	logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
	result.SelectedSource = selectedSource
	result.Status = "OK"
	var confidence float64
	selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
	for _, candidate := range candidates {
		if candidate.Selected {
			confidence = candidate.Confidence
			if strings.TrimSpace(candidate.SelectionNotes) != "" {
				selectionReason = candidate.SelectionNotes
			}
			break
		}
	}

	// Stage 4: persist the decision.
	cfg := BenchmarkPowerAutotuneConfig{
		Version:           benchmarkPowerAutotuneVersion,
		UpdatedAt:         time.Now().UTC(),
		SelectedSource:    selectedSource,
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
		Confidence:        confidence,
		Reason:            selectionReason,
	}
	result.Config = &cfg
	configPath := BenchmarkPowerSourceConfigPath(baseDir)
	if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
		result.Status = "FAILED"
		result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
		if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
			return "", writeErr
		}
		return runDir, err
	}
	logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
	result.Notes = append(result.Notes, "saved autotune config to "+configPath)
	if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
		return "", err
	}
	return runDir, nil
}
// writeBenchmarkPowerAutotuneArtifacts writes the three run artifacts into
// runDir, in this order: result.json (indented JSON of result), summary.txt
// (key=value summary) and report.md (Markdown report). It stops at the first
// failure and returns an error naming the step that failed.
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
	encoded, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return fmt.Errorf("marshal autotune result: %w", err)
	}

	put := func(name string, content []byte) error {
		return os.WriteFile(filepath.Join(runDir, name), content, 0644)
	}

	if err := put("result.json", encoded); err != nil {
		return fmt.Errorf("write autotune result.json: %w", err)
	}
	summary := renderBenchmarkPowerAutotuneSummary(result)
	if err := put("summary.txt", []byte(summary)); err != nil {
		return fmt.Errorf("write autotune summary.txt: %w", err)
	}
	report := renderBenchmarkPowerAutotuneReport(result)
	if err := put("report.md", []byte(report)); err != nil {
		return fmt.Errorf("write autotune report.md: %w", err)
	}
	return nil
}
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// Blank reference to os/exec: nothing else in the visible part of this file
// uses the package, so this keeps the import from being reported as unused.
var _ = exec.ErrNotFound