package platform

import (
	"context"
	"encoding/json"
	"fmt"
	"math"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strings"
	"time"
)

const (
	// benchmarkPowerAutotuneVersion is written into saved configs that carry
	// no positive version of their own.
	benchmarkPowerAutotuneVersion = 1
	// benchmarkPowerAutotuneIdleSec is the length of the idle sampling window.
	benchmarkPowerAutotuneIdleSec = 60
	// benchmarkPowerAutotuneLoadSec is the length of the full-load window.
	benchmarkPowerAutotuneLoadSec = 90
	// benchmarkPowerAutotuneSampleInterval is the pause between samples, in seconds.
	benchmarkPowerAutotuneSampleInterval = 3
	// defaultBenchmarkPowerSourceConfigPath is used when no base directory is given.
	defaultBenchmarkPowerSourceConfigPath = "/appdata/bee/export/bee-bench/power-source-autotune.json"
)

// BenchmarkPowerSourceConfigPath returns the location of the autotune config
// file. The file lives in the parent directory of baseDir; a blank baseDir
// yields the built-in default path.
func BenchmarkPowerSourceConfigPath(baseDir string) string {
	baseDir = strings.TrimSpace(baseDir)
	if baseDir == "" {
		return defaultBenchmarkPowerSourceConfigPath
	}
	return filepath.Join(filepath.Dir(baseDir), "power-source-autotune.json")
}

// LoadBenchmarkPowerAutotuneConfig reads and decodes the autotune config at
// path. It fails when the file cannot be read, is not valid JSON, or has a
// blank selected source.
//
// Read and decode errors are returned unwrapped on purpose so that callers
// can keep using helpers such as os.IsNotExist on the result.
func LoadBenchmarkPowerAutotuneConfig(path string) (*BenchmarkPowerAutotuneConfig, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	var cfg BenchmarkPowerAutotuneConfig
	if err := json.Unmarshal(raw, &cfg); err != nil {
		return nil, err
	}
	if strings.TrimSpace(cfg.SelectedSource) == "" {
		return nil, fmt.Errorf("autotune config missing selected_source")
	}
	return &cfg, nil
}

// SaveBenchmarkPowerAutotuneConfig writes cfg as indented JSON to path,
// creating the parent directory if needed. The data is first written to
// "<path>.tmp" and then renamed over path, so readers never observe a
// half-written config. A non-positive cfg.Version is replaced with the
// current version before encoding.
func SaveBenchmarkPowerAutotuneConfig(path string, cfg BenchmarkPowerAutotuneConfig) error {
	if strings.TrimSpace(path) == "" {
		return fmt.Errorf("empty autotune config path")
	}
	if cfg.Version <= 0 {
		cfg.Version = benchmarkPowerAutotuneVersion
	}
	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
		return err
	}
	data, err := json.MarshalIndent(cfg, "", " ")
	if err != nil {
		return err
	}
	tmp := path + ".tmp"
	if err := os.WriteFile(tmp, data, 0644); err != nil {
		// Best-effort cleanup: do not leave a partial temp file behind.
		_ = os.Remove(tmp)
		return err
	}
	if err := os.Rename(tmp, path); err != nil {
		// Best-effort cleanup: the rename failed, so the temp file is stale.
		_ = os.Remove(tmp)
		return err
	}
	return nil
}

// LoadSystemPowerSourceConfig loads the autotune config from the path that
// BenchmarkPowerSourceConfigPath derives from exportDir.
func LoadSystemPowerSourceConfig(exportDir string) (*BenchmarkPowerAutotuneConfig, error) {
	return LoadBenchmarkPowerAutotuneConfig(BenchmarkPowerSourceConfigPath(exportDir))
}

// ResetBenchmarkPowerAutotuneConfig deletes the autotune config at path.
// A file that does not exist is not treated as an error.
func ResetBenchmarkPowerAutotuneConfig(path string) error {
	if strings.TrimSpace(path) == "" {
		return fmt.Errorf("empty autotune config path")
	}
	if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
		return err
	}
	return nil
}

// normalizeBenchmarkPowerSource maps a free-form source name onto one of the
// two known sources. Matching ignores case and surrounding whitespace;
// anything that is not the SDR PSU input source becomes DCMI.
func normalizeBenchmarkPowerSource(source string) string {
	switch strings.TrimSpace(strings.ToLower(source)) {
	case BenchmarkPowerSourceSDRPSUInput:
		return BenchmarkPowerSourceSDRPSUInput
	default:
		return BenchmarkPowerSourceDCMI
	}
}

// ResolveSystemPowerDecision decides which power source should be used.
//
// If a config with a non-blank selected source can be loaded, that source
// (normalized) is returned with Mode "autotuned". Otherwise a temporary
// fallback is chosen: the SDR PSU input source when it currently reports a
// positive value, DCMI if not.
func ResolveSystemPowerDecision(exportDir string) SystemPowerSourceDecision {
	cfg, err := LoadSystemPowerSourceConfig(exportDir)
	if err == nil && cfg != nil && strings.TrimSpace(cfg.SelectedSource) != "" {
		selected := normalizeBenchmarkPowerSource(cfg.SelectedSource)
		return SystemPowerSourceDecision{
			Configured:      true,
			SelectedSource:  selected,
			EffectiveSource: selected,
			Mode:            "autotuned",
			Reason:          strings.TrimSpace(cfg.Reason),
			ConfiguredAt:    cfg.UpdatedAt,
		}
	}
	// Only the SDR reading influences the fallback choice, so probe that
	// source alone instead of sampling every source and discarding DCMI.
	if value, probeErr := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); probeErr == nil && value > 0 {
		return SystemPowerSourceDecision{
			Configured:      false,
			EffectiveSource: BenchmarkPowerSourceSDRPSUInput,
			Mode:            "fallback",
			Reason:          "autotune config not found; using temporary fallback source sdr_psu_input",
		}
	}
	return SystemPowerSourceDecision{
		Configured:      false,
		EffectiveSource: BenchmarkPowerSourceDCMI,
		Mode:            "fallback",
		Reason:          "autotune config not found; using temporary fallback source dcmi",
	}
}

// SampleSystemPowerResolved resolves the power source for exportDir and
// returns one reading from it together with the decision that was applied.
//
// When a configured source yields no positive reading, the other known source
// is tried and, if it answers, the decision is marked "degraded" with the
// fallback as effective source. A non-nil error is returned whenever no
// positive reading could be obtained.
func SampleSystemPowerResolved(exportDir string) (float64, SystemPowerSourceDecision, error) {
	decision := ResolveSystemPowerDecision(exportDir)
	if decision.EffectiveSource == "" {
		return 0, decision, fmt.Errorf("system power source unavailable")
	}

	value, err := queryBenchmarkPowerSourceW(decision.EffectiveSource)
	if err == nil && value > 0 {
		return value, decision, nil
	}
	if !decision.Configured {
		// The unconfigured decision is already a fallback; nothing else to try.
		return 0, decision, fmt.Errorf("system power source unavailable")
	}

	fallback := BenchmarkPowerSourceDCMI
	if decision.EffectiveSource == BenchmarkPowerSourceDCMI {
		fallback = BenchmarkPowerSourceSDRPSUInput
	}
	if fallbackValue, fallbackErr := queryBenchmarkPowerSourceW(fallback); fallbackErr == nil && fallbackValue > 0 {
		decision.Mode = "degraded"
		decision.Reason = fmt.Sprintf("configured source %s unavailable; using degraded fallback %s", decision.SelectedSource, fallback)
		decision.EffectiveSource = fallback
		return fallbackValue, decision, nil
	}

	decision.Mode = "degraded"
	decision.Reason = fmt.Sprintf("configured source %s unavailable and no fallback source responded", decision.SelectedSource)
	if err == nil {
		// The query succeeded but returned a non-positive value; never report
		// a zero reading as success.
		err = fmt.Errorf("configured source %s returned no power reading", decision.SelectedSource)
	}
	return 0, decision, err
}

// queryBenchmarkPowerSourceW returns one reading for the given source after
// normalizing its name. The SDR PSU input source fails unless the sensor
// value is positive; every other name is served by the DCMI query.
func queryBenchmarkPowerSourceW(source string) (float64, error) {
	switch normalizeBenchmarkPowerSource(source) {
	case BenchmarkPowerSourceSDRPSUInput:
		sdr := sampleIPMISDRPowerSensors()
		if sdr.PSUInW > 0 {
			return sdr.PSUInW, nil
		}
		return 0, fmt.Errorf("sdr psu input unavailable")
	default:
		return queryIPMIServerPowerW()
	}
}

// sampleBenchmarkPowerSources takes one reading from each known source and
// returns the positive ones keyed by source name. Sources that fail or report
// a non-positive value are absent from the map.
func sampleBenchmarkPowerSources() map[string]float64 {
	out := map[string]float64{}
	if w, err := queryIPMIServerPowerW(); err == nil && w > 0 {
		out[BenchmarkPowerSourceDCMI] = w
	}
	if w, err := queryBenchmarkPowerSourceW(BenchmarkPowerSourceSDRPSUInput); err == nil && w > 0 {
		out[BenchmarkPowerSourceSDRPSUInput] = w
	}
	return out
}

// sampleBenchmarkPowerSourceSeries samples source for durationSec seconds and
// returns the mean of the collected readings. The boolean is false when the
// duration is not positive or no reading was collected.
func sampleBenchmarkPowerSourceSeries(ctx context.Context, source string, durationSec, intervalSec int) (float64, bool) {
	if durationSec <= 0 {
		return 0, false
	}
	samples := collectSelectedPowerSourceSamples(ctx, source, durationSec, intervalSec)
	if len(samples) == 0 {
		return 0, false
	}
	return benchmarkMean(samples), true
}

// collectSelectedPowerSourceSamples runs a background sampler for source until
// durationSec elapses or ctx is done, then returns the readings gathered.
func collectSelectedPowerSourceSamples(ctx context.Context, source string, durationSec, intervalSec int) []float64 {
	if durationSec <= 0 {
		return nil
	}
	stopCh := make(chan struct{})
	doneCh := startSelectedPowerSourceSampler(stopCh, source, intervalSec)
	select {
	case <-ctx.Done():
	case <-time.After(time.Duration(durationSec) * time.Second):
	}
	close(stopCh)
	return <-doneCh
}

// startSelectedPowerSourceSampler starts a goroutine that reads source once
// immediately and then every intervalSec seconds (the autotune default when
// intervalSec is not positive). When stopCh is closed the goroutine sends the
// collected positive readings on the returned channel and closes it.
func startSelectedPowerSourceSampler(stopCh <-chan struct{}, source string, intervalSec int) <-chan []float64 {
	if intervalSec <= 0 {
		intervalSec = benchmarkPowerAutotuneSampleInterval
	}
	// Buffered so the goroutine can finish even if nobody receives.
	ch := make(chan []float64, 1)
	go func() {
		defer close(ch)
		var samples []float64
		record := func() {
			if w, err := queryBenchmarkPowerSourceW(source); err == nil && w > 0 {
				samples = append(samples, w)
			}
		}
		record()
		ticker := time.NewTicker(time.Duration(intervalSec) * time.Second)
		defer ticker.Stop()
		for {
			select {
			case <-stopCh:
				ch <- samples
				return
			case <-ticker.C:
				record()
			}
		}
	}()
	return ch
}

// benchmarkPowerAutotuneSample is one telemetry row captured during an
// autotune phase.
type benchmarkPowerAutotuneSample struct {
	ElapsedSec     float64            // seconds since the phase started
	GPUAvgUsagePct float64            // mean of UsagePct over the sampled GPUs
	CPUUsagePct    float64            // value returned by sampleCPULoadPct
	GPUSumPowerW   float64            // sum of PowerW over the sampled GPUs
	Sources        map[string]float64 // positive readings keyed by source name
}

// collectBenchmarkPowerAutotuneSamples gathers telemetry rows for the given
// phase until durationSec has elapsed or ctx is done, pausing the autotune
// sample interval between rows. Each row is passed to
// logBenchmarkPowerAutotuneSample. GPU fields stay zero for a row whose GPU
// metrics could not be read.
func collectBenchmarkPowerAutotuneSamples(ctx context.Context, phase string, gpuIndices []int, durationSec int, logFunc func(string)) []benchmarkPowerAutotuneSample {
	if durationSec <= 0 {
		return nil
	}
	var out []benchmarkPowerAutotuneSample
	deadline := time.Now().Add(time.Duration(durationSec) * time.Second)
	start := time.Now()
	for {
		if ctx.Err() != nil {
			return out
		}
		row := benchmarkPowerAutotuneSample{
			ElapsedSec:  time.Since(start).Seconds(),
			CPUUsagePct: sampleCPULoadPct(),
			Sources:     sampleBenchmarkPowerSources(),
		}
		if gpuRows, err := sampleGPUMetrics(gpuIndices); err == nil && len(gpuRows) > 0 {
			var usageSum float64
			for _, gpu := range gpuRows {
				row.GPUSumPowerW += gpu.PowerW
				usageSum += gpu.UsagePct
			}
			row.GPUAvgUsagePct = usageSum / float64(len(gpuRows))
		}
		out = append(out, row)
		logBenchmarkPowerAutotuneSample(phase, row, logFunc)
		if time.Now().After(deadline) {
			return out
		}
		select {
		case <-ctx.Done():
			return out
		case <-time.After(benchmarkPowerAutotuneSampleInterval * time.Second):
		}
	}
}

// logBenchmarkPowerAutotuneSample emits a one-line description of sample via
// logFunc. Sources without a positive reading are printed as "n/a". A nil
// logFunc disables logging.
func logBenchmarkPowerAutotuneSample(phase string, sample benchmarkPowerAutotuneSample, logFunc func(string)) {
	if logFunc == nil {
		return
	}
	var sourceParts []string
	for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
		if value, ok := sample.Sources[source]; ok && value > 0 {
			sourceParts = append(sourceParts, fmt.Sprintf("%s=%.0fW", source, value))
		} else {
			sourceParts = append(sourceParts, fmt.Sprintf("%s=n/a", source))
		}
	}
	logFunc(fmt.Sprintf(
		"autotune %s sample t=%.0fs gpu_avg_util=%.1f%% gpu_sum_power=%.0fW cpu_load=%.1f%% %s",
		phase,
		sample.ElapsedSec,
		sample.GPUAvgUsagePct,
		sample.GPUSumPowerW,
		sample.CPUUsagePct,
		strings.Join(sourceParts, " "),
	))
}

// logBenchmarkPowerAutotunePhaseSummary emits one line with the averages and
// 95th percentiles for a phase. Nothing is logged when logFunc is nil or
// samples is empty.
func logBenchmarkPowerAutotunePhaseSummary(phase string, samples []benchmarkPowerAutotuneSample, logFunc func(string)) {
	if logFunc == nil || len(samples) == 0 {
		return
	}
	var gpuUsage []float64
	var cpuUsage []float64
	var gpuPower []float64
	sourceBuckets := map[string][]float64{}
	for _, sample := range samples {
		gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
		cpuUsage = append(cpuUsage, sample.CPUUsagePct)
		gpuPower = append(gpuPower, sample.GPUSumPowerW)
		for source, value := range sample.Sources {
			if value > 0 {
				sourceBuckets[source] = append(sourceBuckets[source], value)
			}
		}
	}
	var sourceParts []string
	for _, source := range []string{BenchmarkPowerSourceDCMI, BenchmarkPowerSourceSDRPSUInput} {
		values := sourceBuckets[source]
		if len(values) == 0 {
			sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=n/a", source))
			continue
		}
		sourceParts = append(sourceParts, fmt.Sprintf("%s_avg=%.0fW", source, benchmarkMean(values)))
	}
	logFunc(fmt.Sprintf(
		"autotune %s summary samples=%d gpu_avg_util=%.1f%% gpu_p95_util=%.1f%% gpu_avg_power=%.0fW cpu_avg=%.1f%% cpu_p95=%.1f%% %s",
		phase,
		len(samples),
		benchmarkMean(gpuUsage),
		benchmarkPercentile(gpuUsage, 95),
		benchmarkMean(gpuPower),
		benchmarkMean(cpuUsage),
		benchmarkPercentile(cpuUsage, 95),
		strings.Join(sourceParts, " "),
	))
}

// logBenchmarkPowerAutotuneSelection emits one line per candidate, marking
// the one whose source equals selectedSource, followed by its selection notes
// when present. A nil logFunc disables logging.
func logBenchmarkPowerAutotuneSelection(candidates []BenchmarkPowerAutotuneCandidate, selectedSource string, gpuDelta float64, logFunc func(string)) {
	if logFunc == nil {
		return
	}
	for _, candidate := range candidates {
		if !candidate.Available {
			logFunc(fmt.Sprintf("autotune candidate %s unavailable", candidate.Source))
			continue
		}
		marker := ""
		if candidate.Source == selectedSource {
			marker = " SELECTED"
		}
		logFunc(fmt.Sprintf(
			"autotune candidate %s idle_avg=%.0fW load_avg=%.0fW delta=%.0fW gpu_delta=%.0fW relative_error=%.3f confidence=%.0f%%%s",
			candidate.Source,
			candidate.IdleAvgW,
			candidate.LoadAvgW,
			candidate.DeltaW,
			gpuDelta,
			candidate.RelativeError,
			candidate.Confidence*100,
			marker,
		))
		if strings.TrimSpace(candidate.SelectionNotes) != "" {
			logFunc(fmt.Sprintf("autotune candidate %s reason: %s", candidate.Source, candidate.SelectionNotes))
		}
	}
}

// validateBenchmarkPowerAutotuneIdle checks that the machine was idle while
// samples were taken. The result is valid when average GPU usage is at most
// 5%, p95 GPU usage at most 10%, average CPU usage at most 20% and p95 CPU
// usage at most 35%; otherwise Reason names the first limit that was exceeded.
// CPU rows with a non-positive usage value are left out of the CPU statistics.
func validateBenchmarkPowerAutotuneIdle(samples []benchmarkPowerAutotuneSample) *BenchmarkPowerAutotuneValidation {
	result := &BenchmarkPowerAutotuneValidation{}
	if len(samples) == 0 {
		result.Reason = "no idle telemetry samples collected"
		return result
	}
	var gpuUsage []float64
	var cpuUsage []float64
	for _, sample := range samples {
		gpuUsage = append(gpuUsage, sample.GPUAvgUsagePct)
		if sample.CPUUsagePct > 0 {
			cpuUsage = append(cpuUsage, sample.CPUUsagePct)
		}
	}
	result.GPUSamples = len(gpuUsage)
	result.CPUSamples = len(cpuUsage)
	// Round to one decimal so stored values match what the messages print.
	result.GPUAvgUsagePct = math.Round(benchmarkMean(gpuUsage)*10) / 10
	result.GPUP95UsagePct = math.Round(benchmarkPercentile(gpuUsage, 95)*10) / 10
	result.CPUAvgUsagePct = math.Round(benchmarkMean(cpuUsage)*10) / 10
	result.CPUP95UsagePct = math.Round(benchmarkPercentile(cpuUsage, 95)*10) / 10
	switch {
	case result.GPUAvgUsagePct > 5:
		result.Reason = fmt.Sprintf("idle validation failed: average GPU load %.1f%% exceeds 5%%", result.GPUAvgUsagePct)
	case result.GPUP95UsagePct > 10:
		result.Reason = fmt.Sprintf("idle validation failed: p95 GPU load %.1f%% exceeds 10%%", result.GPUP95UsagePct)
	case result.CPUAvgUsagePct > 20:
		result.Reason = fmt.Sprintf("idle validation failed: average CPU load %.1f%% exceeds 20%%", result.CPUAvgUsagePct)
	case result.CPUP95UsagePct > 35:
		result.Reason = fmt.Sprintf("idle validation failed: p95 CPU load %.1f%% exceeds 35%%", result.CPUP95UsagePct)
	default:
		result.Valid = true
	}
	return result
}

// chooseBenchmarkPowerAutotuneSource picks the server power source whose
// idle-to-load delta best matches the GPU-reported delta.
//
// It returns the selected source name, both candidates (the chosen one
// flagged Selected), the average GPU power during idle and load, and an error
// when no candidate has readings in both phases and a positive delta.
//
// Ordering: candidates whose relative errors differ by at most 0.10 are
// considered equivalent and the SDR PSU input source wins; otherwise the
// smaller relative error wins, then the larger sample count.
//
// NOTE(review): SelectionNotes always says the delta is "closest to GPU
// delta", including when the SDR preference chose the candidate with the
// larger relative error — confirm whether that wording is intended.
func chooseBenchmarkPowerAutotuneSource(idle, load []benchmarkPowerAutotuneSample) (string, []BenchmarkPowerAutotuneCandidate, float64, float64, error) {
	idleBySource := map[string][]float64{}
	loadBySource := map[string][]float64{}
	var idleGPU []float64
	var loadGPU []float64
	for _, sample := range idle {
		idleGPU = append(idleGPU, sample.GPUSumPowerW)
		for source, value := range sample.Sources {
			if value > 0 {
				idleBySource[source] = append(idleBySource[source], value)
			}
		}
	}
	for _, sample := range load {
		loadGPU = append(loadGPU, sample.GPUSumPowerW)
		for source, value := range sample.Sources {
			if value > 0 {
				loadBySource[source] = append(loadBySource[source], value)
			}
		}
	}
	idleGPUAvg := benchmarkMean(idleGPU)
	loadGPUAvg := benchmarkMean(loadGPU)
	gpuDelta := loadGPUAvg - idleGPUAvg
	if gpuDelta <= 0 {
		// No usable idle baseline: compare against the absolute load power.
		gpuDelta = loadGPUAvg
	}
	candidates := []BenchmarkPowerAutotuneCandidate{
		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceDCMI, idleBySource[BenchmarkPowerSourceDCMI], loadBySource[BenchmarkPowerSourceDCMI], gpuDelta),
		buildBenchmarkPowerAutotuneCandidate(BenchmarkPowerSourceSDRPSUInput, idleBySource[BenchmarkPowerSourceSDRPSUInput], loadBySource[BenchmarkPowerSourceSDRPSUInput], gpuDelta),
	}
	available := make([]BenchmarkPowerAutotuneCandidate, 0, len(candidates))
	for _, candidate := range candidates {
		if candidate.Available && candidate.DeltaW > 0 {
			available = append(available, candidate)
		}
	}
	if len(available) == 0 {
		return "", candidates, idleGPUAvg, loadGPUAvg, fmt.Errorf("no usable server power source samples collected")
	}
	sort.Slice(available, func(i, j int) bool {
		if math.Abs(available[i].RelativeError-available[j].RelativeError) <= 0.10 {
			if available[i].Source != available[j].Source {
				return available[i].Source == BenchmarkPowerSourceSDRPSUInput
			}
		}
		if available[i].RelativeError != available[j].RelativeError {
			return available[i].RelativeError < available[j].RelativeError
		}
		return available[i].Samples > available[j].Samples
	})
	selected := available[0]
	for idx := range candidates {
		if candidates[idx].Source == selected.Source {
			candidates[idx].Selected = true
			candidates[idx].SelectionNotes = fmt.Sprintf("selected because delta %.0f W is closest to GPU delta %.0f W (relative error %.3f)", selected.DeltaW, gpuDelta, selected.RelativeError)
		}
	}
	return selected.Source, candidates, idleGPUAvg, loadGPUAvg, nil
}

// buildBenchmarkPowerAutotuneCandidate summarizes one source's idle and load
// readings. A candidate is available only when both phases have readings.
// RelativeError and Confidence are computed only for a positive gpuDelta;
// Confidence is 1 minus the relative error, floored at 0.
func buildBenchmarkPowerAutotuneCandidate(source string, idle, load []float64, gpuDelta float64) BenchmarkPowerAutotuneCandidate {
	candidate := BenchmarkPowerAutotuneCandidate{
		Source:    source,
		Available: len(idle) > 0 && len(load) > 0,
		Samples:   minInt(len(idle), len(load)),
	}
	if !candidate.Available {
		return candidate
	}
	candidate.IdleAvgW = benchmarkMean(idle)
	candidate.LoadAvgW = benchmarkMean(load)
	candidate.DeltaW = candidate.LoadAvgW - candidate.IdleAvgW
	if gpuDelta > 0 {
		candidate.RelativeError = math.Abs(candidate.DeltaW-gpuDelta) / gpuDelta
		candidate.Confidence = math.Max(0, 1-candidate.RelativeError)
	}
	return candidate
}

// renderBenchmarkPowerAutotuneSummary renders result as newline-separated
// key=value pairs.
func renderBenchmarkPowerAutotuneSummary(result BenchmarkPowerAutotuneResult) string {
	var b strings.Builder
	fmt.Fprintf(&b, "generated_at=%s\n", result.GeneratedAt.UTC().Format(time.RFC3339))
	fmt.Fprintf(&b, "status=%s\n", result.Status)
	fmt.Fprintf(&b, "benchmark_kind=%s\n", result.BenchmarkKind)
	fmt.Fprintf(&b, "profile=%s\n", result.Profile)
	fmt.Fprintf(&b, "idle_duration_sec=%d\n", result.IdleDurationSec)
	fmt.Fprintf(&b, "load_duration_sec=%d\n", result.LoadDurationSec)
	fmt.Fprintf(&b, "sample_interval_sec=%d\n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		fmt.Fprintf(&b, "selected_source=%s\n", result.SelectedSource)
	}
	if result.IdleValidation != nil {
		fmt.Fprintf(&b, "idle_valid=%t\n", result.IdleValidation.Valid)
		fmt.Fprintf(&b, "idle_gpu_avg_usage_pct=%.1f\n", result.IdleValidation.GPUAvgUsagePct)
		fmt.Fprintf(&b, "idle_gpu_p95_usage_pct=%.1f\n", result.IdleValidation.GPUP95UsagePct)
		fmt.Fprintf(&b, "idle_cpu_avg_usage_pct=%.1f\n", result.IdleValidation.CPUAvgUsagePct)
		fmt.Fprintf(&b, "idle_cpu_p95_usage_pct=%.1f\n", result.IdleValidation.CPUP95UsagePct)
		if result.IdleValidation.Reason != "" {
			fmt.Fprintf(&b, "idle_validation_error=%s\n", result.IdleValidation.Reason)
		}
	}
	for _, candidate := range result.Candidates {
		fmt.Fprintf(&b, "candidate_%s_available=%t\n", candidate.Source, candidate.Available)
		if candidate.Available {
			fmt.Fprintf(&b, "candidate_%s_idle_avg_w=%.0f\n", candidate.Source, candidate.IdleAvgW)
			fmt.Fprintf(&b, "candidate_%s_load_avg_w=%.0f\n", candidate.Source, candidate.LoadAvgW)
			fmt.Fprintf(&b, "candidate_%s_delta_w=%.0f\n", candidate.Source, candidate.DeltaW)
			fmt.Fprintf(&b, "candidate_%s_relative_error=%.3f\n", candidate.Source, candidate.RelativeError)
		}
	}
	return b.String()
}

// renderBenchmarkPowerAutotuneReport renders result as a Markdown report with
// a header, the idle validation figures, a candidate table and the notes.
func renderBenchmarkPowerAutotuneReport(result BenchmarkPowerAutotuneResult) string {
	var b strings.Builder
	b.WriteString("# Bee Bench Power Source Autotune\n\n")
	// Header lines end with two spaces: that is Markdown's hard line break.
	// With a single trailing space the fields render as one paragraph.
	fmt.Fprintf(&b, "**Status:** %s  \n", result.Status)
	fmt.Fprintf(&b, "**Benchmark kind:** %s  \n", result.BenchmarkKind)
	fmt.Fprintf(&b, "**Profile:** %s  \n", result.Profile)
	fmt.Fprintf(&b, "**Idle window:** %ds  \n", result.IdleDurationSec)
	fmt.Fprintf(&b, "**Load window:** %ds  \n", result.LoadDurationSec)
	fmt.Fprintf(&b, "**Sample interval:** %ds  \n", result.SampleIntervalSec)
	if result.SelectedSource != "" {
		fmt.Fprintf(&b, "**Selected source:** `%s`  \n", result.SelectedSource)
	}
	b.WriteString("\n")
	if result.IdleValidation != nil {
		b.WriteString("## Idle Validation\n\n")
		fmt.Fprintf(&b, "- valid: %t\n", result.IdleValidation.Valid)
		fmt.Fprintf(&b, "- GPU avg usage: %.1f%%\n", result.IdleValidation.GPUAvgUsagePct)
		fmt.Fprintf(&b, "- GPU p95 usage: %.1f%%\n", result.IdleValidation.GPUP95UsagePct)
		fmt.Fprintf(&b, "- CPU avg usage: %.1f%%\n", result.IdleValidation.CPUAvgUsagePct)
		fmt.Fprintf(&b, "- CPU p95 usage: %.1f%%\n", result.IdleValidation.CPUP95UsagePct)
		if result.IdleValidation.Reason != "" {
			fmt.Fprintf(&b, "- reason: %s\n", result.IdleValidation.Reason)
		}
		b.WriteString("\n")
	}
	if len(result.Candidates) > 0 {
		b.WriteString("## Candidates\n\n")
		b.WriteString("| Source | Idle avg W | Load avg W | Delta W | Relative error | Selected |\n")
		b.WriteString("|--------|------------|------------|---------|----------------|----------|\n")
		for _, candidate := range result.Candidates {
			if !candidate.Available {
				fmt.Fprintf(&b, "| %s | — | — | — | — | no |\n", candidate.Source)
				continue
			}
			selected := "no"
			if candidate.Selected {
				selected = "yes"
			}
			fmt.Fprintf(&b, "| %s | %.0f | %.0f | %.0f | %.2f | %s |\n",
				candidate.Source, candidate.IdleAvgW, candidate.LoadAvgW, candidate.DeltaW, candidate.RelativeError, selected)
		}
		b.WriteString("\n")
	}
	for _, note := range result.Notes {
		fmt.Fprintf(&b, "- %s\n", note)
	}
	return b.String()
}

// benchmarkAutotuneLoadCommand returns the command line used to load the GPUs
// and the normalized benchmark kind.
//
// The power kinds ("power-fit", "power", "nvidia-bench-power") use the
// command from resolveBenchmarkPowerLoadCommand, falling back to the DCGM
// "targeted_power" diagnostic when that fails. Every other kind runs
// bee-gpu-burn; --size-mb is appended only for a positive sizeMB.
func benchmarkAutotuneLoadCommand(kind string, durationSec int, gpuIndices []int, sizeMB int) ([]string, string) {
	switch strings.TrimSpace(strings.ToLower(kind)) {
	case "power-fit", "power", "nvidia-bench-power":
		cmd, _, err := resolveBenchmarkPowerLoadCommand(durationSec, gpuIndices)
		if err == nil {
			return cmd, "power-fit"
		}
		return nvidiaDCGMNamedDiagCommand("targeted_power", durationSec, gpuIndices), "power-fit"
	default:
		cmd := []string{
			"bee-gpu-burn",
			"--seconds", fmt.Sprintf("%d", durationSec),
			"--devices", joinIndexList(gpuIndices),
		}
		if sizeMB > 0 {
			cmd = append(cmd, "--size-mb", fmt.Sprintf("%d", sizeMB))
		}
		return cmd, "performance"
	}
}

// RunNvidiaPowerSourceAutotune measures both server power sources while the
// GPUs are idle and then under load, selects the source that tracks the GPU
// power delta best, and saves that choice as the autotune config.
//
// A nil ctx is replaced with context.Background, a nil logFunc with a no-op,
// and a blank baseDir with "/var/log/bee-bench/autotune". Artifacts
// (result.json, summary.txt, report.md, autotune-load.log) are written to a
// timestamped run directory below baseDir.
//
// The run directory is returned whenever the artifacts were written, also
// when the run itself failed; in that case the error describes the failure.
// The directory is "" when setup or artifact writing failed.
func (s *System) RunNvidiaPowerSourceAutotune(ctx context.Context, baseDir string, opts NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if strings.TrimSpace(baseDir) == "" {
		baseDir = "/var/log/bee-bench/autotune"
	}
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
	}
	selected, err := resolveNvidiaGPUSelection(nil, nil)
	if err != nil {
		return "", err
	}
	if len(selected) == 0 {
		return "", fmt.Errorf("no NVIDIA GPUs detected for autotune")
	}
	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "autotune-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	hostname, _ := os.Hostname() // best effort: an empty hostname is acceptable
	loadCmd, normalizedKind := benchmarkAutotuneLoadCommand(benchmarkKind, benchmarkPowerAutotuneLoadSec, selected, opts.SizeMB)
	result := BenchmarkPowerAutotuneResult{
		GeneratedAt:       time.Now().UTC(),
		Hostname:          hostname,
		ServerModel:       readServerModel(),
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		Status:            "FAILED", // flipped to "OK" only after a source is chosen
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
	}

	// Idle phase.
	logFunc(fmt.Sprintf("autotune: idle validation window %ds on GPUs %s", benchmarkPowerAutotuneIdleSec, joinIndexList(selected)))
	idleSamples := collectBenchmarkPowerAutotuneSamples(ctx, "idle", selected, benchmarkPowerAutotuneIdleSec, logFunc)
	logBenchmarkPowerAutotunePhaseSummary("idle", idleSamples, logFunc)
	result.IdleValidation = validateBenchmarkPowerAutotuneIdle(idleSamples)
	if result.IdleValidation == nil || !result.IdleValidation.Valid {
		if result.IdleValidation != nil {
			result.IdleValidationError = result.IdleValidation.Reason
			logFunc(result.IdleValidation.Reason)
		}
		result.Notes = append(result.Notes, "autotune stopped before load stage because idle validation failed")
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("%s", result.IdleValidationError)
	}

	// Load phase: sample telemetry in the background while the load runs.
	logFunc(fmt.Sprintf("autotune: full-load stage using %s for %ds", normalizedKind, benchmarkPowerAutotuneLoadSec))
	// The sampler gets its own context so it can be stopped as soon as the
	// load command returns. Otherwise it would keep sampling an unloaded
	// machine for the rest of the window, delaying the failure path and
	// skewing the load averages.
	sampleCtx, stopSampling := context.WithCancel(ctx)
	defer stopSampling()
	loadSamplesCh := make(chan []benchmarkPowerAutotuneSample, 1)
	go func() {
		loadSamplesCh <- collectBenchmarkPowerAutotuneSamples(sampleCtx, "load", selected, benchmarkPowerAutotuneLoadSec, logFunc)
	}()
	out, runErr := runSATCommandCtx(ctx, verboseLog, "autotune-load.log", loadCmd, nil, logFunc)
	stopSampling()
	// Best effort: the load log is a diagnostic artifact only.
	_ = os.WriteFile(filepath.Join(runDir, "autotune-load.log"), out, 0644)
	loadSamples := <-loadSamplesCh
	logBenchmarkPowerAutotunePhaseSummary("load", loadSamples, logFunc)
	if runErr != nil {
		result.Notes = append(result.Notes, "full-load stage failed: "+runErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, fmt.Errorf("autotune load stage: %w", runErr)
	}

	// Selection.
	selectedSource, candidates, idleGPUAvg, loadGPUAvg, chooseErr := chooseBenchmarkPowerAutotuneSource(idleSamples, loadSamples)
	result.Candidates = candidates
	result.GPUPowerIdleW = idleGPUAvg
	result.GPUPowerLoadW = loadGPUAvg
	if chooseErr != nil {
		result.Notes = append(result.Notes, chooseErr.Error())
		if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
			return "", err
		}
		return runDir, chooseErr
	}
	// Same delta that chooseBenchmarkPowerAutotuneSource compared against.
	gpuDelta := loadGPUAvg - idleGPUAvg
	if gpuDelta <= 0 {
		gpuDelta = loadGPUAvg
	}
	logBenchmarkPowerAutotuneSelection(candidates, selectedSource, gpuDelta, logFunc)
	result.SelectedSource = selectedSource
	result.Status = "OK"

	var confidence float64
	selectionReason := fmt.Sprintf("selected %s after comparing full-load average against GPU-reported delta", selectedSource)
	for _, candidate := range candidates {
		if candidate.Selected {
			confidence = candidate.Confidence
			if strings.TrimSpace(candidate.SelectionNotes) != "" {
				selectionReason = candidate.SelectionNotes
			}
			break
		}
	}

	// Persist the choice.
	cfg := BenchmarkPowerAutotuneConfig{
		Version:           benchmarkPowerAutotuneVersion,
		UpdatedAt:         time.Now().UTC(),
		SelectedSource:    selectedSource,
		BenchmarkKind:     normalizedKind,
		Profile:           opts.Profile,
		IdleDurationSec:   benchmarkPowerAutotuneIdleSec,
		LoadDurationSec:   benchmarkPowerAutotuneLoadSec,
		SampleIntervalSec: benchmarkPowerAutotuneSampleInterval,
		Confidence:        confidence,
		Reason:            selectionReason,
	}
	result.Config = &cfg
	configPath := BenchmarkPowerSourceConfigPath(baseDir)
	if err := SaveBenchmarkPowerAutotuneConfig(configPath, cfg); err != nil {
		result.Status = "FAILED"
		result.Notes = append(result.Notes, "failed to save autotune config: "+err.Error())
		if writeErr := writeBenchmarkPowerAutotuneArtifacts(runDir, result); writeErr != nil {
			return "", writeErr
		}
		return runDir, err
	}
	logFunc(fmt.Sprintf("autotune conclusion: selected source %s; reason: %s", selectedSource, cfg.Reason))
	result.Notes = append(result.Notes, "saved autotune config to "+configPath)
	if err := writeBenchmarkPowerAutotuneArtifacts(runDir, result); err != nil {
		return "", err
	}
	return runDir, nil
}

// writeBenchmarkPowerAutotuneArtifacts writes result.json, summary.txt and
// report.md for result into runDir, stopping at the first failure.
func writeBenchmarkPowerAutotuneArtifacts(runDir string, result BenchmarkPowerAutotuneResult) error {
	resultJSON, err := json.MarshalIndent(result, "", " ")
	if err != nil {
		return fmt.Errorf("marshal autotune result: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "result.json"), resultJSON, 0644); err != nil {
		return fmt.Errorf("write autotune result.json: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(renderBenchmarkPowerAutotuneSummary(result)), 0644); err != nil {
		return fmt.Errorf("write autotune summary.txt: %w", err)
	}
	if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(renderBenchmarkPowerAutotuneReport(result)), 0644); err != nil {
		return fmt.Errorf("write autotune report.md: %w", err)
	}
	return nil
}

// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// Keeps the os/exec import referenced.
var _ = exec.ErrNotFound