Globalize autotuned system power source

This commit is contained in:
2026-04-20 07:02:12 +03:00
parent 17118298bd
commit b3cf8e3893
14 changed files with 327 additions and 108 deletions

View File

@@ -401,11 +401,15 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult) string {
}
}
// ── Server Power (IPMI) ───────────────────────────────────────────────────
// ── Server Power ───────────────────────────────────────────────────────────
if sp := result.ServerPower; sp != nil {
b.WriteString("## Server Power (IPMI)\n\n")
title := "## Server Power\n\n"
if sp.Source != "" {
title = fmt.Sprintf("## Server Power (`%s`)\n\n", sp.Source)
}
b.WriteString(title)
if !sp.Available {
b.WriteString("IPMI power measurement unavailable.\n\n")
b.WriteString("Server power measurement unavailable.\n\n")
} else {
spRows := [][]string{
{"Server idle", fmt.Sprintf("%.0f W", sp.IdleW)},

View File

@@ -16,14 +16,17 @@ import (
// LiveMetricSample is a single point-in-time snapshot of server metrics
// collected for the web UI metrics page.
type LiveMetricSample struct {
Timestamp time.Time `json:"ts"`
Fans []FanReading `json:"fans"`
Temps []TempReading `json:"temps"`
PowerW float64 `json:"power_w"`
PSUs []PSUReading `json:"psus,omitempty"`
CPULoadPct float64 `json:"cpu_load_pct"`
MemLoadPct float64 `json:"mem_load_pct"`
GPUs []GPUMetricRow `json:"gpus"`
Timestamp time.Time `json:"ts"`
Fans []FanReading `json:"fans"`
Temps []TempReading `json:"temps"`
PowerW float64 `json:"power_w"`
PowerSource string `json:"power_source,omitempty"`
PowerMode string `json:"power_mode,omitempty"`
PowerReason string `json:"power_reason,omitempty"`
PSUs []PSUReading `json:"psus,omitempty"`
CPULoadPct float64 `json:"cpu_load_pct"`
MemLoadPct float64 `json:"mem_load_pct"`
GPUs []GPUMetricRow `json:"gpus"`
}
// PSUReading is a per-slot power supply input power reading.
@@ -67,15 +70,13 @@ func SampleLiveMetrics() LiveMetricSample {
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
s.PSUs = samplePSUPower()
// System power: prefer sum of PSU AC inputs (full wall draw); fall back to DCMI.
if len(s.PSUs) > 0 {
var total float64
for _, p := range s.PSUs {
total += p.PowerW
}
s.PowerW = total
} else {
s.PowerW = sampleSystemPower()
// System power: use the global autotune-selected source when configured,
// otherwise fall back to the historical heuristic and mark the mode.
if powerW, decision, err := SampleSystemPowerResolved(""); err == nil {
s.PowerW = powerW
s.PowerSource = decision.EffectiveSource
s.PowerMode = decision.Mode
s.PowerReason = decision.Reason
}
// CPU load — from /proc/stat

View File

@@ -43,17 +43,22 @@ type GPUStressMetric struct {
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
type FanStressRow struct {
TimestampUTC string
ElapsedSec float64
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
GPUs []GPUStressMetric
Fans []FanReading
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
SysPowerW float64 // DCMI system power reading
TimestampUTC string
ElapsedSec float64
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
GPUs []GPUStressMetric
Fans []FanReading
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
SysPowerW float64
SysPowerSource string
SysPowerMode string
}
// cachedPowerReading is the last positive system power sample together with
// the labels that accompanied it. effectiveSystemPowerReading replaces it
// whenever a new sample is > 0 and otherwise consults it, subject to
// systemPowerHoldTTL, when the current sample is not positive.
type cachedPowerReading struct {
// Value is the stored power reading; only values > 0 are ever cached.
// NOTE(review): presumably watts, matching the *W field naming — confirm.
Value float64
// Source, Mode and Reason are stored alongside Value from the same sample.
Source string
Mode string
Reason string
// UpdatedAt is the time Value was stored; a zero time marks an empty cache.
UpdatedAt time.Time
}
@@ -278,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
row.GPUs = sampleGPUStressMetrics(gpuIndices)
row.Fans, _ = sampleFanSpeeds()
row.CPUMaxTempC = sampleCPUMaxTemp()
row.SysPowerW = sampleSystemPower()
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
return row
}
@@ -763,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
return max
}
// sampleSystemPower reads system power draw via DCMI.
func sampleSystemPower() float64 {
// sampleSystemPowerResolved reads system power via the global autotune source,
// falling back to the historical heuristic before autotune or when degraded.
func sampleSystemPowerResolved() (float64, string, string) {
now := time.Now()
current := 0.0
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
if err == nil {
current = parseDCMIPowerReading(string(out))
}
current, decision, err := SampleSystemPowerResolved("")
systemPowerCacheMu.Lock()
defer systemPowerCacheMu.Unlock()
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
if err != nil {
current = 0
}
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
systemPowerCache = updated
return value
return value, updated.Source, updated.Mode
}
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
@@ -798,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
return 0
}
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
if current > 0 {
cache = cachedPowerReading{Value: current, UpdatedAt: now}
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
return current, cache
}
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {

View File

@@ -112,7 +112,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
now := time.Now()
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
got, updated := effectiveSystemPowerReading(cache, 0, now)
got, updated := effectiveSystemPowerReading(cache, 0, "", "", "", now)
if got != 480 {
t.Fatalf("got=%v want cached 480", got)
}
@@ -120,7 +120,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
t.Fatalf("updated=%+v", updated)
}
got, updated = effectiveSystemPowerReading(cache, 530, now)
got, updated = effectiveSystemPowerReading(cache, 530, "dcmi", "fallback", "test", now)
if got != 530 {
t.Fatalf("got=%v want 530", got)
}
@@ -129,7 +129,7 @@ func TestEffectiveSystemPowerReading(t *testing.T) {
}
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
got, _ = effectiveSystemPowerReading(expired, 0, now)
got, _ = effectiveSystemPowerReading(expired, 0, "", "", "", now)
if got != 0 {
t.Fatalf("expired cache returned %v want 0", got)
}