Globalize autotuned system power source

This commit is contained in:
2026-04-20 07:02:12 +03:00
parent 17118298bd
commit b3cf8e3893
14 changed files with 327 additions and 108 deletions

View File

@@ -43,17 +43,22 @@ type GPUStressMetric struct {
// FanStressRow is one second-interval telemetry sample covering all monitored dimensions.
type FanStressRow struct {
TimestampUTC string
ElapsedSec float64
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
GPUs []GPUStressMetric
Fans []FanReading
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
SysPowerW float64 // DCMI system power reading
TimestampUTC string
ElapsedSec float64
Phase string // "baseline", "load1", "pause", "load2", "cooldown"
GPUs []GPUStressMetric
Fans []FanReading
CPUMaxTempC float64 // highest CPU temperature from ipmitool / sensors
SysPowerW float64
SysPowerSource string
SysPowerMode string
}
type cachedPowerReading struct {
Value float64
Source string
Mode string
Reason string
UpdatedAt time.Time
}
@@ -278,7 +283,7 @@ func sampleFanStressRow(gpuIndices []int, phase string, elapsed float64) FanStre
row.GPUs = sampleGPUStressMetrics(gpuIndices)
row.Fans, _ = sampleFanSpeeds()
row.CPUMaxTempC = sampleCPUMaxTemp()
row.SysPowerW = sampleSystemPower()
row.SysPowerW, row.SysPowerSource, row.SysPowerMode = sampleSystemPowerResolved()
return row
}
@@ -763,19 +768,19 @@ func sampleCPUTempViaSensors() float64 {
return max
}
// sampleSystemPower reads system power draw via DCMI.
func sampleSystemPower() float64 {
// sampleSystemPowerResolved reads system power via the global autotune source,
// falling back to the historical heuristic before autotune or when degraded.
func sampleSystemPowerResolved() (float64, string, string) {
now := time.Now()
current := 0.0
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
if err == nil {
current = parseDCMIPowerReading(string(out))
}
current, decision, err := SampleSystemPowerResolved("")
systemPowerCacheMu.Lock()
defer systemPowerCacheMu.Unlock()
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
if err != nil {
current = 0
}
value, updated := effectiveSystemPowerReading(systemPowerCache, current, decision.EffectiveSource, decision.Mode, decision.Reason, now)
systemPowerCache = updated
return value
return value, updated.Source, updated.Mode
}
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
@@ -798,9 +803,9 @@ func parseDCMIPowerReading(raw string) float64 {
return 0
}
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, source, mode, reason string, now time.Time) (float64, cachedPowerReading) {
if current > 0 {
cache = cachedPowerReading{Value: current, UpdatedAt: now}
cache = cachedPowerReading{Value: current, Source: source, Mode: mode, Reason: reason, UpdatedAt: now}
return current, cache
}
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {