fix(metrics): stabilize cpu and power sampling
This commit is contained in:
@@ -68,18 +68,20 @@ func SampleLiveMetrics() LiveMetricSample {
|
|||||||
|
|
||||||
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
||||||
// the overall CPU utilisation percentage.
|
// the overall CPU utilisation percentage.
|
||||||
var cpuStatPrev [2]uint64 // [total, idle]
|
|
||||||
|
|
||||||
func sampleCPULoadPct() float64 {
|
func sampleCPULoadPct() float64 {
|
||||||
total, idle := readCPUStat()
|
total0, idle0 := readCPUStat()
|
||||||
if total == 0 {
|
if total0 == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
|
time.Sleep(200 * time.Millisecond)
|
||||||
cpuStatPrev = [2]uint64{total, idle}
|
total1, idle1 := readCPUStat()
|
||||||
if prevTotal == 0 {
|
if total1 == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
return cpuLoadPctBetween(total0, idle0, total1, idle1)
|
||||||
|
}
|
||||||
|
|
||||||
|
func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
|
||||||
dt := float64(total - prevTotal)
|
dt := float64(total - prevTotal)
|
||||||
di := float64(idle - prevIdle)
|
di := float64(idle - prevIdle)
|
||||||
if dt <= 0 {
|
if dt <= 0 {
|
||||||
|
|||||||
@@ -42,3 +42,53 @@ func TestCompactAmbientTempName(t *testing.T) {
|
|||||||
t.Fatalf("got %q", got)
|
t.Fatalf("got %q", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestCPULoadPctBetween(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
name string
|
||||||
|
prevTotal uint64
|
||||||
|
prevIdle uint64
|
||||||
|
total uint64
|
||||||
|
idle uint64
|
||||||
|
want float64
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "busy half",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 90,
|
||||||
|
want: 50,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "fully busy",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 40,
|
||||||
|
want: 100,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "no progress",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 100,
|
||||||
|
idle: 40,
|
||||||
|
want: 0,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "idle delta larger than total clamps to zero",
|
||||||
|
prevTotal: 100,
|
||||||
|
prevIdle: 40,
|
||||||
|
total: 200,
|
||||||
|
idle: 150,
|
||||||
|
want: 0,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := cpuLoadPctBetween(tc.prevTotal, tc.prevIdle, tc.total, tc.idle); got != tc.want {
|
||||||
|
t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", tc.name, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -51,6 +51,18 @@ type FanStressRow struct {
|
|||||||
SysPowerW float64 // DCMI system power reading
|
SysPowerW float64 // DCMI system power reading
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cachedPowerReading is a system power reading paired with the time it was
// stored.
type cachedPowerReading struct {
|
||||||
|
Value float64 // stored reading; effectiveSystemPowerReading only serves it back when > 0
|
||||||
|
UpdatedAt time.Time // when Value was stored; compared against systemPowerHoldTTL
|
||||||
|
}
|
||||||
|
|
||||||
|
// Package-level state used by sampleSystemPower to remember the last
// reading between calls.
var (
|
||||||
|
systemPowerCacheMu sync.Mutex // held by sampleSystemPower while systemPowerCache is read and replaced
|
||||||
|
systemPowerCache cachedPowerReading // last reading stored by sampleSystemPower
|
||||||
|
)
|
||||||
|
|
||||||
|
// systemPowerHoldTTL is the maximum age at which a cached reading is still
// returned in place of a non-positive current reading.
const systemPowerHoldTTL = 15 * time.Second
|
||||||
|
|
||||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||||
@@ -508,11 +520,17 @@ func sampleCPUTempViaSensors() float64 {
|
|||||||
|
|
||||||
// sampleSystemPower reads system power draw via DCMI.
|
// sampleSystemPower reads system power draw via DCMI.
|
||||||
func sampleSystemPower() float64 {
|
func sampleSystemPower() float64 {
|
||||||
|
now := time.Now()
|
||||||
|
current := 0.0
|
||||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||||
if err != nil {
|
if err == nil {
|
||||||
return 0
|
current = parseDCMIPowerReading(string(out))
|
||||||
}
|
}
|
||||||
return parseDCMIPowerReading(string(out))
|
systemPowerCacheMu.Lock()
|
||||||
|
defer systemPowerCacheMu.Unlock()
|
||||||
|
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
||||||
|
systemPowerCache = updated
|
||||||
|
return value
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||||
@@ -535,6 +553,17 @@ func parseDCMIPowerReading(raw string) float64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
||||||
|
if current > 0 {
|
||||||
|
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
||||||
|
return current, cache
|
||||||
|
}
|
||||||
|
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||||
|
return cache.Value, cache
|
||||||
|
}
|
||||||
|
return 0, cache
|
||||||
|
}
|
||||||
|
|
||||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||||
// during either load phase.
|
// during either load phase.
|
||||||
func analyzeThrottling(rows []FanStressRow) bool {
|
func analyzeThrottling(rows []FanStressRow) bool {
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
func TestParseFanSpeeds(t *testing.T) {
|
func TestParseFanSpeeds(t *testing.T) {
|
||||||
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
||||||
@@ -25,3 +28,40 @@ func TestFirstFanInputValue(t *testing.T) {
|
|||||||
t.Fatalf("got=%v ok=%v", got, ok)
|
t.Fatalf("got=%v ok=%v", got, ok)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestParseDCMIPowerReading checks that parseDCMIPowerReading returns the
// instantaneous value (512) from sample output that also contains a
// "Minimum during sampling period" line with a different value (498).
func TestParseDCMIPowerReading(t *testing.T) {
|
||||||
|
raw := `
|
||||||
|
Instantaneous power reading: 512 Watts
|
||||||
|
Minimum during sampling period: 498 Watts
|
||||||
|
`
|
||||||
|
if got := parseDCMIPowerReading(raw); got != 512 {
|
||||||
|
t.Fatalf("parseDCMIPowerReading()=%v want 512", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||||
|
now := time.Now()
|
||||||
|
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||||
|
|
||||||
|
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
||||||
|
if got != 480 {
|
||||||
|
t.Fatalf("got=%v want cached 480", got)
|
||||||
|
}
|
||||||
|
if updated.Value != 480 {
|
||||||
|
t.Fatalf("updated=%+v", updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
||||||
|
if got != 530 {
|
||||||
|
t.Fatalf("got=%v want 530", got)
|
||||||
|
}
|
||||||
|
if updated.Value != 530 {
|
||||||
|
t.Fatalf("updated=%+v", updated)
|
||||||
|
}
|
||||||
|
|
||||||
|
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||||
|
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
||||||
|
if got != 0 {
|
||||||
|
t.Fatalf("expired cache returned %v want 0", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -89,6 +89,19 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||||
|
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||||
|
want := []float64{0, 480, 480, 480, 510, 510}
|
||||||
|
if len(got) != len(want) {
|
||||||
|
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if got[i] != want[i] {
|
||||||
|
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRootRendersDashboard(t *testing.T) {
|
func TestRootRendersDashboard(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
|||||||
Reference in New Issue
Block a user