From c95bbff23bf337db79a0778b594e24f2f3f4ef1d Mon Sep 17 00:00:00 2001 From: Mikhail Chusavitin Date: Wed, 1 Apr 2026 09:40:42 +0300 Subject: [PATCH] fix(metrics): stabilize cpu and power sampling --- audit/internal/platform/live_metrics.go | 16 +++--- audit/internal/platform/live_metrics_test.go | 50 +++++++++++++++++++ audit/internal/platform/sat_fan_stress.go | 35 +++++++++++-- .../internal/platform/sat_fan_stress_test.go | 42 +++++++++++++++- audit/internal/webui/server_test.go | 13 +++++ 5 files changed, 145 insertions(+), 11 deletions(-) diff --git a/audit/internal/platform/live_metrics.go b/audit/internal/platform/live_metrics.go index 9640b0a..ebecc78 100644 --- a/audit/internal/platform/live_metrics.go +++ b/audit/internal/platform/live_metrics.go @@ -68,18 +68,20 @@ func SampleLiveMetrics() LiveMetricSample { // sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns // the overall CPU utilisation percentage. -var cpuStatPrev [2]uint64 // [total, idle] - func sampleCPULoadPct() float64 { - total, idle := readCPUStat() - if total == 0 { + total0, idle0 := readCPUStat() + if total0 == 0 { return 0 } - prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1] - cpuStatPrev = [2]uint64{total, idle} - if prevTotal == 0 { + time.Sleep(200 * time.Millisecond) + total1, idle1 := readCPUStat() + if total1 == 0 { return 0 } + return cpuLoadPctBetween(total0, idle0, total1, idle1) +} + +func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 { dt := float64(total - prevTotal) di := float64(idle - prevIdle) if dt <= 0 { diff --git a/audit/internal/platform/live_metrics_test.go b/audit/internal/platform/live_metrics_test.go index c0db585..9b259c5 100644 --- a/audit/internal/platform/live_metrics_test.go +++ b/audit/internal/platform/live_metrics_test.go @@ -42,3 +42,53 @@ func TestCompactAmbientTempName(t *testing.T) { t.Fatalf("got %q", got) } } + +func TestCPULoadPctBetween(t *testing.T) { + tests := []struct { + name string + prevTotal uint64 + prevIdle uint64 + total uint64 + idle uint64 + want float64 + }{ + { + name: "busy half", + prevTotal: 100, + prevIdle: 40, + total: 200, + idle: 90, + want: 50, + }, + { + name: "fully busy", + prevTotal: 100, + prevIdle: 40, + total: 200, + idle: 40, + want: 100, + }, + { + name: "no progress", + prevTotal: 100, + prevIdle: 40, + total: 100, + idle: 40, + want: 0, + }, + { + name: "idle delta larger than total clamps to zero", + prevTotal: 100, + prevIdle: 40, + total: 200, + idle: 150, + want: 0, + }, + } + + for _, tc := range tests { + if got := cpuLoadPctBetween(tc.prevTotal, tc.prevIdle, tc.total, tc.idle); got != tc.want { + t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", tc.name, got, tc.want) + } + } +} diff --git a/audit/internal/platform/sat_fan_stress.go b/audit/internal/platform/sat_fan_stress.go index ae401f1..2ec6a31 100644 --- a/audit/internal/platform/sat_fan_stress.go +++ b/audit/internal/platform/sat_fan_stress.go @@ -51,6 +51,18 @@ type FanStressRow struct { SysPowerW float64 // DCMI system power reading } +type cachedPowerReading struct { + Value float64 + UpdatedAt time.Time +} + +var ( + systemPowerCacheMu sync.Mutex + systemPowerCache cachedPowerReading +) + +const systemPowerHoldTTL = 15 * time.Second + // RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds, // temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv. // Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling. @@ -508,11 +520,17 @@ func sampleCPUTempViaSensors() float64 { // sampleSystemPower reads system power draw via DCMI. func sampleSystemPower() float64 { + now := time.Now() + current := 0.0 out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output() - if err != nil { - return 0 + if err == nil { + current = parseDCMIPowerReading(string(out)) } - return parseDCMIPowerReading(string(out)) + systemPowerCacheMu.Lock() + defer systemPowerCacheMu.Unlock() + value, updated := effectiveSystemPowerReading(systemPowerCache, current, now) + systemPowerCache = updated + return value } // parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output. @@ -535,6 +553,17 @@ func parseDCMIPowerReading(raw string) float64 { return 0 } +func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) { + if current > 0 { + cache = cachedPowerReading{Value: current, UpdatedAt: now} + return current, cache + } + if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL { + return cache.Value, cache + } + return 0, cache +} + // analyzeThrottling returns true if any GPU reported an active throttle reason // during either load phase. func analyzeThrottling(rows []FanStressRow) bool { diff --git a/audit/internal/platform/sat_fan_stress_test.go b/audit/internal/platform/sat_fan_stress_test.go index e7c00b3..39de2a6 100644 --- a/audit/internal/platform/sat_fan_stress_test.go +++ b/audit/internal/platform/sat_fan_stress_test.go @@ -1,6 +1,9 @@ package platform -import "testing" +import ( + "testing" + "time" +) func TestParseFanSpeeds(t *testing.T) { raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n" @@ -25,3 +28,40 @@ func TestFirstFanInputValue(t *testing.T) { t.Fatalf("got=%v ok=%v", got, ok) } } + +func TestParseDCMIPowerReading(t *testing.T) { + raw := ` +Instantaneous power reading: 512 Watts +Minimum during sampling period: 498 Watts +` + if got := parseDCMIPowerReading(raw); got != 512 { + t.Fatalf("parseDCMIPowerReading()=%v want 512", got) + } +} + +func TestEffectiveSystemPowerReading(t *testing.T) { + now := time.Now() + cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)} + + got, updated := effectiveSystemPowerReading(cache, 0, now) + if got != 480 { + t.Fatalf("got=%v want cached 480", got) + } + if updated.Value != 480 { + t.Fatalf("updated=%+v", updated) + } + + got, updated = effectiveSystemPowerReading(cache, 530, now) + if got != 530 { + t.Fatalf("got=%v want 530", got) + } + if updated.Value != 530 { + t.Fatalf("updated=%+v", updated) + } + + expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)} + got, _ = effectiveSystemPowerReading(expired, 0, now) + if got != 0 { + t.Fatalf("expired cache returned %v want 0", got) + } +} diff --git a/audit/internal/webui/server_test.go b/audit/internal/webui/server_test.go index f6c5886..d929692 100644 --- a/audit/internal/webui/server_test.go +++ b/audit/internal/webui/server_test.go @@ -89,6 +89,19 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) { } } +func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) { + got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0}) + want := []float64{0, 480, 480, 480, 510, 510} + if len(got) != len(want) { + t.Fatalf("len=%d want %d", len(got), len(want)) + } + for i := range want { + if got[i] != want[i] { + t.Fatalf("got[%d]=%v want %v", i, got[i], want[i]) + } + } +} + func TestRootRendersDashboard(t *testing.T) { dir := t.TempDir() path := filepath.Join(dir, "audit.json")