From 52c3a24b763ce9a798f59e15aafff8f091b8a898 Mon Sep 17 00:00:00 2001
From: Michael Chus
Date: Sat, 18 Apr 2026 15:28:05 +0300
Subject: [PATCH] Compact metrics DB in background to prevent CPU spin under load
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As metrics.db grew (1 sample/5 s × hours), handleMetricsChartSVG called
LoadAll() on every chart request — loading all rows across 4 tables
through a single SQLite connection. With ~10 charts auto-refreshing in
parallel, requests queued behind each other, saturating the connection
pool and pegging a CPU core.

Fix: add a background compactor that the metrics collector runs once at
startup and then every hour:

  • Downsample: rows older than 2 h are thinned to 1 per minute
    (keep MIN(ts) per ts/60 bucket) — retains chart shape while
    cutting row count by ~92 %.
  • Prune: rows older than 48 h are deleted entirely.
  • After prune: WAL checkpoint/truncate so the write-ahead log does
    not keep growing. Pages freed in the main database file are
    reused by later inserts rather than returned to the filesystem.

LoadAll() in handleMetricsChartSVG is unchanged — it stays fast because
the DB is kept small, not because the query window is capped.

Co-Authored-By: Claude Sonnet 4.6
---
 audit/internal/webui/metricsdb.go | 69 +++++++++++++++++++++++++++++++
 audit/internal/webui/server.go    | 46 +++++++++++++++++----
 2 files changed, 109 insertions(+), 6 deletions(-)

diff --git a/audit/internal/webui/metricsdb.go b/audit/internal/webui/metricsdb.go
index ac6cb0a..21977cf 100644
--- a/audit/internal/webui/metricsdb.go
+++ b/audit/internal/webui/metricsdb.go
@@ -161,6 +161,75 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
 	return tx.Commit()
 }
 
+// compactedMetricsTables lists every table that Downsample and Prune operate
+// on, so the two passes cannot drift apart.
+var compactedMetricsTables = []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"}
+
+// Downsample reduces the density of old metrics rows to 1 sample per minute.
+// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
+// affected — rows at or after downsampleBefore keep their full resolution.
+// For each 60-second bucket (ts / 60) the rows with the smallest ts are kept
+// and the rest are deleted. At the 5-second collection interval this trims
+// ~92 % of the rows in the window while preserving the overall shape of
+// every chart.
+//
+// The pass is idempotent: re-running it over an already thinned window
+// deletes nothing. It is a no-op on a nil receiver or unopened database,
+// and when the window is empty (downsampleBefore <= deleteOlderThan).
+//
+// Called by the metrics collector background goroutine.
+func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
+	if m == nil || m.db == nil {
+		return nil
+	}
+	start := deleteOlderThan.Unix()
+	end := downsampleBefore.Unix()
+	if end <= start {
+		return nil
+	}
+	// Table names come from a fixed in-package list, never from user input,
+	// so concatenating them into the statement is safe; the time bounds are
+	// bound as parameters.
+	for _, table := range compactedMetricsTables {
+		// Delete rows in [start, end) whose ts is NOT the minimum ts of its
+		// 60-second bucket (integer division ts/60 is the bucket ID).
+		_, err := m.db.Exec(`
+DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
+  AND ts NOT IN (
+    SELECT MIN(ts) FROM `+table+`
+    WHERE ts >= ? AND ts < ?
+    GROUP BY ts / 60
+  )`, start, end, start, end)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Prune deletes all rows older than the given cutoff from every metrics table
+// and then asks SQLite to checkpoint and truncate the write-ahead log.
+// The checkpoint only resets the WAL file; unless auto_vacuum is enabled,
+// pages freed in the main database file are reused by later inserts rather
+// than returned to the filesystem.
+//
+// Called by the metrics collector to keep the DB size bounded.
+func (m *MetricsDB) Prune(before time.Time) error {
+	if m == nil || m.db == nil {
+		return nil
+	}
+	cutTS := before.Unix()
+	for _, table := range compactedMetricsTables {
+		if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
+			return err
+		}
+	}
+	// Best effort: the rows are already gone, and a checkpoint that loses the
+	// race against a concurrent reader is simply retried on the next pass.
+	_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
+	return nil
+}
+
 // LoadRecent returns up to n samples in chronological order (oldest first).
 func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
 	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
diff --git a/audit/internal/webui/server.go b/audit/internal/webui/server.go
index c87582b..af200dd 100644
--- a/audit/internal/webui/server.go
+++ b/audit/internal/webui/server.go
@@ -135,6 +135,18 @@ type namedMetricsRing struct {
 // At metricsCollectInterval = 5 s this covers 30 minutes of live history.
 const metricsChartWindow = 360
 
+// metricsDownsampleAge is the age after which old metrics rows are downsampled
+// to 1 sample per minute. Data fresher than this is kept at full resolution.
+const metricsDownsampleAge = 2 * time.Hour
+
+// metricsRetainWindow is the total retention period for metrics rows.
+// Rows older than this are deleted entirely by the background compactor.
+const metricsRetainWindow = 48 * time.Hour
+
+// metricsCompactInterval is how often the background compactor downsamples
+// and prunes the metrics DB after its initial pass at collector startup.
+const metricsCompactInterval = time.Hour
+
 var metricsCollectInterval = 5 * time.Second
 
 // pendingNetChange tracks a network state change awaiting confirmation.
@@ -335,13 +347,35 @@ func (h *handler) startMetricsCollector() {
 	goRecoverLoop("metrics collector", 2*time.Second, func() {
 		ticker := time.NewTicker(metricsCollectInterval)
 		defer ticker.Stop()
-		for range ticker.C {
-			sample := platform.SampleLiveMetrics()
-			if h.metricsDB != nil {
-				_ = h.metricsDB.Write(sample)
+		compactTicker := time.NewTicker(metricsCompactInterval)
+		defer compactTicker.Stop()
+		// compact thins and prunes the metrics DB. Errors are deliberately
+		// dropped: compaction is best effort and the next pass retries.
+		compact := func() {
+			if h.metricsDB == nil {
+				return
+			}
+			now := time.Now().UTC()
+			retainCutoff := now.Add(-metricsRetainWindow)
+			_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), retainCutoff)
+			_ = h.metricsDB.Prune(retainCutoff)
+		}
+		// Compact once at startup so a DB that grew during earlier runs is
+		// trimmed immediately, and so a process that restarts more often
+		// than metricsCompactInterval still gets compacted.
+		compact()
+		for {
+			select {
+			case <-ticker.C:
+				sample := platform.SampleLiveMetrics()
+				if h.metricsDB != nil {
+					_ = h.metricsDB.Write(sample)
+				}
+				h.feedRings(sample)
+				h.setLatestMetric(sample)
+			case <-compactTicker.C:
+				compact()
 			}
-			h.feedRings(sample)
-			h.setLatestMetric(sample)
 		}
 	})
 }