Compact metrics DB in background to prevent CPU spin under load
As metrics.db grew (one sample every 5 s, accumulating over hours), handleMetricsChartSVG called
LoadAll() on every chart request — loading every row across all 4 tables through a
single SQLite connection. With ~10 charts auto-refreshing in parallel, requests
queued behind each other, saturating the connection pool and pegging a CPU core.
Fix: add a background compactor that runs every hour via the metrics collector:
• Downsample: rows older than 2 h are thinned to 1 per minute (keep MIN(ts)
per ts/60 bucket) — retains chart shape while cutting row count by ~92 %.
• Prune: rows older than 48 h are deleted entirely.
• After prune: WAL checkpoint/truncate to release disk space.
LoadAll() in handleMetricsChartSVG is unchanged — it now stays fast because
the DB is kept small rather than capping the query window.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -161,6 +161,56 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
// Downsample reduces density of old metrics rows to 1 sample per minute.
|
||||
// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
|
||||
// affected — rows newer than downsampleBefore keep full 5-second resolution.
|
||||
// For each 60-second bucket the row with the smallest ts is kept; the rest
|
||||
// are deleted. This trims ~92 % of rows in that window while preserving
|
||||
// the overall shape of every chart.
|
||||
//
|
||||
// Called hourly by the metrics collector background goroutine.
|
||||
func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
start := deleteOlderThan.Unix()
|
||||
end := downsampleBefore.Unix()
|
||||
if end <= start {
|
||||
return nil
|
||||
}
|
||||
// For each table: delete rows in [start, end) whose ts is NOT the minimum
|
||||
// ts in its 60-second bucket (ts/60 integer division = bucket ID).
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
_, err := m.db.Exec(`
|
||||
DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
|
||||
AND ts NOT IN (
|
||||
SELECT MIN(ts) FROM `+table+`
|
||||
WHERE ts >= ? AND ts < ?
|
||||
GROUP BY ts / 60
|
||||
)`, start, end, start, end)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Prune deletes all rows older than the given cutoff from every metrics table.
|
||||
// Called hourly by the metrics collector to keep the DB size bounded.
|
||||
func (m *MetricsDB) Prune(before time.Time) error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
cutTS := before.Unix()
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
|
||||
@@ -135,6 +135,14 @@ type namedMetricsRing struct {
|
||||
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
||||
const metricsChartWindow = 360
|
||||
|
||||
// metricsDownsampleAge is the age after which old metrics rows are downsampled
|
||||
// to 1 sample per minute. Data fresher than this is kept at full resolution.
|
||||
const metricsDownsampleAge = 2 * time.Hour
|
||||
|
||||
// metricsRetainWindow is the total retention period for metrics rows.
|
||||
// Rows older than this are deleted entirely by the background compactor.
|
||||
const metricsRetainWindow = 48 * time.Hour
|
||||
|
||||
var metricsCollectInterval = 5 * time.Second
|
||||
|
||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||
@@ -335,13 +343,24 @@ func (h *handler) startMetricsCollector() {
|
||||
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
||||
ticker := time.NewTicker(metricsCollectInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
sample := platform.SampleLiveMetrics()
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
pruneTicker := time.NewTicker(time.Hour)
|
||||
defer pruneTicker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
sample := platform.SampleLiveMetrics()
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
}
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
case <-pruneTicker.C:
|
||||
if h.metricsDB != nil {
|
||||
now := time.Now().UTC()
|
||||
_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
|
||||
_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
|
||||
}
|
||||
}
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user