Compact metrics DB in background to prevent CPU spin under load
As metrics.db grew (1 sample/5 s × hours), handleMetricsChartSVG called
LoadAll() on every chart request — loading all rows across 4 tables through a
single SQLite connection. With ~10 charts auto-refreshing in parallel, requests
queued behind each other, saturating the connection pool and pegging a CPU core.
Fix: add a background compactor that runs every hour via the metrics collector:
• Downsample: rows older than 2 h are thinned to 1 per minute (keep MIN(ts)
per ts/60 bucket) — retains chart shape while cutting row count by ~92 %.
• Prune: rows older than 48 h are deleted entirely.
• After prune: WAL checkpoint/truncate to release disk space.
LoadAll() in handleMetricsChartSVG is unchanged — it now stays fast because
the DB is kept small rather than capping the query window.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -135,6 +135,14 @@ type namedMetricsRing struct {
|
||||
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
||||
const metricsChartWindow = 360
|
||||
|
||||
// metricsDownsampleAge is the age after which old metrics rows are downsampled
|
||||
// to 1 sample per minute. Data fresher than this is kept at full resolution.
|
||||
const metricsDownsampleAge = 2 * time.Hour
|
||||
|
||||
// metricsRetainWindow is the total retention period for metrics rows.
|
||||
// Rows older than this are deleted entirely by the background compactor.
|
||||
const metricsRetainWindow = 48 * time.Hour
|
||||
|
||||
var metricsCollectInterval = 5 * time.Second
|
||||
|
||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||
@@ -335,13 +343,24 @@ func (h *handler) startMetricsCollector() {
|
||||
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
||||
ticker := time.NewTicker(metricsCollectInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
sample := platform.SampleLiveMetrics()
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
pruneTicker := time.NewTicker(time.Hour)
|
||||
defer pruneTicker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
sample := platform.SampleLiveMetrics()
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
}
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
case <-pruneTicker.C:
|
||||
if h.metricsDB != nil {
|
||||
now := time.Now().UTC()
|
||||
_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
|
||||
_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
|
||||
}
|
||||
}
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user