Compact metrics DB in background to prevent CPU spin under load
As metrics.db grew (one sample every 5 s, accumulating over hours), handleMetricsChartSVG called
LoadAll() on every chart request — loading every row across all 4 tables through a
single SQLite connection. With ~10 charts auto-refreshing in parallel, requests
queued behind each other, saturating the connection pool and pegging a CPU core.
Fix: add a background compactor that runs every hour via the metrics collector:
• Downsample: rows older than 2 h are thinned to 1 per minute (keep MIN(ts)
per ts/60 bucket) — retains chart shape while cutting row count by ~92 %.
• Prune: rows older than 48 h are deleted entirely.
• After prune: WAL checkpoint/truncate to release disk space.
LoadAll() in handleMetricsChartSVG is unchanged — it now stays fast because
the DB is kept small rather than capping the query window.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -161,6 +161,56 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
// Downsample reduces density of old metrics rows to 1 sample per minute.
|
||||
// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
|
||||
// affected — rows newer than downsampleBefore keep full 5-second resolution.
|
||||
// For each 60-second bucket the row with the smallest ts is kept; the rest
|
||||
// are deleted. This trims ~92 % of rows in that window while preserving
|
||||
// the overall shape of every chart.
|
||||
//
|
||||
// Called hourly by the metrics collector background goroutine.
|
||||
func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
start := deleteOlderThan.Unix()
|
||||
end := downsampleBefore.Unix()
|
||||
if end <= start {
|
||||
return nil
|
||||
}
|
||||
// For each table: delete rows in [start, end) whose ts is NOT the minimum
|
||||
// ts in its 60-second bucket (ts/60 integer division = bucket ID).
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
_, err := m.db.Exec(`
|
||||
DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
|
||||
AND ts NOT IN (
|
||||
SELECT MIN(ts) FROM `+table+`
|
||||
WHERE ts >= ? AND ts < ?
|
||||
GROUP BY ts / 60
|
||||
)`, start, end, start, end)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Prune deletes all rows older than the given cutoff from every metrics table.
|
||||
// Called hourly by the metrics collector to keep the DB size bounded.
|
||||
func (m *MetricsDB) Prune(before time.Time) error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
cutTS := before.Unix()
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
|
||||
@@ -135,6 +135,14 @@ type namedMetricsRing struct {
|
||||
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
||||
const metricsChartWindow = 360
|
||||
|
||||
// metricsDownsampleAge is the age after which old metrics rows are downsampled
|
||||
// to 1 sample per minute. Data fresher than this is kept at full resolution.
|
||||
const metricsDownsampleAge = 2 * time.Hour
|
||||
|
||||
// metricsRetainWindow is the total retention period for metrics rows.
|
||||
// Rows older than this are deleted entirely by the background compactor.
|
||||
const metricsRetainWindow = 48 * time.Hour
|
||||
|
||||
var metricsCollectInterval = 5 * time.Second
|
||||
|
||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||
@@ -335,13 +343,24 @@ func (h *handler) startMetricsCollector() {
|
||||
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
||||
ticker := time.NewTicker(metricsCollectInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
sample := platform.SampleLiveMetrics()
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
pruneTicker := time.NewTicker(time.Hour)
|
||||
defer pruneTicker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
sample := platform.SampleLiveMetrics()
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
}
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
case <-pruneTicker.C:
|
||||
if h.metricsDB != nil {
|
||||
now := time.Now().UTC()
|
||||
_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
|
||||
_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
|
||||
}
|
||||
}
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user