Compact metrics DB in background to prevent CPU spin under load
As metrics.db grew (1 sample/5 s × hours), handleMetricsChartSVG called
LoadAll() on every chart request — loading all rows across 4 tables through a
single SQLite connection. With ~10 charts auto-refreshing in parallel, requests
queued behind each other, saturating the connection pool and pegging a CPU core.
Fix: add a background compactor that runs every hour via the metrics collector:
• Downsample: rows older than 2 h are thinned to 1 per minute (keep MIN(ts)
per ts/60 bucket) — retains chart shape while cutting row count by ~92 %.
• Prune: rows older than 48 h are deleted entirely.
• After prune: WAL checkpoint/truncate to release disk space.
LoadAll() in handleMetricsChartSVG is unchanged: instead of capping the query
window, the compactor keeps the DB small enough that loading every row stays fast.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -161,6 +161,56 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
return tx.Commit()
|
return tx.Commit()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Downsample reduces density of old metrics rows to 1 sample per minute.
|
||||||
|
// Only rows in the half-open window [deleteOlderThan, downsampleBefore) are
|
||||||
|
// affected — rows newer than downsampleBefore keep full 5-second resolution.
|
||||||
|
// For each 60-second bucket the row with the smallest ts is kept; the rest
|
||||||
|
// are deleted. This trims ~92 % of rows in that window while preserving
|
||||||
|
// the overall shape of every chart.
|
||||||
|
//
|
||||||
|
// Called hourly by the metrics collector background goroutine.
|
||||||
|
func (m *MetricsDB) Downsample(downsampleBefore, deleteOlderThan time.Time) error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
start := deleteOlderThan.Unix()
|
||||||
|
end := downsampleBefore.Unix()
|
||||||
|
if end <= start {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
// For each table: delete rows in [start, end) whose ts is NOT the minimum
|
||||||
|
// ts in its 60-second bucket (ts/60 integer division = bucket ID).
|
||||||
|
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||||
|
_, err := m.db.Exec(`
|
||||||
|
DELETE FROM `+table+` WHERE ts >= ? AND ts < ?
|
||||||
|
AND ts NOT IN (
|
||||||
|
SELECT MIN(ts) FROM `+table+`
|
||||||
|
WHERE ts >= ? AND ts < ?
|
||||||
|
GROUP BY ts / 60
|
||||||
|
)`, start, end, start, end)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prune deletes all rows older than the given cutoff from every metrics table.
|
||||||
|
// Called hourly by the metrics collector to keep the DB size bounded.
|
||||||
|
func (m *MetricsDB) Prune(before time.Time) error {
|
||||||
|
if m == nil || m.db == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
cutTS := before.Unix()
|
||||||
|
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||||
|
if _, err := m.db.Exec("DELETE FROM "+table+" WHERE ts < ?", cutTS); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_, _ = m.db.Exec("PRAGMA wal_checkpoint(TRUNCATE)")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||||
|
|||||||
@@ -135,6 +135,14 @@ type namedMetricsRing struct {
|
|||||||
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
||||||
const metricsChartWindow = 360
|
const metricsChartWindow = 360
|
||||||
|
|
||||||
|
// Retention policy applied by the background metrics compactor.
const (
	// metricsDownsampleAge is how old a metrics row must be before it is
	// thinned to one sample per minute; fresher rows keep full resolution.
	metricsDownsampleAge = 2 * time.Hour

	// metricsRetainWindow is the total retention period for metrics rows;
	// the background compactor deletes rows older than this entirely.
	metricsRetainWindow = 48 * time.Hour
)
|
||||||
|
|
||||||
var metricsCollectInterval = 5 * time.Second
|
var metricsCollectInterval = 5 * time.Second
|
||||||
|
|
||||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||||
@@ -335,13 +343,24 @@ func (h *handler) startMetricsCollector() {
|
|||||||
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
goRecoverLoop("metrics collector", 2*time.Second, func() {
|
||||||
ticker := time.NewTicker(metricsCollectInterval)
|
ticker := time.NewTicker(metricsCollectInterval)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
pruneTicker := time.NewTicker(time.Hour)
|
||||||
sample := platform.SampleLiveMetrics()
|
defer pruneTicker.Stop()
|
||||||
if h.metricsDB != nil {
|
for {
|
||||||
_ = h.metricsDB.Write(sample)
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
sample := platform.SampleLiveMetrics()
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
_ = h.metricsDB.Write(sample)
|
||||||
|
}
|
||||||
|
h.feedRings(sample)
|
||||||
|
h.setLatestMetric(sample)
|
||||||
|
case <-pruneTicker.C:
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
now := time.Now().UTC()
|
||||||
|
_ = h.metricsDB.Downsample(now.Add(-metricsDownsampleAge), now.Add(-metricsRetainWindow))
|
||||||
|
_ = h.metricsDB.Prune(now.Add(-metricsRetainWindow))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
h.feedRings(sample)
|
|
||||||
h.setLatestMetric(sample)
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user