feat(metrics): persist history in sqlite and add AMD memory validate tests

This commit is contained in:
2026-03-29 12:28:06 +03:00
parent 98f0cf0d52
commit e15bcc91c5
10 changed files with 539 additions and 29 deletions

View File

@@ -599,10 +599,9 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
case <-r.Context().Done():
return
case <-ticker.C:
sample := platform.SampleLiveMetrics()
h.feedRings(sample)
if h.metricsDB != nil {
_ = h.metricsDB.Write(sample)
sample, ok := h.latestMetric()
if !ok {
continue
}
b, err := json.Marshal(sample)
if err != nil {

View File

@@ -3,7 +3,6 @@ package webui
import (
"database/sql"
"encoding/csv"
"fmt"
"io"
"strconv"
"time"
@@ -13,7 +12,6 @@ import (
)
const metricsDBPath = "/appdata/bee/metrics.db"
const metricsKeepDuration = 24 * time.Hour
// MetricsDB persists live metric samples to SQLite.
type MetricsDB struct {
@@ -116,11 +114,18 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
}
// LoadRecent returns up to n samples in chronological order (oldest first).
// It reconstructs LiveMetricSample from the normalized tables.
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
rows, err := m.db.Query(
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n,
)
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
}
// LoadAll returns all persisted samples in chronological order (oldest first).
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
}
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
rows, err := m.db.Query(query, args...)
if err != nil {
return nil, err
}
@@ -257,14 +262,6 @@ func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
return samples, nil
}
// Prune deletes samples older than keepDuration.
func (m *MetricsDB) Prune(keepDuration time.Duration) {
cutoff := time.Now().Add(-keepDuration).Unix()
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
_, _ = m.db.Exec(fmt.Sprintf("DELETE FROM %s WHERE ts < ?", table), cutoff)
}
}
// ExportCSV writes all sys+gpu data as CSV to w.
func (m *MetricsDB) ExportCSV(w io.Writer) error {
rows, err := m.db.Query(`

View File

@@ -494,7 +494,11 @@ func renderValidate() string {
renderSATCard("memory", "Memory", "") +
renderSATCard("storage", "Storage", "") +
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
renderSATCard("amd", "AMD GPU", "") +
renderSATCard("amd", "AMD GPU", `<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<button id="sat-btn-amd-mem" class="btn" type="button" onclick="runSAT('amd-mem')">MEM Integrity</button>
<button id="sat-btn-amd-bandwidth" class="btn" type="button" onclick="runSAT('amd-bandwidth')">MEM Bandwidth</button>
</div>
<p style="color:var(--muted);font-size:12px;margin:0">Additional AMD memory diagnostics: RVS MEM for integrity and BABEL + rocm-bandwidth-test for memory/interconnect bandwidth.</p>`) +
`</div>
<div id="sat-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Test Output <span id="sat-title"></span></div>
@@ -505,7 +509,7 @@ let satES = null;
function runSAT(target) {
if (satES) { satES.close(); satES = null; }
const body = {};
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
body.display_name = labels[target] || ('Validate ' + target);
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
@@ -524,7 +528,7 @@ function runSAT(target) {
}
function runAllSAT() {
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
const targets = ['nvidia','memory','storage','cpu','amd'];
const targets = ['nvidia','memory','storage','cpu','amd','amd-mem','amd-bandwidth'];
const total = targets.length * cycles;
let enqueued = 0;
const status = document.getElementById('sat-all-status');
@@ -536,7 +540,7 @@ function runAllSAT() {
const btn = document.getElementById('sat-btn-' + target);
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
const body = {};
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
body.display_name = labels[target] || ('Validate ' + target);
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
@@ -554,6 +558,8 @@ function runAllSAT() {
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
if (!gp.amd) disableSATCard('amd-mem', 'No AMD GPU detected');
if (!gp.amd) disableSATCard('amd-bandwidth', 'No AMD GPU detected');
});
function disableSATCard(id, reason) {
const btn = document.getElementById('sat-btn-' + id);

View File

@@ -132,6 +132,8 @@ type handler struct {
// per-GPU rings (index = GPU index)
gpuRings []*gpuRings
ringsMu sync.Mutex
latestMu sync.RWMutex
latest *platform.LiveMetricSample
// metrics persistence (nil if DB unavailable)
metricsDB *MetricsDB
// install job (at most one at a time)
@@ -164,13 +166,16 @@ func NewHandler(opts HandlerOptions) http.Handler {
// Open metrics DB and pre-fill ring buffers from history.
if db, err := openMetricsDB(metricsDBPath); err == nil {
h.metricsDB = db
db.Prune(metricsKeepDuration)
if samples, err := db.LoadRecent(120); err == nil {
for _, s := range samples {
h.feedRings(s)
}
if len(samples) > 0 {
h.setLatestMetric(samples[len(samples)-1])
}
}
}
h.startMetricsCollector()
globalQueue.startWorker(&opts)
mux := http.NewServeMux()
@@ -198,6 +203,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
mux.HandleFunc("POST /api/sat/amd-bandwidth/run", h.handleAPISATRun("amd-bandwidth"))
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
@@ -260,6 +267,37 @@ func NewHandler(opts HandlerOptions) http.Handler {
return mux
}
func (h *handler) startMetricsCollector() {
go func() {
ticker := time.NewTicker(1 * time.Second)
defer ticker.Stop()
for range ticker.C {
sample := platform.SampleLiveMetrics()
h.feedRings(sample)
h.setLatestMetric(sample)
if h.metricsDB != nil {
_ = h.metricsDB.Write(sample)
}
}
}()
}
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
h.latestMu.Lock()
defer h.latestMu.Unlock()
cp := sample
h.latest = &cp
}
func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
h.latestMu.RLock()
defer h.latestMu.RUnlock()
if h.latest == nil {
return platform.LiveMetricSample{}, false
}
return *h.latest, true
}
// ListenAndServe starts the HTTP server.
func ListenAndServe(addr string, opts HandlerOptions) error {
return http.ListenAndServe(addr, NewHandler(opts))
@@ -387,6 +425,20 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
path = strings.TrimSuffix(path, ".svg")
if h.metricsDB != nil {
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "image/svg+xml")
w.Header().Set("Cache-Control", "no-store")
_, _ = w.Write(buf)
return
}
}
var datasets [][]float64
var names []string
var labels []string
@@ -601,6 +653,268 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf)
}
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
samples, err := h.metricsDB.LoadAll()
if err != nil || len(samples) == 0 {
return nil, nil, nil, "", nil, nil, false
}
return chartDataFromSamples(path, samples)
}
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
var datasets [][]float64
var names []string
var title string
var yMin, yMax *float64
labels := sampleTimeLabels(samples)
switch {
case path == "server-load":
title = "CPU / Memory Load"
cpu := make([]float64, len(samples))
mem := make([]float64, len(samples))
for i, s := range samples {
cpu[i] = s.CPULoadPct
mem[i] = s.MemLoadPct
}
datasets = [][]float64{cpu, mem}
names = []string{"CPU Load %", "Mem Load %"}
yMin = floatPtr(0)
yMax = floatPtr(100)
case path == "server-temp", path == "server-temp-cpu":
title = "CPU Temperature"
datasets, names = namedTempDatasets(samples, "cpu")
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case path == "server-temp-gpu":
title = "GPU Temperature"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case path == "server-temp-ambient":
title = "Ambient / Other Sensors"
datasets, names = namedTempDatasets(samples, "ambient")
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case path == "server-power":
title = "System Power"
power := make([]float64, len(samples))
for i, s := range samples {
power[i] = s.PowerW
}
datasets = [][]float64{power}
names = []string{"Power W"}
yMin, yMax = autoBounds120(power)
case path == "server-fans":
title = "Fan RPM"
datasets, names = namedFanDatasets(samples)
yMin, yMax = autoBounds120(datasets...)
case path == "gpu-all-load":
title = "GPU Compute Load"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
yMin = floatPtr(0)
yMax = floatPtr(100)
case path == "gpu-all-memload":
title = "GPU Memory Load"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
yMin = floatPtr(0)
yMax = floatPtr(100)
case path == "gpu-all-power":
title = "GPU Power"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.PowerW })
yMin, yMax = autoBounds120(datasets...)
case path == "gpu-all-temp":
title = "GPU Temperature"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case strings.HasPrefix(path, "gpu/"):
rest := strings.TrimPrefix(path, "gpu/")
sub := ""
if i := strings.LastIndex(rest, "-"); i > 0 {
sub = rest[i+1:]
rest = rest[:i]
}
idx := 0
fmt.Sscanf(rest, "%d", &idx)
switch sub {
case "load":
title = fmt.Sprintf("GPU %d Load", idx)
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
if util == nil && mem == nil {
return nil, nil, nil, "", nil, nil, false
}
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
names = []string{"Load %", "Mem %"}
yMin = floatPtr(0)
yMax = floatPtr(100)
case "temp":
title = fmt.Sprintf("GPU %d Temperature", idx)
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
if temp == nil {
return nil, nil, nil, "", nil, nil, false
}
datasets = [][]float64{temp}
names = []string{"Temp °C"}
yMin = floatPtr(0)
yMax = autoMax120(temp)
default:
title = fmt.Sprintf("GPU %d Power", idx)
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
if power == nil {
return nil, nil, nil, "", nil, nil, false
}
datasets = [][]float64{power}
names = []string{"Power W"}
yMin, yMax = autoBounds120(power)
}
default:
return nil, nil, nil, "", nil, nil, false
}
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
}
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
labels := make([]string, len(samples))
if len(samples) == 0 {
return labels
}
sameDay := true
first := samples[0].Timestamp.Local()
for _, s := range samples {
ts := s.Timestamp.Local()
if ts.Year() != first.Year() || ts.YearDay() != first.YearDay() {
sameDay = false
break
}
}
for i, s := range samples {
ts := s.Timestamp.Local()
if sameDay {
labels[i] = ts.Format("15:04")
} else {
labels[i] = ts.Format("01-02 15:04")
}
}
return labels
}
func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]float64, []string) {
seen := map[string]bool{}
var names []string
for _, s := range samples {
for _, t := range s.Temps {
if t.Group == group && !seen[t.Name] {
seen[t.Name] = true
names = append(names, t.Name)
}
}
}
datasets := make([][]float64, 0, len(names))
for _, name := range names {
ds := make([]float64, len(samples))
for i, s := range samples {
for _, t := range s.Temps {
if t.Group == group && t.Name == name {
ds[i] = t.Celsius
break
}
}
}
datasets = append(datasets, ds)
}
return datasets, names
}
func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []string) {
seen := map[string]bool{}
var names []string
for _, s := range samples {
for _, f := range s.Fans {
if !seen[f.Name] {
seen[f.Name] = true
names = append(names, f.Name)
}
}
}
datasets := make([][]float64, 0, len(names))
for _, name := range names {
ds := make([]float64, len(samples))
for i, s := range samples {
for _, f := range s.Fans {
if f.Name == name {
ds[i] = f.RPM
break
}
}
}
datasets = append(datasets, ds)
}
return datasets, names
}
func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetricRow) float64) ([][]float64, []string) {
seen := map[int]bool{}
var indices []int
for _, s := range samples {
for _, g := range s.GPUs {
if !seen[g.GPUIndex] {
seen[g.GPUIndex] = true
indices = append(indices, g.GPUIndex)
}
}
}
datasets := make([][]float64, 0, len(indices))
names := make([]string, 0, len(indices))
for _, idx := range indices {
ds := gpuDatasetByIndex(samples, idx, pick)
if ds == nil {
continue
}
datasets = append(datasets, ds)
names = append(names, fmt.Sprintf("GPU %d", idx))
}
return datasets, names
}
func gpuDatasetByIndex(samples []platform.LiveMetricSample, idx int, pick func(platform.GPUMetricRow) float64) []float64 {
found := false
ds := make([]float64, len(samples))
for i, s := range samples {
for _, g := range s.GPUs {
if g.GPUIndex == idx {
ds[i] = pick(g)
found = true
break
}
}
}
if !found {
return nil
}
return ds
}
func coalesceDataset(ds []float64, n int) []float64 {
if ds != nil {
return ds
}
return make([]float64, n)
}
// floatPtr returns a pointer to a float64 value.
func floatPtr(v float64) *float64 { return &v }
@@ -621,6 +935,47 @@ func autoMax120(datasets ...[]float64) *float64 {
return &v
}
func autoBounds120(datasets ...[]float64) (*float64, *float64) {
min := 0.0
max := 0.0
first := true
for _, ds := range datasets {
for _, v := range ds {
if first {
min, max = v, v
first = false
continue
}
if v < min {
min = v
}
if v > max {
max = v
}
}
}
if first {
return nil, nil
}
if max <= 0 {
return floatPtr(0), nil
}
span := max - min
if span <= 0 {
span = max * 0.1
if span <= 0 {
span = 1
}
}
pad := span * 0.2
low := min - pad
if low < 0 {
low = 0
}
high := max + pad
return floatPtr(low), floatPtr(high)
}
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
n := len(labels)

View File

@@ -7,6 +7,9 @@ import (
"path/filepath"
"strings"
"testing"
"time"
"bee/audit/internal/platform"
)
func TestChartLegendNumber(t *testing.T) {
@@ -31,6 +34,61 @@ func TestChartLegendNumber(t *testing.T) {
}
}
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
samples := []platform.LiveMetricSample{
{
Timestamp: time.Now().Add(-3 * time.Minute),
CPULoadPct: 10,
MemLoadPct: 20,
PowerW: 300,
GPUs: []platform.GPUMetricRow{
{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
},
},
{
Timestamp: time.Now().Add(-2 * time.Minute),
CPULoadPct: 30,
MemLoadPct: 40,
PowerW: 320,
GPUs: []platform.GPUMetricRow{
{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
},
},
{
Timestamp: time.Now().Add(-1 * time.Minute),
CPULoadPct: 50,
MemLoadPct: 60,
PowerW: 340,
GPUs: []platform.GPUMetricRow{
{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
},
},
}
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
if !ok {
t.Fatal("chartDataFromSamples returned ok=false")
}
if title != "GPU Power" {
t.Fatalf("title=%q", title)
}
if len(names) != 1 || names[0] != "GPU 0" {
t.Fatalf("names=%v", names)
}
if len(labels) != len(samples) {
t.Fatalf("labels len=%d want %d", len(labels), len(samples))
}
if len(datasets) != 1 || len(datasets[0]) != len(samples) {
t.Fatalf("datasets shape=%v", datasets)
}
if got := datasets[0][0]; got != 120 {
t.Fatalf("datasets[0][0]=%v want 120", got)
}
if got := datasets[0][2]; got != 130 {
t.Fatalf("datasets[0][2]=%v want 130", got)
}
}
func TestRootRendersDashboard(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")

View File

@@ -30,6 +30,8 @@ var taskNames = map[string]string{
"storage": "Storage SAT",
"cpu": "CPU SAT",
"amd": "AMD GPU SAT",
"amd-mem": "AMD GPU MEM Integrity",
"amd-bandwidth": "AMD GPU MEM Bandwidth",
"amd-stress": "AMD GPU Burn-in",
"memory-stress": "Memory Burn-in",
"sat-stress": "SAT Stress (stressapptest)",
@@ -124,6 +126,12 @@ var (
runAMDAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
}
runAMDMemIntegrityPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
return a.RunAMDMemIntegrityPackCtx(ctx, baseDir, logFunc)
}
runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
}
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
}
@@ -380,6 +388,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
case "amd":
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
case "amd-mem":
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
case "amd-bandwidth":
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
case "amd-stress":
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {