feat(metrics): persist history in sqlite and add AMD memory validate tests

This commit is contained in:
2026-03-29 12:28:06 +03:00
parent 98f0cf0d52
commit e15bcc91c5
10 changed files with 539 additions and 29 deletions

View File

@@ -132,6 +132,8 @@ type handler struct {
// per-GPU rings (index = GPU index)
gpuRings []*gpuRings
ringsMu sync.Mutex
latestMu sync.RWMutex
latest *platform.LiveMetricSample
// metrics persistence (nil if DB unavailable)
metricsDB *MetricsDB
// install job (at most one at a time)
@@ -164,13 +166,16 @@ func NewHandler(opts HandlerOptions) http.Handler {
// Open metrics DB and pre-fill ring buffers from history.
if db, err := openMetricsDB(metricsDBPath); err == nil {
h.metricsDB = db
db.Prune(metricsKeepDuration)
if samples, err := db.LoadRecent(120); err == nil {
for _, s := range samples {
h.feedRings(s)
}
if len(samples) > 0 {
h.setLatestMetric(samples[len(samples)-1])
}
}
}
h.startMetricsCollector()
globalQueue.startWorker(&opts)
mux := http.NewServeMux()
@@ -198,6 +203,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
mux.HandleFunc("POST /api/sat/amd-bandwidth/run", h.handleAPISATRun("amd-bandwidth"))
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
@@ -260,6 +267,37 @@ func NewHandler(opts HandlerOptions) http.Handler {
return mux
}
func (h *handler) startMetricsCollector() {
go func() {
ticker := time.NewTicker(1 * time.Second)
defer ticker.Stop()
for range ticker.C {
sample := platform.SampleLiveMetrics()
h.feedRings(sample)
h.setLatestMetric(sample)
if h.metricsDB != nil {
_ = h.metricsDB.Write(sample)
}
}
}()
}
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
h.latestMu.Lock()
defer h.latestMu.Unlock()
cp := sample
h.latest = &cp
}
func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
h.latestMu.RLock()
defer h.latestMu.RUnlock()
if h.latest == nil {
return platform.LiveMetricSample{}, false
}
return *h.latest, true
}
// ListenAndServe starts the HTTP server.
func ListenAndServe(addr string, opts HandlerOptions) error {
return http.ListenAndServe(addr, NewHandler(opts))
@@ -387,6 +425,20 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
path = strings.TrimSuffix(path, ".svg")
if h.metricsDB != nil {
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "image/svg+xml")
w.Header().Set("Cache-Control", "no-store")
_, _ = w.Write(buf)
return
}
}
var datasets [][]float64
var names []string
var labels []string
@@ -601,6 +653,268 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf)
}
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
samples, err := h.metricsDB.LoadAll()
if err != nil || len(samples) == 0 {
return nil, nil, nil, "", nil, nil, false
}
return chartDataFromSamples(path, samples)
}
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
var datasets [][]float64
var names []string
var title string
var yMin, yMax *float64
labels := sampleTimeLabels(samples)
switch {
case path == "server-load":
title = "CPU / Memory Load"
cpu := make([]float64, len(samples))
mem := make([]float64, len(samples))
for i, s := range samples {
cpu[i] = s.CPULoadPct
mem[i] = s.MemLoadPct
}
datasets = [][]float64{cpu, mem}
names = []string{"CPU Load %", "Mem Load %"}
yMin = floatPtr(0)
yMax = floatPtr(100)
case path == "server-temp", path == "server-temp-cpu":
title = "CPU Temperature"
datasets, names = namedTempDatasets(samples, "cpu")
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case path == "server-temp-gpu":
title = "GPU Temperature"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case path == "server-temp-ambient":
title = "Ambient / Other Sensors"
datasets, names = namedTempDatasets(samples, "ambient")
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case path == "server-power":
title = "System Power"
power := make([]float64, len(samples))
for i, s := range samples {
power[i] = s.PowerW
}
datasets = [][]float64{power}
names = []string{"Power W"}
yMin, yMax = autoBounds120(power)
case path == "server-fans":
title = "Fan RPM"
datasets, names = namedFanDatasets(samples)
yMin, yMax = autoBounds120(datasets...)
case path == "gpu-all-load":
title = "GPU Compute Load"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
yMin = floatPtr(0)
yMax = floatPtr(100)
case path == "gpu-all-memload":
title = "GPU Memory Load"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
yMin = floatPtr(0)
yMax = floatPtr(100)
case path == "gpu-all-power":
title = "GPU Power"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.PowerW })
yMin, yMax = autoBounds120(datasets...)
case path == "gpu-all-temp":
title = "GPU Temperature"
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case strings.HasPrefix(path, "gpu/"):
rest := strings.TrimPrefix(path, "gpu/")
sub := ""
if i := strings.LastIndex(rest, "-"); i > 0 {
sub = rest[i+1:]
rest = rest[:i]
}
idx := 0
fmt.Sscanf(rest, "%d", &idx)
switch sub {
case "load":
title = fmt.Sprintf("GPU %d Load", idx)
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
if util == nil && mem == nil {
return nil, nil, nil, "", nil, nil, false
}
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
names = []string{"Load %", "Mem %"}
yMin = floatPtr(0)
yMax = floatPtr(100)
case "temp":
title = fmt.Sprintf("GPU %d Temperature", idx)
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
if temp == nil {
return nil, nil, nil, "", nil, nil, false
}
datasets = [][]float64{temp}
names = []string{"Temp °C"}
yMin = floatPtr(0)
yMax = autoMax120(temp)
default:
title = fmt.Sprintf("GPU %d Power", idx)
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
if power == nil {
return nil, nil, nil, "", nil, nil, false
}
datasets = [][]float64{power}
names = []string{"Power W"}
yMin, yMax = autoBounds120(power)
}
default:
return nil, nil, nil, "", nil, nil, false
}
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
}
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
labels := make([]string, len(samples))
if len(samples) == 0 {
return labels
}
sameDay := true
first := samples[0].Timestamp.Local()
for _, s := range samples {
ts := s.Timestamp.Local()
if ts.Year() != first.Year() || ts.YearDay() != first.YearDay() {
sameDay = false
break
}
}
for i, s := range samples {
ts := s.Timestamp.Local()
if sameDay {
labels[i] = ts.Format("15:04")
} else {
labels[i] = ts.Format("01-02 15:04")
}
}
return labels
}
func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]float64, []string) {
seen := map[string]bool{}
var names []string
for _, s := range samples {
for _, t := range s.Temps {
if t.Group == group && !seen[t.Name] {
seen[t.Name] = true
names = append(names, t.Name)
}
}
}
datasets := make([][]float64, 0, len(names))
for _, name := range names {
ds := make([]float64, len(samples))
for i, s := range samples {
for _, t := range s.Temps {
if t.Group == group && t.Name == name {
ds[i] = t.Celsius
break
}
}
}
datasets = append(datasets, ds)
}
return datasets, names
}
func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []string) {
seen := map[string]bool{}
var names []string
for _, s := range samples {
for _, f := range s.Fans {
if !seen[f.Name] {
seen[f.Name] = true
names = append(names, f.Name)
}
}
}
datasets := make([][]float64, 0, len(names))
for _, name := range names {
ds := make([]float64, len(samples))
for i, s := range samples {
for _, f := range s.Fans {
if f.Name == name {
ds[i] = f.RPM
break
}
}
}
datasets = append(datasets, ds)
}
return datasets, names
}
func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetricRow) float64) ([][]float64, []string) {
seen := map[int]bool{}
var indices []int
for _, s := range samples {
for _, g := range s.GPUs {
if !seen[g.GPUIndex] {
seen[g.GPUIndex] = true
indices = append(indices, g.GPUIndex)
}
}
}
datasets := make([][]float64, 0, len(indices))
names := make([]string, 0, len(indices))
for _, idx := range indices {
ds := gpuDatasetByIndex(samples, idx, pick)
if ds == nil {
continue
}
datasets = append(datasets, ds)
names = append(names, fmt.Sprintf("GPU %d", idx))
}
return datasets, names
}
func gpuDatasetByIndex(samples []platform.LiveMetricSample, idx int, pick func(platform.GPUMetricRow) float64) []float64 {
found := false
ds := make([]float64, len(samples))
for i, s := range samples {
for _, g := range s.GPUs {
if g.GPUIndex == idx {
ds[i] = pick(g)
found = true
break
}
}
}
if !found {
return nil
}
return ds
}
func coalesceDataset(ds []float64, n int) []float64 {
if ds != nil {
return ds
}
return make([]float64, n)
}
// floatPtr returns a pointer to a float64 value.
func floatPtr(v float64) *float64 { return &v }
@@ -621,6 +935,47 @@ func autoMax120(datasets ...[]float64) *float64 {
return &v
}
func autoBounds120(datasets ...[]float64) (*float64, *float64) {
min := 0.0
max := 0.0
first := true
for _, ds := range datasets {
for _, v := range ds {
if first {
min, max = v, v
first = false
continue
}
if v < min {
min = v
}
if v > max {
max = v
}
}
}
if first {
return nil, nil
}
if max <= 0 {
return floatPtr(0), nil
}
span := max - min
if span <= 0 {
span = max * 0.1
if span <= 0 {
span = 1
}
}
pad := span * 0.2
low := min - pad
if low < 0 {
low = 0
}
high := max + pad
return floatPtr(low), floatPtr(high)
}
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
n := len(labels)