feat(metrics): persist history in sqlite and add AMD memory validate tests
This commit is contained in:
@@ -132,6 +132,8 @@ type handler struct {
|
||||
// per-GPU rings (index = GPU index)
|
||||
gpuRings []*gpuRings
|
||||
ringsMu sync.Mutex
|
||||
latestMu sync.RWMutex
|
||||
latest *platform.LiveMetricSample
|
||||
// metrics persistence (nil if DB unavailable)
|
||||
metricsDB *MetricsDB
|
||||
// install job (at most one at a time)
|
||||
@@ -164,13 +166,16 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Open metrics DB and pre-fill ring buffers from history.
|
||||
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
||||
h.metricsDB = db
|
||||
db.Prune(metricsKeepDuration)
|
||||
if samples, err := db.LoadRecent(120); err == nil {
|
||||
for _, s := range samples {
|
||||
h.feedRings(s)
|
||||
}
|
||||
if len(samples) > 0 {
|
||||
h.setLatestMetric(samples[len(samples)-1])
|
||||
}
|
||||
}
|
||||
}
|
||||
h.startMetricsCollector()
|
||||
|
||||
globalQueue.startWorker(&opts)
|
||||
mux := http.NewServeMux()
|
||||
@@ -198,6 +203,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
||||
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
|
||||
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
|
||||
mux.HandleFunc("POST /api/sat/amd-bandwidth/run", h.handleAPISATRun("amd-bandwidth"))
|
||||
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
|
||||
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
|
||||
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
|
||||
@@ -260,6 +267,37 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
return mux
|
||||
}
|
||||
|
||||
func (h *handler) startMetricsCollector() {
|
||||
go func() {
|
||||
ticker := time.NewTicker(1 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
sample := platform.SampleLiveMetrics()
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
||||
h.latestMu.Lock()
|
||||
defer h.latestMu.Unlock()
|
||||
cp := sample
|
||||
h.latest = &cp
|
||||
}
|
||||
|
||||
func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
|
||||
h.latestMu.RLock()
|
||||
defer h.latestMu.RUnlock()
|
||||
if h.latest == nil {
|
||||
return platform.LiveMetricSample{}, false
|
||||
}
|
||||
return *h.latest, true
|
||||
}
|
||||
|
||||
// ListenAndServe starts the HTTP server.
|
||||
func ListenAndServe(addr string, opts HandlerOptions) error {
|
||||
return http.ListenAndServe(addr, NewHandler(opts))
|
||||
@@ -387,6 +425,20 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
||||
path = strings.TrimSuffix(path, ".svg")
|
||||
|
||||
if h.metricsDB != nil {
|
||||
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
|
||||
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "image/svg+xml")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(buf)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var labels []string
|
||||
@@ -601,6 +653,268 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
_, _ = w.Write(buf)
|
||||
}
|
||||
|
||||
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||
samples, err := h.metricsDB.LoadAll()
|
||||
if err != nil || len(samples) == 0 {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
return chartDataFromSamples(path, samples)
|
||||
}
|
||||
|
||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var title string
|
||||
var yMin, yMax *float64
|
||||
labels := sampleTimeLabels(samples)
|
||||
|
||||
switch {
|
||||
case path == "server-load":
|
||||
title = "CPU / Memory Load"
|
||||
cpu := make([]float64, len(samples))
|
||||
mem := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
cpu[i] = s.CPULoadPct
|
||||
mem[i] = s.MemLoadPct
|
||||
}
|
||||
datasets = [][]float64{cpu, mem}
|
||||
names = []string{"CPU Load %", "Mem Load %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "server-temp", path == "server-temp-cpu":
|
||||
title = "CPU Temperature"
|
||||
datasets, names = namedTempDatasets(samples, "cpu")
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-temp-gpu":
|
||||
title = "GPU Temperature"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-temp-ambient":
|
||||
title = "Ambient / Other Sensors"
|
||||
datasets, names = namedTempDatasets(samples, "ambient")
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-power":
|
||||
title = "System Power"
|
||||
power := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
power[i] = s.PowerW
|
||||
}
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
yMin, yMax = autoBounds120(power)
|
||||
|
||||
case path == "server-fans":
|
||||
title = "Fan RPM"
|
||||
datasets, names = namedFanDatasets(samples)
|
||||
yMin, yMax = autoBounds120(datasets...)
|
||||
|
||||
case path == "gpu-all-load":
|
||||
title = "GPU Compute Load"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "gpu-all-memload":
|
||||
title = "GPU Memory Load"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "gpu-all-power":
|
||||
title = "GPU Power"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
yMin, yMax = autoBounds120(datasets...)
|
||||
|
||||
case path == "gpu-all-temp":
|
||||
title = "GPU Temperature"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case strings.HasPrefix(path, "gpu/"):
|
||||
rest := strings.TrimPrefix(path, "gpu/")
|
||||
sub := ""
|
||||
if i := strings.LastIndex(rest, "-"); i > 0 {
|
||||
sub = rest[i+1:]
|
||||
rest = rest[:i]
|
||||
}
|
||||
idx := 0
|
||||
fmt.Sscanf(rest, "%d", &idx)
|
||||
switch sub {
|
||||
case "load":
|
||||
title = fmt.Sprintf("GPU %d Load", idx)
|
||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||
if util == nil && mem == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||
names = []string{"Load %", "Mem %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
case "temp":
|
||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
if temp == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
datasets = [][]float64{temp}
|
||||
names = []string{"Temp °C"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(temp)
|
||||
default:
|
||||
title = fmt.Sprintf("GPU %d Power", idx)
|
||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
if power == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
yMin, yMax = autoBounds120(power)
|
||||
}
|
||||
|
||||
default:
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
|
||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
||||
}
|
||||
|
||||
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
|
||||
labels := make([]string, len(samples))
|
||||
if len(samples) == 0 {
|
||||
return labels
|
||||
}
|
||||
sameDay := true
|
||||
first := samples[0].Timestamp.Local()
|
||||
for _, s := range samples {
|
||||
ts := s.Timestamp.Local()
|
||||
if ts.Year() != first.Year() || ts.YearDay() != first.YearDay() {
|
||||
sameDay = false
|
||||
break
|
||||
}
|
||||
}
|
||||
for i, s := range samples {
|
||||
ts := s.Timestamp.Local()
|
||||
if sameDay {
|
||||
labels[i] = ts.Format("15:04")
|
||||
} else {
|
||||
labels[i] = ts.Format("01-02 15:04")
|
||||
}
|
||||
}
|
||||
return labels
|
||||
}
|
||||
|
||||
func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]float64, []string) {
|
||||
seen := map[string]bool{}
|
||||
var names []string
|
||||
for _, s := range samples {
|
||||
for _, t := range s.Temps {
|
||||
if t.Group == group && !seen[t.Name] {
|
||||
seen[t.Name] = true
|
||||
names = append(names, t.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets := make([][]float64, 0, len(names))
|
||||
for _, name := range names {
|
||||
ds := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
for _, t := range s.Temps {
|
||||
if t.Group == group && t.Name == name {
|
||||
ds[i] = t.Celsius
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
|
||||
func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []string) {
|
||||
seen := map[string]bool{}
|
||||
var names []string
|
||||
for _, s := range samples {
|
||||
for _, f := range s.Fans {
|
||||
if !seen[f.Name] {
|
||||
seen[f.Name] = true
|
||||
names = append(names, f.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets := make([][]float64, 0, len(names))
|
||||
for _, name := range names {
|
||||
ds := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
for _, f := range s.Fans {
|
||||
if f.Name == name {
|
||||
ds[i] = f.RPM
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
|
||||
func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetricRow) float64) ([][]float64, []string) {
|
||||
seen := map[int]bool{}
|
||||
var indices []int
|
||||
for _, s := range samples {
|
||||
for _, g := range s.GPUs {
|
||||
if !seen[g.GPUIndex] {
|
||||
seen[g.GPUIndex] = true
|
||||
indices = append(indices, g.GPUIndex)
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets := make([][]float64, 0, len(indices))
|
||||
names := make([]string, 0, len(indices))
|
||||
for _, idx := range indices {
|
||||
ds := gpuDatasetByIndex(samples, idx, pick)
|
||||
if ds == nil {
|
||||
continue
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
|
||||
func gpuDatasetByIndex(samples []platform.LiveMetricSample, idx int, pick func(platform.GPUMetricRow) float64) []float64 {
|
||||
found := false
|
||||
ds := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
for _, g := range s.GPUs {
|
||||
if g.GPUIndex == idx {
|
||||
ds[i] = pick(g)
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
return nil
|
||||
}
|
||||
return ds
|
||||
}
|
||||
|
||||
// coalesceDataset returns ds unchanged unless it is nil, in which case a
// zero-filled series of length n is returned instead.
func coalesceDataset(ds []float64, n int) []float64 {
	if ds == nil {
		return make([]float64, n)
	}
	return ds
}
|
||||
|
||||
// floatPtr stores v in a freshly allocated float64 and returns its address.
func floatPtr(v float64) *float64 {
	p := new(float64)
	*p = v
	return p
}
|
||||
|
||||
@@ -621,6 +935,47 @@ func autoMax120(datasets ...[]float64) *float64 {
|
||||
return &v
|
||||
}
|
||||
|
||||
func autoBounds120(datasets ...[]float64) (*float64, *float64) {
|
||||
min := 0.0
|
||||
max := 0.0
|
||||
first := true
|
||||
for _, ds := range datasets {
|
||||
for _, v := range ds {
|
||||
if first {
|
||||
min, max = v, v
|
||||
first = false
|
||||
continue
|
||||
}
|
||||
if v < min {
|
||||
min = v
|
||||
}
|
||||
if v > max {
|
||||
max = v
|
||||
}
|
||||
}
|
||||
}
|
||||
if first {
|
||||
return nil, nil
|
||||
}
|
||||
if max <= 0 {
|
||||
return floatPtr(0), nil
|
||||
}
|
||||
span := max - min
|
||||
if span <= 0 {
|
||||
span = max * 0.1
|
||||
if span <= 0 {
|
||||
span = 1
|
||||
}
|
||||
}
|
||||
pad := span * 0.2
|
||||
low := min - pad
|
||||
if low < 0 {
|
||||
low = 0
|
||||
}
|
||||
high := max + pad
|
||||
return floatPtr(low), floatPtr(high)
|
||||
}
|
||||
|
||||
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
|
||||
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
|
||||
n := len(labels)
|
||||
|
||||
Reference in New Issue
Block a user