feat(metrics): single chart engine + full-width stacked layout

- One engine: go-analyze/charts (grafana theme) for all live metrics
- Server chart: CPU temp, CPU load%, mem load%, power W, fan RPMs
- GPU charts: temp, load%, mem%, power W — one card per GPU, added dynamically
- Charts 1400x280px SVG, rendered at width:100% in single-column layout
- Add CPU load (from /proc/stat) and mem load (from /proc/meminfo) to LiveMetricSample
- Add GPU mem utilization to GPUMetricRow (nvidia-smi utilization.memory)
- Document charting architecture in bible-local/architecture/charting.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-27 23:26:13 +03:00
parent e7a7ff54b9
commit ec0b7f7ff9
6 changed files with 336 additions and 78 deletions

View File

@@ -13,18 +13,19 @@ import (
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test. // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
type GPUMetricRow struct { type GPUMetricRow struct {
ElapsedSec float64 ElapsedSec float64 `json:"elapsed_sec"`
GPUIndex int GPUIndex int `json:"index"`
TempC float64 TempC float64 `json:"temp_c"`
UsagePct float64 UsagePct float64 `json:"usage_pct"`
PowerW float64 MemUsagePct float64 `json:"mem_usage_pct"`
ClockMHz float64 PowerW float64 `json:"power_w"`
ClockMHz float64 `json:"clock_mhz"`
} }
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU. // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) { func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
args := []string{ args := []string{
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics", "--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
"--format=csv,noheader,nounits", "--format=csv,noheader,nounits",
} }
if len(gpuIndices) > 0 { if len(gpuIndices) > 0 {
@@ -45,7 +46,7 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
continue continue
} }
parts := strings.Split(line, ", ") parts := strings.Split(line, ", ")
if len(parts) < 5 { if len(parts) < 6 {
continue continue
} }
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0])) idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
@@ -53,8 +54,9 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
GPUIndex: idx, GPUIndex: idx,
TempC: parseGPUFloat(parts[1]), TempC: parseGPUFloat(parts[1]),
UsagePct: parseGPUFloat(parts[2]), UsagePct: parseGPUFloat(parts[2]),
PowerW: parseGPUFloat(parts[3]), MemUsagePct: parseGPUFloat(parts[3]),
ClockMHz: parseGPUFloat(parts[4]), PowerW: parseGPUFloat(parts[4]),
ClockMHz: parseGPUFloat(parts[5]),
}) })
} }
return rows, nil return rows, nil

View File

@@ -1,6 +1,12 @@
package platform package platform
import "time" import (
"bufio"
"os"
"strconv"
"strings"
"time"
)
// LiveMetricSample is a single point-in-time snapshot of server metrics // LiveMetricSample is a single point-in-time snapshot of server metrics
// collected for the web UI metrics page. // collected for the web UI metrics page.
@@ -9,6 +15,8 @@ type LiveMetricSample struct {
Fans []FanReading `json:"fans"` Fans []FanReading `json:"fans"`
Temps []TempReading `json:"temps"` Temps []TempReading `json:"temps"`
PowerW float64 `json:"power_w"` PowerW float64 `json:"power_w"`
CPULoadPct float64 `json:"cpu_load_pct"`
MemLoadPct float64 `json:"mem_load_pct"`
GPUs []GPUMetricRow `json:"gpus"` GPUs []GPUMetricRow `json:"gpus"`
} }
@@ -41,5 +49,91 @@ func SampleLiveMetrics() LiveMetricSample {
// System power — returns 0 if unavailable // System power — returns 0 if unavailable
s.PowerW = sampleSystemPower() s.PowerW = sampleSystemPower()
// CPU load — from /proc/stat
s.CPULoadPct = sampleCPULoadPct()
// Memory load — from /proc/meminfo
s.MemLoadPct = sampleMemLoadPct()
return s return s
} }
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
// the overall CPU utilisation percentage.
var cpuStatPrev [2]uint64 // [total, idle]
func sampleCPULoadPct() float64 {
total, idle := readCPUStat()
if total == 0 {
return 0
}
prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
cpuStatPrev = [2]uint64{total, idle}
if prevTotal == 0 {
return 0
}
dt := float64(total - prevTotal)
di := float64(idle - prevIdle)
if dt <= 0 {
return 0
}
pct := (1 - di/dt) * 100
if pct < 0 {
return 0
}
if pct > 100 {
return 100
}
return pct
}
func readCPUStat() (total, idle uint64) {
f, err := os.Open("/proc/stat")
if err != nil {
return 0, 0
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
line := sc.Text()
if !strings.HasPrefix(line, "cpu ") {
continue
}
fields := strings.Fields(line)[1:] // skip "cpu"
var vals [10]uint64
for i := 0; i < len(fields) && i < 10; i++ {
vals[i], _ = strconv.ParseUint(fields[i], 10, 64)
}
// idle = idle + iowait
idle = vals[3] + vals[4]
for _, v := range vals {
total += v
}
return total, idle
}
return 0, 0
}
func sampleMemLoadPct() float64 {
f, err := os.Open("/proc/meminfo")
if err != nil {
return 0
}
defer f.Close()
vals := map[string]uint64{}
sc := bufio.NewScanner(f)
for sc.Scan() {
fields := strings.Fields(sc.Text())
if len(fields) >= 2 {
v, _ := strconv.ParseUint(fields[1], 10, 64)
vals[strings.TrimSuffix(fields[0], ":")] = v
}
}
total := vals["MemTotal"]
avail := vals["MemAvailable"]
if total == 0 {
return 0
}
used := total - avail
return float64(used) / float64(total) * 100
}

View File

@@ -424,7 +424,7 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
case <-ticker.C: case <-ticker.C:
sample := platform.SampleLiveMetrics() sample := platform.SampleLiveMetrics()
// Feed ring buffers for server-side SVG charts // Feed server ring buffers
for _, t := range sample.Temps { for _, t := range sample.Temps {
if t.Name == "CPU" { if t.Name == "CPU" {
h.ringCPUTemp.push(t.Celsius) h.ringCPUTemp.push(t.Celsius)
@@ -432,6 +432,35 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
} }
} }
h.ringPower.push(sample.PowerW) h.ringPower.push(sample.PowerW)
h.ringCPULoad.push(sample.CPULoadPct)
h.ringMemLoad.push(sample.MemLoadPct)
// Feed fan ring buffers (grow on first sight)
h.ringsMu.Lock()
for i, fan := range sample.Fans {
for len(h.ringFans) <= i {
h.ringFans = append(h.ringFans, newMetricsRing(120))
h.fanNames = append(h.fanNames, fan.Name)
}
h.ringFans[i].push(float64(fan.RPM))
}
// Feed per-GPU ring buffers (grow on first sight)
for _, gpu := range sample.GPUs {
idx := gpu.GPUIndex
for len(h.gpuRings) <= idx {
h.gpuRings = append(h.gpuRings, &gpuRings{
Temp: newMetricsRing(120),
Util: newMetricsRing(120),
MemUtil: newMetricsRing(120),
Power: newMetricsRing(120),
})
}
h.gpuRings[idx].Temp.push(gpu.TempC)
h.gpuRings[idx].Util.push(gpu.UsagePct)
h.gpuRings[idx].MemUtil.push(gpu.MemUsagePct)
h.gpuRings[idx].Power.push(gpu.PowerW)
}
h.ringsMu.Unlock()
b, err := json.Marshal(sample) b, err := json.Marshal(sample)
if err != nil { if err != nil {

View File

@@ -242,28 +242,27 @@ func renderHealthCard(opts HandlerOptions) string {
// ── Metrics ─────────────────────────────────────────────────────────────────── // ── Metrics ───────────────────────────────────────────────────────────────────
func renderMetrics() string { func renderMetrics() string {
return `<p style="color:#64748b;font-size:13px;margin-bottom:16px">Live server metrics, charts updated every 2 seconds.</p> return `<p style="color:#64748b;font-size:13px;margin-bottom:16px">Live metrics — updated every 2 seconds. Charts use go-analyze/charts (grafana theme).</p>
<div class="grid2">
<div class="card"> <div class="card" style="margin-bottom:16px">
<div class="card-head">System</div> <div class="card-head">Server</div>
<div class="card-body"> <div class="card-body" style="padding:8px">
<img id="chart-cpu-temp" src="/api/metrics/chart/cpu-temp.svg" style="width:100%;border-radius:6px" alt="CPU Temp"> <img id="chart-server" src="/api/metrics/chart/server.svg" style="width:100%;display:block;border-radius:6px" alt="Server metrics">
<img id="chart-power" src="/api/metrics/chart/power.svg" style="width:100%;border-radius:6px;margin-top:8px" alt="Power"> <div id="sys-table" style="margin-top:8px;font-size:12px"></div>
<div id="sys-table" style="margin-top:8px"></div>
</div>
</div>
<div class="card">
<div class="card-head">GPU</div>
<div class="card-body">
<div id="gpu-table"><p style="color:#64748b;font-size:12px">Waiting for data...</p></div>
</div>
</div> </div>
</div> </div>
<div id="gpu-charts"></div>
<script> <script>
let knownGPUs = [];
function refreshCharts() { function refreshCharts() {
const t = '?t=' + Date.now(); const t = '?t=' + Date.now();
['chart-cpu-temp','chart-power'].forEach(id => { const srv = document.getElementById('chart-server');
const el = document.getElementById(id); if (srv) srv.src = srv.src.split('?')[0] + t;
knownGPUs.forEach(idx => {
const el = document.getElementById('chart-gpu-' + idx);
if (el) el.src = el.src.split('?')[0] + t; if (el) el.src = el.src.split('?')[0] + t;
}); });
} }
@@ -272,21 +271,42 @@ setInterval(refreshCharts, 2000);
const es = new EventSource('/api/metrics/stream'); const es = new EventSource('/api/metrics/stream');
es.addEventListener('metrics', e => { es.addEventListener('metrics', e => {
const d = JSON.parse(e.data); const d = JSON.parse(e.data);
const gpuRows = (d.gpus||[]).map(g =>
'<tr><td>GPU '+g.index+'</td><td>'+g.temp_c+'°C</td><td>'+g.usage_pct+'%</td><td>'+g.power_w+'W</td><td>'+g.clock_mhz+'MHz</td></tr>'
).join('');
document.getElementById('gpu-table').innerHTML = gpuRows ?
'<table><tr><th>GPU</th><th>Temp</th><th>Usage</th><th>Power</th><th>Clock</th></tr>'+gpuRows+'</table>' :
'<p style="color:#64748b;font-size:12px">No NVIDIA GPU detected</p>';
// Add GPU chart cards as GPUs appear
(d.gpus||[]).forEach(g => {
if (knownGPUs.includes(g.index)) return;
knownGPUs.push(g.index);
const div = document.createElement('div');
div.className = 'card';
div.style.marginBottom = '16px';
div.innerHTML = '<div class="card-head">GPU ' + g.index + '</div>' +
'<div class="card-body" style="padding:8px">' +
'<img id="chart-gpu-' + g.index + '" src="/api/metrics/chart/gpu/' + g.index + '.svg" style="width:100%;display:block;border-radius:6px" alt="GPU ' + g.index + '">' +
'<div id="gpu-table-' + g.index + '" style="margin-top:8px;font-size:12px"></div>' +
'</div>';
document.getElementById('gpu-charts').appendChild(div);
});
// Update numeric tables
let sysHTML = ''; let sysHTML = '';
const cpuTemp = (d.temps||[]).find(t => t.name==='CPU'); const cpuTemp = (d.temps||[]).find(t => t.name==='CPU');
if (cpuTemp) sysHTML += '<tr><td>CPU Temp</td><td>'+cpuTemp.celsius.toFixed(1)+'°C</td></tr>'; if (cpuTemp) sysHTML += '<tr><td>CPU Temp</td><td>'+cpuTemp.celsius.toFixed(1)+'°C</td></tr>';
if (d.cpu_load_pct) sysHTML += '<tr><td>CPU Load</td><td>'+d.cpu_load_pct.toFixed(1)+'%</td></tr>';
if (d.mem_load_pct) sysHTML += '<tr><td>Mem Load</td><td>'+d.mem_load_pct.toFixed(1)+'%</td></tr>';
(d.fans||[]).forEach(f => sysHTML += '<tr><td>'+f.name+'</td><td>'+f.rpm+' RPM</td></tr>'); (d.fans||[]).forEach(f => sysHTML += '<tr><td>'+f.name+'</td><td>'+f.rpm+' RPM</td></tr>');
if (d.power_w) sysHTML += '<tr><td>System Power</td><td>'+d.power_w.toFixed(0)+'W</td></tr>'; if (d.power_w) sysHTML += '<tr><td>Power</td><td>'+d.power_w.toFixed(0)+' W</td></tr>';
document.getElementById('sys-table').innerHTML = sysHTML ? const st = document.getElementById('sys-table');
'<table>'+sysHTML+'</table>' : if (st) st.innerHTML = sysHTML ? '<table>'+sysHTML+'</table>' : '<p style="color:#64748b">No sensor data (ipmitool/sensors required)</p>';
'<p style="color:#64748b;font-size:12px">No sensor data (ipmitool/sensors required)</p>';
(d.gpus||[]).forEach(g => {
const t = document.getElementById('gpu-table-' + g.index);
if (!t) return;
t.innerHTML = '<table>' +
'<tr><td>Temp</td><td>'+g.temp_c+'°C</td>' +
'<td>Load</td><td>'+g.usage_pct+'%</td>' +
'<td>Mem</td><td>'+g.mem_usage_pct+'%</td>' +
'<td>Power</td><td>'+g.power_w+' W</td></tr></table>';
});
}); });
es.onerror = () => {}; es.onerror = () => {};
</script>` </script>`

View File

@@ -62,15 +62,27 @@ func (r *metricsRing) snapshot() ([]float64, []string) {
return v, l return v, l
} }
// gpuRings holds per-GPU ring buffers.
type gpuRings struct {
Temp *metricsRing
Util *metricsRing
MemUtil *metricsRing
Power *metricsRing
}
// handler is the HTTP handler for the web UI. // handler is the HTTP handler for the web UI.
type handler struct { type handler struct {
opts HandlerOptions opts HandlerOptions
mux *http.ServeMux mux *http.ServeMux
// server rings
ringCPUTemp *metricsRing ringCPUTemp *metricsRing
ringCPULoad *metricsRing
ringMemLoad *metricsRing
ringPower *metricsRing ringPower *metricsRing
ringFans []*metricsRing ringFans []*metricsRing
ringGPUTemp []*metricsRing fanNames []string
ringGPUUtil []*metricsRing // per-GPU rings (index = GPU index)
gpuRings []*gpuRings
ringsMu sync.Mutex ringsMu sync.Mutex
} }
@@ -89,6 +101,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
h := &handler{ h := &handler{
opts: opts, opts: opts,
ringCPUTemp: newMetricsRing(120), ringCPUTemp: newMetricsRing(120),
ringCPULoad: newMetricsRing(120),
ringMemLoad: newMetricsRing(120),
ringPower: newMetricsRing(120), ringPower: newMetricsRing(120),
} }
mux := http.NewServeMux() mux := http.NewServeMux()
@@ -244,48 +258,88 @@ func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) {
} }
func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request) { func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request) {
name := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/") path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
name = strings.TrimSuffix(name, ".svg") path = strings.TrimSuffix(path, ".svg")
var datasets [][]float64
var names []string
var labels []string
var title string
switch {
case path == "server":
title = "Server"
vCPUTemp, l := h.ringCPUTemp.snapshot()
vCPULoad, _ := h.ringCPULoad.snapshot()
vMemLoad, _ := h.ringMemLoad.snapshot()
vPower, _ := h.ringPower.snapshot()
labels = l
datasets = [][]float64{vCPUTemp, vCPULoad, vMemLoad, vPower}
names = []string{"CPU Temp °C", "CPU Load %", "Mem Load %", "Power W"}
h.ringsMu.Lock()
for i, fr := range h.ringFans {
fv, _ := fr.snapshot()
datasets = append(datasets, fv)
name := "Fan"
if i < len(h.fanNames) {
name = h.fanNames[i]
}
names = append(names, name+" RPM")
}
h.ringsMu.Unlock()
case strings.HasPrefix(path, "gpu/"):
idxStr := strings.TrimPrefix(path, "gpu/")
idx := 0
fmt.Sscanf(idxStr, "%d", &idx)
h.ringsMu.Lock()
var gr *gpuRings
if idx < len(h.gpuRings) {
gr = h.gpuRings[idx]
}
h.ringsMu.Unlock()
if gr == nil {
http.NotFound(w, r)
return
}
vTemp, l := gr.Temp.snapshot()
vUtil, _ := gr.Util.snapshot()
vMemUtil, _ := gr.MemUtil.snapshot()
vPower, _ := gr.Power.snapshot()
labels = l
title = fmt.Sprintf("GPU %d", idx)
datasets = [][]float64{vTemp, vUtil, vMemUtil, vPower}
names = []string{"Temp °C", "Load %", "Mem %", "Power W"}
var ring *metricsRing
var title, unit string
switch name {
case "cpu-temp":
ring, title, unit = h.ringCPUTemp, "CPU Temperature", "°C"
case "power":
ring, title, unit = h.ringPower, "System Power", "W"
default: default:
http.NotFound(w, r) http.NotFound(w, r)
return return
} }
vals, labels := ring.snapshot() // Ensure all datasets same length as labels
if len(vals) == 0 { n := len(labels)
vals = []float64{0} if n == 0 {
n = 1
labels = []string{""} labels = []string{""}
} }
for i := range datasets {
// Sparse x-axis labels if len(datasets[i]) == 0 {
sparse := make([]string, len(labels)) datasets[i] = make([]float64, n)
step := len(labels) / 6
if step < 1 {
step = 1
}
for i := range labels {
if i%step == 0 {
sparse[i] = labels[i]
} }
} }
opt := gocharts.NewLineChartOptionWithData([][]float64{vals}) sparse := sparseLabels(labels, 6)
opt.Title = gocharts.TitleOption{Text: title + " (" + unit + ")"}
opt := gocharts.NewLineChartOptionWithData(datasets)
opt.Title = gocharts.TitleOption{Text: title}
opt.XAxis.Labels = sparse opt.XAxis.Labels = sparse
opt.Legend = gocharts.LegendOption{Show: gocharts.Ptr(false)} opt.Legend = gocharts.LegendOption{SeriesNames: names}
p := gocharts.NewPainter(gocharts.PainterOptions{ p := gocharts.NewPainter(gocharts.PainterOptions{
OutputFormat: gocharts.ChartOutputSVG, OutputFormat: gocharts.ChartOutputSVG,
Width: 600, Width: 1400,
Height: 180, Height: 280,
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana"))) }, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
if err := p.LineChart(opt); err != nil { if err := p.LineChart(opt); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError) http.Error(w, err.Error(), http.StatusInternalServerError)
@@ -301,6 +355,27 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf) _, _ = w.Write(buf)
} }
func safeIdx(s []float64, i int) float64 {
if i < len(s) {
return s[i]
}
return 0
}
func sparseLabels(labels []string, n int) []string {
out := make([]string, len(labels))
step := len(labels) / n
if step < 1 {
step = 1
}
for i, l := range labels {
if i%step == 0 {
out[i] = l
}
}
return out
}
// ── Page handler ───────────────────────────────────────────────────────────── // ── Page handler ─────────────────────────────────────────────────────────────
func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) { func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {

View File

@@ -0,0 +1,38 @@
# Charting architecture
## Decision: one chart engine for all live metrics
**Engine:** `github.com/go-analyze/charts` (pure Go, no CGO, SVG output)
**Theme:** `grafana` (dark background, coloured lines)
All live metrics charts in the web UI are server-side SVG images served by Go
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
There is no client-side canvas or JS chart library.
### Why go-analyze/charts
- Pure Go, no CGO — builds cleanly inside the live-build container
- SVG output — crisp at any display resolution, full-width without pixelation
- Grafana theme matches the dark web UI colour scheme
- Active fork of the archived wcharczuk/go-chart
### SAT stress-test charts
The `drawGPUChartSVG` function in `platform/gpu_metrics.go` is a separate
self-contained SVG renderer used **only** for completed SAT run reports
(HTML export, burn-in summaries). It is not used for live metrics.
### Live metrics chart endpoints
| Path | Content |
|------|---------|
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a
single-column layout so they always fill the viewport width.
### Ring buffers
Each metric is stored in a 120-sample ring buffer (2 minutes of history at 1 Hz).
Buffers are per-server or per-GPU and grow dynamically as new GPUs appear.