diff --git a/audit/internal/platform/gpu_metrics.go b/audit/internal/platform/gpu_metrics.go index b828b29..8cb1de7 100644 --- a/audit/internal/platform/gpu_metrics.go +++ b/audit/internal/platform/gpu_metrics.go @@ -13,18 +13,19 @@ import ( // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test. type GPUMetricRow struct { - ElapsedSec float64 - GPUIndex int - TempC float64 - UsagePct float64 - PowerW float64 - ClockMHz float64 + ElapsedSec float64 `json:"elapsed_sec"` + GPUIndex int `json:"index"` + TempC float64 `json:"temp_c"` + UsagePct float64 `json:"usage_pct"` + MemUsagePct float64 `json:"mem_usage_pct"` + PowerW float64 `json:"power_w"` + ClockMHz float64 `json:"clock_mhz"` } // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU. func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) { args := []string{ - "--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics", + "--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics", "--format=csv,noheader,nounits", } if len(gpuIndices) > 0 { @@ -45,16 +46,17 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) { continue } parts := strings.Split(line, ", ") - if len(parts) < 5 { + if len(parts) < 6 { continue } idx, _ := strconv.Atoi(strings.TrimSpace(parts[0])) rows = append(rows, GPUMetricRow{ - GPUIndex: idx, - TempC: parseGPUFloat(parts[1]), - UsagePct: parseGPUFloat(parts[2]), - PowerW: parseGPUFloat(parts[3]), - ClockMHz: parseGPUFloat(parts[4]), + GPUIndex: idx, + TempC: parseGPUFloat(parts[1]), + UsagePct: parseGPUFloat(parts[2]), + MemUsagePct: parseGPUFloat(parts[3]), + PowerW: parseGPUFloat(parts[4]), + ClockMHz: parseGPUFloat(parts[5]), }) } return rows, nil diff --git a/audit/internal/platform/live_metrics.go b/audit/internal/platform/live_metrics.go index 76b64a0..0b516a9 100644 --- a/audit/internal/platform/live_metrics.go +++ 
b/audit/internal/platform/live_metrics.go @@ -1,15 +1,23 @@ package platform -import "time" +import ( + "bufio" + "os" + "strconv" + "strings" + "time" +) // LiveMetricSample is a single point-in-time snapshot of server metrics // collected for the web UI metrics page. type LiveMetricSample struct { - Timestamp time.Time `json:"ts"` - Fans []FanReading `json:"fans"` - Temps []TempReading `json:"temps"` - PowerW float64 `json:"power_w"` - GPUs []GPUMetricRow `json:"gpus"` + Timestamp time.Time `json:"ts"` + Fans []FanReading `json:"fans"` + Temps []TempReading `json:"temps"` + PowerW float64 `json:"power_w"` + CPULoadPct float64 `json:"cpu_load_pct"` + MemLoadPct float64 `json:"mem_load_pct"` + GPUs []GPUMetricRow `json:"gpus"` } // TempReading is a named temperature sensor value. @@ -41,5 +49,91 @@ func SampleLiveMetrics() LiveMetricSample { // System power — returns 0 if unavailable s.PowerW = sampleSystemPower() + // CPU load — from /proc/stat + s.CPULoadPct = sampleCPULoadPct() + + // Memory load — from /proc/meminfo + s.MemLoadPct = sampleMemLoadPct() + return s } + +// sampleCPULoadPct reads one /proc/stat snapshot per call and returns the overall +// CPU utilisation percentage since the previous call (0 on the first call). 
+var cpuStatPrev [2]uint64 // [total, idle] + +func sampleCPULoadPct() float64 { + total, idle := readCPUStat() + if total == 0 { + return 0 + } + prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1] + cpuStatPrev = [2]uint64{total, idle} + if prevTotal == 0 { + return 0 + } + dt := float64(total - prevTotal) + di := float64(idle - prevIdle) + if dt <= 0 { + return 0 + } + pct := (1 - di/dt) * 100 + if pct < 0 { + return 0 + } + if pct > 100 { + return 100 + } + return pct +} + +func readCPUStat() (total, idle uint64) { + f, err := os.Open("/proc/stat") + if err != nil { + return 0, 0 + } + defer f.Close() + sc := bufio.NewScanner(f) + for sc.Scan() { + line := sc.Text() + if !strings.HasPrefix(line, "cpu ") { + continue + } + fields := strings.Fields(line)[1:] // skip "cpu" + var vals [10]uint64 + for i := 0; i < len(fields) && i < 10; i++ { + vals[i], _ = strconv.ParseUint(fields[i], 10, 64) + } + // idle = idle + iowait + idle = vals[3] + vals[4] + for _, v := range vals { + total += v + } + return total, idle + } + return 0, 0 +} + +func sampleMemLoadPct() float64 { + f, err := os.Open("/proc/meminfo") + if err != nil { + return 0 + } + defer f.Close() + vals := map[string]uint64{} + sc := bufio.NewScanner(f) + for sc.Scan() { + fields := strings.Fields(sc.Text()) + if len(fields) >= 2 { + v, _ := strconv.ParseUint(fields[1], 10, 64) + vals[strings.TrimSuffix(fields[0], ":")] = v + } + } + total := vals["MemTotal"] + avail := vals["MemAvailable"] + if total == 0 { + return 0 + } + used := total - avail + return float64(used) / float64(total) * 100 +} diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 025aad1..54f7bab 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -424,7 +424,7 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) case <-ticker.C: sample := platform.SampleLiveMetrics() - // Feed ring buffers for server-side SVG charts + // Feed server ring buffers for _, t := 
range sample.Temps { if t.Name == "CPU" { h.ringCPUTemp.push(t.Celsius) @@ -432,6 +432,35 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) } } h.ringPower.push(sample.PowerW) + h.ringCPULoad.push(sample.CPULoadPct) + h.ringMemLoad.push(sample.MemLoadPct) + + // Feed fan ring buffers (grow on first sight) + h.ringsMu.Lock() + for i, fan := range sample.Fans { + for len(h.ringFans) <= i { + h.ringFans = append(h.ringFans, newMetricsRing(120)) + h.fanNames = append(h.fanNames, fan.Name) + } + h.ringFans[i].push(float64(fan.RPM)) + } + // Feed per-GPU ring buffers (grow on first sight) + for _, gpu := range sample.GPUs { + idx := gpu.GPUIndex + for len(h.gpuRings) <= idx { + h.gpuRings = append(h.gpuRings, &gpuRings{ + Temp: newMetricsRing(120), + Util: newMetricsRing(120), + MemUtil: newMetricsRing(120), + Power: newMetricsRing(120), + }) + } + h.gpuRings[idx].Temp.push(gpu.TempC) + h.gpuRings[idx].Util.push(gpu.UsagePct) + h.gpuRings[idx].MemUtil.push(gpu.MemUsagePct) + h.gpuRings[idx].Power.push(gpu.PowerW) + } + h.ringsMu.Unlock() b, err := json.Marshal(sample) if err != nil { diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 4b59643..3ebbd68 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -242,28 +242,27 @@ func renderHealthCard(opts HandlerOptions) string { // ── Metrics ─────────────────────────────────────────────────────────────────── func renderMetrics() string { - return `

Live server metrics, charts updated every 2 seconds.

-
-
-
System
-
- CPU Temp - Power -
-
-
-
-
GPU
-
-

Waiting for data...

-
+ return `

Live metrics — updated every 2 seconds. Charts use go-analyze/charts (grafana theme).

+ +
+
Server
+
+ Server metrics +
+ +
+ ` diff --git a/audit/internal/webui/server.go b/audit/internal/webui/server.go index 9723863..4853da7 100644 --- a/audit/internal/webui/server.go +++ b/audit/internal/webui/server.go @@ -62,15 +62,27 @@ func (r *metricsRing) snapshot() ([]float64, []string) { return v, l } +// gpuRings holds per-GPU ring buffers. +type gpuRings struct { + Temp *metricsRing + Util *metricsRing + MemUtil *metricsRing + Power *metricsRing +} + // handler is the HTTP handler for the web UI. type handler struct { - opts HandlerOptions - mux *http.ServeMux + opts HandlerOptions + mux *http.ServeMux + // server rings ringCPUTemp *metricsRing + ringCPULoad *metricsRing + ringMemLoad *metricsRing ringPower *metricsRing ringFans []*metricsRing - ringGPUTemp []*metricsRing - ringGPUUtil []*metricsRing + fanNames []string + // per-GPU rings (index = GPU index) + gpuRings []*gpuRings ringsMu sync.Mutex } @@ -89,6 +101,8 @@ func NewHandler(opts HandlerOptions) http.Handler { h := &handler{ opts: opts, ringCPUTemp: newMetricsRing(120), + ringCPULoad: newMetricsRing(120), + ringMemLoad: newMetricsRing(120), ringPower: newMetricsRing(120), } mux := http.NewServeMux() @@ -244,48 +258,88 @@ func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) { } func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request) { - name := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/") - name = strings.TrimSuffix(name, ".svg") + path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/") + path = strings.TrimSuffix(path, ".svg") + + var datasets [][]float64 + var names []string + var labels []string + var title string + + switch { + case path == "server": + title = "Server" + vCPUTemp, l := h.ringCPUTemp.snapshot() + vCPULoad, _ := h.ringCPULoad.snapshot() + vMemLoad, _ := h.ringMemLoad.snapshot() + vPower, _ := h.ringPower.snapshot() + labels = l + datasets = [][]float64{vCPUTemp, vCPULoad, vMemLoad, vPower} + names = []string{"CPU Temp °C", "CPU Load %", "Mem Load %", 
"Power W"} + + h.ringsMu.Lock() + for i, fr := range h.ringFans { + fv, _ := fr.snapshot() + datasets = append(datasets, fv) + name := "Fan" + if i < len(h.fanNames) { + name = h.fanNames[i] + } + names = append(names, name+" RPM") + } + h.ringsMu.Unlock() + + case strings.HasPrefix(path, "gpu/"): + idxStr := strings.TrimPrefix(path, "gpu/") + idx := 0 + fmt.Sscanf(idxStr, "%d", &idx) + h.ringsMu.Lock() + var gr *gpuRings + if idx < len(h.gpuRings) { + gr = h.gpuRings[idx] + } + h.ringsMu.Unlock() + if gr == nil { + http.NotFound(w, r) + return + } + vTemp, l := gr.Temp.snapshot() + vUtil, _ := gr.Util.snapshot() + vMemUtil, _ := gr.MemUtil.snapshot() + vPower, _ := gr.Power.snapshot() + labels = l + title = fmt.Sprintf("GPU %d", idx) + datasets = [][]float64{vTemp, vUtil, vMemUtil, vPower} + names = []string{"Temp °C", "Load %", "Mem %", "Power W"} - var ring *metricsRing - var title, unit string - switch name { - case "cpu-temp": - ring, title, unit = h.ringCPUTemp, "CPU Temperature", "°C" - case "power": - ring, title, unit = h.ringPower, "System Power", "W" default: http.NotFound(w, r) return } - vals, labels := ring.snapshot() - if len(vals) == 0 { - vals = []float64{0} + // Ensure all datasets same length as labels + n := len(labels) + if n == 0 { + n = 1 labels = []string{""} } - - // Sparse x-axis labels - sparse := make([]string, len(labels)) - step := len(labels) / 6 - if step < 1 { - step = 1 - } - for i := range labels { - if i%step == 0 { - sparse[i] = labels[i] + for i := range datasets { + if len(datasets[i]) == 0 { + datasets[i] = make([]float64, n) } } - opt := gocharts.NewLineChartOptionWithData([][]float64{vals}) - opt.Title = gocharts.TitleOption{Text: title + " (" + unit + ")"} + sparse := sparseLabels(labels, 6) + + opt := gocharts.NewLineChartOptionWithData(datasets) + opt.Title = gocharts.TitleOption{Text: title} opt.XAxis.Labels = sparse - opt.Legend = gocharts.LegendOption{Show: gocharts.Ptr(false)} + opt.Legend = 
gocharts.LegendOption{SeriesNames: names} p := gocharts.NewPainter(gocharts.PainterOptions{ OutputFormat: gocharts.ChartOutputSVG, - Width: 600, - Height: 180, + Width: 1400, + Height: 280, }, gocharts.PainterThemeOption(gocharts.GetTheme("grafana"))) if err := p.LineChart(opt); err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) @@ -301,6 +355,27 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request) _, _ = w.Write(buf) } +func safeIdx(s []float64, i int) float64 { + if i < len(s) { + return s[i] + } + return 0 +} + +func sparseLabels(labels []string, n int) []string { + out := make([]string, len(labels)) + step := len(labels) / n + if step < 1 { + step = 1 + } + for i, l := range labels { + if i%step == 0 { + out[i] = l + } + } + return out +} + // ── Page handler ───────────────────────────────────────────────────────────── func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) { diff --git a/bible-local/architecture/charting.md b/bible-local/architecture/charting.md new file mode 100644 index 0000000..e2bfae5 --- /dev/null +++ b/bible-local/architecture/charting.md @@ -0,0 +1,38 @@ +# Charting architecture + +## Decision: one chart engine for all live metrics + +**Engine:** `github.com/go-analyze/charts` (pure Go, no CGO, SVG output) +**Theme:** `grafana` (dark background, coloured lines) + +All live metrics charts in the web UI are server-side SVG images served by Go +and polled by the browser every 2 seconds via `<img>`. +There is no client-side canvas or JS chart library. 
+ +### Why go-analyze/charts + +- Pure Go, no CGO — builds cleanly inside the live-build container +- SVG output — crisp at any display resolution, full-width without pixelation +- Grafana theme matches the dark web UI colour scheme +- Active fork of the archived wcharczuk/go-chart + +### SAT stress-test charts + +The `drawGPUChartSVG` function in `platform/gpu_metrics.go` is a separate +self-contained SVG renderer used **only** for completed SAT run reports +(HTML export, burn-in summaries). It is not used for live metrics. + +### Live metrics chart endpoints + +| Path | Content | +|------|---------| +| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs | +| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W | + +Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a +single-column layout so they always fill the viewport width. + +### Ring buffers + +Each metric is stored in a 120-sample ring buffer (2 minutes of history at 1 Hz). +Buffers are per-server, per-fan or per-GPU; fan and GPU buffers are created dynamically as new fans or GPUs appear.