feat(metrics): single chart engine + full-width stacked layout

- One engine: go-analyze/charts (grafana theme) for all live metrics
- Server chart: CPU temp, CPU load%, mem load%, power W, fan RPMs
- GPU charts: temp, load%, mem%, power W — one card per GPU, added dynamically
- Charts 1400x280px SVG, rendered at width:100% in single-column layout
- Add CPU load (from /proc/stat) and mem load (from /proc/meminfo) to LiveMetricSample
- Add GPU mem utilization to GPUMetricRow (nvidia-smi utilization.memory)
- Document charting architecture in bible-local/architecture/charting.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-27 23:26:13 +03:00
parent e7a7ff54b9
commit ec0b7f7ff9
6 changed files with 336 additions and 78 deletions

View File

@@ -13,18 +13,19 @@ import (
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
type GPUMetricRow struct {
ElapsedSec float64
GPUIndex int
TempC float64
UsagePct float64
PowerW float64
ClockMHz float64
ElapsedSec float64 `json:"elapsed_sec"`
GPUIndex int `json:"index"`
TempC float64 `json:"temp_c"`
UsagePct float64 `json:"usage_pct"`
MemUsagePct float64 `json:"mem_usage_pct"`
PowerW float64 `json:"power_w"`
ClockMHz float64 `json:"clock_mhz"`
}
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
args := []string{
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics",
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
"--format=csv,noheader,nounits",
}
if len(gpuIndices) > 0 {
@@ -45,16 +46,17 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
continue
}
parts := strings.Split(line, ", ")
if len(parts) < 5 {
if len(parts) < 6 {
continue
}
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
rows = append(rows, GPUMetricRow{
GPUIndex: idx,
TempC: parseGPUFloat(parts[1]),
UsagePct: parseGPUFloat(parts[2]),
PowerW: parseGPUFloat(parts[3]),
ClockMHz: parseGPUFloat(parts[4]),
GPUIndex: idx,
TempC: parseGPUFloat(parts[1]),
UsagePct: parseGPUFloat(parts[2]),
MemUsagePct: parseGPUFloat(parts[3]),
PowerW: parseGPUFloat(parts[4]),
ClockMHz: parseGPUFloat(parts[5]),
})
}
return rows, nil