feat(metrics): single chart engine + full-width stacked layout
- One engine: go-analyze/charts (grafana theme) for all live metrics - Server chart: CPU temp, CPU load%, mem load%, power W, fan RPMs - GPU charts: temp, load%, mem%, power W — one card per GPU, added dynamically - Charts 1400x280px SVG, rendered at width:100% in single-column layout - Add CPU load (from /proc/stat) and mem load (from /proc/meminfo) to LiveMetricSample - Add GPU mem utilization to GPUMetricRow (nvidia-smi utilization.memory) - Document charting architecture in bible-local/architecture/charting.md Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -13,18 +13,19 @@ import (
|
||||
|
||||
// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
|
||||
type GPUMetricRow struct {
|
||||
ElapsedSec float64
|
||||
GPUIndex int
|
||||
TempC float64
|
||||
UsagePct float64
|
||||
PowerW float64
|
||||
ClockMHz float64
|
||||
ElapsedSec float64 `json:"elapsed_sec"`
|
||||
GPUIndex int `json:"index"`
|
||||
TempC float64 `json:"temp_c"`
|
||||
UsagePct float64 `json:"usage_pct"`
|
||||
MemUsagePct float64 `json:"mem_usage_pct"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
ClockMHz float64 `json:"clock_mhz"`
|
||||
}
|
||||
|
||||
// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
|
||||
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
args := []string{
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics",
|
||||
"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics",
|
||||
"--format=csv,noheader,nounits",
|
||||
}
|
||||
if len(gpuIndices) > 0 {
|
||||
@@ -45,16 +46,17 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, ", ")
|
||||
if len(parts) < 5 {
|
||||
if len(parts) < 6 {
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
||||
rows = append(rows, GPUMetricRow{
|
||||
GPUIndex: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
PowerW: parseGPUFloat(parts[3]),
|
||||
ClockMHz: parseGPUFloat(parts[4]),
|
||||
GPUIndex: idx,
|
||||
TempC: parseGPUFloat(parts[1]),
|
||||
UsagePct: parseGPUFloat(parts[2]),
|
||||
MemUsagePct: parseGPUFloat(parts[3]),
|
||||
PowerW: parseGPUFloat(parts[4]),
|
||||
ClockMHz: parseGPUFloat(parts[5]),
|
||||
})
|
||||
}
|
||||
return rows, nil
|
||||
|
||||
@@ -1,15 +1,23 @@
|
||||
package platform
|
||||
|
||||
import "time"
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
||||
// collected for the web UI metrics page.
|
||||
type LiveMetricSample struct {
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
Timestamp time.Time `json:"ts"`
|
||||
Fans []FanReading `json:"fans"`
|
||||
Temps []TempReading `json:"temps"`
|
||||
PowerW float64 `json:"power_w"`
|
||||
CPULoadPct float64 `json:"cpu_load_pct"`
|
||||
MemLoadPct float64 `json:"mem_load_pct"`
|
||||
GPUs []GPUMetricRow `json:"gpus"`
|
||||
}
|
||||
|
||||
// TempReading is a named temperature sensor value.
|
||||
@@ -41,5 +49,91 @@ func SampleLiveMetrics() LiveMetricSample {
|
||||
// System power — returns 0 if unavailable
|
||||
s.PowerW = sampleSystemPower()
|
||||
|
||||
// CPU load — from /proc/stat
|
||||
s.CPULoadPct = sampleCPULoadPct()
|
||||
|
||||
// Memory load — from /proc/meminfo
|
||||
s.MemLoadPct = sampleMemLoadPct()
|
||||
|
||||
return s
|
||||
}
|
||||
|
||||
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
||||
// the overall CPU utilisation percentage.
|
||||
var cpuStatPrev [2]uint64 // [total, idle]
|
||||
|
||||
func sampleCPULoadPct() float64 {
|
||||
total, idle := readCPUStat()
|
||||
if total == 0 {
|
||||
return 0
|
||||
}
|
||||
prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
|
||||
cpuStatPrev = [2]uint64{total, idle}
|
||||
if prevTotal == 0 {
|
||||
return 0
|
||||
}
|
||||
dt := float64(total - prevTotal)
|
||||
di := float64(idle - prevIdle)
|
||||
if dt <= 0 {
|
||||
return 0
|
||||
}
|
||||
pct := (1 - di/dt) * 100
|
||||
if pct < 0 {
|
||||
return 0
|
||||
}
|
||||
if pct > 100 {
|
||||
return 100
|
||||
}
|
||||
return pct
|
||||
}
|
||||
|
||||
func readCPUStat() (total, idle uint64) {
|
||||
f, err := os.Open("/proc/stat")
|
||||
if err != nil {
|
||||
return 0, 0
|
||||
}
|
||||
defer f.Close()
|
||||
sc := bufio.NewScanner(f)
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
if !strings.HasPrefix(line, "cpu ") {
|
||||
continue
|
||||
}
|
||||
fields := strings.Fields(line)[1:] // skip "cpu"
|
||||
var vals [10]uint64
|
||||
for i := 0; i < len(fields) && i < 10; i++ {
|
||||
vals[i], _ = strconv.ParseUint(fields[i], 10, 64)
|
||||
}
|
||||
// idle = idle + iowait
|
||||
idle = vals[3] + vals[4]
|
||||
for _, v := range vals {
|
||||
total += v
|
||||
}
|
||||
return total, idle
|
||||
}
|
||||
return 0, 0
|
||||
}
|
||||
|
||||
func sampleMemLoadPct() float64 {
|
||||
f, err := os.Open("/proc/meminfo")
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
defer f.Close()
|
||||
vals := map[string]uint64{}
|
||||
sc := bufio.NewScanner(f)
|
||||
for sc.Scan() {
|
||||
fields := strings.Fields(sc.Text())
|
||||
if len(fields) >= 2 {
|
||||
v, _ := strconv.ParseUint(fields[1], 10, 64)
|
||||
vals[strings.TrimSuffix(fields[0], ":")] = v
|
||||
}
|
||||
}
|
||||
total := vals["MemTotal"]
|
||||
avail := vals["MemAvailable"]
|
||||
if total == 0 {
|
||||
return 0
|
||||
}
|
||||
used := total - avail
|
||||
return float64(used) / float64(total) * 100
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user