diff --git a/audit/internal/platform/gpu_metrics.go b/audit/internal/platform/gpu_metrics.go index b828b29..8cb1de7 100644 --- a/audit/internal/platform/gpu_metrics.go +++ b/audit/internal/platform/gpu_metrics.go @@ -13,18 +13,19 @@ import ( // GPUMetricRow is one telemetry sample from nvidia-smi during a stress test. type GPUMetricRow struct { - ElapsedSec float64 - GPUIndex int - TempC float64 - UsagePct float64 - PowerW float64 - ClockMHz float64 + ElapsedSec float64 `json:"elapsed_sec"` + GPUIndex int `json:"index"` + TempC float64 `json:"temp_c"` + UsagePct float64 `json:"usage_pct"` + MemUsagePct float64 `json:"mem_usage_pct"` + PowerW float64 `json:"power_w"` + ClockMHz float64 `json:"clock_mhz"` } // sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU. func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) { args := []string{ - "--query-gpu=index,temperature.gpu,utilization.gpu,power.draw,clocks.current.graphics", + "--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics", "--format=csv,noheader,nounits", } if len(gpuIndices) > 0 { @@ -45,16 +46,17 @@ func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) { continue } parts := strings.Split(line, ", ") - if len(parts) < 5 { + if len(parts) < 6 { continue } idx, _ := strconv.Atoi(strings.TrimSpace(parts[0])) rows = append(rows, GPUMetricRow{ - GPUIndex: idx, - TempC: parseGPUFloat(parts[1]), - UsagePct: parseGPUFloat(parts[2]), - PowerW: parseGPUFloat(parts[3]), - ClockMHz: parseGPUFloat(parts[4]), + GPUIndex: idx, + TempC: parseGPUFloat(parts[1]), + UsagePct: parseGPUFloat(parts[2]), + MemUsagePct: parseGPUFloat(parts[3]), + PowerW: parseGPUFloat(parts[4]), + ClockMHz: parseGPUFloat(parts[5]), }) } return rows, nil diff --git a/audit/internal/platform/live_metrics.go b/audit/internal/platform/live_metrics.go index 76b64a0..0b516a9 100644 --- a/audit/internal/platform/live_metrics.go +++ 
b/audit/internal/platform/live_metrics.go
@@ -1,15 +1,24 @@
 package platform
 
-import "time"
+import (
+	"bufio"
+	"os"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+)
 
 // LiveMetricSample is a single point-in-time snapshot of server metrics
 // collected for the web UI metrics page.
 type LiveMetricSample struct {
-	Timestamp time.Time      `json:"ts"`
-	Fans      []FanReading   `json:"fans"`
-	Temps     []TempReading  `json:"temps"`
-	PowerW    float64        `json:"power_w"`
-	GPUs      []GPUMetricRow `json:"gpus"`
+	Timestamp  time.Time      `json:"ts"`
+	Fans       []FanReading   `json:"fans"`
+	Temps      []TempReading  `json:"temps"`
+	PowerW     float64        `json:"power_w"`
+	CPULoadPct float64        `json:"cpu_load_pct"`
+	MemLoadPct float64        `json:"mem_load_pct"`
+	GPUs       []GPUMetricRow `json:"gpus"`
 }
 
 // TempReading is a named temperature sensor value.
@@ -41,5 +50,131 @@ func SampleLiveMetrics() LiveMetricSample {
 	// System power — returns 0 if unavailable
 	s.PowerW = sampleSystemPower()
 
+	// CPU load — from /proc/stat
+	s.CPULoadPct = sampleCPULoadPct()
+
+	// Memory load — from /proc/meminfo
+	s.MemLoadPct = sampleMemLoadPct()
+
 	return s
 }
+
+var (
+	// cpuStatMu guards cpuStatPrev; samples may be requested from several
+	// metrics streams at once.
+	cpuStatMu sync.Mutex
+	// cpuStatPrev holds the /proc/stat counters seen by the previous
+	// sampleCPULoadPct call.
+	cpuStatPrev [2]uint64 // [total, idle]
+)
+
+// sampleCPULoadPct returns the overall CPU utilisation percentage for the
+// interval since the previous call, computed from the difference between
+// the current /proc/stat counters and those saved in cpuStatPrev. It
+// returns 0 on the first call, when /proc/stat cannot be read, and when
+// the counters did not advance.
+func sampleCPULoadPct() float64 {
+	// Hold the lock across the read so that concurrent callers cannot store
+	// their snapshots out of order.
+	cpuStatMu.Lock()
+	defer cpuStatMu.Unlock()
+
+	total, idle := readCPUStat()
+	if total == 0 {
+		return 0
+	}
+	prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
+	cpuStatPrev = [2]uint64{total, idle}
+
+	// No baseline yet, or the counters stalled or went backwards. The
+	// deltas are unsigned, so subtracting in that case would wrap around
+	// and produce a bogus reading.
+	if prevTotal == 0 || total <= prevTotal || idle < prevIdle {
+		return 0
+	}
+	dt := float64(total - prevTotal)
+	di := float64(idle - prevIdle)
+	pct := (1 - di/dt) * 100
+	if pct < 0 {
+		return 0
+	}
+	if pct > 100 {
+		return 100
+	}
+	return pct
+}
+
+// readCPUStat parses the aggregate "cpu" line of /proc/stat and returns the
+// summed counters (total) and the idle share (idle + iowait). It returns
+// 0, 0 when the file cannot be opened or the line cannot be parsed.
+func readCPUStat() (total, idle uint64) {
+	f, err := os.Open("/proc/stat")
+	if err != nil {
+		return 0, 0
+	}
+	defer f.Close()
+	sc := bufio.NewScanner(f)
+	for sc.Scan() {
+		line := sc.Text()
+		if !strings.HasPrefix(line, "cpu ") {
+			continue
+		}
+		fields := strings.Fields(line)[1:] // skip "cpu"
+		// Only user, nice, system, idle, iowait, irq, softirq and steal
+		// are summed: guest and guest_nice time is already included in
+		// user and nice, so adding it would count it twice.
+		var vals [8]uint64
+		for i := 0; i < len(fields) && i < len(vals); i++ {
+			v, err := strconv.ParseUint(fields[i], 10, 64)
+			if err != nil {
+				return 0, 0
+			}
+			vals[i] = v
+		}
+		// idle = idle + iowait
+		idle = vals[3] + vals[4]
+		for _, v := range vals {
+			total += v
+		}
+		return total, idle
+	}
+	return 0, 0
+}
+
+// sampleMemLoadPct returns the percentage of memory in use, computed from
+// MemTotal and MemAvailable in /proc/meminfo. It returns 0 when the file
+// cannot be opened or MemTotal is missing.
+func sampleMemLoadPct() float64 {
+	f, err := os.Open("/proc/meminfo")
+	if err != nil {
+		return 0
+	}
+	defer f.Close()
+	vals := map[string]uint64{}
+	sc := bufio.NewScanner(f)
+	for sc.Scan() {
+		fields := strings.Fields(sc.Text())
+		if len(fields) < 2 {
+			continue
+		}
+		v, err := strconv.ParseUint(fields[1], 10, 64)
+		if err != nil {
+			continue
+		}
+		vals[strings.TrimSuffix(fields[0], ":")] = v
+	}
+	total := vals["MemTotal"]
+	if total == 0 {
+		return 0
+	}
+	avail, ok := vals["MemAvailable"]
+	if !ok {
+		// MemAvailable is absent on older kernels; approximate it.
+		avail = vals["MemFree"] + vals["Buffers"] + vals["Cached"]
+	}
+	if avail > total {
+		// Keep the unsigned subtraction below from wrapping.
+		avail = total
+	}
+	return float64(total-avail) / float64(total) * 100
+}
diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go
index 025aad1..54f7bab 100644
--- a/audit/internal/webui/api.go
+++ b/audit/internal/webui/api.go
@@ -424,7 +424,7 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
 		case <-ticker.C:
 			sample := platform.SampleLiveMetrics()
 
-			// Feed ring buffers for server-side SVG charts
+			// Feed server ring buffers
 			for _, t :=
range sample.Temps { if t.Name == "CPU" { h.ringCPUTemp.push(t.Celsius) @@ -432,6 +432,35 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) } } h.ringPower.push(sample.PowerW) + h.ringCPULoad.push(sample.CPULoadPct) + h.ringMemLoad.push(sample.MemLoadPct) + + // Feed fan ring buffers (grow on first sight) + h.ringsMu.Lock() + for i, fan := range sample.Fans { + for len(h.ringFans) <= i { + h.ringFans = append(h.ringFans, newMetricsRing(120)) + h.fanNames = append(h.fanNames, fan.Name) + } + h.ringFans[i].push(float64(fan.RPM)) + } + // Feed per-GPU ring buffers (grow on first sight) + for _, gpu := range sample.GPUs { + idx := gpu.GPUIndex + for len(h.gpuRings) <= idx { + h.gpuRings = append(h.gpuRings, &gpuRings{ + Temp: newMetricsRing(120), + Util: newMetricsRing(120), + MemUtil: newMetricsRing(120), + Power: newMetricsRing(120), + }) + } + h.gpuRings[idx].Temp.push(gpu.TempC) + h.gpuRings[idx].Util.push(gpu.UsagePct) + h.gpuRings[idx].MemUtil.push(gpu.MemUsagePct) + h.gpuRings[idx].Power.push(gpu.PowerW) + } + h.ringsMu.Unlock() b, err := json.Marshal(sample) if err != nil { diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 4b59643..3ebbd68 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -242,28 +242,27 @@ func renderHealthCard(opts HandlerOptions) string { // ── Metrics ─────────────────────────────────────────────────────────────────── func renderMetrics() string { - return `
Live server metrics, charts updated every 2 seconds.
-Waiting for data...
Live metrics — updated every 2 seconds. Charts use go-analyze/charts (grafana theme).
+ +