Files
bee/audit/internal/platform/live_metrics.go
Michael Chus ec6a0b292d fix(webui): fix sensor grouping and fan card visibility
- Tccd1-8 (AMD CCD die temps) now classified as 'cpu' group,
  appear on CPU Temperature chart instead of ambient
- Fan RPM card hidden when no fans detected
- Remove CPU Load/Mem Load/Power from fan table (have dedicated charts)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-29 10:58:01 +03:00

327 lines
7.8 KiB
Go

package platform
import (
"bufio"
"encoding/json"
"os"
"os/exec"
"sort"
"strconv"
"strings"
"time"
)
// LiveMetricSample is a single point-in-time snapshot of server metrics
// collected for the web UI metrics page.
type LiveMetricSample struct {
Timestamp time.Time `json:"ts"`
Fans []FanReading `json:"fans"`
Temps []TempReading `json:"temps"`
PowerW float64 `json:"power_w"`
CPULoadPct float64 `json:"cpu_load_pct"`
MemLoadPct float64 `json:"mem_load_pct"`
GPUs []GPUMetricRow `json:"gpus"`
}
// TempReading is a named temperature sensor value.
type TempReading struct {
Name string `json:"name"`
Group string `json:"group,omitempty"`
Celsius float64 `json:"celsius"`
}
// SampleLiveMetrics collects a single metrics snapshot from all available
// sources: GPU (via nvidia-smi), fans and temperatures (via ipmitool/sensors),
// and system power (via ipmitool dcmi). Missing sources are silently skipped.
func SampleLiveMetrics() LiveMetricSample {
s := LiveMetricSample{Timestamp: time.Now().UTC()}
// GPU metrics — try NVIDIA first, fall back to AMD
if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 {
s.GPUs = gpus
} else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 {
s.GPUs = amdGPUs
}
// Fan speeds — skipped silently if ipmitool unavailable
fans, _ := sampleFanSpeeds()
s.Fans = fans
s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
if !hasTempGroup(s.Temps, "cpu") {
if cpuTemp := sampleCPUMaxTemp(); cpuTemp > 0 {
s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuTemp})
}
}
// System power — returns 0 if unavailable
s.PowerW = sampleSystemPower()
// CPU load — from /proc/stat
s.CPULoadPct = sampleCPULoadPct()
// Memory load — from /proc/meminfo
s.MemLoadPct = sampleMemLoadPct()
return s
}
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
// the overall CPU utilisation percentage.
var cpuStatPrev [2]uint64 // [total, idle]
func sampleCPULoadPct() float64 {
total, idle := readCPUStat()
if total == 0 {
return 0
}
prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
cpuStatPrev = [2]uint64{total, idle}
if prevTotal == 0 {
return 0
}
dt := float64(total - prevTotal)
di := float64(idle - prevIdle)
if dt <= 0 {
return 0
}
pct := (1 - di/dt) * 100
if pct < 0 {
return 0
}
if pct > 100 {
return 100
}
return pct
}
func readCPUStat() (total, idle uint64) {
f, err := os.Open("/proc/stat")
if err != nil {
return 0, 0
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
line := sc.Text()
if !strings.HasPrefix(line, "cpu ") {
continue
}
fields := strings.Fields(line)[1:] // skip "cpu"
var vals [10]uint64
for i := 0; i < len(fields) && i < 10; i++ {
vals[i], _ = strconv.ParseUint(fields[i], 10, 64)
}
// idle = idle + iowait
idle = vals[3] + vals[4]
for _, v := range vals {
total += v
}
return total, idle
}
return 0, 0
}
func sampleMemLoadPct() float64 {
f, err := os.Open("/proc/meminfo")
if err != nil {
return 0
}
defer f.Close()
vals := map[string]uint64{}
sc := bufio.NewScanner(f)
for sc.Scan() {
fields := strings.Fields(sc.Text())
if len(fields) >= 2 {
v, _ := strconv.ParseUint(fields[1], 10, 64)
vals[strings.TrimSuffix(fields[0], ":")] = v
}
}
total := vals["MemTotal"]
avail := vals["MemAvailable"]
if total == 0 {
return 0
}
used := total - avail
return float64(used) / float64(total) * 100
}
func hasTempGroup(temps []TempReading, group string) bool {
for _, t := range temps {
if t.Group == group {
return true
}
}
return false
}
func sampleLiveTemperatureReadings() []TempReading {
if temps := sampleLiveTempsViaSensorsJSON(); len(temps) > 0 {
return temps
}
return sampleLiveTempsViaIPMI()
}
func sampleLiveTempsViaSensorsJSON() []TempReading {
out, err := exec.Command("sensors", "-j").Output()
if err != nil || len(out) == 0 {
return nil
}
var doc map[string]map[string]any
if err := json.Unmarshal(out, &doc); err != nil {
return nil
}
chips := make([]string, 0, len(doc))
for chip := range doc {
chips = append(chips, chip)
}
sort.Strings(chips)
temps := make([]TempReading, 0, len(chips))
seen := map[string]struct{}{}
for _, chip := range chips {
features := doc[chip]
featureNames := make([]string, 0, len(features))
for name := range features {
featureNames = append(featureNames, name)
}
sort.Strings(featureNames)
for _, name := range featureNames {
if strings.EqualFold(name, "Adapter") {
continue
}
feature, ok := features[name].(map[string]any)
if !ok {
continue
}
value, ok := firstTempInputValue(feature)
if !ok || value <= 0 || value > 150 {
continue
}
group := classifyLiveTempGroup(chip, name)
if group == "gpu" {
continue
}
label := strings.TrimSpace(name)
if label == "" {
continue
}
if group == "ambient" {
label = compactAmbientTempName(chip, label)
}
key := group + "\x00" + label
if _, ok := seen[key]; ok {
continue
}
seen[key] = struct{}{}
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
}
}
return temps
}
func sampleLiveTempsViaIPMI() []TempReading {
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
if err != nil || len(out) == 0 {
return nil
}
var temps []TempReading
seen := map[string]struct{}{}
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
parts := strings.Split(line, "|")
if len(parts) < 3 {
continue
}
name := strings.TrimSpace(parts[0])
if name == "" {
continue
}
unit := strings.ToLower(strings.TrimSpace(parts[2]))
if !strings.Contains(unit, "degrees") {
continue
}
raw := strings.TrimSpace(parts[1])
if raw == "" || strings.EqualFold(raw, "na") {
continue
}
value, err := strconv.ParseFloat(raw, 64)
if err != nil || value <= 0 || value > 150 {
continue
}
group := classifyLiveTempGroup("", name)
if group == "gpu" {
continue
}
label := name
if group == "ambient" {
label = compactAmbientTempName("", label)
}
key := group + "\x00" + label
if _, ok := seen[key]; ok {
continue
}
seen[key] = struct{}{}
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
}
return temps
}
func firstTempInputValue(feature map[string]any) (float64, bool) {
keys := make([]string, 0, len(feature))
for key := range feature {
keys = append(keys, key)
}
sort.Strings(keys)
for _, key := range keys {
lower := strings.ToLower(key)
if !strings.Contains(lower, "temp") || !strings.HasSuffix(lower, "_input") {
continue
}
switch value := feature[key].(type) {
case float64:
return value, true
case string:
f, err := strconv.ParseFloat(value, 64)
if err == nil {
return f, true
}
}
}
return 0, false
}
func classifyLiveTempGroup(chip, name string) string {
text := strings.ToLower(strings.TrimSpace(chip + " " + name))
switch {
case strings.Contains(text, "gpu"), strings.Contains(text, "amdgpu"), strings.Contains(text, "nvidia"), strings.Contains(text, "adeon"):
return "gpu"
case strings.Contains(text, "coretemp"),
strings.Contains(text, "k10temp"),
strings.Contains(text, "zenpower"),
strings.Contains(text, "package id"),
strings.Contains(text, "x86_pkg_temp"),
strings.Contains(text, "tctl"),
strings.Contains(text, "tdie"),
strings.Contains(text, "tccd"),
strings.Contains(text, "cpu"),
strings.Contains(text, "peci"):
return "cpu"
default:
return "ambient"
}
}
func compactAmbientTempName(chip, name string) string {
chip = strings.TrimSpace(chip)
name = strings.TrimSpace(name)
if chip == "" || strings.EqualFold(chip, name) {
return name
}
if strings.Contains(strings.ToLower(name), strings.ToLower(chip)) {
return name
}
return chip + " / " + name
}