Files
bee/audit/internal/platform/live_metrics.go
Michael Chus b89580c24d Fix PSU power chart: use name-based SDR matching instead of entity ID
MSI servers place PSU_POWER_IN/OUT sensors on entity 3.0, not 10.N
(the IPMI "Power Supply" entity). The old parser filtered by entity ID
and found nothing, so the dashboard fell back to DCMI which reports
roughly half the actual draw.

Now delegates to collector.PSUSlotsFromSDR — the same name-based
matching already used in the Power Fit benchmark.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-19 18:39:21 +03:00

385 lines
9.4 KiB
Go

package platform
import (
"bee/audit/internal/collector"
"bufio"
"encoding/json"
"fmt"
"os"
"os/exec"
"sort"
"strconv"
"strings"
"time"
)
// LiveMetricSample is a single point-in-time snapshot of server metrics
// collected for the web UI metrics page.
type LiveMetricSample struct {
Timestamp time.Time `json:"ts"`
Fans []FanReading `json:"fans"`
Temps []TempReading `json:"temps"`
PowerW float64 `json:"power_w"`
PSUs []PSUReading `json:"psus,omitempty"`
CPULoadPct float64 `json:"cpu_load_pct"`
MemLoadPct float64 `json:"mem_load_pct"`
GPUs []GPUMetricRow `json:"gpus"`
}
// PSUReading is a per-slot power supply input power reading.
type PSUReading struct {
Slot int `json:"slot"`
Name string `json:"name"`
PowerW float64 `json:"power_w"`
}
// TempReading is a named temperature sensor value.
type TempReading struct {
Name string `json:"name"`
Group string `json:"group,omitempty"`
Celsius float64 `json:"celsius"`
}
// SampleLiveMetrics collects a single metrics snapshot from all available
// sources: GPU (via nvidia-smi), fans and temperatures (via ipmitool/sensors),
// and system power (via ipmitool dcmi). Missing sources are silently skipped.
func SampleLiveMetrics() LiveMetricSample {
s := LiveMetricSample{Timestamp: time.Now().UTC()}
// GPU metrics — try NVIDIA first, fall back to AMD
if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 {
s.GPUs = gpus
} else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 {
s.GPUs = amdGPUs
}
// Fan speeds — skipped silently if ipmitool unavailable
fans, _ := sampleFanSpeeds()
s.Fans = fans
s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
if !hasTempGroup(s.Temps, "cpu") {
if cpuTemp := sampleCPUMaxTemp(); cpuTemp > 0 {
s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuTemp})
}
}
// System power — returns 0 if unavailable
s.PowerW = sampleSystemPower()
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
s.PSUs = samplePSUPower()
// CPU load — from /proc/stat
s.CPULoadPct = sampleCPULoadPct()
// Memory load — from /proc/meminfo
s.MemLoadPct = sampleMemLoadPct()
return s
}
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
// the overall CPU utilisation percentage.
func sampleCPULoadPct() float64 {
total0, idle0 := readCPUStat()
if total0 == 0 {
return 0
}
time.Sleep(200 * time.Millisecond)
total1, idle1 := readCPUStat()
if total1 == 0 {
return 0
}
return cpuLoadPctBetween(total0, idle0, total1, idle1)
}
func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
dt := float64(total - prevTotal)
di := float64(idle - prevIdle)
if dt <= 0 {
return 0
}
pct := (1 - di/dt) * 100
if pct < 0 {
return 0
}
if pct > 100 {
return 100
}
return pct
}
func readCPUStat() (total, idle uint64) {
f, err := os.Open("/proc/stat")
if err != nil {
return 0, 0
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
line := sc.Text()
if !strings.HasPrefix(line, "cpu ") {
continue
}
fields := strings.Fields(line)[1:] // skip "cpu"
var vals [10]uint64
for i := 0; i < len(fields) && i < 10; i++ {
vals[i], _ = strconv.ParseUint(fields[i], 10, 64)
}
// idle = idle + iowait
idle = vals[3] + vals[4]
for _, v := range vals {
total += v
}
return total, idle
}
return 0, 0
}
func sampleMemLoadPct() float64 {
f, err := os.Open("/proc/meminfo")
if err != nil {
return 0
}
defer f.Close()
vals := map[string]uint64{}
sc := bufio.NewScanner(f)
for sc.Scan() {
fields := strings.Fields(sc.Text())
if len(fields) >= 2 {
v, _ := strconv.ParseUint(fields[1], 10, 64)
vals[strings.TrimSuffix(fields[0], ":")] = v
}
}
total := vals["MemTotal"]
avail := vals["MemAvailable"]
if total == 0 {
return 0
}
used := total - avail
return float64(used) / float64(total) * 100
}
func hasTempGroup(temps []TempReading, group string) bool {
for _, t := range temps {
if t.Group == group {
return true
}
}
return false
}
func sampleLiveTemperatureReadings() []TempReading {
if temps := sampleLiveTempsViaSensorsJSON(); len(temps) > 0 {
return temps
}
return sampleLiveTempsViaIPMI()
}
func sampleLiveTempsViaSensorsJSON() []TempReading {
out, err := exec.Command("sensors", "-j").Output()
if err != nil || len(out) == 0 {
return nil
}
var doc map[string]map[string]any
if err := json.Unmarshal(out, &doc); err != nil {
return nil
}
chips := make([]string, 0, len(doc))
for chip := range doc {
chips = append(chips, chip)
}
sort.Strings(chips)
temps := make([]TempReading, 0, len(chips))
seen := map[string]struct{}{}
for _, chip := range chips {
features := doc[chip]
featureNames := make([]string, 0, len(features))
for name := range features {
featureNames = append(featureNames, name)
}
sort.Strings(featureNames)
for _, name := range featureNames {
if strings.EqualFold(name, "Adapter") {
continue
}
feature, ok := features[name].(map[string]any)
if !ok {
continue
}
value, ok := firstTempInputValue(feature)
if !ok || value <= 0 || value > 150 {
continue
}
group := classifyLiveTempGroup(chip, name)
if group == "gpu" {
continue
}
label := strings.TrimSpace(name)
if label == "" {
continue
}
if group == "ambient" {
label = compactAmbientTempName(chip, label)
}
key := group + "\x00" + label
if _, ok := seen[key]; ok {
continue
}
seen[key] = struct{}{}
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
}
}
return temps
}
func sampleLiveTempsViaIPMI() []TempReading {
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
if err != nil || len(out) == 0 {
return nil
}
var temps []TempReading
seen := map[string]struct{}{}
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
parts := strings.Split(line, "|")
if len(parts) < 3 {
continue
}
name := strings.TrimSpace(parts[0])
if name == "" {
continue
}
unit := strings.ToLower(strings.TrimSpace(parts[2]))
if !strings.Contains(unit, "degrees") {
continue
}
raw := strings.TrimSpace(parts[1])
if raw == "" || strings.EqualFold(raw, "na") {
continue
}
value, err := strconv.ParseFloat(raw, 64)
if err != nil || value <= 0 || value > 150 {
continue
}
group := classifyLiveTempGroup("", name)
if group == "gpu" {
continue
}
label := name
if group == "ambient" {
label = compactAmbientTempName("", label)
}
key := group + "\x00" + label
if _, ok := seen[key]; ok {
continue
}
seen[key] = struct{}{}
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
}
return temps
}
func firstTempInputValue(feature map[string]any) (float64, bool) {
keys := make([]string, 0, len(feature))
for key := range feature {
keys = append(keys, key)
}
sort.Strings(keys)
for _, key := range keys {
lower := strings.ToLower(key)
if !strings.Contains(lower, "temp") || !strings.HasSuffix(lower, "_input") {
continue
}
switch value := feature[key].(type) {
case float64:
return value, true
case string:
f, err := strconv.ParseFloat(value, 64)
if err == nil {
return f, true
}
}
}
return 0, false
}
func classifyLiveTempGroup(chip, name string) string {
text := strings.ToLower(strings.TrimSpace(chip + " " + name))
switch {
case strings.Contains(text, "gpu"), strings.Contains(text, "amdgpu"), strings.Contains(text, "nvidia"), strings.Contains(text, "adeon"):
return "gpu"
case strings.Contains(text, "coretemp"),
strings.Contains(text, "k10temp"),
strings.Contains(text, "zenpower"),
strings.Contains(text, "package id"),
strings.Contains(text, "x86_pkg_temp"),
strings.Contains(text, "tctl"),
strings.Contains(text, "tdie"),
strings.Contains(text, "tccd"),
strings.Contains(text, "cpu"),
strings.Contains(text, "peci"):
return "cpu"
default:
return "ambient"
}
}
func compactAmbientTempName(chip, name string) string {
chip = strings.TrimSpace(chip)
name = strings.TrimSpace(name)
if chip == "" || strings.EqualFold(chip, name) {
return name
}
if strings.Contains(strings.ToLower(name), strings.ToLower(chip)) {
return name
}
return chip + " / " + name
}
// samplePSUPower reads per-PSU input power via IPMI SDR.
// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
// vendors where PSU sensors may not carry entity ID "10.N".
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
func samplePSUPower() []PSUReading {
out, err := exec.Command("ipmitool", "sdr").Output()
if err != nil || len(out) == 0 {
return nil
}
slots := collector.PSUSlotsFromSDR(string(out))
if len(slots) == 0 {
return nil
}
// Collect slot keys and sort for stable output.
keys := make([]int, 0, len(slots))
for k := range slots {
n, err := strconv.Atoi(k)
if err == nil {
keys = append(keys, n)
}
}
sort.Ints(keys)
psus := make([]PSUReading, 0, len(keys))
for _, k := range keys {
entry := slots[strconv.Itoa(k)]
// Prefer AC input power; fall back to DC output power.
var w float64
if entry.InputW != nil && *entry.InputW > 0 {
w = *entry.InputW
} else if entry.OutputW != nil && *entry.OutputW > 0 {
w = *entry.OutputW
}
if w <= 0 {
continue
}
psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
}
if len(psus) == 0 {
return nil
}
return psus
}