MSI servers place PSU_POWER_IN/OUT sensors on entity 3.0, not 10.N (the IPMI "Power Supply" entity). The old parser filtered by entity ID and found nothing, so the dashboard fell back to DCMI which reports roughly half the actual draw. Now delegates to collector.PSUSlotsFromSDR — the same name-based matching already used in the Power Fit benchmark. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
385 lines
9.4 KiB
Go
385 lines
9.4 KiB
Go
package platform
|
|
|
|
import (
|
|
"bee/audit/internal/collector"
|
|
"bufio"
|
|
"encoding/json"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// LiveMetricSample is a single point-in-time snapshot of server metrics
|
|
// collected for the web UI metrics page.
|
|
type LiveMetricSample struct {
|
|
Timestamp time.Time `json:"ts"`
|
|
Fans []FanReading `json:"fans"`
|
|
Temps []TempReading `json:"temps"`
|
|
PowerW float64 `json:"power_w"`
|
|
PSUs []PSUReading `json:"psus,omitempty"`
|
|
CPULoadPct float64 `json:"cpu_load_pct"`
|
|
MemLoadPct float64 `json:"mem_load_pct"`
|
|
GPUs []GPUMetricRow `json:"gpus"`
|
|
}
|
|
|
|
// PSUReading is the power reading of a single power supply slot.
type PSUReading struct {
	// Slot is the slot number of the power supply.
	Slot int `json:"slot"`
	// Name is the display label of the power supply.
	Name string `json:"name"`
	// PowerW is the power reported for this slot, in watts.
	PowerW float64 `json:"power_w"`
}
|
|
|
|
// TempReading is the value of one named temperature sensor.
type TempReading struct {
	// Name is the sensor label.
	Name string `json:"name"`
	// Group is the sensor category (for example "cpu" or "ambient"); the key
	// is left out of the JSON document when the group is empty.
	Group string `json:"group,omitempty"`
	// Celsius is the temperature in degrees Celsius.
	Celsius float64 `json:"celsius"`
}
|
|
|
|
// SampleLiveMetrics collects a single metrics snapshot from all available
|
|
// sources: GPU (via nvidia-smi), fans and temperatures (via ipmitool/sensors),
|
|
// and system power (via ipmitool dcmi). Missing sources are silently skipped.
|
|
func SampleLiveMetrics() LiveMetricSample {
|
|
s := LiveMetricSample{Timestamp: time.Now().UTC()}
|
|
|
|
// GPU metrics — try NVIDIA first, fall back to AMD
|
|
if gpus, err := SampleGPUMetrics(nil); err == nil && len(gpus) > 0 {
|
|
s.GPUs = gpus
|
|
} else if amdGPUs, err := sampleAMDGPUMetrics(); err == nil && len(amdGPUs) > 0 {
|
|
s.GPUs = amdGPUs
|
|
}
|
|
|
|
// Fan speeds — skipped silently if ipmitool unavailable
|
|
fans, _ := sampleFanSpeeds()
|
|
s.Fans = fans
|
|
|
|
s.Temps = append(s.Temps, sampleLiveTemperatureReadings()...)
|
|
if !hasTempGroup(s.Temps, "cpu") {
|
|
if cpuTemp := sampleCPUMaxTemp(); cpuTemp > 0 {
|
|
s.Temps = append(s.Temps, TempReading{Name: "CPU Max", Group: "cpu", Celsius: cpuTemp})
|
|
}
|
|
}
|
|
|
|
// System power — returns 0 if unavailable
|
|
s.PowerW = sampleSystemPower()
|
|
|
|
// Per-PSU power — populated when IPMI SDR has Power Supply entities with Watt readings
|
|
s.PSUs = samplePSUPower()
|
|
|
|
// CPU load — from /proc/stat
|
|
s.CPULoadPct = sampleCPULoadPct()
|
|
|
|
// Memory load — from /proc/meminfo
|
|
s.MemLoadPct = sampleMemLoadPct()
|
|
|
|
return s
|
|
}
|
|
|
|
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
|
// the overall CPU utilisation percentage.
|
|
func sampleCPULoadPct() float64 {
|
|
total0, idle0 := readCPUStat()
|
|
if total0 == 0 {
|
|
return 0
|
|
}
|
|
time.Sleep(200 * time.Millisecond)
|
|
total1, idle1 := readCPUStat()
|
|
if total1 == 0 {
|
|
return 0
|
|
}
|
|
return cpuLoadPctBetween(total0, idle0, total1, idle1)
|
|
}
|
|
|
|
// cpuLoadPctBetween converts two cumulative (total, idle) counter pairs into
// the busy percentage for the interval between them.
//
// prevTotal/prevIdle are the earlier snapshot, total/idle the later one. The
// result is clamped to [0, 100].
//
// The counters are unsigned, so each delta is computed only after checking
// that the later value is the larger one; subtracting first would wrap a
// backwards-moving counter into a huge positive number and flip the result.
//   - total not advancing (or going backwards) yields 0: no interval to rate.
//   - idle going backwards is treated as "no idle time elapsed".
func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
	if total <= prevTotal {
		return 0
	}
	elapsed := float64(total - prevTotal)

	var idleDelta float64
	if idle > prevIdle {
		idleDelta = float64(idle - prevIdle)
	}

	pct := (1 - idleDelta/elapsed) * 100
	if pct < 0 {
		return 0
	}
	if pct > 100 {
		return 100
	}
	return pct
}
|
|
|
|
// readCPUStat reads the aggregate "cpu " line of /proc/stat and returns the
// sum of its first ten counters as total and the sum of the 4th and 5th
// counters (idle + iowait) as idle. It returns 0, 0 when the file cannot be
// opened or has no aggregate line. Counters that fail to parse keep whatever
// value strconv.ParseUint returned for them.
//
// NOTE(review): the 9th/10th counters (guest, guest_nice) may already be
// included in user/nice by the kernel, in which case summing all ten fields
// double-counts guest time — confirm before relying on total under
// virtualisation load.
func readCPUStat() (total, idle uint64) {
	file, err := os.Open("/proc/stat")
	if err != nil {
		return 0, 0
	}
	defer file.Close()

	const maxFields = 10
	for scanner := bufio.NewScanner(file); scanner.Scan(); {
		row := scanner.Text()
		if !strings.HasPrefix(row, "cpu ") {
			continue
		}

		// Drop the leading "cpu" label and keep at most maxFields counters.
		var counters [maxFields]uint64
		for i, field := range strings.Fields(row)[1:] {
			if i == maxFields {
				break
			}
			counters[i], _ = strconv.ParseUint(field, 10, 64)
		}

		for _, c := range counters {
			total += c
		}
		idle = counters[3] + counters[4] // idle + iowait
		return total, idle
	}
	return 0, 0
}
|
|
|
|
// sampleMemLoadPct returns memory utilisation as a percentage in [0, 100],
// computed from /proc/meminfo as (MemTotal - available) / MemTotal.
//
// "available" is MemAvailable when the kernel reports it; otherwise it is
// approximated as MemFree + Buffers + Cached, so that a missing key is not
// mistaken for "nothing available" (which would read as 100% load).
//
// It returns 0 when the file cannot be opened or read, when MemTotal is
// missing or zero, or when the available figure is not below MemTotal (which
// also prevents the unsigned subtraction from wrapping around).
func sampleMemLoadPct() float64 {
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		return 0
	}
	defer f.Close()

	vals := map[string]uint64{}
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		fields := strings.Fields(sc.Text())
		if len(fields) < 2 {
			continue
		}
		// Skip unparsable values so that presence in the map means "valid".
		v, err := strconv.ParseUint(fields[1], 10, 64)
		if err != nil {
			continue
		}
		vals[strings.TrimSuffix(fields[0], ":")] = v
	}
	if sc.Err() != nil {
		// A partial read could miss keys and skew the result.
		return 0
	}

	total := vals["MemTotal"]
	if total == 0 {
		return 0
	}

	avail, ok := vals["MemAvailable"]
	if !ok {
		avail = vals["MemFree"] + vals["Buffers"] + vals["Cached"]
	}
	if avail >= total {
		return 0
	}
	return float64(total-avail) / float64(total) * 100
}
|
|
|
|
func hasTempGroup(temps []TempReading, group string) bool {
|
|
for _, t := range temps {
|
|
if t.Group == group {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func sampleLiveTemperatureReadings() []TempReading {
|
|
if temps := sampleLiveTempsViaSensorsJSON(); len(temps) > 0 {
|
|
return temps
|
|
}
|
|
return sampleLiveTempsViaIPMI()
|
|
}
|
|
|
|
func sampleLiveTempsViaSensorsJSON() []TempReading {
|
|
out, err := exec.Command("sensors", "-j").Output()
|
|
if err != nil || len(out) == 0 {
|
|
return nil
|
|
}
|
|
|
|
var doc map[string]map[string]any
|
|
if err := json.Unmarshal(out, &doc); err != nil {
|
|
return nil
|
|
}
|
|
|
|
chips := make([]string, 0, len(doc))
|
|
for chip := range doc {
|
|
chips = append(chips, chip)
|
|
}
|
|
sort.Strings(chips)
|
|
|
|
temps := make([]TempReading, 0, len(chips))
|
|
seen := map[string]struct{}{}
|
|
for _, chip := range chips {
|
|
features := doc[chip]
|
|
featureNames := make([]string, 0, len(features))
|
|
for name := range features {
|
|
featureNames = append(featureNames, name)
|
|
}
|
|
sort.Strings(featureNames)
|
|
for _, name := range featureNames {
|
|
if strings.EqualFold(name, "Adapter") {
|
|
continue
|
|
}
|
|
feature, ok := features[name].(map[string]any)
|
|
if !ok {
|
|
continue
|
|
}
|
|
value, ok := firstTempInputValue(feature)
|
|
if !ok || value <= 0 || value > 150 {
|
|
continue
|
|
}
|
|
group := classifyLiveTempGroup(chip, name)
|
|
if group == "gpu" {
|
|
continue
|
|
}
|
|
label := strings.TrimSpace(name)
|
|
if label == "" {
|
|
continue
|
|
}
|
|
if group == "ambient" {
|
|
label = compactAmbientTempName(chip, label)
|
|
}
|
|
key := group + "\x00" + label
|
|
if _, ok := seen[key]; ok {
|
|
continue
|
|
}
|
|
seen[key] = struct{}{}
|
|
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
|
}
|
|
}
|
|
return temps
|
|
}
|
|
|
|
func sampleLiveTempsViaIPMI() []TempReading {
|
|
out, err := exec.Command("ipmitool", "sdr", "type", "Temperature").Output()
|
|
if err != nil || len(out) == 0 {
|
|
return nil
|
|
}
|
|
var temps []TempReading
|
|
seen := map[string]struct{}{}
|
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
parts := strings.Split(line, "|")
|
|
if len(parts) < 3 {
|
|
continue
|
|
}
|
|
name := strings.TrimSpace(parts[0])
|
|
if name == "" {
|
|
continue
|
|
}
|
|
unit := strings.ToLower(strings.TrimSpace(parts[2]))
|
|
if !strings.Contains(unit, "degrees") {
|
|
continue
|
|
}
|
|
raw := strings.TrimSpace(parts[1])
|
|
if raw == "" || strings.EqualFold(raw, "na") {
|
|
continue
|
|
}
|
|
value, err := strconv.ParseFloat(raw, 64)
|
|
if err != nil || value <= 0 || value > 150 {
|
|
continue
|
|
}
|
|
group := classifyLiveTempGroup("", name)
|
|
if group == "gpu" {
|
|
continue
|
|
}
|
|
label := name
|
|
if group == "ambient" {
|
|
label = compactAmbientTempName("", label)
|
|
}
|
|
key := group + "\x00" + label
|
|
if _, ok := seen[key]; ok {
|
|
continue
|
|
}
|
|
seen[key] = struct{}{}
|
|
temps = append(temps, TempReading{Name: label, Group: group, Celsius: value})
|
|
}
|
|
return temps
|
|
}
|
|
|
|
// firstTempInputValue picks the temperature input value out of one sensor
// feature object.
//
// Only keys whose lower-cased form contains "temp" and ends in "_input" are
// considered, in sorted key order. The first candidate holding a float64, or
// a string that parses as a float, wins. It returns (0, false) when no
// candidate yields a number.
func firstTempInputValue(feature map[string]any) (float64, bool) {
	var candidates []string
	for name := range feature {
		lowered := strings.ToLower(name)
		if strings.Contains(lowered, "temp") && strings.HasSuffix(lowered, "_input") {
			candidates = append(candidates, name)
		}
	}
	sort.Strings(candidates)

	for _, name := range candidates {
		if num, isFloat := feature[name].(float64); isFloat {
			return num, true
		}
		if text, isString := feature[name].(string); isString {
			// An unparsable string is skipped so a later key can still match.
			if num, err := strconv.ParseFloat(text, 64); err == nil {
				return num, true
			}
		}
	}
	return 0, false
}
|
|
|
|
// classifyLiveTempGroup sorts a temperature sensor into "gpu", "cpu" or
// "ambient" by looking for marker substrings in the lower-cased
// "<chip> <name>" text. GPU markers are checked before CPU markers, and
// anything matching neither set is "ambient".
func classifyLiveTempGroup(chip, name string) string {
	haystack := strings.ToLower(strings.TrimSpace(chip + " " + name))

	// mentions reports whether haystack contains any of the given markers.
	mentions := func(markers ...string) bool {
		for _, marker := range markers {
			if strings.Contains(haystack, marker) {
				return true
			}
		}
		return false
	}

	if mentions("gpu", "amdgpu", "nvidia", "adeon") {
		return "gpu"
	}
	if mentions(
		"coretemp",
		"k10temp",
		"zenpower",
		"package id",
		"x86_pkg_temp",
		"tctl",
		"tdie",
		"tccd",
		"cpu",
		"peci",
	) {
		return "cpu"
	}
	return "ambient"
}
|
|
|
|
// compactAmbientTempName builds the display label for an ambient sensor.
// Both inputs are trimmed first. The trimmed name is returned unchanged when
// the chip is blank, equals the name ignoring case, or already appears in the
// name ignoring case; otherwise the label is "<chip> / <name>".
func compactAmbientTempName(chip, name string) string {
	chipLabel := strings.TrimSpace(chip)
	sensorLabel := strings.TrimSpace(name)

	switch {
	case chipLabel == "", strings.EqualFold(chipLabel, sensorLabel):
		return sensorLabel
	case strings.Contains(strings.ToLower(sensorLabel), strings.ToLower(chipLabel)):
		return sensorLabel
	}
	return chipLabel + " / " + sensorLabel
}
|
|
|
|
// samplePSUPower reads per-PSU input power via IPMI SDR.
|
|
// Uses collector.PSUSlotsFromSDR (name-based matching) which works across
|
|
// vendors where PSU sensors may not carry entity ID "10.N".
|
|
// Returns nil when IPMI is unavailable or no PSU Watt sensors exist.
|
|
func samplePSUPower() []PSUReading {
|
|
out, err := exec.Command("ipmitool", "sdr").Output()
|
|
if err != nil || len(out) == 0 {
|
|
return nil
|
|
}
|
|
slots := collector.PSUSlotsFromSDR(string(out))
|
|
if len(slots) == 0 {
|
|
return nil
|
|
}
|
|
// Collect slot keys and sort for stable output.
|
|
keys := make([]int, 0, len(slots))
|
|
for k := range slots {
|
|
n, err := strconv.Atoi(k)
|
|
if err == nil {
|
|
keys = append(keys, n)
|
|
}
|
|
}
|
|
sort.Ints(keys)
|
|
psus := make([]PSUReading, 0, len(keys))
|
|
for _, k := range keys {
|
|
entry := slots[strconv.Itoa(k)]
|
|
// Prefer AC input power; fall back to DC output power.
|
|
var w float64
|
|
if entry.InputW != nil && *entry.InputW > 0 {
|
|
w = *entry.InputW
|
|
} else if entry.OutputW != nil && *entry.OutputW > 0 {
|
|
w = *entry.OutputW
|
|
}
|
|
if w <= 0 {
|
|
continue
|
|
}
|
|
psus = append(psus, PSUReading{Slot: k + 1, Name: fmt.Sprintf("PSU%d", k+1), PowerW: w})
|
|
}
|
|
if len(psus) == 0 {
|
|
return nil
|
|
}
|
|
return psus
|
|
}
|