Files
bee/audit/internal/collector/memory_telemetry.go
2026-03-15 21:04:53 +03:00

204 lines
4.5 KiB
Go

package collector
import (
"bee/audit/internal/schema"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
)
// edacBaseDir is the root of the EDAC memory-controller sysfs tree that
// readEDACStats scans for mc*/dimm* directories.
// NOTE(review): declared as a var rather than a const, presumably so tests
// can point it at a fixture directory — confirm.
var edacBaseDir = "/sys/devices/system/edac/mc"
// edacDIMMStats carries what was read from a single EDAC dimm* directory.
//
// Label is the trimmed, non-empty DIMM label. CECount and UECount are the
// correctable and uncorrectable error counters; each stays nil when no
// readable, non-negative counter file was found for it.
type edacDIMMStats struct {
	Label            string
	CECount, UECount *int64
}
// enrichMemoryWithTelemetry fills temperature and ECC fields on each DIMM
// from sensor readings and EDAC counters. The slice elements are modified in
// place and the same slice is returned; an empty slice is returned untouched
// without reading any telemetry.
//
// A DIMM is matched through the canonical keys derived from its Slot and
// Location, tried in that order; the first key that hits wins, separately for
// temperature and for EDAC stats.
//
// Status handling for the matched EDAC entry:
//   - uncorrectable count > 0: DataLossDetected is set, Status is forced to
//     statusCritical, and ErrorDescription is filled only if it was nil;
//   - otherwise correctable count > 0: Status becomes statusWarning, but only
//     when it was nil or statusOK, so a worse existing status is kept.
func enrichMemoryWithTelemetry(dimms []schema.HardwareMemory, doc sensorsDoc) []schema.HardwareMemory {
	if len(dimms) == 0 {
		return dimms
	}

	temps := memoryTempsFromSensors(doc)
	edac := readEDACStats()

	for idx := range dimms {
		dimm := &dimms[idx]
		keys := dimmMatchKeys(dimm.Slot, dimm.Location)

		// Temperature: first key with a reading wins.
		for _, k := range keys {
			reading, found := temps[k]
			if !found {
				continue
			}
			dimm.TemperatureC = &reading
			break
		}

		// ECC counters: first key with an EDAC entry wins.
		for _, k := range keys {
			counters, found := edac[k]
			if !found {
				continue
			}

			if counters.CECount != nil {
				dimm.CorrectableECCErrorCount = counters.CECount
			}
			if counters.UECount != nil {
				dimm.UncorrectableECCErrorCount = counters.UECount
			}

			hasUncorrectable := counters.UECount != nil && *counters.UECount > 0
			hasCorrectable := counters.CECount != nil && *counters.CECount > 0
			notYetDegraded := dimm.Status == nil || *dimm.Status == statusOK

			switch {
			case hasUncorrectable:
				dimm.DataLossDetected = boolPtr(true)
				severity := statusCritical
				dimm.Status = &severity
				if dimm.ErrorDescription == nil {
					dimm.ErrorDescription = stringPtr("EDAC reports uncorrectable ECC errors")
				}
			case hasCorrectable && notYetDegraded:
				severity := statusWarning
				dimm.Status = &severity
				if dimm.ErrorDescription == nil {
					dimm.ErrorDescription = stringPtr("EDAC reports correctable ECC errors")
				}
			}
			break
		}
	}

	return dimms
}
// memoryTempsFromSensors builds a map from canonical DIMM label to a
// temperature reading taken from the sensors document.
//
// A feature contributes only if it type-asserts to map[string]any,
// classifySensorFeature reports "temp" for it, isLikelyMemoryTemp accepts the
// chip/feature name pair, firstFeatureFloat yields a value for the "_input"
// suffix, and the feature name canonicalizes to a non-empty key.
//
// When several features canonicalize to the same key the first one seen is
// kept. Chips and feature names are visited in sorted order so that "first"
// is stable across runs; ranging over the maps directly would make the
// winner depend on Go's randomized map iteration order.
//
// The returned map is never nil.
func memoryTempsFromSensors(doc sensorsDoc) map[string]float64 {
	out := map[string]float64{}
	if len(doc) == 0 {
		return out
	}

	chips := make([]string, 0, len(doc))
	for chip := range doc {
		chips = append(chips, chip)
	}
	sort.Strings(chips)

	for _, chip := range chips {
		features := doc[chip]

		featureNames := make([]string, 0, len(features))
		for featureName := range features {
			featureNames = append(featureNames, featureName)
		}
		sort.Strings(featureNames)

		for _, featureName := range featureNames {
			feature, ok := features[featureName].(map[string]any)
			if !ok || classifySensorFeature(feature) != "temp" {
				continue
			}
			if !isLikelyMemoryTemp(chip, featureName) {
				continue
			}
			temp, ok := firstFeatureFloat(feature, "_input")
			if !ok {
				continue
			}
			key := canonicalLabel(featureName)
			if key == "" {
				continue
			}
			// First reading for a label wins; later duplicates are ignored.
			if _, exists := out[key]; !exists {
				out[key] = temp
			}
		}
	}
	return out
}
// readEDACStats walks every mc*/dimm* directory under edacBaseDir and returns
// the per-DIMM stats keyed by the canonical form of each DIMM label.
//
// Directories are visited in sorted path order. Entries that cannot be read
// or whose label canonicalizes to an empty string are skipped. When two DIMMs
// share a canonical label, the one visited later replaces the earlier one.
// A glob failure yields whatever has been collected so far; the result is
// never nil.
func readEDACStats() map[string]edacDIMMStats {
	result := map[string]edacDIMMStats{}

	controllers, globErr := filepath.Glob(filepath.Join(edacBaseDir, "mc*"))
	if globErr != nil {
		return result
	}
	sort.Strings(controllers)

	for _, controller := range controllers {
		slots, slotErr := filepath.Glob(filepath.Join(controller, "dimm*"))
		if slotErr != nil {
			continue
		}
		sort.Strings(slots)

		for _, slotDir := range slots {
			if entry, valid := readEDACDIMMStats(slotDir); valid {
				if label := canonicalLabel(entry.Label); label != "" {
					result[label] = entry
				}
			}
		}
	}

	return result
}
// readEDACDIMMStats reads the label and error counters from one EDAC DIMM
// directory.
//
// The label comes from "dimm_label", falling back to "label" only when the
// former cannot be read. The second return value is false when neither file
// is readable or when the label is empty after trimming whitespace.
// Counters are optional: each pointer is left nil if readEDACCount finds no
// usable value for it.
func readEDACDIMMStats(dimmDir string) (edacDIMMStats, bool) {
	var (
		raw []byte
		err error
	)
	// Stop at the first label file that can be read at all, even if it turns
	// out to be blank.
	for _, name := range []string{"dimm_label", "label"} {
		raw, err = os.ReadFile(filepath.Join(dimmDir, name))
		if err == nil {
			break
		}
	}
	if err != nil {
		return edacDIMMStats{}, false
	}

	trimmed := strings.TrimSpace(string(raw))
	if trimmed == "" {
		return edacDIMMStats{}, false
	}

	result := edacDIMMStats{Label: trimmed}
	if correctable, found := readEDACCount(dimmDir, []string{"dimm_ce_count", "ce_count"}); found {
		result.CECount = &correctable
	}
	if uncorrectable, found := readEDACCount(dimmDir, []string{"dimm_ue_count", "ue_count"}); found {
		result.UECount = &uncorrectable
	}
	return result, true
}
// readEDACCount returns the first usable counter found among the given file
// names inside dir, trying them in order.
//
// A file is usable when it can be read and its whitespace-trimmed content
// parses as a non-negative base-10 int64. Unreadable, unparsable, or negative
// candidates are skipped in favor of the next name. It returns (0, false)
// when no candidate qualifies.
func readEDACCount(dir string, names []string) (int64, bool) {
	for idx := 0; idx < len(names); idx++ {
		data, readErr := os.ReadFile(filepath.Join(dir, names[idx]))
		if readErr != nil {
			continue
		}
		text := strings.TrimSpace(string(data))
		count, parseErr := strconv.ParseInt(text, 10, 64)
		if parseErr != nil || count < 0 {
			continue
		}
		return count, true
	}
	return 0, false
}
// dimmMatchKeys returns the canonical lookup keys for a DIMM, derived from
// its slot and then its location. Values that canonicalize to an empty
// string are dropped and duplicates are suppressed, so the result holds zero,
// one, or two keys with the slot-derived key first. The result is nil when
// nothing qualifies.
func dimmMatchKeys(slot, location *string) []string {
	var keys []string

candidates:
	for _, candidate := range []*string{slot, location} {
		key := canonicalLabel(derefString(candidate))
		if key == "" {
			continue
		}
		for _, seen := range keys {
			if seen == key {
				continue candidates
			}
		}
		keys = append(keys, key)
	}

	return keys
}
// canonicalLabel normalizes a DIMM or sensor label for matching: the input is
// trimmed, upper-cased, and then stripped of every rune that is not an ASCII
// letter A-Z or digit 0-9. Labels such as "dimm_a1" and "DIMM A1" therefore
// compare equal. The result is empty when nothing survives the filter.
func canonicalLabel(value string) string {
	keepAlnum := func(r rune) rune {
		switch {
		case r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
			return r
		default:
			// A negative result tells strings.Map to drop the rune.
			return -1
		}
	}
	return strings.Map(keepAlnum, strings.ToUpper(strings.TrimSpace(value)))
}
// isLikelyMemoryTemp reports whether a sensor looks like a memory-module
// temperature, judged by a case-insensitive search for "dimm" in the chip
// name and feature name joined by a space.
//
// "sodimm" needs no separate check: any string containing it also contains
// "dimm".
func isLikelyMemoryTemp(chip, feature string) bool {
	return strings.Contains(strings.ToLower(chip+" "+feature), "dimm")
}
// boolPtr returns a pointer to a fresh bool holding value, for populating
// optional *bool fields. Each call yields a distinct pointer.
func boolPtr(value bool) *bool {
	p := new(bool)
	*p = value
	return p
}