204 lines
4.5 KiB
Go
204 lines
4.5 KiB
Go
package collector
|
|
|
|
import (
|
|
"bee/audit/internal/schema"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// edacBaseDir is the directory readEDACStats globs for per-controller "mc*"
// entries. It is declared as a variable rather than a constant, presumably so
// tests can point it at a fixture tree (TODO confirm).
var edacBaseDir = "/sys/devices/system/edac/mc"
|
|
|
|
// edacDIMMStats holds what was read from a single EDAC dimm directory: the
// DIMM's label and, when available, its error counters.
type edacDIMMStats struct {
	// Label is the trimmed, non-empty label text read for the DIMM.
	Label string
	// CECount and UECount are the correctable and uncorrectable error
	// counters. Each stays nil when no usable counter file was found.
	CECount, UECount *int64
}
|
|
|
|
func enrichMemoryWithTelemetry(dimms []schema.HardwareMemory, doc sensorsDoc) []schema.HardwareMemory {
|
|
if len(dimms) == 0 {
|
|
return dimms
|
|
}
|
|
|
|
tempByLabel := memoryTempsFromSensors(doc)
|
|
stats := readEDACStats()
|
|
|
|
for i := range dimms {
|
|
labelKeys := dimmMatchKeys(dimms[i].Slot, dimms[i].Location)
|
|
|
|
for _, key := range labelKeys {
|
|
if temp, ok := tempByLabel[key]; ok {
|
|
dimms[i].TemperatureC = &temp
|
|
break
|
|
}
|
|
}
|
|
|
|
for _, key := range labelKeys {
|
|
if stat, ok := stats[key]; ok {
|
|
if stat.CECount != nil {
|
|
dimms[i].CorrectableECCErrorCount = stat.CECount
|
|
}
|
|
if stat.UECount != nil {
|
|
dimms[i].UncorrectableECCErrorCount = stat.UECount
|
|
}
|
|
if stat.UECount != nil && *stat.UECount > 0 {
|
|
dimms[i].DataLossDetected = boolPtr(true)
|
|
status := statusCritical
|
|
dimms[i].Status = &status
|
|
if dimms[i].ErrorDescription == nil {
|
|
dimms[i].ErrorDescription = stringPtr("EDAC reports uncorrectable ECC errors")
|
|
}
|
|
} else if stat.CECount != nil && *stat.CECount > 0 && (dimms[i].Status == nil || *dimms[i].Status == statusOK) {
|
|
status := statusWarning
|
|
dimms[i].Status = &status
|
|
if dimms[i].ErrorDescription == nil {
|
|
dimms[i].ErrorDescription = stringPtr("EDAC reports correctable ECC errors")
|
|
}
|
|
}
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
return dimms
|
|
}
|
|
|
|
func memoryTempsFromSensors(doc sensorsDoc) map[string]float64 {
|
|
out := map[string]float64{}
|
|
if len(doc) == 0 {
|
|
return out
|
|
}
|
|
for chip, features := range doc {
|
|
for featureName, raw := range features {
|
|
feature, ok := raw.(map[string]any)
|
|
if !ok || classifySensorFeature(feature) != "temp" {
|
|
continue
|
|
}
|
|
if !isLikelyMemoryTemp(chip, featureName) {
|
|
continue
|
|
}
|
|
temp, ok := firstFeatureFloat(feature, "_input")
|
|
if !ok {
|
|
continue
|
|
}
|
|
key := canonicalLabel(featureName)
|
|
if key == "" {
|
|
continue
|
|
}
|
|
if _, exists := out[key]; !exists {
|
|
out[key] = temp
|
|
}
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func readEDACStats() map[string]edacDIMMStats {
|
|
out := map[string]edacDIMMStats{}
|
|
mcDirs, err := filepath.Glob(filepath.Join(edacBaseDir, "mc*"))
|
|
if err != nil {
|
|
return out
|
|
}
|
|
sort.Strings(mcDirs)
|
|
for _, mcDir := range mcDirs {
|
|
dimmDirs, err := filepath.Glob(filepath.Join(mcDir, "dimm*"))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
sort.Strings(dimmDirs)
|
|
for _, dimmDir := range dimmDirs {
|
|
stat, ok := readEDACDIMMStats(dimmDir)
|
|
if !ok {
|
|
continue
|
|
}
|
|
key := canonicalLabel(stat.Label)
|
|
if key == "" {
|
|
continue
|
|
}
|
|
out[key] = stat
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func readEDACDIMMStats(dimmDir string) (edacDIMMStats, bool) {
|
|
labelBytes, err := os.ReadFile(filepath.Join(dimmDir, "dimm_label"))
|
|
if err != nil {
|
|
labelBytes, err = os.ReadFile(filepath.Join(dimmDir, "label"))
|
|
if err != nil {
|
|
return edacDIMMStats{}, false
|
|
}
|
|
}
|
|
label := strings.TrimSpace(string(labelBytes))
|
|
if label == "" {
|
|
return edacDIMMStats{}, false
|
|
}
|
|
|
|
stat := edacDIMMStats{Label: label}
|
|
if value, ok := readEDACCount(dimmDir, []string{"dimm_ce_count", "ce_count"}); ok {
|
|
stat.CECount = &value
|
|
}
|
|
if value, ok := readEDACCount(dimmDir, []string{"dimm_ue_count", "ue_count"}); ok {
|
|
stat.UECount = &value
|
|
}
|
|
return stat, true
|
|
}
|
|
|
|
// readEDACCount returns the first usable counter value found in dir.
//
// The file names are tried in the order given. A file is skipped when it
// cannot be read, when its trimmed contents are not a base-10 int64, or when
// the parsed value is negative. The boolean result is false when no file
// yields a usable value.
func readEDACCount(dir string, names []string) (int64, bool) {
	for _, candidate := range names {
		data, readErr := os.ReadFile(filepath.Join(dir, candidate))
		if readErr != nil {
			continue
		}

		count, parseErr := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
		if parseErr != nil || count < 0 {
			continue
		}
		return count, true
	}
	return 0, false
}
|
|
|
|
func dimmMatchKeys(slot, location *string) []string {
|
|
var out []string
|
|
add := func(value *string) {
|
|
key := canonicalLabel(derefString(value))
|
|
if key == "" {
|
|
return
|
|
}
|
|
for _, existing := range out {
|
|
if existing == key {
|
|
return
|
|
}
|
|
}
|
|
out = append(out, key)
|
|
}
|
|
add(slot)
|
|
add(location)
|
|
return out
|
|
}
|
|
|
|
// canonicalLabel normalizes a label for matching: the value is trimmed,
// upper-cased, and stripped of every rune that is not an ASCII letter A-Z or
// digit 0-9. Blank input, or input with no such runes, yields "".
func canonicalLabel(value string) string {
	keepAlnum := func(r rune) rune {
		switch {
		case 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
			return r
		default:
			// A negative result tells strings.Map to drop the rune.
			return -1
		}
	}
	return strings.Map(keepAlnum, strings.ToUpper(strings.TrimSpace(value)))
}
|
|
|
|
// isLikelyMemoryTemp reports whether a sensor looks like a memory-module
// temperature, judged by a case-insensitive search for "dimm" in the chip
// name or the feature name.
//
// The two names are joined with a space so a match cannot straddle them. No
// separate "sodimm" check is needed: any string containing "sodimm" already
// contains "dimm".
func isLikelyMemoryTemp(chip, feature string) bool {
	return strings.Contains(strings.ToLower(chip+" "+feature), "dimm")
}
|
|
|
|
// boolPtr returns a pointer to a freshly allocated bool holding value, for
// filling optional *bool fields from a literal.
func boolPtr(value bool) *bool {
	p := new(bool)
	*p = value
	return p
}
|