Align hardware export with ingest contract
This commit is contained in:
203
audit/internal/collector/memory_telemetry.go
Normal file
203
audit/internal/collector/memory_telemetry.go
Normal file
@@ -0,0 +1,203 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// edacBaseDir is the sysfs directory that readEDACStats globs for "mc*"
// memory-controller entries.
// NOTE(review): declared as a var rather than a const, presumably so tests can
// point it at a fixture directory — confirm.
var edacBaseDir = "/sys/devices/system/edac/mc"
|
||||
|
||||
// edacDIMMStats holds the per-DIMM values read from one EDAC "dimm*"
// directory. The counters are pointers so that "file missing or unparsable"
// (nil) stays distinguishable from a real count of zero.
type edacDIMMStats struct {
	// Label is the trimmed, non-empty content of the DIMM's label file.
	Label string
	// CECount is the correctable error count, or nil when it could not be read.
	CECount *int64
	// UECount is the uncorrectable error count, or nil when it could not be read.
	UECount *int64
}
|
||||
|
||||
func enrichMemoryWithTelemetry(dimms []schema.HardwareMemory, doc sensorsDoc) []schema.HardwareMemory {
|
||||
if len(dimms) == 0 {
|
||||
return dimms
|
||||
}
|
||||
|
||||
tempByLabel := memoryTempsFromSensors(doc)
|
||||
stats := readEDACStats()
|
||||
|
||||
for i := range dimms {
|
||||
labelKeys := dimmMatchKeys(dimms[i].Slot, dimms[i].Location)
|
||||
|
||||
for _, key := range labelKeys {
|
||||
if temp, ok := tempByLabel[key]; ok {
|
||||
dimms[i].TemperatureC = &temp
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
for _, key := range labelKeys {
|
||||
if stat, ok := stats[key]; ok {
|
||||
if stat.CECount != nil {
|
||||
dimms[i].CorrectableECCErrorCount = stat.CECount
|
||||
}
|
||||
if stat.UECount != nil {
|
||||
dimms[i].UncorrectableECCErrorCount = stat.UECount
|
||||
}
|
||||
if stat.UECount != nil && *stat.UECount > 0 {
|
||||
dimms[i].DataLossDetected = boolPtr(true)
|
||||
status := statusCritical
|
||||
dimms[i].Status = &status
|
||||
if dimms[i].ErrorDescription == nil {
|
||||
dimms[i].ErrorDescription = stringPtr("EDAC reports uncorrectable ECC errors")
|
||||
}
|
||||
} else if stat.CECount != nil && *stat.CECount > 0 && (dimms[i].Status == nil || *dimms[i].Status == statusOK) {
|
||||
status := statusWarning
|
||||
dimms[i].Status = &status
|
||||
if dimms[i].ErrorDescription == nil {
|
||||
dimms[i].ErrorDescription = stringPtr("EDAC reports correctable ECC errors")
|
||||
}
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return dimms
|
||||
}
|
||||
|
||||
func memoryTempsFromSensors(doc sensorsDoc) map[string]float64 {
|
||||
out := map[string]float64{}
|
||||
if len(doc) == 0 {
|
||||
return out
|
||||
}
|
||||
for chip, features := range doc {
|
||||
for featureName, raw := range features {
|
||||
feature, ok := raw.(map[string]any)
|
||||
if !ok || classifySensorFeature(feature) != "temp" {
|
||||
continue
|
||||
}
|
||||
if !isLikelyMemoryTemp(chip, featureName) {
|
||||
continue
|
||||
}
|
||||
temp, ok := firstFeatureFloat(feature, "_input")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
key := canonicalLabel(featureName)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
if _, exists := out[key]; !exists {
|
||||
out[key] = temp
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func readEDACStats() map[string]edacDIMMStats {
|
||||
out := map[string]edacDIMMStats{}
|
||||
mcDirs, err := filepath.Glob(filepath.Join(edacBaseDir, "mc*"))
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
sort.Strings(mcDirs)
|
||||
for _, mcDir := range mcDirs {
|
||||
dimmDirs, err := filepath.Glob(filepath.Join(mcDir, "dimm*"))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(dimmDirs)
|
||||
for _, dimmDir := range dimmDirs {
|
||||
stat, ok := readEDACDIMMStats(dimmDir)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
key := canonicalLabel(stat.Label)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
out[key] = stat
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func readEDACDIMMStats(dimmDir string) (edacDIMMStats, bool) {
|
||||
labelBytes, err := os.ReadFile(filepath.Join(dimmDir, "dimm_label"))
|
||||
if err != nil {
|
||||
labelBytes, err = os.ReadFile(filepath.Join(dimmDir, "label"))
|
||||
if err != nil {
|
||||
return edacDIMMStats{}, false
|
||||
}
|
||||
}
|
||||
label := strings.TrimSpace(string(labelBytes))
|
||||
if label == "" {
|
||||
return edacDIMMStats{}, false
|
||||
}
|
||||
|
||||
stat := edacDIMMStats{Label: label}
|
||||
if value, ok := readEDACCount(dimmDir, []string{"dimm_ce_count", "ce_count"}); ok {
|
||||
stat.CECount = &value
|
||||
}
|
||||
if value, ok := readEDACCount(dimmDir, []string{"dimm_ue_count", "ue_count"}); ok {
|
||||
stat.UECount = &value
|
||||
}
|
||||
return stat, true
|
||||
}
|
||||
|
||||
// readEDACCount returns the first usable counter found among the files named
// in names inside dir, trying them in the given order.
//
// A file is usable when it can be read and its whitespace-trimmed content
// parses as a non-negative base-10 int64. Unreadable, unparsable or negative
// candidates are skipped. The boolean result is false when no candidate
// qualifies, in which case the count is 0.
func readEDACCount(dir string, names []string) (int64, bool) {
	for _, candidate := range names {
		data, readErr := os.ReadFile(filepath.Join(dir, candidate))
		if readErr != nil {
			continue
		}

		count, parseErr := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
		if parseErr != nil || count < 0 {
			continue
		}
		return count, true
	}
	return 0, false
}
|
||||
|
||||
func dimmMatchKeys(slot, location *string) []string {
|
||||
var out []string
|
||||
add := func(value *string) {
|
||||
key := canonicalLabel(derefString(value))
|
||||
if key == "" {
|
||||
return
|
||||
}
|
||||
for _, existing := range out {
|
||||
if existing == key {
|
||||
return
|
||||
}
|
||||
}
|
||||
out = append(out, key)
|
||||
}
|
||||
add(slot)
|
||||
add(location)
|
||||
return out
|
||||
}
|
||||
|
||||
// canonicalLabel normalises a label for matching: surrounding whitespace is
// trimmed, the text is upper-cased, and every rune other than A-Z and 0-9 is
// removed. Labels that contain no such characters yield the empty string.
func canonicalLabel(value string) string {
	upper := strings.ToUpper(strings.TrimSpace(value))
	return strings.Map(func(r rune) rune {
		switch {
		case r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
			return r
		default:
			// A negative result tells strings.Map to drop the rune.
			return -1
		}
	}, upper)
}
|
||||
|
||||
// isLikelyMemoryTemp reports whether a sensor looks like a memory-module
// temperature, judged by a case-insensitive search for "dimm" in the chip name
// or the feature name.
//
// A separate "sodimm" check is unnecessary: any name containing "sodimm" also
// contains "dimm", so SO-DIMM sensors are already covered.
func isLikelyMemoryTemp(chip, feature string) bool {
	// The space keeps a match from straddling the chip/feature boundary.
	return strings.Contains(strings.ToLower(chip+" "+feature), "dimm")
}
|
||||
|
||||
// boolPtr returns a pointer to a freshly allocated bool holding value, for
// populating optional *bool fields.
func boolPtr(value bool) *bool {
	ptr := new(bool)
	*ptr = value
	return ptr
}
|
||||
Reference in New Issue
Block a user