Align hardware export with ingest contract

This commit is contained in:
Mikhail Chusavitin
2026-03-15 21:04:53 +03:00
parent b8c235b5ac
commit ab5a4be7ac
37 changed files with 3304 additions and 354 deletions

View File

@@ -317,38 +317,20 @@ func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, erro
}
func (a *App) HealthSummaryResult() ActionResult {
type auditFile struct {
Hardware struct {
Summary struct {
Status string `json:"status"`
Warnings []string `json:"warnings"`
Failures []string `json:"failures"`
StorageWarn int `json:"storage_warn"`
StorageFail int `json:"storage_fail"`
PCIeWarn int `json:"pcie_warn"`
PCIeFail int `json:"pcie_fail"`
PSUWarn int `json:"psu_warn"`
PSUFail int `json:"psu_fail"`
MemoryWarn int `json:"memory_warn"`
MemoryFail int `json:"memory_fail"`
} `json:"summary"`
} `json:"hardware"`
}
raw, err := os.ReadFile(DefaultAuditJSONPath)
if err != nil {
return ActionResult{Title: "Health summary", Body: "No audit JSON found."}
}
var snapshot auditFile
var snapshot schema.HardwareIngestRequest
if err := json.Unmarshal(raw, &snapshot); err != nil {
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
}
summary := snapshot.Hardware.Summary
summary := collector.BuildHealthSummary(snapshot.Hardware)
var body strings.Builder
status := summary.Status
if status == "" {
status = "UNKNOWN"
status = "Unknown"
}
fmt.Fprintf(&body, "Overall: %s\n", status)
fmt.Fprintf(&body, "Storage: warn=%d fail=%d\n", summary.StorageWarn, summary.StorageFail)
@@ -662,12 +644,12 @@ func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
}
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
class := strings.ToLower(trimPtr(dev.DeviceClass))
class := trimPtr(dev.DeviceClass)
model := strings.ToLower(trimPtr(dev.Model))
vendor := strings.ToLower(trimPtr(dev.Manufacturer))
return strings.Contains(class, "vga") ||
strings.Contains(class, "3d") ||
strings.Contains(class, "display") ||
return class == "VideoController" ||
class == "DisplayController" ||
class == "ProcessingAccelerator" ||
strings.Contains(model, "nvidia") ||
strings.Contains(vendor, "nvidia") ||
strings.Contains(vendor, "amd")

View File

@@ -371,8 +371,6 @@ func TestFormatSATSummary(t *testing.T) {
}
func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
oldAuditPath := DefaultAuditJSONPath
oldSATBaseDir := DefaultSATBaseDir
@@ -386,7 +384,7 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
t.Fatalf("mkdir sat dir: %v", err)
}
raw := `{"hardware":{"summary":{"status":"WARNING","storage_warn":1,"storage_fail":0,"pcie_warn":0,"pcie_fail":0,"psu_warn":0,"psu_fail":0,"memory_warn":0,"memory_fail":0}}}`
raw := `{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"serial_number":"DISK1","status":"Warning"}]}}`
if err := os.WriteFile(DefaultAuditJSONPath, []byte(raw), 0644); err != nil {
t.Fatalf("write audit json: %v", err)
}
@@ -401,8 +399,6 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
}
func TestMainBanner(t *testing.T) {
t.Parallel()
tmp := t.TempDir()
oldAuditPath := DefaultAuditJSONPath
DefaultAuditJSONPath = filepath.Join(tmp, "audit.json")
@@ -413,7 +409,7 @@ func TestMainBanner(t *testing.T) {
product := "PowerEdge R760"
cpuModel := "Intel Xeon Gold 6430"
memoryType := "DDR5"
gpuClass := "VGA compatible controller"
gpuClass := "VideoController"
gpuModel := "NVIDIA H100"
payload := schema.HardwareIngestRequest{

View File

@@ -7,13 +7,15 @@ import (
"bee/audit/internal/runtimeenv"
"bee/audit/internal/schema"
"log/slog"
"os"
"time"
)
// Run executes all collectors and returns the combined snapshot.
// Partial failures are logged as warnings; collection always completes.
func Run(runtimeMode runtimeenv.Mode) schema.HardwareIngestRequest {
func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
start := time.Now()
collectedAt := time.Now().UTC().Format(time.RFC3339)
slog.Info("audit started")
snap := schema.HardwareSnapshot{}
@@ -27,27 +29,38 @@ func Run(runtimeMode runtimeenv.Mode) schema.HardwareIngestRequest {
snap.Firmware = append(snap.Firmware, cpuFW...)
snap.Memory = collectMemory()
sensorDoc, err := readSensorsJSONDoc()
if err != nil {
slog.Info("sensors: unavailable for enrichment", "err", err)
}
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
snap.Storage = collectStorage()
snap.PCIeDevices = collectPCIe()
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices, snap.Board.SerialNumber)
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
snap.PowerSupplies = collectPSUs()
snap.Summary = buildHealthSummary(snap)
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
snap.Sensors = buildSensorsFromDoc(sensorDoc)
finalizeSnapshot(&snap, collectedAt)
// remaining collectors added in steps 1.8–1.10
slog.Info("audit completed", "duration", time.Since(start).Round(time.Millisecond))
sourceType := string(runtimeMode)
protocol := "os-direct"
sourceType := "manual"
var targetHost *string
if hostname, err := os.Hostname(); err == nil && hostname != "" {
targetHost = &hostname
}
return schema.HardwareIngestRequest{
SourceType: &sourceType,
Protocol: &protocol,
CollectedAt: time.Now().UTC().Format(time.RFC3339),
TargetHost: targetHost,
CollectedAt: collectedAt,
Hardware: snap,
}
}

View File

@@ -0,0 +1,64 @@
package collector
import "strings"
// Component status strings shared by the collectors in this package.
const (
statusOK = "OK"
statusWarning = "Warning"
statusCritical = "Critical"
statusUnknown = "Unknown"
// statusEmpty marks an unpopulated socket or slot; parseCPUSection and
// filterMemory drop components that carry it.
statusEmpty = "Empty"
)
// mapPCIeDeviceClass converts a free-form PCI class description (for example
// "Ethernet controller" or "VGA compatible controller") into the CamelCase
// class names that isNICClass, isGPUClass and isRAIDClass recognise.
//
// Matching is case-insensitive and substring based, and the first matching
// rule wins. The "raid"/"mass storage" rule is evaluated before the generic
// "storage controller" rule because "Mass storage controller" contains
// "storage controller" and would otherwise never reach its own rule.
//
// Blank input yields "". A description that matches no rule is returned with
// surrounding whitespace trimmed but otherwise unchanged.
func mapPCIeDeviceClass(raw string) string {
	trimmed := strings.TrimSpace(raw)
	normalized := strings.ToLower(trimmed)
	switch {
	case normalized == "":
		return ""
	case strings.Contains(normalized, "ethernet controller"):
		return "EthernetController"
	case strings.Contains(normalized, "fibre channel"):
		return "FibreChannelController"
	case strings.Contains(normalized, "network controller"), strings.Contains(normalized, "infiniband controller"):
		return "NetworkController"
	case strings.Contains(normalized, "raid"), strings.Contains(normalized, "mass storage"):
		// Must precede the generic storage rule below (see function comment).
		return "MassStorageController"
	case strings.Contains(normalized, "serial attached scsi"), strings.Contains(normalized, "storage controller"):
		return "StorageController"
	case strings.Contains(normalized, "display controller"):
		return "DisplayController"
	case strings.Contains(normalized, "vga"), strings.Contains(normalized, "3d controller"), strings.Contains(normalized, "video controller"):
		return "VideoController"
	case strings.Contains(normalized, "processing accelerator"):
		// The singular form also matches the plural "processing accelerators".
		return "ProcessingAccelerator"
	default:
		return trimmed
	}
}
// isNICClass reports whether class, ignoring surrounding whitespace, is one of
// the mapped network classes: "EthernetController" or "NetworkController".
// The comparison is case-sensitive.
func isNICClass(class string) bool {
	name := strings.TrimSpace(class)
	return name == "EthernetController" || name == "NetworkController"
}
// isGPUClass reports whether class, ignoring surrounding whitespace, is one of
// "VideoController", "DisplayController" or "ProcessingAccelerator".
// The comparison is case-sensitive.
func isGPUClass(class string) bool {
	name := strings.TrimSpace(class)
	return name == "VideoController" ||
		name == "DisplayController" ||
		name == "ProcessingAccelerator"
}
// isRAIDClass reports whether class, ignoring surrounding whitespace, is one of
// the mapped storage-controller classes: "MassStorageController" or
// "StorageController". The comparison is case-sensitive.
func isRAIDClass(class string) bool {
	name := strings.TrimSpace(class)
	return name == "MassStorageController" || name == "StorageController"
}

View File

@@ -51,12 +51,14 @@ func parseCPUs(output, boardSerial string) []schema.HardwareCPU {
// Returns false if the socket is unpopulated.
func parseCPUSection(fields map[string]string, boardSerial string) (schema.HardwareCPU, bool) {
status := parseCPUStatus(fields["Status"])
if status == "EMPTY" {
if status == statusEmpty {
return schema.HardwareCPU{}, false
}
cpu := schema.HardwareCPU{}
cpu.Status = &status
present := true
cpu.Present = &present
if socket, ok := parseSocketIndex(fields["Socket Designation"]); ok {
cpu.Socket = &socket
@@ -99,15 +101,15 @@ func parseCPUStatus(raw string) string {
upper := strings.ToUpper(raw)
switch {
case upper == "" || upper == "UNKNOWN":
return "UNKNOWN"
return statusUnknown
case strings.Contains(upper, "UNPOPULATED") || strings.Contains(upper, "NOT POPULATED"):
return "EMPTY"
return statusEmpty
case strings.Contains(upper, "ENABLED"):
return "OK"
return statusOK
case strings.Contains(upper, "DISABLED"):
return "WARNING"
return statusWarning
default:
return "UNKNOWN"
return statusUnknown
}
}

View File

@@ -0,0 +1,196 @@
package collector
import (
"bee/audit/internal/schema"
"os"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
)
var (
// cpuSysBaseDir is the directory scanned for cpuN entries when reading
// topology and thermal-throttle counters. It is a variable so tests can
// point it at a temporary tree.
cpuSysBaseDir = "/sys/devices/system/cpu"
// socketIndexRe captures the number that follows "package id", "socket" or
// "cpu" (case-insensitive, optional whitespace in between).
socketIndexRe = regexp.MustCompile(`(?i)(?:package id|socket|cpu)\s*([0-9]+)`)
)
// enrichCPUsWithTelemetry fills TemperatureC, PowerW and Throttled on each CPU
// for which a per-socket reading is available, and returns the same slice.
//
// Temperature and power come from the sensors document; the throttle flag
// comes from cpuThrottleBySocket. A CPU whose Socket is nil is looked up as
// socket 0. Fields without a matching reading are left untouched.
func enrichCPUsWithTelemetry(cpus []schema.HardwareCPU, doc sensorsDoc) []schema.HardwareCPU {
	count := len(cpus)
	if count == 0 {
		return cpus
	}

	temps := cpuTempsFromSensors(doc, count)
	watts := cpuPowerFromSensors(doc, count)
	throttled := cpuThrottleBySocket()

	for idx := range cpus {
		cpu := &cpus[idx]

		var socket int
		if cpu.Socket != nil {
			socket = *cpu.Socket
		}

		if celsius, found := temps[socket]; found {
			cpu.TemperatureC = &celsius
		}
		if power, found := watts[socket]; found {
			cpu.PowerW = &power
		}
		if flag, found := throttled[socket]; found {
			cpu.Throttled = &flag
		}
	}
	return cpus
}
// cpuTempsFromSensors returns one temperature reading per CPU socket taken
// from the sensors document.
//
// Only features classified as "temp" that expose an "_input" value are
// considered. A reading is attributed to a socket when detectCPUSocket finds a
// socket number in the chip or feature name. If no reading could be attributed
// and cpuCount is 1, the first reading that isLikelyCPUTemp accepts is used
// for socket 0.
func cpuTempsFromSensors(doc sensorsDoc, cpuCount int) map[int]float64 {
	read := func(feature map[string]any) (float64, bool) {
		if classifySensorFeature(feature) != "temp" {
			return 0, false
		}
		return firstFeatureFloat(feature, "_input")
	}
	return cpuReadingsBySocket(doc, cpuCount, read, isLikelyCPUTemp)
}

// cpuPowerFromSensors returns one power reading per CPU socket taken from the
// sensors document.
//
// Only features classified as "power" are considered, and the value is looked
// up with firstFeatureFloatWithContains using the "power" substring. Socket
// attribution and the single-CPU fallback work as in cpuTempsFromSensors,
// with isLikelyCPUPower selecting fallback candidates.
func cpuPowerFromSensors(doc sensorsDoc, cpuCount int) map[int]float64 {
	read := func(feature map[string]any) (float64, bool) {
		if classifySensorFeature(feature) != "power" {
			return 0, false
		}
		return firstFeatureFloatWithContains(feature, []string{"power"})
	}
	return cpuReadingsBySocket(doc, cpuCount, read, isLikelyCPUPower)
}

// cpuReadingsBySocket is the shared walk behind cpuTempsFromSensors and
// cpuPowerFromSensors.
//
// read extracts the value from a feature (returning false to skip it) and
// likelyCPU decides whether an unattributed reading may serve as the
// single-CPU fallback. Chips and features are visited in sorted name order:
// Go randomises map iteration, and both the "first reading per socket wins"
// rule and the fallback choice depend on visiting order, so sorting keeps the
// result stable between runs.
func cpuReadingsBySocket(
	doc sensorsDoc,
	cpuCount int,
	read func(feature map[string]any) (float64, bool),
	likelyCPU func(chip, feature string) bool,
) map[int]float64 {
	out := map[int]float64{}
	if len(doc) == 0 {
		return out
	}

	chips := make([]string, 0, len(doc))
	for chip := range doc {
		chips = append(chips, chip)
	}
	sort.Strings(chips)

	var fallback []float64
	for _, chip := range chips {
		features := doc[chip]
		names := make([]string, 0, len(features))
		for name := range features {
			names = append(names, name)
		}
		sort.Strings(names)

		for _, name := range names {
			feature, ok := features[name].(map[string]any)
			if !ok {
				continue
			}
			value, ok := read(feature)
			if !ok {
				continue
			}
			if socket, ok := detectCPUSocket(chip, name); ok {
				// Keep the first reading seen for a socket.
				if _, exists := out[socket]; !exists {
					out[socket] = value
				}
				continue
			}
			if likelyCPU(chip, name) {
				fallback = append(fallback, value)
			}
		}
	}

	// Without socket numbers a reading can only be attributed safely when
	// there is exactly one CPU.
	if len(out) == 0 && cpuCount == 1 && len(fallback) > 0 {
		out[0] = fallback[0]
	}
	return out
}
// detectCPUSocket scans parts in order and returns the socket number captured
// by socketIndexRe from the first part that yields a parseable integer.
// It returns (0, false) when no part contains a socket number.
func detectCPUSocket(parts ...string) (int, bool) {
	for _, candidate := range parts {
		groups := socketIndexRe.FindStringSubmatch(strings.ToLower(candidate))
		if len(groups) != 2 {
			continue
		}
		if socket, err := strconv.Atoi(groups[1]); err == nil {
			return socket, true
		}
	}
	return 0, false
}
// isLikelyCPUTemp reports whether the lower-cased "chip feature" text contains
// any of the substrings this package treats as a CPU temperature marker.
func isLikelyCPUTemp(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	markers := []string{"coretemp", "k10temp", "package id", "tdie", "tctl", "cpu temp"}
	for _, marker := range markers {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
// isLikelyCPUPower reports whether the lower-cased "chip feature" text contains
// any of the substrings this package treats as a CPU power marker.
func isLikelyCPUPower(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	markers := []string{"intel-rapl", "package id", "package-", "cpu power"}
	for _, marker := range markers {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
// cpuThrottleBySocket scans the cpuN directories under cpuSysBaseDir and
// returns a map holding true for every socket that has at least one CPU with a
// positive throttle counter. Sockets without a throttled CPU are absent from
// the map rather than mapped to false.
func cpuThrottleBySocket() map[int]bool {
	throttled := map[int]bool{}

	pattern := filepath.Join(cpuSysBaseDir, "cpu[0-9]*")
	dirs, err := filepath.Glob(pattern)
	if err != nil {
		return throttled
	}
	sort.Strings(dirs)

	for _, dir := range dirs {
		if socket, ok := readSocketIndex(dir); ok && cpuPackageThrottled(dir) {
			throttled[socket] = true
		}
	}
	return throttled
}
// readSocketIndex reads topology/physical_package_id below cpuDir and returns
// it as a socket index. It returns (0, false) when the file cannot be read,
// does not hold an integer, or holds a negative value.
func readSocketIndex(cpuDir string) (int, bool) {
	idPath := filepath.Join(cpuDir, "topology", "physical_package_id")
	data, readErr := os.ReadFile(idPath)
	if readErr != nil {
		return 0, false
	}

	pkg, convErr := strconv.Atoi(strings.TrimSpace(string(data)))
	if convErr == nil && pkg >= 0 {
		return pkg, true
	}
	return 0, false
}
// cpuPackageThrottled reports whether either thermal_throttle counter below
// cpuDir (package_throttle_count, then core_throttle_count) holds a value
// greater than zero. Counters that are missing or unparseable are ignored.
func cpuPackageThrottled(cpuDir string) bool {
	throttleDir := filepath.Join(cpuDir, "thermal_throttle")
	for _, counter := range []string{"package_throttle_count", "core_throttle_count"} {
		data, err := os.ReadFile(filepath.Join(throttleDir, counter))
		if err != nil {
			continue
		}
		count, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
		if err != nil || count <= 0 {
			continue
		}
		return true
	}
	return false
}

View File

@@ -0,0 +1,71 @@
package collector
import (
"os"
"path/filepath"
"testing"
"bee/audit/internal/schema"
)
// TestEnrichCPUsWithTelemetry builds a fake two-socket sysfs tree plus a
// sensors document and checks that temperature, power and throttle state land
// on the matching CPU.
func TestEnrichCPUsWithTelemetry(t *testing.T) {
	sysRoot := t.TempDir()
	savedBase := cpuSysBaseDir
	cpuSysBaseDir = sysRoot
	t.Cleanup(func() { cpuSysBaseDir = savedBase })

	// cpu0 sits on socket 0 and has throttled; cpu1 sits on socket 1 and has not.
	sysfs := func(content string, elems ...string) {
		mustWriteFile(t, filepath.Join(append([]string{sysRoot}, elems...)...), content)
	}
	sysfs("0\n", "cpu0", "topology", "physical_package_id")
	sysfs("3\n", "cpu0", "thermal_throttle", "package_throttle_count")
	sysfs("1\n", "cpu1", "topology", "physical_package_id")
	sysfs("0\n", "cpu1", "thermal_throttle", "package_throttle_count")

	doc := sensorsDoc{
		"coretemp-isa-0000": {
			"Package id 0": map[string]any{"temp1_input": 61.5},
			"Package id 1": map[string]any{"temp2_input": 58.0},
		},
		"intel-rapl-mmio-0": {
			"Package id 0": map[string]any{"power1_average": 180.0},
			"Package id 1": map[string]any{"power2_average": 175.0},
		},
	}

	firstSocket, secondSocket := 0, 1
	okText := statusOK
	healthy := schema.HardwareComponentStatus{Status: &okText}
	cpus := []schema.HardwareCPU{
		{Socket: &firstSocket, HardwareComponentStatus: healthy},
		{Socket: &secondSocket, HardwareComponentStatus: healthy},
	}

	got := enrichCPUsWithTelemetry(cpus, doc)
	cpu0, cpu1 := got[0], got[1]

	if cpu0.TemperatureC == nil || *cpu0.TemperatureC != 61.5 {
		t.Fatalf("cpu0 temperature mismatch: %#v", cpu0.TemperatureC)
	}
	if cpu0.PowerW == nil || *cpu0.PowerW != 180.0 {
		t.Fatalf("cpu0 power mismatch: %#v", cpu0.PowerW)
	}
	if cpu0.Throttled == nil || !*cpu0.Throttled {
		t.Fatalf("cpu0 throttled mismatch: %#v", cpu0.Throttled)
	}

	if cpu1.TemperatureC == nil || *cpu1.TemperatureC != 58.0 {
		t.Fatalf("cpu1 temperature mismatch: %#v", cpu1.TemperatureC)
	}
	if cpu1.PowerW == nil || *cpu1.PowerW != 175.0 {
		t.Fatalf("cpu1 power mismatch: %#v", cpu1.PowerW)
	}
	// An unset flag and an explicit false are both acceptable for cpu1.
	if cpu1.Throttled != nil && *cpu1.Throttled {
		t.Fatalf("cpu1 throttled mismatch: %#v", cpu1.Throttled)
	}
}
// mustWriteFile writes content to path, creating parent directories first, and
// fails the test immediately if either step errors.
func mustWriteFile(t *testing.T, path, content string) {
	t.Helper()

	parent := filepath.Dir(path)
	if mkErr := os.MkdirAll(parent, 0755); mkErr != nil {
		t.Fatalf("mkdir %s: %v", path, mkErr)
	}

	writeErr := os.WriteFile(path, []byte(content), 0644)
	if writeErr != nil {
		t.Fatalf("write %s: %v", path, writeErr)
	}
}

View File

@@ -69,12 +69,12 @@ func TestParseCPUStatus(t *testing.T) {
want string
}{
{"Populated, Enabled", "OK"},
{"Populated, Disabled By User", "WARNING"},
{"Populated, Disabled By BIOS", "WARNING"},
{"Unpopulated", "EMPTY"},
{"Not Populated", "EMPTY"},
{"Unknown", "UNKNOWN"},
{"", "UNKNOWN"},
{"Populated, Disabled By User", statusWarning},
{"Populated, Disabled By BIOS", statusWarning},
{"Unpopulated", statusEmpty},
{"Not Populated", statusEmpty},
{"Unknown", statusUnknown},
{"", statusUnknown},
}
for _, tt := range tests {
got := parseCPUStatus(tt.input)

View File

@@ -0,0 +1,179 @@
package collector
import (
"bee/audit/internal/schema"
"fmt"
)
// finalizeSnapshot post-processes a collected snapshot in place: it drops
// memory, storage and PSU entries rejected by the filter functions, stamps
// StatusCheckedAt with collectedAt on components that report a status, and
// finally replaces repeated serial numbers with placeholders.
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
	// The three filters touch disjoint slices, so their order is irrelevant.
	snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
	snap.Storage = filterStorage(snap.Storage)
	snap.Memory = filterMemory(snap.Memory)

	// Stamp and de-duplicate only what survived filtering.
	setComponentStatusMetadata(snap, collectedAt)
	deduplicateComponentSerials(snap)
}
// filterMemory returns a new slice containing only the DIMMs that are not
// marked absent, do not carry statusEmpty, and have a non-empty serial number.
func filterMemory(dimms []schema.HardwareMemory) []schema.HardwareMemory {
	kept := make([]schema.HardwareMemory, 0, len(dimms))
	for idx := range dimms {
		dimm := dimms[idx]

		absent := dimm.Present != nil && !*dimm.Present
		emptySlot := dimm.Status != nil && *dimm.Status == statusEmpty
		noSerial := dimm.SerialNumber == nil || *dimm.SerialNumber == ""
		if absent || emptySlot || noSerial {
			continue
		}
		kept = append(kept, dimm)
	}
	return kept
}
// filterStorage returns a new slice containing only the disks that have a
// non-empty serial number.
func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
	kept := make([]schema.HardwareStorage, 0, len(disks))
	for idx := range disks {
		if sn := disks[idx].SerialNumber; sn != nil && *sn != "" {
			kept = append(kept, disks[idx])
		}
	}
	return kept
}
// filterPSUs returns a new slice containing only the power supplies that have
// a non-empty serial number.
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
	kept := make([]schema.HardwarePowerSupply, 0, len(psus))
	for idx := range psus {
		if sn := psus[idx].SerialNumber; sn != nil && *sn != "" {
			kept = append(kept, psus[idx])
		}
	}
	return kept
}
// setComponentStatusMetadata applies setStatusCheckedAt to the status block of
// every CPU, DIMM, disk, PCIe device and power supply in the snapshot.
func setComponentStatusMetadata(snap *schema.HardwareSnapshot, collectedAt string) {
	total := len(snap.CPUs) + len(snap.Memory) + len(snap.Storage) +
		len(snap.PCIeDevices) + len(snap.PowerSupplies)
	statuses := make([]*schema.HardwareComponentStatus, 0, total)

	for idx := range snap.CPUs {
		statuses = append(statuses, &snap.CPUs[idx].HardwareComponentStatus)
	}
	for idx := range snap.Memory {
		statuses = append(statuses, &snap.Memory[idx].HardwareComponentStatus)
	}
	for idx := range snap.Storage {
		statuses = append(statuses, &snap.Storage[idx].HardwareComponentStatus)
	}
	for idx := range snap.PCIeDevices {
		statuses = append(statuses, &snap.PCIeDevices[idx].HardwareComponentStatus)
	}
	for idx := range snap.PowerSupplies {
		statuses = append(statuses, &snap.PowerSupplies[idx].HardwareComponentStatus)
	}

	for _, status := range statuses {
		setStatusCheckedAt(status, collectedAt)
	}
}
// setStatusCheckedAt records collectedAt as the status check time when the
// component reports a non-empty status and no check time has been set yet.
// A nil status block, a missing or empty Status, or an existing
// StatusCheckedAt leaves the block untouched.
func setStatusCheckedAt(status *schema.HardwareComponentStatus, collectedAt string) {
	hasStatus := status != nil && status.Status != nil && *status.Status != ""
	if !hasStatus || status.StatusCheckedAt != nil {
		return
	}
	status.StatusCheckedAt = &collectedAt
}
// deduplicateComponentSerials replaces repeated serial numbers inside each
// component list of the snapshot. Every list is processed independently.
func deduplicateComponentSerials(snap *schema.HardwareSnapshot) {
	deduplicateCPUSerials(snap.CPUs)
	deduplicateMemorySerials(snap.Memory)
	deduplicateStorageSerials(snap.Storage)
	deduplicatePCIeSerials(snap.PCIeDevices)
	deduplicatePSUSerials(snap.PowerSupplies)
}

// serialDeduper tracks the (model, serial) pairs already seen within one
// component list and hands out placeholder serials for repeats.
type serialDeduper struct {
	seen map[string]struct{}
	next int // sequence number of the next placeholder, starting at 1
}

// newSerialDeduper returns a deduper with no pairs recorded.
func newSerialDeduper() *serialDeduper {
	return &serialDeduper{seen: map[string]struct{}{}, next: 1}
}

// resolve returns the serial pointer a component should keep.
//
// A nil or empty serial is returned unchanged and is not recorded. The first
// occurrence of a (model, serial) pair keeps its serial; each later occurrence
// receives a fresh "NO_SN-%08d" placeholder.
func (d *serialDeduper) resolve(model string, serial *string) *string {
	if serial == nil || *serial == "" {
		return serial
	}
	// NUL cannot be confused with ordinary model or serial text.
	key := model + "\x00" + *serial
	if _, dup := d.seen[key]; !dup {
		d.seen[key] = struct{}{}
		return serial
	}
	repl := fmt.Sprintf("NO_SN-%08d", d.next)
	d.next++
	return &repl
}

// deduplicateCPUSerials rewrites repeated (Model, SerialNumber) pairs among
// CPUs in place; see serialDeduper.resolve for the rules.
func deduplicateCPUSerials(items []schema.HardwareCPU) {
	dedup := newSerialDeduper()
	for i := range items {
		items[i].SerialNumber = dedup.resolve(derefString(items[i].Model), items[i].SerialNumber)
	}
}

// deduplicateMemorySerials rewrites repeated (PartNumber, SerialNumber) pairs
// among DIMMs in place; see serialDeduper.resolve for the rules.
func deduplicateMemorySerials(items []schema.HardwareMemory) {
	dedup := newSerialDeduper()
	for i := range items {
		items[i].SerialNumber = dedup.resolve(derefString(items[i].PartNumber), items[i].SerialNumber)
	}
}

// deduplicateStorageSerials rewrites repeated (Model, SerialNumber) pairs
// among disks in place; see serialDeduper.resolve for the rules.
func deduplicateStorageSerials(items []schema.HardwareStorage) {
	dedup := newSerialDeduper()
	for i := range items {
		items[i].SerialNumber = dedup.resolve(derefString(items[i].Model), items[i].SerialNumber)
	}
}

// deduplicatePCIeSerials rewrites repeated (Model, SerialNumber) pairs among
// PCIe devices in place; see serialDeduper.resolve for the rules.
func deduplicatePCIeSerials(items []schema.HardwarePCIeDevice) {
	dedup := newSerialDeduper()
	for i := range items {
		items[i].SerialNumber = dedup.resolve(derefString(items[i].Model), items[i].SerialNumber)
	}
}

// deduplicatePSUSerials rewrites repeated (Model, SerialNumber) pairs among
// power supplies in place; see serialDeduper.resolve for the rules.
func deduplicatePSUSerials(items []schema.HardwarePowerSupply) {
	dedup := newSerialDeduper()
	for i := range items {
		items[i].SerialNumber = dedup.resolve(derefString(items[i].Model), items[i].SerialNumber)
	}
}

View File

@@ -0,0 +1,63 @@
package collector
import (
"bee/audit/internal/schema"
"testing"
)
// TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials feeds one entry
// with a serial and one without for memory, storage and PSUs, and expects only
// the serialised entry to remain, stamped with the collection time.
func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
	const collectedAt = "2026-03-15T12:00:00Z"

	installed := true
	okText := statusOK
	sn := "SN-1"
	healthy := schema.HardwareComponentStatus{Status: &okText}

	snap := schema.HardwareSnapshot{
		Memory: []schema.HardwareMemory{
			{Present: &installed, SerialNumber: &sn, HardwareComponentStatus: healthy},
			{Present: &installed, HardwareComponentStatus: healthy},
		},
		Storage: []schema.HardwareStorage{
			{SerialNumber: &sn, HardwareComponentStatus: healthy},
			{HardwareComponentStatus: healthy},
		},
		PowerSupplies: []schema.HardwarePowerSupply{
			{SerialNumber: &sn, HardwareComponentStatus: healthy},
			{HardwareComponentStatus: healthy},
		},
	}

	finalizeSnapshot(&snap, collectedAt)

	stamped := func(checkedAt *string) bool {
		return checkedAt != nil && *checkedAt == collectedAt
	}
	if len(snap.Memory) != 1 || !stamped(snap.Memory[0].StatusCheckedAt) {
		t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
	}
	if len(snap.Storage) != 1 || !stamped(snap.Storage[0].StatusCheckedAt) {
		t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
	}
	if len(snap.PowerSupplies) != 1 || !stamped(snap.PowerSupplies[0].StatusCheckedAt) {
		t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
	}
}
// TestFinalizeSnapshotDeduplicatesSerials checks that of two storage entries
// sharing model and serial, the first keeps its serial and the second receives
// the first placeholder.
func TestFinalizeSnapshotDeduplicatesSerials(t *testing.T) {
	collectedAt := "2026-03-15T12:00:00Z"
	status := statusOK
	model := "Device"
	serial := "DUPLICATE"
	snap := schema.HardwareSnapshot{
		Storage: []schema.HardwareStorage{
			{Model: &model, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
			{Model: &model, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
		},
	}

	finalizeSnapshot(&snap, collectedAt)

	// Guard the indexing below so a regression reports a failure instead of
	// panicking on a short slice or a nil serial.
	if len(snap.Storage) != 2 {
		t.Fatalf("storage entry count mismatch: %+v", snap.Storage)
	}
	for i := range snap.Storage {
		if snap.Storage[i].SerialNumber == nil {
			t.Fatalf("storage[%d] serial is nil", i)
		}
	}

	if got := *snap.Storage[0].SerialNumber; got != serial {
		t.Fatalf("first serial changed: %q", got)
	}
	if got := *snap.Storage[1].SerialNumber; got != "NO_SN-00000001" {
		t.Fatalf("duplicate serial mismatch: %q", got)
	}
}

View File

@@ -47,12 +47,12 @@ func parseMemorySection(fields map[string]string) schema.HardwareMemory {
dimm.Present = &present
if !present {
status := "EMPTY"
status := statusEmpty
dimm.Status = &status
return dimm
}
status := "OK"
status := statusOK
dimm.Status = &status
if mb := parseMemorySizeMB(rawSize); mb > 0 {

View File

@@ -0,0 +1,203 @@
package collector
import (
"bee/audit/internal/schema"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
)
// edacBaseDir is the directory scanned for mc*/dimm* entries by readEDACStats.
// It is a variable so tests can point it at a temporary tree.
var edacBaseDir = "/sys/devices/system/edac/mc"
// edacDIMMStats holds the values read from one EDAC dimm directory.
type edacDIMMStats struct {
// Label is the trimmed content of the dimm_label (or label) file.
Label string
// CECount is the correctable error count; nil when no count file was readable.
CECount *int64
// UECount is the uncorrectable error count; nil when no count file was readable.
UECount *int64
}
// enrichMemoryWithTelemetry adds temperature and EDAC error information to the
// given DIMMs and returns the same slice.
//
// Each DIMM is matched by the canonical form of its Slot and Location. The
// first key with a temperature reading sets TemperatureC; the first key with
// EDAC statistics is applied through applyEDACStat.
func enrichMemoryWithTelemetry(dimms []schema.HardwareMemory, doc sensorsDoc) []schema.HardwareMemory {
	if len(dimms) == 0 {
		return dimms
	}

	temps := memoryTempsFromSensors(doc)
	edac := readEDACStats()

	for idx := range dimms {
		dimm := &dimms[idx]
		keys := dimmMatchKeys(dimm.Slot, dimm.Location)

		for _, key := range keys {
			reading, found := temps[key]
			if !found {
				continue
			}
			dimm.TemperatureC = &reading
			break
		}

		for _, key := range keys {
			stat, found := edac[key]
			if !found {
				continue
			}
			applyEDACStat(dimm, stat)
			break
		}
	}
	return dimms
}

// applyEDACStat copies the available error counters onto dimm and escalates
// its status: any uncorrectable error marks data loss and forces
// statusCritical; otherwise correctable errors raise a nil or statusOK status
// to statusWarning. An existing ErrorDescription is never overwritten.
func applyEDACStat(dimm *schema.HardwareMemory, stat edacDIMMStats) {
	if stat.CECount != nil {
		dimm.CorrectableECCErrorCount = stat.CECount
	}
	if stat.UECount != nil {
		dimm.UncorrectableECCErrorCount = stat.UECount
	}

	uncorrectable := stat.UECount != nil && *stat.UECount > 0
	correctable := stat.CECount != nil && *stat.CECount > 0

	switch {
	case uncorrectable:
		dimm.DataLossDetected = boolPtr(true)
		level := statusCritical
		dimm.Status = &level
		if dimm.ErrorDescription == nil {
			dimm.ErrorDescription = stringPtr("EDAC reports uncorrectable ECC errors")
		}
	case correctable && (dimm.Status == nil || *dimm.Status == statusOK):
		level := statusWarning
		dimm.Status = &level
		if dimm.ErrorDescription == nil {
			dimm.ErrorDescription = stringPtr("EDAC reports correctable ECC errors")
		}
	}
}
// memoryTempsFromSensors returns DIMM temperatures from the sensors document,
// keyed by the canonical form of the feature name.
//
// Only features classified as "temp" that isLikelyMemoryTemp accepts and that
// expose an "_input" value are used. When several features canonicalise to the
// same key, the first one visited wins. Chips and features are visited in
// sorted name order: Go randomises map iteration, so ranging over the maps
// directly would let the winning reading vary between runs.
func memoryTempsFromSensors(doc sensorsDoc) map[string]float64 {
	out := map[string]float64{}
	if len(doc) == 0 {
		return out
	}

	chips := make([]string, 0, len(doc))
	for chip := range doc {
		chips = append(chips, chip)
	}
	sort.Strings(chips)

	for _, chip := range chips {
		features := doc[chip]
		names := make([]string, 0, len(features))
		for name := range features {
			names = append(names, name)
		}
		sort.Strings(names)

		for _, name := range names {
			feature, ok := features[name].(map[string]any)
			if !ok || classifySensorFeature(feature) != "temp" {
				continue
			}
			if !isLikelyMemoryTemp(chip, name) {
				continue
			}
			temp, ok := firstFeatureFloat(feature, "_input")
			if !ok {
				continue
			}
			key := canonicalLabel(name)
			if key == "" {
				continue
			}
			if _, exists := out[key]; !exists {
				out[key] = temp
			}
		}
	}
	return out
}
// readEDACStats walks edacBaseDir/mc*/dimm* in sorted order and returns the
// statistics of every readable DIMM keyed by its canonical label. When two
// DIMMs share a canonical label, the one visited later replaces the earlier.
func readEDACStats() map[string]edacDIMMStats {
	byLabel := map[string]edacDIMMStats{}

	controllers, err := filepath.Glob(filepath.Join(edacBaseDir, "mc*"))
	if err != nil {
		return byLabel
	}
	sort.Strings(controllers)

	for _, controller := range controllers {
		dimmDirs, globErr := filepath.Glob(filepath.Join(controller, "dimm*"))
		if globErr != nil {
			continue
		}
		sort.Strings(dimmDirs)

		for _, dir := range dimmDirs {
			stat, ok := readEDACDIMMStats(dir)
			if !ok {
				continue
			}
			if key := canonicalLabel(stat.Label); key != "" {
				byLabel[key] = stat
			}
		}
	}
	return byLabel
}
// readEDACDIMMStats reads the label and error counters of one EDAC dimm
// directory. The label comes from dimm_label, or from label when dimm_label
// cannot be read. It returns false when neither file is readable or the label
// that was read is blank. Counters that cannot be read stay nil.
func readEDACDIMMStats(dimmDir string) (edacDIMMStats, bool) {
	var rawLabel []byte
	readable := false
	for _, name := range []string{"dimm_label", "label"} {
		data, err := os.ReadFile(filepath.Join(dimmDir, name))
		if err == nil {
			rawLabel, readable = data, true
			break
		}
	}
	if !readable {
		return edacDIMMStats{}, false
	}

	label := strings.TrimSpace(string(rawLabel))
	if label == "" {
		return edacDIMMStats{}, false
	}

	stat := edacDIMMStats{Label: label}
	if ce, ok := readEDACCount(dimmDir, []string{"dimm_ce_count", "ce_count"}); ok {
		stat.CECount = &ce
	}
	if ue, ok := readEDACCount(dimmDir, []string{"dimm_ue_count", "ue_count"}); ok {
		stat.UECount = &ue
	}
	return stat, true
}
// readEDACCount tries each file name in dir in order and returns the first
// one that holds a non-negative integer. Files that are missing, unparseable
// or negative are skipped; (0, false) is returned when none qualifies.
func readEDACCount(dir string, names []string) (int64, bool) {
	for _, candidate := range names {
		data, readErr := os.ReadFile(filepath.Join(dir, candidate))
		if readErr != nil {
			continue
		}
		count, parseErr := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
		if parseErr != nil || count < 0 {
			continue
		}
		return count, true
	}
	return 0, false
}
// dimmMatchKeys returns the canonical labels of slot and location, in that
// order, omitting empty labels and a location that canonicalises to the same
// key as the slot.
func dimmMatchKeys(slot, location *string) []string {
	var keys []string
	for _, candidate := range []*string{slot, location} {
		key := canonicalLabel(derefString(candidate))
		if key == "" {
			continue
		}
		duplicate := false
		for _, known := range keys {
			if known == key {
				duplicate = true
				break
			}
		}
		if !duplicate {
			keys = append(keys, key)
		}
	}
	return keys
}
// canonicalLabel normalises a slot or sensor label for matching: it trims
// whitespace, upper-cases the text and keeps only the characters A-Z and 0-9.
// Input with no such characters yields "".
func canonicalLabel(value string) string {
	upper := strings.ToUpper(strings.TrimSpace(value))
	return strings.Map(func(r rune) rune {
		switch {
		case r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
			return r
		default:
			return -1 // drop separators and every other character
		}
	}, upper)
}
// isLikelyMemoryTemp reports whether the lower-cased "chip feature" text
// mentions "dimm" or "sodimm".
func isLikelyMemoryTemp(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	for _, marker := range []string{"dimm", "sodimm"} {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
// boolPtr returns a pointer to a fresh copy of value.
func boolPtr(value bool) *bool {
	out := new(bool)
	*out = value
	return out
}

View File

@@ -0,0 +1,61 @@
package collector
import (
"path/filepath"
"testing"
"bee/audit/internal/schema"
)
// TestEnrichMemoryWithTelemetry builds a fake EDAC tree with one DIMM showing
// correctable errors and one showing uncorrectable errors, plus DIMM
// temperature sensors, and checks the resulting counters and statuses.
func TestEnrichMemoryWithTelemetry(t *testing.T) {
	edacRoot := t.TempDir()
	savedBase := edacBaseDir
	edacBaseDir = edacRoot
	t.Cleanup(func() { edacBaseDir = savedBase })

	edacFile := func(dimm, name, content string) {
		mustWriteFile(t, filepath.Join(edacRoot, "mc0", dimm, name), content)
	}
	edacFile("dimm0", "dimm_label", "CPU0_DIMM_A1\n")
	edacFile("dimm0", "dimm_ce_count", "7\n")
	edacFile("dimm0", "dimm_ue_count", "0\n")
	edacFile("dimm1", "dimm_label", "CPU1_DIMM_B2\n")
	edacFile("dimm1", "dimm_ce_count", "0\n")
	edacFile("dimm1", "dimm_ue_count", "2\n")

	doc := sensorsDoc{
		"jc42-i2c-0-18": {
			"CPU0 DIMM A1": map[string]any{"temp1_input": 43.0},
			"CPU1 DIMM B2": map[string]any{"temp2_input": 46.0},
		},
	}

	okText := statusOK
	healthy := schema.HardwareComponentStatus{Status: &okText}
	slotA, slotB := "CPU0_DIMM_A1", "CPU1_DIMM_B2"
	dimms := []schema.HardwareMemory{
		{Slot: &slotA, HardwareComponentStatus: healthy},
		{Slot: &slotB, HardwareComponentStatus: healthy},
	}

	got := enrichMemoryWithTelemetry(dimms, doc)
	dimm0, dimm1 := got[0], got[1]

	// dimm0: correctable errors only, so Warning.
	if dimm0.TemperatureC == nil || *dimm0.TemperatureC != 43.0 {
		t.Fatalf("dimm0 temperature mismatch: %#v", dimm0.TemperatureC)
	}
	if dimm0.CorrectableECCErrorCount == nil || *dimm0.CorrectableECCErrorCount != 7 {
		t.Fatalf("dimm0 ce mismatch: %#v", dimm0.CorrectableECCErrorCount)
	}
	if dimm0.Status == nil || *dimm0.Status != statusWarning {
		t.Fatalf("dimm0 status mismatch: %#v", dimm0.Status)
	}

	// dimm1: uncorrectable errors, so Critical with data loss flagged.
	if dimm1.TemperatureC == nil || *dimm1.TemperatureC != 46.0 {
		t.Fatalf("dimm1 temperature mismatch: %#v", dimm1.TemperatureC)
	}
	if dimm1.UncorrectableECCErrorCount == nil || *dimm1.UncorrectableECCErrorCount != 2 {
		t.Fatalf("dimm1 ue mismatch: %#v", dimm1.UncorrectableECCErrorCount)
	}
	if dimm1.Status == nil || *dimm1.Status != statusCritical {
		t.Fatalf("dimm1 status mismatch: %#v", dimm1.Status)
	}
	if dimm1.DataLossDetected == nil || !*dimm1.DataLossDetected {
		t.Fatalf("dimm1 data_loss_detected mismatch: %#v", dimm1.DataLossDetected)
	}
}

View File

@@ -18,17 +18,13 @@ var (
}
return string(out), nil
}
readNetStatFile = func(iface, key string) (int64, error) {
path := filepath.Join("/sys/class/net", iface, "statistics", key)
readNetAddressFile = func(iface string) (string, error) {
path := filepath.Join("/sys/class/net", iface, "address")
raw, err := os.ReadFile(path)
if err != nil {
return 0, err
return "", err
}
v, err := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
if err != nil {
return 0, err
}
return v, nil
return strings.TrimSpace(string(raw)), nil
}
)
@@ -47,6 +43,7 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
continue
}
iface := ifaces[0]
devs[i].MacAddresses = collectInterfaceMACs(ifaces)
if devs[i].Firmware == nil {
if out, err := ethtoolInfoQuery(iface); err == nil {
@@ -56,16 +53,13 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
}
}
if devs[i].Telemetry == nil {
devs[i].Telemetry = map[string]any{}
}
injectNICPacketStats(devs[i].Telemetry, iface)
if out, err := ethtoolModuleQuery(iface); err == nil {
injectSFPDOMTelemetry(devs[i].Telemetry, out)
if injectSFPDOMTelemetry(&devs[i], out) {
enriched++
continue
}
}
if len(devs[i].Telemetry) == 0 {
devs[i].Telemetry = nil
} else {
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
enriched++
}
}
@@ -77,31 +71,32 @@ func isNICDevice(dev schema.HardwarePCIeDevice) bool {
if dev.DeviceClass == nil {
return false
}
c := strings.ToLower(strings.TrimSpace(*dev.DeviceClass))
return strings.Contains(c, "ethernet controller") ||
strings.Contains(c, "network controller") ||
strings.Contains(c, "infiniband controller")
c := strings.TrimSpace(*dev.DeviceClass)
return isNICClass(c) || strings.EqualFold(c, "FibreChannelController")
}
func injectNICPacketStats(dst map[string]any, iface string) {
for _, key := range []string{"rx_packets", "tx_packets", "rx_errors", "tx_errors"} {
if v, err := readNetStatFile(iface, key); err == nil {
dst[key] = v
func collectInterfaceMACs(ifaces []string) []string {
seen := map[string]struct{}{}
var out []string
for _, iface := range ifaces {
mac, err := readNetAddressFile(iface)
if err != nil || mac == "" {
continue
}
mac = strings.ToLower(strings.TrimSpace(mac))
if _, ok := seen[mac]; ok {
continue
}
seen[mac] = struct{}{}
out = append(out, mac)
}
}
func injectSFPDOMTelemetry(dst map[string]any, raw string) {
parsed := parseSFPDOM(raw)
for k, v := range parsed {
dst[k] = v
}
return out
}
var floatRe = regexp.MustCompile(`[-+]?[0-9]*\.?[0-9]+`)
func parseSFPDOM(raw string) map[string]any {
out := map[string]any{}
func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
var changed bool
for _, line := range strings.Split(raw, "\n") {
trimmed := strings.TrimSpace(line)
if trimmed == "" {
@@ -117,26 +112,55 @@ func parseSFPDOM(raw string) map[string]any {
switch {
case strings.Contains(key, "module temperature"):
if f, ok := firstFloat(val); ok {
out["sfp_temperature_c"] = f
dev.SFPTemperatureC = &f
changed = true
}
case strings.Contains(key, "laser output power"):
if f, ok := dbmValue(val); ok {
out["sfp_tx_power_dbm"] = f
dev.SFPTXPowerDBM = &f
changed = true
}
case strings.Contains(key, "receiver signal"):
if f, ok := dbmValue(val); ok {
out["sfp_rx_power_dbm"] = f
dev.SFPRXPowerDBM = &f
changed = true
}
case strings.Contains(key, "module voltage"):
if f, ok := firstFloat(val); ok {
out["sfp_voltage_v"] = f
dev.SFPVoltageV = &f
changed = true
}
case strings.Contains(key, "laser bias current"):
if f, ok := firstFloat(val); ok {
out["sfp_bias_ma"] = f
dev.SFPBiasMA = &f
changed = true
}
}
}
return changed
}
// parseSFPDOM parses `ethtool -m` style output and returns the recognised
// SFP DOM readings as a map keyed by the legacy telemetry names
// (sfp_temperature_c, sfp_tx_power_dbm, sfp_rx_power_dbm, sfp_voltage_v,
// sfp_bias_ma). It delegates the parsing to injectSFPDOMTelemetry using a
// scratch device and always returns a non-nil map.
func parseSFPDOM(raw string) map[string]any {
	readings := map[string]any{}

	var scratch schema.HardwarePCIeDevice
	if changed := injectSFPDOMTelemetry(&scratch, raw); !changed {
		return readings
	}

	if v := scratch.SFPBiasMA; v != nil {
		readings["sfp_bias_ma"] = *v
	}
	if v := scratch.SFPVoltageV; v != nil {
		readings["sfp_voltage_v"] = *v
	}
	if v := scratch.SFPRXPowerDBM; v != nil {
		readings["sfp_rx_power_dbm"] = *v
	}
	if v := scratch.SFPTXPowerDBM; v != nil {
		readings["sfp_tx_power_dbm"] = *v
	}
	if v := scratch.SFPTemperatureC; v != nil {
		readings["sfp_temperature_c"] = *v
	}
	return readings
}

View File

@@ -24,7 +24,7 @@ type nvidiaGPUInfo struct {
}
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
// If the driver/tool is unavailable, NVIDIA devices get UNKNOWN status and
// If the driver/tool is unavailable, NVIDIA devices get Unknown status and
// a stable serial fallback based on board serial + slot.
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice {
if !hasNVIDIADevices(devs) {
@@ -78,9 +78,10 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
devs[i].Firmware = &v
}
status := "OK"
status := statusOK
if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
status = "WARNING"
status = statusWarning
devs[i].ErrorDescription = stringPtr("GPU reports uncorrected ECC errors")
}
devs[i].Status = &status
injectNVIDIATelemetry(&devs[i], info)
@@ -214,7 +215,7 @@ func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) {
setPCIeFallbackSerial(dev, boardSerial)
status := "UNKNOWN"
status := statusUnknown
dev.Status = &status
}
@@ -233,25 +234,19 @@ func setPCIeFallbackSerial(dev *schema.HardwarePCIeDevice, boardSerial string) {
}
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
if dev.Telemetry == nil {
dev.Telemetry = map[string]any{}
}
if info.TemperatureC != nil {
dev.Telemetry["temperature_c"] = *info.TemperatureC
dev.TemperatureC = info.TemperatureC
}
if info.PowerW != nil {
dev.Telemetry["power_w"] = *info.PowerW
dev.PowerW = info.PowerW
}
if info.ECCUncorrected != nil {
dev.Telemetry["ecc_uncorrected_total"] = *info.ECCUncorrected
dev.ECCUncorrectedTotal = info.ECCUncorrected
}
if info.ECCCorrected != nil {
dev.Telemetry["ecc_corrected_total"] = *info.ECCCorrected
dev.ECCCorrectedTotal = info.ECCCorrected
}
if info.HWSlowdown != nil {
dev.Telemetry["hw_slowdown_active"] = *info.HWSlowdown
}
if len(dev.Telemetry) == 0 {
dev.Telemetry = nil
dev.HWSlowdown = info.HWSlowdown
}
}

View File

@@ -54,10 +54,10 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
status := "OK"
devices := []schema.HardwarePCIeDevice{
{
VendorID: &vendorID,
BDF: &bdf,
Manufacturer: &manufacturer,
Status: &status,
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
VendorID: &vendorID,
BDF: &bdf,
Manufacturer: &manufacturer,
},
}
@@ -80,14 +80,14 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
t.Fatalf("firmware: got %v", out[0].Firmware)
}
if out[0].Status == nil || *out[0].Status != "WARNING" {
if out[0].Status == nil || *out[0].Status != statusWarning {
t.Fatalf("status: got %v", out[0].Status)
}
if out[0].Telemetry == nil {
t.Fatal("expected telemetry")
if out[0].ECCUncorrectedTotal == nil || *out[0].ECCUncorrectedTotal != 2 {
t.Fatalf("ecc_uncorrected_total: got %#v", out[0].ECCUncorrectedTotal)
}
if got, ok := out[0].Telemetry["ecc_uncorrected_total"].(int64); !ok || got != 2 {
t.Fatalf("ecc_uncorrected_total: got %#v", out[0].Telemetry["ecc_uncorrected_total"])
if out[0].TemperatureC == nil || *out[0].TemperatureC != 55.5 {
t.Fatalf("temperature_c: got %#v", out[0].TemperatureC)
}
}
@@ -107,7 +107,7 @@ func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
if out[0].SerialNumber == nil || *out[0].SerialNumber != "BOARD-123-PCIE-0000:17:00.0" {
t.Fatalf("fallback serial: got %v", out[0].SerialNumber)
}
if out[0].Status == nil || *out[0].Status != "UNKNOWN" {
if out[0].Status == nil || *out[0].Status != statusUnknown {
t.Fatalf("fallback status: got %v", out[0].Status)
}
}

View File

@@ -79,7 +79,7 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
dev := schema.HardwarePCIeDevice{}
present := true
dev.Present = &present
status := "OK"
status := statusOK
dev.Status = &status
// Slot is the BDF: "0000:00:02.0"
@@ -93,10 +93,32 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
if deviceID != 0 {
dev.DeviceID = &deviceID
}
if numaNode, ok := readPCINumaNode(bdf); ok {
dev.NUMANode = &numaNode
}
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
dev.LinkWidth = &width
}
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
dev.MaxLinkWidth = &width
}
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
linkSpeed := normalizePCILinkSpeed(speed)
if linkSpeed != "" {
dev.LinkSpeed = &linkSpeed
}
}
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
linkSpeed := normalizePCILinkSpeed(speed)
if linkSpeed != "" {
dev.MaxLinkSpeed = &linkSpeed
}
}
}
if v := fields["Class"]; v != "" {
dev.DeviceClass = &v
class := mapPCIeDeviceClass(v)
dev.DeviceClass = &class
}
if v := fields["Vendor"]; v != "" {
dev.Manufacturer = &v
@@ -131,3 +153,55 @@ func readHexFile(path string) (int, error) {
n, err := strconv.ParseInt(s, 16, 64)
return int(n), err
}
// readPCINumaNode returns the NUMA node recorded in the device's sysfs
// "numa_node" attribute. The second result is false when the attribute cannot
// be read or holds a negative value.
func readPCINumaNode(bdf string) (int, bool) {
	node, found := readPCIIntAttribute(bdf, "numa_node")
	if found && node >= 0 {
		return node, true
	}
	return 0, false
}
// readPCIIntAttribute reads /sys/bus/pci/devices/<bdf>/<attribute> via `cat`
// and parses it as a decimal integer. The second result is false when the
// command fails, the content is not an integer, or the value is negative.
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
	path := "/sys/bus/pci/devices/" + bdf + "/" + attribute
	content, err := exec.Command("cat", path).Output()
	if err != nil {
		return 0, false
	}
	parsed, convErr := strconv.Atoi(strings.TrimSpace(string(content)))
	if convErr == nil && parsed >= 0 {
		return parsed, true
	}
	return 0, false
}
// readPCIStringAttribute reads /sys/bus/pci/devices/<bdf>/<attribute> via
// `cat` and returns its whitespace-trimmed content. The second result is
// false when the command fails or the content is empty.
func readPCIStringAttribute(bdf, attribute string) (string, bool) {
	path := "/sys/bus/pci/devices/" + bdf + "/" + attribute
	content, err := exec.Command("cat", path).Output()
	if err != nil {
		return "", false
	}
	if text := strings.TrimSpace(string(content)); text != "" {
		return text, true
	}
	return "", false
}
// normalizePCILinkSpeed maps a sysfs link speed string such as
// "8.0 GT/s PCIe" to a PCIe generation label ("Gen1".."Gen6").
//
// The transfer rate is parsed numerically instead of matched as a substring,
// so that:
//   - rates printed without a fractional part ("5 GT/s", "8 GT/s") are
//     recognised, and
//   - rates that merely contain a known rate as a substring (for example
//     "128.0" contains "8.0") are not misreported as a lower generation.
//
// It returns "" when no recognised rate is found.
func normalizePCILinkSpeed(raw string) string {
	for _, field := range strings.Fields(raw) {
		// Keep only the leading numeric part so "8.0GT/s" still parses.
		if end := strings.IndexFunc(field, func(r rune) bool {
			return (r < '0' || r > '9') && r != '.'
		}); end >= 0 {
			field = field[:end]
		}
		rate, err := strconv.ParseFloat(field, 64)
		if err != nil {
			continue
		}
		// All known rates are exactly representable as float64.
		switch rate {
		case 2.5:
			return "Gen1"
		case 5:
			return "Gen2"
		case 8:
			return "Gen3"
		case 16:
			return "Gen4"
		case 32:
			return "Gen5"
		case 64:
			return "Gen6"
		}
		// The first numeric token is the rate; an unknown rate has no label.
		return ""
	}
	return ""
}

View File

@@ -35,7 +35,27 @@ func TestParseLspci_filtersExcludedClasses(t *testing.T) {
if len(devs) != 1 {
t.Fatalf("expected 1 filtered device, got %d", len(devs))
}
if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VGA compatible controller" {
if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VideoController" {
t.Fatalf("unexpected remaining class: %v", devs[0].DeviceClass)
}
}
// TestNormalizePCILinkSpeed checks the mapping from sysfs link speed strings
// to PCIe generation labels, including an unrecognised input.
func TestNormalizePCILinkSpeed(t *testing.T) {
	type speedCase struct {
		raw  string
		want string
	}
	cases := []speedCase{
		{raw: "2.5 GT/s PCIe", want: "Gen1"},
		{raw: "5.0 GT/s PCIe", want: "Gen2"},
		{raw: "8.0 GT/s PCIe", want: "Gen3"},
		{raw: "16.0 GT/s PCIe", want: "Gen4"},
		{raw: "32.0 GT/s PCIe", want: "Gen5"},
		{raw: "64.0 GT/s PCIe", want: "Gen6"},
		{raw: "unknown", want: ""},
	}
	for i := range cases {
		got := normalizePCILinkSpeed(cases[i].raw)
		if got == cases[i].want {
			continue
		}
		t.Fatalf("normalizePCILinkSpeed(%q)=%q want %q", cases[i].raw, got, cases[i].want)
	}
}

View File

@@ -114,7 +114,7 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
}
}
status := "OK"
status := statusOK
psu.Status = &status
return psu, true
@@ -123,9 +123,12 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
type psuSDR struct {
slot int
status string
reason string
inputPowerW *float64
outputPowerW *float64
inputVoltage *float64
temperatureC *float64
healthPct *float64
}
var psuSlotRe = regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b|\bps\s*([0-9]+)\b`)
@@ -148,10 +151,11 @@ func parsePSUSDR(raw string) map[int]psuSDR {
entry := out[slot]
entry.slot = slot
if entry.status == "" {
entry.status = "OK"
entry.status = statusOK
}
if state != "" && state != "ok" && state != "ns" {
entry.status = "FAILED"
entry.status = statusCritical
entry.reason = "PSU sensor reported non-OK state: " + state
}
lowerName := strings.ToLower(name)
@@ -162,6 +166,10 @@ func parsePSUSDR(raw string) map[int]psuSDR {
entry.outputPowerW = parseFloatPtr(value)
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
entry.inputVoltage = parseFloatPtr(value)
case strings.Contains(lowerName, "temp"):
entry.temperatureC = parseFloatPtr(value)
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
entry.healthPct = parsePercentPtr(value)
}
out[slot] = entry
}
@@ -187,12 +195,23 @@ func mergePSUSDR(psus []schema.HardwarePowerSupply, sdr map[int]psuSDR) {
if entry.inputVoltage != nil {
psus[i].InputVoltage = entry.inputVoltage
}
if entry.temperatureC != nil {
psus[i].TemperatureC = entry.temperatureC
}
if entry.healthPct != nil {
psus[i].LifeRemainingPct = entry.healthPct
lifeUsed := 100 - *entry.healthPct
psus[i].LifeUsedPct = &lifeUsed
}
if entry.status != "" {
psus[i].Status = &entry.status
}
if psus[i].Status != nil && *psus[i].Status == "OK" {
if entry.reason != "" {
psus[i].ErrorDescription = &entry.reason
}
if psus[i].Status != nil && *psus[i].Status == statusOK {
if (entry.inputPowerW == nil && entry.outputPowerW == nil && entry.inputVoltage == nil) && entry.status == "" {
unknown := "UNKNOWN"
unknown := statusUnknown
psus[i].Status = &unknown
}
}

View File

@@ -7,6 +7,8 @@ func TestParsePSUSDR(t *testing.T) {
PS1 Input Power | 215 Watts | ok
PS1 Output Power | 198 Watts | ok
PS1 Input Voltage | 229 Volts | ok
PS1 Temp | 39 C | ok
PS1 Health | 97 % | ok
PS2 Input Power | 0 Watts | cr
`
@@ -14,7 +16,7 @@ PS2 Input Power | 0 Watts | cr
if len(got) != 2 {
t.Fatalf("len(got)=%d want 2", len(got))
}
if got[1].status != "OK" {
if got[1].status != statusOK {
t.Fatalf("ps1 status=%q", got[1].status)
}
if got[1].inputPowerW == nil || *got[1].inputPowerW != 215 {
@@ -26,7 +28,13 @@ PS2 Input Power | 0 Watts | cr
if got[1].inputVoltage == nil || *got[1].inputVoltage != 229 {
t.Fatalf("ps1 input voltage=%v", got[1].inputVoltage)
}
if got[2].status != "FAILED" {
if got[1].temperatureC == nil || *got[1].temperatureC != 39 {
t.Fatalf("ps1 temperature=%v", got[1].temperatureC)
}
if got[1].healthPct == nil || *got[1].healthPct != 97 {
t.Fatalf("ps1 health=%v", got[1].healthPct)
}
if got[2].status != statusCritical {
t.Fatalf("ps2 status=%q", got[2].status)
}
}

View File

@@ -0,0 +1,132 @@
package collector
import (
"bee/audit/internal/schema"
"strconv"
"strings"
)
// enrichPSUsWithTelemetry fills in PSU temperature and remaining/used life
// from the sensors document, matching readings to PSUs by slot. Values already
// present on a PSU are left untouched. The input slice is modified in place
// and returned.
func enrichPSUsWithTelemetry(psus []schema.HardwarePowerSupply, doc sensorsDoc) []schema.HardwarePowerSupply {
	if len(psus) == 0 || len(doc) == 0 {
		return psus
	}

	temps := psuTempsFromSensors(doc)
	health := psuHealthFromSensors(doc)

	for idx := range psus {
		psu := &psus[idx]
		key := derefPSUSlot(psu.Slot)
		if key == "" {
			continue
		}
		if temp, found := temps[key]; found && psu.TemperatureC == nil {
			psu.TemperatureC = &temp
		}
		if remaining, found := health[key]; found && psu.LifeRemainingPct == nil {
			// Used life is derived as the complement of remaining life.
			consumed := 100 - remaining
			psu.LifeRemainingPct = &remaining
			psu.LifeUsedPct = &consumed
		}
	}
	return psus
}
// psuHealthFromSensors extracts one health/remaining-life percentage per PSU
// slot from the sensors document.
//
// Go randomises map iteration order, so "keep the first reading seen" would
// make the result vary between runs whenever several features map to the same
// slot. To keep the output stable, the reading whose (chip, feature) name
// sorts lowest wins.
func psuHealthFromSensors(doc sensorsDoc) map[string]float64 {
	out := map[string]float64{}
	// chosen records, per slot, which chip/feature supplied out[slot].
	chosen := map[string]string{}
	for chip, features := range doc {
		for featureName, raw := range features {
			feature, ok := raw.(map[string]any)
			if !ok || !isLikelyPSUHealth(chip, featureName) {
				continue
			}
			value, ok := firstFeaturePercent(feature)
			if !ok {
				continue
			}
			slot, ok := detectPSUSlot(chip, featureName)
			if !ok {
				continue
			}
			source := chip + "\x00" + featureName
			if prev, taken := chosen[slot]; taken && prev <= source {
				continue
			}
			chosen[slot] = source
			out[slot] = value
		}
	}
	return out
}
// firstFeaturePercent walks the feature's sub-keys in the order given by
// sortedFeatureKeys and returns the first numeric value whose key mentions
// "health", "life" or "remain". Keys ending in "_alarm" are ignored.
func firstFeaturePercent(feature map[string]any) (float64, bool) {
	for _, name := range sortedFeatureKeys(feature) {
		folded := strings.ToLower(name)
		if strings.HasSuffix(folded, "_alarm") {
			continue
		}
		relevant := strings.Contains(folded, "health") ||
			strings.Contains(folded, "life") ||
			strings.Contains(folded, "remain")
		if !relevant {
			continue
		}
		if pct, ok := floatFromAny(feature[name]); ok {
			return pct, true
		}
	}
	return 0, false
}
// isLikelyPSUHealth reports whether the chip/feature names together mention a
// power supply ("psu" or "power supply") and a health-like quantity
// ("health", "life" or "remain"), case-insensitively.
func isLikelyPSUHealth(chip, feature string) bool {
	label := strings.ToLower(chip + " " + feature)
	if !strings.Contains(label, "psu") && !strings.Contains(label, "power supply") {
		return false
	}
	for _, keyword := range []string{"health", "life", "remain"} {
		if strings.Contains(label, keyword) {
			return true
		}
	}
	return false
}
// psuTempsFromSensors extracts one temperature reading per PSU slot from the
// sensors document, considering only features classified as "temp" whose
// chip/feature names look PSU-related.
//
// Go randomises map iteration order, so "keep the first reading seen" would
// make the result vary between runs whenever a PSU exposes several
// temperature features. To keep the output stable, the reading whose
// (chip, feature) name sorts lowest wins.
func psuTempsFromSensors(doc sensorsDoc) map[string]float64 {
	out := map[string]float64{}
	// chosen records, per slot, which chip/feature supplied out[slot].
	chosen := map[string]string{}
	for chip, features := range doc {
		for featureName, raw := range features {
			feature, ok := raw.(map[string]any)
			if !ok || classifySensorFeature(feature) != "temp" {
				continue
			}
			if !isLikelyPSUTemp(chip, featureName) {
				continue
			}
			temp, ok := firstFeatureFloat(feature, "_input")
			if !ok {
				continue
			}
			slot, ok := detectPSUSlot(chip, featureName)
			if !ok {
				continue
			}
			source := chip + "\x00" + featureName
			if prev, taken := chosen[slot]; taken && prev <= source {
				continue
			}
			chosen[slot] = source
			out[slot] = temp
		}
	}
	return out
}
// isLikelyPSUTemp reports whether the chip/feature names together mention a
// power supply ("psu" or "power supply"), case-insensitively.
func isLikelyPSUTemp(chip, feature string) bool {
	label := strings.ToLower(chip + " " + feature)
	for _, marker := range []string{"psu", "power supply"} {
		if strings.Contains(label, marker) {
			return true
		}
	}
	return false
}
// detectPSUSlot looks for a PSU number in each part (in order) using
// psuSlotRe and returns it converted to a zero-based slot string, i.e. the
// matched number minus one. Numbers that are not positive are skipped. The
// second result is false when no part yields a usable number.
func detectPSUSlot(parts ...string) (string, bool) {
	for _, part := range parts {
		groups := psuSlotRe.FindStringSubmatch(strings.ToLower(part))
		if len(groups) == 0 {
			continue
		}
		// groups[0] is the whole match; the capture groups follow.
		for g := 1; g < len(groups); g++ {
			if groups[g] == "" {
				continue
			}
			if number, err := strconv.Atoi(groups[g]); err == nil && number > 0 {
				return strconv.Itoa(number - 1), true
			}
		}
	}
	return "", false
}

View File

@@ -0,0 +1,42 @@
package collector
import (
"testing"
"bee/audit/internal/schema"
)
// TestEnrichPSUsWithTelemetry verifies that temperature and life readings
// from a sensors document are attached to the PSUs in the matching slots.
func TestEnrichPSUsWithTelemetry(t *testing.T) {
	firstSlot, secondSlot := "0", "1"
	input := []schema.HardwarePowerSupply{
		{Slot: &firstSlot},
		{Slot: &secondSlot},
	}
	readings := sensorsDoc{
		"psu-hwmon-0": {
			"PSU1 Temp":           map[string]any{"temp1_input": 39.5},
			"PSU2 Temp":           map[string]any{"temp2_input": 41.0},
			"PSU1 Health":         map[string]any{"health1_input": 98.0},
			"PSU2 Remaining Life": map[string]any{"life2_input": 95.0},
		},
	}

	got := enrichPSUsWithTelemetry(input, readings)
	psu0, psu1 := got[0], got[1]

	if psu0.TemperatureC == nil || *psu0.TemperatureC != 39.5 {
		t.Fatalf("psu0 temperature mismatch: %#v", psu0.TemperatureC)
	}
	if psu1.TemperatureC == nil || *psu1.TemperatureC != 41.0 {
		t.Fatalf("psu1 temperature mismatch: %#v", psu1.TemperatureC)
	}
	if psu0.LifeRemainingPct == nil || *psu0.LifeRemainingPct != 98.0 {
		t.Fatalf("psu0 life remaining mismatch: %#v", psu0.LifeRemainingPct)
	}
	if psu0.LifeUsedPct == nil || *psu0.LifeUsedPct != 2.0 {
		t.Fatalf("psu0 life used mismatch: %#v", psu0.LifeUsedPct)
	}
	if psu1.LifeRemainingPct == nil || *psu1.LifeRemainingPct != 95.0 {
		t.Fatalf("psu1 life remaining mismatch: %#v", psu1.LifeRemainingPct)
	}
}

View File

@@ -83,11 +83,7 @@ func isLikelyRAIDController(dev schema.HardwarePCIeDevice) bool {
if dev.DeviceClass == nil {
return false
}
c := strings.ToLower(*dev.DeviceClass)
return strings.Contains(c, "raid") ||
strings.Contains(c, "sas") ||
strings.Contains(c, "mass storage") ||
strings.Contains(c, "serial attached scsi")
return isRAIDClass(*dev.DeviceClass)
}
func collectStorcliDrives() []schema.HardwareStorage {
@@ -182,7 +178,10 @@ func parseSASIrcuDisplay(raw string) []schema.HardwareStorage {
present := true
status := mapRAIDDriveStatus(b["State"])
s := schema.HardwareStorage{Present: &present, Status: &status}
s := schema.HardwareStorage{
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
Present: &present,
}
enclosure := strings.TrimSpace(b["Enclosure #"])
slot := strings.TrimSpace(b["Slot #"])
@@ -281,7 +280,10 @@ func parseArcconfPhysicalDrives(raw string) []schema.HardwareStorage {
for _, b := range blocks {
present := true
status := mapRAIDDriveStatus(b["State"])
s := schema.HardwareStorage{Present: &present, Status: &status}
s := schema.HardwareStorage{
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
Present: &present,
}
if v := strings.TrimSpace(b["Reported Location"]); v != "" {
s.Slot = &v
@@ -362,8 +364,11 @@ func parseSSACLIPhysicalDrives(raw string) []schema.HardwareStorage {
if m := ssacliPhysicalDriveLine.FindStringSubmatch(trimmed); len(m) == 3 {
flush()
present := true
status := "UNKNOWN"
s := schema.HardwareStorage{Present: &present, Status: &status}
status := statusUnknown
s := schema.HardwareStorage{
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
Present: &present,
}
slot := m[1]
s.Slot = &slot
@@ -475,8 +480,8 @@ func storcliDriveToStorage(d struct {
present := true
status := mapRAIDDriveStatus(d.State)
s := schema.HardwareStorage{
Present: &present,
Status: &status,
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
Present: &present,
}
if v := strings.TrimSpace(d.EIDSlt); v != "" {
@@ -527,15 +532,15 @@ func mapRAIDDriveStatus(raw string) string {
u := strings.ToUpper(strings.TrimSpace(raw))
switch {
case strings.Contains(u, "OK"), strings.Contains(u, "OPTIMAL"), strings.Contains(u, "READY"):
return "OK"
return statusOK
case strings.Contains(u, "ONLN"), strings.Contains(u, "ONLINE"):
return "OK"
return statusOK
case strings.Contains(u, "RBLD"), strings.Contains(u, "REBUILD"):
return "WARNING"
return statusWarning
case strings.Contains(u, "FAIL"), strings.Contains(u, "OFFLINE"):
return "CRITICAL"
return statusCritical
default:
return "UNKNOWN"
return statusUnknown
}
}
@@ -641,8 +646,9 @@ func enrichStorageWithVROC(storage []schema.HardwareStorage, pcie []schema.Hardw
storage[i].Telemetry["vroc_array"] = arr.Name
storage[i].Telemetry["vroc_degraded"] = arr.Degraded
if arr.Degraded {
status := "WARNING"
status := statusWarning
storage[i].Status = &status
storage[i].ErrorDescription = stringPtr("VROC array is degraded")
}
updated++
}
@@ -659,14 +665,14 @@ func hasVROCController(pcie []schema.HardwarePCIeDevice) bool {
class := ""
if dev.DeviceClass != nil {
class = strings.ToLower(*dev.DeviceClass)
class = strings.TrimSpace(*dev.DeviceClass)
}
model := ""
if dev.Model != nil {
model = strings.ToLower(*dev.Model)
}
if strings.Contains(class, "raid") ||
if isRAIDClass(class) ||
strings.Contains(model, "vroc") ||
strings.Contains(model, "volume management device") ||
strings.Contains(model, "vmd") {

View File

@@ -0,0 +1,334 @@
package collector
import (
	"encoding/json"
	"log/slog"
	"sort"
	"strconv"
	"strings"

	"bee/audit/internal/schema"
)
// raidControllerTelemetry holds battery/capacitor readings parsed from a RAID
// vendor tool for a single controller. Every field is optional; nil means the
// tool output did not provide (or the parser did not recognise) that value.
type raidControllerTelemetry struct {
	BatteryChargePct       *float64 // parsed with parsePercentPtr
	BatteryHealthPct       *float64 // parsed with parsePercentPtr
	BatteryTemperatureC    *float64 // parsed with parseFloatPtr
	BatteryVoltageV        *float64 // parsed with parseFloatPtr
	BatteryReplaceRequired *bool    // parsed with parseReplaceRequired
	// ErrorDescription carries the raw tool value that
	// batteryStateDescription judged to indicate a problem.
	ErrorDescription *string
}
// enrichPCIeWithRAIDTelemetry attaches RAID controller battery telemetry to
// the matching PCIe devices. Telemetry is grouped by PCI vendor ID and handed
// out positionally: the n-th likely RAID controller of a vendor receives the
// n-th telemetry entry collected for that vendor. The slice is modified in
// place and returned.
func enrichPCIeWithRAIDTelemetry(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
	telemetryByVendor := collectRAIDControllerTelemetry()
	if len(telemetryByVendor) == 0 {
		return devs
	}

	// nextIndex tracks how many entries of each vendor were already consumed.
	nextIndex := map[int]int{}
	for i := range devs {
		dev := &devs[i]
		if dev.VendorID == nil || !isLikelyRAIDController(*dev) {
			continue
		}
		vendorID := *dev.VendorID
		pending := telemetryByVendor[vendorID]
		cursor := nextIndex[vendorID]
		if cursor >= len(pending) {
			continue
		}
		nextIndex[vendorID] = cursor + 1
		applyRAIDControllerTelemetry(dev, pending[cursor])
	}
	return devs
}
// applyRAIDControllerTelemetry copies every non-nil telemetry value onto the
// device. When an error description is present it is stored as well, and a
// device whose status is unset or OK is downgraded to the warning status.
func applyRAIDControllerTelemetry(dev *schema.HardwarePCIeDevice, tel raidControllerTelemetry) {
	if v := tel.BatteryChargePct; v != nil {
		dev.BatteryChargePct = v
	}
	if v := tel.BatteryHealthPct; v != nil {
		dev.BatteryHealthPct = v
	}
	if v := tel.BatteryTemperatureC; v != nil {
		dev.BatteryTemperatureC = v
	}
	if v := tel.BatteryVoltageV; v != nil {
		dev.BatteryVoltageV = v
	}
	if v := tel.BatteryReplaceRequired; v != nil {
		dev.BatteryReplaceRequired = v
	}

	if tel.ErrorDescription == nil {
		return
	}
	dev.ErrorDescription = tel.ErrorDescription
	// Leave any other existing status as it is.
	if dev.Status == nil || *dev.Status == statusOK {
		downgraded := statusWarning
		dev.Status = &downgraded
	}
}
// collectRAIDControllerTelemetry queries storcli64, ssacli and arcconf (in
// that order) through raidToolQuery and returns the parsed controller
// telemetry grouped by PCI vendor ID. Tools that fail or yield no telemetry
// contribute nothing.
func collectRAIDControllerTelemetry() map[int][]raidControllerTelemetry {
	collected := map[int][]raidControllerTelemetry{}
	record := func(vendor int, message string, list []raidControllerTelemetry) {
		if len(list) == 0 {
			return
		}
		collected[vendor] = append(collected[vendor], list...)
		slog.Info(message, "count", len(list))
	}

	if raw, err := raidToolQuery("storcli64", "/call", "show", "all", "J"); err == nil {
		record(vendorBroadcomLSI, "raid: storcli controller telemetry", parseStorcliControllerTelemetry(raw))
	}
	if raw, err := raidToolQuery("ssacli", "ctrl", "all", "show", "config", "detail"); err == nil {
		record(vendorHPE, "raid: ssacli controller telemetry", parseSSACLIControllerTelemetry(string(raw)))
	}
	if raw, err := raidToolQuery("arcconf", "getconfig", "1", "ad"); err == nil {
		record(vendorAdaptec, "raid: arcconf controller telemetry", parseArcconfControllerTelemetry(string(raw)))
	}
	return collected
}
// parseStorcliControllerTelemetry decodes storcli JSON output and builds one
// telemetry entry per controller from its BBU_Info, BBU_Info_Details, CV_Info
// and CV_Info_Details sections. Controllers with no recognised telemetry are
// omitted; invalid JSON is logged and yields nil.
func parseStorcliControllerTelemetry(raw []byte) []raidControllerTelemetry {
	var payload struct {
		Controllers []struct {
			ResponseData map[string]any `json:"Response Data"`
		} `json:"Controllers"`
	}
	if err := json.Unmarshal(raw, &payload); err != nil {
		slog.Warn("raid: parse storcli controller telemetry failed", "err", err)
		return nil
	}

	sections := []string{"BBU_Info", "BBU_Info_Details", "CV_Info", "CV_Info_Details"}
	var result []raidControllerTelemetry
	for _, controller := range payload.Controllers {
		var tel raidControllerTelemetry
		for _, section := range sections {
			mergeStorcliBatteryMap(&tel, nestedStringMap(controller.ResponseData[section]))
		}
		if !hasRAIDControllerTelemetry(tel) {
			continue
		}
		result = append(result, tel)
	}
	return result
}
// nestedStringMap flattens a decoded JSON object, or a list of objects, into
// a single string map via flattenStringMap. Non-object list items are
// skipped. Any other input type yields nil.
func nestedStringMap(raw any) map[string]string {
	if object, ok := raw.(map[string]any); ok {
		flat := map[string]string{}
		flattenStringMap("", object, flat)
		return flat
	}
	if list, ok := raw.([]any); ok {
		flat := map[string]string{}
		for _, element := range list {
			object, isObject := element.(map[string]any)
			if !isObject {
				continue
			}
			flattenStringMap("", object, flat)
		}
		return flat
	}
	return nil
}
// flattenStringMap writes every scalar found in the nested structure `in`
// into out. Keys are the space-joined path of names, trimmed and lower-cased.
// Strings, json.Number, float64 and bool values are stored as text; nested
// objects and objects inside lists are descended into; everything else is
// ignored.
func flattenStringMap(prefix string, in map[string]any, out map[string]string) {
	for name, entry := range in {
		path := strings.TrimSpace(strings.ToLower(strings.Trim(prefix+" "+name, " ")))
		switch v := entry.(type) {
		case string:
			out[path] = v
		case bool:
			out[path] = strconv.FormatBool(v)
		case float64:
			out[path] = strconv.FormatFloat(v, 'f', -1, 64)
		case json.Number:
			out[path] = v.String()
		case map[string]any:
			flattenStringMap(path, v, out)
		case []any:
			for _, element := range v {
				if child, ok := element.(map[string]any); ok {
					flattenStringMap(path, child, out)
				}
			}
		}
	}
}
// mergeStorcliBatteryMap fills the still-unset fields of tel from a flattened
// storcli battery/capacitor section. Each key is classified by the substrings
// it contains, and the first matching key that sets a field wins.
//
// Go randomises map iteration order, so the keys are visited in a fixed
// order: keys naming the exact quantity ("relative state of charge",
// "state of health", "replacement required") first, then the rest
// alphabetically. Otherwise a generic key such as "full charge capacity"
// could randomly shadow "relative state of charge".
func mergeStorcliBatteryMap(tel *raidControllerTelemetry, fields map[string]string) {
	if len(fields) == 0 {
		return
	}

	type entry struct {
		key      string // trimmed, lower-cased field name
		raw      string // field value as reported by the tool
		specific bool   // key names the exact quantity
	}
	entries := make([]entry, 0, len(fields))
	for key, raw := range fields {
		lower := strings.ToLower(strings.TrimSpace(key))
		specific := strings.Contains(lower, "relative state of charge") ||
			strings.Contains(lower, "state of health") ||
			strings.Contains(lower, "replacement required")
		entries = append(entries, entry{key: lower, raw: raw, specific: specific})
	}
	sort.Slice(entries, func(i, j int) bool {
		a, b := entries[i], entries[j]
		if a.specific != b.specific {
			return a.specific
		}
		if a.key != b.key {
			return a.key < b.key
		}
		return a.raw < b.raw
	})

	for _, e := range entries {
		lower, raw := e.key, e.raw
		switch {
		case strings.Contains(lower, "relative state of charge"), strings.Contains(lower, "remaining capacity"), strings.Contains(lower, "charge"):
			if tel.BatteryChargePct == nil {
				tel.BatteryChargePct = parsePercentPtr(raw)
			}
		case strings.Contains(lower, "state of health"), strings.Contains(lower, "health"):
			if tel.BatteryHealthPct == nil {
				tel.BatteryHealthPct = parsePercentPtr(raw)
			}
		case strings.Contains(lower, "temperature"):
			if tel.BatteryTemperatureC == nil {
				tel.BatteryTemperatureC = parseFloatPtr(raw)
			}
		case strings.Contains(lower, "voltage"):
			if tel.BatteryVoltageV == nil {
				tel.BatteryVoltageV = parseFloatPtr(raw)
			}
		case strings.Contains(lower, "replace"), strings.Contains(lower, "replacement required"):
			if tel.BatteryReplaceRequired == nil {
				tel.BatteryReplaceRequired = parseReplaceRequired(raw)
			}
		case strings.Contains(lower, "learn cycle requested"), strings.Contains(lower, "battery state"), strings.Contains(lower, "capacitance state"):
			if desc := batteryStateDescription(raw); desc != nil && tel.ErrorDescription == nil {
				tel.ErrorDescription = desc
			}
		}
	}
}
// parseSSACLIControllerTelemetry parses `ssacli ctrl all show config detail`
// output into one telemetry entry per controller section. Sections without
// any recognised telemetry are dropped.
//
// A new section starts at a line beginning with "smart array", or at a line
// beginning with "controller " that is not a "key: value" property. The
// second condition matters because property lines such as
// "Controller Status: OK" appear inside a controller section; treating them
// as headers would reset the section and could split one controller's
// telemetry into several entries.
func parseSSACLIControllerTelemetry(raw string) []raidControllerTelemetry {
	var out []raidControllerTelemetry
	var current *raidControllerTelemetry

	flush := func() {
		if current != nil && hasRAIDControllerTelemetry(*current) {
			out = append(out, *current)
		}
		current = nil
	}

	for _, line := range strings.Split(raw, "\n") {
		trimmed := strings.TrimSpace(line)
		if trimmed == "" {
			continue
		}
		idx := strings.Index(trimmed, ":")
		lowerLine := strings.ToLower(trimmed)
		if strings.HasPrefix(lowerLine, "smart array") ||
			(idx < 0 && strings.HasPrefix(lowerLine, "controller ")) {
			flush()
			current = &raidControllerTelemetry{}
			continue
		}
		// Only "key: value" lines inside a section carry telemetry.
		if current == nil || idx <= 0 {
			continue
		}
		key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
		val := strings.TrimSpace(trimmed[idx+1:])
		switch {
		case strings.Contains(key, "capacitor temperature"), strings.Contains(key, "battery temperature"):
			current.BatteryTemperatureC = parseFloatPtr(val)
		case strings.Contains(key, "capacitor voltage"), strings.Contains(key, "battery voltage"):
			current.BatteryVoltageV = parseFloatPtr(val)
		case strings.Contains(key, "capacitor charge"), strings.Contains(key, "battery charge"):
			current.BatteryChargePct = parsePercentPtr(val)
		case strings.Contains(key, "capacitor health"), strings.Contains(key, "battery health"):
			current.BatteryHealthPct = parsePercentPtr(val)
		case strings.Contains(key, "replace") || strings.Contains(key, "failed"):
			if current.BatteryReplaceRequired == nil {
				current.BatteryReplaceRequired = parseReplaceRequired(val)
			}
			if desc := batteryStateDescription(val); desc != nil && current.ErrorDescription == nil {
				current.ErrorDescription = desc
			}
		}
	}
	flush()
	return out
}
// parseArcconfControllerTelemetry scans arcconf output for "key: value" lines
// describing battery/capacitor state and folds them into a single telemetry
// entry. It returns nil when nothing recognisable was found.
func parseArcconfControllerTelemetry(raw string) []raidControllerTelemetry {
	var tel raidControllerTelemetry
	for _, line := range strings.Split(raw, "\n") {
		name, value, found := strings.Cut(strings.TrimSpace(line), ":")
		if !found || name == "" {
			continue
		}
		key := strings.ToLower(strings.TrimSpace(name))
		val := strings.TrimSpace(value)

		switch {
		case strings.Contains(key, "battery temperature"), strings.Contains(key, "capacitor temperature"):
			tel.BatteryTemperatureC = parseFloatPtr(val)
		case strings.Contains(key, "battery voltage"), strings.Contains(key, "capacitor voltage"):
			tel.BatteryVoltageV = parseFloatPtr(val)
		case strings.Contains(key, "battery charge"), strings.Contains(key, "capacitor charge"):
			tel.BatteryChargePct = parsePercentPtr(val)
		case strings.Contains(key, "battery health"), strings.Contains(key, "capacitor health"):
			tel.BatteryHealthPct = parsePercentPtr(val)
		case strings.Contains(key, "replace"), strings.Contains(key, "failed"):
			// The first replace/failed line decides each of these fields.
			if tel.BatteryReplaceRequired == nil {
				tel.BatteryReplaceRequired = parseReplaceRequired(val)
			}
			if tel.ErrorDescription == nil {
				if desc := batteryStateDescription(val); desc != nil {
					tel.ErrorDescription = desc
				}
			}
		}
	}

	if !hasRAIDControllerTelemetry(tel) {
		return nil
	}
	return []raidControllerTelemetry{tel}
}
// hasRAIDControllerTelemetry reports whether at least one field of tel is set.
func hasRAIDControllerTelemetry(tel raidControllerTelemetry) bool {
	switch {
	case tel.BatteryChargePct != nil:
		return true
	case tel.BatteryHealthPct != nil:
		return true
	case tel.BatteryTemperatureC != nil:
		return true
	case tel.BatteryVoltageV != nil:
		return true
	case tel.BatteryReplaceRequired != nil:
		return true
	default:
		return tel.ErrorDescription != nil
	}
}
// parsePercentPtr removes surrounding whitespace and every '%' sign from raw
// and hands the remainder to parseFloatPtr.
func parsePercentPtr(raw string) *float64 {
	cleaned := strings.TrimSpace(raw)
	cleaned = strings.ReplaceAll(cleaned, "%", "")
	return parseFloatPtr(cleaned)
}
// parseReplaceRequired interprets a tool-reported value as "does the
// battery/capacitor need replacing?".
//
// It returns a pointer to true for values such as "Yes", "Failed" or
// "Replacement required", a pointer to false for values such as "No", "OK",
// "Good", "Optimal", "Normal" or a negated phrase like "Not required", and
// nil when the value is empty or cannot be classified (for example
// "Unknown").
//
// Short tokens ("no", "not", "ok", "yes") are matched as whole words so that
// "unknown" is not read as "no" and "broken" is not read as "ok". A negation
// is checked before the positive keywords so that "Not required" is not
// reported as required.
func parseReplaceRequired(raw string) *bool {
	lower := strings.ToLower(strings.TrimSpace(raw))
	if lower == "" {
		return nil
	}

	words := strings.FieldsFunc(lower, func(r rune) bool { return r < 'a' || r > 'z' })
	hasWord := func(candidates ...string) bool {
		for _, word := range words {
			for _, candidate := range candidates {
				if word == candidate {
					return true
				}
			}
		}
		return false
	}

	negated := hasWord("no", "not")
	positive := hasWord("yes") ||
		strings.Contains(lower, "replace") ||
		strings.Contains(lower, "failed") ||
		strings.Contains(lower, "required")
	healthy := hasWord("ok", "normal") ||
		strings.Contains(lower, "good") ||
		strings.Contains(lower, "optimal")

	var result bool
	switch {
	case positive && negated:
		result = false // e.g. "Not required", "No replacement needed"
	case positive:
		result = true
	case negated, healthy:
		result = false
	default:
		return nil
	}
	return &result
}
// batteryStateDescription returns a pointer to the original raw value when it
// mentions a problem ("failed", "fault", "replace", "warning" or "degraded",
// case-insensitively), and nil otherwise, including for blank input.
func batteryStateDescription(raw string) *string {
	folded := strings.ToLower(strings.TrimSpace(raw))
	if folded == "" {
		return nil
	}
	for _, marker := range []string{"failed", "fault", "replace", "warning", "degraded"} {
		if strings.Contains(folded, marker) {
			return &raw
		}
	}
	return nil
}

View File

@@ -1,6 +1,10 @@
package collector
import "testing"
import (
"bee/audit/internal/schema"
"errors"
"testing"
)
func TestParseSASIrcuControllerIDs(t *testing.T) {
raw := `LSI Corporation SAS2 IR Configuration Utility.
@@ -90,7 +94,111 @@ physicaldrive 1I:1:2 (894 GB, SAS HDD, Failed)
if drives[0].Status == nil || *drives[0].Status != "OK" {
t.Fatalf("drive0 status: %v", drives[0].Status)
}
if drives[1].Status == nil || *drives[1].Status != "CRITICAL" {
if drives[1].Status == nil || *drives[1].Status != statusCritical {
t.Fatalf("drive1 status: %v", drives[1].Status)
}
}
// TestParseStorcliControllerTelemetry feeds a minimal storcli JSON document
// with a BBU_Info section and checks each parsed battery field.
func TestParseStorcliControllerTelemetry(t *testing.T) {
	payload := []byte(`{
"Controllers": [
{
"Response Data": {
"BBU_Info": {
"State of Health": "98 %",
"Relative State of Charge": "76 %",
"Temperature": "41 C",
"Voltage": "12.3 V",
"Replacement required": "No"
}
}
}
]
}`)

	got := parseStorcliControllerTelemetry(payload)
	if len(got) != 1 {
		t.Fatalf("len(got)=%d want 1", len(got))
	}

	tel := got[0]
	if tel.BatteryHealthPct == nil || *tel.BatteryHealthPct != 98 {
		t.Fatalf("battery health=%v", tel.BatteryHealthPct)
	}
	if tel.BatteryChargePct == nil || *tel.BatteryChargePct != 76 {
		t.Fatalf("battery charge=%v", tel.BatteryChargePct)
	}
	if tel.BatteryTemperatureC == nil || *tel.BatteryTemperatureC != 41 {
		t.Fatalf("battery temperature=%v", tel.BatteryTemperatureC)
	}
	if tel.BatteryVoltageV == nil || *tel.BatteryVoltageV != 12.3 {
		t.Fatalf("battery voltage=%v", tel.BatteryVoltageV)
	}
	// "No" must parse to an explicit false, not to nil.
	if tel.BatteryReplaceRequired == nil || *tel.BatteryReplaceRequired {
		t.Fatalf("battery replace=%v", tel.BatteryReplaceRequired)
	}
}
// TestParseSSACLIControllerTelemetry parses a single Smart Array section and
// checks the capacitor temperature and charge readings.
func TestParseSSACLIControllerTelemetry(t *testing.T) {
	output := `Smart Array P440ar in Slot 0
Battery/Capacitor Count: 1
Capacitor Temperature (C): 37
Capacitor Charge (%): 94
Capacitor Health (%): 96
Capacitor Voltage (V): 9.8
Capacitor Failed: No
`

	got := parseSSACLIControllerTelemetry(output)
	if len(got) != 1 {
		t.Fatalf("len(got)=%d want 1", len(got))
	}

	tel := got[0]
	if tel.BatteryTemperatureC == nil || *tel.BatteryTemperatureC != 37 {
		t.Fatalf("battery temperature=%v", tel.BatteryTemperatureC)
	}
	if tel.BatteryChargePct == nil || *tel.BatteryChargePct != 94 {
		t.Fatalf("battery charge=%v", tel.BatteryChargePct)
	}
}
// TestEnrichPCIeWithRAIDTelemetry stubs raidToolQuery so that only storcli64
// answers, then checks that the parsed CV_Info values land on the matching
// Broadcom/LSI mass storage controller.
func TestEnrichPCIeWithRAIDTelemetry(t *testing.T) {
	saved := raidToolQuery
	t.Cleanup(func() { raidToolQuery = saved })

	raidToolQuery = func(name string, args ...string) ([]byte, error) {
		if name != "storcli64" {
			return nil, errors.New("skip")
		}
		return []byte(`{
"Controllers": [
{
"Response Data": {
"CV_Info": {
"State of Health": "99 %",
"Relative State of Charge": "81 %",
"Temperature": "38 C",
"Voltage": "12.1 V",
"Replacement required": "No"
}
}
}
]
}`), nil
	}

	vendorID := vendorBroadcomLSI
	deviceClass := "MassStorageController"
	initialStatus := statusOK
	controller := schema.HardwarePCIeDevice{
		VendorID:                &vendorID,
		DeviceClass:             &deviceClass,
		HardwareComponentStatus: schema.HardwareComponentStatus{Status: &initialStatus},
	}

	out := enrichPCIeWithRAIDTelemetry([]schema.HardwarePCIeDevice{controller})
	enriched := out[0]
	if enriched.BatteryHealthPct == nil || *enriched.BatteryHealthPct != 99 {
		t.Fatalf("battery health=%v", enriched.BatteryHealthPct)
	}
	if enriched.BatteryChargePct == nil || *enriched.BatteryChargePct != 81 {
		t.Fatalf("battery charge=%v", enriched.BatteryChargePct)
	}
	if enriched.BatteryVoltageV == nil || *enriched.BatteryVoltageV != 12.1 {
		t.Fatalf("battery voltage=%v", enriched.BatteryVoltageV)
	}
}

View File

@@ -0,0 +1,373 @@
package collector
import (
"bee/audit/internal/schema"
"encoding/json"
"log/slog"
"os/exec"
"sort"
"strconv"
"strings"
)
// sensorsDoc is the decoded form of `sensors -j` output as used in this file:
// the outer map is keyed by chip name and each inner map is keyed by feature
// name. Inner values are left as `any` because not every entry is a feature
// object (the "Adapter" entry is skipped by buildSensorsFromDoc).
type sensorsDoc map[string]map[string]any
// collectSensors runs the `sensors` tool and converts its JSON report into the
// schema representation. It returns nil when the tool cannot be queried or when
// the report yields no fan, power, temperature or other readings at all.
func collectSensors() *schema.HardwareSensors {
	doc, err := readSensorsJSONDoc()
	if err != nil {
		// Missing or failing `sensors` is not fatal for the audit.
		slog.Info("sensors: unavailable, skipping", "err", err)
		return nil
	}

	sensors := buildSensorsFromDoc(doc)
	if sensors == nil {
		return nil
	}

	fans, power := len(sensors.Fans), len(sensors.Power)
	temps, other := len(sensors.Temperatures), len(sensors.Other)
	if fans+power+temps+other == 0 {
		return nil
	}

	slog.Info("sensors: collected",
		"fans", fans,
		"power", power,
		"temperatures", temps,
		"other", other,
	)
	return sensors
}
// readSensorsJSONDoc executes `sensors -j` and decodes its standard output.
// Both the command error and the JSON decoding error are returned unchanged.
func readSensorsJSONDoc() (sensorsDoc, error) {
	stdout, runErr := exec.Command("sensors", "-j").Output()
	if runErr != nil {
		return nil, runErr
	}

	var parsed sensorsDoc
	decodeErr := json.Unmarshal(stdout, &parsed)
	if decodeErr != nil {
		return nil, decodeErr
	}
	return parsed, nil
}
// buildSensorsFromDoc converts a decoded `sensors -j` document into schema
// sensor lists. It returns nil for an empty document.
//
// Chips and their features are visited in sorted order so the output is stable.
// Entries named "Adapter" (case-insensitive), entries that are not JSON
// objects, and entries whose trimmed name is empty are skipped. Each remaining
// feature is classified and appended to the matching list unless its builder
// rejects it or a sensor with the same type and name was already emitted.
//
// The duplicate filter is shared across all chips, so a later chip's sensor
// with an already-seen type and name is dropped.
func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
	if len(doc) == 0 {
		return nil
	}

	chipNames := make([]string, 0, len(doc))
	for chipName := range doc {
		chipNames = append(chipNames, chipName)
	}
	sort.Strings(chipNames)

	out := &schema.HardwareSensors{}
	emitted := map[string]struct{}{}

	for _, chipName := range chipNames {
		chipFeatures := doc[chipName]
		// All sensors of one chip share the same location pointer.
		location := sensorLocation(chipName)

		featureNames := make([]string, 0, len(chipFeatures))
		for featureName := range chipFeatures {
			featureNames = append(featureNames, featureName)
		}
		sort.Strings(featureNames)

		for _, featureName := range featureNames {
			if strings.EqualFold(featureName, "Adapter") {
				continue
			}
			readings, isObject := chipFeatures[featureName].(map[string]any)
			if !isObject {
				continue
			}
			name := strings.TrimSpace(featureName)
			if name == "" {
				continue
			}

			switch kind := classifySensorFeature(readings); kind {
			case "fan":
				if fan := buildFanSensor(name, location, readings); fan != nil && !duplicateSensor(emitted, "fan", fan.Name) {
					out.Fans = append(out.Fans, *fan)
				}
			case "temp":
				if temp := buildTempSensor(name, location, readings); temp != nil && !duplicateSensor(emitted, "temp", temp.Name) {
					out.Temperatures = append(out.Temperatures, *temp)
				}
			case "power":
				if power := buildPowerSensor(name, location, readings); power != nil && !duplicateSensor(emitted, "power", power.Name) {
					out.Power = append(out.Power, *power)
				}
			default:
				if other := buildOtherSensor(name, location, readings); other != nil && !duplicateSensor(emitted, "other", other.Name) {
					out.Other = append(out.Other, *other)
				}
			}
		}
	}
	return out
}
// parseSensorsJSON decodes raw `sensors -j` output and converts it into schema
// sensor lists. A JSON decoding failure is returned as-is; an empty document
// yields a nil result with a nil error.
func parseSensorsJSON(raw []byte) (*schema.HardwareSensors, error) {
	var parsed sensorsDoc
	if decodeErr := json.Unmarshal(raw, &parsed); decodeErr != nil {
		return nil, decodeErr
	}
	sensors := buildSensorsFromDoc(parsed)
	return sensors, nil
}
// duplicateSensor reports whether a sensor of the given type and name has
// already been recorded in seen. On first sight the pair is recorded and false
// is returned. Type and name are joined with a NUL byte to form the set key.
func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
	id := sensorType + "\x00" + name
	_, already := seen[id]
	if !already {
		seen[id] = struct{}{}
	}
	return already
}
// sensorLocation returns a pointer to the whitespace-trimmed chip name, or nil
// when nothing remains after trimming.
func sensorLocation(chip string) *string {
	if trimmed := strings.TrimSpace(chip); trimmed != "" {
		return &trimmed
	}
	return nil
}
// classifySensorFeature inspects the reading keys of one feature and returns
// "fan", "temp", "power" or "other".
//
// A key decides the class when it ends in "_input" and contains "fan" or
// "temp", or contains "curr", or starts with "in" (the last two are grouped
// under "power"); a key containing "power" also counts when it ends in
// "_average". The first deciding key encountered wins; keys are visited in map
// iteration order.
func classifySensorFeature(feature map[string]any) string {
	for key := range feature {
		isInput := strings.HasSuffix(key, "_input")

		if isInput && strings.Contains(key, "fan") {
			return "fan"
		}
		if isInput && strings.Contains(key, "temp") {
			return "temp"
		}
		if strings.Contains(key, "power") && (isInput || strings.HasSuffix(key, "_average")) {
			return "power"
		}
		// Current and voltage inputs are reported alongside power readings.
		if isInput && (strings.Contains(key, "curr") || strings.HasPrefix(key, "in")) {
			return "power"
		}
	}
	return "other"
}
// buildFanSensor creates a fan sensor from the feature's "_input" reading,
// truncated to an integer RPM. It returns nil when no numeric "_input" reading
// exists. A status is set only when the feature carries a raised alarm.
func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
	rpm, found := firstFeatureInt(feature, "_input")
	if !found {
		return nil
	}

	fan := &schema.HardwareFanSensor{
		Name:     name,
		Location: location,
		RPM:      &rpm,
	}
	if alarm := sensorStatusFromFeature(feature); alarm != nil {
		fan.Status = alarm
	}
	return fan
}
// buildTempSensor creates a temperature sensor from the feature's "_input"
// reading, or returns nil when there is none.
//
// The warning threshold comes from the first "_max"/"_high" reading and the
// critical threshold from the first "_crit"/"_emergency" reading, when present.
// A raised alarm takes precedence for the status; otherwise the status is
// derived by comparing the current value against the thresholds.
func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
	current, found := firstFeatureFloat(feature, "_input")
	if !found {
		return nil
	}

	sensor := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &current}

	if warn, hasWarn := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); hasWarn {
		sensor.ThresholdWarningCelsius = &warn
	}
	if crit, hasCrit := firstFeatureFloatWithSuffixes(feature, []string{"_crit", "_emergency"}); hasCrit {
		sensor.ThresholdCriticalCelsius = &crit
	}

	status := sensorStatusFromFeature(feature)
	if status == nil {
		status = deriveTemperatureStatus(sensor.Celsius, sensor.ThresholdWarningCelsius, sensor.ThresholdCriticalCelsius)
	}
	sensor.Status = status
	return sensor
}
// buildPowerSensor creates a power/current/voltage sensor from one feature.
//
// PowerW is taken from a "power*" reading ending in "_input" or "_average",
// CurrentA from a "curr*_input" reading and VoltageV from an "in*_input"
// reading. It returns nil when none of the three is available. A status is set
// only when the feature carries a raised alarm.
func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
	item := &schema.HardwarePowerSensor{Name: name, Location: location}

	// Only measured readings may become PowerW. Without this filter the sorted
	// lookup would pick limit or flag keys such as "power1_alarm", "power1_cap"
	// or "power1_crit", which sort before "power1_average"/"power1_input", and
	// report them as the measured wattage. The accepted suffixes mirror the
	// ones classifySensorFeature uses to recognise a power feature.
	measured := make(map[string]any, len(feature))
	for key, raw := range feature {
		if strings.HasSuffix(key, "_input") || strings.HasSuffix(key, "_average") {
			measured[key] = raw
		}
	}
	if v, ok := firstFeatureFloatWithContains(measured, []string{"power"}); ok {
		item.PowerW = &v
	}

	// The prefix helper already restricts itself to "_input" readings.
	if v, ok := firstFeatureFloatWithPrefix(feature, "curr"); ok {
		item.CurrentA = &v
	}
	if v, ok := firstFeatureFloatWithPrefix(feature, "in"); ok {
		item.VoltageV = &v
	}

	if item.PowerW == nil && item.CurrentA == nil && item.VoltageV == nil {
		return nil
	}
	if status := sensorStatusFromFeature(feature); status != nil {
		item.Status = status
	}
	return item
}
// buildOtherSensor creates a generic sensor from the first numeric non-alarm
// reading of the feature, or returns nil when there is none. The unit is set
// only when one could be inferred, and the status only when an alarm is raised.
func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
	reading, unit, found := firstGenericSensorValue(feature)
	if !found {
		return nil
	}

	sensor := &schema.HardwareOtherSensor{
		Name:     name,
		Location: location,
		Value:    &reading,
	}
	if unit != "" {
		sensor.Unit = &unit
	}
	if alarm := sensorStatusFromFeature(feature); alarm != nil {
		sensor.Status = alarm
	}
	return sensor
}
// sensorStatusFromFeature returns a pointer to statusWarning when any reading
// whose key ends in "_alarm" has a numeric value greater than zero, and nil
// otherwise.
func sensorStatusFromFeature(feature map[string]any) *string {
	for key, raw := range feature {
		if strings.HasSuffix(key, "_alarm") {
			level, numeric := floatFromAny(raw)
			if numeric && level > 0 {
				warning := statusWarning
				return &warning
			}
		}
	}
	return nil
}
func deriveTemperatureStatus(current, warning, critical *float64) *string {
if current == nil {
return nil
}
switch {
case critical != nil && *current >= *critical:
status := statusCritical
return &status
case warning != nil && *current >= *warning:
status := statusWarning
return &status
default:
status := statusOK
return &status
}
}
// firstFeatureInt returns the value of the lexicographically smallest key that
// ends in suffix and holds a numeric value, truncated toward zero. The second
// result is false when no such key exists.
//
// Choosing the smallest key, rather than the first one met during map
// iteration, keeps the result deterministic and consistent with the
// firstFeatureFloat* helpers, which scan keys in sorted order.
func firstFeatureInt(feature map[string]any, suffix string) (int, bool) {
	var (
		chosenKey   string
		chosenValue float64
		found       bool
	)
	for key, raw := range feature {
		if !strings.HasSuffix(key, suffix) {
			continue
		}
		value, ok := floatFromAny(raw)
		if !ok {
			continue
		}
		if !found || key < chosenKey {
			chosenKey, chosenValue, found = key, value, true
		}
	}
	if !found {
		return 0, false
	}
	return int(chosenValue), true
}
// firstFeatureFloat is the single-suffix form of firstFeatureFloatWithSuffixes.
func firstFeatureFloat(feature map[string]any, suffix string) (float64, bool) {
	value, found := firstFeatureFloatWithSuffixes(feature, []string{suffix})
	return value, found
}
// firstFeatureFloatWithSuffixes scans the feature's keys in sorted order and
// returns the numeric value of the first key ending in any of the given
// suffixes. Keys with non-numeric values are passed over. The second result is
// false when nothing matches.
func firstFeatureFloatWithSuffixes(feature map[string]any, suffixes []string) (float64, bool) {
	for _, key := range sortedFeatureKeys(feature) {
		matches := false
		for _, suffix := range suffixes {
			if strings.HasSuffix(key, suffix) {
				matches = true
				break
			}
		}
		if !matches {
			continue
		}
		if value, numeric := floatFromAny(feature[key]); numeric {
			return value, true
		}
	}
	return 0, false
}
// firstFeatureFloatWithContains scans the feature's keys in sorted order and
// returns the numeric value of the first key that contains every string in
// parts. Keys with non-numeric values are passed over. The second result is
// false when nothing matches.
func firstFeatureFloatWithContains(feature map[string]any, parts []string) (float64, bool) {
nextKey:
	for _, key := range sortedFeatureKeys(feature) {
		for _, part := range parts {
			if !strings.Contains(key, part) {
				continue nextKey
			}
		}
		if value, numeric := floatFromAny(feature[key]); numeric {
			return value, true
		}
	}
	return 0, false
}
// firstFeatureFloatWithPrefix scans the feature's keys in sorted order and
// returns the numeric value of the first key that starts with prefix and ends
// in "_input". The second result is false when nothing matches.
func firstFeatureFloatWithPrefix(feature map[string]any, prefix string) (float64, bool) {
	for _, key := range sortedFeatureKeys(feature) {
		if !strings.HasPrefix(key, prefix) || !strings.HasSuffix(key, "_input") {
			continue
		}
		value, numeric := floatFromAny(feature[key])
		if numeric {
			return value, true
		}
	}
	return 0, false
}
// firstGenericSensorValue scans the feature's keys in sorted order and returns
// the first numeric reading whose key does not end in "_alarm", together with
// the unit inferred from that key. The last result is false when the feature
// has no such reading.
func firstGenericSensorValue(feature map[string]any) (float64, string, bool) {
	for _, key := range sortedFeatureKeys(feature) {
		if strings.HasSuffix(key, "_alarm") {
			continue
		}
		if reading, numeric := floatFromAny(feature[key]); numeric {
			return reading, inferSensorUnit(key), true
		}
	}
	return 0, "", false
}
// inferSensorUnit guesses a display unit from a reading key. Only humidity
// readings get a unit ("%"); every other key, intrusion readings included,
// yields the empty string.
func inferSensorUnit(key string) string {
	if strings.Contains(key, "humidity") {
		return "%"
	}
	return ""
}
// sortedFeatureKeys returns the feature's keys in ascending string order. The
// result is an empty, non-nil slice for an empty or nil map.
func sortedFeatureKeys(feature map[string]any) []string {
	names := make([]string, len(feature))
	next := 0
	for name := range feature {
		names[next] = name
		next++
	}
	sort.Strings(names)
	return names
}
// floatFromAny converts a decoded JSON value to float64. It accepts float64,
// float32, int, int64, json.Number and numeric strings. The second result is
// false for any other type, for the empty string, and for text that does not
// parse as a float.
func floatFromAny(raw any) (float64, bool) {
	var (
		parsed   float64
		parseErr error
	)

	switch typed := raw.(type) {
	case float64:
		return typed, true
	case float32:
		return float64(typed), true
	case int:
		return float64(typed), true
	case int64:
		return float64(typed), true
	case json.Number:
		parsed, parseErr = typed.Float64()
	case string:
		if typed == "" {
			return 0, false
		}
		parsed, parseErr = strconv.ParseFloat(typed, 64)
	default:
		return 0, false
	}

	if parseErr != nil {
		return 0, false
	}
	return parsed, true
}

View File

@@ -0,0 +1,54 @@
package collector
import "testing"
// TestParseSensorsJSON parses a two-chip `sensors -j` fixture and checks how
// the features are bucketed: one temperature, one fan, three power-class
// readings (voltage, current, power) and one "other" reading (humidity) that
// carries the "%" unit.
func TestParseSensorsJSON(t *testing.T) {
	fixture := []byte(`{
"coretemp-isa-0000": {
"Adapter": "ISA adapter",
"Package id 0": {
"temp1_input": 61.5,
"temp1_max": 80.0,
"temp1_crit": 95.0
},
"fan1": {
"fan1_input": 4200
}
},
"acpitz-acpi-0": {
"Adapter": "ACPI interface",
"in0": {
"in0_input": 12.06
},
"curr1": {
"curr1_input": 0.64
},
"power1": {
"power1_average": 137.0
},
"humidity1": {
"humidity1_input": 38.5
}
}
}`)

	got, err := parseSensorsJSON(fixture)
	if err != nil {
		t.Fatalf("parseSensorsJSON error: %v", err)
	}
	if got == nil {
		t.Fatal("expected sensors")
	}

	temps := got.Temperatures
	if len(temps) != 1 || temps[0].Celsius == nil || *temps[0].Celsius != 61.5 {
		t.Fatalf("temperatures mismatch: %#v", got.Temperatures)
	}

	fans := got.Fans
	if len(fans) != 1 || fans[0].RPM == nil || *fans[0].RPM != 4200 {
		t.Fatalf("fans mismatch: %#v", got.Fans)
	}

	if powerCount := len(got.Power); powerCount != 3 {
		t.Fatalf("power sensors mismatch: %#v", got.Power)
	}

	others := got.Other
	if len(others) != 1 || others[0].Unit == nil || *others[0].Unit != "%" {
		t.Fatalf("other sensors mismatch: %#v", got.Other)
	}
}

View File

@@ -26,13 +26,13 @@ func collectStorage() []schema.HardwareStorage {
// lsblkDevice is a minimal lsblk JSON record.
type lsblkDevice struct {
Name string `json:"name"`
Type string `json:"type"`
Size string `json:"size"`
Serial string `json:"serial"`
Model string `json:"model"`
Tran string `json:"tran"`
Hctl string `json:"hctl"`
Name string `json:"name"`
Type string `json:"type"`
Size string `json:"size"`
Serial string `json:"serial"`
Model string `json:"model"`
Tran string `json:"tran"`
Hctl string `json:"hctl"`
}
type lsblkRoot struct {
@@ -67,7 +67,10 @@ type smartctlInfo struct {
SerialNumber string `json:"serial_number"`
FirmwareVer string `json:"firmware_version"`
RotationRate int `json:"rotation_rate"`
SmartStatus struct {
Temperature struct {
Current int `json:"current"`
} `json:"temperature"`
SmartStatus struct {
Passed bool `json:"passed"`
} `json:"smart_status"`
UserCapacity struct {
@@ -75,9 +78,11 @@ type smartctlInfo struct {
} `json:"user_capacity"`
AtaSmartAttributes struct {
Table []struct {
ID int `json:"id"`
Name string `json:"name"`
Raw struct{ Value int64 `json:"value"` } `json:"raw"`
ID int `json:"id"`
Name string `json:"name"`
Raw struct {
Value int64 `json:"value"`
} `json:"raw"`
} `json:"table"`
} `json:"ata_smart_attributes"`
PowerOnTime struct {
@@ -130,7 +135,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
return s
}
var info smartctlInfo
var info smartctlInfo
if err := json.Unmarshal(out, &info); err == nil {
if v := cleanDMIValue(info.ModelName); v != "" {
s.Model = &v
@@ -152,14 +157,19 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
} else if info.RotationRate > 0 {
devType = "HDD"
}
s.Type = &devType
// telemetry
tel := map[string]any{}
if info.Temperature.Current > 0 {
t := float64(info.Temperature.Current)
s.TemperatureC = &t
}
if info.PowerOnTime.Hours > 0 {
tel["power_on_hours"] = info.PowerOnTime.Hours
v := int64(info.PowerOnTime.Hours)
s.PowerOnHours = &v
}
if info.PowerCycleCount > 0 {
tel["power_cycles"] = info.PowerCycleCount
v := int64(info.PowerCycleCount)
s.PowerCycles = &v
}
reallocated := int64(0)
pending := int64(0)
@@ -169,77 +179,79 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
switch attr.ID {
case 5:
reallocated = attr.Raw.Value
tel["reallocated_sectors"] = attr.Raw.Value
s.ReallocatedSectors = &reallocated
case 177:
tel["wear_leveling_pct"] = attr.Raw.Value
value := float64(attr.Raw.Value)
s.LifeUsedPct = &value
case 231:
lifeRemaining = attr.Raw.Value
tel["life_remaining_pct"] = attr.Raw.Value
value := float64(attr.Raw.Value)
s.LifeRemainingPct = &value
case 241:
tel["total_lba_written"] = attr.Raw.Value
value := attr.Raw.Value
s.WrittenBytes = &value
case 197:
pending = attr.Raw.Value
tel["current_pending_sectors"] = attr.Raw.Value
s.CurrentPendingSectors = &pending
case 198:
uncorrectable = attr.Raw.Value
tel["offline_uncorrectable"] = attr.Raw.Value
s.OfflineUncorrectable = &uncorrectable
}
}
if len(tel) > 0 {
s.Telemetry = tel
}
status := storageHealthStatus{
overallPassed: info.SmartStatus.Passed,
hasOverall: true,
reallocatedSectors: reallocated,
pendingSectors: pending,
overallPassed: info.SmartStatus.Passed,
hasOverall: true,
reallocatedSectors: reallocated,
pendingSectors: pending,
offlineUncorrectable: uncorrectable,
lifeRemainingPct: lifeRemaining,
lifeRemainingPct: lifeRemaining,
}
setStorageHealthStatus(&s, status)
return s
}
s.Type = &devType
status := "UNKNOWN"
status := statusUnknown
s.Status = &status
return s
}
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
type nvmeSmartLog struct {
CriticalWarning int `json:"critical_warning"`
PercentageUsed int `json:"percentage_used"`
AvailableSpare int `json:"available_spare"`
SpareThreshold int `json:"spare_thresh"`
PowerOnHours int64 `json:"power_on_hours"`
PowerCycles int64 `json:"power_cycles"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
DataUnitsWritten int64 `json:"data_units_written"`
ControllerBusy int64 `json:"controller_busy_time"`
MediaErrors int64 `json:"media_errors"`
NumErrLogEntries int64 `json:"num_err_log_entries"`
CriticalWarning int `json:"critical_warning"`
PercentageUsed int `json:"percentage_used"`
AvailableSpare int `json:"available_spare"`
SpareThreshold int `json:"spare_thresh"`
Temperature int64 `json:"temperature"`
PowerOnHours int64 `json:"power_on_hours"`
PowerCycles int64 `json:"power_cycles"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
DataUnitsRead int64 `json:"data_units_read"`
DataUnitsWritten int64 `json:"data_units_written"`
ControllerBusy int64 `json:"controller_busy_time"`
MediaErrors int64 `json:"media_errors"`
NumErrLogEntries int64 `json:"num_err_log_entries"`
}
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
type nvmeIDCtrl struct {
ModelNumber string `json:"mn"`
SerialNumber string `json:"sn"`
FirmwareRev string `json:"fr"`
TotalCapacity int64 `json:"tnvmcap"`
ModelNumber string `json:"mn"`
SerialNumber string `json:"sn"`
FirmwareRev string `json:"fr"`
TotalCapacity int64 `json:"tnvmcap"`
}
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
present := true
devType := "NVMe"
iface := "NVMe"
status := "OK"
status := statusOK
s := schema.HardwareStorage{
Present: &present,
Type: &devType,
Interface: &iface,
Status: &status,
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
Present: &present,
Type: &devType,
Interface: &iface,
}
devPath := "/dev/" + dev.Name
@@ -268,100 +280,123 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
var log nvmeSmartLog
if json.Unmarshal(out, &log) == nil {
tel := map[string]any{}
if log.CriticalWarning > 0 {
tel["critical_warning"] = log.CriticalWarning
}
if log.PowerOnHours > 0 {
tel["power_on_hours"] = log.PowerOnHours
s.PowerOnHours = &log.PowerOnHours
}
if log.PowerCycles > 0 {
tel["power_cycles"] = log.PowerCycles
s.PowerCycles = &log.PowerCycles
}
if log.UnsafeShutdowns > 0 {
tel["unsafe_shutdowns"] = log.UnsafeShutdowns
s.UnsafeShutdowns = &log.UnsafeShutdowns
}
if log.PercentageUsed > 0 {
tel["percentage_used"] = log.PercentageUsed
v := float64(log.PercentageUsed)
s.LifeUsedPct = &v
remaining := 100 - v
s.LifeRemainingPct = &remaining
}
if log.DataUnitsWritten > 0 {
tel["data_units_written"] = log.DataUnitsWritten
v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
s.WrittenBytes = &v
}
if log.ControllerBusy > 0 {
tel["controller_busy_time"] = log.ControllerBusy
if log.DataUnitsRead > 0 {
v := nvmeDataUnitsToBytes(log.DataUnitsRead)
s.ReadBytes = &v
}
if log.AvailableSpare > 0 {
tel["available_spare_pct"] = log.AvailableSpare
}
if log.SpareThreshold > 0 {
tel["available_spare_threshold_pct"] = log.SpareThreshold
v := float64(log.AvailableSpare)
s.AvailableSparePct = &v
}
if log.MediaErrors > 0 {
tel["media_errors"] = log.MediaErrors
s.MediaErrors = &log.MediaErrors
}
if log.NumErrLogEntries > 0 {
tel["error_log_entries"] = log.NumErrLogEntries
s.ErrorLogEntries = &log.NumErrLogEntries
}
if len(tel) > 0 {
s.Telemetry = tel
if log.Temperature > 0 {
v := float64(log.Temperature - 273)
s.TemperatureC = &v
}
setStorageHealthStatus(&s, storageHealthStatus{
criticalWarning: log.CriticalWarning,
percentageUsed: int64(log.PercentageUsed),
availableSpare: int64(log.AvailableSpare),
spareThreshold: int64(log.SpareThreshold),
unsafeShutdowns: log.UnsafeShutdowns,
mediaErrors: log.MediaErrors,
errorLogEntries: log.NumErrLogEntries,
criticalWarning: log.CriticalWarning,
percentageUsed: int64(log.PercentageUsed),
availableSpare: int64(log.AvailableSpare),
spareThreshold: int64(log.SpareThreshold),
unsafeShutdowns: log.UnsafeShutdowns,
mediaErrors: log.MediaErrors,
errorLogEntries: log.NumErrLogEntries,
})
return s
}
}
status = "UNKNOWN"
status = statusUnknown
s.Status = &status
return s
}
// nvmeDataUnitsToBytes converts an NVMe SMART "data units" counter to bytes
// using 512000 bytes (1000 × 512) per unit. Zero and negative counters map
// to 0.
func nvmeDataUnitsToBytes(units int64) int64 {
	const bytesPerDataUnit = 512000
	if units > 0 {
		return units * bytesPerDataUnit
	}
	return 0
}
type storageHealthStatus struct {
hasOverall bool
overallPassed bool
reallocatedSectors int64
pendingSectors int64
offlineUncorrectable int64
lifeRemainingPct int64
criticalWarning int
percentageUsed int64
availableSpare int64
spareThreshold int64
unsafeShutdowns int64
mediaErrors int64
errorLogEntries int64
hasOverall bool
overallPassed bool
reallocatedSectors int64
pendingSectors int64
offlineUncorrectable int64
lifeRemainingPct int64
criticalWarning int
percentageUsed int64
availableSpare int64
spareThreshold int64
unsafeShutdowns int64
mediaErrors int64
errorLogEntries int64
}
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
status := "OK"
status := statusOK
var description *string
switch {
case health.hasOverall && !health.overallPassed:
status = "FAILED"
status = statusCritical
description = stringPtr("SMART overall self-assessment failed")
case health.criticalWarning > 0:
status = "FAILED"
status = statusCritical
description = stringPtr("NVMe critical warning is set")
case health.pendingSectors > 0 || health.offlineUncorrectable > 0:
status = "FAILED"
status = statusCritical
description = stringPtr("Pending or offline uncorrectable sectors detected")
case health.mediaErrors > 0:
status = "WARNING"
status = statusWarning
description = stringPtr("Media errors reported")
case health.reallocatedSectors > 0:
status = "WARNING"
status = statusWarning
description = stringPtr("Reallocated sectors detected")
case health.errorLogEntries > 0:
status = "WARNING"
status = statusWarning
description = stringPtr("Device error log contains entries")
case health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10:
status = "WARNING"
status = statusWarning
description = stringPtr("Life remaining is low")
case health.percentageUsed >= 95:
status = "WARNING"
status = statusWarning
description = stringPtr("Drive wear level is high")
case health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold:
status = "WARNING"
status = statusWarning
description = stringPtr("Available spare is at or below threshold")
case health.unsafeShutdowns > 100:
status = "WARNING"
status = statusWarning
description = stringPtr("Unsafe shutdown count is high")
}
s.Status = &status
s.ErrorDescription = description
}
// stringPtr returns a pointer to a fresh copy of value, for populating optional
// string fields from literals.
func stringPtr(value string) *string {
	ptr := new(string)
	*ptr = value
	return ptr
}

View File

@@ -17,37 +17,37 @@ func TestSetStorageHealthStatus(t *testing.T) {
{
name: "smart overall failed",
health: storageHealthStatus{hasOverall: true, overallPassed: false},
want: "FAILED",
want: statusCritical,
},
{
name: "nvme critical warning",
health: storageHealthStatus{criticalWarning: 1},
want: "FAILED",
want: statusCritical,
},
{
name: "pending sectors",
health: storageHealthStatus{pendingSectors: 1},
want: "FAILED",
want: statusCritical,
},
{
name: "media errors warning",
health: storageHealthStatus{mediaErrors: 2},
want: "WARNING",
want: statusWarning,
},
{
name: "reallocated warning",
health: storageHealthStatus{reallocatedSectors: 1},
want: "WARNING",
want: statusWarning,
},
{
name: "life remaining low",
health: storageHealthStatus{lifeRemainingPct: 8},
want: "WARNING",
want: statusWarning,
},
{
name: "healthy",
health: storageHealthStatus{},
want: "OK",
want: statusOK,
},
}

View File

@@ -6,31 +6,31 @@ import (
"time"
)
func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSummary {
func BuildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSummary {
summary := &schema.HardwareHealthSummary{
Status: "OK",
Status: statusOK,
CollectedAt: time.Now().UTC().Format(time.RFC3339),
}
for _, dimm := range snap.Memory {
switch derefString(dimm.Status) {
case "WARNING":
case statusWarning:
summary.MemoryWarn++
summary.Warnings = append(summary.Warnings, formatMemorySummary(dimm))
case "FAILED":
case statusCritical:
summary.MemoryFail++
summary.Failures = append(summary.Failures, formatMemorySummary(dimm))
case "EMPTY":
case statusEmpty:
summary.EmptyDIMMs++
}
}
for _, disk := range snap.Storage {
switch derefString(disk.Status) {
case "WARNING":
case statusWarning:
summary.StorageWarn++
summary.Warnings = append(summary.Warnings, formatStorageSummary(disk))
case "FAILED":
case statusCritical:
summary.StorageFail++
summary.Failures = append(summary.Failures, formatStorageSummary(disk))
}
@@ -38,10 +38,10 @@ func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSumm
for _, dev := range snap.PCIeDevices {
switch derefString(dev.Status) {
case "WARNING":
case statusWarning:
summary.PCIeWarn++
summary.Warnings = append(summary.Warnings, formatPCIeSummary(dev))
case "FAILED":
case statusCritical:
summary.PCIeFail++
summary.Failures = append(summary.Failures, formatPCIeSummary(dev))
}
@@ -52,19 +52,19 @@ func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSumm
summary.MissingPSUs++
}
switch derefString(psu.Status) {
case "WARNING":
case statusWarning:
summary.PSUWarn++
summary.Warnings = append(summary.Warnings, formatPSUSummary(psu))
case "FAILED":
case statusCritical:
summary.PSUFail++
summary.Failures = append(summary.Failures, formatPSUSummary(psu))
}
}
if len(summary.Failures) > 0 || summary.StorageFail > 0 || summary.PCIeFail > 0 || summary.PSUFail > 0 || summary.MemoryFail > 0 {
summary.Status = "FAILED"
summary.Status = statusCritical
} else if len(summary.Warnings) > 0 || summary.StorageWarn > 0 || summary.PCIeWarn > 0 || summary.PSUWarn > 0 || summary.MemoryWarn > 0 {
summary.Status = "WARNING"
summary.Status = statusWarning
}
if len(summary.Warnings) == 0 {

View File

@@ -31,7 +31,7 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
func TestHasVROCController(t *testing.T) {
intel := vendorIntel
model := "Volume Management Device NVMe RAID Controller"
class := "RAID bus controller"
class := "MassStorageController"
tests := []struct {
name string
pcie []schema.HardwarePCIeDevice

View File

@@ -5,10 +5,10 @@ package schema
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
// It is accepted as-is by the core /api/ingest/hardware endpoint.
type HardwareIngestRequest struct {
Filename *string `json:"filename"`
SourceType *string `json:"source_type"`
Protocol *string `json:"protocol"`
TargetHost string `json:"target_host"`
Filename *string `json:"filename,omitempty"`
SourceType *string `json:"source_type,omitempty"`
Protocol *string `json:"protocol,omitempty"`
TargetHost *string `json:"target_host,omitempty"`
CollectedAt string `json:"collected_at"`
Hardware HardwareSnapshot `json:"hardware"`
}
@@ -21,32 +21,32 @@ type HardwareSnapshot struct {
Storage []HardwareStorage `json:"storage,omitempty"`
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
Summary *HardwareHealthSummary `json:"summary,omitempty"`
Sensors *HardwareSensors `json:"sensors,omitempty"`
}
type HardwareHealthSummary struct {
Status string `json:"status"`
Warnings []string `json:"warnings,omitempty"`
Failures []string `json:"failures,omitempty"`
StorageWarn int `json:"storage_warn,omitempty"`
StorageFail int `json:"storage_fail,omitempty"`
PCIeWarn int `json:"pcie_warn,omitempty"`
PCIeFail int `json:"pcie_fail,omitempty"`
PSUWarn int `json:"psu_warn,omitempty"`
PSUFail int `json:"psu_fail,omitempty"`
MemoryWarn int `json:"memory_warn,omitempty"`
MemoryFail int `json:"memory_fail,omitempty"`
EmptyDIMMs int `json:"empty_dimms,omitempty"`
MissingPSUs int `json:"missing_psus,omitempty"`
CollectedAt string `json:"collected_at,omitempty"`
Status string `json:"status"`
Warnings []string `json:"warnings,omitempty"`
Failures []string `json:"failures,omitempty"`
StorageWarn int `json:"storage_warn,omitempty"`
StorageFail int `json:"storage_fail,omitempty"`
PCIeWarn int `json:"pcie_warn,omitempty"`
PCIeFail int `json:"pcie_fail,omitempty"`
PSUWarn int `json:"psu_warn,omitempty"`
PSUFail int `json:"psu_fail,omitempty"`
MemoryWarn int `json:"memory_warn,omitempty"`
MemoryFail int `json:"memory_fail,omitempty"`
EmptyDIMMs int `json:"empty_dimms,omitempty"`
MissingPSUs int `json:"missing_psus,omitempty"`
CollectedAt string `json:"collected_at,omitempty"`
}
type HardwareBoard struct {
Manufacturer *string `json:"manufacturer"`
ProductName *string `json:"product_name"`
Manufacturer *string `json:"manufacturer,omitempty"`
ProductName *string `json:"product_name,omitempty"`
SerialNumber string `json:"serial_number"`
PartNumber *string `json:"part_number"`
UUID *string `json:"uuid"`
PartNumber *string `json:"part_number,omitempty"`
UUID *string `json:"uuid,omitempty"`
}
type HardwareFirmwareRecord struct {
@@ -55,77 +55,183 @@ type HardwareFirmwareRecord struct {
}
type HardwareCPU struct {
Socket *int `json:"socket"`
Model *string `json:"model"`
Manufacturer *string `json:"manufacturer"`
Status *string `json:"status"`
SerialNumber *string `json:"serial_number"`
Firmware *string `json:"firmware"`
Cores *int `json:"cores"`
Threads *int `json:"threads"`
FrequencyMHz *int `json:"frequency_mhz"`
MaxFrequencyMHz *int `json:"max_frequency_mhz"`
HardwareComponentStatus
Socket *int `json:"socket,omitempty"`
Model *string `json:"model,omitempty"`
Manufacturer *string `json:"manufacturer,omitempty"`
SerialNumber *string `json:"serial_number,omitempty"`
Firmware *string `json:"firmware,omitempty"`
Cores *int `json:"cores,omitempty"`
Threads *int `json:"threads,omitempty"`
FrequencyMHz *int `json:"frequency_mhz,omitempty"`
MaxFrequencyMHz *int `json:"max_frequency_mhz,omitempty"`
TemperatureC *float64 `json:"temperature_c,omitempty"`
PowerW *float64 `json:"power_w,omitempty"`
Throttled *bool `json:"throttled,omitempty"`
CorrectableErrorCount *int64 `json:"correctable_error_count,omitempty"`
UncorrectableErrorCount *int64 `json:"uncorrectable_error_count,omitempty"`
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
Present *bool `json:"present,omitempty"`
}
type HardwareMemory struct {
Slot *string `json:"slot"`
Location *string `json:"location"`
Present *bool `json:"present"`
SizeMB *int `json:"size_mb"`
Type *string `json:"type"`
MaxSpeedMHz *int `json:"max_speed_mhz"`
CurrentSpeedMHz *int `json:"current_speed_mhz"`
Manufacturer *string `json:"manufacturer"`
SerialNumber *string `json:"serial_number"`
PartNumber *string `json:"part_number"`
Status *string `json:"status"`
HardwareComponentStatus
Slot *string `json:"slot,omitempty"`
Location *string `json:"location,omitempty"`
Present *bool `json:"present,omitempty"`
SizeMB *int `json:"size_mb,omitempty"`
Type *string `json:"type,omitempty"`
MaxSpeedMHz *int `json:"max_speed_mhz,omitempty"`
CurrentSpeedMHz *int `json:"current_speed_mhz,omitempty"`
Manufacturer *string `json:"manufacturer,omitempty"`
SerialNumber *string `json:"serial_number,omitempty"`
PartNumber *string `json:"part_number,omitempty"`
TemperatureC *float64 `json:"temperature_c,omitempty"`
CorrectableECCErrorCount *int64 `json:"correctable_ecc_error_count,omitempty"`
UncorrectableECCErrorCount *int64 `json:"uncorrectable_ecc_error_count,omitempty"`
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
SpareBlocksRemainingPct *float64 `json:"spare_blocks_remaining_pct,omitempty"`
PerformanceDegraded *bool `json:"performance_degraded,omitempty"`
DataLossDetected *bool `json:"data_loss_detected,omitempty"`
}
type HardwareStorage struct {
Slot *string `json:"slot"`
Type *string `json:"type"`
Model *string `json:"model"`
SizeGB *int `json:"size_gb"`
SerialNumber *string `json:"serial_number"`
Manufacturer *string `json:"manufacturer"`
Firmware *string `json:"firmware"`
Interface *string `json:"interface"`
Present *bool `json:"present"`
Status *string `json:"status"`
Telemetry map[string]any `json:"telemetry,omitempty"`
HardwareComponentStatus
Slot *string `json:"slot,omitempty"`
Type *string `json:"type,omitempty"`
Model *string `json:"model,omitempty"`
SizeGB *int `json:"size_gb,omitempty"`
SerialNumber *string `json:"serial_number,omitempty"`
Manufacturer *string `json:"manufacturer,omitempty"`
Firmware *string `json:"firmware,omitempty"`
Interface *string `json:"interface,omitempty"`
Present *bool `json:"present,omitempty"`
TemperatureC *float64 `json:"temperature_c,omitempty"`
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
PowerCycles *int64 `json:"power_cycles,omitempty"`
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
MediaErrors *int64 `json:"media_errors,omitempty"`
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
WrittenBytes *int64 `json:"written_bytes,omitempty"`
ReadBytes *int64 `json:"read_bytes,omitempty"`
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
Telemetry map[string]any `json:"-"`
}
// HardwarePCIeDevice is the JSON representation of one PCIe device.
//
// The embedded HardwareComponentStatus carries no tag, so encoding/json
// flattens its status fields into the same object. All remaining exported
// fields are tagged omitempty: nil pointers and an empty MacAddresses slice
// are left out of the encoded output.
type HardwarePCIeDevice struct {
	HardwareComponentStatus

	Slot                   *string  `json:"slot,omitempty"`
	VendorID               *int     `json:"vendor_id,omitempty"`
	DeviceID               *int     `json:"device_id,omitempty"`
	NUMANode               *int     `json:"numa_node,omitempty"`
	TemperatureC           *float64 `json:"temperature_c,omitempty"`
	PowerW                 *float64 `json:"power_w,omitempty"`
	LifeRemainingPct       *float64 `json:"life_remaining_pct,omitempty"`
	LifeUsedPct            *float64 `json:"life_used_pct,omitempty"`
	ECCCorrectedTotal      *int64   `json:"ecc_corrected_total,omitempty"`
	ECCUncorrectedTotal    *int64   `json:"ecc_uncorrected_total,omitempty"`
	HWSlowdown             *bool    `json:"hw_slowdown,omitempty"`
	BatteryChargePct       *float64 `json:"battery_charge_pct,omitempty"`
	BatteryHealthPct       *float64 `json:"battery_health_pct,omitempty"`
	BatteryTemperatureC    *float64 `json:"battery_temperature_c,omitempty"`
	BatteryVoltageV        *float64 `json:"battery_voltage_v,omitempty"`
	BatteryReplaceRequired *bool    `json:"battery_replace_required,omitempty"`
	SFPTemperatureC        *float64 `json:"sfp_temperature_c,omitempty"`
	SFPTXPowerDBM          *float64 `json:"sfp_tx_power_dbm,omitempty"`
	SFPRXPowerDBM          *float64 `json:"sfp_rx_power_dbm,omitempty"`
	SFPVoltageV            *float64 `json:"sfp_voltage_v,omitempty"`
	SFPBiasMA              *float64 `json:"sfp_bias_ma,omitempty"`
	BDF                    *string  `json:"bdf,omitempty"`
	DeviceClass            *string  `json:"device_class,omitempty"`
	Manufacturer           *string  `json:"manufacturer,omitempty"`
	Model                  *string  `json:"model,omitempty"`
	LinkWidth              *int     `json:"link_width,omitempty"`
	LinkSpeed              *string  `json:"link_speed,omitempty"`
	MaxLinkWidth           *int     `json:"max_link_width,omitempty"`
	MaxLinkSpeed           *string  `json:"max_link_speed,omitempty"`
	SerialNumber           *string  `json:"serial_number,omitempty"`
	Firmware               *string  `json:"firmware,omitempty"`
	MacAddresses           []string `json:"mac_addresses,omitempty"`
	Present                *bool    `json:"present,omitempty"`
	// Telemetry is tagged json:"-", so encoding/json neither writes nor reads it.
	Telemetry map[string]any `json:"-"`
}
// HardwarePowerSupply is the JSON representation of one power supply unit.
//
// The embedded HardwareComponentStatus carries no tag, so encoding/json
// flattens its status fields into the same object. All remaining fields are
// optional pointers tagged omitempty: a nil pointer is left out of the encoded
// output, while a pointer to a zero value is still written.
type HardwarePowerSupply struct {
	HardwareComponentStatus

	Slot             *string  `json:"slot,omitempty"`
	Present          *bool    `json:"present,omitempty"`
	Model            *string  `json:"model,omitempty"`
	Vendor           *string  `json:"vendor,omitempty"`
	WattageW         *int     `json:"wattage_w,omitempty"`
	SerialNumber     *string  `json:"serial_number,omitempty"`
	PartNumber       *string  `json:"part_number,omitempty"`
	Firmware         *string  `json:"firmware,omitempty"`
	InputType        *string  `json:"input_type,omitempty"`
	InputPowerW      *float64 `json:"input_power_w,omitempty"`
	OutputPowerW     *float64 `json:"output_power_w,omitempty"`
	InputVoltage     *float64 `json:"input_voltage,omitempty"`
	TemperatureC     *float64 `json:"temperature_c,omitempty"`
	LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
	LifeUsedPct      *float64 `json:"life_used_pct,omitempty"`
}
// HardwareComponentStatus groups the status-related JSON fields of a hardware
// component. Every field is tagged omitempty, so a zero-valued
// HardwareComponentStatus contributes nothing to the encoded output.
type HardwareComponentStatus struct {
	Status           *string                 `json:"status,omitempty"`
	StatusCheckedAt  *string                 `json:"status_checked_at,omitempty"`
	StatusChangedAt  *string                 `json:"status_changed_at,omitempty"`
	StatusHistory    []HardwareStatusHistory `json:"status_history,omitempty"`
	ErrorDescription *string                 `json:"error_description,omitempty"`
}
// HardwareStatusHistory is one entry of a component's status history.
//
// Status and ChangedAt are always encoded, even when empty; Details is
// written only when the pointer is non-nil.
type HardwareStatusHistory struct {
	Status    string  `json:"status"`
	ChangedAt string  `json:"changed_at"`
	Details   *string `json:"details,omitempty"`
}
// HardwareSensors bundles sensor readings into four lists: fans, power,
// temperatures and other. Each list is tagged omitempty, so an empty or nil
// slice is left out of the encoded JSON.
type HardwareSensors struct {
	Fans         []HardwareFanSensor         `json:"fans,omitempty"`
	Power        []HardwarePowerSensor       `json:"power,omitempty"`
	Temperatures []HardwareTemperatureSensor `json:"temperatures,omitempty"`
	Other        []HardwareOtherSensor       `json:"other,omitempty"`
}
// HardwareFanSensor is the JSON representation of one fan sensor reading.
//
// Name is always encoded; the pointer fields are written only when non-nil.
type HardwareFanSensor struct {
	Name     string  `json:"name"`
	Location *string `json:"location,omitempty"`
	RPM      *int    `json:"rpm,omitempty"`
	Status   *string `json:"status,omitempty"`
}
// HardwarePowerSensor is the JSON representation of one power sensor reading.
//
// Name is always encoded; the pointer fields are written only when non-nil.
type HardwarePowerSensor struct {
	Name     string   `json:"name"`
	Location *string  `json:"location,omitempty"`
	VoltageV *float64 `json:"voltage_v,omitempty"`
	CurrentA *float64 `json:"current_a,omitempty"`
	PowerW   *float64 `json:"power_w,omitempty"`
	Status   *string  `json:"status,omitempty"`
}
// HardwareTemperatureSensor is the JSON representation of one temperature
// sensor reading, with optional warning and critical thresholds.
//
// Name is always encoded; the pointer fields are written only when non-nil.
type HardwareTemperatureSensor struct {
	Name                     string   `json:"name"`
	Location                 *string  `json:"location,omitempty"`
	Celsius                  *float64 `json:"celsius,omitempty"`
	ThresholdWarningCelsius  *float64 `json:"threshold_warning_celsius,omitempty"`
	ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
	Status                   *string  `json:"status,omitempty"`
}
// HardwareOtherSensor is the JSON representation of one sensor reading in the
// "other" list of HardwareSensors: a numeric value with an optional unit.
//
// Name is always encoded; the pointer fields are written only when non-nil.
type HardwareOtherSensor struct {
	Name     string   `json:"name"`
	Location *string  `json:"location,omitempty"`
	Value    *float64 `json:"value,omitempty"`
	Unit     *string  `json:"unit,omitempty"`
	Status   *string  `json:"status,omitempty"`
}