Align hardware export with ingest contract

This commit is contained in:
Mikhail Chusavitin
2026-03-15 21:04:53 +03:00
parent b8c235b5ac
commit ab5a4be7ac
37 changed files with 3304 additions and 354 deletions

View File

@@ -317,38 +317,20 @@ func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, erro
} }
func (a *App) HealthSummaryResult() ActionResult { func (a *App) HealthSummaryResult() ActionResult {
type auditFile struct {
Hardware struct {
Summary struct {
Status string `json:"status"`
Warnings []string `json:"warnings"`
Failures []string `json:"failures"`
StorageWarn int `json:"storage_warn"`
StorageFail int `json:"storage_fail"`
PCIeWarn int `json:"pcie_warn"`
PCIeFail int `json:"pcie_fail"`
PSUWarn int `json:"psu_warn"`
PSUFail int `json:"psu_fail"`
MemoryWarn int `json:"memory_warn"`
MemoryFail int `json:"memory_fail"`
} `json:"summary"`
} `json:"hardware"`
}
raw, err := os.ReadFile(DefaultAuditJSONPath) raw, err := os.ReadFile(DefaultAuditJSONPath)
if err != nil { if err != nil {
return ActionResult{Title: "Health summary", Body: "No audit JSON found."} return ActionResult{Title: "Health summary", Body: "No audit JSON found."}
} }
var snapshot auditFile var snapshot schema.HardwareIngestRequest
if err := json.Unmarshal(raw, &snapshot); err != nil { if err := json.Unmarshal(raw, &snapshot); err != nil {
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."} return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
} }
summary := snapshot.Hardware.Summary summary := collector.BuildHealthSummary(snapshot.Hardware)
var body strings.Builder var body strings.Builder
status := summary.Status status := summary.Status
if status == "" { if status == "" {
status = "UNKNOWN" status = "Unknown"
} }
fmt.Fprintf(&body, "Overall: %s\n", status) fmt.Fprintf(&body, "Overall: %s\n", status)
fmt.Fprintf(&body, "Storage: warn=%d fail=%d\n", summary.StorageWarn, summary.StorageFail) fmt.Fprintf(&body, "Storage: warn=%d fail=%d\n", summary.StorageWarn, summary.StorageFail)
@@ -662,12 +644,12 @@ func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
} }
func isGPUDevice(dev schema.HardwarePCIeDevice) bool { func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
class := strings.ToLower(trimPtr(dev.DeviceClass)) class := trimPtr(dev.DeviceClass)
model := strings.ToLower(trimPtr(dev.Model)) model := strings.ToLower(trimPtr(dev.Model))
vendor := strings.ToLower(trimPtr(dev.Manufacturer)) vendor := strings.ToLower(trimPtr(dev.Manufacturer))
return strings.Contains(class, "vga") || return class == "VideoController" ||
strings.Contains(class, "3d") || class == "DisplayController" ||
strings.Contains(class, "display") || class == "ProcessingAccelerator" ||
strings.Contains(model, "nvidia") || strings.Contains(model, "nvidia") ||
strings.Contains(vendor, "nvidia") || strings.Contains(vendor, "nvidia") ||
strings.Contains(vendor, "amd") strings.Contains(vendor, "amd")

View File

@@ -371,8 +371,6 @@ func TestFormatSATSummary(t *testing.T) {
} }
func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) { func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
t.Parallel()
tmp := t.TempDir() tmp := t.TempDir()
oldAuditPath := DefaultAuditJSONPath oldAuditPath := DefaultAuditJSONPath
oldSATBaseDir := DefaultSATBaseDir oldSATBaseDir := DefaultSATBaseDir
@@ -386,7 +384,7 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
t.Fatalf("mkdir sat dir: %v", err) t.Fatalf("mkdir sat dir: %v", err)
} }
raw := `{"hardware":{"summary":{"status":"WARNING","storage_warn":1,"storage_fail":0,"pcie_warn":0,"pcie_fail":0,"psu_warn":0,"psu_fail":0,"memory_warn":0,"memory_fail":0}}}` raw := `{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"serial_number":"DISK1","status":"Warning"}]}}`
if err := os.WriteFile(DefaultAuditJSONPath, []byte(raw), 0644); err != nil { if err := os.WriteFile(DefaultAuditJSONPath, []byte(raw), 0644); err != nil {
t.Fatalf("write audit json: %v", err) t.Fatalf("write audit json: %v", err)
} }
@@ -401,8 +399,6 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
} }
func TestMainBanner(t *testing.T) { func TestMainBanner(t *testing.T) {
t.Parallel()
tmp := t.TempDir() tmp := t.TempDir()
oldAuditPath := DefaultAuditJSONPath oldAuditPath := DefaultAuditJSONPath
DefaultAuditJSONPath = filepath.Join(tmp, "audit.json") DefaultAuditJSONPath = filepath.Join(tmp, "audit.json")
@@ -413,7 +409,7 @@ func TestMainBanner(t *testing.T) {
product := "PowerEdge R760" product := "PowerEdge R760"
cpuModel := "Intel Xeon Gold 6430" cpuModel := "Intel Xeon Gold 6430"
memoryType := "DDR5" memoryType := "DDR5"
gpuClass := "VGA compatible controller" gpuClass := "VideoController"
gpuModel := "NVIDIA H100" gpuModel := "NVIDIA H100"
payload := schema.HardwareIngestRequest{ payload := schema.HardwareIngestRequest{

View File

@@ -7,13 +7,15 @@ import (
"bee/audit/internal/runtimeenv" "bee/audit/internal/runtimeenv"
"bee/audit/internal/schema" "bee/audit/internal/schema"
"log/slog" "log/slog"
"os"
"time" "time"
) )
// Run executes all collectors and returns the combined snapshot. // Run executes all collectors and returns the combined snapshot.
// Partial failures are logged as warnings; collection always completes. // Partial failures are logged as warnings; collection always completes.
func Run(runtimeMode runtimeenv.Mode) schema.HardwareIngestRequest { func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
start := time.Now() start := time.Now()
collectedAt := time.Now().UTC().Format(time.RFC3339)
slog.Info("audit started") slog.Info("audit started")
snap := schema.HardwareSnapshot{} snap := schema.HardwareSnapshot{}
@@ -27,27 +29,38 @@ func Run(runtimeMode runtimeenv.Mode) schema.HardwareIngestRequest {
snap.Firmware = append(snap.Firmware, cpuFW...) snap.Firmware = append(snap.Firmware, cpuFW...)
snap.Memory = collectMemory() snap.Memory = collectMemory()
sensorDoc, err := readSensorsJSONDoc()
if err != nil {
slog.Info("sensors: unavailable for enrichment", "err", err)
}
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
snap.Storage = collectStorage() snap.Storage = collectStorage()
snap.PCIeDevices = collectPCIe() snap.PCIeDevices = collectPCIe()
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices, snap.Board.SerialNumber) snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices, snap.Board.SerialNumber)
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices) snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices) snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices)) snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
snap.PowerSupplies = collectPSUs() snap.PowerSupplies = collectPSUs()
snap.Summary = buildHealthSummary(snap) snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
snap.Sensors = buildSensorsFromDoc(sensorDoc)
finalizeSnapshot(&snap, collectedAt)
// remaining collectors added in steps 1.8 1.10 // remaining collectors added in steps 1.8 1.10
slog.Info("audit completed", "duration", time.Since(start).Round(time.Millisecond)) slog.Info("audit completed", "duration", time.Since(start).Round(time.Millisecond))
sourceType := string(runtimeMode) sourceType := "manual"
protocol := "os-direct" var targetHost *string
if hostname, err := os.Hostname(); err == nil && hostname != "" {
targetHost = &hostname
}
return schema.HardwareIngestRequest{ return schema.HardwareIngestRequest{
SourceType: &sourceType, SourceType: &sourceType,
Protocol: &protocol, TargetHost: targetHost,
CollectedAt: time.Now().UTC().Format(time.RFC3339), CollectedAt: collectedAt,
Hardware: snap, Hardware: snap,
} }
} }

View File

@@ -0,0 +1,64 @@
package collector
import "strings"
// Canonical component status strings used by the collectors in this package.
const (
// statusOK is assigned to components that are present and report no problem.
statusOK = "OK"
// statusWarning is assigned to degraded-but-working components.
statusWarning = "Warning"
// statusCritical is assigned to components with a serious fault.
statusCritical = "Critical"
// statusUnknown is the value used when the state cannot be determined.
statusUnknown = "Unknown"
// statusEmpty marks an unpopulated socket or slot.
statusEmpty = "Empty"
)
// mapPCIeDeviceClass converts a free-form PCI class description into the
// canonical class name used in the exported snapshot.
//
// Matching is done on the trimmed, lower-cased input using substring checks,
// evaluated top to bottom. A blank input yields "". An input that matches no
// known phrase is returned exactly as it was passed in.
func mapPCIeDeviceClass(raw string) string {
	normalized := strings.ToLower(strings.TrimSpace(raw))
	switch {
	case normalized == "":
		return ""
	case strings.Contains(normalized, "ethernet controller"):
		return "EthernetController"
	case strings.Contains(normalized, "fibre channel"):
		return "FibreChannelController"
	case strings.Contains(normalized, "network controller"), strings.Contains(normalized, "infiniband controller"):
		return "NetworkController"
	// "mass storage controller" also contains "storage controller", so the
	// more specific phrase must be tested first or it can never match.
	case strings.Contains(normalized, "mass storage"):
		return "MassStorageController"
	case strings.Contains(normalized, "serial attached scsi"), strings.Contains(normalized, "storage controller"):
		return "StorageController"
	case strings.Contains(normalized, "raid"):
		return "MassStorageController"
	case strings.Contains(normalized, "display controller"):
		return "DisplayController"
	case strings.Contains(normalized, "vga"), strings.Contains(normalized, "3d controller"), strings.Contains(normalized, "video controller"):
		return "VideoController"
	case strings.Contains(normalized, "processing accelerators"), strings.Contains(normalized, "processing accelerator"):
		return "ProcessingAccelerator"
	default:
		return raw
	}
}
// isNICClass reports whether the canonical device class (surrounding
// whitespace ignored, case-sensitive) names an Ethernet or generic network
// controller.
func isNICClass(class string) bool {
	name := strings.TrimSpace(class)
	return name == "EthernetController" || name == "NetworkController"
}
// isGPUClass reports whether the canonical device class (surrounding
// whitespace ignored, case-sensitive) is one of the video, display or
// processing-accelerator classes.
func isGPUClass(class string) bool {
	name := strings.TrimSpace(class)
	if name == "VideoController" {
		return true
	}
	if name == "DisplayController" {
		return true
	}
	return name == "ProcessingAccelerator"
}
// isRAIDClass reports whether the canonical device class (surrounding
// whitespace ignored, case-sensitive) is a mass-storage or storage
// controller.
func isRAIDClass(class string) bool {
	name := strings.TrimSpace(class)
	return name == "MassStorageController" || name == "StorageController"
}

View File

@@ -51,12 +51,14 @@ func parseCPUs(output, boardSerial string) []schema.HardwareCPU {
// Returns false if the socket is unpopulated. // Returns false if the socket is unpopulated.
func parseCPUSection(fields map[string]string, boardSerial string) (schema.HardwareCPU, bool) { func parseCPUSection(fields map[string]string, boardSerial string) (schema.HardwareCPU, bool) {
status := parseCPUStatus(fields["Status"]) status := parseCPUStatus(fields["Status"])
if status == "EMPTY" { if status == statusEmpty {
return schema.HardwareCPU{}, false return schema.HardwareCPU{}, false
} }
cpu := schema.HardwareCPU{} cpu := schema.HardwareCPU{}
cpu.Status = &status cpu.Status = &status
present := true
cpu.Present = &present
if socket, ok := parseSocketIndex(fields["Socket Designation"]); ok { if socket, ok := parseSocketIndex(fields["Socket Designation"]); ok {
cpu.Socket = &socket cpu.Socket = &socket
@@ -99,15 +101,15 @@ func parseCPUStatus(raw string) string {
upper := strings.ToUpper(raw) upper := strings.ToUpper(raw)
switch { switch {
case upper == "" || upper == "UNKNOWN": case upper == "" || upper == "UNKNOWN":
return "UNKNOWN" return statusUnknown
case strings.Contains(upper, "UNPOPULATED") || strings.Contains(upper, "NOT POPULATED"): case strings.Contains(upper, "UNPOPULATED") || strings.Contains(upper, "NOT POPULATED"):
return "EMPTY" return statusEmpty
case strings.Contains(upper, "ENABLED"): case strings.Contains(upper, "ENABLED"):
return "OK" return statusOK
case strings.Contains(upper, "DISABLED"): case strings.Contains(upper, "DISABLED"):
return "WARNING" return statusWarning
default: default:
return "UNKNOWN" return statusUnknown
} }
} }

View File

@@ -0,0 +1,196 @@
package collector
import (
"bee/audit/internal/schema"
"os"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
)
var (
// cpuSysBaseDir is the directory scanned for per-CPU "cpuN" entries.
// It is a variable (not a constant) so it can be redirected.
cpuSysBaseDir = "/sys/devices/system/cpu"
// socketIndexRe captures the decimal number that follows "package id",
// "socket" or "cpu" (case-insensitive, optional whitespace in between).
socketIndexRe = regexp.MustCompile(`(?i)(?:package id|socket|cpu)\s*([0-9]+)`)
)
// enrichCPUsWithTelemetry fills in temperature, power and throttle state on
// each CPU entry, looked up by the entry's socket index. Entries without a
// socket index are looked up under index 0. Fields for which no reading
// exists are left untouched. The slice is modified in place and returned.
func enrichCPUsWithTelemetry(cpus []schema.HardwareCPU, doc sensorsDoc) []schema.HardwareCPU {
	cpuCount := len(cpus)
	if cpuCount == 0 {
		return cpus
	}

	temps := cpuTempsFromSensors(doc, cpuCount)
	watts := cpuPowerFromSensors(doc, cpuCount)
	throttled := cpuThrottleBySocket()

	for idx := range cpus {
		cpu := &cpus[idx]

		var socketID int
		if cpu.Socket != nil {
			socketID = *cpu.Socket
		}

		if reading, found := temps[socketID]; found {
			cpu.TemperatureC = &reading
		}
		if reading, found := watts[socketID]; found {
			cpu.PowerW = &reading
		}
		if flag, found := throttled[socketID]; found {
			cpu.Throttled = &flag
		}
	}
	return cpus
}
// cpuTempsFromSensors extracts one temperature reading per CPU socket from
// the sensors document.
//
// A feature whose chip or feature name carries a socket index (see
// detectCPUSocket) contributes the reading for that socket; the first such
// reading per socket wins. Readings that merely look CPU-related are kept as
// a fallback, which is used for socket 0 only when no indexed reading was
// found and exactly one CPU is present.
//
// Chips and features are visited in sorted name order. Go map iteration order
// is unspecified, so without sorting the "first wins" and fallback choices
// could differ from run to run on the same input.
func cpuTempsFromSensors(doc sensorsDoc, cpuCount int) map[int]float64 {
	out := map[int]float64{}
	if len(doc) == 0 {
		return out
	}

	chips := make([]string, 0, len(doc))
	for chip := range doc {
		chips = append(chips, chip)
	}
	sort.Strings(chips)

	var fallback []float64
	for _, chip := range chips {
		features := doc[chip]
		featureNames := make([]string, 0, len(features))
		for featureName := range features {
			featureNames = append(featureNames, featureName)
		}
		sort.Strings(featureNames)

		for _, featureName := range featureNames {
			feature, ok := features[featureName].(map[string]any)
			if !ok {
				continue
			}
			if classifySensorFeature(feature) != "temp" {
				continue
			}
			temp, ok := firstFeatureFloat(feature, "_input")
			if !ok {
				continue
			}
			if socket, ok := detectCPUSocket(chip, featureName); ok {
				if _, exists := out[socket]; !exists {
					out[socket] = temp
				}
				continue
			}
			if isLikelyCPUTemp(chip, featureName) {
				fallback = append(fallback, temp)
			}
		}
	}
	if len(out) == 0 && cpuCount == 1 && len(fallback) > 0 {
		out[0] = fallback[0]
	}
	return out
}
// cpuPowerFromSensors extracts one power reading per CPU socket from the
// sensors document.
//
// A feature whose chip or feature name carries a socket index (see
// detectCPUSocket) contributes the reading for that socket; the first such
// reading per socket wins. Readings that merely look CPU-related are kept as
// a fallback, which is used for socket 0 only when no indexed reading was
// found and exactly one CPU is present.
//
// Chips and features are visited in sorted name order. Go map iteration order
// is unspecified, so without sorting the "first wins" and fallback choices
// could differ from run to run on the same input.
func cpuPowerFromSensors(doc sensorsDoc, cpuCount int) map[int]float64 {
	out := map[int]float64{}
	if len(doc) == 0 {
		return out
	}

	chips := make([]string, 0, len(doc))
	for chip := range doc {
		chips = append(chips, chip)
	}
	sort.Strings(chips)

	var fallback []float64
	for _, chip := range chips {
		features := doc[chip]
		featureNames := make([]string, 0, len(features))
		for featureName := range features {
			featureNames = append(featureNames, featureName)
		}
		sort.Strings(featureNames)

		for _, featureName := range featureNames {
			feature, ok := features[featureName].(map[string]any)
			if !ok {
				continue
			}
			if classifySensorFeature(feature) != "power" {
				continue
			}
			power, ok := firstFeatureFloatWithContains(feature, []string{"power"})
			if !ok {
				continue
			}
			if socket, ok := detectCPUSocket(chip, featureName); ok {
				if _, exists := out[socket]; !exists {
					out[socket] = power
				}
				continue
			}
			if isLikelyCPUPower(chip, featureName) {
				fallback = append(fallback, power)
			}
		}
	}
	if len(out) == 0 && cpuCount == 1 && len(fallback) > 0 {
		out[0] = fallback[0]
	}
	return out
}
// detectCPUSocket scans the given strings in order and returns the socket
// index captured by socketIndexRe from the first one that yields a parsable
// number. The boolean is false when none of them does.
func detectCPUSocket(parts ...string) (int, bool) {
	for _, candidate := range parts {
		groups := socketIndexRe.FindStringSubmatch(strings.ToLower(candidate))
		if len(groups) != 2 {
			continue
		}
		// Atoi can still fail on a digit run too large for int; try the next part.
		if index, err := strconv.Atoi(groups[1]); err == nil {
			return index, true
		}
	}
	return 0, false
}
// isLikelyCPUTemp reports whether the chip and feature names, joined by a
// single space and lower-cased, contain one of the known CPU temperature
// markers.
func isLikelyCPUTemp(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	markers := [...]string{"coretemp", "k10temp", "package id", "tdie", "tctl", "cpu temp"}
	for _, marker := range markers {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
// isLikelyCPUPower reports whether the chip and feature names, joined by a
// single space and lower-cased, contain one of the known CPU power markers.
func isLikelyCPUPower(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	markers := [...]string{"intel-rapl", "package id", "package-", "cpu power"}
	for _, marker := range markers {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
// cpuThrottleBySocket walks the "cpuN" directories under cpuSysBaseDir and
// returns a map holding true for every socket index that has at least one
// CPU with a non-zero throttle counter. Sockets with no throttling are simply
// absent from the map.
func cpuThrottleBySocket() map[int]bool {
	throttled := map[int]bool{}

	pattern := filepath.Join(cpuSysBaseDir, "cpu[0-9]*")
	dirs, globErr := filepath.Glob(pattern)
	if globErr != nil {
		return throttled
	}
	sort.Strings(dirs)

	for _, dir := range dirs {
		// The throttle counters are only consulted when the socket index is readable.
		if socketID, found := readSocketIndex(dir); found && cpuPackageThrottled(dir) {
			throttled[socketID] = true
		}
	}
	return throttled
}
// readSocketIndex reads topology/physical_package_id below cpuDir and returns
// it as a non-negative integer. The boolean is false when the file is
// missing, is not a number, or holds a negative value.
func readSocketIndex(cpuDir string) (int, bool) {
	idPath := filepath.Join(cpuDir, "topology", "physical_package_id")
	content, readErr := os.ReadFile(idPath)
	if readErr != nil {
		return 0, false
	}
	packageID, convErr := strconv.Atoi(strings.TrimSpace(string(content)))
	if convErr == nil && packageID >= 0 {
		return packageID, true
	}
	return 0, false
}
// cpuPackageThrottled reports whether either throttle counter below
// cpuDir/thermal_throttle (package first, then core) holds a value greater
// than zero. Unreadable or unparsable counters are skipped.
func cpuPackageThrottled(cpuDir string) bool {
	counters := [...]string{"package_throttle_count", "core_throttle_count"}
	for _, counter := range counters {
		data, readErr := os.ReadFile(filepath.Join(cpuDir, "thermal_throttle", counter))
		if readErr != nil {
			continue
		}
		count, parseErr := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
		if parseErr != nil {
			continue
		}
		if count > 0 {
			return true
		}
	}
	return false
}

View File

@@ -0,0 +1,71 @@
package collector
import (
"os"
"path/filepath"
"testing"
"bee/audit/internal/schema"
)
// TestEnrichCPUsWithTelemetry checks that per-socket temperature, power and
// throttle state are attached to the matching CPU entries.
func TestEnrichCPUsWithTelemetry(t *testing.T) {
// Redirect the CPU directory scan to a temporary tree and restore it afterwards.
tmp := t.TempDir()
oldBase := cpuSysBaseDir
cpuSysBaseDir = tmp
t.Cleanup(func() { cpuSysBaseDir = oldBase })
// cpu0 sits on package 0 with a non-zero throttle count; cpu1 sits on
// package 1 with a zero count.
mustWriteFile(t, filepath.Join(tmp, "cpu0", "topology", "physical_package_id"), "0\n")
mustWriteFile(t, filepath.Join(tmp, "cpu0", "thermal_throttle", "package_throttle_count"), "3\n")
mustWriteFile(t, filepath.Join(tmp, "cpu1", "topology", "physical_package_id"), "1\n")
mustWriteFile(t, filepath.Join(tmp, "cpu1", "thermal_throttle", "package_throttle_count"), "0\n")
// Sensor fixture: one temperature and one power reading per package, with
// the socket index carried in the feature name.
doc := sensorsDoc{
"coretemp-isa-0000": {
"Package id 0": map[string]any{"temp1_input": 61.5},
"Package id 1": map[string]any{"temp2_input": 58.0},
},
"intel-rapl-mmio-0": {
"Package id 0": map[string]any{"power1_average": 180.0},
"Package id 1": map[string]any{"power2_average": 175.0},
},
}
socket0 := 0
socket1 := 1
status := statusOK
cpus := []schema.HardwareCPU{
{Socket: &socket0, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
{Socket: &socket1, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
}
got := enrichCPUsWithTelemetry(cpus, doc)
if got[0].TemperatureC == nil || *got[0].TemperatureC != 61.5 {
t.Fatalf("cpu0 temperature mismatch: %#v", got[0].TemperatureC)
}
if got[0].PowerW == nil || *got[0].PowerW != 180.0 {
t.Fatalf("cpu0 power mismatch: %#v", got[0].PowerW)
}
if got[0].Throttled == nil || !*got[0].Throttled {
t.Fatalf("cpu0 throttled mismatch: %#v", got[0].Throttled)
}
if got[1].TemperatureC == nil || *got[1].TemperatureC != 58.0 {
t.Fatalf("cpu1 temperature mismatch: %#v", got[1].TemperatureC)
}
if got[1].PowerW == nil || *got[1].PowerW != 175.0 {
t.Fatalf("cpu1 power mismatch: %#v", got[1].PowerW)
}
// For the unthrottled socket both an unset flag and an explicit false are accepted.
if got[1].Throttled != nil && *got[1].Throttled {
t.Fatalf("cpu1 throttled mismatch: %#v", got[1].Throttled)
}
}
// mustWriteFile writes content to path, creating any missing parent
// directories first, and fails the test immediately on any error.
func mustWriteFile(t *testing.T, path, content string) {
	t.Helper()

	parent := filepath.Dir(path)
	if mkErr := os.MkdirAll(parent, 0755); mkErr != nil {
		t.Fatalf("mkdir %s: %v", path, mkErr)
	}

	writeErr := os.WriteFile(path, []byte(content), 0644)
	if writeErr != nil {
		t.Fatalf("write %s: %v", path, writeErr)
	}
}

View File

@@ -69,12 +69,12 @@ func TestParseCPUStatus(t *testing.T) {
want string want string
}{ }{
{"Populated, Enabled", "OK"}, {"Populated, Enabled", "OK"},
{"Populated, Disabled By User", "WARNING"}, {"Populated, Disabled By User", statusWarning},
{"Populated, Disabled By BIOS", "WARNING"}, {"Populated, Disabled By BIOS", statusWarning},
{"Unpopulated", "EMPTY"}, {"Unpopulated", statusEmpty},
{"Not Populated", "EMPTY"}, {"Not Populated", statusEmpty},
{"Unknown", "UNKNOWN"}, {"Unknown", statusUnknown},
{"", "UNKNOWN"}, {"", statusUnknown},
} }
for _, tt := range tests { for _, tt := range tests {
got := parseCPUStatus(tt.input) got := parseCPUStatus(tt.input)

View File

@@ -0,0 +1,179 @@
package collector
import (
"bee/audit/internal/schema"
"fmt"
)
// finalizeSnapshot applies the last clean-up pass to a collected snapshot:
// it drops memory, storage and power-supply entries rejected by their
// filters, stamps collectedAt as the status check time on components that
// carry a status, and rewrites repeated serial numbers.
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
	// The three filters work on separate slices, so their relative order is irrelevant.
	snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
	snap.Storage = filterStorage(snap.Storage)
	snap.Memory = filterMemory(snap.Memory)

	// Both remaining steps run on the already-filtered slices.
	setComponentStatusMetadata(snap, collectedAt)
	deduplicateComponentSerials(snap)
}
// filterMemory returns a new slice holding only the DIMM entries that are not
// explicitly marked absent, do not have the Empty status, and carry a
// non-empty serial number.
func filterMemory(dimms []schema.HardwareMemory) []schema.HardwareMemory {
	kept := make([]schema.HardwareMemory, 0, len(dimms))
	for idx := range dimms {
		entry := dimms[idx]
		absent := entry.Present != nil && !*entry.Present
		empty := entry.Status != nil && *entry.Status == statusEmpty
		noSerial := entry.SerialNumber == nil || *entry.SerialNumber == ""
		if absent || empty || noSerial {
			continue
		}
		kept = append(kept, entry)
	}
	return kept
}
// filterStorage returns a new slice holding only the storage entries that
// carry a non-empty serial number.
func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
	kept := make([]schema.HardwareStorage, 0, len(disks))
	for idx := range disks {
		if serial := disks[idx].SerialNumber; serial != nil && *serial != "" {
			kept = append(kept, disks[idx])
		}
	}
	return kept
}
// filterPSUs returns a new slice holding only the power-supply entries that
// carry a non-empty serial number.
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
	kept := make([]schema.HardwarePowerSupply, 0, len(psus))
	for idx := range psus {
		if serial := psus[idx].SerialNumber; serial != nil && *serial != "" {
			kept = append(kept, psus[idx])
		}
	}
	return kept
}
// setComponentStatusMetadata passes the embedded status block of every CPU,
// memory, storage, PCIe and power-supply entry to setStatusCheckedAt together
// with collectedAt.
func setComponentStatusMetadata(snap *schema.HardwareSnapshot, collectedAt string) {
	stamp := func(status *schema.HardwareComponentStatus) {
		setStatusCheckedAt(status, collectedAt)
	}

	for idx := range snap.CPUs {
		stamp(&snap.CPUs[idx].HardwareComponentStatus)
	}
	for idx := range snap.Memory {
		stamp(&snap.Memory[idx].HardwareComponentStatus)
	}
	for idx := range snap.Storage {
		stamp(&snap.Storage[idx].HardwareComponentStatus)
	}
	for idx := range snap.PCIeDevices {
		stamp(&snap.PCIeDevices[idx].HardwareComponentStatus)
	}
	for idx := range snap.PowerSupplies {
		stamp(&snap.PowerSupplies[idx].HardwareComponentStatus)
	}
}
// setStatusCheckedAt records collectedAt as the status check time, but only
// when the block has a non-empty status and no check time has been set yet.
// A nil block is ignored.
func setStatusCheckedAt(status *schema.HardwareComponentStatus, collectedAt string) {
	hasStatus := status != nil && status.Status != nil && *status.Status != ""
	if hasStatus && status.StatusCheckedAt == nil {
		status.StatusCheckedAt = &collectedAt
	}
}
// deduplicateComponentSerials runs the per-type serial de-duplication over
// every component list of the snapshot. Each list is handled independently of
// the others.
func deduplicateComponentSerials(snap *schema.HardwareSnapshot) {
	deduplicatePSUSerials(snap.PowerSupplies)
	deduplicatePCIeSerials(snap.PCIeDevices)
	deduplicateStorageSerials(snap.Storage)
	deduplicateMemorySerials(snap.Memory)
	deduplicateCPUSerials(snap.CPUs)
}
// deduplicateCPUSerials rewrites repeated serial numbers in place. Entries
// are keyed by model plus serial; the first entry for a key keeps its serial
// and every later one receives a placeholder "NO_SN-" followed by an
// eight-digit counter starting at 1. Entries without a serial are skipped.
func deduplicateCPUSerials(items []schema.HardwareCPU) {
	occurrences := map[string]int{}
	next := 1
	for idx := range items {
		serial := items[idx].SerialNumber
		if serial == nil || *serial == "" {
			continue
		}
		key := derefString(items[idx].Model) + "\x00" + *serial
		occurrences[key]++
		if occurrences[key] <= 1 {
			continue
		}
		placeholder := fmt.Sprintf("NO_SN-%08d", next)
		next++
		items[idx].SerialNumber = &placeholder
	}
}
// deduplicateMemorySerials rewrites repeated serial numbers in place. Entries
// are keyed by part number plus serial; the first entry for a key keeps its
// serial and every later one receives a placeholder "NO_SN-" followed by an
// eight-digit counter starting at 1. Entries without a serial are skipped.
func deduplicateMemorySerials(items []schema.HardwareMemory) {
	occurrences := map[string]int{}
	next := 1
	for idx := range items {
		serial := items[idx].SerialNumber
		if serial == nil || *serial == "" {
			continue
		}
		key := derefString(items[idx].PartNumber) + "\x00" + *serial
		occurrences[key]++
		if occurrences[key] <= 1 {
			continue
		}
		placeholder := fmt.Sprintf("NO_SN-%08d", next)
		next++
		items[idx].SerialNumber = &placeholder
	}
}
// deduplicateStorageSerials rewrites repeated serial numbers in place.
// Entries are keyed by model plus serial; the first entry for a key keeps its
// serial and every later one receives a placeholder "NO_SN-" followed by an
// eight-digit counter starting at 1. Entries without a serial are skipped.
func deduplicateStorageSerials(items []schema.HardwareStorage) {
	occurrences := map[string]int{}
	next := 1
	for idx := range items {
		serial := items[idx].SerialNumber
		if serial == nil || *serial == "" {
			continue
		}
		key := derefString(items[idx].Model) + "\x00" + *serial
		occurrences[key]++
		if occurrences[key] <= 1 {
			continue
		}
		placeholder := fmt.Sprintf("NO_SN-%08d", next)
		next++
		items[idx].SerialNumber = &placeholder
	}
}
// deduplicatePCIeSerials rewrites repeated serial numbers in place. Entries
// are keyed by model plus serial; the first entry for a key keeps its serial
// and every later one receives a placeholder "NO_SN-" followed by an
// eight-digit counter starting at 1. Entries without a serial are skipped.
func deduplicatePCIeSerials(items []schema.HardwarePCIeDevice) {
	occurrences := map[string]int{}
	next := 1
	for idx := range items {
		serial := items[idx].SerialNumber
		if serial == nil || *serial == "" {
			continue
		}
		key := derefString(items[idx].Model) + "\x00" + *serial
		occurrences[key]++
		if occurrences[key] <= 1 {
			continue
		}
		placeholder := fmt.Sprintf("NO_SN-%08d", next)
		next++
		items[idx].SerialNumber = &placeholder
	}
}
// deduplicatePSUSerials rewrites repeated serial numbers in place. Entries
// are keyed by model plus serial; the first entry for a key keeps its serial
// and every later one receives a placeholder "NO_SN-" followed by an
// eight-digit counter starting at 1. Entries without a serial are skipped.
func deduplicatePSUSerials(items []schema.HardwarePowerSupply) {
	occurrences := map[string]int{}
	next := 1
	for idx := range items {
		serial := items[idx].SerialNumber
		if serial == nil || *serial == "" {
			continue
		}
		key := derefString(items[idx].Model) + "\x00" + *serial
		occurrences[key]++
		if occurrences[key] <= 1 {
			continue
		}
		placeholder := fmt.Sprintf("NO_SN-%08d", next)
		next++
		items[idx].SerialNumber = &placeholder
	}
}

View File

@@ -0,0 +1,63 @@
package collector
import (
"bee/audit/internal/schema"
"testing"
)
// TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials checks that
// finalizeSnapshot drops memory, storage and PSU entries lacking a serial
// number and stamps the collection time on the entries that remain.
func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
collectedAt := "2026-03-15T12:00:00Z"
present := true
status := statusOK
serial := "SN-1"
// Each component list holds one entry with a serial and one without.
snap := schema.HardwareSnapshot{
Memory: []schema.HardwareMemory{
{Present: &present, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
},
Storage: []schema.HardwareStorage{
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
},
PowerSupplies: []schema.HardwarePowerSupply{
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
},
}
finalizeSnapshot(&snap, collectedAt)
// Exactly one entry per list must survive, carrying the check timestamp.
if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
}
if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
}
if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
}
}
// TestFinalizeSnapshotDeduplicatesSerials checks that, of two storage entries
// sharing model and serial, the first keeps its serial and the second is
// given the first placeholder serial.
func TestFinalizeSnapshotDeduplicatesSerials(t *testing.T) {
collectedAt := "2026-03-15T12:00:00Z"
status := statusOK
model := "Device"
serial := "DUPLICATE"
snap := schema.HardwareSnapshot{
Storage: []schema.HardwareStorage{
{Model: &model, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
{Model: &model, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
},
}
finalizeSnapshot(&snap, collectedAt)
if got := *snap.Storage[0].SerialNumber; got != serial {
t.Fatalf("first serial changed: %q", got)
}
if got := *snap.Storage[1].SerialNumber; got != "NO_SN-00000001" {
t.Fatalf("duplicate serial mismatch: %q", got)
}
}

View File

@@ -47,12 +47,12 @@ func parseMemorySection(fields map[string]string) schema.HardwareMemory {
dimm.Present = &present dimm.Present = &present
if !present { if !present {
status := "EMPTY" status := statusEmpty
dimm.Status = &status dimm.Status = &status
return dimm return dimm
} }
status := "OK" status := statusOK
dimm.Status = &status dimm.Status = &status
if mb := parseMemorySizeMB(rawSize); mb > 0 { if mb := parseMemorySizeMB(rawSize); mb > 0 {

View File

@@ -0,0 +1,203 @@
package collector
import (
"bee/audit/internal/schema"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
)
// edacBaseDir is the directory scanned for "mc*" memory-controller entries.
// It is a variable (not a constant) so it can be redirected.
var edacBaseDir = "/sys/devices/system/edac/mc"
// edacDIMMStats holds the label and error counters read from one EDAC
// "dimm*" directory.
type edacDIMMStats struct {
// Label is the trimmed, non-empty DIMM label text.
Label string
// CECount is the correctable error count; nil when no counter could be read.
CECount *int64
// UECount is the uncorrectable error count; nil when no counter could be read.
UECount *int64
}
// enrichMemoryWithTelemetry attaches temperature readings and EDAC error
// counters to DIMM entries, matching on the canonical form of each entry's
// slot and location. For each kind of data the first matching key wins.
//
// A non-zero uncorrectable count flags data loss and forces the Critical
// status. Otherwise a non-zero correctable count downgrades an unset or OK
// status to Warning. An existing error description is never overwritten. The
// slice is modified in place and returned.
func enrichMemoryWithTelemetry(dimms []schema.HardwareMemory, doc sensorsDoc) []schema.HardwareMemory {
	if len(dimms) == 0 {
		return dimms
	}

	temps := memoryTempsFromSensors(doc)
	edac := readEDACStats()

	for idx := range dimms {
		dimm := &dimms[idx]
		keys := dimmMatchKeys(dimm.Slot, dimm.Location)

		for _, key := range keys {
			reading, found := temps[key]
			if !found {
				continue
			}
			dimm.TemperatureC = &reading
			break
		}

		for _, key := range keys {
			counters, found := edac[key]
			if !found {
				continue
			}
			if counters.CECount != nil {
				dimm.CorrectableECCErrorCount = counters.CECount
			}
			if counters.UECount != nil {
				dimm.UncorrectableECCErrorCount = counters.UECount
			}

			hasUncorrectable := counters.UECount != nil && *counters.UECount > 0
			hasCorrectable := counters.CECount != nil && *counters.CECount > 0
			statusIsBenign := dimm.Status == nil || *dimm.Status == statusOK

			switch {
			case hasUncorrectable:
				dimm.DataLossDetected = boolPtr(true)
				critical := statusCritical
				dimm.Status = &critical
				if dimm.ErrorDescription == nil {
					dimm.ErrorDescription = stringPtr("EDAC reports uncorrectable ECC errors")
				}
			case hasCorrectable && statusIsBenign:
				warning := statusWarning
				dimm.Status = &warning
				if dimm.ErrorDescription == nil {
					dimm.ErrorDescription = stringPtr("EDAC reports correctable ECC errors")
				}
			}
			// Only the first key with EDAC data is applied.
			break
		}
	}
	return dimms
}
// memoryTempsFromSensors collects DIMM temperature readings from the sensors
// document, keyed by the canonical form of the feature name. Only temperature
// features whose chip or feature name looks memory-related are considered,
// and the first reading per key wins.
//
// Chips and features are visited in sorted name order. Go map iteration order
// is unspecified, so without sorting, two features that canonicalize to the
// same key could yield a different winner from run to run.
func memoryTempsFromSensors(doc sensorsDoc) map[string]float64 {
	out := map[string]float64{}
	if len(doc) == 0 {
		return out
	}

	chips := make([]string, 0, len(doc))
	for chip := range doc {
		chips = append(chips, chip)
	}
	sort.Strings(chips)

	for _, chip := range chips {
		features := doc[chip]
		featureNames := make([]string, 0, len(features))
		for featureName := range features {
			featureNames = append(featureNames, featureName)
		}
		sort.Strings(featureNames)

		for _, featureName := range featureNames {
			feature, ok := features[featureName].(map[string]any)
			if !ok || classifySensorFeature(feature) != "temp" {
				continue
			}
			if !isLikelyMemoryTemp(chip, featureName) {
				continue
			}
			temp, ok := firstFeatureFloat(feature, "_input")
			if !ok {
				continue
			}
			key := canonicalLabel(featureName)
			if key == "" {
				continue
			}
			if _, exists := out[key]; !exists {
				out[key] = temp
			}
		}
	}
	return out
}
// readEDACStats scans every "mc*/dimm*" directory under edacBaseDir (in
// sorted order) and returns the per-DIMM statistics keyed by canonical label.
// Directories without a usable label are skipped; when two DIMMs share a
// canonical label the one visited later replaces the earlier one.
func readEDACStats() map[string]edacDIMMStats {
	byLabel := map[string]edacDIMMStats{}

	controllers, globErr := filepath.Glob(filepath.Join(edacBaseDir, "mc*"))
	if globErr != nil {
		return byLabel
	}
	sort.Strings(controllers)

	for _, controller := range controllers {
		dimmPaths, dimmErr := filepath.Glob(filepath.Join(controller, "dimm*"))
		if dimmErr != nil {
			continue
		}
		sort.Strings(dimmPaths)

		for _, dimmPath := range dimmPaths {
			stats, usable := readEDACDIMMStats(dimmPath)
			if !usable {
				continue
			}
			if key := canonicalLabel(stats.Label); key != "" {
				byLabel[key] = stats
			}
		}
	}
	return byLabel
}
// readEDACDIMMStats reads the label and error counters from one EDAC DIMM
// directory. The label comes from "dimm_label", or from "label" when the
// former cannot be read. The boolean is false when neither file is readable
// or the label is blank. Counters that cannot be read stay nil.
func readEDACDIMMStats(dimmDir string) (edacDIMMStats, bool) {
	var rawLabel []byte
	labelFound := false
	for _, name := range [...]string{"dimm_label", "label"} {
		data, readErr := os.ReadFile(filepath.Join(dimmDir, name))
		if readErr == nil {
			rawLabel = data
			labelFound = true
			break
		}
	}
	if !labelFound {
		return edacDIMMStats{}, false
	}

	label := strings.TrimSpace(string(rawLabel))
	if label == "" {
		return edacDIMMStats{}, false
	}

	result := edacDIMMStats{Label: label}
	if correctable, ok := readEDACCount(dimmDir, []string{"dimm_ce_count", "ce_count"}); ok {
		result.CECount = &correctable
	}
	if uncorrectable, ok := readEDACCount(dimmDir, []string{"dimm_ue_count", "ue_count"}); ok {
		result.UECount = &uncorrectable
	}
	return result, true
}
// readEDACCount tries the given file names inside dir in order and returns
// the first one that parses as a non-negative base-10 integer. The boolean is
// false when no candidate file yields such a value.
func readEDACCount(dir string, names []string) (int64, bool) {
	for idx := 0; idx < len(names); idx++ {
		data, readErr := os.ReadFile(filepath.Join(dir, names[idx]))
		if readErr != nil {
			continue
		}
		count, parseErr := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
		if parseErr != nil || count < 0 {
			continue
		}
		return count, true
	}
	return 0, false
}
// dimmMatchKeys returns the canonical labels derived from slot and location,
// in that order, leaving out blank results and duplicates.
func dimmMatchKeys(slot, location *string) []string {
	var keys []string
candidates:
	for _, candidate := range [...]*string{slot, location} {
		key := canonicalLabel(derefString(candidate))
		if key == "" {
			continue
		}
		for _, seen := range keys {
			if seen == key {
				continue candidates
			}
		}
		keys = append(keys, key)
	}
	return keys
}
// canonicalLabel normalises a label for matching: it upper-cases the trimmed
// input and keeps only the characters A-Z and 0-9. A blank input, or one with
// no such characters, yields "".
func canonicalLabel(value string) string {
	upper := strings.ToUpper(strings.TrimSpace(value))
	return strings.Map(func(r rune) rune {
		switch {
		case r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
			return r
		default:
			// A negative result tells strings.Map to drop the character.
			return -1
		}
	}, upper)
}
// isLikelyMemoryTemp reports whether the chip and feature names, joined by a
// single space and lower-cased, mention a DIMM. Any name containing "sodimm"
// necessarily contains "dimm" as well.
func isLikelyMemoryTemp(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	for _, marker := range [...]string{"dimm", "sodimm"} {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
// boolPtr returns a pointer to a fresh bool holding value.
func boolPtr(value bool) *bool {
	ptr := new(bool)
	*ptr = value
	return ptr
}

View File

@@ -0,0 +1,61 @@
package collector
import (
"path/filepath"
"testing"
"bee/audit/internal/schema"
)
// TestEnrichMemoryWithTelemetry checks that DIMM temperatures and EDAC error
// counters are attached to the matching entries and that the counters drive
// the resulting status.
func TestEnrichMemoryWithTelemetry(t *testing.T) {
// Redirect the EDAC scan to a temporary tree and restore it afterwards.
tmp := t.TempDir()
oldBase := edacBaseDir
edacBaseDir = tmp
t.Cleanup(func() { edacBaseDir = oldBase })
// dimm0 has correctable errors only; dimm1 has uncorrectable errors.
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_label"), "CPU0_DIMM_A1\n")
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_ce_count"), "7\n")
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_ue_count"), "0\n")
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_label"), "CPU1_DIMM_B2\n")
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_ce_count"), "0\n")
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_ue_count"), "2\n")
// Sensor feature names use spaces where the slot names below use
// underscores; matching relies on label canonicalisation.
doc := sensorsDoc{
"jc42-i2c-0-18": {
"CPU0 DIMM A1": map[string]any{"temp1_input": 43.0},
"CPU1 DIMM B2": map[string]any{"temp2_input": 46.0},
},
}
status := statusOK
slotA := "CPU0_DIMM_A1"
slotB := "CPU1_DIMM_B2"
dimms := []schema.HardwareMemory{
{Slot: &slotA, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
{Slot: &slotB, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
}
got := enrichMemoryWithTelemetry(dimms, doc)
if got[0].TemperatureC == nil || *got[0].TemperatureC != 43.0 {
t.Fatalf("dimm0 temperature mismatch: %#v", got[0].TemperatureC)
}
if got[0].CorrectableECCErrorCount == nil || *got[0].CorrectableECCErrorCount != 7 {
t.Fatalf("dimm0 ce mismatch: %#v", got[0].CorrectableECCErrorCount)
}
// Correctable errors on an OK DIMM must downgrade it to Warning.
if got[0].Status == nil || *got[0].Status != statusWarning {
t.Fatalf("dimm0 status mismatch: %#v", got[0].Status)
}
if got[1].TemperatureC == nil || *got[1].TemperatureC != 46.0 {
t.Fatalf("dimm1 temperature mismatch: %#v", got[1].TemperatureC)
}
if got[1].UncorrectableECCErrorCount == nil || *got[1].UncorrectableECCErrorCount != 2 {
t.Fatalf("dimm1 ue mismatch: %#v", got[1].UncorrectableECCErrorCount)
}
// Uncorrectable errors must mark the DIMM Critical and flag data loss.
if got[1].Status == nil || *got[1].Status != statusCritical {
t.Fatalf("dimm1 status mismatch: %#v", got[1].Status)
}
if got[1].DataLossDetected == nil || !*got[1].DataLossDetected {
t.Fatalf("dimm1 data_loss_detected mismatch: %#v", got[1].DataLossDetected)
}
}

View File

@@ -18,17 +18,13 @@ var (
} }
return string(out), nil return string(out), nil
} }
readNetStatFile = func(iface, key string) (int64, error) { readNetAddressFile = func(iface string) (string, error) {
path := filepath.Join("/sys/class/net", iface, "statistics", key) path := filepath.Join("/sys/class/net", iface, "address")
raw, err := os.ReadFile(path) raw, err := os.ReadFile(path)
if err != nil { if err != nil {
return 0, err return "", err
} }
v, err := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64) return strings.TrimSpace(string(raw)), nil
if err != nil {
return 0, err
}
return v, nil
} }
) )
@@ -47,6 +43,7 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
continue continue
} }
iface := ifaces[0] iface := ifaces[0]
devs[i].MacAddresses = collectInterfaceMACs(ifaces)
if devs[i].Firmware == nil { if devs[i].Firmware == nil {
if out, err := ethtoolInfoQuery(iface); err == nil { if out, err := ethtoolInfoQuery(iface); err == nil {
@@ -56,16 +53,13 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
} }
} }
if devs[i].Telemetry == nil {
devs[i].Telemetry = map[string]any{}
}
injectNICPacketStats(devs[i].Telemetry, iface)
if out, err := ethtoolModuleQuery(iface); err == nil { if out, err := ethtoolModuleQuery(iface); err == nil {
injectSFPDOMTelemetry(devs[i].Telemetry, out) if injectSFPDOMTelemetry(&devs[i], out) {
enriched++
continue
}
} }
if len(devs[i].Telemetry) == 0 { if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
devs[i].Telemetry = nil
} else {
enriched++ enriched++
} }
} }
@@ -77,31 +71,32 @@ func isNICDevice(dev schema.HardwarePCIeDevice) bool {
if dev.DeviceClass == nil { if dev.DeviceClass == nil {
return false return false
} }
c := strings.ToLower(strings.TrimSpace(*dev.DeviceClass)) c := strings.TrimSpace(*dev.DeviceClass)
return strings.Contains(c, "ethernet controller") || return isNICClass(c) || strings.EqualFold(c, "FibreChannelController")
strings.Contains(c, "network controller") ||
strings.Contains(c, "infiniband controller")
} }
func injectNICPacketStats(dst map[string]any, iface string) { func collectInterfaceMACs(ifaces []string) []string {
for _, key := range []string{"rx_packets", "tx_packets", "rx_errors", "tx_errors"} { seen := map[string]struct{}{}
if v, err := readNetStatFile(iface, key); err == nil { var out []string
dst[key] = v for _, iface := range ifaces {
mac, err := readNetAddressFile(iface)
if err != nil || mac == "" {
continue
} }
mac = strings.ToLower(strings.TrimSpace(mac))
if _, ok := seen[mac]; ok {
continue
}
seen[mac] = struct{}{}
out = append(out, mac)
} }
} return out
func injectSFPDOMTelemetry(dst map[string]any, raw string) {
parsed := parseSFPDOM(raw)
for k, v := range parsed {
dst[k] = v
}
} }
var floatRe = regexp.MustCompile(`[-+]?[0-9]*\.?[0-9]+`) var floatRe = regexp.MustCompile(`[-+]?[0-9]*\.?[0-9]+`)
func parseSFPDOM(raw string) map[string]any { func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
out := map[string]any{} var changed bool
for _, line := range strings.Split(raw, "\n") { for _, line := range strings.Split(raw, "\n") {
trimmed := strings.TrimSpace(line) trimmed := strings.TrimSpace(line)
if trimmed == "" { if trimmed == "" {
@@ -117,26 +112,55 @@ func parseSFPDOM(raw string) map[string]any {
switch { switch {
case strings.Contains(key, "module temperature"): case strings.Contains(key, "module temperature"):
if f, ok := firstFloat(val); ok { if f, ok := firstFloat(val); ok {
out["sfp_temperature_c"] = f dev.SFPTemperatureC = &f
changed = true
} }
case strings.Contains(key, "laser output power"): case strings.Contains(key, "laser output power"):
if f, ok := dbmValue(val); ok { if f, ok := dbmValue(val); ok {
out["sfp_tx_power_dbm"] = f dev.SFPTXPowerDBM = &f
changed = true
} }
case strings.Contains(key, "receiver signal"): case strings.Contains(key, "receiver signal"):
if f, ok := dbmValue(val); ok { if f, ok := dbmValue(val); ok {
out["sfp_rx_power_dbm"] = f dev.SFPRXPowerDBM = &f
changed = true
} }
case strings.Contains(key, "module voltage"): case strings.Contains(key, "module voltage"):
if f, ok := firstFloat(val); ok { if f, ok := firstFloat(val); ok {
out["sfp_voltage_v"] = f dev.SFPVoltageV = &f
changed = true
} }
case strings.Contains(key, "laser bias current"): case strings.Contains(key, "laser bias current"):
if f, ok := firstFloat(val); ok { if f, ok := firstFloat(val); ok {
out["sfp_bias_ma"] = f dev.SFPBiasMA = &f
changed = true
} }
} }
} }
return changed
}
func parseSFPDOM(raw string) map[string]any {
dev := schema.HardwarePCIeDevice{}
if !injectSFPDOMTelemetry(&dev, raw) {
return map[string]any{}
}
out := map[string]any{}
if dev.SFPTemperatureC != nil {
out["sfp_temperature_c"] = *dev.SFPTemperatureC
}
if dev.SFPTXPowerDBM != nil {
out["sfp_tx_power_dbm"] = *dev.SFPTXPowerDBM
}
if dev.SFPRXPowerDBM != nil {
out["sfp_rx_power_dbm"] = *dev.SFPRXPowerDBM
}
if dev.SFPVoltageV != nil {
out["sfp_voltage_v"] = *dev.SFPVoltageV
}
if dev.SFPBiasMA != nil {
out["sfp_bias_ma"] = *dev.SFPBiasMA
}
return out return out
} }

View File

@@ -24,7 +24,7 @@ type nvidiaGPUInfo struct {
} }
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi. // enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
// If the driver/tool is unavailable, NVIDIA devices get UNKNOWN status and // If the driver/tool is unavailable, NVIDIA devices get Unknown status and
// a stable serial fallback based on board serial + slot. // a stable serial fallback based on board serial + slot.
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice { func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice {
if !hasNVIDIADevices(devs) { if !hasNVIDIADevices(devs) {
@@ -78,9 +78,10 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
devs[i].Firmware = &v devs[i].Firmware = &v
} }
status := "OK" status := statusOK
if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 { if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
status = "WARNING" status = statusWarning
devs[i].ErrorDescription = stringPtr("GPU reports uncorrected ECC errors")
} }
devs[i].Status = &status devs[i].Status = &status
injectNVIDIATelemetry(&devs[i], info) injectNVIDIATelemetry(&devs[i], info)
@@ -214,7 +215,7 @@ func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) { func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) {
setPCIeFallbackSerial(dev, boardSerial) setPCIeFallbackSerial(dev, boardSerial)
status := "UNKNOWN" status := statusUnknown
dev.Status = &status dev.Status = &status
} }
@@ -233,25 +234,19 @@ func setPCIeFallbackSerial(dev *schema.HardwarePCIeDevice, boardSerial string) {
} }
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) { func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
if dev.Telemetry == nil {
dev.Telemetry = map[string]any{}
}
if info.TemperatureC != nil { if info.TemperatureC != nil {
dev.Telemetry["temperature_c"] = *info.TemperatureC dev.TemperatureC = info.TemperatureC
} }
if info.PowerW != nil { if info.PowerW != nil {
dev.Telemetry["power_w"] = *info.PowerW dev.PowerW = info.PowerW
} }
if info.ECCUncorrected != nil { if info.ECCUncorrected != nil {
dev.Telemetry["ecc_uncorrected_total"] = *info.ECCUncorrected dev.ECCUncorrectedTotal = info.ECCUncorrected
} }
if info.ECCCorrected != nil { if info.ECCCorrected != nil {
dev.Telemetry["ecc_corrected_total"] = *info.ECCCorrected dev.ECCCorrectedTotal = info.ECCCorrected
} }
if info.HWSlowdown != nil { if info.HWSlowdown != nil {
dev.Telemetry["hw_slowdown_active"] = *info.HWSlowdown dev.HWSlowdown = info.HWSlowdown
}
if len(dev.Telemetry) == 0 {
dev.Telemetry = nil
} }
} }

View File

@@ -54,10 +54,10 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
status := "OK" status := "OK"
devices := []schema.HardwarePCIeDevice{ devices := []schema.HardwarePCIeDevice{
{ {
VendorID: &vendorID, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
BDF: &bdf, VendorID: &vendorID,
Manufacturer: &manufacturer, BDF: &bdf,
Status: &status, Manufacturer: &manufacturer,
}, },
} }
@@ -80,14 +80,14 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" { if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
t.Fatalf("firmware: got %v", out[0].Firmware) t.Fatalf("firmware: got %v", out[0].Firmware)
} }
if out[0].Status == nil || *out[0].Status != "WARNING" { if out[0].Status == nil || *out[0].Status != statusWarning {
t.Fatalf("status: got %v", out[0].Status) t.Fatalf("status: got %v", out[0].Status)
} }
if out[0].Telemetry == nil { if out[0].ECCUncorrectedTotal == nil || *out[0].ECCUncorrectedTotal != 2 {
t.Fatal("expected telemetry") t.Fatalf("ecc_uncorrected_total: got %#v", out[0].ECCUncorrectedTotal)
} }
if got, ok := out[0].Telemetry["ecc_uncorrected_total"].(int64); !ok || got != 2 { if out[0].TemperatureC == nil || *out[0].TemperatureC != 55.5 {
t.Fatalf("ecc_uncorrected_total: got %#v", out[0].Telemetry["ecc_uncorrected_total"]) t.Fatalf("temperature_c: got %#v", out[0].TemperatureC)
} }
} }
@@ -107,7 +107,7 @@ func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
if out[0].SerialNumber == nil || *out[0].SerialNumber != "BOARD-123-PCIE-0000:17:00.0" { if out[0].SerialNumber == nil || *out[0].SerialNumber != "BOARD-123-PCIE-0000:17:00.0" {
t.Fatalf("fallback serial: got %v", out[0].SerialNumber) t.Fatalf("fallback serial: got %v", out[0].SerialNumber)
} }
if out[0].Status == nil || *out[0].Status != "UNKNOWN" { if out[0].Status == nil || *out[0].Status != statusUnknown {
t.Fatalf("fallback status: got %v", out[0].Status) t.Fatalf("fallback status: got %v", out[0].Status)
} }
} }

View File

@@ -79,7 +79,7 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
dev := schema.HardwarePCIeDevice{} dev := schema.HardwarePCIeDevice{}
present := true present := true
dev.Present = &present dev.Present = &present
status := "OK" status := statusOK
dev.Status = &status dev.Status = &status
// Slot is the BDF: "0000:00:02.0" // Slot is the BDF: "0000:00:02.0"
@@ -93,10 +93,32 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
if deviceID != 0 { if deviceID != 0 {
dev.DeviceID = &deviceID dev.DeviceID = &deviceID
} }
if numaNode, ok := readPCINumaNode(bdf); ok {
dev.NUMANode = &numaNode
}
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
dev.LinkWidth = &width
}
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
dev.MaxLinkWidth = &width
}
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
linkSpeed := normalizePCILinkSpeed(speed)
if linkSpeed != "" {
dev.LinkSpeed = &linkSpeed
}
}
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
linkSpeed := normalizePCILinkSpeed(speed)
if linkSpeed != "" {
dev.MaxLinkSpeed = &linkSpeed
}
}
} }
if v := fields["Class"]; v != "" { if v := fields["Class"]; v != "" {
dev.DeviceClass = &v class := mapPCIeDeviceClass(v)
dev.DeviceClass = &class
} }
if v := fields["Vendor"]; v != "" { if v := fields["Vendor"]; v != "" {
dev.Manufacturer = &v dev.Manufacturer = &v
@@ -131,3 +153,55 @@ func readHexFile(path string) (int, error) {
n, err := strconv.ParseInt(s, 16, 64) n, err := strconv.ParseInt(s, 16, 64)
return int(n), err return int(n), err
} }
// readPCINumaNode reads the "numa_node" sysfs attribute of the PCI device
// identified by bdf. It reports false when the attribute cannot be read or
// when the value is negative.
func readPCINumaNode(bdf string) (int, bool) {
	if node, ok := readPCIIntAttribute(bdf, "numa_node"); ok && node >= 0 {
		return node, true
	}
	return 0, false
}
// readPCIIntAttribute reads /sys/bus/pci/devices/<bdf>/<attribute> by running
// `cat` and parses the trimmed output as a decimal integer.
//
// It returns (0, false) when the command fails, the content is not an
// integer, or the parsed value is negative.
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
	sysfsPath := "/sys/bus/pci/devices/" + bdf + "/" + attribute
	raw, err := exec.Command("cat", sysfsPath).Output()
	if err != nil {
		return 0, false
	}
	parsed, convErr := strconv.Atoi(strings.TrimSpace(string(raw)))
	if convErr == nil && parsed >= 0 {
		return parsed, true
	}
	return 0, false
}
// readPCIStringAttribute reads /sys/bus/pci/devices/<bdf>/<attribute> by
// running `cat` and returns the content with surrounding whitespace removed.
//
// It returns ("", false) when the command fails or the trimmed content is
// empty.
func readPCIStringAttribute(bdf, attribute string) (string, bool) {
	sysfsPath := "/sys/bus/pci/devices/" + bdf + "/" + attribute
	raw, err := exec.Command("cat", sysfsPath).Output()
	if err != nil {
		return "", false
	}
	if text := strings.TrimSpace(string(raw)); text != "" {
		return text, true
	}
	return "", false
}
// normalizePCILinkSpeed converts a PCIe link speed string such as
// "8.0 GT/s PCIe" into a generation label ("Gen1" .. "Gen6").
//
// The transfer rate is parsed numerically rather than matched as a substring,
// so spellings without a trailing ".0" (for example "5 GT/s" or "8GT/s") are
// recognised as well. The first whitespace-separated field that parses to a
// known rate decides the result; an empty string is returned when no field
// does (for example "Unknown speed").
func normalizePCILinkSpeed(raw string) string {
	for _, field := range strings.Fields(strings.ToLower(raw)) {
		// Tolerate a unit glued to the number ("8gt/s").
		rate, err := strconv.ParseFloat(strings.TrimSuffix(field, "gt/s"), 64)
		if err != nil {
			continue
		}
		// All known rates are exactly representable as float64, so direct
		// comparison is safe here.
		switch rate {
		case 2.5:
			return "Gen1"
		case 5:
			return "Gen2"
		case 8:
			return "Gen3"
		case 16:
			return "Gen4"
		case 32:
			return "Gen5"
		case 64:
			return "Gen6"
		}
	}
	return ""
}

View File

@@ -35,7 +35,27 @@ func TestParseLspci_filtersExcludedClasses(t *testing.T) {
if len(devs) != 1 { if len(devs) != 1 {
t.Fatalf("expected 1 filtered device, got %d", len(devs)) t.Fatalf("expected 1 filtered device, got %d", len(devs))
} }
if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VGA compatible controller" { if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VideoController" {
t.Fatalf("unexpected remaining class: %v", devs[0].DeviceClass) t.Fatalf("unexpected remaining class: %v", devs[0].DeviceClass)
} }
} }
// TestNormalizePCILinkSpeed checks the mapping from sysfs link speed strings
// to PCIe generation labels, including an unrecognised input.
func TestNormalizePCILinkSpeed(t *testing.T) {
	type speedCase struct {
		raw  string
		want string
	}
	cases := []speedCase{
		{raw: "2.5 GT/s PCIe", want: "Gen1"},
		{raw: "5.0 GT/s PCIe", want: "Gen2"},
		{raw: "8.0 GT/s PCIe", want: "Gen3"},
		{raw: "16.0 GT/s PCIe", want: "Gen4"},
		{raw: "32.0 GT/s PCIe", want: "Gen5"},
		{raw: "64.0 GT/s PCIe", want: "Gen6"},
		{raw: "unknown", want: ""},
	}
	for i := range cases {
		tc := cases[i]
		got := normalizePCILinkSpeed(tc.raw)
		if got == tc.want {
			continue
		}
		t.Fatalf("normalizePCILinkSpeed(%q)=%q want %q", tc.raw, got, tc.want)
	}
}

View File

@@ -114,7 +114,7 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
} }
} }
status := "OK" status := statusOK
psu.Status = &status psu.Status = &status
return psu, true return psu, true
@@ -123,9 +123,12 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
type psuSDR struct { type psuSDR struct {
slot int slot int
status string status string
reason string
inputPowerW *float64 inputPowerW *float64
outputPowerW *float64 outputPowerW *float64
inputVoltage *float64 inputVoltage *float64
temperatureC *float64
healthPct *float64
} }
var psuSlotRe = regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b|\bps\s*([0-9]+)\b`) var psuSlotRe = regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b|\bps\s*([0-9]+)\b`)
@@ -148,10 +151,11 @@ func parsePSUSDR(raw string) map[int]psuSDR {
entry := out[slot] entry := out[slot]
entry.slot = slot entry.slot = slot
if entry.status == "" { if entry.status == "" {
entry.status = "OK" entry.status = statusOK
} }
if state != "" && state != "ok" && state != "ns" { if state != "" && state != "ok" && state != "ns" {
entry.status = "FAILED" entry.status = statusCritical
entry.reason = "PSU sensor reported non-OK state: " + state
} }
lowerName := strings.ToLower(name) lowerName := strings.ToLower(name)
@@ -162,6 +166,10 @@ func parsePSUSDR(raw string) map[int]psuSDR {
entry.outputPowerW = parseFloatPtr(value) entry.outputPowerW = parseFloatPtr(value)
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"): case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
entry.inputVoltage = parseFloatPtr(value) entry.inputVoltage = parseFloatPtr(value)
case strings.Contains(lowerName, "temp"):
entry.temperatureC = parseFloatPtr(value)
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
entry.healthPct = parsePercentPtr(value)
} }
out[slot] = entry out[slot] = entry
} }
@@ -187,12 +195,23 @@ func mergePSUSDR(psus []schema.HardwarePowerSupply, sdr map[int]psuSDR) {
if entry.inputVoltage != nil { if entry.inputVoltage != nil {
psus[i].InputVoltage = entry.inputVoltage psus[i].InputVoltage = entry.inputVoltage
} }
if entry.temperatureC != nil {
psus[i].TemperatureC = entry.temperatureC
}
if entry.healthPct != nil {
psus[i].LifeRemainingPct = entry.healthPct
lifeUsed := 100 - *entry.healthPct
psus[i].LifeUsedPct = &lifeUsed
}
if entry.status != "" { if entry.status != "" {
psus[i].Status = &entry.status psus[i].Status = &entry.status
} }
if psus[i].Status != nil && *psus[i].Status == "OK" { if entry.reason != "" {
psus[i].ErrorDescription = &entry.reason
}
if psus[i].Status != nil && *psus[i].Status == statusOK {
if (entry.inputPowerW == nil && entry.outputPowerW == nil && entry.inputVoltage == nil) && entry.status == "" { if (entry.inputPowerW == nil && entry.outputPowerW == nil && entry.inputVoltage == nil) && entry.status == "" {
unknown := "UNKNOWN" unknown := statusUnknown
psus[i].Status = &unknown psus[i].Status = &unknown
} }
} }

View File

@@ -7,6 +7,8 @@ func TestParsePSUSDR(t *testing.T) {
PS1 Input Power | 215 Watts | ok PS1 Input Power | 215 Watts | ok
PS1 Output Power | 198 Watts | ok PS1 Output Power | 198 Watts | ok
PS1 Input Voltage | 229 Volts | ok PS1 Input Voltage | 229 Volts | ok
PS1 Temp | 39 C | ok
PS1 Health | 97 % | ok
PS2 Input Power | 0 Watts | cr PS2 Input Power | 0 Watts | cr
` `
@@ -14,7 +16,7 @@ PS2 Input Power | 0 Watts | cr
if len(got) != 2 { if len(got) != 2 {
t.Fatalf("len(got)=%d want 2", len(got)) t.Fatalf("len(got)=%d want 2", len(got))
} }
if got[1].status != "OK" { if got[1].status != statusOK {
t.Fatalf("ps1 status=%q", got[1].status) t.Fatalf("ps1 status=%q", got[1].status)
} }
if got[1].inputPowerW == nil || *got[1].inputPowerW != 215 { if got[1].inputPowerW == nil || *got[1].inputPowerW != 215 {
@@ -26,7 +28,13 @@ PS2 Input Power | 0 Watts | cr
if got[1].inputVoltage == nil || *got[1].inputVoltage != 229 { if got[1].inputVoltage == nil || *got[1].inputVoltage != 229 {
t.Fatalf("ps1 input voltage=%v", got[1].inputVoltage) t.Fatalf("ps1 input voltage=%v", got[1].inputVoltage)
} }
if got[2].status != "FAILED" { if got[1].temperatureC == nil || *got[1].temperatureC != 39 {
t.Fatalf("ps1 temperature=%v", got[1].temperatureC)
}
if got[1].healthPct == nil || *got[1].healthPct != 97 {
t.Fatalf("ps1 health=%v", got[1].healthPct)
}
if got[2].status != statusCritical {
t.Fatalf("ps2 status=%q", got[2].status) t.Fatalf("ps2 status=%q", got[2].status)
} }
} }

View File

@@ -0,0 +1,132 @@
package collector
import (
"bee/audit/internal/schema"
"strconv"
"strings"
)
// enrichPSUsWithTelemetry fills in PSU temperature and remaining-life values
// from the sensors document, keyed by PSU slot.
//
// Values already present on a PSU are kept. When a remaining-life reading is
// applied, LifeUsedPct is set to 100 minus that reading. The input slice is
// modified in place and returned.
func enrichPSUsWithTelemetry(psus []schema.HardwarePowerSupply, doc sensorsDoc) []schema.HardwarePowerSupply {
	if len(psus) == 0 || len(doc) == 0 {
		return psus
	}

	temps := psuTempsFromSensors(doc)
	health := psuHealthFromSensors(doc)

	for idx := range psus {
		psu := &psus[idx]
		slot := derefPSUSlot(psu.Slot)
		if slot == "" {
			continue
		}
		if temp, found := temps[slot]; found && psu.TemperatureC == nil {
			psu.TemperatureC = &temp
		}
		if remaining, found := health[slot]; found && psu.LifeRemainingPct == nil {
			used := 100 - remaining
			psu.LifeRemainingPct = &remaining
			psu.LifeUsedPct = &used
		}
	}
	return psus
}
// psuHealthFromSensors extracts one health/remaining-life reading per PSU
// slot from the sensors document.
//
// Go randomises map iteration order, so when several chip/feature pairs
// resolve to the same slot the winner must not depend on visiting order.
// The reading whose (chip, feature) name pair sorts first is kept, which
// makes the result reproducible between runs.
func psuHealthFromSensors(doc sensorsDoc) map[string]float64 {
	// source records which chip/feature pair supplied a slot's reading.
	type source struct{ chip, feature string }

	out := map[string]float64{}
	picked := map[string]source{}
	for chip, features := range doc {
		for featureName, raw := range features {
			feature, ok := raw.(map[string]any)
			if !ok {
				continue
			}
			if !isLikelyPSUHealth(chip, featureName) {
				continue
			}
			value, ok := firstFeaturePercent(feature)
			if !ok {
				continue
			}
			slot, ok := detectPSUSlot(chip, featureName)
			if !ok {
				continue
			}
			// Keep the existing reading when its source sorts before this one.
			if prev, seen := picked[slot]; seen &&
				(prev.chip < chip || (prev.chip == chip && prev.feature <= featureName)) {
				continue
			}
			picked[slot] = source{chip: chip, feature: featureName}
			out[slot] = value
		}
	}
	return out
}
// firstFeaturePercent returns the first numeric value, in sortedFeatureKeys
// order, whose key mentions "health", "life" or "remain". Keys ending in
// "_alarm" are ignored. It reports false when no such value exists.
func firstFeaturePercent(feature map[string]any) (float64, bool) {
	for _, name := range sortedFeatureKeys(feature) {
		lowered := strings.ToLower(name)
		if strings.HasSuffix(lowered, "_alarm") {
			continue
		}
		relevant := strings.Contains(lowered, "health") ||
			strings.Contains(lowered, "life") ||
			strings.Contains(lowered, "remain")
		if !relevant {
			continue
		}
		if pct, ok := floatFromAny(feature[name]); ok {
			return pct, true
		}
	}
	return 0, false
}
// isLikelyPSUHealth reports whether the combined chip and feature names
// mention a power supply ("psu" or "power supply") together with a health
// marker ("health", "life" or "remain"). Matching is case-insensitive.
func isLikelyPSUHealth(chip, feature string) bool {
	label := strings.ToLower(chip + " " + feature)
	if !strings.Contains(label, "psu") && !strings.Contains(label, "power supply") {
		return false
	}
	for _, marker := range []string{"health", "life", "remain"} {
		if strings.Contains(label, marker) {
			return true
		}
	}
	return false
}
// psuTempsFromSensors extracts one temperature reading per PSU slot from the
// sensors document.
//
// Go randomises map iteration order, so when several temperature features
// resolve to the same slot the winner must not depend on visiting order.
// The reading whose (chip, feature) name pair sorts first is kept, which
// makes the result reproducible between runs.
func psuTempsFromSensors(doc sensorsDoc) map[string]float64 {
	// source records which chip/feature pair supplied a slot's reading.
	type source struct{ chip, feature string }

	out := map[string]float64{}
	picked := map[string]source{}
	for chip, features := range doc {
		for featureName, raw := range features {
			feature, ok := raw.(map[string]any)
			if !ok || classifySensorFeature(feature) != "temp" {
				continue
			}
			if !isLikelyPSUTemp(chip, featureName) {
				continue
			}
			temp, ok := firstFeatureFloat(feature, "_input")
			if !ok {
				continue
			}
			slot, ok := detectPSUSlot(chip, featureName)
			if !ok {
				continue
			}
			// Keep the existing reading when its source sorts before this one.
			if prev, seen := picked[slot]; seen &&
				(prev.chip < chip || (prev.chip == chip && prev.feature <= featureName)) {
				continue
			}
			picked[slot] = source{chip: chip, feature: featureName}
			out[slot] = temp
		}
	}
	return out
}
// isLikelyPSUTemp reports whether the combined chip and feature names mention
// a power supply ("psu" or "power supply"), ignoring case.
func isLikelyPSUTemp(chip, feature string) bool {
	label := strings.ToLower(chip + " " + feature)
	for _, marker := range []string{"psu", "power supply"} {
		if strings.Contains(label, marker) {
			return true
		}
	}
	return false
}
// detectPSUSlot looks for a PSU number in each part, in order, using
// psuSlotRe. The first positive number found is converted to a zero-based
// slot string (number minus one). It reports false when no part yields a
// positive number.
func detectPSUSlot(parts ...string) (string, bool) {
	for _, candidate := range parts {
		groups := psuSlotRe.FindStringSubmatch(strings.ToLower(candidate))
		// groups[0] is the whole match; the capture groups follow.
		for idx := 1; idx < len(groups); idx++ {
			if groups[idx] == "" {
				continue
			}
			number, err := strconv.Atoi(groups[idx])
			if err != nil || number <= 0 {
				continue
			}
			return strconv.Itoa(number - 1), true
		}
	}
	return "", false
}

View File

@@ -0,0 +1,42 @@
package collector
import (
"testing"
"bee/audit/internal/schema"
)
// TestEnrichPSUsWithTelemetry verifies that temperature and remaining-life
// readings from a sensors document are attached to the PSU in the matching
// slot, and that life used is derived from life remaining.
func TestEnrichPSUsWithTelemetry(t *testing.T) {
	firstSlot, secondSlot := "0", "1"
	input := []schema.HardwarePowerSupply{
		{Slot: &firstSlot},
		{Slot: &secondSlot},
	}
	sensors := sensorsDoc{
		"psu-hwmon-0": {
			"PSU1 Temp":           map[string]any{"temp1_input": 39.5},
			"PSU2 Temp":           map[string]any{"temp2_input": 41.0},
			"PSU1 Health":         map[string]any{"health1_input": 98.0},
			"PSU2 Remaining Life": map[string]any{"life2_input": 95.0},
		},
	}

	// requireValue aborts the test unless got points at want.
	requireValue := func(label string, got *float64, want float64) {
		t.Helper()
		if got == nil || *got != want {
			t.Fatalf("%s mismatch: %#v", label, got)
		}
	}

	got := enrichPSUsWithTelemetry(input, sensors)
	requireValue("psu0 temperature", got[0].TemperatureC, 39.5)
	requireValue("psu1 temperature", got[1].TemperatureC, 41.0)
	requireValue("psu0 life remaining", got[0].LifeRemainingPct, 98.0)
	requireValue("psu0 life used", got[0].LifeUsedPct, 2.0)
	requireValue("psu1 life remaining", got[1].LifeRemainingPct, 95.0)
}

View File

@@ -83,11 +83,7 @@ func isLikelyRAIDController(dev schema.HardwarePCIeDevice) bool {
if dev.DeviceClass == nil { if dev.DeviceClass == nil {
return false return false
} }
c := strings.ToLower(*dev.DeviceClass) return isRAIDClass(*dev.DeviceClass)
return strings.Contains(c, "raid") ||
strings.Contains(c, "sas") ||
strings.Contains(c, "mass storage") ||
strings.Contains(c, "serial attached scsi")
} }
func collectStorcliDrives() []schema.HardwareStorage { func collectStorcliDrives() []schema.HardwareStorage {
@@ -182,7 +178,10 @@ func parseSASIrcuDisplay(raw string) []schema.HardwareStorage {
present := true present := true
status := mapRAIDDriveStatus(b["State"]) status := mapRAIDDriveStatus(b["State"])
s := schema.HardwareStorage{Present: &present, Status: &status} s := schema.HardwareStorage{
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
Present: &present,
}
enclosure := strings.TrimSpace(b["Enclosure #"]) enclosure := strings.TrimSpace(b["Enclosure #"])
slot := strings.TrimSpace(b["Slot #"]) slot := strings.TrimSpace(b["Slot #"])
@@ -281,7 +280,10 @@ func parseArcconfPhysicalDrives(raw string) []schema.HardwareStorage {
for _, b := range blocks { for _, b := range blocks {
present := true present := true
status := mapRAIDDriveStatus(b["State"]) status := mapRAIDDriveStatus(b["State"])
s := schema.HardwareStorage{Present: &present, Status: &status} s := schema.HardwareStorage{
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
Present: &present,
}
if v := strings.TrimSpace(b["Reported Location"]); v != "" { if v := strings.TrimSpace(b["Reported Location"]); v != "" {
s.Slot = &v s.Slot = &v
@@ -362,8 +364,11 @@ func parseSSACLIPhysicalDrives(raw string) []schema.HardwareStorage {
if m := ssacliPhysicalDriveLine.FindStringSubmatch(trimmed); len(m) == 3 { if m := ssacliPhysicalDriveLine.FindStringSubmatch(trimmed); len(m) == 3 {
flush() flush()
present := true present := true
status := "UNKNOWN" status := statusUnknown
s := schema.HardwareStorage{Present: &present, Status: &status} s := schema.HardwareStorage{
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
Present: &present,
}
slot := m[1] slot := m[1]
s.Slot = &slot s.Slot = &slot
@@ -475,8 +480,8 @@ func storcliDriveToStorage(d struct {
present := true present := true
status := mapRAIDDriveStatus(d.State) status := mapRAIDDriveStatus(d.State)
s := schema.HardwareStorage{ s := schema.HardwareStorage{
Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
Status: &status, Present: &present,
} }
if v := strings.TrimSpace(d.EIDSlt); v != "" { if v := strings.TrimSpace(d.EIDSlt); v != "" {
@@ -527,15 +532,15 @@ func mapRAIDDriveStatus(raw string) string {
u := strings.ToUpper(strings.TrimSpace(raw)) u := strings.ToUpper(strings.TrimSpace(raw))
switch { switch {
case strings.Contains(u, "OK"), strings.Contains(u, "OPTIMAL"), strings.Contains(u, "READY"): case strings.Contains(u, "OK"), strings.Contains(u, "OPTIMAL"), strings.Contains(u, "READY"):
return "OK" return statusOK
case strings.Contains(u, "ONLN"), strings.Contains(u, "ONLINE"): case strings.Contains(u, "ONLN"), strings.Contains(u, "ONLINE"):
return "OK" return statusOK
case strings.Contains(u, "RBLD"), strings.Contains(u, "REBUILD"): case strings.Contains(u, "RBLD"), strings.Contains(u, "REBUILD"):
return "WARNING" return statusWarning
case strings.Contains(u, "FAIL"), strings.Contains(u, "OFFLINE"): case strings.Contains(u, "FAIL"), strings.Contains(u, "OFFLINE"):
return "CRITICAL" return statusCritical
default: default:
return "UNKNOWN" return statusUnknown
} }
} }
@@ -641,8 +646,9 @@ func enrichStorageWithVROC(storage []schema.HardwareStorage, pcie []schema.Hardw
storage[i].Telemetry["vroc_array"] = arr.Name storage[i].Telemetry["vroc_array"] = arr.Name
storage[i].Telemetry["vroc_degraded"] = arr.Degraded storage[i].Telemetry["vroc_degraded"] = arr.Degraded
if arr.Degraded { if arr.Degraded {
status := "WARNING" status := statusWarning
storage[i].Status = &status storage[i].Status = &status
storage[i].ErrorDescription = stringPtr("VROC array is degraded")
} }
updated++ updated++
} }
@@ -659,14 +665,14 @@ func hasVROCController(pcie []schema.HardwarePCIeDevice) bool {
class := "" class := ""
if dev.DeviceClass != nil { if dev.DeviceClass != nil {
class = strings.ToLower(*dev.DeviceClass) class = strings.TrimSpace(*dev.DeviceClass)
} }
model := "" model := ""
if dev.Model != nil { if dev.Model != nil {
model = strings.ToLower(*dev.Model) model = strings.ToLower(*dev.Model)
} }
if strings.Contains(class, "raid") || if isRAIDClass(class) ||
strings.Contains(model, "vroc") || strings.Contains(model, "vroc") ||
strings.Contains(model, "volume management device") || strings.Contains(model, "volume management device") ||
strings.Contains(model, "vmd") { strings.Contains(model, "vmd") {

View File

@@ -0,0 +1,334 @@
package collector
import (
	"encoding/json"
	"log/slog"
	"sort"
	"strconv"
	"strings"

	"bee/audit/internal/schema"
)
// raidControllerTelemetry holds battery/cache-protection readings collected
// for a single RAID controller. Every field is optional; nil means the value
// was not reported or could not be parsed.
type raidControllerTelemetry struct {
// Charge level; presumably a percentage (parsed with parsePercentPtr).
BatteryChargePct *float64
// Health level; presumably a percentage (parsed with parsePercentPtr).
BatteryHealthPct *float64
// Temperature reading; presumably degrees Celsius — TODO confirm unit.
BatteryTemperatureC *float64
// Voltage reading; presumably volts — TODO confirm unit.
BatteryVoltageV *float64
// Whether the controller reports that the battery needs replacing.
BatteryReplaceRequired *bool
// Problem description; when set, applyRAIDControllerTelemetry copies it to
// the device and raises a nil or OK status to warning.
ErrorDescription *string
}
// enrichPCIeWithRAIDTelemetry attaches controller battery telemetry to RAID
// controllers in devs.
//
// Telemetry is grouped by PCI vendor ID and consumed positionally: the n-th
// RAID controller of a vendor receives the n-th telemetry entry for that
// vendor. Devices without a vendor ID, non-RAID devices and controllers
// beyond the available entries are left untouched. The slice is modified in
// place and returned.
func enrichPCIeWithRAIDTelemetry(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
	telemetryByVendor := collectRAIDControllerTelemetry()
	if len(telemetryByVendor) == 0 {
		return devs
	}

	// consumed tracks how many entries have been handed out per vendor.
	consumed := map[int]int{}
	for i := range devs {
		dev := &devs[i]
		if dev.VendorID == nil || !isLikelyRAIDController(*dev) {
			continue
		}
		vendorID := *dev.VendorID
		candidates := telemetryByVendor[vendorID]
		next := consumed[vendorID]
		if next >= len(candidates) {
			continue
		}
		consumed[vendorID] = next + 1
		applyRAIDControllerTelemetry(dev, candidates[next])
	}
	return devs
}
// applyRAIDControllerTelemetry copies every non-nil telemetry value onto dev.
//
// When the telemetry carries an error description, it is copied too, and a
// device whose status is unset or OK is changed to warning. A status other
// than OK is kept as it is.
func applyRAIDControllerTelemetry(dev *schema.HardwarePCIeDevice, tel raidControllerTelemetry) {
	if v := tel.BatteryChargePct; v != nil {
		dev.BatteryChargePct = v
	}
	if v := tel.BatteryHealthPct; v != nil {
		dev.BatteryHealthPct = v
	}
	if v := tel.BatteryTemperatureC; v != nil {
		dev.BatteryTemperatureC = v
	}
	if v := tel.BatteryVoltageV; v != nil {
		dev.BatteryVoltageV = v
	}
	if v := tel.BatteryReplaceRequired; v != nil {
		dev.BatteryReplaceRequired = v
	}

	if tel.ErrorDescription == nil {
		return
	}
	dev.ErrorDescription = tel.ErrorDescription
	if dev.Status != nil && *dev.Status != statusOK {
		return
	}
	warning := statusWarning
	dev.Status = &warning
}
// collectRAIDControllerTelemetry queries storcli64, ssacli and arcconf (in
// that order) and returns the parsed controller telemetry grouped by PCI
// vendor ID. A tool that fails to run or yields no telemetry contributes
// nothing.
func collectRAIDControllerTelemetry() map[int][]raidControllerTelemetry {
	out := map[int][]raidControllerTelemetry{}

	// record stores a non-empty result under vendor and logs how many
	// controllers it covers.
	record := func(vendor int, logMsg string, found []raidControllerTelemetry) {
		if len(found) == 0 {
			return
		}
		out[vendor] = append(out[vendor], found...)
		slog.Info(logMsg, "count", len(found))
	}

	if raw, err := raidToolQuery("storcli64", "/call", "show", "all", "J"); err == nil {
		record(vendorBroadcomLSI, "raid: storcli controller telemetry", parseStorcliControllerTelemetry(raw))
	}
	if raw, err := raidToolQuery("ssacli", "ctrl", "all", "show", "config", "detail"); err == nil {
		record(vendorHPE, "raid: ssacli controller telemetry", parseSSACLIControllerTelemetry(string(raw)))
	}
	if raw, err := raidToolQuery("arcconf", "getconfig", "1", "ad"); err == nil {
		record(vendorAdaptec, "raid: arcconf controller telemetry", parseArcconfControllerTelemetry(string(raw)))
	}
	return out
}
// parseStorcliControllerTelemetry decodes storcli JSON output and returns one
// telemetry entry per controller that reported any battery or cache-vault
// data.
//
// For each controller the "BBU_Info", "BBU_Info_Details", "CV_Info" and
// "CV_Info_Details" sections of "Response Data" are merged in that order.
// Invalid JSON is logged as a warning and yields nil.
func parseStorcliControllerTelemetry(raw []byte) []raidControllerTelemetry {
	type controller struct {
		ResponseData map[string]any `json:"Response Data"`
	}
	var payload struct {
		Controllers []controller `json:"Controllers"`
	}
	if err := json.Unmarshal(raw, &payload); err != nil {
		slog.Warn("raid: parse storcli controller telemetry failed", "err", err)
		return nil
	}

	sections := []string{"BBU_Info", "BBU_Info_Details", "CV_Info", "CV_Info_Details"}
	var result []raidControllerTelemetry
	for _, ctl := range payload.Controllers {
		var tel raidControllerTelemetry
		for _, section := range sections {
			mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData[section]))
		}
		if !hasRAIDControllerTelemetry(tel) {
			continue
		}
		result = append(result, tel)
	}
	return result
}
// nestedStringMap flattens a decoded JSON object, or a list of objects, into
// a single map of lower-cased key path to string value.
//
// List items that are not objects are skipped. Any other input type yields
// nil.
func nestedStringMap(raw any) map[string]string {
	var sources []map[string]any
	switch typed := raw.(type) {
	case map[string]any:
		sources = append(sources, typed)
	case []any:
		for _, item := range typed {
			if obj, ok := item.(map[string]any); ok {
				sources = append(sources, obj)
			}
		}
	default:
		return nil
	}

	flat := map[string]string{}
	for _, src := range sources {
		flattenStringMap("", src, flat)
	}
	return flat
}
// flattenStringMap walks in recursively and writes every scalar leaf into out.
//
// The key of a leaf is the lower-cased, space-joined path of the keys leading
// to it, prefixed with prefix. Strings are stored as they are, numbers in
// their shortest decimal form and booleans as "true" or "false". Objects
// inside lists are flattened under the list's own key; other list items and
// nulls are ignored.
func flattenStringMap(prefix string, in map[string]any, out map[string]string) {
	for name, val := range in {
		path := strings.TrimSpace(strings.ToLower(strings.Trim(prefix+" "+name, " ")))
		switch typed := val.(type) {
		case string:
			out[path] = typed
		case bool:
			out[path] = strconv.FormatBool(typed)
		case float64:
			out[path] = strconv.FormatFloat(typed, 'f', -1, 64)
		case json.Number:
			out[path] = typed.String()
		case map[string]any:
			flattenStringMap(path, typed, out)
		case []any:
			for _, item := range typed {
				if obj, ok := item.(map[string]any); ok {
					flattenStringMap(path, obj, out)
				}
			}
		}
	}
}
// mergeStorcliBatteryMap copies battery / cache-vault readings from a flattened
// storcli section into tel. A field that is already set on tel is never
// overwritten, so earlier sections take precedence over later ones.
//
// Each key is assigned to at most one telemetry field by substring match, the
// groups being tried in the order charge, health, temperature, voltage,
// replace, state. Several keys of one section can match the same field (for
// example any key containing "charge"), so candidates are applied in a
// deterministic order instead of Go's randomized map order: keys matching a
// more specific needle come first, ties are broken by key name. A candidate
// whose value does not parse leaves the field unset for the next candidate.
func mergeStorcliBatteryMap(tel *raidControllerTelemetry, fields map[string]string) {
	if len(fields) == 0 {
		return
	}

	const (
		targetCharge = iota
		targetHealth
		targetTemperature
		targetVoltage
		targetReplace
		targetState
	)
	// Needles per target, most specific first; the position is the rank.
	needles := [...][]string{
		targetCharge:      {"relative state of charge", "remaining capacity", "charge"},
		targetHealth:      {"state of health", "health"},
		targetTemperature: {"temperature"},
		targetVoltage:     {"voltage"},
		targetReplace:     {"replace"},
		targetState:       {"learn cycle requested", "battery state", "capacitance state"},
	}

	type candidate struct {
		key    string
		target int
		rank   int
	}
	classify := func(key string) (candidate, bool) {
		lower := strings.ToLower(strings.TrimSpace(key))
		for target, group := range needles {
			for rank, needle := range group {
				if strings.Contains(lower, needle) {
					return candidate{key: key, target: target, rank: rank}, true
				}
			}
		}
		return candidate{}, false
	}
	before := func(a, b candidate) bool {
		if a.rank != b.rank {
			return a.rank < b.rank
		}
		return a.key < b.key
	}

	// Insertion sort: sections hold a handful of keys, and this keeps the
	// block independent of additional imports.
	ordered := make([]candidate, 0, len(fields))
	for key := range fields {
		c, ok := classify(key)
		if !ok {
			continue
		}
		pos := len(ordered)
		ordered = append(ordered, c)
		for pos > 0 && before(c, ordered[pos-1]) {
			ordered[pos] = ordered[pos-1]
			pos--
		}
		ordered[pos] = c
	}

	for _, c := range ordered {
		raw := fields[c.key]
		switch c.target {
		case targetCharge:
			if tel.BatteryChargePct == nil {
				tel.BatteryChargePct = parsePercentPtr(raw)
			}
		case targetHealth:
			if tel.BatteryHealthPct == nil {
				tel.BatteryHealthPct = parsePercentPtr(raw)
			}
		case targetTemperature:
			if tel.BatteryTemperatureC == nil {
				tel.BatteryTemperatureC = parseFloatPtr(raw)
			}
		case targetVoltage:
			if tel.BatteryVoltageV == nil {
				tel.BatteryVoltageV = parseFloatPtr(raw)
			}
		case targetReplace:
			if tel.BatteryReplaceRequired == nil {
				tel.BatteryReplaceRequired = parseReplaceRequired(raw)
			}
		case targetState:
			if desc := batteryStateDescription(raw); desc != nil && tel.ErrorDescription == nil {
				tel.ErrorDescription = desc
			}
		}
	}
}
// parseSSACLIControllerTelemetry extracts battery/capacitor telemetry from
// ssacli text output. A new controller record starts at every header line;
// "key: value" lines that follow are applied to the current record. Records
// without any telemetry are dropped.
//
// A line starting with "smart array" is always a header. A line starting with
// "controller " is a header only when it is not a "key: value" detail line:
// detail lines such as "Controller Status: OK" would otherwise start a new
// record and split one controller's readings across several records.
func parseSSACLIControllerTelemetry(raw string) []raidControllerTelemetry {
	var out []raidControllerTelemetry
	var current *raidControllerTelemetry
	flush := func() {
		if current != nil && hasRAIDControllerTelemetry(*current) {
			out = append(out, *current)
		}
		current = nil
	}

	for _, line := range strings.Split(raw, "\n") {
		trimmed := strings.TrimSpace(line)
		if trimmed == "" {
			continue
		}
		lower := strings.ToLower(trimmed)
		idx := strings.Index(trimmed, ":")
		// A detail line has a non-empty key and a non-empty value.
		isKeyValue := idx > 0 && strings.TrimSpace(trimmed[idx+1:]) != ""

		if strings.HasPrefix(lower, "smart array") || (strings.HasPrefix(lower, "controller ") && !isKeyValue) {
			flush()
			current = &raidControllerTelemetry{}
			continue
		}
		if current == nil || idx <= 0 {
			// Text before the first header, or a line without a key.
			continue
		}

		key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
		val := strings.TrimSpace(trimmed[idx+1:])
		switch {
		case strings.Contains(key, "capacitor temperature"), strings.Contains(key, "battery temperature"):
			current.BatteryTemperatureC = parseFloatPtr(val)
		case strings.Contains(key, "capacitor voltage"), strings.Contains(key, "battery voltage"):
			current.BatteryVoltageV = parseFloatPtr(val)
		case strings.Contains(key, "capacitor charge"), strings.Contains(key, "battery charge"):
			current.BatteryChargePct = parsePercentPtr(val)
		case strings.Contains(key, "capacitor health"), strings.Contains(key, "battery health"):
			current.BatteryHealthPct = parsePercentPtr(val)
		case strings.Contains(key, "replace"), strings.Contains(key, "failed"):
			// First verdict wins for both the flag and the description.
			if current.BatteryReplaceRequired == nil {
				current.BatteryReplaceRequired = parseReplaceRequired(val)
			}
			if desc := batteryStateDescription(val); desc != nil && current.ErrorDescription == nil {
				current.ErrorDescription = desc
			}
		}
	}
	flush()
	return out
}
// parseArcconfControllerTelemetry scans arcconf text output for "key: value"
// lines describing the battery or capacitor and folds all of them into a
// single telemetry record. It returns a one-element slice when at least one
// reading was found and nil otherwise.
func parseArcconfControllerTelemetry(raw string) []raidControllerTelemetry {
	var record raidControllerTelemetry

	for _, line := range strings.Split(raw, "\n") {
		name, value, found := strings.Cut(strings.TrimSpace(line), ":")
		if !found || name == "" {
			// No separator, or nothing in front of it.
			continue
		}
		key := strings.ToLower(strings.TrimSpace(name))
		val := strings.TrimSpace(value)

		switch {
		case strings.Contains(key, "battery temperature") || strings.Contains(key, "capacitor temperature"):
			record.BatteryTemperatureC = parseFloatPtr(val)
		case strings.Contains(key, "battery voltage") || strings.Contains(key, "capacitor voltage"):
			record.BatteryVoltageV = parseFloatPtr(val)
		case strings.Contains(key, "battery charge") || strings.Contains(key, "capacitor charge"):
			record.BatteryChargePct = parsePercentPtr(val)
		case strings.Contains(key, "battery health") || strings.Contains(key, "capacitor health"):
			record.BatteryHealthPct = parsePercentPtr(val)
		case strings.Contains(key, "replace") || strings.Contains(key, "failed"):
			// The first verdict and the first description are kept.
			if record.BatteryReplaceRequired == nil {
				record.BatteryReplaceRequired = parseReplaceRequired(val)
			}
			if record.ErrorDescription == nil {
				if desc := batteryStateDescription(val); desc != nil {
					record.ErrorDescription = desc
				}
			}
		}
	}

	if !hasRAIDControllerTelemetry(record) {
		return nil
	}
	return []raidControllerTelemetry{record}
}
// hasRAIDControllerTelemetry reports whether at least one battery reading or
// an error description is set on tel.
func hasRAIDControllerTelemetry(tel raidControllerTelemetry) bool {
	switch {
	case tel.BatteryChargePct != nil,
		tel.BatteryHealthPct != nil,
		tel.BatteryTemperatureC != nil,
		tel.BatteryVoltageV != nil,
		tel.BatteryReplaceRequired != nil:
		return true
	}
	return tel.ErrorDescription != nil
}
// parsePercentPtr removes surrounding whitespace and every "%" character from
// raw and hands the remainder to parseFloatPtr.
func parsePercentPtr(raw string) *float64 {
	trimmed := strings.TrimSpace(raw)
	return parseFloatPtr(strings.ReplaceAll(trimmed, "%", ""))
}
// parseReplaceRequired interprets a vendor-tool status value as a
// "battery must be replaced" verdict.
//
// It returns a pointer to true for alarming values ("Yes", "Failed",
// "Replacement required"), a pointer to false for healthy or negated values
// ("No", "OK", "Good", "Optimal", "Normal", "Not required") and nil when the
// value is empty or carries no verdict ("Unknown", "Not OK").
//
// Short keywords are matched as whole words and negation is honoured, so
// "Not required" no longer reads as true and words that merely contain "no"
// or "ok" (such as "Unknown") no longer read as false.
func parseReplaceRequired(raw string) *bool {
	lower := strings.ToLower(strings.TrimSpace(raw))
	if lower == "" {
		return nil
	}

	// Split on everything that is not a lower-case ASCII letter or a digit.
	words := strings.FieldsFunc(lower, func(r rune) bool {
		return (r < 'a' || r > 'z') && (r < '0' || r > '9')
	})
	hasWord := func(candidates ...string) bool {
		for _, word := range words {
			for _, candidate := range candidates {
				if word == candidate {
					return true
				}
			}
		}
		return false
	}

	negated := hasWord("no", "not", "none")
	alarming := hasWord("yes") ||
		strings.Contains(lower, "replace") ||
		strings.Contains(lower, "failed") ||
		strings.Contains(lower, "required")
	healthy := hasWord("ok", "good", "optimal", "normal")

	switch {
	case alarming:
		// "replacement required" -> true, "not required" -> false.
		verdict := !negated
		return &verdict
	case healthy && negated:
		// "Not OK" says something is wrong, but not that a replacement is due.
		return nil
	case healthy, negated:
		verdict := false
		return &verdict
	default:
		return nil
	}
}
// batteryStateDescription returns a pointer to the unmodified raw value when
// it mentions a problem state (failed, fault, replace, warning or degraded,
// matched case-insensitively) and nil otherwise.
func batteryStateDescription(raw string) *string {
	normalized := strings.ToLower(strings.TrimSpace(raw))
	if normalized == "" {
		return nil
	}
	for _, marker := range [...]string{"failed", "fault", "replace", "warning", "degraded"} {
		if strings.Contains(normalized, marker) {
			return &raw
		}
	}
	return nil
}

View File

@@ -1,6 +1,10 @@
package collector package collector
import "testing" import (
"bee/audit/internal/schema"
"errors"
"testing"
)
func TestParseSASIrcuControllerIDs(t *testing.T) { func TestParseSASIrcuControllerIDs(t *testing.T) {
raw := `LSI Corporation SAS2 IR Configuration Utility. raw := `LSI Corporation SAS2 IR Configuration Utility.
@@ -90,7 +94,111 @@ physicaldrive 1I:1:2 (894 GB, SAS HDD, Failed)
if drives[0].Status == nil || *drives[0].Status != "OK" { if drives[0].Status == nil || *drives[0].Status != "OK" {
t.Fatalf("drive0 status: %v", drives[0].Status) t.Fatalf("drive0 status: %v", drives[0].Status)
} }
if drives[1].Status == nil || *drives[1].Status != "CRITICAL" { if drives[1].Status == nil || *drives[1].Status != statusCritical {
t.Fatalf("drive1 status: %v", drives[1].Status) t.Fatalf("drive1 status: %v", drives[1].Status)
} }
} }
// TestParseStorcliControllerTelemetry checks that every battery reading of a
// single-controller BBU_Info payload is extracted.
func TestParseStorcliControllerTelemetry(t *testing.T) {
	const payload = `{
"Controllers": [
{
"Response Data": {
"BBU_Info": {
"State of Health": "98 %",
"Relative State of Charge": "76 %",
"Temperature": "41 C",
"Voltage": "12.3 V",
"Replacement required": "No"
}
}
}
]
}`

	records := parseStorcliControllerTelemetry([]byte(payload))
	if n := len(records); n != 1 {
		t.Fatalf("len(got)=%d want 1", n)
	}

	first := records[0]
	if p := first.BatteryHealthPct; p == nil || *p != 98 {
		t.Fatalf("battery health=%v", p)
	}
	if p := first.BatteryChargePct; p == nil || *p != 76 {
		t.Fatalf("battery charge=%v", p)
	}
	if p := first.BatteryTemperatureC; p == nil || *p != 41 {
		t.Fatalf("battery temperature=%v", p)
	}
	if p := first.BatteryVoltageV; p == nil || *p != 12.3 {
		t.Fatalf("battery voltage=%v", p)
	}
	// "No" must be reported as an explicit false, not as a missing value.
	if p := first.BatteryReplaceRequired; p == nil || *p {
		t.Fatalf("battery replace=%v", p)
	}
}
// TestParseSSACLIControllerTelemetry checks that capacitor readings following
// a "Smart Array" header end up in a single controller record.
func TestParseSSACLIControllerTelemetry(t *testing.T) {
	const output = `Smart Array P440ar in Slot 0
Battery/Capacitor Count: 1
Capacitor Temperature (C): 37
Capacitor Charge (%): 94
Capacitor Health (%): 96
Capacitor Voltage (V): 9.8
Capacitor Failed: No
`

	records := parseSSACLIControllerTelemetry(output)
	if n := len(records); n != 1 {
		t.Fatalf("len(got)=%d want 1", n)
	}

	first := records[0]
	if p := first.BatteryTemperatureC; p == nil || *p != 37 {
		t.Fatalf("battery temperature=%v", p)
	}
	if p := first.BatteryChargePct; p == nil || *p != 94 {
		t.Fatalf("battery charge=%v", p)
	}
}
// TestEnrichPCIeWithRAIDTelemetry stubs the RAID tool runner so that only
// storcli64 answers, and checks that the cache-vault readings are attached to
// the mass-storage controller PCIe device.
func TestEnrichPCIeWithRAIDTelemetry(t *testing.T) {
	const storcliJSON = `{
"Controllers": [
{
"Response Data": {
"CV_Info": {
"State of Health": "99 %",
"Relative State of Charge": "81 %",
"Temperature": "38 C",
"Voltage": "12.1 V",
"Replacement required": "No"
}
}
}
]
}`

	// Swap the tool runner for the duration of the test only.
	saved := raidToolQuery
	t.Cleanup(func() { raidToolQuery = saved })
	raidToolQuery = func(name string, args ...string) ([]byte, error) {
		if name == "storcli64" {
			return []byte(storcliJSON), nil
		}
		return nil, errors.New("skip")
	}

	vendorID := vendorBroadcomLSI
	deviceClass := "MassStorageController"
	health := statusOK
	controller := schema.HardwarePCIeDevice{
		VendorID:                &vendorID,
		DeviceClass:             &deviceClass,
		HardwareComponentStatus: schema.HardwareComponentStatus{Status: &health},
	}

	enriched := enrichPCIeWithRAIDTelemetry([]schema.HardwarePCIeDevice{controller})
	if p := enriched[0].BatteryHealthPct; p == nil || *p != 99 {
		t.Fatalf("battery health=%v", p)
	}
	if p := enriched[0].BatteryChargePct; p == nil || *p != 81 {
		t.Fatalf("battery charge=%v", p)
	}
	if p := enriched[0].BatteryVoltageV; p == nil || *p != 12.1 {
		t.Fatalf("battery voltage=%v", p)
	}
}

View File

@@ -0,0 +1,373 @@
package collector
import (
"bee/audit/internal/schema"
"encoding/json"
"log/slog"
"os/exec"
"sort"
"strconv"
"strings"
)
// sensorsDoc is the decoded form of `sensors -j` output: the outer key is the
// chip name, the inner key is a feature name (or "Adapter"), and the value is
// whatever JSON the tool emitted for that entry.
type sensorsDoc map[string]map[string]any
// collectSensors reads the sensors JSON document and converts it into the
// schema representation. It returns nil when the document cannot be read or
// when no fan, power, temperature or other sensor was found.
func collectSensors() *schema.HardwareSensors {
	doc, err := readSensorsJSONDoc()
	if err != nil {
		slog.Info("sensors: unavailable, skipping", "err", err)
		return nil
	}

	collected := buildSensorsFromDoc(doc)
	if collected == nil {
		return nil
	}
	fans := len(collected.Fans)
	power := len(collected.Power)
	temperatures := len(collected.Temperatures)
	other := len(collected.Other)
	if fans+power+temperatures+other == 0 {
		return nil
	}

	slog.Info("sensors: collected",
		"fans", fans,
		"power", power,
		"temperatures", temperatures,
		"other", other,
	)
	return collected
}
// readSensorsJSONDoc runs `sensors -j` and decodes its standard output.
// The error of the command or of the JSON decoder is returned unchanged.
func readSensorsJSONDoc() (sensorsDoc, error) {
	stdout, runErr := exec.Command("sensors", "-j").Output()
	if runErr != nil {
		return nil, runErr
	}

	var decoded sensorsDoc
	if decodeErr := json.Unmarshal(stdout, &decoded); decodeErr != nil {
		return nil, decodeErr
	}
	return decoded, nil
}
// buildSensorsFromDoc converts a decoded sensors document into schema sensors.
// Chips and their features are visited in sorted order so the output is
// stable. The "Adapter" entry, entries that are not JSON objects and blank
// feature names are skipped. Each remaining feature is classified and turned
// into a fan, temperature, power or other sensor. It returns nil for an empty
// document.
//
// NOTE(review): duplicates are detected by sensor type and name only, and the
// set is shared across chips, so a feature name that repeats on a second chip
// is dropped — confirm this is what the ingest side expects.
func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
	if len(doc) == 0 {
		return nil
	}

	chipNames := make([]string, 0, len(doc))
	for chipName := range doc {
		chipNames = append(chipNames, chipName)
	}
	sort.Strings(chipNames)

	sensors := new(schema.HardwareSensors)
	emitted := make(map[string]struct{})

	for _, chipName := range chipNames {
		chipFeatures := doc[chipName]
		where := sensorLocation(chipName)

		featureNames := make([]string, 0, len(chipFeatures))
		for featureName := range chipFeatures {
			featureNames = append(featureNames, featureName)
		}
		sort.Strings(featureNames)

		for _, featureName := range featureNames {
			if strings.EqualFold(featureName, "Adapter") {
				continue
			}
			readings, isObject := chipFeatures[featureName].(map[string]any)
			if !isObject {
				continue
			}
			label := strings.TrimSpace(featureName)
			if label == "" {
				continue
			}

			switch classifySensorFeature(readings) {
			case "fan":
				if fan := buildFanSensor(label, where, readings); fan != nil && !duplicateSensor(emitted, "fan", fan.Name) {
					sensors.Fans = append(sensors.Fans, *fan)
				}
			case "temp":
				if temp := buildTempSensor(label, where, readings); temp != nil && !duplicateSensor(emitted, "temp", temp.Name) {
					sensors.Temperatures = append(sensors.Temperatures, *temp)
				}
			case "power":
				if power := buildPowerSensor(label, where, readings); power != nil && !duplicateSensor(emitted, "power", power.Name) {
					sensors.Power = append(sensors.Power, *power)
				}
			default:
				if other := buildOtherSensor(label, where, readings); other != nil && !duplicateSensor(emitted, "other", other.Name) {
					sensors.Other = append(sensors.Other, *other)
				}
			}
		}
	}
	return sensors
}
// parseSensorsJSON decodes raw sensors JSON and builds the schema sensors
// from it. A decoding failure is returned as the error.
func parseSensorsJSON(raw []byte) (*schema.HardwareSensors, error) {
	var decoded sensorsDoc
	if err := json.Unmarshal(raw, &decoded); err != nil {
		return nil, err
	}
	sensors := buildSensorsFromDoc(decoded)
	return sensors, nil
}
// duplicateSensor reports whether the (sensorType, name) pair was recorded in
// seen before. A pair that is new is recorded as a side effect.
func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
	// NUL separates the two parts of the composite key.
	id := sensorType + "\x00" + name
	_, known := seen[id]
	if !known {
		seen[id] = struct{}{}
	}
	return known
}
// sensorLocation returns a pointer to the whitespace-trimmed chip name, or
// nil when nothing is left after trimming.
func sensorLocation(chip string) *string {
	if trimmed := strings.TrimSpace(chip); trimmed != "" {
		return &trimmed
	}
	return nil
}
// classifySensorFeature derives the sensor category from the sub-feature
// names of feature. It returns "fan", "temp" or "power" for the first name
// that matches one of the rules below, and "other" when none matches. Current
// ("curr…_input") and voltage ("in…_input") readings count as "power".
func classifySensorFeature(feature map[string]any) string {
	for name := range feature {
		isInput := strings.HasSuffix(name, "_input")
		if isInput && strings.Contains(name, "fan") {
			return "fan"
		}
		if isInput && strings.Contains(name, "temp") {
			return "temp"
		}
		if strings.Contains(name, "power") && (isInput || strings.HasSuffix(name, "_average")) {
			return "power"
		}
		if isInput && (strings.Contains(name, "curr") || strings.HasPrefix(name, "in")) {
			return "power"
		}
	}
	return "other"
}
// buildFanSensor creates a fan sensor from the "_input" reading of feature,
// converted to an integer RPM value. It returns nil when there is no such
// reading. The status is set only when an alarm is raised.
func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
	speed, found := firstFeatureInt(feature, "_input")
	if !found {
		return nil
	}

	sensor := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &speed}
	alarm := sensorStatusFromFeature(feature)
	if alarm != nil {
		sensor.Status = alarm
	}
	return sensor
}
// buildTempSensor creates a temperature sensor from the "_input" reading of
// feature and returns nil when there is none. The warning threshold comes
// from a "_max" or "_high" value, the critical threshold from a "_crit" or
// "_emergency" value. A raised alarm determines the status; without one the
// status is derived from the reading and the thresholds.
func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
	reading, found := firstFeatureFloat(feature, "_input")
	if !found {
		return nil
	}

	sensor := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &reading}
	if warn, hasWarn := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); hasWarn {
		sensor.ThresholdWarningCelsius = &warn
	}
	if crit, hasCrit := firstFeatureFloatWithSuffixes(feature, []string{"_crit", "_emergency"}); hasCrit {
		sensor.ThresholdCriticalCelsius = &crit
	}

	alarm := sensorStatusFromFeature(feature)
	if alarm == nil {
		alarm = deriveTemperatureStatus(sensor.Celsius, sensor.ThresholdWarningCelsius, sensor.ThresholdCriticalCelsius)
	}
	sensor.Status = alarm
	return sensor
}
// buildPowerSensor creates a power sensor carrying whichever of power (W),
// current (A) and voltage (V) the feature provides, and returns nil when it
// provides none of them. The status is set only when an alarm is raised.
//
// The power reading is taken from a measurement sub-feature only: an
// "…_average" value is preferred, an "…_input" value is the fallback.
// Matching on "power" alone would pick whichever key sorts first, and
// "power1_alarm" or "power1_cap" sort ahead of the measurement keys, so an
// alarm flag or a configured cap would be reported as watts.
func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
	item := &schema.HardwarePowerSensor{Name: name, Location: location}

	if v, ok := firstFeatureFloatWithContains(feature, []string{"power", "_average"}); ok {
		item.PowerW = &v
	} else if v, ok := firstFeatureFloatWithContains(feature, []string{"power", "_input"}); ok {
		item.PowerW = &v
	}
	if v, ok := firstFeatureFloatWithPrefix(feature, "curr"); ok {
		item.CurrentA = &v
	}
	if v, ok := firstFeatureFloatWithPrefix(feature, "in"); ok {
		item.VoltageV = &v
	}
	if item.PowerW == nil && item.CurrentA == nil && item.VoltageV == nil {
		return nil
	}

	if status := sensorStatusFromFeature(feature); status != nil {
		item.Status = status
	}
	return item
}
// buildOtherSensor creates a generic sensor from the first numeric,
// non-alarm value of feature and returns nil when there is none. The unit is
// set only when one could be inferred, the status only when an alarm is raised.
func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
	reading, unit, found := firstGenericSensorValue(feature)
	if !found {
		return nil
	}

	sensor := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &reading}
	if len(unit) > 0 {
		sensor.Unit = &unit
	}
	alarm := sensorStatusFromFeature(feature)
	if alarm != nil {
		sensor.Status = alarm
	}
	return sensor
}
// sensorStatusFromFeature returns a warning status when any "…_alarm" value
// of feature is numeric and greater than zero, and nil otherwise.
func sensorStatusFromFeature(feature map[string]any) *string {
	for name, value := range feature {
		if strings.HasSuffix(name, "_alarm") {
			if level, ok := floatFromAny(value); ok && level > 0 {
				warning := statusWarning
				return &warning
			}
		}
	}
	return nil
}
func deriveTemperatureStatus(current, warning, critical *float64) *string {
if current == nil {
return nil
}
switch {
case critical != nil && *current >= *critical:
status := statusCritical
return &status
case warning != nil && *current >= *warning:
status := statusWarning
return &status
default:
status := statusOK
return &status
}
}
// firstFeatureInt returns the first numeric value whose key ends with suffix,
// truncated to an int, and false when there is no such value.
//
// Matching keys are visited in sorted order, like the other firstFeature…
// helpers do, so the result does not depend on Go's randomized map iteration
// when more than one key matches.
func firstFeatureInt(feature map[string]any, suffix string) (int, bool) {
	matches := make([]string, 0, len(feature))
	for key := range feature {
		if strings.HasSuffix(key, suffix) {
			matches = append(matches, key)
		}
	}
	sort.Strings(matches)

	for _, key := range matches {
		if value, ok := floatFromAny(feature[key]); ok {
			return int(value), true
		}
	}
	return 0, false
}
// firstFeatureFloat is the single-suffix form of firstFeatureFloatWithSuffixes.
func firstFeatureFloat(feature map[string]any, suffix string) (float64, bool) {
	suffixes := []string{suffix}
	return firstFeatureFloatWithSuffixes(feature, suffixes)
}
// firstFeatureFloatWithSuffixes walks the keys of feature in sorted order and
// returns the first numeric value whose key ends with one of suffixes. It
// returns false when no such value exists.
func firstFeatureFloatWithSuffixes(feature map[string]any, suffixes []string) (float64, bool) {
	for _, name := range sortedFeatureKeys(feature) {
		for i := range suffixes {
			if !strings.HasSuffix(name, suffixes[i]) {
				continue
			}
			if reading, ok := floatFromAny(feature[name]); ok {
				return reading, true
			}
		}
	}
	return 0, false
}
// firstFeatureFloatWithContains walks the keys of feature in sorted order and
// returns the first numeric value whose key contains every string in parts.
// It returns false when no such value exists.
func firstFeatureFloatWithContains(feature map[string]any, parts []string) (float64, bool) {
	containsAll := func(name string) bool {
		for i := range parts {
			if !strings.Contains(name, parts[i]) {
				return false
			}
		}
		return true
	}

	for _, name := range sortedFeatureKeys(feature) {
		if !containsAll(name) {
			continue
		}
		if reading, ok := floatFromAny(feature[name]); ok {
			return reading, true
		}
	}
	return 0, false
}
// firstFeatureFloatWithPrefix walks the keys of feature in sorted order and
// returns the first numeric value whose key starts with prefix and ends with
// "_input". It returns false when no such value exists.
func firstFeatureFloatWithPrefix(feature map[string]any, prefix string) (float64, bool) {
	for _, name := range sortedFeatureKeys(feature) {
		if !strings.HasPrefix(name, prefix) || !strings.HasSuffix(name, "_input") {
			continue
		}
		if reading, ok := floatFromAny(feature[name]); ok {
			return reading, true
		}
	}
	return 0, false
}
// firstGenericSensorValue walks the keys of feature in sorted order and
// returns the first numeric value that is not an "…_alarm" flag, together
// with the unit inferred from its key. It returns false when no such value
// exists.
func firstGenericSensorValue(feature map[string]any) (float64, string, bool) {
	for _, name := range sortedFeatureKeys(feature) {
		if strings.HasSuffix(name, "_alarm") {
			continue
		}
		if reading, ok := floatFromAny(feature[name]); ok {
			return reading, inferSensorUnit(name), true
		}
	}
	return 0, "", false
}
// inferSensorUnit returns "%" for humidity readings and an empty string for
// every other key.
func inferSensorUnit(key string) string {
	if strings.Contains(key, "humidity") {
		return "%"
	}
	return ""
}
// sortedFeatureKeys returns the keys of feature in ascending order. The
// result is never nil.
func sortedFeatureKeys(feature map[string]any) []string {
	names := make([]string, len(feature))
	next := 0
	for name := range feature {
		names[next] = name
		next++
	}
	sort.Strings(names)
	return names
}
// floatFromAny converts a decoded JSON value to float64. Native numeric types
// (float64, float32, int, int64) are converted directly; json.Number and
// non-empty strings are parsed. Every other type and every parse failure
// yields (0, false).
func floatFromAny(raw any) (float64, bool) {
	var (
		parsed float64
		err    error
	)
	switch v := raw.(type) {
	case float64:
		return v, true
	case float32:
		return float64(v), true
	case int:
		return float64(v), true
	case int64:
		return float64(v), true
	case json.Number:
		parsed, err = v.Float64()
	case string:
		if v == "" {
			return 0, false
		}
		parsed, err = strconv.ParseFloat(v, 64)
	default:
		return 0, false
	}
	if err != nil {
		return 0, false
	}
	return parsed, true
}

View File

@@ -0,0 +1,54 @@
package collector
import "testing"
// TestParseSensorsJSON feeds a two-chip document through parseSensorsJSON and
// checks how the features are distributed over the sensor categories: one
// temperature, one fan, three power entries (voltage, current, power) and
// one "other" entry carrying the humidity unit.
func TestParseSensorsJSON(t *testing.T) {
	const document = `{
"coretemp-isa-0000": {
"Adapter": "ISA adapter",
"Package id 0": {
"temp1_input": 61.5,
"temp1_max": 80.0,
"temp1_crit": 95.0
},
"fan1": {
"fan1_input": 4200
}
},
"acpitz-acpi-0": {
"Adapter": "ACPI interface",
"in0": {
"in0_input": 12.06
},
"curr1": {
"curr1_input": 0.64
},
"power1": {
"power1_average": 137.0
},
"humidity1": {
"humidity1_input": 38.5
}
}
}`

	sensors, err := parseSensorsJSON([]byte(document))
	if err != nil {
		t.Fatalf("parseSensorsJSON error: %v", err)
	}
	if sensors == nil {
		t.Fatal("expected sensors")
	}

	temps := sensors.Temperatures
	if len(temps) != 1 || temps[0].Celsius == nil || *temps[0].Celsius != 61.5 {
		t.Fatalf("temperatures mismatch: %#v", temps)
	}
	fans := sensors.Fans
	if len(fans) != 1 || fans[0].RPM == nil || *fans[0].RPM != 4200 {
		t.Fatalf("fans mismatch: %#v", fans)
	}
	if power := sensors.Power; len(power) != 3 {
		t.Fatalf("power sensors mismatch: %#v", power)
	}
	other := sensors.Other
	if len(other) != 1 || other[0].Unit == nil || *other[0].Unit != "%" {
		t.Fatalf("other sensors mismatch: %#v", other)
	}
}

View File

@@ -26,13 +26,13 @@ func collectStorage() []schema.HardwareStorage {
// lsblkDevice is a minimal lsblk JSON record. // lsblkDevice is a minimal lsblk JSON record.
type lsblkDevice struct { type lsblkDevice struct {
Name string `json:"name"` Name string `json:"name"`
Type string `json:"type"` Type string `json:"type"`
Size string `json:"size"` Size string `json:"size"`
Serial string `json:"serial"` Serial string `json:"serial"`
Model string `json:"model"` Model string `json:"model"`
Tran string `json:"tran"` Tran string `json:"tran"`
Hctl string `json:"hctl"` Hctl string `json:"hctl"`
} }
type lsblkRoot struct { type lsblkRoot struct {
@@ -67,7 +67,10 @@ type smartctlInfo struct {
SerialNumber string `json:"serial_number"` SerialNumber string `json:"serial_number"`
FirmwareVer string `json:"firmware_version"` FirmwareVer string `json:"firmware_version"`
RotationRate int `json:"rotation_rate"` RotationRate int `json:"rotation_rate"`
SmartStatus struct { Temperature struct {
Current int `json:"current"`
} `json:"temperature"`
SmartStatus struct {
Passed bool `json:"passed"` Passed bool `json:"passed"`
} `json:"smart_status"` } `json:"smart_status"`
UserCapacity struct { UserCapacity struct {
@@ -75,9 +78,11 @@ type smartctlInfo struct {
} `json:"user_capacity"` } `json:"user_capacity"`
AtaSmartAttributes struct { AtaSmartAttributes struct {
Table []struct { Table []struct {
ID int `json:"id"` ID int `json:"id"`
Name string `json:"name"` Name string `json:"name"`
Raw struct{ Value int64 `json:"value"` } `json:"raw"` Raw struct {
Value int64 `json:"value"`
} `json:"raw"`
} `json:"table"` } `json:"table"`
} `json:"ata_smart_attributes"` } `json:"ata_smart_attributes"`
PowerOnTime struct { PowerOnTime struct {
@@ -130,7 +135,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
return s return s
} }
var info smartctlInfo var info smartctlInfo
if err := json.Unmarshal(out, &info); err == nil { if err := json.Unmarshal(out, &info); err == nil {
if v := cleanDMIValue(info.ModelName); v != "" { if v := cleanDMIValue(info.ModelName); v != "" {
s.Model = &v s.Model = &v
@@ -152,14 +157,19 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
} else if info.RotationRate > 0 { } else if info.RotationRate > 0 {
devType = "HDD" devType = "HDD"
} }
s.Type = &devType
// telemetry if info.Temperature.Current > 0 {
tel := map[string]any{} t := float64(info.Temperature.Current)
s.TemperatureC = &t
}
if info.PowerOnTime.Hours > 0 { if info.PowerOnTime.Hours > 0 {
tel["power_on_hours"] = info.PowerOnTime.Hours v := int64(info.PowerOnTime.Hours)
s.PowerOnHours = &v
} }
if info.PowerCycleCount > 0 { if info.PowerCycleCount > 0 {
tel["power_cycles"] = info.PowerCycleCount v := int64(info.PowerCycleCount)
s.PowerCycles = &v
} }
reallocated := int64(0) reallocated := int64(0)
pending := int64(0) pending := int64(0)
@@ -169,77 +179,79 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
switch attr.ID { switch attr.ID {
case 5: case 5:
reallocated = attr.Raw.Value reallocated = attr.Raw.Value
tel["reallocated_sectors"] = attr.Raw.Value s.ReallocatedSectors = &reallocated
case 177: case 177:
tel["wear_leveling_pct"] = attr.Raw.Value value := float64(attr.Raw.Value)
s.LifeUsedPct = &value
case 231: case 231:
lifeRemaining = attr.Raw.Value lifeRemaining = attr.Raw.Value
tel["life_remaining_pct"] = attr.Raw.Value value := float64(attr.Raw.Value)
s.LifeRemainingPct = &value
case 241: case 241:
tel["total_lba_written"] = attr.Raw.Value value := attr.Raw.Value
s.WrittenBytes = &value
case 197: case 197:
pending = attr.Raw.Value pending = attr.Raw.Value
tel["current_pending_sectors"] = attr.Raw.Value s.CurrentPendingSectors = &pending
case 198: case 198:
uncorrectable = attr.Raw.Value uncorrectable = attr.Raw.Value
tel["offline_uncorrectable"] = attr.Raw.Value s.OfflineUncorrectable = &uncorrectable
} }
} }
if len(tel) > 0 {
s.Telemetry = tel
}
status := storageHealthStatus{ status := storageHealthStatus{
overallPassed: info.SmartStatus.Passed, overallPassed: info.SmartStatus.Passed,
hasOverall: true, hasOverall: true,
reallocatedSectors: reallocated, reallocatedSectors: reallocated,
pendingSectors: pending, pendingSectors: pending,
offlineUncorrectable: uncorrectable, offlineUncorrectable: uncorrectable,
lifeRemainingPct: lifeRemaining, lifeRemainingPct: lifeRemaining,
} }
setStorageHealthStatus(&s, status) setStorageHealthStatus(&s, status)
return s return s
} }
s.Type = &devType s.Type = &devType
status := "UNKNOWN" status := statusUnknown
s.Status = &status s.Status = &status
return s return s
} }
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about. // nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
type nvmeSmartLog struct { type nvmeSmartLog struct {
CriticalWarning int `json:"critical_warning"` CriticalWarning int `json:"critical_warning"`
PercentageUsed int `json:"percentage_used"` PercentageUsed int `json:"percentage_used"`
AvailableSpare int `json:"available_spare"` AvailableSpare int `json:"available_spare"`
SpareThreshold int `json:"spare_thresh"` SpareThreshold int `json:"spare_thresh"`
PowerOnHours int64 `json:"power_on_hours"` Temperature int64 `json:"temperature"`
PowerCycles int64 `json:"power_cycles"` PowerOnHours int64 `json:"power_on_hours"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"` PowerCycles int64 `json:"power_cycles"`
DataUnitsWritten int64 `json:"data_units_written"` UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
ControllerBusy int64 `json:"controller_busy_time"` DataUnitsRead int64 `json:"data_units_read"`
MediaErrors int64 `json:"media_errors"` DataUnitsWritten int64 `json:"data_units_written"`
NumErrLogEntries int64 `json:"num_err_log_entries"` ControllerBusy int64 `json:"controller_busy_time"`
MediaErrors int64 `json:"media_errors"`
NumErrLogEntries int64 `json:"num_err_log_entries"`
} }
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output. // nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
type nvmeIDCtrl struct { type nvmeIDCtrl struct {
ModelNumber string `json:"mn"` ModelNumber string `json:"mn"`
SerialNumber string `json:"sn"` SerialNumber string `json:"sn"`
FirmwareRev string `json:"fr"` FirmwareRev string `json:"fr"`
TotalCapacity int64 `json:"tnvmcap"` TotalCapacity int64 `json:"tnvmcap"`
} }
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage { func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
present := true present := true
devType := "NVMe" devType := "NVMe"
iface := "NVMe" iface := "NVMe"
status := "OK" status := statusOK
s := schema.HardwareStorage{ s := schema.HardwareStorage{
Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
Type: &devType, Present: &present,
Interface: &iface, Type: &devType,
Status: &status, Interface: &iface,
} }
devPath := "/dev/" + dev.Name devPath := "/dev/" + dev.Name
@@ -268,100 +280,123 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil { if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
var log nvmeSmartLog var log nvmeSmartLog
if json.Unmarshal(out, &log) == nil { if json.Unmarshal(out, &log) == nil {
tel := map[string]any{}
if log.CriticalWarning > 0 {
tel["critical_warning"] = log.CriticalWarning
}
if log.PowerOnHours > 0 { if log.PowerOnHours > 0 {
tel["power_on_hours"] = log.PowerOnHours s.PowerOnHours = &log.PowerOnHours
} }
if log.PowerCycles > 0 { if log.PowerCycles > 0 {
tel["power_cycles"] = log.PowerCycles s.PowerCycles = &log.PowerCycles
} }
if log.UnsafeShutdowns > 0 { if log.UnsafeShutdowns > 0 {
tel["unsafe_shutdowns"] = log.UnsafeShutdowns s.UnsafeShutdowns = &log.UnsafeShutdowns
} }
if log.PercentageUsed > 0 { if log.PercentageUsed > 0 {
tel["percentage_used"] = log.PercentageUsed v := float64(log.PercentageUsed)
s.LifeUsedPct = &v
remaining := 100 - v
s.LifeRemainingPct = &remaining
} }
if log.DataUnitsWritten > 0 { if log.DataUnitsWritten > 0 {
tel["data_units_written"] = log.DataUnitsWritten v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
s.WrittenBytes = &v
} }
if log.ControllerBusy > 0 { if log.DataUnitsRead > 0 {
tel["controller_busy_time"] = log.ControllerBusy v := nvmeDataUnitsToBytes(log.DataUnitsRead)
s.ReadBytes = &v
} }
if log.AvailableSpare > 0 { if log.AvailableSpare > 0 {
tel["available_spare_pct"] = log.AvailableSpare v := float64(log.AvailableSpare)
} s.AvailableSparePct = &v
if log.SpareThreshold > 0 {
tel["available_spare_threshold_pct"] = log.SpareThreshold
} }
if log.MediaErrors > 0 { if log.MediaErrors > 0 {
tel["media_errors"] = log.MediaErrors s.MediaErrors = &log.MediaErrors
} }
if log.NumErrLogEntries > 0 { if log.NumErrLogEntries > 0 {
tel["error_log_entries"] = log.NumErrLogEntries s.ErrorLogEntries = &log.NumErrLogEntries
} }
if len(tel) > 0 { if log.Temperature > 0 {
s.Telemetry = tel v := float64(log.Temperature - 273)
s.TemperatureC = &v
} }
setStorageHealthStatus(&s, storageHealthStatus{ setStorageHealthStatus(&s, storageHealthStatus{
criticalWarning: log.CriticalWarning, criticalWarning: log.CriticalWarning,
percentageUsed: int64(log.PercentageUsed), percentageUsed: int64(log.PercentageUsed),
availableSpare: int64(log.AvailableSpare), availableSpare: int64(log.AvailableSpare),
spareThreshold: int64(log.SpareThreshold), spareThreshold: int64(log.SpareThreshold),
unsafeShutdowns: log.UnsafeShutdowns, unsafeShutdowns: log.UnsafeShutdowns,
mediaErrors: log.MediaErrors, mediaErrors: log.MediaErrors,
errorLogEntries: log.NumErrLogEntries, errorLogEntries: log.NumErrLogEntries,
}) })
return s return s
} }
} }
status = "UNKNOWN" status = statusUnknown
s.Status = &status s.Status = &status
return s return s
} }
func nvmeDataUnitsToBytes(units int64) int64 {
if units <= 0 {
return 0
}
return units * 512000
}
type storageHealthStatus struct { type storageHealthStatus struct {
hasOverall bool hasOverall bool
overallPassed bool overallPassed bool
reallocatedSectors int64 reallocatedSectors int64
pendingSectors int64 pendingSectors int64
offlineUncorrectable int64 offlineUncorrectable int64
lifeRemainingPct int64 lifeRemainingPct int64
criticalWarning int criticalWarning int
percentageUsed int64 percentageUsed int64
availableSpare int64 availableSpare int64
spareThreshold int64 spareThreshold int64
unsafeShutdowns int64 unsafeShutdowns int64
mediaErrors int64 mediaErrors int64
errorLogEntries int64 errorLogEntries int64
} }
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) { func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
status := "OK" status := statusOK
var description *string
switch { switch {
case health.hasOverall && !health.overallPassed: case health.hasOverall && !health.overallPassed:
status = "FAILED" status = statusCritical
description = stringPtr("SMART overall self-assessment failed")
case health.criticalWarning > 0: case health.criticalWarning > 0:
status = "FAILED" status = statusCritical
description = stringPtr("NVMe critical warning is set")
case health.pendingSectors > 0 || health.offlineUncorrectable > 0: case health.pendingSectors > 0 || health.offlineUncorrectable > 0:
status = "FAILED" status = statusCritical
description = stringPtr("Pending or offline uncorrectable sectors detected")
case health.mediaErrors > 0: case health.mediaErrors > 0:
status = "WARNING" status = statusWarning
description = stringPtr("Media errors reported")
case health.reallocatedSectors > 0: case health.reallocatedSectors > 0:
status = "WARNING" status = statusWarning
description = stringPtr("Reallocated sectors detected")
case health.errorLogEntries > 0: case health.errorLogEntries > 0:
status = "WARNING" status = statusWarning
description = stringPtr("Device error log contains entries")
case health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10: case health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10:
status = "WARNING" status = statusWarning
description = stringPtr("Life remaining is low")
case health.percentageUsed >= 95: case health.percentageUsed >= 95:
status = "WARNING" status = statusWarning
description = stringPtr("Drive wear level is high")
case health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold: case health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold:
status = "WARNING" status = statusWarning
description = stringPtr("Available spare is at or below threshold")
case health.unsafeShutdowns > 100: case health.unsafeShutdowns > 100:
status = "WARNING" status = statusWarning
description = stringPtr("Unsafe shutdown count is high")
} }
s.Status = &status s.Status = &status
s.ErrorDescription = description
}
func stringPtr(value string) *string {
return &value
} }

View File

@@ -17,37 +17,37 @@ func TestSetStorageHealthStatus(t *testing.T) {
{ {
name: "smart overall failed", name: "smart overall failed",
health: storageHealthStatus{hasOverall: true, overallPassed: false}, health: storageHealthStatus{hasOverall: true, overallPassed: false},
want: "FAILED", want: statusCritical,
}, },
{ {
name: "nvme critical warning", name: "nvme critical warning",
health: storageHealthStatus{criticalWarning: 1}, health: storageHealthStatus{criticalWarning: 1},
want: "FAILED", want: statusCritical,
}, },
{ {
name: "pending sectors", name: "pending sectors",
health: storageHealthStatus{pendingSectors: 1}, health: storageHealthStatus{pendingSectors: 1},
want: "FAILED", want: statusCritical,
}, },
{ {
name: "media errors warning", name: "media errors warning",
health: storageHealthStatus{mediaErrors: 2}, health: storageHealthStatus{mediaErrors: 2},
want: "WARNING", want: statusWarning,
}, },
{ {
name: "reallocated warning", name: "reallocated warning",
health: storageHealthStatus{reallocatedSectors: 1}, health: storageHealthStatus{reallocatedSectors: 1},
want: "WARNING", want: statusWarning,
}, },
{ {
name: "life remaining low", name: "life remaining low",
health: storageHealthStatus{lifeRemainingPct: 8}, health: storageHealthStatus{lifeRemainingPct: 8},
want: "WARNING", want: statusWarning,
}, },
{ {
name: "healthy", name: "healthy",
health: storageHealthStatus{}, health: storageHealthStatus{},
want: "OK", want: statusOK,
}, },
} }

View File

@@ -6,31 +6,31 @@ import (
"time" "time"
) )
func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSummary { func BuildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSummary {
summary := &schema.HardwareHealthSummary{ summary := &schema.HardwareHealthSummary{
Status: "OK", Status: statusOK,
CollectedAt: time.Now().UTC().Format(time.RFC3339), CollectedAt: time.Now().UTC().Format(time.RFC3339),
} }
for _, dimm := range snap.Memory { for _, dimm := range snap.Memory {
switch derefString(dimm.Status) { switch derefString(dimm.Status) {
case "WARNING": case statusWarning:
summary.MemoryWarn++ summary.MemoryWarn++
summary.Warnings = append(summary.Warnings, formatMemorySummary(dimm)) summary.Warnings = append(summary.Warnings, formatMemorySummary(dimm))
case "FAILED": case statusCritical:
summary.MemoryFail++ summary.MemoryFail++
summary.Failures = append(summary.Failures, formatMemorySummary(dimm)) summary.Failures = append(summary.Failures, formatMemorySummary(dimm))
case "EMPTY": case statusEmpty:
summary.EmptyDIMMs++ summary.EmptyDIMMs++
} }
} }
for _, disk := range snap.Storage { for _, disk := range snap.Storage {
switch derefString(disk.Status) { switch derefString(disk.Status) {
case "WARNING": case statusWarning:
summary.StorageWarn++ summary.StorageWarn++
summary.Warnings = append(summary.Warnings, formatStorageSummary(disk)) summary.Warnings = append(summary.Warnings, formatStorageSummary(disk))
case "FAILED": case statusCritical:
summary.StorageFail++ summary.StorageFail++
summary.Failures = append(summary.Failures, formatStorageSummary(disk)) summary.Failures = append(summary.Failures, formatStorageSummary(disk))
} }
@@ -38,10 +38,10 @@ func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSumm
for _, dev := range snap.PCIeDevices { for _, dev := range snap.PCIeDevices {
switch derefString(dev.Status) { switch derefString(dev.Status) {
case "WARNING": case statusWarning:
summary.PCIeWarn++ summary.PCIeWarn++
summary.Warnings = append(summary.Warnings, formatPCIeSummary(dev)) summary.Warnings = append(summary.Warnings, formatPCIeSummary(dev))
case "FAILED": case statusCritical:
summary.PCIeFail++ summary.PCIeFail++
summary.Failures = append(summary.Failures, formatPCIeSummary(dev)) summary.Failures = append(summary.Failures, formatPCIeSummary(dev))
} }
@@ -52,19 +52,19 @@ func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSumm
summary.MissingPSUs++ summary.MissingPSUs++
} }
switch derefString(psu.Status) { switch derefString(psu.Status) {
case "WARNING": case statusWarning:
summary.PSUWarn++ summary.PSUWarn++
summary.Warnings = append(summary.Warnings, formatPSUSummary(psu)) summary.Warnings = append(summary.Warnings, formatPSUSummary(psu))
case "FAILED": case statusCritical:
summary.PSUFail++ summary.PSUFail++
summary.Failures = append(summary.Failures, formatPSUSummary(psu)) summary.Failures = append(summary.Failures, formatPSUSummary(psu))
} }
} }
if len(summary.Failures) > 0 || summary.StorageFail > 0 || summary.PCIeFail > 0 || summary.PSUFail > 0 || summary.MemoryFail > 0 { if len(summary.Failures) > 0 || summary.StorageFail > 0 || summary.PCIeFail > 0 || summary.PSUFail > 0 || summary.MemoryFail > 0 {
summary.Status = "FAILED" summary.Status = statusCritical
} else if len(summary.Warnings) > 0 || summary.StorageWarn > 0 || summary.PCIeWarn > 0 || summary.PSUWarn > 0 || summary.MemoryWarn > 0 { } else if len(summary.Warnings) > 0 || summary.StorageWarn > 0 || summary.PCIeWarn > 0 || summary.PSUWarn > 0 || summary.MemoryWarn > 0 {
summary.Status = "WARNING" summary.Status = statusWarning
} }
if len(summary.Warnings) == 0 { if len(summary.Warnings) == 0 {

View File

@@ -31,7 +31,7 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
func TestHasVROCController(t *testing.T) { func TestHasVROCController(t *testing.T) {
intel := vendorIntel intel := vendorIntel
model := "Volume Management Device NVMe RAID Controller" model := "Volume Management Device NVMe RAID Controller"
class := "RAID bus controller" class := "MassStorageController"
tests := []struct { tests := []struct {
name string name string
pcie []schema.HardwarePCIeDevice pcie []schema.HardwarePCIeDevice

View File

@@ -5,10 +5,10 @@ package schema
// HardwareIngestRequest is the top-level output document produced by `bee audit`. // HardwareIngestRequest is the top-level output document produced by `bee audit`.
// It is accepted as-is by the core /api/ingest/hardware endpoint. // It is accepted as-is by the core /api/ingest/hardware endpoint.
type HardwareIngestRequest struct { type HardwareIngestRequest struct {
Filename *string `json:"filename"` Filename *string `json:"filename,omitempty"`
SourceType *string `json:"source_type"` SourceType *string `json:"source_type,omitempty"`
Protocol *string `json:"protocol"` Protocol *string `json:"protocol,omitempty"`
TargetHost string `json:"target_host"` TargetHost *string `json:"target_host,omitempty"`
CollectedAt string `json:"collected_at"` CollectedAt string `json:"collected_at"`
Hardware HardwareSnapshot `json:"hardware"` Hardware HardwareSnapshot `json:"hardware"`
} }
@@ -21,32 +21,32 @@ type HardwareSnapshot struct {
Storage []HardwareStorage `json:"storage,omitempty"` Storage []HardwareStorage `json:"storage,omitempty"`
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"` PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"` PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
Summary *HardwareHealthSummary `json:"summary,omitempty"` Sensors *HardwareSensors `json:"sensors,omitempty"`
} }
type HardwareHealthSummary struct { type HardwareHealthSummary struct {
Status string `json:"status"` Status string `json:"status"`
Warnings []string `json:"warnings,omitempty"` Warnings []string `json:"warnings,omitempty"`
Failures []string `json:"failures,omitempty"` Failures []string `json:"failures,omitempty"`
StorageWarn int `json:"storage_warn,omitempty"` StorageWarn int `json:"storage_warn,omitempty"`
StorageFail int `json:"storage_fail,omitempty"` StorageFail int `json:"storage_fail,omitempty"`
PCIeWarn int `json:"pcie_warn,omitempty"` PCIeWarn int `json:"pcie_warn,omitempty"`
PCIeFail int `json:"pcie_fail,omitempty"` PCIeFail int `json:"pcie_fail,omitempty"`
PSUWarn int `json:"psu_warn,omitempty"` PSUWarn int `json:"psu_warn,omitempty"`
PSUFail int `json:"psu_fail,omitempty"` PSUFail int `json:"psu_fail,omitempty"`
MemoryWarn int `json:"memory_warn,omitempty"` MemoryWarn int `json:"memory_warn,omitempty"`
MemoryFail int `json:"memory_fail,omitempty"` MemoryFail int `json:"memory_fail,omitempty"`
EmptyDIMMs int `json:"empty_dimms,omitempty"` EmptyDIMMs int `json:"empty_dimms,omitempty"`
MissingPSUs int `json:"missing_psus,omitempty"` MissingPSUs int `json:"missing_psus,omitempty"`
CollectedAt string `json:"collected_at,omitempty"` CollectedAt string `json:"collected_at,omitempty"`
} }
type HardwareBoard struct { type HardwareBoard struct {
Manufacturer *string `json:"manufacturer"` Manufacturer *string `json:"manufacturer,omitempty"`
ProductName *string `json:"product_name"` ProductName *string `json:"product_name,omitempty"`
SerialNumber string `json:"serial_number"` SerialNumber string `json:"serial_number"`
PartNumber *string `json:"part_number"` PartNumber *string `json:"part_number,omitempty"`
UUID *string `json:"uuid"` UUID *string `json:"uuid,omitempty"`
} }
type HardwareFirmwareRecord struct { type HardwareFirmwareRecord struct {
@@ -55,77 +55,183 @@ type HardwareFirmwareRecord struct {
} }
type HardwareCPU struct { type HardwareCPU struct {
Socket *int `json:"socket"` HardwareComponentStatus
Model *string `json:"model"` Socket *int `json:"socket,omitempty"`
Manufacturer *string `json:"manufacturer"` Model *string `json:"model,omitempty"`
Status *string `json:"status"` Manufacturer *string `json:"manufacturer,omitempty"`
SerialNumber *string `json:"serial_number"` SerialNumber *string `json:"serial_number,omitempty"`
Firmware *string `json:"firmware"` Firmware *string `json:"firmware,omitempty"`
Cores *int `json:"cores"` Cores *int `json:"cores,omitempty"`
Threads *int `json:"threads"` Threads *int `json:"threads,omitempty"`
FrequencyMHz *int `json:"frequency_mhz"` FrequencyMHz *int `json:"frequency_mhz,omitempty"`
MaxFrequencyMHz *int `json:"max_frequency_mhz"` MaxFrequencyMHz *int `json:"max_frequency_mhz,omitempty"`
TemperatureC *float64 `json:"temperature_c,omitempty"`
PowerW *float64 `json:"power_w,omitempty"`
Throttled *bool `json:"throttled,omitempty"`
CorrectableErrorCount *int64 `json:"correctable_error_count,omitempty"`
UncorrectableErrorCount *int64 `json:"uncorrectable_error_count,omitempty"`
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
Present *bool `json:"present,omitempty"`
} }
type HardwareMemory struct { type HardwareMemory struct {
Slot *string `json:"slot"` HardwareComponentStatus
Location *string `json:"location"` Slot *string `json:"slot,omitempty"`
Present *bool `json:"present"` Location *string `json:"location,omitempty"`
SizeMB *int `json:"size_mb"` Present *bool `json:"present,omitempty"`
Type *string `json:"type"` SizeMB *int `json:"size_mb,omitempty"`
MaxSpeedMHz *int `json:"max_speed_mhz"` Type *string `json:"type,omitempty"`
CurrentSpeedMHz *int `json:"current_speed_mhz"` MaxSpeedMHz *int `json:"max_speed_mhz,omitempty"`
Manufacturer *string `json:"manufacturer"` CurrentSpeedMHz *int `json:"current_speed_mhz,omitempty"`
SerialNumber *string `json:"serial_number"` Manufacturer *string `json:"manufacturer,omitempty"`
PartNumber *string `json:"part_number"` SerialNumber *string `json:"serial_number,omitempty"`
Status *string `json:"status"` PartNumber *string `json:"part_number,omitempty"`
TemperatureC *float64 `json:"temperature_c,omitempty"`
CorrectableECCErrorCount *int64 `json:"correctable_ecc_error_count,omitempty"`
UncorrectableECCErrorCount *int64 `json:"uncorrectable_ecc_error_count,omitempty"`
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
SpareBlocksRemainingPct *float64 `json:"spare_blocks_remaining_pct,omitempty"`
PerformanceDegraded *bool `json:"performance_degraded,omitempty"`
DataLossDetected *bool `json:"data_loss_detected,omitempty"`
} }
type HardwareStorage struct { type HardwareStorage struct {
Slot *string `json:"slot"` HardwareComponentStatus
Type *string `json:"type"` Slot *string `json:"slot,omitempty"`
Model *string `json:"model"` Type *string `json:"type,omitempty"`
SizeGB *int `json:"size_gb"` Model *string `json:"model,omitempty"`
SerialNumber *string `json:"serial_number"` SizeGB *int `json:"size_gb,omitempty"`
Manufacturer *string `json:"manufacturer"` SerialNumber *string `json:"serial_number,omitempty"`
Firmware *string `json:"firmware"` Manufacturer *string `json:"manufacturer,omitempty"`
Interface *string `json:"interface"` Firmware *string `json:"firmware,omitempty"`
Present *bool `json:"present"` Interface *string `json:"interface,omitempty"`
Status *string `json:"status"` Present *bool `json:"present,omitempty"`
Telemetry map[string]any `json:"telemetry,omitempty"` TemperatureC *float64 `json:"temperature_c,omitempty"`
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
PowerCycles *int64 `json:"power_cycles,omitempty"`
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
MediaErrors *int64 `json:"media_errors,omitempty"`
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
WrittenBytes *int64 `json:"written_bytes,omitempty"`
ReadBytes *int64 `json:"read_bytes,omitempty"`
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
Telemetry map[string]any `json:"-"`
} }
type HardwarePCIeDevice struct { type HardwarePCIeDevice struct {
Slot *string `json:"slot"` HardwareComponentStatus
VendorID *int `json:"vendor_id"` Slot *string `json:"slot,omitempty"`
DeviceID *int `json:"device_id"` VendorID *int `json:"vendor_id,omitempty"`
BDF *string `json:"bdf"` DeviceID *int `json:"device_id,omitempty"`
DeviceClass *string `json:"device_class"` NUMANode *int `json:"numa_node,omitempty"`
Manufacturer *string `json:"manufacturer"` TemperatureC *float64 `json:"temperature_c,omitempty"`
Model *string `json:"model"` PowerW *float64 `json:"power_w,omitempty"`
LinkWidth *int `json:"link_width"` LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
LinkSpeed *string `json:"link_speed"` LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
MaxLinkWidth *int `json:"max_link_width"` ECCCorrectedTotal *int64 `json:"ecc_corrected_total,omitempty"`
MaxLinkSpeed *string `json:"max_link_speed"` ECCUncorrectedTotal *int64 `json:"ecc_uncorrected_total,omitempty"`
SerialNumber *string `json:"serial_number"` HWSlowdown *bool `json:"hw_slowdown,omitempty"`
Firmware *string `json:"firmware"` BatteryChargePct *float64 `json:"battery_charge_pct,omitempty"`
Present *bool `json:"present"` BatteryHealthPct *float64 `json:"battery_health_pct,omitempty"`
Status *string `json:"status"` BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
Telemetry map[string]any `json:"telemetry,omitempty"` BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
SFPVoltageV *float64 `json:"sfp_voltage_v,omitempty"`
SFPBiasMA *float64 `json:"sfp_bias_ma,omitempty"`
BDF *string `json:"bdf,omitempty"`
DeviceClass *string `json:"device_class,omitempty"`
Manufacturer *string `json:"manufacturer,omitempty"`
Model *string `json:"model,omitempty"`
LinkWidth *int `json:"link_width,omitempty"`
LinkSpeed *string `json:"link_speed,omitempty"`
MaxLinkWidth *int `json:"max_link_width,omitempty"`
MaxLinkSpeed *string `json:"max_link_speed,omitempty"`
SerialNumber *string `json:"serial_number,omitempty"`
Firmware *string `json:"firmware,omitempty"`
MacAddresses []string `json:"mac_addresses,omitempty"`
Present *bool `json:"present,omitempty"`
Telemetry map[string]any `json:"-"`
} }
type HardwarePowerSupply struct { type HardwarePowerSupply struct {
Slot *string `json:"slot"` HardwareComponentStatus
Present *bool `json:"present"` Slot *string `json:"slot,omitempty"`
Model *string `json:"model"` Present *bool `json:"present,omitempty"`
Vendor *string `json:"vendor"` Model *string `json:"model,omitempty"`
WattageW *int `json:"wattage_w"` Vendor *string `json:"vendor,omitempty"`
SerialNumber *string `json:"serial_number"` WattageW *int `json:"wattage_w,omitempty"`
PartNumber *string `json:"part_number"` SerialNumber *string `json:"serial_number,omitempty"`
Firmware *string `json:"firmware"` PartNumber *string `json:"part_number,omitempty"`
Status *string `json:"status"` Firmware *string `json:"firmware,omitempty"`
InputType *string `json:"input_type"` InputType *string `json:"input_type,omitempty"`
InputPowerW *float64 `json:"input_power_w"` InputPowerW *float64 `json:"input_power_w,omitempty"`
OutputPowerW *float64 `json:"output_power_w"` OutputPowerW *float64 `json:"output_power_w,omitempty"`
InputVoltage *float64 `json:"input_voltage"` InputVoltage *float64 `json:"input_voltage,omitempty"`
TemperatureC *float64 `json:"temperature_c,omitempty"`
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
}
// HardwareComponentStatus holds the status fields shared by the component
// records. It is embedded in HardwareCPU, HardwareMemory, HardwareStorage,
// HardwarePCIeDevice and HardwarePowerSupply, so encoding/json emits these
// keys inline on each component object. Every field carries omitempty and
// is left out of the JSON when unset.
type HardwareComponentStatus struct {
// Status is the current component status. The ingest contract lists
// OK, Warning, Critical, Unknown and Empty as its values.
Status *string `json:"status,omitempty"`
// StatusCheckedAt is the time of the last status check (RFC3339 per the
// ingest contract).
StatusCheckedAt *string `json:"status_checked_at,omitempty"`
// StatusChangedAt is the time of the last status change (RFC3339 per the
// ingest contract).
// NOTE(review): the project docs place status transition tracking in the
// central ingest system rather than in bee — confirm before populating.
StatusChangedAt *string `json:"status_changed_at,omitempty"`
// StatusHistory lists past status transitions for the component.
StatusHistory []HardwareStatusHistory `json:"status_history,omitempty"`
// ErrorDescription is free-form error/diagnostic text for the component.
ErrorDescription *string `json:"error_description,omitempty"`
}
// HardwareStatusHistory is one entry of HardwareComponentStatus.StatusHistory:
// a status value together with the time the component entered it.
type HardwareStatusHistory struct {
// Status is the status the component had at ChangedAt.
Status string `json:"status"`
// ChangedAt is the transition time (RFC3339 per the ingest contract,
// which also states that entries without it are ignored).
ChangedAt string `json:"changed_at"`
// Details is an optional explanation of the transition.
Details *string `json:"details,omitempty"`
}
// HardwareSensors groups sensor readings by kind. It is referenced from
// HardwareSnapshot and serialized under the "sensors" key; each group is
// omitted from the JSON when it has no entries.
type HardwareSensors struct {
// Fans holds fan readings.
Fans []HardwareFanSensor `json:"fans,omitempty"`
// Power holds voltage/current/power readings.
Power []HardwarePowerSensor `json:"power,omitempty"`
// Temperatures holds temperature readings.
Temperatures []HardwareTemperatureSensor `json:"temperatures,omitempty"`
// Other holds readings that fit none of the groups above.
Other []HardwareOtherSensor `json:"other,omitempty"`
}
// HardwareFanSensor is a single fan reading inside HardwareSensors.Fans.
// Only Name is always serialized; the pointer fields are omitted when nil.
type HardwareFanSensor struct {
// Name identifies the sensor.
Name string `json:"name"`
// Location is an optional description of where the sensor sits.
Location *string `json:"location,omitempty"`
// RPM is the fan speed reading (revolutions per minute, going by the
// field name and JSON key).
RPM *int `json:"rpm,omitempty"`
// Status is the optional status reported for this sensor.
Status *string `json:"status,omitempty"`
}
// HardwarePowerSensor is a single electrical reading inside
// HardwareSensors.Power. Only Name is always serialized; the pointer fields
// are omitted when nil, so a sensor may report any subset of the values.
type HardwarePowerSensor struct {
// Name identifies the sensor.
Name string `json:"name"`
// Location is an optional description of where the sensor sits.
Location *string `json:"location,omitempty"`
// VoltageV is the voltage reading (volts, going by the JSON key suffix).
VoltageV *float64 `json:"voltage_v,omitempty"`
// CurrentA is the current reading (amperes, going by the JSON key suffix).
CurrentA *float64 `json:"current_a,omitempty"`
// PowerW is the power reading (watts, going by the JSON key suffix).
PowerW *float64 `json:"power_w,omitempty"`
// Status is the optional status reported for this sensor.
Status *string `json:"status,omitempty"`
}
// HardwareTemperatureSensor is a single temperature reading inside
// HardwareSensors.Temperatures. Only Name is always serialized; the pointer
// fields are omitted when nil.
type HardwareTemperatureSensor struct {
// Name identifies the sensor.
Name string `json:"name"`
// Location is an optional description of where the sensor sits.
Location *string `json:"location,omitempty"`
// Celsius is the measured temperature in degrees Celsius.
Celsius *float64 `json:"celsius,omitempty"`
// ThresholdWarningCelsius is the warning threshold for this sensor, in
// degrees Celsius.
ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"`
// ThresholdCriticalCelsius is the critical threshold for this sensor, in
// degrees Celsius.
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
// Status is the optional status reported for this sensor.
Status *string `json:"status,omitempty"`
}
// HardwareOtherSensor is a generic reading inside HardwareSensors.Other for
// sensors that are not fans, power or temperature. Only Name is always
// serialized; the pointer fields are omitted when nil.
type HardwareOtherSensor struct {
// Name identifies the sensor.
Name string `json:"name"`
// Location is an optional description of where the sensor sits.
Location *string `json:"location,omitempty"`
// Value is the numeric reading; its unit is given by Unit.
Value *float64 `json:"value,omitempty"`
// Unit names the unit of Value.
Unit *string `json:"unit,omitempty"`
// Status is the optional status reported for this sensor.
Status *string `json:"status,omitempty"`
} }

View File

@@ -9,4 +9,5 @@ Generic engineering rules live in `bible/rules/patterns/`.
|---|---| |---|---|
| `architecture/system-overview.md` | What bee does, scope, tech stack | | `architecture/system-overview.md` | What bee does, scope, tech stack |
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order | | `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
| `decisions/` | Architectural decision log | | `decisions/` | Architectural decision log |

View File

@@ -4,7 +4,7 @@
Hardware audit LiveCD. Boots on a server via BMC virtual media or USB. Hardware audit LiveCD. Boots on a server via BMC virtual media or USB.
Collects hardware inventory at OS level (not through BMC/Redfish). Collects hardware inventory at OS level (not through BMC/Redfish).
Produces `HardwareIngestRequest` JSON compatible with core/reanimator. Produces `HardwareIngestRequest` JSON compatible with the contract in `bible-local/docs/hardware-ingest-contract.md`.
## Why it exists ## Why it exists
@@ -46,6 +46,16 @@ Fills gaps where Redfish/logpile is blind:
- Anything requiring persistent storage on the audited machine - Anything requiring persistent storage on the audited machine
- Windows support - Windows support
- Any functionality requiring internet access at boot - Any functionality requiring internet access at boot
- Component lifecycle/history across multiple snapshots
- Status transition history (`status_history`, `status_changed_at`) derived from previous exports
- Replacement detection between two or more audit runs
## Contract boundary
- `bee` is responsible for the current hardware snapshot only.
- `bee` should populate current component state, hardware inventory, telemetry, and `status_checked_at`.
- Historical status transitions and component replacement logic belong to the centralized ingest/lifecycle system, not to `bee`.
- Contract fields that have no honest local source on a generic Linux host may remain empty.
## Tech stack ## Tech stack

View File

@@ -18,3 +18,51 @@
- точнее классифицировать vendor-specific self-test outputs в `storage SAT` - точнее классифицировать vendor-specific self-test outputs в `storage SAT`
- подобрать дефолты `memtester` по объёму RAM на целевых машинах - подобрать дефолты `memtester` по объёму RAM на целевых машинах
- при необходимости расширить `bee-gpu-stress` по длительности/нагрузке - при необходимости расширить `bee-gpu-stress` по длительности/нагрузке
## Hardware Contract backlog
**Статус:** уточнён, сокращён до `bee`-only snapshot scope.
### Не backlog для `bee`
Эти задачи не должны реализовываться в `bee`, потому что относятся к централизованному ingest/lifecycle слою:
- `status_history`
- `status_changed_at`
- определение замены компонента между snapshot'ами
- timeline/lifecycle/history по diff между экспортами
`bee` отвечает только за текущий snapshot железа и `status_checked_at`.
### Реализуемо инкрементально
Эти поля можно развивать дальше по мере появления реальных sample outputs и vendor-specific parser'ов:
- `cpus.correctable_error_count`
- `cpus.uncorrectable_error_count`
- `power_supplies.life_remaining_pct`
- `power_supplies.life_used_pct`
- `pcie_devices.battery_charge_pct`
- `pcie_devices.battery_health_pct`
- `pcie_devices.battery_temperature_c`
- `pcie_devices.battery_voltage_v`
- `pcie_devices.battery_replace_required`
### Vendor/platform-specific, часто пустые
Эти поля допустимо оставлять пустыми на части платформ даже после реализации parser'ов:
- `power_supplies.life_remaining_pct`
- `power_supplies.life_used_pct`
- часть `pcie_devices.battery_*` для неподдержанных RAID/NIC/GPU вендоров
### Unsupported в `bee`
Эти поля считаются нереалистичными для общего OS-level hardware snapshotter без synthetic/fake data:
- `cpus.life_remaining_pct`
- `cpus.life_used_pct`
- `memory.life_remaining_pct`
- `memory.life_used_pct`
- `memory.spare_blocks_remaining_pct`
- `memory.performance_degraded`
Причина: у обычного Linux-host audit обычно нет честного vendor-neutral runtime source для этих метрик.
Эти поля считаются дропнутыми из backlog `bee` и не должны возвращаться в план работ без появления нового доказуемого локального источника данных на целевых машинах.

View File

@@ -0,0 +1,730 @@
---
title: Hardware Ingest JSON Contract
version: "2.4"
updated: "2026-03-15"
maintainer: Reanimator Core
audience: external-integrators, ai-agents
language: ru
---
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
Версия: **2.4** · Дата: **2026-03-15**
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
> Актуальная версия документа: https://git.mchus.pro/reanimator/core/src/branch/main/bible-local/docs/hardware-ingest-contract.md
---
## Changelog
| Версия | Дата | Изменения |
|--------|------|-----------|
| 2.4 | 2026-03-15 | Добавлена первая волна component telemetry: health/life поля для `cpus`, `memory`, `storage`, `pcie_devices`, `power_supplies` |
| 2.3 | 2026-03-15 | Добавлены component telemetry поля: `pcie_devices.temperature_c`, `pcie_devices.power_w`, `power_supplies.temperature_c` |
| 2.2 | 2026-03-15 | Добавлено поле `numa_node` у `pcie_devices` для topology/affinity |
| 2.1 | 2026-03-15 | Добавлена секция `sensors` (fans, power, temperatures, other); поле `mac_addresses` у `pcie_devices`; расширен список значений `device_class` |
| 2.0 | 2026-02-01 | История статусов (`status_history`, `status_changed_at`); поля telemetry у PSU; async job response |
| 1.0 | 2026-01-01 | Начальная версия контракта |
---
## Принципы
1. **Snapshot** — JSON описывает состояние сервера на момент сбора. Может включать историю изменений статуса компонентов.
2. **Идемпотентность** — повторная отправка идентичного payload не создаёт дублей (дедупликация по хешу).
3. **Частичность** — можно передавать только те секции, данные по которым доступны. Пустой массив и отсутствие секции эквивалентны.
4. **Строгая схема** — endpoint использует строгий JSON-декодер; неизвестные поля приводят к `400 Bad Request`.
5. **Event-driven** — импорт создаёт события в timeline (LOG_COLLECTED, INSTALLED, REMOVED, FIRMWARE_CHANGED и др.).
---
## Endpoint
```
POST /ingest/hardware
Content-Type: application/json
```
**Ответ при приёме (202 Accepted):**
```json
{
"status": "accepted",
"job_id": "job_01J..."
}
```
Импорт выполняется асинхронно. Результат доступен по:
```
GET /ingest/hardware/jobs/{job_id}
```
**Ответ при успехе задачи:**
```json
{
"status": "success",
"bundle_id": "lb_01J...",
"asset_id": "mach_01J...",
"collected_at": "2026-02-10T15:30:00Z",
"duplicate": false,
"summary": {
"parts_observed": 15,
"parts_created": 2,
"parts_updated": 13,
"installations_created": 2,
"installations_closed": 1,
"timeline_events_created": 9,
"failure_events_created": 1
}
}
```
**Ответ при дубликате:**
```json
{
"status": "success",
"duplicate": true,
"message": "LogBundle with this content hash already exists"
}
```
**Ответ при ошибке (400 Bad Request):**
```json
{
"status": "error",
"error": "validation_failed",
"details": {
"field": "hardware.board.serial_number",
"message": "serial_number is required"
}
}
```
Частые причины `400`:
- Неверный формат `collected_at` (требуется RFC3339).
- Пустой `hardware.board.serial_number`.
- Наличие неизвестного JSON-поля на любом уровне.
- Тело запроса превышает допустимый размер.
---
## Структура верхнего уровня
```json
{
"filename": "redfish://10.10.10.103",
"source_type": "api",
"protocol": "redfish",
"target_host": "10.10.10.103",
"collected_at": "2026-02-10T15:30:00Z",
"hardware": {
"board": { ... },
"firmware": [ ... ],
"cpus": [ ... ],
"memory": [ ... ],
"storage": [ ... ],
"pcie_devices": [ ... ],
"power_supplies": [ ... ],
"sensors": { ... }
}
}
```
### Поля верхнего уровня
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `collected_at` | string RFC3339 | **да** | Время сбора данных |
| `hardware` | object | **да** | Аппаратный снапшот |
| `hardware.board.serial_number` | string | **да** | Серийный номер платы/сервера |
| `target_host` | string | нет | IP или hostname |
| `source_type` | string | нет | Тип источника: `api`, `logfile`, `manual` |
| `protocol` | string | нет | Протокол: `redfish`, `ipmi`, `snmp`, `ssh` |
| `filename` | string | нет | Идентификатор источника |
---
## Общие поля статуса компонентов
Применяются ко всем компонентным секциям (`cpus`, `memory`, `storage`, `pcie_devices`, `power_supplies`).
| Поле | Тип | Описание |
|------|-----|----------|
| `status` | string | Текущий статус: `OK`, `Warning`, `Critical`, `Unknown`, `Empty` |
| `status_checked_at` | string RFC3339 | Время последней проверки статуса |
| `status_changed_at` | string RFC3339 | Время последнего изменения статуса |
| `status_history` | array | История переходов статусов (см. ниже) |
| `error_description` | string | Текст ошибки/диагностики |
**Объект `status_history[]`:**
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `status` | string | **да** | Статус в этот момент |
| `changed_at` | string RFC3339 | **да** | Время перехода (без этого поля запись игнорируется) |
| `details` | string | нет | Пояснение к переходу |
**Правила приоритета времени события:**
1. `status_changed_at`
2. Последняя запись `status_history` с совпадающим статусом
3. Последняя парсируемая запись `status_history`
4. `status_checked_at`
**Правила передачи статусов:**
- Передавайте `status` как текущее состояние компонента в snapshot.
- Если источник хранит историю — передавайте `status_history` отсортированным по `changed_at` по возрастанию.
- Не включайте записи `status_history` без `changed_at`.
- Все даты — RFC3339, рекомендуется UTC (`Z`).
---
## Секции hardware
### board
Основная информация о сервере. Обязательная секция.
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `serial_number` | string | **да** | Серийный номер (ключ идентификации Asset) |
| `manufacturer` | string | нет | Производитель |
| `product_name` | string | нет | Модель |
| `part_number` | string | нет | Партномер |
| `uuid` | string | нет | UUID системы |
Значения `"NULL"` в строковых полях трактуются как отсутствие данных.
```json
"board": {
"manufacturer": "Supermicro",
"product_name": "X12DPG-QT6",
"serial_number": "21D634101",
"part_number": "X12DPG-QT6-REV1.01",
"uuid": "d7ef2fe5-2fd0-11f0-910a-346f11040868"
}
```
---
### firmware
Версии прошивок системных компонентов (BIOS, BMC, CPLD и др.).
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `device_name` | string | **да** | Название устройства (`BIOS`, `BMC`, `CPLD`, …) |
| `version` | string | **да** | Версия прошивки |
Записи с пустым `device_name` или `version` игнорируются.
Изменение версии создаёт событие `FIRMWARE_CHANGED` для Asset.
```json
"firmware": [
{ "device_name": "BIOS", "version": "06.08.05" },
{ "device_name": "BMC", "version": "5.17.00" },
{ "device_name": "CPLD", "version": "01.02.03" }
]
```
---
### cpus
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `socket` | int | **да** | Номер сокета (используется для генерации serial) |
| `model` | string | нет | Модель процессора |
| `manufacturer` | string | нет | Производитель |
| `cores` | int | нет | Количество ядер |
| `threads` | int | нет | Количество потоков |
| `frequency_mhz` | int | нет | Текущая частота |
| `max_frequency_mhz` | int | нет | Максимальная частота |
| `temperature_c` | float | нет | Температура CPU, °C (telemetry) |
| `power_w` | float | нет | Текущая мощность CPU, Вт (telemetry) |
| `throttled` | bool | нет | Зафиксирован thermal/power throttling |
| `correctable_error_count` | int | нет | Количество корректируемых ошибок CPU |
| `uncorrectable_error_count` | int | нет | Количество некорректируемых ошибок CPU |
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
| `serial_number` | string | нет | Серийный номер (если доступен) |
| `firmware` | string | нет | Версия микрокода |
| `present` | bool | нет | Наличие (по умолчанию `true`) |
| + общие поля статуса | | | см. раздел выше |
**Генерация serial_number при отсутствии:** `{board_serial}-CPU-{socket}`
```json
"cpus": [
{
"socket": 0,
"model": "INTEL(R) XEON(R) GOLD 6530",
"cores": 32,
"threads": 64,
"frequency_mhz": 2100,
"max_frequency_mhz": 4000,
"temperature_c": 61.5,
"power_w": 182.0,
"throttled": false,
"manufacturer": "Intel",
"status": "OK",
"status_checked_at": "2026-02-10T15:28:00Z"
}
]
```
---
### memory
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `slot` | string | нет | Идентификатор слота |
| `location` | string | нет | Физическое расположение |
| `present` | bool | нет | Наличие модуля (по умолчанию `true`) |
| `serial_number` | string | нет | Серийный номер |
| `part_number` | string | нет | Партномер (используется как модель) |
| `manufacturer` | string | нет | Производитель |
| `size_mb` | int | нет | Объём в МБ |
| `type` | string | нет | Тип: `DDR3`, `DDR4`, `DDR5`, … |
| `max_speed_mhz` | int | нет | Максимальная частота, МГц |
| `current_speed_mhz` | int | нет | Текущая частота, МГц |
| `temperature_c` | float | нет | Температура DIMM/модуля, °C (telemetry) |
| `correctable_ecc_error_count` | int | нет | Количество корректируемых ECC-ошибок |
| `uncorrectable_ecc_error_count` | int | нет | Количество некорректируемых ECC-ошибок |
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
| `spare_blocks_remaining_pct` | float | нет | Остаток spare blocks, % |
| `performance_degraded` | bool | нет | Зафиксирована деградация производительности |
| `data_loss_detected` | bool | нет | Источник сигнализирует о риске или факте потери данных |
| + общие поля статуса | | | см. раздел выше |
Модуль без `serial_number` игнорируется. Модуль с `present=false` или `status=Empty` игнорируется.
```json
"memory": [
{
"slot": "CPU0_C0D0",
"present": true,
"size_mb": 32768,
"type": "DDR5",
"max_speed_mhz": 4800,
"current_speed_mhz": 4800,
"temperature_c": 43.0,
"correctable_ecc_error_count": 0,
"manufacturer": "Hynix",
"serial_number": "80AD032419E17CEEC1",
"part_number": "HMCG88AGBRA191N",
"status": "OK"
}
]
```
---
### storage
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `slot` | string | нет | Идентификатор слота |
| `serial_number` | string | нет | Серийный номер |
| `model` | string | нет | Модель |
| `manufacturer` | string | нет | Производитель |
| `type` | string | нет | Тип: `NVMe`, `SSD`, `HDD` |
| `interface` | string | нет | Интерфейс: `NVMe`, `SATA`, `SAS` |
| `size_gb` | int | нет | Размер в ГБ |
| `temperature_c` | float | нет | Температура накопителя, °C (telemetry) |
| `power_on_hours` | int64 | нет | Время работы, часы |
| `power_cycles` | int64 | нет | Количество циклов питания |
| `unsafe_shutdowns` | int64 | нет | Нештатные выключения |
| `media_errors` | int64 | нет | Ошибки носителя / media errors |
| `error_log_entries` | int64 | нет | Количество записей в error log |
| `written_bytes` | int64 | нет | Всего записано байт |
| `read_bytes` | int64 | нет | Всего прочитано байт |
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
| `available_spare_pct` | float | нет | Доступный spare, % |
| `reallocated_sectors` | int64 | нет | Переназначенные сектора |
| `current_pending_sectors` | int64 | нет | Сектора в ожидании ремапа |
| `offline_uncorrectable` | int64 | нет | Некорректируемые ошибки offline scan |
| `firmware` | string | нет | Версия прошивки |
| `present` | bool | нет | Наличие (по умолчанию `true`) |
| + общие поля статуса | | | см. раздел выше |
Диск без `serial_number` игнорируется. Изменение `firmware` создаёт событие `FIRMWARE_CHANGED`.
```json
"storage": [
{
"slot": "OB01",
"type": "NVMe",
"model": "INTEL SSDPF2KX076T1",
"size_gb": 7680,
"temperature_c": 38.5,
"power_on_hours": 12450,
"unsafe_shutdowns": 3,
"written_bytes": 9876543210,
"life_remaining_pct": 91.0,
"serial_number": "BTAX41900GF87P6DGN",
"manufacturer": "Intel",
"firmware": "9CV10510",
"interface": "NVMe",
"present": true,
"status": "OK"
}
]
```
---
### pcie_devices
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `slot` | string | нет | Идентификатор слота |
| `vendor_id` | int | нет | PCI Vendor ID (decimal) |
| `device_id` | int | нет | PCI Device ID (decimal) |
| `numa_node` | int | нет | NUMA node / CPU affinity устройства |
| `temperature_c` | float | нет | Температура устройства, °C (telemetry) |
| `power_w` | float | нет | Текущее энергопотребление устройства, Вт (telemetry) |
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
| `ecc_corrected_total` | int64 | нет | Всего корректируемых ECC-ошибок |
| `ecc_uncorrected_total` | int64 | нет | Всего некорректируемых ECC-ошибок |
| `hw_slowdown` | bool | нет | Устройство вошло в hardware slowdown / protective mode |
| `battery_charge_pct` | float | нет | Заряд батареи / supercap, % |
| `battery_health_pct` | float | нет | Состояние батареи / supercap, % |
| `battery_temperature_c` | float | нет | Температура батареи / supercap, °C |
| `battery_voltage_v` | float | нет | Напряжение батареи / supercap, В |
| `battery_replace_required` | bool | нет | Требуется замена батареи / supercap |
| `sfp_temperature_c` | float | нет | Температура SFP/optic, °C |
| `sfp_tx_power_dbm` | float | нет | TX optical power, dBm |
| `sfp_rx_power_dbm` | float | нет | RX optical power, dBm |
| `sfp_voltage_v` | float | нет | Напряжение SFP, В |
| `sfp_bias_ma` | float | нет | Bias current SFP, мА |
| `bdf` | string | нет | PCI-адрес `[Domain:]Bus:Device.Function`, например `0000:18:00.0` |
| `device_class` | string | нет | Класс устройства (см. список ниже) |
| `manufacturer` | string | нет | Производитель |
| `model` | string | нет | Модель |
| `serial_number` | string | нет | Серийный номер |
| `firmware` | string | нет | Версия прошивки |
| `link_width` | int | нет | Текущая ширина линка |
| `link_speed` | string | нет | Текущая скорость: `Gen3`, `Gen4`, `Gen5` |
| `max_link_width` | int | нет | Максимальная ширина линка |
| `max_link_speed` | string | нет | Максимальная скорость |
| `mac_addresses` | string[] | нет | MAC-адреса портов (для сетевых устройств) |
| `present` | bool | нет | Наличие (по умолчанию `true`) |
| + общие поля статуса | | | см. раздел выше |
`numa_node` передавайте для NIC / InfiniBand / RAID / GPU, когда источник знает CPU/NUMA affinity. Поле сохраняется в snapshot-атрибутах PCIe-компонента и дублируется в telemetry для topology use cases.
Поля `temperature_c` и `power_w` используйте для device-level telemetry GPU / accelerator / smart PCIe devices. Они не влияют на идентификацию компонента.
**Генерация serial_number при отсутствии или `"N/A"`:** `{board_serial}-PCIE-{slot}`
**Значения `device_class`:**
| Значение | Назначение |
|----------|------------|
| `MassStorageController` | RAID-контроллеры |
| `StorageController` | HBA, SAS-контроллеры |
| `NetworkController` | Сетевые адаптеры общего класса, включая InfiniBand |
| `EthernetController` | Ethernet NIC |
| `FibreChannelController` | Fibre Channel HBA |
| `VideoController` | GPU, видеокарты |
| `ProcessingAccelerator` | Вычислительные ускорители (AI/ML) |
| `DisplayController` | Контроллеры дисплея (BMC VGA) |
Список открытый: допускаются произвольные строки для нестандартных классов.
```json
"pcie_devices": [
{
"slot": "PCIeCard2",
"vendor_id": 32902,
"device_id": 5490,
"numa_node": 0,
"temperature_c": 48.5,
"power_w": 18.2,
"sfp_temperature_c": 36.2,
"sfp_tx_power_dbm": -1.8,
"sfp_rx_power_dbm": -2.1,
"bdf": "0000:3b:00.0",
"device_class": "EthernetController",
"manufacturer": "Intel",
"model": "X710 10GbE",
"serial_number": "K65472-003",
"firmware": "9.20 0x8000d4ae",
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
"status": "OK"
}
]
```
---
### power_supplies
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `slot` | string | нет | Идентификатор слота |
| `present` | bool | нет | Наличие (по умолчанию `true`) |
| `serial_number` | string | нет | Серийный номер |
| `part_number` | string | нет | Партномер |
| `model` | string | нет | Модель |
| `vendor` | string | нет | Производитель |
| `wattage_w` | int | нет | Мощность в ваттах |
| `firmware` | string | нет | Версия прошивки |
| `input_type` | string | нет | Тип входа (например `ACWideRange`) |
| `input_voltage` | float | нет | Входное напряжение, В (telemetry) |
| `input_power_w` | float | нет | Входная мощность, Вт (telemetry) |
| `output_power_w` | float | нет | Выходная мощность, Вт (telemetry) |
| `temperature_c` | float | нет | Температура PSU, °C (telemetry) |
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
| + общие поля статуса | | | см. раздел выше |
Поля telemetry (`input_voltage`, `input_power_w`, `output_power_w`, `temperature_c`, `life_remaining_pct`, `life_used_pct`) сохраняются в атрибутах компонента и не влияют на его идентификацию.
PSU без `serial_number` игнорируется.
```json
"power_supplies": [
{
"slot": "0",
"present": true,
"model": "GW-CRPS3000LW",
"vendor": "Great Wall",
"wattage_w": 3000,
"serial_number": "2P06C102610",
"firmware": "00.03.05",
"status": "OK",
"input_type": "ACWideRange",
"input_power_w": 137,
"output_power_w": 104,
"input_voltage": 215.25,
"temperature_c": 39.5,
"life_remaining_pct": 97.0
}
]
```
---
### sensors
Показания сенсоров сервера. Секция опциональная, не привязана к компонентам.
Данные хранятся как последнее известное значение (last-known-value) на уровне Asset.
```json
"sensors": {
"fans": [ ... ],
"power": [ ... ],
"temperatures": [ ... ],
"other": [ ... ]
}
```
#### sensors.fans
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора в рамках секции |
| `location` | string | нет | Физическое расположение |
| `rpm` | int | нет | Обороты, RPM |
| `status` | string | нет | Статус: `OK`, `Warning`, `Critical`, `Unknown` |
#### sensors.power
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора |
| `location` | string | нет | Физическое расположение |
| `voltage_v` | float | нет | Напряжение, В |
| `current_a` | float | нет | Ток, А |
| `power_w` | float | нет | Мощность, Вт |
| `status` | string | нет | Статус |
#### sensors.temperatures
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора |
| `location` | string | нет | Физическое расположение |
| `celsius` | float | нет | Температура, °C |
| `threshold_warning_celsius` | float | нет | Порог Warning, °C |
| `threshold_critical_celsius` | float | нет | Порог Critical, °C |
| `status` | string | нет | Статус |
#### sensors.other
| Поле | Тип | Обязательно | Описание |
|------|-----|-------------|----------|
| `name` | string | **да** | Уникальное имя сенсора |
| `location` | string | нет | Физическое расположение |
| `value` | float | нет | Значение |
| `unit` | string | нет | Единица измерения |
| `status` | string | нет | Статус |
**Правила sensors:**
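- Идентификатор сенсора — пара `(sensor_type, name)`, где `sensor_type` соответствует секции (`fans`, `power`, `temperatures`, `other`). Если в одном payload встречаются дубли, используется первое вхождение.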
- Сенсоры без `name` игнорируются.
- При каждом импорте значения перезаписываются (upsert по ключу).
```json
"sensors": {
"fans": [
{ "name": "FAN1", "location": "Front", "rpm": 4200, "status": "OK" },
{ "name": "FAN_CPU0", "location": "CPU0", "rpm": 5600, "status": "OK" }
],
"power": [
{ "name": "12V Rail", "location": "Mainboard", "voltage_v": 12.06, "status": "OK" },
{ "name": "PSU0 Input", "location": "PSU0", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
],
"temperatures": [
{ "name": "CPU0 Temp", "location": "CPU0", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
{ "name": "Inlet Temp", "location": "Front", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
],
"other": [
{ "name": "System Humidity", "value": 38.5, "unit": "%", "status": "OK" }
]
}
```
---
## Обработка статусов компонентов
| Статус | Поведение |
|--------|-----------|
| `OK` | Нормальная обработка |
| `Warning` | Создаётся событие `COMPONENT_WARNING` |
| `Critical` | Создаётся событие `COMPONENT_FAILED` + запись в `failure_events` |
| `Unknown` | Компонент считается рабочим, создаётся событие `COMPONENT_UNKNOWN` |
| `Empty` | Компонент не создаётся/не обновляется |
---
## Обработка отсутствующих serial_number
| Тип | Поведение |
|-----|-----------|
| CPU | Генерируется: `{board_serial}-CPU-{socket}` |
| PCIe | Генерируется: `{board_serial}-PCIE-{slot}` (если serial = `"N/A"` или пустой) |
| Memory | Компонент игнорируется |
| Storage | Компонент игнорируется |
| PSU | Компонент игнорируется |
Если `serial_number` не уникален внутри одного payload для одного и того же `model`:
- Первое вхождение сохраняет оригинальный серийный номер.
- Каждое следующее дублирующее вхождение получает placeholder: `NO_SN-XXXXXXXX`.
---
## Минимальный валидный пример
```json
{
"collected_at": "2026-02-10T15:30:00Z",
"target_host": "192.168.1.100",
"hardware": {
"board": {
"serial_number": "SRV-001"
}
}
}
```
---
## Полный пример с историей статусов
```json
{
"filename": "redfish://10.10.10.103",
"source_type": "api",
"protocol": "redfish",
"target_host": "10.10.10.103",
"collected_at": "2026-02-10T15:30:00Z",
"hardware": {
"board": {
"manufacturer": "Supermicro",
"product_name": "X12DPG-QT6",
"serial_number": "21D634101"
},
"firmware": [
{ "device_name": "BIOS", "version": "06.08.05" },
{ "device_name": "BMC", "version": "5.17.00" }
],
"cpus": [
{
"socket": 0,
"model": "INTEL(R) XEON(R) GOLD 6530",
"manufacturer": "Intel",
"cores": 32,
"threads": 64,
"status": "OK"
}
],
"storage": [
{
"slot": "OB01",
"type": "NVMe",
"model": "INTEL SSDPF2KX076T1",
"size_gb": 7680,
"serial_number": "BTAX41900GF87P6DGN",
"manufacturer": "Intel",
"firmware": "9CV10510",
"present": true,
"status": "OK",
"status_changed_at": "2026-02-10T15:22:00Z",
"status_history": [
{ "status": "Critical", "changed_at": "2026-02-10T15:10:00Z", "details": "I/O timeout on NVMe queue 3" },
{ "status": "OK", "changed_at": "2026-02-10T15:22:00Z", "details": "Recovered after controller reset" }
]
}
],
"pcie_devices": [
{
"slot": "PCIeCard1",
"device_class": "EthernetController",
"manufacturer": "Intel",
"model": "X710 10GbE",
"serial_number": "K65472-003",
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
"status": "OK"
}
],
"power_supplies": [
{
"slot": "0",
"present": true,
"model": "GW-CRPS3000LW",
"vendor": "Great Wall",
"wattage_w": 3000,
"serial_number": "2P06C102610",
"firmware": "00.03.05",
"status": "OK",
"input_power_w": 137,
"output_power_w": 104,
"input_voltage": 215.25
}
],
"sensors": {
"fans": [
{ "name": "FAN1", "location": "Front", "rpm": 4200, "status": "OK" }
],
"power": [
{ "name": "12V Rail", "voltage_v": 12.06, "status": "OK" }
],
"temperatures": [
{ "name": "CPU0 Temp", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" }
],
"other": [
{ "name": "System Humidity", "value": 38.5, "unit": "%" }
]
}
}
}
```

View File

@@ -12,6 +12,7 @@ iproute2
isc-dhcp-client isc-dhcp-client
iputils-ping iputils-ping
ethtool ethtool
lm-sensors
qemu-guest-agent qemu-guest-agent
# SSH # SSH