Align hardware export with ingest contract
This commit is contained in:
@@ -317,38 +317,20 @@ func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, erro
|
||||
}
|
||||
|
||||
func (a *App) HealthSummaryResult() ActionResult {
|
||||
type auditFile struct {
|
||||
Hardware struct {
|
||||
Summary struct {
|
||||
Status string `json:"status"`
|
||||
Warnings []string `json:"warnings"`
|
||||
Failures []string `json:"failures"`
|
||||
StorageWarn int `json:"storage_warn"`
|
||||
StorageFail int `json:"storage_fail"`
|
||||
PCIeWarn int `json:"pcie_warn"`
|
||||
PCIeFail int `json:"pcie_fail"`
|
||||
PSUWarn int `json:"psu_warn"`
|
||||
PSUFail int `json:"psu_fail"`
|
||||
MemoryWarn int `json:"memory_warn"`
|
||||
MemoryFail int `json:"memory_fail"`
|
||||
} `json:"summary"`
|
||||
} `json:"hardware"`
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
||||
if err != nil {
|
||||
return ActionResult{Title: "Health summary", Body: "No audit JSON found."}
|
||||
}
|
||||
var snapshot auditFile
|
||||
var snapshot schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
||||
}
|
||||
|
||||
summary := snapshot.Hardware.Summary
|
||||
summary := collector.BuildHealthSummary(snapshot.Hardware)
|
||||
var body strings.Builder
|
||||
status := summary.Status
|
||||
if status == "" {
|
||||
status = "UNKNOWN"
|
||||
status = "Unknown"
|
||||
}
|
||||
fmt.Fprintf(&body, "Overall: %s\n", status)
|
||||
fmt.Fprintf(&body, "Storage: warn=%d fail=%d\n", summary.StorageWarn, summary.StorageFail)
|
||||
@@ -662,12 +644,12 @@ func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
|
||||
}
|
||||
|
||||
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
class := strings.ToLower(trimPtr(dev.DeviceClass))
|
||||
class := trimPtr(dev.DeviceClass)
|
||||
model := strings.ToLower(trimPtr(dev.Model))
|
||||
vendor := strings.ToLower(trimPtr(dev.Manufacturer))
|
||||
return strings.Contains(class, "vga") ||
|
||||
strings.Contains(class, "3d") ||
|
||||
strings.Contains(class, "display") ||
|
||||
return class == "VideoController" ||
|
||||
class == "DisplayController" ||
|
||||
class == "ProcessingAccelerator" ||
|
||||
strings.Contains(model, "nvidia") ||
|
||||
strings.Contains(vendor, "nvidia") ||
|
||||
strings.Contains(vendor, "amd")
|
||||
|
||||
@@ -371,8 +371,6 @@ func TestFormatSATSummary(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldAuditPath := DefaultAuditJSONPath
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
@@ -386,7 +384,7 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||
t.Fatalf("mkdir sat dir: %v", err)
|
||||
}
|
||||
|
||||
raw := `{"hardware":{"summary":{"status":"WARNING","storage_warn":1,"storage_fail":0,"pcie_warn":0,"pcie_fail":0,"psu_warn":0,"psu_fail":0,"memory_warn":0,"memory_fail":0}}}`
|
||||
raw := `{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"serial_number":"DISK1","status":"Warning"}]}}`
|
||||
if err := os.WriteFile(DefaultAuditJSONPath, []byte(raw), 0644); err != nil {
|
||||
t.Fatalf("write audit json: %v", err)
|
||||
}
|
||||
@@ -401,8 +399,6 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldAuditPath := DefaultAuditJSONPath
|
||||
DefaultAuditJSONPath = filepath.Join(tmp, "audit.json")
|
||||
@@ -413,7 +409,7 @@ func TestMainBanner(t *testing.T) {
|
||||
product := "PowerEdge R760"
|
||||
cpuModel := "Intel Xeon Gold 6430"
|
||||
memoryType := "DDR5"
|
||||
gpuClass := "VGA compatible controller"
|
||||
gpuClass := "VideoController"
|
||||
gpuModel := "NVIDIA H100"
|
||||
|
||||
payload := schema.HardwareIngestRequest{
|
||||
|
||||
@@ -7,13 +7,15 @@ import (
|
||||
"bee/audit/internal/runtimeenv"
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// Run executes all collectors and returns the combined snapshot.
|
||||
// Partial failures are logged as warnings; collection always completes.
|
||||
func Run(runtimeMode runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
start := time.Now()
|
||||
collectedAt := time.Now().UTC().Format(time.RFC3339)
|
||||
slog.Info("audit started")
|
||||
|
||||
snap := schema.HardwareSnapshot{}
|
||||
@@ -27,27 +29,38 @@ func Run(runtimeMode runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.Firmware = append(snap.Firmware, cpuFW...)
|
||||
|
||||
snap.Memory = collectMemory()
|
||||
sensorDoc, err := readSensorsJSONDoc()
|
||||
if err != nil {
|
||||
slog.Info("sensors: unavailable for enrichment", "err", err)
|
||||
}
|
||||
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
|
||||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||
snap.Storage = collectStorage()
|
||||
snap.PCIeDevices = collectPCIe()
|
||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices, snap.Board.SerialNumber)
|
||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
||||
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
||||
snap.PowerSupplies = collectPSUs()
|
||||
snap.Summary = buildHealthSummary(snap)
|
||||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
// remaining collectors added in steps 1.8 – 1.10
|
||||
|
||||
slog.Info("audit completed", "duration", time.Since(start).Round(time.Millisecond))
|
||||
|
||||
sourceType := string(runtimeMode)
|
||||
protocol := "os-direct"
|
||||
|
||||
sourceType := "manual"
|
||||
var targetHost *string
|
||||
if hostname, err := os.Hostname(); err == nil && hostname != "" {
|
||||
targetHost = &hostname
|
||||
}
|
||||
return schema.HardwareIngestRequest{
|
||||
SourceType: &sourceType,
|
||||
Protocol: &protocol,
|
||||
CollectedAt: time.Now().UTC().Format(time.RFC3339),
|
||||
TargetHost: targetHost,
|
||||
CollectedAt: collectedAt,
|
||||
Hardware: snap,
|
||||
}
|
||||
}
|
||||
|
||||
64
audit/internal/collector/contract.go
Normal file
64
audit/internal/collector/contract.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package collector
|
||||
|
||||
import "strings"
|
||||
|
||||
// Status strings assigned to hardware components by the collectors in this
// package.
const (
	// statusOK marks a component that reported no problem.
	statusOK = "OK"
	// statusWarning marks a degraded component, e.g. a disabled CPU or a DIMM
	// with correctable ECC errors.
	statusWarning = "Warning"
	// statusCritical marks a failed component, e.g. a DIMM with uncorrectable
	// ECC errors.
	statusCritical = "Critical"
	// statusUnknown is used when the reported state is missing or unrecognised.
	statusUnknown = "Unknown"
	// statusEmpty marks an unpopulated socket or slot.
	statusEmpty = "Empty"
)
|
||||
|
||||
// mapPCIeDeviceClass converts a free-form PCI class description into the
// canonical device-class name understood by isNICClass, isGPUClass and
// isRAIDClass. Matching is case-insensitive and ignores surrounding
// whitespace.
//
// Blank input yields "". A description that matches no known class is returned
// with surrounding whitespace removed but otherwise unchanged, so values that
// are already canonical pass through untouched.
func mapPCIeDeviceClass(raw string) string {
	trimmed := strings.TrimSpace(raw)
	normalized := strings.ToLower(trimmed)
	switch {
	case normalized == "":
		return ""
	case strings.Contains(normalized, "ethernet controller"):
		return "EthernetController"
	case strings.Contains(normalized, "fibre channel"):
		return "FibreChannelController"
	case strings.Contains(normalized, "network controller"), strings.Contains(normalized, "infiniband controller"):
		return "NetworkController"
	// "mass storage controller" also contains "storage controller", so it has
	// to be tested before the generic storage case or it could never match.
	case strings.Contains(normalized, "mass storage"):
		return "MassStorageController"
	case strings.Contains(normalized, "serial attached scsi"), strings.Contains(normalized, "storage controller"):
		return "StorageController"
	case strings.Contains(normalized, "raid"):
		return "MassStorageController"
	case strings.Contains(normalized, "display controller"):
		return "DisplayController"
	case strings.Contains(normalized, "vga"), strings.Contains(normalized, "3d controller"), strings.Contains(normalized, "video controller"):
		return "VideoController"
	// The singular form also covers the plural "processing accelerators".
	case strings.Contains(normalized, "processing accelerator"):
		return "ProcessingAccelerator"
	default:
		return trimmed
	}
}
|
||||
|
||||
// isNICClass reports whether class, ignoring surrounding whitespace, is one of
// the canonical network device classes.
func isNICClass(class string) bool {
	name := strings.TrimSpace(class)
	return name == "EthernetController" || name == "NetworkController"
}
|
||||
|
||||
// isGPUClass reports whether class, ignoring surrounding whitespace, is one of
// the canonical video, display or accelerator device classes.
func isGPUClass(class string) bool {
	name := strings.TrimSpace(class)
	return name == "VideoController" ||
		name == "DisplayController" ||
		name == "ProcessingAccelerator"
}
|
||||
|
||||
// isRAIDClass reports whether class, ignoring surrounding whitespace, is one of
// the canonical storage controller classes.
func isRAIDClass(class string) bool {
	name := strings.TrimSpace(class)
	return name == "MassStorageController" || name == "StorageController"
}
|
||||
@@ -51,12 +51,14 @@ func parseCPUs(output, boardSerial string) []schema.HardwareCPU {
|
||||
// Returns false if the socket is unpopulated.
|
||||
func parseCPUSection(fields map[string]string, boardSerial string) (schema.HardwareCPU, bool) {
|
||||
status := parseCPUStatus(fields["Status"])
|
||||
if status == "EMPTY" {
|
||||
if status == statusEmpty {
|
||||
return schema.HardwareCPU{}, false
|
||||
}
|
||||
|
||||
cpu := schema.HardwareCPU{}
|
||||
cpu.Status = &status
|
||||
present := true
|
||||
cpu.Present = &present
|
||||
|
||||
if socket, ok := parseSocketIndex(fields["Socket Designation"]); ok {
|
||||
cpu.Socket = &socket
|
||||
@@ -99,15 +101,15 @@ func parseCPUStatus(raw string) string {
|
||||
upper := strings.ToUpper(raw)
|
||||
switch {
|
||||
case upper == "" || upper == "UNKNOWN":
|
||||
return "UNKNOWN"
|
||||
return statusUnknown
|
||||
case strings.Contains(upper, "UNPOPULATED") || strings.Contains(upper, "NOT POPULATED"):
|
||||
return "EMPTY"
|
||||
return statusEmpty
|
||||
case strings.Contains(upper, "ENABLED"):
|
||||
return "OK"
|
||||
return statusOK
|
||||
case strings.Contains(upper, "DISABLED"):
|
||||
return "WARNING"
|
||||
return statusWarning
|
||||
default:
|
||||
return "UNKNOWN"
|
||||
return statusUnknown
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
196
audit/internal/collector/cpu_telemetry.go
Normal file
196
audit/internal/collector/cpu_telemetry.go
Normal file
@@ -0,0 +1,196 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
	// cpuSysBaseDir is the sysfs directory scanned for per-CPU entries. It is a
	// variable so tests can point it at a temporary tree.
	cpuSysBaseDir = "/sys/devices/system/cpu"
	// socketIndexRe captures the number that follows "package id", "socket" or
	// "cpu" (case-insensitive, with optional whitespace in between).
	socketIndexRe = regexp.MustCompile(`(?i)(?:package id|socket|cpu)\s*([0-9]+)`)
)
|
||||
|
||||
func enrichCPUsWithTelemetry(cpus []schema.HardwareCPU, doc sensorsDoc) []schema.HardwareCPU {
|
||||
if len(cpus) == 0 {
|
||||
return cpus
|
||||
}
|
||||
|
||||
tempBySocket := cpuTempsFromSensors(doc, len(cpus))
|
||||
powerBySocket := cpuPowerFromSensors(doc, len(cpus))
|
||||
throttleBySocket := cpuThrottleBySocket()
|
||||
|
||||
for i := range cpus {
|
||||
socket := 0
|
||||
if cpus[i].Socket != nil {
|
||||
socket = *cpus[i].Socket
|
||||
}
|
||||
if value, ok := tempBySocket[socket]; ok {
|
||||
cpus[i].TemperatureC = &value
|
||||
}
|
||||
if value, ok := powerBySocket[socket]; ok {
|
||||
cpus[i].PowerW = &value
|
||||
}
|
||||
if value, ok := throttleBySocket[socket]; ok {
|
||||
cpus[i].Throttled = &value
|
||||
}
|
||||
}
|
||||
|
||||
return cpus
|
||||
}
|
||||
|
||||
func cpuTempsFromSensors(doc sensorsDoc, cpuCount int) map[int]float64 {
|
||||
out := map[int]float64{}
|
||||
if len(doc) == 0 {
|
||||
return out
|
||||
}
|
||||
var fallback []float64
|
||||
for chip, features := range doc {
|
||||
for featureName, raw := range features {
|
||||
feature, ok := raw.(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if classifySensorFeature(feature) != "temp" {
|
||||
continue
|
||||
}
|
||||
temp, ok := firstFeatureFloat(feature, "_input")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if socket, ok := detectCPUSocket(chip, featureName); ok {
|
||||
if _, exists := out[socket]; !exists {
|
||||
out[socket] = temp
|
||||
}
|
||||
continue
|
||||
}
|
||||
if isLikelyCPUTemp(chip, featureName) {
|
||||
fallback = append(fallback, temp)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(out) == 0 && cpuCount == 1 && len(fallback) > 0 {
|
||||
out[0] = fallback[0]
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func cpuPowerFromSensors(doc sensorsDoc, cpuCount int) map[int]float64 {
|
||||
out := map[int]float64{}
|
||||
if len(doc) == 0 {
|
||||
return out
|
||||
}
|
||||
var fallback []float64
|
||||
for chip, features := range doc {
|
||||
for featureName, raw := range features {
|
||||
feature, ok := raw.(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if classifySensorFeature(feature) != "power" {
|
||||
continue
|
||||
}
|
||||
power, ok := firstFeatureFloatWithContains(feature, []string{"power"})
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if socket, ok := detectCPUSocket(chip, featureName); ok {
|
||||
if _, exists := out[socket]; !exists {
|
||||
out[socket] = power
|
||||
}
|
||||
continue
|
||||
}
|
||||
if isLikelyCPUPower(chip, featureName) {
|
||||
fallback = append(fallback, power)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(out) == 0 && cpuCount == 1 && len(fallback) > 0 {
|
||||
out[0] = fallback[0]
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func detectCPUSocket(parts ...string) (int, bool) {
|
||||
for _, part := range parts {
|
||||
matches := socketIndexRe.FindStringSubmatch(strings.ToLower(part))
|
||||
if len(matches) == 2 {
|
||||
value, err := strconv.Atoi(matches[1])
|
||||
if err == nil {
|
||||
return value, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// isLikelyCPUTemp reports whether the chip and feature names, compared
// case-insensitively, contain any marker commonly seen on CPU temperature
// sensors.
func isLikelyCPUTemp(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	markers := []string{"coretemp", "k10temp", "package id", "tdie", "tctl", "cpu temp"}
	for _, marker := range markers {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// isLikelyCPUPower reports whether the chip and feature names, compared
// case-insensitively, contain any marker commonly seen on CPU power sensors.
func isLikelyCPUPower(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	markers := []string{"intel-rapl", "package id", "package-", "cpu power"}
	for _, marker := range markers {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func cpuThrottleBySocket() map[int]bool {
|
||||
out := map[int]bool{}
|
||||
cpuDirs, err := filepath.Glob(filepath.Join(cpuSysBaseDir, "cpu[0-9]*"))
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
sort.Strings(cpuDirs)
|
||||
for _, cpuDir := range cpuDirs {
|
||||
socket, ok := readSocketIndex(cpuDir)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if cpuPackageThrottled(cpuDir) {
|
||||
out[socket] = true
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// readSocketIndex reads topology/physical_package_id below cpuDir and returns
// it as the socket index. It returns (0, false) when the file cannot be read,
// is not an integer, or holds a negative value.
func readSocketIndex(cpuDir string) (int, bool) {
	idPath := filepath.Join(cpuDir, "topology", "physical_package_id")
	data, err := os.ReadFile(idPath)
	if err != nil {
		return 0, false
	}

	id, convErr := strconv.Atoi(strings.TrimSpace(string(data)))
	if convErr == nil && id >= 0 {
		return id, true
	}
	return 0, false
}
|
||||
|
||||
// cpuPackageThrottled reports whether either the package or the core throttle
// counter under cpuDir/thermal_throttle holds a value greater than zero.
// Counters that are missing or unparsable are ignored.
func cpuPackageThrottled(cpuDir string) bool {
	throttleDir := filepath.Join(cpuDir, "thermal_throttle")
	for _, counter := range []string{"package_throttle_count", "core_throttle_count"} {
		data, err := os.ReadFile(filepath.Join(throttleDir, counter))
		if err != nil {
			continue
		}
		count, convErr := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
		if convErr == nil && count > 0 {
			return true
		}
	}
	return false
}
|
||||
71
audit/internal/collector/cpu_telemetry_test.go
Normal file
71
audit/internal/collector/cpu_telemetry_test.go
Normal file
@@ -0,0 +1,71 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestEnrichCPUsWithTelemetry(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldBase := cpuSysBaseDir
|
||||
cpuSysBaseDir = tmp
|
||||
t.Cleanup(func() { cpuSysBaseDir = oldBase })
|
||||
|
||||
mustWriteFile(t, filepath.Join(tmp, "cpu0", "topology", "physical_package_id"), "0\n")
|
||||
mustWriteFile(t, filepath.Join(tmp, "cpu0", "thermal_throttle", "package_throttle_count"), "3\n")
|
||||
mustWriteFile(t, filepath.Join(tmp, "cpu1", "topology", "physical_package_id"), "1\n")
|
||||
mustWriteFile(t, filepath.Join(tmp, "cpu1", "thermal_throttle", "package_throttle_count"), "0\n")
|
||||
|
||||
doc := sensorsDoc{
|
||||
"coretemp-isa-0000": {
|
||||
"Package id 0": map[string]any{"temp1_input": 61.5},
|
||||
"Package id 1": map[string]any{"temp2_input": 58.0},
|
||||
},
|
||||
"intel-rapl-mmio-0": {
|
||||
"Package id 0": map[string]any{"power1_average": 180.0},
|
||||
"Package id 1": map[string]any{"power2_average": 175.0},
|
||||
},
|
||||
}
|
||||
|
||||
socket0 := 0
|
||||
socket1 := 1
|
||||
status := statusOK
|
||||
cpus := []schema.HardwareCPU{
|
||||
{Socket: &socket0, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{Socket: &socket1, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
}
|
||||
|
||||
got := enrichCPUsWithTelemetry(cpus, doc)
|
||||
|
||||
if got[0].TemperatureC == nil || *got[0].TemperatureC != 61.5 {
|
||||
t.Fatalf("cpu0 temperature mismatch: %#v", got[0].TemperatureC)
|
||||
}
|
||||
if got[0].PowerW == nil || *got[0].PowerW != 180.0 {
|
||||
t.Fatalf("cpu0 power mismatch: %#v", got[0].PowerW)
|
||||
}
|
||||
if got[0].Throttled == nil || !*got[0].Throttled {
|
||||
t.Fatalf("cpu0 throttled mismatch: %#v", got[0].Throttled)
|
||||
}
|
||||
if got[1].TemperatureC == nil || *got[1].TemperatureC != 58.0 {
|
||||
t.Fatalf("cpu1 temperature mismatch: %#v", got[1].TemperatureC)
|
||||
}
|
||||
if got[1].PowerW == nil || *got[1].PowerW != 175.0 {
|
||||
t.Fatalf("cpu1 power mismatch: %#v", got[1].PowerW)
|
||||
}
|
||||
if got[1].Throttled != nil && *got[1].Throttled {
|
||||
t.Fatalf("cpu1 throttled mismatch: %#v", got[1].Throttled)
|
||||
}
|
||||
}
|
||||
|
||||
// mustWriteFile creates path's parent directories and writes content to path,
// failing the test immediately if either step errors.
func mustWriteFile(t *testing.T, path, content string) {
	t.Helper()

	parent := filepath.Dir(path)
	if mkErr := os.MkdirAll(parent, 0755); mkErr != nil {
		t.Fatalf("mkdir %s: %v", path, mkErr)
	}

	writeErr := os.WriteFile(path, []byte(content), 0644)
	if writeErr != nil {
		t.Fatalf("write %s: %v", path, writeErr)
	}
}
|
||||
@@ -69,12 +69,12 @@ func TestParseCPUStatus(t *testing.T) {
|
||||
want string
|
||||
}{
|
||||
{"Populated, Enabled", "OK"},
|
||||
{"Populated, Disabled By User", "WARNING"},
|
||||
{"Populated, Disabled By BIOS", "WARNING"},
|
||||
{"Unpopulated", "EMPTY"},
|
||||
{"Not Populated", "EMPTY"},
|
||||
{"Unknown", "UNKNOWN"},
|
||||
{"", "UNKNOWN"},
|
||||
{"Populated, Disabled By User", statusWarning},
|
||||
{"Populated, Disabled By BIOS", statusWarning},
|
||||
{"Unpopulated", statusEmpty},
|
||||
{"Not Populated", statusEmpty},
|
||||
{"Unknown", statusUnknown},
|
||||
{"", statusUnknown},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
got := parseCPUStatus(tt.input)
|
||||
|
||||
179
audit/internal/collector/finalize.go
Normal file
179
audit/internal/collector/finalize.go
Normal file
@@ -0,0 +1,179 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
snap.Memory = filterMemory(snap.Memory)
|
||||
snap.Storage = filterStorage(snap.Storage)
|
||||
snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
|
||||
|
||||
setComponentStatusMetadata(snap, collectedAt)
|
||||
deduplicateComponentSerials(snap)
|
||||
}
|
||||
|
||||
func filterMemory(dimms []schema.HardwareMemory) []schema.HardwareMemory {
|
||||
out := make([]schema.HardwareMemory, 0, len(dimms))
|
||||
for _, dimm := range dimms {
|
||||
if dimm.Present != nil && !*dimm.Present {
|
||||
continue
|
||||
}
|
||||
if dimm.Status != nil && *dimm.Status == statusEmpty {
|
||||
continue
|
||||
}
|
||||
if dimm.SerialNumber == nil || *dimm.SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, dimm)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||
out := make([]schema.HardwareStorage, 0, len(disks))
|
||||
for _, disk := range disks {
|
||||
if disk.SerialNumber == nil || *disk.SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, disk)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||
for _, psu := range psus {
|
||||
if psu.SerialNumber == nil || *psu.SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, psu)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func setComponentStatusMetadata(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
for i := range snap.CPUs {
|
||||
setStatusCheckedAt(&snap.CPUs[i].HardwareComponentStatus, collectedAt)
|
||||
}
|
||||
for i := range snap.Memory {
|
||||
setStatusCheckedAt(&snap.Memory[i].HardwareComponentStatus, collectedAt)
|
||||
}
|
||||
for i := range snap.Storage {
|
||||
setStatusCheckedAt(&snap.Storage[i].HardwareComponentStatus, collectedAt)
|
||||
}
|
||||
for i := range snap.PCIeDevices {
|
||||
setStatusCheckedAt(&snap.PCIeDevices[i].HardwareComponentStatus, collectedAt)
|
||||
}
|
||||
for i := range snap.PowerSupplies {
|
||||
setStatusCheckedAt(&snap.PowerSupplies[i].HardwareComponentStatus, collectedAt)
|
||||
}
|
||||
}
|
||||
|
||||
func setStatusCheckedAt(status *schema.HardwareComponentStatus, collectedAt string) {
|
||||
if status == nil || status.Status == nil || *status.Status == "" {
|
||||
return
|
||||
}
|
||||
if status.StatusCheckedAt == nil {
|
||||
status.StatusCheckedAt = &collectedAt
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicateComponentSerials(snap *schema.HardwareSnapshot) {
|
||||
deduplicateCPUSerials(snap.CPUs)
|
||||
deduplicateMemorySerials(snap.Memory)
|
||||
deduplicateStorageSerials(snap.Storage)
|
||||
deduplicatePCIeSerials(snap.PCIeDevices)
|
||||
deduplicatePSUSerials(snap.PowerSupplies)
|
||||
}
|
||||
|
||||
func deduplicateCPUSerials(items []schema.HardwareCPU) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicateMemorySerials(items []schema.HardwareMemory) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].PartNumber)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicateStorageSerials(items []schema.HardwareStorage) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicatePCIeSerials(items []schema.HardwarePCIeDevice) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func deduplicatePSUSerials(items []schema.HardwarePowerSupply) {
|
||||
seen := map[string]int{}
|
||||
seq := 1
|
||||
for i := range items {
|
||||
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
model := derefString(items[i].Model)
|
||||
key := model + "\x00" + *items[i].SerialNumber
|
||||
seen[key]++
|
||||
if seen[key] > 1 {
|
||||
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||
seq++
|
||||
items[i].SerialNumber = &repl
|
||||
}
|
||||
}
|
||||
}
|
||||
63
audit/internal/collector/finalize_test.go
Normal file
63
audit/internal/collector/finalize_test.go
Normal file
@@ -0,0 +1,63 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
collectedAt := "2026-03-15T12:00:00Z"
|
||||
present := true
|
||||
status := statusOK
|
||||
serial := "SN-1"
|
||||
|
||||
snap := schema.HardwareSnapshot{
|
||||
Memory: []schema.HardwareMemory{
|
||||
{Present: &present, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PowerSupplies: []schema.HardwarePowerSupply{
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
}
|
||||
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
|
||||
}
|
||||
if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
|
||||
}
|
||||
if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFinalizeSnapshotDeduplicatesSerials(t *testing.T) {
|
||||
collectedAt := "2026-03-15T12:00:00Z"
|
||||
status := statusOK
|
||||
model := "Device"
|
||||
serial := "DUPLICATE"
|
||||
|
||||
snap := schema.HardwareSnapshot{
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Model: &model, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{Model: &model, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
}
|
||||
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
if got := *snap.Storage[0].SerialNumber; got != serial {
|
||||
t.Fatalf("first serial changed: %q", got)
|
||||
}
|
||||
if got := *snap.Storage[1].SerialNumber; got != "NO_SN-00000001" {
|
||||
t.Fatalf("duplicate serial mismatch: %q", got)
|
||||
}
|
||||
}
|
||||
@@ -47,12 +47,12 @@ func parseMemorySection(fields map[string]string) schema.HardwareMemory {
|
||||
dimm.Present = &present
|
||||
|
||||
if !present {
|
||||
status := "EMPTY"
|
||||
status := statusEmpty
|
||||
dimm.Status = &status
|
||||
return dimm
|
||||
}
|
||||
|
||||
status := "OK"
|
||||
status := statusOK
|
||||
dimm.Status = &status
|
||||
|
||||
if mb := parseMemorySizeMB(rawSize); mb > 0 {
|
||||
|
||||
203
audit/internal/collector/memory_telemetry.go
Normal file
203
audit/internal/collector/memory_telemetry.go
Normal file
@@ -0,0 +1,203 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// edacBaseDir is the sysfs directory that holds the EDAC memory-controller
// entries (mc*) read by readEDACStats.
var edacBaseDir = "/sys/devices/system/edac/mc"

// edacDIMMStats holds the EDAC data read for one DIMM directory.
type edacDIMMStats struct {
	// Label is the trimmed DIMM label as reported by EDAC.
	Label string
	// CECount is the correctable error count; nil when it could not be read.
	CECount *int64
	// UECount is the uncorrectable error count; nil when it could not be read.
	UECount *int64
}
|
||||
|
||||
func enrichMemoryWithTelemetry(dimms []schema.HardwareMemory, doc sensorsDoc) []schema.HardwareMemory {
|
||||
if len(dimms) == 0 {
|
||||
return dimms
|
||||
}
|
||||
|
||||
tempByLabel := memoryTempsFromSensors(doc)
|
||||
stats := readEDACStats()
|
||||
|
||||
for i := range dimms {
|
||||
labelKeys := dimmMatchKeys(dimms[i].Slot, dimms[i].Location)
|
||||
|
||||
for _, key := range labelKeys {
|
||||
if temp, ok := tempByLabel[key]; ok {
|
||||
dimms[i].TemperatureC = &temp
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
for _, key := range labelKeys {
|
||||
if stat, ok := stats[key]; ok {
|
||||
if stat.CECount != nil {
|
||||
dimms[i].CorrectableECCErrorCount = stat.CECount
|
||||
}
|
||||
if stat.UECount != nil {
|
||||
dimms[i].UncorrectableECCErrorCount = stat.UECount
|
||||
}
|
||||
if stat.UECount != nil && *stat.UECount > 0 {
|
||||
dimms[i].DataLossDetected = boolPtr(true)
|
||||
status := statusCritical
|
||||
dimms[i].Status = &status
|
||||
if dimms[i].ErrorDescription == nil {
|
||||
dimms[i].ErrorDescription = stringPtr("EDAC reports uncorrectable ECC errors")
|
||||
}
|
||||
} else if stat.CECount != nil && *stat.CECount > 0 && (dimms[i].Status == nil || *dimms[i].Status == statusOK) {
|
||||
status := statusWarning
|
||||
dimms[i].Status = &status
|
||||
if dimms[i].ErrorDescription == nil {
|
||||
dimms[i].ErrorDescription = stringPtr("EDAC reports correctable ECC errors")
|
||||
}
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return dimms
|
||||
}
|
||||
|
||||
func memoryTempsFromSensors(doc sensorsDoc) map[string]float64 {
|
||||
out := map[string]float64{}
|
||||
if len(doc) == 0 {
|
||||
return out
|
||||
}
|
||||
for chip, features := range doc {
|
||||
for featureName, raw := range features {
|
||||
feature, ok := raw.(map[string]any)
|
||||
if !ok || classifySensorFeature(feature) != "temp" {
|
||||
continue
|
||||
}
|
||||
if !isLikelyMemoryTemp(chip, featureName) {
|
||||
continue
|
||||
}
|
||||
temp, ok := firstFeatureFloat(feature, "_input")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
key := canonicalLabel(featureName)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
if _, exists := out[key]; !exists {
|
||||
out[key] = temp
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func readEDACStats() map[string]edacDIMMStats {
|
||||
out := map[string]edacDIMMStats{}
|
||||
mcDirs, err := filepath.Glob(filepath.Join(edacBaseDir, "mc*"))
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
sort.Strings(mcDirs)
|
||||
for _, mcDir := range mcDirs {
|
||||
dimmDirs, err := filepath.Glob(filepath.Join(mcDir, "dimm*"))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sort.Strings(dimmDirs)
|
||||
for _, dimmDir := range dimmDirs {
|
||||
stat, ok := readEDACDIMMStats(dimmDir)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
key := canonicalLabel(stat.Label)
|
||||
if key == "" {
|
||||
continue
|
||||
}
|
||||
out[key] = stat
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func readEDACDIMMStats(dimmDir string) (edacDIMMStats, bool) {
|
||||
labelBytes, err := os.ReadFile(filepath.Join(dimmDir, "dimm_label"))
|
||||
if err != nil {
|
||||
labelBytes, err = os.ReadFile(filepath.Join(dimmDir, "label"))
|
||||
if err != nil {
|
||||
return edacDIMMStats{}, false
|
||||
}
|
||||
}
|
||||
label := strings.TrimSpace(string(labelBytes))
|
||||
if label == "" {
|
||||
return edacDIMMStats{}, false
|
||||
}
|
||||
|
||||
stat := edacDIMMStats{Label: label}
|
||||
if value, ok := readEDACCount(dimmDir, []string{"dimm_ce_count", "ce_count"}); ok {
|
||||
stat.CECount = &value
|
||||
}
|
||||
if value, ok := readEDACCount(dimmDir, []string{"dimm_ue_count", "ue_count"}); ok {
|
||||
stat.UECount = &value
|
||||
}
|
||||
return stat, true
|
||||
}
|
||||
|
||||
// readEDACCount returns the first usable counter found among the files
// names (tried in order) inside dir.
//
// A file is usable when it can be read and its trimmed content parses as a
// non-negative base-10 int64. Unreadable, unparsable or negative entries are
// skipped. It returns (0, false) when no candidate qualifies.
func readEDACCount(dir string, names []string) (int64, bool) {
	for i := range names {
		data, readErr := os.ReadFile(filepath.Join(dir, names[i]))
		if readErr != nil {
			continue
		}
		text := strings.TrimSpace(string(data))
		count, parseErr := strconv.ParseInt(text, 10, 64)
		if parseErr != nil || count < 0 {
			continue
		}
		return count, true
	}
	return 0, false
}
|
||||
|
||||
func dimmMatchKeys(slot, location *string) []string {
|
||||
var out []string
|
||||
add := func(value *string) {
|
||||
key := canonicalLabel(derefString(value))
|
||||
if key == "" {
|
||||
return
|
||||
}
|
||||
for _, existing := range out {
|
||||
if existing == key {
|
||||
return
|
||||
}
|
||||
}
|
||||
out = append(out, key)
|
||||
}
|
||||
add(slot)
|
||||
add(location)
|
||||
return out
|
||||
}
|
||||
|
||||
// canonicalLabel normalises a label for matching: the input is trimmed and
// upper-cased, and every rune other than A-Z and 0-9 is dropped. Labels that
// contain no such rune yield "".
func canonicalLabel(value string) string {
	keepAlnum := func(r rune) rune {
		if ('A' <= r && r <= 'Z') || ('0' <= r && r <= '9') {
			return r
		}
		return -1 // a negative rune tells strings.Map to drop the character
	}
	return strings.Map(keepAlnum, strings.ToUpper(strings.TrimSpace(value)))
}
|
||||
|
||||
// isLikelyMemoryTemp reports whether the chip name or the feature name
// mentions a DIMM ("dimm" or "sodimm", case-insensitively).
func isLikelyMemoryTemp(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	for _, marker := range []string{"dimm", "sodimm"} {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
|
||||
|
||||
// boolPtr returns a pointer to a fresh bool holding value.
func boolPtr(value bool) *bool {
	p := new(bool)
	*p = value
	return p
}
|
||||
61
audit/internal/collector/memory_telemetry_test.go
Normal file
61
audit/internal/collector/memory_telemetry_test.go
Normal file
@@ -0,0 +1,61 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestEnrichMemoryWithTelemetry(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldBase := edacBaseDir
|
||||
edacBaseDir = tmp
|
||||
t.Cleanup(func() { edacBaseDir = oldBase })
|
||||
|
||||
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_label"), "CPU0_DIMM_A1\n")
|
||||
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_ce_count"), "7\n")
|
||||
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_ue_count"), "0\n")
|
||||
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_label"), "CPU1_DIMM_B2\n")
|
||||
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_ce_count"), "0\n")
|
||||
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_ue_count"), "2\n")
|
||||
|
||||
doc := sensorsDoc{
|
||||
"jc42-i2c-0-18": {
|
||||
"CPU0 DIMM A1": map[string]any{"temp1_input": 43.0},
|
||||
"CPU1 DIMM B2": map[string]any{"temp2_input": 46.0},
|
||||
},
|
||||
}
|
||||
|
||||
status := statusOK
|
||||
slotA := "CPU0_DIMM_A1"
|
||||
slotB := "CPU1_DIMM_B2"
|
||||
dimms := []schema.HardwareMemory{
|
||||
{Slot: &slotA, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{Slot: &slotB, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
}
|
||||
|
||||
got := enrichMemoryWithTelemetry(dimms, doc)
|
||||
|
||||
if got[0].TemperatureC == nil || *got[0].TemperatureC != 43.0 {
|
||||
t.Fatalf("dimm0 temperature mismatch: %#v", got[0].TemperatureC)
|
||||
}
|
||||
if got[0].CorrectableECCErrorCount == nil || *got[0].CorrectableECCErrorCount != 7 {
|
||||
t.Fatalf("dimm0 ce mismatch: %#v", got[0].CorrectableECCErrorCount)
|
||||
}
|
||||
if got[0].Status == nil || *got[0].Status != statusWarning {
|
||||
t.Fatalf("dimm0 status mismatch: %#v", got[0].Status)
|
||||
}
|
||||
if got[1].TemperatureC == nil || *got[1].TemperatureC != 46.0 {
|
||||
t.Fatalf("dimm1 temperature mismatch: %#v", got[1].TemperatureC)
|
||||
}
|
||||
if got[1].UncorrectableECCErrorCount == nil || *got[1].UncorrectableECCErrorCount != 2 {
|
||||
t.Fatalf("dimm1 ue mismatch: %#v", got[1].UncorrectableECCErrorCount)
|
||||
}
|
||||
if got[1].Status == nil || *got[1].Status != statusCritical {
|
||||
t.Fatalf("dimm1 status mismatch: %#v", got[1].Status)
|
||||
}
|
||||
if got[1].DataLossDetected == nil || !*got[1].DataLossDetected {
|
||||
t.Fatalf("dimm1 data_loss_detected mismatch: %#v", got[1].DataLossDetected)
|
||||
}
|
||||
}
|
||||
@@ -18,17 +18,13 @@ var (
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
readNetStatFile = func(iface, key string) (int64, error) {
|
||||
path := filepath.Join("/sys/class/net", iface, "statistics", key)
|
||||
readNetAddressFile = func(iface string) (string, error) {
|
||||
path := filepath.Join("/sys/class/net", iface, "address")
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
return "", err
|
||||
}
|
||||
v, err := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return v, nil
|
||||
return strings.TrimSpace(string(raw)), nil
|
||||
}
|
||||
)
|
||||
|
||||
@@ -47,6 +43,7 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
||||
continue
|
||||
}
|
||||
iface := ifaces[0]
|
||||
devs[i].MacAddresses = collectInterfaceMACs(ifaces)
|
||||
|
||||
if devs[i].Firmware == nil {
|
||||
if out, err := ethtoolInfoQuery(iface); err == nil {
|
||||
@@ -56,16 +53,13 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
||||
}
|
||||
}
|
||||
|
||||
if devs[i].Telemetry == nil {
|
||||
devs[i].Telemetry = map[string]any{}
|
||||
}
|
||||
injectNICPacketStats(devs[i].Telemetry, iface)
|
||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||
injectSFPDOMTelemetry(devs[i].Telemetry, out)
|
||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||
enriched++
|
||||
continue
|
||||
}
|
||||
}
|
||||
if len(devs[i].Telemetry) == 0 {
|
||||
devs[i].Telemetry = nil
|
||||
} else {
|
||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||
enriched++
|
||||
}
|
||||
}
|
||||
@@ -77,31 +71,32 @@ func isNICDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
c := strings.ToLower(strings.TrimSpace(*dev.DeviceClass))
|
||||
return strings.Contains(c, "ethernet controller") ||
|
||||
strings.Contains(c, "network controller") ||
|
||||
strings.Contains(c, "infiniband controller")
|
||||
c := strings.TrimSpace(*dev.DeviceClass)
|
||||
return isNICClass(c) || strings.EqualFold(c, "FibreChannelController")
|
||||
}
|
||||
|
||||
func injectNICPacketStats(dst map[string]any, iface string) {
|
||||
for _, key := range []string{"rx_packets", "tx_packets", "rx_errors", "tx_errors"} {
|
||||
if v, err := readNetStatFile(iface, key); err == nil {
|
||||
dst[key] = v
|
||||
func collectInterfaceMACs(ifaces []string) []string {
|
||||
seen := map[string]struct{}{}
|
||||
var out []string
|
||||
for _, iface := range ifaces {
|
||||
mac, err := readNetAddressFile(iface)
|
||||
if err != nil || mac == "" {
|
||||
continue
|
||||
}
|
||||
mac = strings.ToLower(strings.TrimSpace(mac))
|
||||
if _, ok := seen[mac]; ok {
|
||||
continue
|
||||
}
|
||||
seen[mac] = struct{}{}
|
||||
out = append(out, mac)
|
||||
}
|
||||
}
|
||||
|
||||
func injectSFPDOMTelemetry(dst map[string]any, raw string) {
|
||||
parsed := parseSFPDOM(raw)
|
||||
for k, v := range parsed {
|
||||
dst[k] = v
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
var floatRe = regexp.MustCompile(`[-+]?[0-9]*\.?[0-9]+`)
|
||||
|
||||
func parseSFPDOM(raw string) map[string]any {
|
||||
out := map[string]any{}
|
||||
func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||
var changed bool
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if trimmed == "" {
|
||||
@@ -117,26 +112,55 @@ func parseSFPDOM(raw string) map[string]any {
|
||||
switch {
|
||||
case strings.Contains(key, "module temperature"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
out["sfp_temperature_c"] = f
|
||||
dev.SFPTemperatureC = &f
|
||||
changed = true
|
||||
}
|
||||
case strings.Contains(key, "laser output power"):
|
||||
if f, ok := dbmValue(val); ok {
|
||||
out["sfp_tx_power_dbm"] = f
|
||||
dev.SFPTXPowerDBM = &f
|
||||
changed = true
|
||||
}
|
||||
case strings.Contains(key, "receiver signal"):
|
||||
if f, ok := dbmValue(val); ok {
|
||||
out["sfp_rx_power_dbm"] = f
|
||||
dev.SFPRXPowerDBM = &f
|
||||
changed = true
|
||||
}
|
||||
case strings.Contains(key, "module voltage"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
out["sfp_voltage_v"] = f
|
||||
dev.SFPVoltageV = &f
|
||||
changed = true
|
||||
}
|
||||
case strings.Contains(key, "laser bias current"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
out["sfp_bias_ma"] = f
|
||||
dev.SFPBiasMA = &f
|
||||
changed = true
|
||||
}
|
||||
}
|
||||
}
|
||||
return changed
|
||||
}
|
||||
|
||||
func parseSFPDOM(raw string) map[string]any {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||
return map[string]any{}
|
||||
}
|
||||
out := map[string]any{}
|
||||
if dev.SFPTemperatureC != nil {
|
||||
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||
}
|
||||
if dev.SFPTXPowerDBM != nil {
|
||||
out["sfp_tx_power_dbm"] = *dev.SFPTXPowerDBM
|
||||
}
|
||||
if dev.SFPRXPowerDBM != nil {
|
||||
out["sfp_rx_power_dbm"] = *dev.SFPRXPowerDBM
|
||||
}
|
||||
if dev.SFPVoltageV != nil {
|
||||
out["sfp_voltage_v"] = *dev.SFPVoltageV
|
||||
}
|
||||
if dev.SFPBiasMA != nil {
|
||||
out["sfp_bias_ma"] = *dev.SFPBiasMA
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ type nvidiaGPUInfo struct {
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
// If the driver/tool is unavailable, NVIDIA devices get UNKNOWN status and
|
||||
// If the driver/tool is unavailable, NVIDIA devices get Unknown status and
|
||||
// a stable serial fallback based on board serial + slot.
|
||||
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice {
|
||||
if !hasNVIDIADevices(devs) {
|
||||
@@ -78,9 +78,10 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
devs[i].Firmware = &v
|
||||
}
|
||||
|
||||
status := "OK"
|
||||
status := statusOK
|
||||
if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
|
||||
status = "WARNING"
|
||||
status = statusWarning
|
||||
devs[i].ErrorDescription = stringPtr("GPU reports uncorrected ECC errors")
|
||||
}
|
||||
devs[i].Status = &status
|
||||
injectNVIDIATelemetry(&devs[i], info)
|
||||
@@ -214,7 +215,7 @@ func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
||||
|
||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
||||
setPCIeFallbackSerial(dev, boardSerial)
|
||||
status := "UNKNOWN"
|
||||
status := statusUnknown
|
||||
dev.Status = &status
|
||||
}
|
||||
|
||||
@@ -233,25 +234,19 @@ func setPCIeFallbackSerial(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
||||
}
|
||||
|
||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if dev.Telemetry == nil {
|
||||
dev.Telemetry = map[string]any{}
|
||||
}
|
||||
if info.TemperatureC != nil {
|
||||
dev.Telemetry["temperature_c"] = *info.TemperatureC
|
||||
dev.TemperatureC = info.TemperatureC
|
||||
}
|
||||
if info.PowerW != nil {
|
||||
dev.Telemetry["power_w"] = *info.PowerW
|
||||
dev.PowerW = info.PowerW
|
||||
}
|
||||
if info.ECCUncorrected != nil {
|
||||
dev.Telemetry["ecc_uncorrected_total"] = *info.ECCUncorrected
|
||||
dev.ECCUncorrectedTotal = info.ECCUncorrected
|
||||
}
|
||||
if info.ECCCorrected != nil {
|
||||
dev.Telemetry["ecc_corrected_total"] = *info.ECCCorrected
|
||||
dev.ECCCorrectedTotal = info.ECCCorrected
|
||||
}
|
||||
if info.HWSlowdown != nil {
|
||||
dev.Telemetry["hw_slowdown_active"] = *info.HWSlowdown
|
||||
}
|
||||
if len(dev.Telemetry) == 0 {
|
||||
dev.Telemetry = nil
|
||||
dev.HWSlowdown = info.HWSlowdown
|
||||
}
|
||||
}
|
||||
|
||||
@@ -54,10 +54,10 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
status := "OK"
|
||||
devices := []schema.HardwarePCIeDevice{
|
||||
{
|
||||
VendorID: &vendorID,
|
||||
BDF: &bdf,
|
||||
Manufacturer: &manufacturer,
|
||||
Status: &status,
|
||||
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||
VendorID: &vendorID,
|
||||
BDF: &bdf,
|
||||
Manufacturer: &manufacturer,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -80,14 +80,14 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
||||
t.Fatalf("firmware: got %v", out[0].Firmware)
|
||||
}
|
||||
if out[0].Status == nil || *out[0].Status != "WARNING" {
|
||||
if out[0].Status == nil || *out[0].Status != statusWarning {
|
||||
t.Fatalf("status: got %v", out[0].Status)
|
||||
}
|
||||
if out[0].Telemetry == nil {
|
||||
t.Fatal("expected telemetry")
|
||||
if out[0].ECCUncorrectedTotal == nil || *out[0].ECCUncorrectedTotal != 2 {
|
||||
t.Fatalf("ecc_uncorrected_total: got %#v", out[0].ECCUncorrectedTotal)
|
||||
}
|
||||
if got, ok := out[0].Telemetry["ecc_uncorrected_total"].(int64); !ok || got != 2 {
|
||||
t.Fatalf("ecc_uncorrected_total: got %#v", out[0].Telemetry["ecc_uncorrected_total"])
|
||||
if out[0].TemperatureC == nil || *out[0].TemperatureC != 55.5 {
|
||||
t.Fatalf("temperature_c: got %#v", out[0].TemperatureC)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -107,7 +107,7 @@ func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "BOARD-123-PCIE-0000:17:00.0" {
|
||||
t.Fatalf("fallback serial: got %v", out[0].SerialNumber)
|
||||
}
|
||||
if out[0].Status == nil || *out[0].Status != "UNKNOWN" {
|
||||
if out[0].Status == nil || *out[0].Status != statusUnknown {
|
||||
t.Fatalf("fallback status: got %v", out[0].Status)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -79,7 +79,7 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
present := true
|
||||
dev.Present = &present
|
||||
status := "OK"
|
||||
status := statusOK
|
||||
dev.Status = &status
|
||||
|
||||
// Slot is the BDF: "0000:00:02.0"
|
||||
@@ -93,10 +93,32 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
if deviceID != 0 {
|
||||
dev.DeviceID = &deviceID
|
||||
}
|
||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||
dev.LinkWidth = &width
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
|
||||
dev.MaxLinkWidth = &width
|
||||
}
|
||||
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
|
||||
linkSpeed := normalizePCILinkSpeed(speed)
|
||||
if linkSpeed != "" {
|
||||
dev.LinkSpeed = &linkSpeed
|
||||
}
|
||||
}
|
||||
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
|
||||
linkSpeed := normalizePCILinkSpeed(speed)
|
||||
if linkSpeed != "" {
|
||||
dev.MaxLinkSpeed = &linkSpeed
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if v := fields["Class"]; v != "" {
|
||||
dev.DeviceClass = &v
|
||||
class := mapPCIeDeviceClass(v)
|
||||
dev.DeviceClass = &class
|
||||
}
|
||||
if v := fields["Vendor"]; v != "" {
|
||||
dev.Manufacturer = &v
|
||||
@@ -131,3 +153,55 @@ func readHexFile(path string) (int, error) {
|
||||
n, err := strconv.ParseInt(s, 16, 64)
|
||||
return int(n), err
|
||||
}
|
||||
|
||||
func readPCINumaNode(bdf string) (int, bool) {
|
||||
value, ok := readPCIIntAttribute(bdf, "numa_node")
|
||||
if !ok || value < 0 {
|
||||
return 0, false
|
||||
}
|
||||
return value, true
|
||||
}
|
||||
|
||||
// readPCIIntAttribute reads /sys/bus/pci/devices/<bdf>/<attribute> and
// parses it as a non-negative decimal integer.
//
// It returns (0, false) when the read fails, the content is not an integer,
// or the value is negative.
//
// NOTE(review): the file is read by spawning "cat"; a direct file read would
// avoid the subprocess — confirm there is no reason for the indirection.
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
	path := "/sys/bus/pci/devices/" + bdf + "/" + attribute
	raw, err := exec.Command("cat", path).Output()
	if err != nil {
		return 0, false
	}
	parsed, convErr := strconv.Atoi(strings.TrimSpace(string(raw)))
	if convErr == nil && parsed >= 0 {
		return parsed, true
	}
	return 0, false
}
|
||||
|
||||
// readPCIStringAttribute reads /sys/bus/pci/devices/<bdf>/<attribute> and
// returns its content with surrounding whitespace removed.
//
// It returns ("", false) when the read fails or the trimmed content is
// empty.
//
// NOTE(review): the file is read by spawning "cat"; a direct file read would
// avoid the subprocess — confirm there is no reason for the indirection.
func readPCIStringAttribute(bdf, attribute string) (string, bool) {
	path := "/sys/bus/pci/devices/" + bdf + "/" + attribute
	raw, err := exec.Command("cat", path).Output()
	if err != nil {
		return "", false
	}
	if text := strings.TrimSpace(string(raw)); text != "" {
		return text, true
	}
	return "", false
}
|
||||
|
||||
// normalizePCILinkSpeed converts a PCIe link speed string such as
// "8.0 GT/s PCIe" into a generation label ("Gen1" .. "Gen6").
//
// The transfer rate is parsed numerically rather than matched as a
// substring, so that:
//   - spellings without a decimal part ("5 GT/s", "8GT/s,") are recognised;
//   - rates that merely contain a known rate as a substring (for example
//     "128.0", which contains "8.0") are not misreported.
//
// Whitespace-separated fields are examined left to right and the first one
// that parses to a known rate decides the result. It returns "" when no
// field carries a known rate.
func normalizePCILinkSpeed(raw string) string {
	for _, field := range strings.Fields(strings.ToLower(raw)) {
		// Tolerate a glued unit and trailing punctuation, e.g. "8gt/s,".
		number := strings.TrimSuffix(strings.TrimRight(field, ",;"), "gt/s")
		rate, err := strconv.ParseFloat(number, 64)
		if err != nil {
			continue
		}
		// All known rates are exactly representable as float64, so direct
		// comparison is safe.
		switch rate {
		case 2.5:
			return "Gen1"
		case 5:
			return "Gen2"
		case 8:
			return "Gen3"
		case 16:
			return "Gen4"
		case 32:
			return "Gen5"
		case 64:
			return "Gen6"
		}
	}
	return ""
}
|
||||
|
||||
@@ -35,7 +35,27 @@ func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 filtered device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VGA compatible controller" {
|
||||
if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VideoController" {
|
||||
t.Fatalf("unexpected remaining class: %v", devs[0].DeviceClass)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||
tests := []struct {
|
||||
raw string
|
||||
want string
|
||||
}{
|
||||
{"2.5 GT/s PCIe", "Gen1"},
|
||||
{"5.0 GT/s PCIe", "Gen2"},
|
||||
{"8.0 GT/s PCIe", "Gen3"},
|
||||
{"16.0 GT/s PCIe", "Gen4"},
|
||||
{"32.0 GT/s PCIe", "Gen5"},
|
||||
{"64.0 GT/s PCIe", "Gen6"},
|
||||
{"unknown", ""},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
if got := normalizePCILinkSpeed(tt.raw); got != tt.want {
|
||||
t.Fatalf("normalizePCILinkSpeed(%q)=%q want %q", tt.raw, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -114,7 +114,7 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
||||
}
|
||||
}
|
||||
|
||||
status := "OK"
|
||||
status := statusOK
|
||||
psu.Status = &status
|
||||
|
||||
return psu, true
|
||||
@@ -123,9 +123,12 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
||||
type psuSDR struct {
|
||||
slot int
|
||||
status string
|
||||
reason string
|
||||
inputPowerW *float64
|
||||
outputPowerW *float64
|
||||
inputVoltage *float64
|
||||
temperatureC *float64
|
||||
healthPct *float64
|
||||
}
|
||||
|
||||
var psuSlotRe = regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b|\bps\s*([0-9]+)\b`)
|
||||
@@ -148,10 +151,11 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
entry := out[slot]
|
||||
entry.slot = slot
|
||||
if entry.status == "" {
|
||||
entry.status = "OK"
|
||||
entry.status = statusOK
|
||||
}
|
||||
if state != "" && state != "ok" && state != "ns" {
|
||||
entry.status = "FAILED"
|
||||
entry.status = statusCritical
|
||||
entry.reason = "PSU sensor reported non-OK state: " + state
|
||||
}
|
||||
|
||||
lowerName := strings.ToLower(name)
|
||||
@@ -162,6 +166,10 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
||||
entry.outputPowerW = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||
entry.inputVoltage = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "temp"):
|
||||
entry.temperatureC = parseFloatPtr(value)
|
||||
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||
entry.healthPct = parsePercentPtr(value)
|
||||
}
|
||||
out[slot] = entry
|
||||
}
|
||||
@@ -187,12 +195,23 @@ func mergePSUSDR(psus []schema.HardwarePowerSupply, sdr map[int]psuSDR) {
|
||||
if entry.inputVoltage != nil {
|
||||
psus[i].InputVoltage = entry.inputVoltage
|
||||
}
|
||||
if entry.temperatureC != nil {
|
||||
psus[i].TemperatureC = entry.temperatureC
|
||||
}
|
||||
if entry.healthPct != nil {
|
||||
psus[i].LifeRemainingPct = entry.healthPct
|
||||
lifeUsed := 100 - *entry.healthPct
|
||||
psus[i].LifeUsedPct = &lifeUsed
|
||||
}
|
||||
if entry.status != "" {
|
||||
psus[i].Status = &entry.status
|
||||
}
|
||||
if psus[i].Status != nil && *psus[i].Status == "OK" {
|
||||
if entry.reason != "" {
|
||||
psus[i].ErrorDescription = &entry.reason
|
||||
}
|
||||
if psus[i].Status != nil && *psus[i].Status == statusOK {
|
||||
if (entry.inputPowerW == nil && entry.outputPowerW == nil && entry.inputVoltage == nil) && entry.status == "" {
|
||||
unknown := "UNKNOWN"
|
||||
unknown := statusUnknown
|
||||
psus[i].Status = &unknown
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,8 @@ func TestParsePSUSDR(t *testing.T) {
|
||||
PS1 Input Power | 215 Watts | ok
|
||||
PS1 Output Power | 198 Watts | ok
|
||||
PS1 Input Voltage | 229 Volts | ok
|
||||
PS1 Temp | 39 C | ok
|
||||
PS1 Health | 97 % | ok
|
||||
PS2 Input Power | 0 Watts | cr
|
||||
`
|
||||
|
||||
@@ -14,7 +16,7 @@ PS2 Input Power | 0 Watts | cr
|
||||
if len(got) != 2 {
|
||||
t.Fatalf("len(got)=%d want 2", len(got))
|
||||
}
|
||||
if got[1].status != "OK" {
|
||||
if got[1].status != statusOK {
|
||||
t.Fatalf("ps1 status=%q", got[1].status)
|
||||
}
|
||||
if got[1].inputPowerW == nil || *got[1].inputPowerW != 215 {
|
||||
@@ -26,7 +28,13 @@ PS2 Input Power | 0 Watts | cr
|
||||
if got[1].inputVoltage == nil || *got[1].inputVoltage != 229 {
|
||||
t.Fatalf("ps1 input voltage=%v", got[1].inputVoltage)
|
||||
}
|
||||
if got[2].status != "FAILED" {
|
||||
if got[1].temperatureC == nil || *got[1].temperatureC != 39 {
|
||||
t.Fatalf("ps1 temperature=%v", got[1].temperatureC)
|
||||
}
|
||||
if got[1].healthPct == nil || *got[1].healthPct != 97 {
|
||||
t.Fatalf("ps1 health=%v", got[1].healthPct)
|
||||
}
|
||||
if got[2].status != statusCritical {
|
||||
t.Fatalf("ps2 status=%q", got[2].status)
|
||||
}
|
||||
}
|
||||
|
||||
132
audit/internal/collector/psu_telemetry.go
Normal file
132
audit/internal/collector/psu_telemetry.go
Normal file
@@ -0,0 +1,132 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func enrichPSUsWithTelemetry(psus []schema.HardwarePowerSupply, doc sensorsDoc) []schema.HardwarePowerSupply {
|
||||
if len(psus) == 0 || len(doc) == 0 {
|
||||
return psus
|
||||
}
|
||||
|
||||
tempBySlot := psuTempsFromSensors(doc)
|
||||
healthBySlot := psuHealthFromSensors(doc)
|
||||
for i := range psus {
|
||||
slot := derefPSUSlot(psus[i].Slot)
|
||||
if slot == "" {
|
||||
continue
|
||||
}
|
||||
if psus[i].TemperatureC == nil {
|
||||
if value, ok := tempBySlot[slot]; ok {
|
||||
psus[i].TemperatureC = &value
|
||||
}
|
||||
}
|
||||
if psus[i].LifeRemainingPct == nil {
|
||||
if value, ok := healthBySlot[slot]; ok {
|
||||
psus[i].LifeRemainingPct = &value
|
||||
used := 100 - value
|
||||
psus[i].LifeUsedPct = &used
|
||||
}
|
||||
}
|
||||
}
|
||||
return psus
|
||||
}
|
||||
|
||||
func psuHealthFromSensors(doc sensorsDoc) map[string]float64 {
|
||||
out := map[string]float64{}
|
||||
for chip, features := range doc {
|
||||
for featureName, raw := range features {
|
||||
feature, ok := raw.(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if !isLikelyPSUHealth(chip, featureName) {
|
||||
continue
|
||||
}
|
||||
value, ok := firstFeaturePercent(feature)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if slot, ok := detectPSUSlot(chip, featureName); ok {
|
||||
if _, exists := out[slot]; !exists {
|
||||
out[slot] = value
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func firstFeaturePercent(feature map[string]any) (float64, bool) {
|
||||
keys := sortedFeatureKeys(feature)
|
||||
for _, key := range keys {
|
||||
lower := strings.ToLower(key)
|
||||
if strings.HasSuffix(lower, "_alarm") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(lower, "health") || strings.Contains(lower, "life") || strings.Contains(lower, "remain") {
|
||||
if value, ok := floatFromAny(feature[key]); ok {
|
||||
return value, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// isLikelyPSUHealth reports whether the combined chip and feature names
// (case-insensitively) mention a power supply ("psu" or "power supply") and
// a health indicator ("health", "life" or "remain").
func isLikelyPSUHealth(chip, feature string) bool {
	text := strings.ToLower(chip + " " + feature)
	if !strings.Contains(text, "psu") && !strings.Contains(text, "power supply") {
		return false
	}
	for _, marker := range []string{"health", "life", "remain"} {
		if strings.Contains(text, marker) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func psuTempsFromSensors(doc sensorsDoc) map[string]float64 {
|
||||
out := map[string]float64{}
|
||||
for chip, features := range doc {
|
||||
for featureName, raw := range features {
|
||||
feature, ok := raw.(map[string]any)
|
||||
if !ok || classifySensorFeature(feature) != "temp" {
|
||||
continue
|
||||
}
|
||||
if !isLikelyPSUTemp(chip, featureName) {
|
||||
continue
|
||||
}
|
||||
temp, ok := firstFeatureFloat(feature, "_input")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if slot, ok := detectPSUSlot(chip, featureName); ok {
|
||||
if _, exists := out[slot]; !exists {
|
||||
out[slot] = temp
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// isLikelyPSUTemp reports whether the combined chip and feature names
// (case-insensitively) mention a power supply ("psu" or "power supply").
func isLikelyPSUTemp(chip, feature string) bool {
	text := strings.ToLower(chip + " " + feature)
	if strings.Contains(text, "psu") {
		return true
	}
	return strings.Contains(text, "power supply")
}
|
||||
|
||||
func detectPSUSlot(parts ...string) (string, bool) {
|
||||
for _, part := range parts {
|
||||
lower := strings.ToLower(part)
|
||||
matches := psuSlotRe.FindStringSubmatch(lower)
|
||||
if len(matches) == 0 {
|
||||
continue
|
||||
}
|
||||
for _, group := range matches[1:] {
|
||||
if group == "" {
|
||||
continue
|
||||
}
|
||||
value, err := strconv.Atoi(group)
|
||||
if err == nil && value > 0 {
|
||||
return strconv.Itoa(value - 1), true
|
||||
}
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
42
audit/internal/collector/psu_telemetry_test.go
Normal file
42
audit/internal/collector/psu_telemetry_test.go
Normal file
@@ -0,0 +1,42 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestEnrichPSUsWithTelemetry(t *testing.T) {
|
||||
slot0 := "0"
|
||||
slot1 := "1"
|
||||
psus := []schema.HardwarePowerSupply{
|
||||
{Slot: &slot0},
|
||||
{Slot: &slot1},
|
||||
}
|
||||
|
||||
doc := sensorsDoc{
|
||||
"psu-hwmon-0": {
|
||||
"PSU1 Temp": map[string]any{"temp1_input": 39.5},
|
||||
"PSU2 Temp": map[string]any{"temp2_input": 41.0},
|
||||
"PSU1 Health": map[string]any{"health1_input": 98.0},
|
||||
"PSU2 Remaining Life": map[string]any{"life2_input": 95.0},
|
||||
},
|
||||
}
|
||||
|
||||
got := enrichPSUsWithTelemetry(psus, doc)
|
||||
if got[0].TemperatureC == nil || *got[0].TemperatureC != 39.5 {
|
||||
t.Fatalf("psu0 temperature mismatch: %#v", got[0].TemperatureC)
|
||||
}
|
||||
if got[1].TemperatureC == nil || *got[1].TemperatureC != 41.0 {
|
||||
t.Fatalf("psu1 temperature mismatch: %#v", got[1].TemperatureC)
|
||||
}
|
||||
if got[0].LifeRemainingPct == nil || *got[0].LifeRemainingPct != 98.0 {
|
||||
t.Fatalf("psu0 life remaining mismatch: %#v", got[0].LifeRemainingPct)
|
||||
}
|
||||
if got[0].LifeUsedPct == nil || *got[0].LifeUsedPct != 2.0 {
|
||||
t.Fatalf("psu0 life used mismatch: %#v", got[0].LifeUsedPct)
|
||||
}
|
||||
if got[1].LifeRemainingPct == nil || *got[1].LifeRemainingPct != 95.0 {
|
||||
t.Fatalf("psu1 life remaining mismatch: %#v", got[1].LifeRemainingPct)
|
||||
}
|
||||
}
|
||||
@@ -83,11 +83,7 @@ func isLikelyRAIDController(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
c := strings.ToLower(*dev.DeviceClass)
|
||||
return strings.Contains(c, "raid") ||
|
||||
strings.Contains(c, "sas") ||
|
||||
strings.Contains(c, "mass storage") ||
|
||||
strings.Contains(c, "serial attached scsi")
|
||||
return isRAIDClass(*dev.DeviceClass)
|
||||
}
|
||||
|
||||
func collectStorcliDrives() []schema.HardwareStorage {
|
||||
@@ -182,7 +178,10 @@ func parseSASIrcuDisplay(raw string) []schema.HardwareStorage {
|
||||
|
||||
present := true
|
||||
status := mapRAIDDriveStatus(b["State"])
|
||||
s := schema.HardwareStorage{Present: &present, Status: &status}
|
||||
s := schema.HardwareStorage{
|
||||
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||
Present: &present,
|
||||
}
|
||||
|
||||
enclosure := strings.TrimSpace(b["Enclosure #"])
|
||||
slot := strings.TrimSpace(b["Slot #"])
|
||||
@@ -281,7 +280,10 @@ func parseArcconfPhysicalDrives(raw string) []schema.HardwareStorage {
|
||||
for _, b := range blocks {
|
||||
present := true
|
||||
status := mapRAIDDriveStatus(b["State"])
|
||||
s := schema.HardwareStorage{Present: &present, Status: &status}
|
||||
s := schema.HardwareStorage{
|
||||
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||
Present: &present,
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(b["Reported Location"]); v != "" {
|
||||
s.Slot = &v
|
||||
@@ -362,8 +364,11 @@ func parseSSACLIPhysicalDrives(raw string) []schema.HardwareStorage {
|
||||
if m := ssacliPhysicalDriveLine.FindStringSubmatch(trimmed); len(m) == 3 {
|
||||
flush()
|
||||
present := true
|
||||
status := "UNKNOWN"
|
||||
s := schema.HardwareStorage{Present: &present, Status: &status}
|
||||
status := statusUnknown
|
||||
s := schema.HardwareStorage{
|
||||
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||
Present: &present,
|
||||
}
|
||||
slot := m[1]
|
||||
s.Slot = &slot
|
||||
|
||||
@@ -475,8 +480,8 @@ func storcliDriveToStorage(d struct {
|
||||
present := true
|
||||
status := mapRAIDDriveStatus(d.State)
|
||||
s := schema.HardwareStorage{
|
||||
Present: &present,
|
||||
Status: &status,
|
||||
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||
Present: &present,
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(d.EIDSlt); v != "" {
|
||||
@@ -527,15 +532,15 @@ func mapRAIDDriveStatus(raw string) string {
|
||||
u := strings.ToUpper(strings.TrimSpace(raw))
|
||||
switch {
|
||||
case strings.Contains(u, "OK"), strings.Contains(u, "OPTIMAL"), strings.Contains(u, "READY"):
|
||||
return "OK"
|
||||
return statusOK
|
||||
case strings.Contains(u, "ONLN"), strings.Contains(u, "ONLINE"):
|
||||
return "OK"
|
||||
return statusOK
|
||||
case strings.Contains(u, "RBLD"), strings.Contains(u, "REBUILD"):
|
||||
return "WARNING"
|
||||
return statusWarning
|
||||
case strings.Contains(u, "FAIL"), strings.Contains(u, "OFFLINE"):
|
||||
return "CRITICAL"
|
||||
return statusCritical
|
||||
default:
|
||||
return "UNKNOWN"
|
||||
return statusUnknown
|
||||
}
|
||||
}
|
||||
|
||||
@@ -641,8 +646,9 @@ func enrichStorageWithVROC(storage []schema.HardwareStorage, pcie []schema.Hardw
|
||||
storage[i].Telemetry["vroc_array"] = arr.Name
|
||||
storage[i].Telemetry["vroc_degraded"] = arr.Degraded
|
||||
if arr.Degraded {
|
||||
status := "WARNING"
|
||||
status := statusWarning
|
||||
storage[i].Status = &status
|
||||
storage[i].ErrorDescription = stringPtr("VROC array is degraded")
|
||||
}
|
||||
updated++
|
||||
}
|
||||
@@ -659,14 +665,14 @@ func hasVROCController(pcie []schema.HardwarePCIeDevice) bool {
|
||||
|
||||
class := ""
|
||||
if dev.DeviceClass != nil {
|
||||
class = strings.ToLower(*dev.DeviceClass)
|
||||
class = strings.TrimSpace(*dev.DeviceClass)
|
||||
}
|
||||
model := ""
|
||||
if dev.Model != nil {
|
||||
model = strings.ToLower(*dev.Model)
|
||||
}
|
||||
|
||||
if strings.Contains(class, "raid") ||
|
||||
if isRAIDClass(class) ||
|
||||
strings.Contains(model, "vroc") ||
|
||||
strings.Contains(model, "volume management device") ||
|
||||
strings.Contains(model, "vmd") {
|
||||
|
||||
334
audit/internal/collector/raid_controller_telemetry.go
Normal file
334
audit/internal/collector/raid_controller_telemetry.go
Normal file
@@ -0,0 +1,334 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// raidControllerTelemetry holds the battery / cache-protection readings
// extracted from a RAID vendor CLI for a single controller. Every field is a
// pointer so that "not reported" (nil) can be told apart from a zero reading.
type raidControllerTelemetry struct {
	// Percent readings, parsed with parsePercentPtr.
	BatteryChargePct *float64
	BatteryHealthPct *float64

	// Numeric readings, parsed with parseFloatPtr. The field names suggest
	// degrees Celsius and volts — TODO confirm against the ingest schema.
	BatteryTemperatureC *float64
	BatteryVoltageV     *float64

	// BatteryReplaceRequired is the result of parseReplaceRequired.
	BatteryReplaceRequired *bool

	// ErrorDescription carries the raw tool value when
	// batteryStateDescription considers it a problem state.
	ErrorDescription *string
}
|
||||
|
||||
func enrichPCIeWithRAIDTelemetry(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
byVendor := collectRAIDControllerTelemetry()
|
||||
if len(byVendor) == 0 {
|
||||
return devs
|
||||
}
|
||||
|
||||
positions := map[int]int{}
|
||||
for i := range devs {
|
||||
if devs[i].VendorID == nil || !isLikelyRAIDController(devs[i]) {
|
||||
continue
|
||||
}
|
||||
vendor := *devs[i].VendorID
|
||||
list := byVendor[vendor]
|
||||
if len(list) == 0 {
|
||||
continue
|
||||
}
|
||||
index := positions[vendor]
|
||||
if index >= len(list) {
|
||||
continue
|
||||
}
|
||||
positions[vendor] = index + 1
|
||||
applyRAIDControllerTelemetry(&devs[i], list[index])
|
||||
}
|
||||
|
||||
return devs
|
||||
}
|
||||
|
||||
func applyRAIDControllerTelemetry(dev *schema.HardwarePCIeDevice, tel raidControllerTelemetry) {
|
||||
if tel.BatteryChargePct != nil {
|
||||
dev.BatteryChargePct = tel.BatteryChargePct
|
||||
}
|
||||
if tel.BatteryHealthPct != nil {
|
||||
dev.BatteryHealthPct = tel.BatteryHealthPct
|
||||
}
|
||||
if tel.BatteryTemperatureC != nil {
|
||||
dev.BatteryTemperatureC = tel.BatteryTemperatureC
|
||||
}
|
||||
if tel.BatteryVoltageV != nil {
|
||||
dev.BatteryVoltageV = tel.BatteryVoltageV
|
||||
}
|
||||
if tel.BatteryReplaceRequired != nil {
|
||||
dev.BatteryReplaceRequired = tel.BatteryReplaceRequired
|
||||
}
|
||||
if tel.ErrorDescription != nil {
|
||||
dev.ErrorDescription = tel.ErrorDescription
|
||||
if dev.Status == nil || *dev.Status == statusOK {
|
||||
status := statusWarning
|
||||
dev.Status = &status
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func collectRAIDControllerTelemetry() map[int][]raidControllerTelemetry {
|
||||
out := map[int][]raidControllerTelemetry{}
|
||||
|
||||
if raw, err := raidToolQuery("storcli64", "/call", "show", "all", "J"); err == nil {
|
||||
list := parseStorcliControllerTelemetry(raw)
|
||||
if len(list) > 0 {
|
||||
out[vendorBroadcomLSI] = append(out[vendorBroadcomLSI], list...)
|
||||
slog.Info("raid: storcli controller telemetry", "count", len(list))
|
||||
}
|
||||
}
|
||||
|
||||
if raw, err := raidToolQuery("ssacli", "ctrl", "all", "show", "config", "detail"); err == nil {
|
||||
list := parseSSACLIControllerTelemetry(string(raw))
|
||||
if len(list) > 0 {
|
||||
out[vendorHPE] = append(out[vendorHPE], list...)
|
||||
slog.Info("raid: ssacli controller telemetry", "count", len(list))
|
||||
}
|
||||
}
|
||||
|
||||
if raw, err := raidToolQuery("arcconf", "getconfig", "1", "ad"); err == nil {
|
||||
list := parseArcconfControllerTelemetry(string(raw))
|
||||
if len(list) > 0 {
|
||||
out[vendorAdaptec] = append(out[vendorAdaptec], list...)
|
||||
slog.Info("raid: arcconf controller telemetry", "count", len(list))
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func parseStorcliControllerTelemetry(raw []byte) []raidControllerTelemetry {
|
||||
var doc struct {
|
||||
Controllers []struct {
|
||||
ResponseData map[string]any `json:"Response Data"`
|
||||
} `json:"Controllers"`
|
||||
}
|
||||
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||
slog.Warn("raid: parse storcli controller telemetry failed", "err", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
var out []raidControllerTelemetry
|
||||
for _, ctl := range doc.Controllers {
|
||||
tel := raidControllerTelemetry{}
|
||||
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["BBU_Info"]))
|
||||
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["BBU_Info_Details"]))
|
||||
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["CV_Info"]))
|
||||
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["CV_Info_Details"]))
|
||||
if hasRAIDControllerTelemetry(tel) {
|
||||
out = append(out, tel)
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func nestedStringMap(raw any) map[string]string {
|
||||
switch value := raw.(type) {
|
||||
case map[string]any:
|
||||
out := map[string]string{}
|
||||
flattenStringMap("", value, out)
|
||||
return out
|
||||
case []any:
|
||||
out := map[string]string{}
|
||||
for _, item := range value {
|
||||
if m, ok := item.(map[string]any); ok {
|
||||
flattenStringMap("", m, out)
|
||||
}
|
||||
}
|
||||
return out
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// flattenStringMap walks a decoded JSON object and writes every scalar leaf
// into out as a string. Keys are lower-cased and, for nested values, joined to
// the parent key with a single space. Nested objects and arrays of objects are
// descended into; other value types (for example null) are skipped.
func flattenStringMap(prefix string, in map[string]any, out map[string]string) {
	for name, child := range in {
		joined := strings.Trim(prefix+" "+name, " ")
		path := strings.TrimSpace(strings.ToLower(joined))

		switch typed := child.(type) {
		case string:
			out[path] = typed
		case bool:
			out[path] = strconv.FormatBool(typed)
		case float64:
			out[path] = strconv.FormatFloat(typed, 'f', -1, 64)
		case json.Number:
			out[path] = typed.String()
		case map[string]any:
			flattenStringMap(path, typed, out)
		case []any:
			for _, element := range typed {
				nested, ok := element.(map[string]any)
				if !ok {
					continue
				}
				flattenStringMap(path, nested, out)
			}
		}
	}
}
|
||||
|
||||
func mergeStorcliBatteryMap(tel *raidControllerTelemetry, fields map[string]string) {
|
||||
if len(fields) == 0 {
|
||||
return
|
||||
}
|
||||
for key, raw := range fields {
|
||||
lower := strings.ToLower(strings.TrimSpace(key))
|
||||
switch {
|
||||
case strings.Contains(lower, "relative state of charge"), strings.Contains(lower, "remaining capacity"), strings.Contains(lower, "charge"):
|
||||
if tel.BatteryChargePct == nil {
|
||||
tel.BatteryChargePct = parsePercentPtr(raw)
|
||||
}
|
||||
case strings.Contains(lower, "state of health"), strings.Contains(lower, "health"):
|
||||
if tel.BatteryHealthPct == nil {
|
||||
tel.BatteryHealthPct = parsePercentPtr(raw)
|
||||
}
|
||||
case strings.Contains(lower, "temperature"):
|
||||
if tel.BatteryTemperatureC == nil {
|
||||
tel.BatteryTemperatureC = parseFloatPtr(raw)
|
||||
}
|
||||
case strings.Contains(lower, "voltage"):
|
||||
if tel.BatteryVoltageV == nil {
|
||||
tel.BatteryVoltageV = parseFloatPtr(raw)
|
||||
}
|
||||
case strings.Contains(lower, "replace"), strings.Contains(lower, "replacement required"):
|
||||
if tel.BatteryReplaceRequired == nil {
|
||||
tel.BatteryReplaceRequired = parseReplaceRequired(raw)
|
||||
}
|
||||
case strings.Contains(lower, "learn cycle requested"), strings.Contains(lower, "battery state"), strings.Contains(lower, "capacitance state"):
|
||||
if desc := batteryStateDescription(raw); desc != nil && tel.ErrorDescription == nil {
|
||||
tel.ErrorDescription = desc
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func parseSSACLIControllerTelemetry(raw string) []raidControllerTelemetry {
|
||||
lines := strings.Split(raw, "\n")
|
||||
var out []raidControllerTelemetry
|
||||
var current *raidControllerTelemetry
|
||||
|
||||
flush := func() {
|
||||
if current != nil && hasRAIDControllerTelemetry(*current) {
|
||||
out = append(out, *current)
|
||||
}
|
||||
current = nil
|
||||
}
|
||||
|
||||
for _, line := range lines {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if trimmed == "" {
|
||||
continue
|
||||
}
|
||||
if strings.HasPrefix(strings.ToLower(trimmed), "smart array") || strings.HasPrefix(strings.ToLower(trimmed), "controller ") {
|
||||
flush()
|
||||
current = &raidControllerTelemetry{}
|
||||
continue
|
||||
}
|
||||
if current == nil {
|
||||
continue
|
||||
}
|
||||
if idx := strings.Index(trimmed, ":"); idx > 0 {
|
||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||
val := strings.TrimSpace(trimmed[idx+1:])
|
||||
switch {
|
||||
case strings.Contains(key, "capacitor temperature"), strings.Contains(key, "battery temperature"):
|
||||
current.BatteryTemperatureC = parseFloatPtr(val)
|
||||
case strings.Contains(key, "capacitor voltage"), strings.Contains(key, "battery voltage"):
|
||||
current.BatteryVoltageV = parseFloatPtr(val)
|
||||
case strings.Contains(key, "capacitor charge"), strings.Contains(key, "battery charge"):
|
||||
current.BatteryChargePct = parsePercentPtr(val)
|
||||
case strings.Contains(key, "capacitor health"), strings.Contains(key, "battery health"):
|
||||
current.BatteryHealthPct = parsePercentPtr(val)
|
||||
case strings.Contains(key, "replace") || strings.Contains(key, "failed"):
|
||||
if current.BatteryReplaceRequired == nil {
|
||||
current.BatteryReplaceRequired = parseReplaceRequired(val)
|
||||
}
|
||||
if desc := batteryStateDescription(val); desc != nil && current.ErrorDescription == nil {
|
||||
current.ErrorDescription = desc
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
flush()
|
||||
return out
|
||||
}
|
||||
|
||||
func parseArcconfControllerTelemetry(raw string) []raidControllerTelemetry {
|
||||
lines := strings.Split(raw, "\n")
|
||||
tel := raidControllerTelemetry{}
|
||||
for _, line := range lines {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if idx := strings.Index(trimmed, ":"); idx > 0 {
|
||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||
val := strings.TrimSpace(trimmed[idx+1:])
|
||||
switch {
|
||||
case strings.Contains(key, "battery temperature"), strings.Contains(key, "capacitor temperature"):
|
||||
tel.BatteryTemperatureC = parseFloatPtr(val)
|
||||
case strings.Contains(key, "battery voltage"), strings.Contains(key, "capacitor voltage"):
|
||||
tel.BatteryVoltageV = parseFloatPtr(val)
|
||||
case strings.Contains(key, "battery charge"), strings.Contains(key, "capacitor charge"):
|
||||
tel.BatteryChargePct = parsePercentPtr(val)
|
||||
case strings.Contains(key, "battery health"), strings.Contains(key, "capacitor health"):
|
||||
tel.BatteryHealthPct = parsePercentPtr(val)
|
||||
case strings.Contains(key, "replace"), strings.Contains(key, "failed"):
|
||||
if tel.BatteryReplaceRequired == nil {
|
||||
tel.BatteryReplaceRequired = parseReplaceRequired(val)
|
||||
}
|
||||
if desc := batteryStateDescription(val); desc != nil && tel.ErrorDescription == nil {
|
||||
tel.ErrorDescription = desc
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if hasRAIDControllerTelemetry(tel) {
|
||||
return []raidControllerTelemetry{tel}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func hasRAIDControllerTelemetry(tel raidControllerTelemetry) bool {
|
||||
return tel.BatteryChargePct != nil ||
|
||||
tel.BatteryHealthPct != nil ||
|
||||
tel.BatteryTemperatureC != nil ||
|
||||
tel.BatteryVoltageV != nil ||
|
||||
tel.BatteryReplaceRequired != nil ||
|
||||
tel.ErrorDescription != nil
|
||||
}
|
||||
|
||||
func parsePercentPtr(raw string) *float64 {
|
||||
raw = strings.ReplaceAll(strings.TrimSpace(raw), "%", "")
|
||||
return parseFloatPtr(raw)
|
||||
}
|
||||
|
||||
// parseReplaceRequired interprets a vendor-tool status value as a
// "battery/capacitor must be replaced" flag.
//
// It returns a pointer to true when the value reports a failure or a
// replacement request, a pointer to false when it reports a healthy or
// explicitly negated state, and nil when the value is empty or not recognized.
//
// Short tokens ("no", "ok", "yes", ...) are matched as whole words so that
// values such as "Unknown" or "Broken" are not misread as "no" / "ok", and a
// negation ("Not required", "No replacement needed") takes precedence over the
// keyword it negates.
func parseReplaceRequired(raw string) *bool {
	lower := strings.ToLower(strings.TrimSpace(raw))
	if lower == "" {
		return nil
	}

	// Split on anything that is not a lower-case ASCII letter to get words.
	words := map[string]bool{}
	for _, word := range strings.FieldsFunc(lower, func(r rune) bool { return r < 'a' || r > 'z' }) {
		words[word] = true
	}

	negated := words["no"] || words["not"] || words["none"]
	alarming := words["yes"] ||
		strings.Contains(lower, "replace") ||
		strings.Contains(lower, "failed") ||
		strings.Contains(lower, "required")
	healthy := words["ok"] || words["good"] || words["optimal"] || words["normal"]

	var value bool
	switch {
	case alarming:
		// "Not required" / "No replacement needed" negate the keyword.
		value = !negated
	case negated, healthy:
		value = false
	default:
		return nil
	}
	return &value
}
|
||||
|
||||
// batteryStateDescription returns a pointer to the unmodified raw value when
// it mentions a problem state ("failed", "fault", "replace", "warning" or
// "degraded", case-insensitive) and nil otherwise, including for blank input.
func batteryStateDescription(raw string) *string {
	normalized := strings.ToLower(strings.TrimSpace(raw))
	if normalized == "" {
		return nil
	}

	for _, marker := range [...]string{"failed", "fault", "replace", "warning", "degraded"} {
		if strings.Contains(normalized, marker) {
			return &raw
		}
	}
	return nil
}
|
||||
@@ -1,6 +1,10 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseSASIrcuControllerIDs(t *testing.T) {
|
||||
raw := `LSI Corporation SAS2 IR Configuration Utility.
|
||||
@@ -90,7 +94,111 @@ physicaldrive 1I:1:2 (894 GB, SAS HDD, Failed)
|
||||
if drives[0].Status == nil || *drives[0].Status != "OK" {
|
||||
t.Fatalf("drive0 status: %v", drives[0].Status)
|
||||
}
|
||||
if drives[1].Status == nil || *drives[1].Status != "CRITICAL" {
|
||||
if drives[1].Status == nil || *drives[1].Status != statusCritical {
|
||||
t.Fatalf("drive1 status: %v", drives[1].Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseStorcliControllerTelemetry(t *testing.T) {
|
||||
raw := []byte(`{
|
||||
"Controllers": [
|
||||
{
|
||||
"Response Data": {
|
||||
"BBU_Info": {
|
||||
"State of Health": "98 %",
|
||||
"Relative State of Charge": "76 %",
|
||||
"Temperature": "41 C",
|
||||
"Voltage": "12.3 V",
|
||||
"Replacement required": "No"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}`)
|
||||
got := parseStorcliControllerTelemetry(raw)
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("len(got)=%d want 1", len(got))
|
||||
}
|
||||
if got[0].BatteryHealthPct == nil || *got[0].BatteryHealthPct != 98 {
|
||||
t.Fatalf("battery health=%v", got[0].BatteryHealthPct)
|
||||
}
|
||||
if got[0].BatteryChargePct == nil || *got[0].BatteryChargePct != 76 {
|
||||
t.Fatalf("battery charge=%v", got[0].BatteryChargePct)
|
||||
}
|
||||
if got[0].BatteryTemperatureC == nil || *got[0].BatteryTemperatureC != 41 {
|
||||
t.Fatalf("battery temperature=%v", got[0].BatteryTemperatureC)
|
||||
}
|
||||
if got[0].BatteryVoltageV == nil || *got[0].BatteryVoltageV != 12.3 {
|
||||
t.Fatalf("battery voltage=%v", got[0].BatteryVoltageV)
|
||||
}
|
||||
if got[0].BatteryReplaceRequired == nil || *got[0].BatteryReplaceRequired {
|
||||
t.Fatalf("battery replace=%v", got[0].BatteryReplaceRequired)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSSACLIControllerTelemetry(t *testing.T) {
|
||||
raw := `Smart Array P440ar in Slot 0
|
||||
Battery/Capacitor Count: 1
|
||||
Capacitor Temperature (C): 37
|
||||
Capacitor Charge (%): 94
|
||||
Capacitor Health (%): 96
|
||||
Capacitor Voltage (V): 9.8
|
||||
Capacitor Failed: No
|
||||
`
|
||||
got := parseSSACLIControllerTelemetry(raw)
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("len(got)=%d want 1", len(got))
|
||||
}
|
||||
if got[0].BatteryTemperatureC == nil || *got[0].BatteryTemperatureC != 37 {
|
||||
t.Fatalf("battery temperature=%v", got[0].BatteryTemperatureC)
|
||||
}
|
||||
if got[0].BatteryChargePct == nil || *got[0].BatteryChargePct != 94 {
|
||||
t.Fatalf("battery charge=%v", got[0].BatteryChargePct)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithRAIDTelemetry(t *testing.T) {
|
||||
orig := raidToolQuery
|
||||
t.Cleanup(func() { raidToolQuery = orig })
|
||||
raidToolQuery = func(name string, args ...string) ([]byte, error) {
|
||||
switch name {
|
||||
case "storcli64":
|
||||
return []byte(`{
|
||||
"Controllers": [
|
||||
{
|
||||
"Response Data": {
|
||||
"CV_Info": {
|
||||
"State of Health": "99 %",
|
||||
"Relative State of Charge": "81 %",
|
||||
"Temperature": "38 C",
|
||||
"Voltage": "12.1 V",
|
||||
"Replacement required": "No"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}`), nil
|
||||
default:
|
||||
return nil, errors.New("skip")
|
||||
}
|
||||
}
|
||||
|
||||
vendor := vendorBroadcomLSI
|
||||
class := "MassStorageController"
|
||||
status := statusOK
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
VendorID: &vendor,
|
||||
DeviceClass: &class,
|
||||
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||
}}
|
||||
out := enrichPCIeWithRAIDTelemetry(devs)
|
||||
if out[0].BatteryHealthPct == nil || *out[0].BatteryHealthPct != 99 {
|
||||
t.Fatalf("battery health=%v", out[0].BatteryHealthPct)
|
||||
}
|
||||
if out[0].BatteryChargePct == nil || *out[0].BatteryChargePct != 81 {
|
||||
t.Fatalf("battery charge=%v", out[0].BatteryChargePct)
|
||||
}
|
||||
if out[0].BatteryVoltageV == nil || *out[0].BatteryVoltageV != 12.1 {
|
||||
t.Fatalf("battery voltage=%v", out[0].BatteryVoltageV)
|
||||
}
|
||||
}
|
||||
|
||||
373
audit/internal/collector/sensors.go
Normal file
373
audit/internal/collector/sensors.go
Normal file
@@ -0,0 +1,373 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// sensorsDoc is the decoded output of `sensors -j`: the outer key is the chip
// name and the inner map holds that chip's entries. buildSensorsFromDoc skips
// the "Adapter" entry and any entry that is not itself a JSON object.
type sensorsDoc map[string]map[string]any
|
||||
|
||||
func collectSensors() *schema.HardwareSensors {
|
||||
doc, err := readSensorsJSONDoc()
|
||||
if err != nil {
|
||||
slog.Info("sensors: unavailable, skipping", "err", err)
|
||||
return nil
|
||||
}
|
||||
sensors := buildSensorsFromDoc(doc)
|
||||
if sensors == nil || (len(sensors.Fans) == 0 && len(sensors.Power) == 0 && len(sensors.Temperatures) == 0 && len(sensors.Other) == 0) {
|
||||
return nil
|
||||
}
|
||||
slog.Info("sensors: collected",
|
||||
"fans", len(sensors.Fans),
|
||||
"power", len(sensors.Power),
|
||||
"temperatures", len(sensors.Temperatures),
|
||||
"other", len(sensors.Other),
|
||||
)
|
||||
return sensors
|
||||
}
|
||||
|
||||
func readSensorsJSONDoc() (sensorsDoc, error) {
|
||||
out, err := exec.Command("sensors", "-j").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var doc sensorsDoc
|
||||
if err := json.Unmarshal(out, &doc); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return doc, nil
|
||||
}
|
||||
|
||||
func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
||||
if len(doc) == 0 {
|
||||
return nil
|
||||
}
|
||||
result := &schema.HardwareSensors{}
|
||||
seen := map[string]struct{}{}
|
||||
|
||||
chips := make([]string, 0, len(doc))
|
||||
for chip := range doc {
|
||||
chips = append(chips, chip)
|
||||
}
|
||||
sort.Strings(chips)
|
||||
|
||||
for _, chip := range chips {
|
||||
features := doc[chip]
|
||||
location := sensorLocation(chip)
|
||||
|
||||
keys := make([]string, 0, len(features))
|
||||
for key := range features {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
|
||||
for _, key := range keys {
|
||||
if strings.EqualFold(key, "Adapter") {
|
||||
continue
|
||||
}
|
||||
feature, ok := features[key].(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(key)
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
switch classifySensorFeature(feature) {
|
||||
case "fan":
|
||||
item := buildFanSensor(name, location, feature)
|
||||
if item == nil || duplicateSensor(seen, "fan", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Fans = append(result.Fans, *item)
|
||||
case "temp":
|
||||
item := buildTempSensor(name, location, feature)
|
||||
if item == nil || duplicateSensor(seen, "temp", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Temperatures = append(result.Temperatures, *item)
|
||||
case "power":
|
||||
item := buildPowerSensor(name, location, feature)
|
||||
if item == nil || duplicateSensor(seen, "power", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Power = append(result.Power, *item)
|
||||
default:
|
||||
item := buildOtherSensor(name, location, feature)
|
||||
if item == nil || duplicateSensor(seen, "other", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Other = append(result.Other, *item)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
func parseSensorsJSON(raw []byte) (*schema.HardwareSensors, error) {
|
||||
var doc sensorsDoc
|
||||
err := json.Unmarshal(raw, &doc)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return buildSensorsFromDoc(doc), nil
|
||||
}
|
||||
|
||||
// duplicateSensor reports whether the (sensorType, name) pair is already in
// seen. A pair that is not yet present is recorded, so the first call for a
// pair returns false and every later call returns true.
func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
	// NUL separates the two parts so that ("ab", "c") and ("a", "bc") differ.
	id := sensorType + "\x00" + name
	_, known := seen[id]
	if !known {
		seen[id] = struct{}{}
	}
	return known
}
|
||||
|
||||
// sensorLocation returns a pointer to the whitespace-trimmed chip name, or nil
// when the name is blank.
func sensorLocation(chip string) *string {
	if trimmed := strings.TrimSpace(chip); trimmed != "" {
		return &trimmed
	}
	return nil
}
|
||||
|
||||
// classifySensorFeature decides the category of a feature from its attribute
// names and returns "fan", "temp", "power" or "other".
//
// The first attribute that matches any rule decides. Rules, in order of
// precedence for a single attribute: "fan"+"_input", "temp"+"_input",
// "power"+("_input" or "_average"), "curr"+"_input", and an "in" prefix with
// "_input" (the last three all mean "power").
func classifySensorFeature(feature map[string]any) string {
	for attr := range feature {
		isInput := strings.HasSuffix(attr, "_input")

		if isInput && strings.Contains(attr, "fan") {
			return "fan"
		}
		if isInput && strings.Contains(attr, "temp") {
			return "temp"
		}
		if strings.Contains(attr, "power") && (isInput || strings.HasSuffix(attr, "_average")) {
			return "power"
		}
		// Current and voltage ("inN") readings are reported as power sensors.
		if isInput && (strings.Contains(attr, "curr") || strings.HasPrefix(attr, "in")) {
			return "power"
		}
	}
	return "other"
}
|
||||
|
||||
func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
|
||||
rpm, ok := firstFeatureInt(feature, "_input")
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &rpm}
|
||||
if status := sensorStatusFromFeature(feature); status != nil {
|
||||
item.Status = status
|
||||
}
|
||||
return item
|
||||
}
|
||||
|
||||
func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||
celsius, ok := firstFeatureFloat(feature, "_input")
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &celsius}
|
||||
if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
|
||||
item.ThresholdWarningCelsius = &warning
|
||||
}
|
||||
if critical, ok := firstFeatureFloatWithSuffixes(feature, []string{"_crit", "_emergency"}); ok {
|
||||
item.ThresholdCriticalCelsius = &critical
|
||||
}
|
||||
if status := sensorStatusFromFeature(feature); status != nil {
|
||||
item.Status = status
|
||||
} else {
|
||||
item.Status = deriveTemperatureStatus(item.Celsius, item.ThresholdWarningCelsius, item.ThresholdCriticalCelsius)
|
||||
}
|
||||
return item
|
||||
}
|
||||
|
||||
func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||
item := &schema.HardwarePowerSensor{Name: name, Location: location}
|
||||
if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
|
||||
item.PowerW = &v
|
||||
}
|
||||
if v, ok := firstFeatureFloatWithPrefix(feature, "curr"); ok {
|
||||
item.CurrentA = &v
|
||||
}
|
||||
if v, ok := firstFeatureFloatWithPrefix(feature, "in"); ok {
|
||||
item.VoltageV = &v
|
||||
}
|
||||
if item.PowerW == nil && item.CurrentA == nil && item.VoltageV == nil {
|
||||
return nil
|
||||
}
|
||||
if status := sensorStatusFromFeature(feature); status != nil {
|
||||
item.Status = status
|
||||
}
|
||||
return item
|
||||
}
|
||||
|
||||
func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||
value, unit, ok := firstGenericSensorValue(feature)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &value}
|
||||
if unit != "" {
|
||||
item.Unit = &unit
|
||||
}
|
||||
if status := sensorStatusFromFeature(feature); status != nil {
|
||||
item.Status = status
|
||||
}
|
||||
return item
|
||||
}
|
||||
|
||||
func sensorStatusFromFeature(feature map[string]any) *string {
|
||||
for key, raw := range feature {
|
||||
if !strings.HasSuffix(key, "_alarm") {
|
||||
continue
|
||||
}
|
||||
if number, ok := floatFromAny(raw); ok && number > 0 {
|
||||
status := statusWarning
|
||||
return &status
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func deriveTemperatureStatus(current, warning, critical *float64) *string {
|
||||
if current == nil {
|
||||
return nil
|
||||
}
|
||||
switch {
|
||||
case critical != nil && *current >= *critical:
|
||||
status := statusCritical
|
||||
return &status
|
||||
case warning != nil && *current >= *warning:
|
||||
status := statusWarning
|
||||
return &status
|
||||
default:
|
||||
status := statusOK
|
||||
return &status
|
||||
}
|
||||
}
|
||||
|
||||
func firstFeatureInt(feature map[string]any, suffix string) (int, bool) {
|
||||
for key, raw := range feature {
|
||||
if strings.HasSuffix(key, suffix) {
|
||||
if value, ok := floatFromAny(raw); ok {
|
||||
return int(value), true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func firstFeatureFloat(feature map[string]any, suffix string) (float64, bool) {
|
||||
return firstFeatureFloatWithSuffixes(feature, []string{suffix})
|
||||
}
|
||||
|
||||
func firstFeatureFloatWithSuffixes(feature map[string]any, suffixes []string) (float64, bool) {
|
||||
keys := sortedFeatureKeys(feature)
|
||||
for _, key := range keys {
|
||||
for _, suffix := range suffixes {
|
||||
if strings.HasSuffix(key, suffix) {
|
||||
if value, ok := floatFromAny(feature[key]); ok {
|
||||
return value, true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func firstFeatureFloatWithContains(feature map[string]any, parts []string) (float64, bool) {
|
||||
keys := sortedFeatureKeys(feature)
|
||||
for _, key := range keys {
|
||||
matched := true
|
||||
for _, part := range parts {
|
||||
if !strings.Contains(key, part) {
|
||||
matched = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if matched {
|
||||
if value, ok := floatFromAny(feature[key]); ok {
|
||||
return value, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func firstFeatureFloatWithPrefix(feature map[string]any, prefix string) (float64, bool) {
|
||||
keys := sortedFeatureKeys(feature)
|
||||
for _, key := range keys {
|
||||
if strings.HasPrefix(key, prefix) && strings.HasSuffix(key, "_input") {
|
||||
if value, ok := floatFromAny(feature[key]); ok {
|
||||
return value, true
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func firstGenericSensorValue(feature map[string]any) (float64, string, bool) {
|
||||
keys := sortedFeatureKeys(feature)
|
||||
for _, key := range keys {
|
||||
if strings.HasSuffix(key, "_alarm") {
|
||||
continue
|
||||
}
|
||||
value, ok := floatFromAny(feature[key])
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
unit := inferSensorUnit(key)
|
||||
return value, unit, true
|
||||
}
|
||||
return 0, "", false
|
||||
}
|
||||
|
||||
// inferSensorUnit guesses the unit of a generic sensor from its attribute
// name. Only humidity readings get a unit ("%"); every other name, including
// intrusion sensors, yields the empty string.
func inferSensorUnit(key string) string {
	if strings.Contains(key, "humidity") {
		return "%"
	}
	return ""
}
|
||||
|
||||
// sortedFeatureKeys returns the attribute names of feature in ascending
// order. The result is always non-nil, even for an empty or nil map.
func sortedFeatureKeys(feature map[string]any) []string {
	names := make([]string, len(feature))
	next := 0
	for name := range feature {
		names[next] = name
		next++
	}
	sort.Strings(names)
	return names
}
|
||||
|
||||
// floatFromAny converts a decoded JSON value to float64. It accepts float64,
// float32, int, int64, json.Number and strings that strconv.ParseFloat can
// parse. Any other type, an empty string or an unparseable value yields
// (0, false).
func floatFromAny(raw any) (float64, bool) {
	switch typed := raw.(type) {
	case float64:
		return typed, true
	case float32:
		return float64(typed), true
	case int:
		return float64(typed), true
	case int64:
		return float64(typed), true
	case json.Number:
		parsed, err := typed.Float64()
		if err != nil {
			return 0, false
		}
		return parsed, true
	case string:
		if typed == "" {
			return 0, false
		}
		parsed, err := strconv.ParseFloat(typed, 64)
		if err != nil {
			return 0, false
		}
		return parsed, true
	default:
		return 0, false
	}
}
|
||||
54
audit/internal/collector/sensors_test.go
Normal file
54
audit/internal/collector/sensors_test.go
Normal file
@@ -0,0 +1,54 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseSensorsJSON(t *testing.T) {
|
||||
raw := []byte(`{
|
||||
"coretemp-isa-0000": {
|
||||
"Adapter": "ISA adapter",
|
||||
"Package id 0": {
|
||||
"temp1_input": 61.5,
|
||||
"temp1_max": 80.0,
|
||||
"temp1_crit": 95.0
|
||||
},
|
||||
"fan1": {
|
||||
"fan1_input": 4200
|
||||
}
|
||||
},
|
||||
"acpitz-acpi-0": {
|
||||
"Adapter": "ACPI interface",
|
||||
"in0": {
|
||||
"in0_input": 12.06
|
||||
},
|
||||
"curr1": {
|
||||
"curr1_input": 0.64
|
||||
},
|
||||
"power1": {
|
||||
"power1_average": 137.0
|
||||
},
|
||||
"humidity1": {
|
||||
"humidity1_input": 38.5
|
||||
}
|
||||
}
|
||||
}`)
|
||||
|
||||
got, err := parseSensorsJSON(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parseSensorsJSON error: %v", err)
|
||||
}
|
||||
if got == nil {
|
||||
t.Fatal("expected sensors")
|
||||
}
|
||||
if len(got.Temperatures) != 1 || got.Temperatures[0].Celsius == nil || *got.Temperatures[0].Celsius != 61.5 {
|
||||
t.Fatalf("temperatures mismatch: %#v", got.Temperatures)
|
||||
}
|
||||
if len(got.Fans) != 1 || got.Fans[0].RPM == nil || *got.Fans[0].RPM != 4200 {
|
||||
t.Fatalf("fans mismatch: %#v", got.Fans)
|
||||
}
|
||||
if len(got.Power) != 3 {
|
||||
t.Fatalf("power sensors mismatch: %#v", got.Power)
|
||||
}
|
||||
if len(got.Other) != 1 || got.Other[0].Unit == nil || *got.Other[0].Unit != "%" {
|
||||
t.Fatalf("other sensors mismatch: %#v", got.Other)
|
||||
}
|
||||
}
|
||||
@@ -26,13 +26,13 @@ func collectStorage() []schema.HardwareStorage {
|
||||
|
||||
// lsblkDevice is a minimal lsblk JSON record.
|
||||
type lsblkDevice struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Size string `json:"size"`
|
||||
Serial string `json:"serial"`
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Size string `json:"size"`
|
||||
Serial string `json:"serial"`
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
}
|
||||
|
||||
type lsblkRoot struct {
|
||||
@@ -67,7 +67,10 @@ type smartctlInfo struct {
|
||||
SerialNumber string `json:"serial_number"`
|
||||
FirmwareVer string `json:"firmware_version"`
|
||||
RotationRate int `json:"rotation_rate"`
|
||||
SmartStatus struct {
|
||||
Temperature struct {
|
||||
Current int `json:"current"`
|
||||
} `json:"temperature"`
|
||||
SmartStatus struct {
|
||||
Passed bool `json:"passed"`
|
||||
} `json:"smart_status"`
|
||||
UserCapacity struct {
|
||||
@@ -75,9 +78,11 @@ type smartctlInfo struct {
|
||||
} `json:"user_capacity"`
|
||||
AtaSmartAttributes struct {
|
||||
Table []struct {
|
||||
ID int `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Raw struct{ Value int64 `json:"value"` } `json:"raw"`
|
||||
ID int `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Raw struct {
|
||||
Value int64 `json:"value"`
|
||||
} `json:"raw"`
|
||||
} `json:"table"`
|
||||
} `json:"ata_smart_attributes"`
|
||||
PowerOnTime struct {
|
||||
@@ -130,7 +135,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
return s
|
||||
}
|
||||
|
||||
var info smartctlInfo
|
||||
var info smartctlInfo
|
||||
if err := json.Unmarshal(out, &info); err == nil {
|
||||
if v := cleanDMIValue(info.ModelName); v != "" {
|
||||
s.Model = &v
|
||||
@@ -152,14 +157,19 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
} else if info.RotationRate > 0 {
|
||||
devType = "HDD"
|
||||
}
|
||||
s.Type = &devType
|
||||
|
||||
// telemetry
|
||||
tel := map[string]any{}
|
||||
if info.Temperature.Current > 0 {
|
||||
t := float64(info.Temperature.Current)
|
||||
s.TemperatureC = &t
|
||||
}
|
||||
if info.PowerOnTime.Hours > 0 {
|
||||
tel["power_on_hours"] = info.PowerOnTime.Hours
|
||||
v := int64(info.PowerOnTime.Hours)
|
||||
s.PowerOnHours = &v
|
||||
}
|
||||
if info.PowerCycleCount > 0 {
|
||||
tel["power_cycles"] = info.PowerCycleCount
|
||||
v := int64(info.PowerCycleCount)
|
||||
s.PowerCycles = &v
|
||||
}
|
||||
reallocated := int64(0)
|
||||
pending := int64(0)
|
||||
@@ -169,77 +179,79 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
switch attr.ID {
|
||||
case 5:
|
||||
reallocated = attr.Raw.Value
|
||||
tel["reallocated_sectors"] = attr.Raw.Value
|
||||
s.ReallocatedSectors = &reallocated
|
||||
case 177:
|
||||
tel["wear_leveling_pct"] = attr.Raw.Value
|
||||
value := float64(attr.Raw.Value)
|
||||
s.LifeUsedPct = &value
|
||||
case 231:
|
||||
lifeRemaining = attr.Raw.Value
|
||||
tel["life_remaining_pct"] = attr.Raw.Value
|
||||
value := float64(attr.Raw.Value)
|
||||
s.LifeRemainingPct = &value
|
||||
case 241:
|
||||
tel["total_lba_written"] = attr.Raw.Value
|
||||
value := attr.Raw.Value
|
||||
s.WrittenBytes = &value
|
||||
case 197:
|
||||
pending = attr.Raw.Value
|
||||
tel["current_pending_sectors"] = attr.Raw.Value
|
||||
s.CurrentPendingSectors = &pending
|
||||
case 198:
|
||||
uncorrectable = attr.Raw.Value
|
||||
tel["offline_uncorrectable"] = attr.Raw.Value
|
||||
s.OfflineUncorrectable = &uncorrectable
|
||||
}
|
||||
}
|
||||
if len(tel) > 0 {
|
||||
s.Telemetry = tel
|
||||
}
|
||||
|
||||
status := storageHealthStatus{
|
||||
overallPassed: info.SmartStatus.Passed,
|
||||
hasOverall: true,
|
||||
reallocatedSectors: reallocated,
|
||||
pendingSectors: pending,
|
||||
overallPassed: info.SmartStatus.Passed,
|
||||
hasOverall: true,
|
||||
reallocatedSectors: reallocated,
|
||||
pendingSectors: pending,
|
||||
offlineUncorrectable: uncorrectable,
|
||||
lifeRemainingPct: lifeRemaining,
|
||||
lifeRemainingPct: lifeRemaining,
|
||||
}
|
||||
setStorageHealthStatus(&s, status)
|
||||
return s
|
||||
}
|
||||
|
||||
s.Type = &devType
|
||||
status := "UNKNOWN"
|
||||
status := statusUnknown
|
||||
s.Status = &status
|
||||
return s
|
||||
}
|
||||
|
||||
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
type nvmeSmartLog struct {
	CriticalWarning  int   `json:"critical_warning"`
	PercentageUsed   int   `json:"percentage_used"`
	AvailableSpare   int   `json:"available_spare"`
	SpareThreshold   int   `json:"spare_thresh"`
	Temperature      int64 `json:"temperature"` // enrichWithNVMe subtracts 273 from this to obtain degrees Celsius
	PowerOnHours     int64 `json:"power_on_hours"`
	PowerCycles      int64 `json:"power_cycles"`
	UnsafeShutdowns  int64 `json:"unsafe_shutdowns"`
	DataUnitsRead    int64 `json:"data_units_read"`    // converted to bytes via nvmeDataUnitsToBytes
	DataUnitsWritten int64 `json:"data_units_written"` // converted to bytes via nvmeDataUnitsToBytes
	ControllerBusy   int64 `json:"controller_busy_time"`
	MediaErrors      int64 `json:"media_errors"`
	NumErrLogEntries int64 `json:"num_err_log_entries"`
}
|
||||
|
||||
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
type nvmeIDCtrl struct {
	ModelNumber   string `json:"mn"`
	SerialNumber  string `json:"sn"`
	FirmwareRev   string `json:"fr"`
	TotalCapacity int64  `json:"tnvmcap"`
}
|
||||
|
||||
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
present := true
|
||||
devType := "NVMe"
|
||||
iface := "NVMe"
|
||||
status := "OK"
|
||||
status := statusOK
|
||||
s := schema.HardwareStorage{
|
||||
Present: &present,
|
||||
Type: &devType,
|
||||
Interface: &iface,
|
||||
Status: &status,
|
||||
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||
Present: &present,
|
||||
Type: &devType,
|
||||
Interface: &iface,
|
||||
}
|
||||
|
||||
devPath := "/dev/" + dev.Name
|
||||
@@ -268,100 +280,123 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
||||
var log nvmeSmartLog
|
||||
if json.Unmarshal(out, &log) == nil {
|
||||
tel := map[string]any{}
|
||||
if log.CriticalWarning > 0 {
|
||||
tel["critical_warning"] = log.CriticalWarning
|
||||
}
|
||||
if log.PowerOnHours > 0 {
|
||||
tel["power_on_hours"] = log.PowerOnHours
|
||||
s.PowerOnHours = &log.PowerOnHours
|
||||
}
|
||||
if log.PowerCycles > 0 {
|
||||
tel["power_cycles"] = log.PowerCycles
|
||||
s.PowerCycles = &log.PowerCycles
|
||||
}
|
||||
if log.UnsafeShutdowns > 0 {
|
||||
tel["unsafe_shutdowns"] = log.UnsafeShutdowns
|
||||
s.UnsafeShutdowns = &log.UnsafeShutdowns
|
||||
}
|
||||
if log.PercentageUsed > 0 {
|
||||
tel["percentage_used"] = log.PercentageUsed
|
||||
v := float64(log.PercentageUsed)
|
||||
s.LifeUsedPct = &v
|
||||
remaining := 100 - v
|
||||
s.LifeRemainingPct = &remaining
|
||||
}
|
||||
if log.DataUnitsWritten > 0 {
|
||||
tel["data_units_written"] = log.DataUnitsWritten
|
||||
v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
|
||||
s.WrittenBytes = &v
|
||||
}
|
||||
if log.ControllerBusy > 0 {
|
||||
tel["controller_busy_time"] = log.ControllerBusy
|
||||
if log.DataUnitsRead > 0 {
|
||||
v := nvmeDataUnitsToBytes(log.DataUnitsRead)
|
||||
s.ReadBytes = &v
|
||||
}
|
||||
if log.AvailableSpare > 0 {
|
||||
tel["available_spare_pct"] = log.AvailableSpare
|
||||
}
|
||||
if log.SpareThreshold > 0 {
|
||||
tel["available_spare_threshold_pct"] = log.SpareThreshold
|
||||
v := float64(log.AvailableSpare)
|
||||
s.AvailableSparePct = &v
|
||||
}
|
||||
if log.MediaErrors > 0 {
|
||||
tel["media_errors"] = log.MediaErrors
|
||||
s.MediaErrors = &log.MediaErrors
|
||||
}
|
||||
if log.NumErrLogEntries > 0 {
|
||||
tel["error_log_entries"] = log.NumErrLogEntries
|
||||
s.ErrorLogEntries = &log.NumErrLogEntries
|
||||
}
|
||||
if len(tel) > 0 {
|
||||
s.Telemetry = tel
|
||||
if log.Temperature > 0 {
|
||||
v := float64(log.Temperature - 273)
|
||||
s.TemperatureC = &v
|
||||
}
|
||||
setStorageHealthStatus(&s, storageHealthStatus{
|
||||
criticalWarning: log.CriticalWarning,
|
||||
percentageUsed: int64(log.PercentageUsed),
|
||||
availableSpare: int64(log.AvailableSpare),
|
||||
spareThreshold: int64(log.SpareThreshold),
|
||||
unsafeShutdowns: log.UnsafeShutdowns,
|
||||
mediaErrors: log.MediaErrors,
|
||||
errorLogEntries: log.NumErrLogEntries,
|
||||
criticalWarning: log.CriticalWarning,
|
||||
percentageUsed: int64(log.PercentageUsed),
|
||||
availableSpare: int64(log.AvailableSpare),
|
||||
spareThreshold: int64(log.SpareThreshold),
|
||||
unsafeShutdowns: log.UnsafeShutdowns,
|
||||
mediaErrors: log.MediaErrors,
|
||||
errorLogEntries: log.NumErrLogEntries,
|
||||
})
|
||||
return s
|
||||
}
|
||||
}
|
||||
|
||||
status = "UNKNOWN"
|
||||
status = statusUnknown
|
||||
s.Status = &status
|
||||
return s
|
||||
}
|
||||
|
||||
// nvmeDataUnitsToBytes converts an NVMe SMART-log data-unit counter to bytes.
//
// Each data unit is counted as 512000 bytes. A non-positive counter yields 0.
// A counter so large that the product would not fit in an int64 is clamped to
// the largest int64 value instead of silently wrapping around to a bogus
// (possibly negative) byte total.
func nvmeDataUnitsToBytes(units int64) int64 {
	const (
		bytesPerUnit = 512000
		maxInt64     = 1<<63 - 1
	)
	if units <= 0 {
		return 0
	}
	// Go does not trap on signed overflow, so guard the multiplication.
	if units > maxInt64/bytesPerUnit {
		return maxInt64
	}
	return units * bytesPerUnit
}
|
||||
|
||||
// storageHealthStatus gathers the health indicators that
// setStorageHealthStatus evaluates to derive a drive's status.
//
// The smartctl path fills the overall self-assessment, sector counters and
// lifeRemainingPct; the NVMe path fills criticalWarning and the remaining
// counters. Fields a collector does not set keep their zero value.
type storageHealthStatus struct {
	hasOverall           bool // true when overallPassed carries a real SMART verdict
	overallPassed        bool
	reallocatedSectors   int64
	pendingSectors       int64
	offlineUncorrectable int64
	lifeRemainingPct     int64
	criticalWarning      int
	percentageUsed       int64
	availableSpare       int64
	spareThreshold       int64
	unsafeShutdowns      int64
	mediaErrors          int64
	errorLogEntries      int64
}
|
||||
|
||||
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
|
||||
status := "OK"
|
||||
status := statusOK
|
||||
var description *string
|
||||
switch {
|
||||
case health.hasOverall && !health.overallPassed:
|
||||
status = "FAILED"
|
||||
status = statusCritical
|
||||
description = stringPtr("SMART overall self-assessment failed")
|
||||
case health.criticalWarning > 0:
|
||||
status = "FAILED"
|
||||
status = statusCritical
|
||||
description = stringPtr("NVMe critical warning is set")
|
||||
case health.pendingSectors > 0 || health.offlineUncorrectable > 0:
|
||||
status = "FAILED"
|
||||
status = statusCritical
|
||||
description = stringPtr("Pending or offline uncorrectable sectors detected")
|
||||
case health.mediaErrors > 0:
|
||||
status = "WARNING"
|
||||
status = statusWarning
|
||||
description = stringPtr("Media errors reported")
|
||||
case health.reallocatedSectors > 0:
|
||||
status = "WARNING"
|
||||
status = statusWarning
|
||||
description = stringPtr("Reallocated sectors detected")
|
||||
case health.errorLogEntries > 0:
|
||||
status = "WARNING"
|
||||
status = statusWarning
|
||||
description = stringPtr("Device error log contains entries")
|
||||
case health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10:
|
||||
status = "WARNING"
|
||||
status = statusWarning
|
||||
description = stringPtr("Life remaining is low")
|
||||
case health.percentageUsed >= 95:
|
||||
status = "WARNING"
|
||||
status = statusWarning
|
||||
description = stringPtr("Drive wear level is high")
|
||||
case health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold:
|
||||
status = "WARNING"
|
||||
status = statusWarning
|
||||
description = stringPtr("Available spare is at or below threshold")
|
||||
case health.unsafeShutdowns > 100:
|
||||
status = "WARNING"
|
||||
status = statusWarning
|
||||
description = stringPtr("Unsafe shutdown count is high")
|
||||
}
|
||||
s.Status = &status
|
||||
s.ErrorDescription = description
|
||||
}
|
||||
|
||||
// stringPtr returns a pointer to a copy of value, for filling optional
// *string fields from a literal.
func stringPtr(value string) *string {
	return &value
}
|
||||
|
||||
@@ -17,37 +17,37 @@ func TestSetStorageHealthStatus(t *testing.T) {
|
||||
{
|
||||
name: "smart overall failed",
|
||||
health: storageHealthStatus{hasOverall: true, overallPassed: false},
|
||||
want: "FAILED",
|
||||
want: statusCritical,
|
||||
},
|
||||
{
|
||||
name: "nvme critical warning",
|
||||
health: storageHealthStatus{criticalWarning: 1},
|
||||
want: "FAILED",
|
||||
want: statusCritical,
|
||||
},
|
||||
{
|
||||
name: "pending sectors",
|
||||
health: storageHealthStatus{pendingSectors: 1},
|
||||
want: "FAILED",
|
||||
want: statusCritical,
|
||||
},
|
||||
{
|
||||
name: "media errors warning",
|
||||
health: storageHealthStatus{mediaErrors: 2},
|
||||
want: "WARNING",
|
||||
want: statusWarning,
|
||||
},
|
||||
{
|
||||
name: "reallocated warning",
|
||||
health: storageHealthStatus{reallocatedSectors: 1},
|
||||
want: "WARNING",
|
||||
want: statusWarning,
|
||||
},
|
||||
{
|
||||
name: "life remaining low",
|
||||
health: storageHealthStatus{lifeRemainingPct: 8},
|
||||
want: "WARNING",
|
||||
want: statusWarning,
|
||||
},
|
||||
{
|
||||
name: "healthy",
|
||||
health: storageHealthStatus{},
|
||||
want: "OK",
|
||||
want: statusOK,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -6,31 +6,31 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSummary {
|
||||
func BuildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSummary {
|
||||
summary := &schema.HardwareHealthSummary{
|
||||
Status: "OK",
|
||||
Status: statusOK,
|
||||
CollectedAt: time.Now().UTC().Format(time.RFC3339),
|
||||
}
|
||||
|
||||
for _, dimm := range snap.Memory {
|
||||
switch derefString(dimm.Status) {
|
||||
case "WARNING":
|
||||
case statusWarning:
|
||||
summary.MemoryWarn++
|
||||
summary.Warnings = append(summary.Warnings, formatMemorySummary(dimm))
|
||||
case "FAILED":
|
||||
case statusCritical:
|
||||
summary.MemoryFail++
|
||||
summary.Failures = append(summary.Failures, formatMemorySummary(dimm))
|
||||
case "EMPTY":
|
||||
case statusEmpty:
|
||||
summary.EmptyDIMMs++
|
||||
}
|
||||
}
|
||||
|
||||
for _, disk := range snap.Storage {
|
||||
switch derefString(disk.Status) {
|
||||
case "WARNING":
|
||||
case statusWarning:
|
||||
summary.StorageWarn++
|
||||
summary.Warnings = append(summary.Warnings, formatStorageSummary(disk))
|
||||
case "FAILED":
|
||||
case statusCritical:
|
||||
summary.StorageFail++
|
||||
summary.Failures = append(summary.Failures, formatStorageSummary(disk))
|
||||
}
|
||||
@@ -38,10 +38,10 @@ func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSumm
|
||||
|
||||
for _, dev := range snap.PCIeDevices {
|
||||
switch derefString(dev.Status) {
|
||||
case "WARNING":
|
||||
case statusWarning:
|
||||
summary.PCIeWarn++
|
||||
summary.Warnings = append(summary.Warnings, formatPCIeSummary(dev))
|
||||
case "FAILED":
|
||||
case statusCritical:
|
||||
summary.PCIeFail++
|
||||
summary.Failures = append(summary.Failures, formatPCIeSummary(dev))
|
||||
}
|
||||
@@ -52,19 +52,19 @@ func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSumm
|
||||
summary.MissingPSUs++
|
||||
}
|
||||
switch derefString(psu.Status) {
|
||||
case "WARNING":
|
||||
case statusWarning:
|
||||
summary.PSUWarn++
|
||||
summary.Warnings = append(summary.Warnings, formatPSUSummary(psu))
|
||||
case "FAILED":
|
||||
case statusCritical:
|
||||
summary.PSUFail++
|
||||
summary.Failures = append(summary.Failures, formatPSUSummary(psu))
|
||||
}
|
||||
}
|
||||
|
||||
if len(summary.Failures) > 0 || summary.StorageFail > 0 || summary.PCIeFail > 0 || summary.PSUFail > 0 || summary.MemoryFail > 0 {
|
||||
summary.Status = "FAILED"
|
||||
summary.Status = statusCritical
|
||||
} else if len(summary.Warnings) > 0 || summary.StorageWarn > 0 || summary.PCIeWarn > 0 || summary.PSUWarn > 0 || summary.MemoryWarn > 0 {
|
||||
summary.Status = "WARNING"
|
||||
summary.Status = statusWarning
|
||||
}
|
||||
|
||||
if len(summary.Warnings) == 0 {
|
||||
|
||||
@@ -31,7 +31,7 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
|
||||
func TestHasVROCController(t *testing.T) {
|
||||
intel := vendorIntel
|
||||
model := "Volume Management Device NVMe RAID Controller"
|
||||
class := "RAID bus controller"
|
||||
class := "MassStorageController"
|
||||
tests := []struct {
|
||||
name string
|
||||
pcie []schema.HardwarePCIeDevice
|
||||
|
||||
@@ -5,10 +5,10 @@ package schema
|
||||
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
||||
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
||||
type HardwareIngestRequest struct {
|
||||
Filename *string `json:"filename"`
|
||||
SourceType *string `json:"source_type"`
|
||||
Protocol *string `json:"protocol"`
|
||||
TargetHost string `json:"target_host"`
|
||||
Filename *string `json:"filename,omitempty"`
|
||||
SourceType *string `json:"source_type,omitempty"`
|
||||
Protocol *string `json:"protocol,omitempty"`
|
||||
TargetHost *string `json:"target_host,omitempty"`
|
||||
CollectedAt string `json:"collected_at"`
|
||||
Hardware HardwareSnapshot `json:"hardware"`
|
||||
}
|
||||
@@ -21,32 +21,32 @@ type HardwareSnapshot struct {
|
||||
Storage []HardwareStorage `json:"storage,omitempty"`
|
||||
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||
Summary *HardwareHealthSummary `json:"summary,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
}
|
||||
|
||||
// HardwareHealthSummary is the aggregated health view of a hardware snapshot.
//
// The *Warn / *Fail counters hold the number of components per category whose
// status is warning or critical, and Warnings / Failures hold one formatted
// line per such component. Status is always emitted; every other field is
// omitted from the JSON when it has its zero value.
type HardwareHealthSummary struct {
	Status      string   `json:"status"`
	Warnings    []string `json:"warnings,omitempty"`
	Failures    []string `json:"failures,omitempty"`
	StorageWarn int      `json:"storage_warn,omitempty"`
	StorageFail int      `json:"storage_fail,omitempty"`
	PCIeWarn    int      `json:"pcie_warn,omitempty"`
	PCIeFail    int      `json:"pcie_fail,omitempty"`
	PSUWarn     int      `json:"psu_warn,omitempty"`
	PSUFail     int      `json:"psu_fail,omitempty"`
	MemoryWarn  int      `json:"memory_warn,omitempty"`
	MemoryFail  int      `json:"memory_fail,omitempty"`
	EmptyDIMMs  int      `json:"empty_dimms,omitempty"`
	MissingPSUs int      `json:"missing_psus,omitempty"`
	CollectedAt string   `json:"collected_at,omitempty"`
}
|
||||
|
||||
// HardwareBoard carries the identity of the system board.
//
// SerialNumber is always emitted; the pointer fields are optional and are
// omitted from the JSON when nil.
type HardwareBoard struct {
	Manufacturer *string `json:"manufacturer,omitempty"`
	ProductName  *string `json:"product_name,omitempty"`
	SerialNumber string  `json:"serial_number"`
	PartNumber   *string `json:"part_number,omitempty"`
	UUID         *string `json:"uuid,omitempty"`
}
|
||||
|
||||
type HardwareFirmwareRecord struct {
|
||||
@@ -55,77 +55,183 @@ type HardwareFirmwareRecord struct {
|
||||
}
|
||||
|
||||
type HardwareCPU struct {
|
||||
Socket *int `json:"socket"`
|
||||
Model *string `json:"model"`
|
||||
Manufacturer *string `json:"manufacturer"`
|
||||
Status *string `json:"status"`
|
||||
SerialNumber *string `json:"serial_number"`
|
||||
Firmware *string `json:"firmware"`
|
||||
Cores *int `json:"cores"`
|
||||
Threads *int `json:"threads"`
|
||||
FrequencyMHz *int `json:"frequency_mhz"`
|
||||
MaxFrequencyMHz *int `json:"max_frequency_mhz"`
|
||||
HardwareComponentStatus
|
||||
Socket *int `json:"socket,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
Cores *int `json:"cores,omitempty"`
|
||||
Threads *int `json:"threads,omitempty"`
|
||||
FrequencyMHz *int `json:"frequency_mhz,omitempty"`
|
||||
MaxFrequencyMHz *int `json:"max_frequency_mhz,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
PowerW *float64 `json:"power_w,omitempty"`
|
||||
Throttled *bool `json:"throttled,omitempty"`
|
||||
CorrectableErrorCount *int64 `json:"correctable_error_count,omitempty"`
|
||||
UncorrectableErrorCount *int64 `json:"uncorrectable_error_count,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareMemory struct {
|
||||
Slot *string `json:"slot"`
|
||||
Location *string `json:"location"`
|
||||
Present *bool `json:"present"`
|
||||
SizeMB *int `json:"size_mb"`
|
||||
Type *string `json:"type"`
|
||||
MaxSpeedMHz *int `json:"max_speed_mhz"`
|
||||
CurrentSpeedMHz *int `json:"current_speed_mhz"`
|
||||
Manufacturer *string `json:"manufacturer"`
|
||||
SerialNumber *string `json:"serial_number"`
|
||||
PartNumber *string `json:"part_number"`
|
||||
Status *string `json:"status"`
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
SizeMB *int `json:"size_mb,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
MaxSpeedMHz *int `json:"max_speed_mhz,omitempty"`
|
||||
CurrentSpeedMHz *int `json:"current_speed_mhz,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
PartNumber *string `json:"part_number,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
CorrectableECCErrorCount *int64 `json:"correctable_ecc_error_count,omitempty"`
|
||||
UncorrectableECCErrorCount *int64 `json:"uncorrectable_ecc_error_count,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
SpareBlocksRemainingPct *float64 `json:"spare_blocks_remaining_pct,omitempty"`
|
||||
PerformanceDegraded *bool `json:"performance_degraded,omitempty"`
|
||||
DataLossDetected *bool `json:"data_loss_detected,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareStorage struct {
|
||||
Slot *string `json:"slot"`
|
||||
Type *string `json:"type"`
|
||||
Model *string `json:"model"`
|
||||
SizeGB *int `json:"size_gb"`
|
||||
SerialNumber *string `json:"serial_number"`
|
||||
Manufacturer *string `json:"manufacturer"`
|
||||
Firmware *string `json:"firmware"`
|
||||
Interface *string `json:"interface"`
|
||||
Present *bool `json:"present"`
|
||||
Status *string `json:"status"`
|
||||
Telemetry map[string]any `json:"telemetry,omitempty"`
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
SizeGB *int `json:"size_gb,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
Interface *string `json:"interface,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
}
|
||||
|
||||
type HardwarePCIeDevice struct {
|
||||
Slot *string `json:"slot"`
|
||||
VendorID *int `json:"vendor_id"`
|
||||
DeviceID *int `json:"device_id"`
|
||||
BDF *string `json:"bdf"`
|
||||
DeviceClass *string `json:"device_class"`
|
||||
Manufacturer *string `json:"manufacturer"`
|
||||
Model *string `json:"model"`
|
||||
LinkWidth *int `json:"link_width"`
|
||||
LinkSpeed *string `json:"link_speed"`
|
||||
MaxLinkWidth *int `json:"max_link_width"`
|
||||
MaxLinkSpeed *string `json:"max_link_speed"`
|
||||
SerialNumber *string `json:"serial_number"`
|
||||
Firmware *string `json:"firmware"`
|
||||
Present *bool `json:"present"`
|
||||
Status *string `json:"status"`
|
||||
Telemetry map[string]any `json:"telemetry,omitempty"`
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
VendorID *int `json:"vendor_id,omitempty"`
|
||||
DeviceID *int `json:"device_id,omitempty"`
|
||||
NUMANode *int `json:"numa_node,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
PowerW *float64 `json:"power_w,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
ECCCorrectedTotal *int64 `json:"ecc_corrected_total,omitempty"`
|
||||
ECCUncorrectedTotal *int64 `json:"ecc_uncorrected_total,omitempty"`
|
||||
HWSlowdown *bool `json:"hw_slowdown,omitempty"`
|
||||
BatteryChargePct *float64 `json:"battery_charge_pct,omitempty"`
|
||||
BatteryHealthPct *float64 `json:"battery_health_pct,omitempty"`
|
||||
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||
SFPVoltageV *float64 `json:"sfp_voltage_v,omitempty"`
|
||||
SFPBiasMA *float64 `json:"sfp_bias_ma,omitempty"`
|
||||
BDF *string `json:"bdf,omitempty"`
|
||||
DeviceClass *string `json:"device_class,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
LinkWidth *int `json:"link_width,omitempty"`
|
||||
LinkSpeed *string `json:"link_speed,omitempty"`
|
||||
MaxLinkWidth *int `json:"max_link_width,omitempty"`
|
||||
MaxLinkSpeed *string `json:"max_link_speed,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
MacAddresses []string `json:"mac_addresses,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
}
|
||||
|
||||
type HardwarePowerSupply struct {
|
||||
Slot *string `json:"slot"`
|
||||
Present *bool `json:"present"`
|
||||
Model *string `json:"model"`
|
||||
Vendor *string `json:"vendor"`
|
||||
WattageW *int `json:"wattage_w"`
|
||||
SerialNumber *string `json:"serial_number"`
|
||||
PartNumber *string `json:"part_number"`
|
||||
Firmware *string `json:"firmware"`
|
||||
Status *string `json:"status"`
|
||||
InputType *string `json:"input_type"`
|
||||
InputPowerW *float64 `json:"input_power_w"`
|
||||
OutputPowerW *float64 `json:"output_power_w"`
|
||||
InputVoltage *float64 `json:"input_voltage"`
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
Vendor *string `json:"vendor,omitempty"`
|
||||
WattageW *int `json:"wattage_w,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
PartNumber *string `json:"part_number,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
InputType *string `json:"input_type,omitempty"`
|
||||
InputPowerW *float64 `json:"input_power_w,omitempty"`
|
||||
OutputPowerW *float64 `json:"output_power_w,omitempty"`
|
||||
InputVoltage *float64 `json:"input_voltage,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareComponentStatus struct {
|
||||
Status *string `json:"status,omitempty"`
|
||||
StatusCheckedAt *string `json:"status_checked_at,omitempty"`
|
||||
StatusChangedAt *string `json:"status_changed_at,omitempty"`
|
||||
StatusHistory []HardwareStatusHistory `json:"status_history,omitempty"`
|
||||
ErrorDescription *string `json:"error_description,omitempty"`
|
||||
}
|
||||
|
||||
// HardwareStatusHistory is one entry in a component's status history.
//
// Status and ChangedAt are always emitted; Details is optional.
type HardwareStatusHistory struct {
	Status    string  `json:"status"`
	ChangedAt string  `json:"changed_at"`
	Details   *string `json:"details,omitempty"`
}
|
||||
|
||||
type HardwareSensors struct {
|
||||
Fans []HardwareFanSensor `json:"fans,omitempty"`
|
||||
Power []HardwarePowerSensor `json:"power,omitempty"`
|
||||
Temperatures []HardwareTemperatureSensor `json:"temperatures,omitempty"`
|
||||
Other []HardwareOtherSensor `json:"other,omitempty"`
|
||||
}
|
||||
|
||||
// HardwareFanSensor is a single fan reading. Name is always emitted; the
// remaining fields are optional and omitted from the JSON when nil.
type HardwareFanSensor struct {
	Name     string  `json:"name"`
	Location *string `json:"location,omitempty"`
	RPM      *int    `json:"rpm,omitempty"`
	Status   *string `json:"status,omitempty"`
}
|
||||
|
||||
// HardwarePowerSensor is a single electrical reading. Name is always emitted;
// voltage, current and power are independent optional values, omitted from the
// JSON when nil.
type HardwarePowerSensor struct {
	Name     string   `json:"name"`
	Location *string  `json:"location,omitempty"`
	VoltageV *float64 `json:"voltage_v,omitempty"`
	CurrentA *float64 `json:"current_a,omitempty"`
	PowerW   *float64 `json:"power_w,omitempty"`
	Status   *string  `json:"status,omitempty"`
}
|
||||
|
||||
// HardwareTemperatureSensor is a single temperature reading with optional
// warning and critical thresholds. Name is always emitted; the remaining
// fields are omitted from the JSON when nil.
type HardwareTemperatureSensor struct {
	Name                     string   `json:"name"`
	Location                 *string  `json:"location,omitempty"`
	Celsius                  *float64 `json:"celsius,omitempty"`
	ThresholdWarningCelsius  *float64 `json:"threshold_warning_celsius,omitempty"`
	ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
	Status                   *string  `json:"status,omitempty"`
}
|
||||
|
||||
// HardwareOtherSensor is a reading that fits none of the fan, power or
// temperature categories; Unit names the unit of Value (for example "%").
// Name is always emitted; the remaining fields are omitted from the JSON when nil.
type HardwareOtherSensor struct {
	Name     string   `json:"name"`
	Location *string  `json:"location,omitempty"`
	Value    *float64 `json:"value,omitempty"`
	Unit     *string  `json:"unit,omitempty"`
	Status   *string  `json:"status,omitempty"`
}
|
||||
|
||||
Reference in New Issue
Block a user