Align hardware export with ingest contract
This commit is contained in:
@@ -317,38 +317,20 @@ func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, erro
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) HealthSummaryResult() ActionResult {
|
func (a *App) HealthSummaryResult() ActionResult {
|
||||||
type auditFile struct {
|
|
||||||
Hardware struct {
|
|
||||||
Summary struct {
|
|
||||||
Status string `json:"status"`
|
|
||||||
Warnings []string `json:"warnings"`
|
|
||||||
Failures []string `json:"failures"`
|
|
||||||
StorageWarn int `json:"storage_warn"`
|
|
||||||
StorageFail int `json:"storage_fail"`
|
|
||||||
PCIeWarn int `json:"pcie_warn"`
|
|
||||||
PCIeFail int `json:"pcie_fail"`
|
|
||||||
PSUWarn int `json:"psu_warn"`
|
|
||||||
PSUFail int `json:"psu_fail"`
|
|
||||||
MemoryWarn int `json:"memory_warn"`
|
|
||||||
MemoryFail int `json:"memory_fail"`
|
|
||||||
} `json:"summary"`
|
|
||||||
} `json:"hardware"`
|
|
||||||
}
|
|
||||||
|
|
||||||
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
raw, err := os.ReadFile(DefaultAuditJSONPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return ActionResult{Title: "Health summary", Body: "No audit JSON found."}
|
return ActionResult{Title: "Health summary", Body: "No audit JSON found."}
|
||||||
}
|
}
|
||||||
var snapshot auditFile
|
var snapshot schema.HardwareIngestRequest
|
||||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||||
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
||||||
}
|
}
|
||||||
|
|
||||||
summary := snapshot.Hardware.Summary
|
summary := collector.BuildHealthSummary(snapshot.Hardware)
|
||||||
var body strings.Builder
|
var body strings.Builder
|
||||||
status := summary.Status
|
status := summary.Status
|
||||||
if status == "" {
|
if status == "" {
|
||||||
status = "UNKNOWN"
|
status = "Unknown"
|
||||||
}
|
}
|
||||||
fmt.Fprintf(&body, "Overall: %s\n", status)
|
fmt.Fprintf(&body, "Overall: %s\n", status)
|
||||||
fmt.Fprintf(&body, "Storage: warn=%d fail=%d\n", summary.StorageWarn, summary.StorageFail)
|
fmt.Fprintf(&body, "Storage: warn=%d fail=%d\n", summary.StorageWarn, summary.StorageFail)
|
||||||
@@ -662,12 +644,12 @@ func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||||
class := strings.ToLower(trimPtr(dev.DeviceClass))
|
class := trimPtr(dev.DeviceClass)
|
||||||
model := strings.ToLower(trimPtr(dev.Model))
|
model := strings.ToLower(trimPtr(dev.Model))
|
||||||
vendor := strings.ToLower(trimPtr(dev.Manufacturer))
|
vendor := strings.ToLower(trimPtr(dev.Manufacturer))
|
||||||
return strings.Contains(class, "vga") ||
|
return class == "VideoController" ||
|
||||||
strings.Contains(class, "3d") ||
|
class == "DisplayController" ||
|
||||||
strings.Contains(class, "display") ||
|
class == "ProcessingAccelerator" ||
|
||||||
strings.Contains(model, "nvidia") ||
|
strings.Contains(model, "nvidia") ||
|
||||||
strings.Contains(vendor, "nvidia") ||
|
strings.Contains(vendor, "nvidia") ||
|
||||||
strings.Contains(vendor, "amd")
|
strings.Contains(vendor, "amd")
|
||||||
|
|||||||
@@ -371,8 +371,6 @@ func TestFormatSATSummary(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldAuditPath := DefaultAuditJSONPath
|
oldAuditPath := DefaultAuditJSONPath
|
||||||
oldSATBaseDir := DefaultSATBaseDir
|
oldSATBaseDir := DefaultSATBaseDir
|
||||||
@@ -386,7 +384,7 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
|||||||
t.Fatalf("mkdir sat dir: %v", err)
|
t.Fatalf("mkdir sat dir: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
raw := `{"hardware":{"summary":{"status":"WARNING","storage_warn":1,"storage_fail":0,"pcie_warn":0,"pcie_fail":0,"psu_warn":0,"psu_fail":0,"memory_warn":0,"memory_fail":0}}}`
|
raw := `{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"serial_number":"DISK1","status":"Warning"}]}}`
|
||||||
if err := os.WriteFile(DefaultAuditJSONPath, []byte(raw), 0644); err != nil {
|
if err := os.WriteFile(DefaultAuditJSONPath, []byte(raw), 0644); err != nil {
|
||||||
t.Fatalf("write audit json: %v", err)
|
t.Fatalf("write audit json: %v", err)
|
||||||
}
|
}
|
||||||
@@ -401,8 +399,6 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestMainBanner(t *testing.T) {
|
func TestMainBanner(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldAuditPath := DefaultAuditJSONPath
|
oldAuditPath := DefaultAuditJSONPath
|
||||||
DefaultAuditJSONPath = filepath.Join(tmp, "audit.json")
|
DefaultAuditJSONPath = filepath.Join(tmp, "audit.json")
|
||||||
@@ -413,7 +409,7 @@ func TestMainBanner(t *testing.T) {
|
|||||||
product := "PowerEdge R760"
|
product := "PowerEdge R760"
|
||||||
cpuModel := "Intel Xeon Gold 6430"
|
cpuModel := "Intel Xeon Gold 6430"
|
||||||
memoryType := "DDR5"
|
memoryType := "DDR5"
|
||||||
gpuClass := "VGA compatible controller"
|
gpuClass := "VideoController"
|
||||||
gpuModel := "NVIDIA H100"
|
gpuModel := "NVIDIA H100"
|
||||||
|
|
||||||
payload := schema.HardwareIngestRequest{
|
payload := schema.HardwareIngestRequest{
|
||||||
|
|||||||
@@ -7,13 +7,15 @@ import (
|
|||||||
"bee/audit/internal/runtimeenv"
|
"bee/audit/internal/runtimeenv"
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"os"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Run executes all collectors and returns the combined snapshot.
|
// Run executes all collectors and returns the combined snapshot.
|
||||||
// Partial failures are logged as warnings; collection always completes.
|
// Partial failures are logged as warnings; collection always completes.
|
||||||
func Run(runtimeMode runtimeenv.Mode) schema.HardwareIngestRequest {
|
func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||||
start := time.Now()
|
start := time.Now()
|
||||||
|
collectedAt := time.Now().UTC().Format(time.RFC3339)
|
||||||
slog.Info("audit started")
|
slog.Info("audit started")
|
||||||
|
|
||||||
snap := schema.HardwareSnapshot{}
|
snap := schema.HardwareSnapshot{}
|
||||||
@@ -27,27 +29,38 @@ func Run(runtimeMode runtimeenv.Mode) schema.HardwareIngestRequest {
|
|||||||
snap.Firmware = append(snap.Firmware, cpuFW...)
|
snap.Firmware = append(snap.Firmware, cpuFW...)
|
||||||
|
|
||||||
snap.Memory = collectMemory()
|
snap.Memory = collectMemory()
|
||||||
|
sensorDoc, err := readSensorsJSONDoc()
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("sensors: unavailable for enrichment", "err", err)
|
||||||
|
}
|
||||||
|
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
|
||||||
|
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||||
snap.Storage = collectStorage()
|
snap.Storage = collectStorage()
|
||||||
snap.PCIeDevices = collectPCIe()
|
snap.PCIeDevices = collectPCIe()
|
||||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices, snap.Board.SerialNumber)
|
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices, snap.Board.SerialNumber)
|
||||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||||
|
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||||
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
||||||
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
||||||
snap.PowerSupplies = collectPSUs()
|
snap.PowerSupplies = collectPSUs()
|
||||||
snap.Summary = buildHealthSummary(snap)
|
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||||
|
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||||
|
finalizeSnapshot(&snap, collectedAt)
|
||||||
|
|
||||||
// remaining collectors added in steps 1.8 – 1.10
|
// remaining collectors added in steps 1.8 – 1.10
|
||||||
|
|
||||||
slog.Info("audit completed", "duration", time.Since(start).Round(time.Millisecond))
|
slog.Info("audit completed", "duration", time.Since(start).Round(time.Millisecond))
|
||||||
|
|
||||||
sourceType := string(runtimeMode)
|
sourceType := "manual"
|
||||||
protocol := "os-direct"
|
var targetHost *string
|
||||||
|
if hostname, err := os.Hostname(); err == nil && hostname != "" {
|
||||||
|
targetHost = &hostname
|
||||||
|
}
|
||||||
return schema.HardwareIngestRequest{
|
return schema.HardwareIngestRequest{
|
||||||
SourceType: &sourceType,
|
SourceType: &sourceType,
|
||||||
Protocol: &protocol,
|
TargetHost: targetHost,
|
||||||
CollectedAt: time.Now().UTC().Format(time.RFC3339),
|
CollectedAt: collectedAt,
|
||||||
Hardware: snap,
|
Hardware: snap,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
64
audit/internal/collector/contract.go
Normal file
64
audit/internal/collector/contract.go
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import "strings"
|
||||||
|
|
||||||
|
// Component status strings shared by the collectors in this package.
const (
	// statusOK marks a component that reported a healthy state.
	statusOK = "OK"
	// statusWarning marks a component that is present but degraded or disabled.
	statusWarning = "Warning"
	// statusCritical marks a component in a failed state.
	statusCritical = "Critical"
	// statusUnknown is used when the raw state is missing or unrecognized.
	statusUnknown = "Unknown"
	// statusEmpty marks an unpopulated slot or socket.
	statusEmpty = "Empty"
)
|
||||||
|
|
||||||
|
// mapPCIeDeviceClass converts a raw PCI class description into one of the
// canonical class names used by this package ("EthernetController",
// "VideoController", ...).
//
// Matching is case-insensitive and ignores surrounding whitespace. Blank input
// yields "". A description that matches none of the known patterns is returned
// unchanged so that no information is lost.
func mapPCIeDeviceClass(raw string) string {
	normalized := strings.ToLower(strings.TrimSpace(raw))
	if normalized == "" {
		return ""
	}
	has := func(needle string) bool {
		return strings.Contains(normalized, needle)
	}

	switch {
	case has("ethernet controller"):
		return "EthernetController"
	case has("fibre channel"):
		return "FibreChannelController"
	case has("network controller"), has("infiniband controller"):
		return "NetworkController"
	// "mass storage controller" contains "storage controller", so the more
	// specific RAID / mass-storage patterns must be tested before the generic
	// storage-controller ones; otherwise this case can never match it.
	case has("raid"), has("mass storage"):
		return "MassStorageController"
	case has("serial attached scsi"), has("storage controller"):
		return "StorageController"
	case has("display controller"):
		return "DisplayController"
	case has("vga"), has("3d controller"), has("video controller"):
		return "VideoController"
	// Also covers the plural form "processing accelerators".
	case has("processing accelerator"):
		return "ProcessingAccelerator"
	default:
		return raw
	}
}
|
||||||
|
|
||||||
|
// isNICClass reports whether class, ignoring surrounding whitespace, is one of
// the canonical network-adapter class names.
func isNICClass(class string) bool {
	name := strings.TrimSpace(class)
	return name == "EthernetController" || name == "NetworkController"
}
|
||||||
|
|
||||||
|
// isGPUClass reports whether class, ignoring surrounding whitespace, is one of
// the canonical class names treated as a GPU or accelerator.
func isGPUClass(class string) bool {
	name := strings.TrimSpace(class)
	for _, gpuClass := range [...]string{"VideoController", "DisplayController", "ProcessingAccelerator"} {
		if name == gpuClass {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// isRAIDClass reports whether class, ignoring surrounding whitespace, is one of
// the canonical storage-controller class names.
func isRAIDClass(class string) bool {
	name := strings.TrimSpace(class)
	return name == "MassStorageController" || name == "StorageController"
}
|
||||||
@@ -51,12 +51,14 @@ func parseCPUs(output, boardSerial string) []schema.HardwareCPU {
|
|||||||
// Returns false if the socket is unpopulated.
|
// Returns false if the socket is unpopulated.
|
||||||
func parseCPUSection(fields map[string]string, boardSerial string) (schema.HardwareCPU, bool) {
|
func parseCPUSection(fields map[string]string, boardSerial string) (schema.HardwareCPU, bool) {
|
||||||
status := parseCPUStatus(fields["Status"])
|
status := parseCPUStatus(fields["Status"])
|
||||||
if status == "EMPTY" {
|
if status == statusEmpty {
|
||||||
return schema.HardwareCPU{}, false
|
return schema.HardwareCPU{}, false
|
||||||
}
|
}
|
||||||
|
|
||||||
cpu := schema.HardwareCPU{}
|
cpu := schema.HardwareCPU{}
|
||||||
cpu.Status = &status
|
cpu.Status = &status
|
||||||
|
present := true
|
||||||
|
cpu.Present = &present
|
||||||
|
|
||||||
if socket, ok := parseSocketIndex(fields["Socket Designation"]); ok {
|
if socket, ok := parseSocketIndex(fields["Socket Designation"]); ok {
|
||||||
cpu.Socket = &socket
|
cpu.Socket = &socket
|
||||||
@@ -99,15 +101,15 @@ func parseCPUStatus(raw string) string {
|
|||||||
upper := strings.ToUpper(raw)
|
upper := strings.ToUpper(raw)
|
||||||
switch {
|
switch {
|
||||||
case upper == "" || upper == "UNKNOWN":
|
case upper == "" || upper == "UNKNOWN":
|
||||||
return "UNKNOWN"
|
return statusUnknown
|
||||||
case strings.Contains(upper, "UNPOPULATED") || strings.Contains(upper, "NOT POPULATED"):
|
case strings.Contains(upper, "UNPOPULATED") || strings.Contains(upper, "NOT POPULATED"):
|
||||||
return "EMPTY"
|
return statusEmpty
|
||||||
case strings.Contains(upper, "ENABLED"):
|
case strings.Contains(upper, "ENABLED"):
|
||||||
return "OK"
|
return statusOK
|
||||||
case strings.Contains(upper, "DISABLED"):
|
case strings.Contains(upper, "DISABLED"):
|
||||||
return "WARNING"
|
return statusWarning
|
||||||
default:
|
default:
|
||||||
return "UNKNOWN"
|
return statusUnknown
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
196
audit/internal/collector/cpu_telemetry.go
Normal file
196
audit/internal/collector/cpu_telemetry.go
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// cpuSysBaseDir is the sysfs directory holding the per-CPU entries. It is a
// variable rather than a constant so tests can point it at a temporary tree.
var cpuSysBaseDir = "/sys/devices/system/cpu"

// socketIndexRe captures the number that follows "package id", "socket" or
// "cpu" (case-insensitive, optional whitespace in between).
var socketIndexRe = regexp.MustCompile(`(?i)(?:package id|socket|cpu)\s*([0-9]+)`)
|
||||||
|
|
||||||
|
func enrichCPUsWithTelemetry(cpus []schema.HardwareCPU, doc sensorsDoc) []schema.HardwareCPU {
|
||||||
|
if len(cpus) == 0 {
|
||||||
|
return cpus
|
||||||
|
}
|
||||||
|
|
||||||
|
tempBySocket := cpuTempsFromSensors(doc, len(cpus))
|
||||||
|
powerBySocket := cpuPowerFromSensors(doc, len(cpus))
|
||||||
|
throttleBySocket := cpuThrottleBySocket()
|
||||||
|
|
||||||
|
for i := range cpus {
|
||||||
|
socket := 0
|
||||||
|
if cpus[i].Socket != nil {
|
||||||
|
socket = *cpus[i].Socket
|
||||||
|
}
|
||||||
|
if value, ok := tempBySocket[socket]; ok {
|
||||||
|
cpus[i].TemperatureC = &value
|
||||||
|
}
|
||||||
|
if value, ok := powerBySocket[socket]; ok {
|
||||||
|
cpus[i].PowerW = &value
|
||||||
|
}
|
||||||
|
if value, ok := throttleBySocket[socket]; ok {
|
||||||
|
cpus[i].Throttled = &value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return cpus
|
||||||
|
}
|
||||||
|
|
||||||
|
func cpuTempsFromSensors(doc sensorsDoc, cpuCount int) map[int]float64 {
|
||||||
|
out := map[int]float64{}
|
||||||
|
if len(doc) == 0 {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
var fallback []float64
|
||||||
|
for chip, features := range doc {
|
||||||
|
for featureName, raw := range features {
|
||||||
|
feature, ok := raw.(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if classifySensorFeature(feature) != "temp" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
temp, ok := firstFeatureFloat(feature, "_input")
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if socket, ok := detectCPUSocket(chip, featureName); ok {
|
||||||
|
if _, exists := out[socket]; !exists {
|
||||||
|
out[socket] = temp
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if isLikelyCPUTemp(chip, featureName) {
|
||||||
|
fallback = append(fallback, temp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(out) == 0 && cpuCount == 1 && len(fallback) > 0 {
|
||||||
|
out[0] = fallback[0]
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func cpuPowerFromSensors(doc sensorsDoc, cpuCount int) map[int]float64 {
|
||||||
|
out := map[int]float64{}
|
||||||
|
if len(doc) == 0 {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
var fallback []float64
|
||||||
|
for chip, features := range doc {
|
||||||
|
for featureName, raw := range features {
|
||||||
|
feature, ok := raw.(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if classifySensorFeature(feature) != "power" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
power, ok := firstFeatureFloatWithContains(feature, []string{"power"})
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if socket, ok := detectCPUSocket(chip, featureName); ok {
|
||||||
|
if _, exists := out[socket]; !exists {
|
||||||
|
out[socket] = power
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if isLikelyCPUPower(chip, featureName) {
|
||||||
|
fallback = append(fallback, power)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(out) == 0 && cpuCount == 1 && len(fallback) > 0 {
|
||||||
|
out[0] = fallback[0]
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func detectCPUSocket(parts ...string) (int, bool) {
|
||||||
|
for _, part := range parts {
|
||||||
|
matches := socketIndexRe.FindStringSubmatch(strings.ToLower(part))
|
||||||
|
if len(matches) == 2 {
|
||||||
|
value, err := strconv.Atoi(matches[1])
|
||||||
|
if err == nil {
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// isLikelyCPUTemp reports whether the chip or feature name contains one of the
// markers used here to recognize CPU temperature sensors (case-insensitive).
func isLikelyCPUTemp(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	markers := [...]string{"coretemp", "k10temp", "package id", "tdie", "tctl", "cpu temp"}
	for _, marker := range markers {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
// isLikelyCPUPower reports whether the chip or feature name contains one of the
// markers used here to recognize CPU power sensors (case-insensitive).
func isLikelyCPUPower(chip, feature string) bool {
	haystack := strings.ToLower(chip + " " + feature)
	markers := [...]string{"intel-rapl", "package id", "package-", "cpu power"}
	for _, marker := range markers {
		if strings.Contains(haystack, marker) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
func cpuThrottleBySocket() map[int]bool {
|
||||||
|
out := map[int]bool{}
|
||||||
|
cpuDirs, err := filepath.Glob(filepath.Join(cpuSysBaseDir, "cpu[0-9]*"))
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
sort.Strings(cpuDirs)
|
||||||
|
for _, cpuDir := range cpuDirs {
|
||||||
|
socket, ok := readSocketIndex(cpuDir)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if cpuPackageThrottled(cpuDir) {
|
||||||
|
out[socket] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// readSocketIndex reads topology/physical_package_id under cpuDir and returns
// it as the socket number. It returns (0, false) when the file cannot be read,
// is not an integer, or holds a negative value.
func readSocketIndex(cpuDir string) (int, bool) {
	idPath := filepath.Join(cpuDir, "topology", "physical_package_id")
	data, readErr := os.ReadFile(idPath)
	if readErr != nil {
		return 0, false
	}

	id, convErr := strconv.Atoi(strings.TrimSpace(string(data)))
	if convErr == nil && id >= 0 {
		return id, true
	}
	return 0, false
}
|
||||||
|
|
||||||
|
// cpuPackageThrottled reports whether either thermal_throttle counter under
// cpuDir (package_throttle_count, then core_throttle_count) holds a value
// greater than zero. Unreadable or unparsable counters are ignored.
func cpuPackageThrottled(cpuDir string) bool {
	throttleDir := filepath.Join(cpuDir, "thermal_throttle")
	for _, counter := range [...]string{"package_throttle_count", "core_throttle_count"} {
		data, err := os.ReadFile(filepath.Join(throttleDir, counter))
		if err != nil {
			continue
		}
		count, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
		if err == nil && count > 0 {
			return true
		}
	}
	return false
}
|
||||||
71
audit/internal/collector/cpu_telemetry_test.go
Normal file
71
audit/internal/collector/cpu_telemetry_test.go
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEnrichCPUsWithTelemetry(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
oldBase := cpuSysBaseDir
|
||||||
|
cpuSysBaseDir = tmp
|
||||||
|
t.Cleanup(func() { cpuSysBaseDir = oldBase })
|
||||||
|
|
||||||
|
mustWriteFile(t, filepath.Join(tmp, "cpu0", "topology", "physical_package_id"), "0\n")
|
||||||
|
mustWriteFile(t, filepath.Join(tmp, "cpu0", "thermal_throttle", "package_throttle_count"), "3\n")
|
||||||
|
mustWriteFile(t, filepath.Join(tmp, "cpu1", "topology", "physical_package_id"), "1\n")
|
||||||
|
mustWriteFile(t, filepath.Join(tmp, "cpu1", "thermal_throttle", "package_throttle_count"), "0\n")
|
||||||
|
|
||||||
|
doc := sensorsDoc{
|
||||||
|
"coretemp-isa-0000": {
|
||||||
|
"Package id 0": map[string]any{"temp1_input": 61.5},
|
||||||
|
"Package id 1": map[string]any{"temp2_input": 58.0},
|
||||||
|
},
|
||||||
|
"intel-rapl-mmio-0": {
|
||||||
|
"Package id 0": map[string]any{"power1_average": 180.0},
|
||||||
|
"Package id 1": map[string]any{"power2_average": 175.0},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
socket0 := 0
|
||||||
|
socket1 := 1
|
||||||
|
status := statusOK
|
||||||
|
cpus := []schema.HardwareCPU{
|
||||||
|
{Socket: &socket0, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
{Socket: &socket1, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
}
|
||||||
|
|
||||||
|
got := enrichCPUsWithTelemetry(cpus, doc)
|
||||||
|
|
||||||
|
if got[0].TemperatureC == nil || *got[0].TemperatureC != 61.5 {
|
||||||
|
t.Fatalf("cpu0 temperature mismatch: %#v", got[0].TemperatureC)
|
||||||
|
}
|
||||||
|
if got[0].PowerW == nil || *got[0].PowerW != 180.0 {
|
||||||
|
t.Fatalf("cpu0 power mismatch: %#v", got[0].PowerW)
|
||||||
|
}
|
||||||
|
if got[0].Throttled == nil || !*got[0].Throttled {
|
||||||
|
t.Fatalf("cpu0 throttled mismatch: %#v", got[0].Throttled)
|
||||||
|
}
|
||||||
|
if got[1].TemperatureC == nil || *got[1].TemperatureC != 58.0 {
|
||||||
|
t.Fatalf("cpu1 temperature mismatch: %#v", got[1].TemperatureC)
|
||||||
|
}
|
||||||
|
if got[1].PowerW == nil || *got[1].PowerW != 175.0 {
|
||||||
|
t.Fatalf("cpu1 power mismatch: %#v", got[1].PowerW)
|
||||||
|
}
|
||||||
|
if got[1].Throttled != nil && *got[1].Throttled {
|
||||||
|
t.Fatalf("cpu1 throttled mismatch: %#v", got[1].Throttled)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// mustWriteFile creates path's parent directories and writes content to path,
// failing the test immediately if either step returns an error.
func mustWriteFile(t *testing.T, path, content string) {
	t.Helper()

	parent := filepath.Dir(path)
	if mkErr := os.MkdirAll(parent, 0755); mkErr != nil {
		t.Fatalf("mkdir %s: %v", path, mkErr)
	}

	payload := []byte(content)
	if writeErr := os.WriteFile(path, payload, 0644); writeErr != nil {
		t.Fatalf("write %s: %v", path, writeErr)
	}
}
|
||||||
@@ -69,12 +69,12 @@ func TestParseCPUStatus(t *testing.T) {
|
|||||||
want string
|
want string
|
||||||
}{
|
}{
|
||||||
{"Populated, Enabled", "OK"},
|
{"Populated, Enabled", "OK"},
|
||||||
{"Populated, Disabled By User", "WARNING"},
|
{"Populated, Disabled By User", statusWarning},
|
||||||
{"Populated, Disabled By BIOS", "WARNING"},
|
{"Populated, Disabled By BIOS", statusWarning},
|
||||||
{"Unpopulated", "EMPTY"},
|
{"Unpopulated", statusEmpty},
|
||||||
{"Not Populated", "EMPTY"},
|
{"Not Populated", statusEmpty},
|
||||||
{"Unknown", "UNKNOWN"},
|
{"Unknown", statusUnknown},
|
||||||
{"", "UNKNOWN"},
|
{"", statusUnknown},
|
||||||
}
|
}
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
got := parseCPUStatus(tt.input)
|
got := parseCPUStatus(tt.input)
|
||||||
|
|||||||
179
audit/internal/collector/finalize.go
Normal file
179
audit/internal/collector/finalize.go
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"fmt"
|
||||||
|
)
|
||||||
|
|
||||||
|
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||||
|
snap.Memory = filterMemory(snap.Memory)
|
||||||
|
snap.Storage = filterStorage(snap.Storage)
|
||||||
|
snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
|
||||||
|
|
||||||
|
setComponentStatusMetadata(snap, collectedAt)
|
||||||
|
deduplicateComponentSerials(snap)
|
||||||
|
}
|
||||||
|
|
||||||
|
func filterMemory(dimms []schema.HardwareMemory) []schema.HardwareMemory {
|
||||||
|
out := make([]schema.HardwareMemory, 0, len(dimms))
|
||||||
|
for _, dimm := range dimms {
|
||||||
|
if dimm.Present != nil && !*dimm.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if dimm.Status != nil && *dimm.Status == statusEmpty {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if dimm.SerialNumber == nil || *dimm.SerialNumber == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, dimm)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||||
|
out := make([]schema.HardwareStorage, 0, len(disks))
|
||||||
|
for _, disk := range disks {
|
||||||
|
if disk.SerialNumber == nil || *disk.SerialNumber == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, disk)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||||
|
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||||
|
for _, psu := range psus {
|
||||||
|
if psu.SerialNumber == nil || *psu.SerialNumber == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, psu)
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func setComponentStatusMetadata(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||||
|
for i := range snap.CPUs {
|
||||||
|
setStatusCheckedAt(&snap.CPUs[i].HardwareComponentStatus, collectedAt)
|
||||||
|
}
|
||||||
|
for i := range snap.Memory {
|
||||||
|
setStatusCheckedAt(&snap.Memory[i].HardwareComponentStatus, collectedAt)
|
||||||
|
}
|
||||||
|
for i := range snap.Storage {
|
||||||
|
setStatusCheckedAt(&snap.Storage[i].HardwareComponentStatus, collectedAt)
|
||||||
|
}
|
||||||
|
for i := range snap.PCIeDevices {
|
||||||
|
setStatusCheckedAt(&snap.PCIeDevices[i].HardwareComponentStatus, collectedAt)
|
||||||
|
}
|
||||||
|
for i := range snap.PowerSupplies {
|
||||||
|
setStatusCheckedAt(&snap.PowerSupplies[i].HardwareComponentStatus, collectedAt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func setStatusCheckedAt(status *schema.HardwareComponentStatus, collectedAt string) {
|
||||||
|
if status == nil || status.Status == nil || *status.Status == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if status.StatusCheckedAt == nil {
|
||||||
|
status.StatusCheckedAt = &collectedAt
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func deduplicateComponentSerials(snap *schema.HardwareSnapshot) {
|
||||||
|
deduplicateCPUSerials(snap.CPUs)
|
||||||
|
deduplicateMemorySerials(snap.Memory)
|
||||||
|
deduplicateStorageSerials(snap.Storage)
|
||||||
|
deduplicatePCIeSerials(snap.PCIeDevices)
|
||||||
|
deduplicatePSUSerials(snap.PowerSupplies)
|
||||||
|
}
|
||||||
|
|
||||||
|
func deduplicateCPUSerials(items []schema.HardwareCPU) {
|
||||||
|
seen := map[string]int{}
|
||||||
|
seq := 1
|
||||||
|
for i := range items {
|
||||||
|
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
model := derefString(items[i].Model)
|
||||||
|
key := model + "\x00" + *items[i].SerialNumber
|
||||||
|
seen[key]++
|
||||||
|
if seen[key] > 1 {
|
||||||
|
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||||
|
seq++
|
||||||
|
items[i].SerialNumber = &repl
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func deduplicateMemorySerials(items []schema.HardwareMemory) {
|
||||||
|
seen := map[string]int{}
|
||||||
|
seq := 1
|
||||||
|
for i := range items {
|
||||||
|
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
model := derefString(items[i].PartNumber)
|
||||||
|
key := model + "\x00" + *items[i].SerialNumber
|
||||||
|
seen[key]++
|
||||||
|
if seen[key] > 1 {
|
||||||
|
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||||
|
seq++
|
||||||
|
items[i].SerialNumber = &repl
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func deduplicateStorageSerials(items []schema.HardwareStorage) {
|
||||||
|
seen := map[string]int{}
|
||||||
|
seq := 1
|
||||||
|
for i := range items {
|
||||||
|
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
model := derefString(items[i].Model)
|
||||||
|
key := model + "\x00" + *items[i].SerialNumber
|
||||||
|
seen[key]++
|
||||||
|
if seen[key] > 1 {
|
||||||
|
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||||
|
seq++
|
||||||
|
items[i].SerialNumber = &repl
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func deduplicatePCIeSerials(items []schema.HardwarePCIeDevice) {
|
||||||
|
seen := map[string]int{}
|
||||||
|
seq := 1
|
||||||
|
for i := range items {
|
||||||
|
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
model := derefString(items[i].Model)
|
||||||
|
key := model + "\x00" + *items[i].SerialNumber
|
||||||
|
seen[key]++
|
||||||
|
if seen[key] > 1 {
|
||||||
|
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||||
|
seq++
|
||||||
|
items[i].SerialNumber = &repl
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func deduplicatePSUSerials(items []schema.HardwarePowerSupply) {
|
||||||
|
seen := map[string]int{}
|
||||||
|
seq := 1
|
||||||
|
for i := range items {
|
||||||
|
if items[i].SerialNumber == nil || *items[i].SerialNumber == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
model := derefString(items[i].Model)
|
||||||
|
key := model + "\x00" + *items[i].SerialNumber
|
||||||
|
seen[key]++
|
||||||
|
if seen[key] > 1 {
|
||||||
|
repl := fmt.Sprintf("NO_SN-%08d", seq)
|
||||||
|
seq++
|
||||||
|
items[i].SerialNumber = &repl
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
63
audit/internal/collector/finalize_test.go
Normal file
63
audit/internal/collector/finalize_test.go
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||||
|
collectedAt := "2026-03-15T12:00:00Z"
|
||||||
|
present := true
|
||||||
|
status := statusOK
|
||||||
|
serial := "SN-1"
|
||||||
|
|
||||||
|
snap := schema.HardwareSnapshot{
|
||||||
|
Memory: []schema.HardwareMemory{
|
||||||
|
{Present: &present, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
},
|
||||||
|
Storage: []schema.HardwareStorage{
|
||||||
|
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
},
|
||||||
|
PowerSupplies: []schema.HardwarePowerSupply{
|
||||||
|
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
finalizeSnapshot(&snap, collectedAt)
|
||||||
|
|
||||||
|
if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
|
||||||
|
t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
|
||||||
|
}
|
||||||
|
if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||||
|
t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
|
||||||
|
}
|
||||||
|
if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
|
||||||
|
t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestFinalizeSnapshotDeduplicatesSerials(t *testing.T) {
|
||||||
|
collectedAt := "2026-03-15T12:00:00Z"
|
||||||
|
status := statusOK
|
||||||
|
model := "Device"
|
||||||
|
serial := "DUPLICATE"
|
||||||
|
|
||||||
|
snap := schema.HardwareSnapshot{
|
||||||
|
Storage: []schema.HardwareStorage{
|
||||||
|
{Model: &model, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
{Model: &model, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
finalizeSnapshot(&snap, collectedAt)
|
||||||
|
|
||||||
|
if got := *snap.Storage[0].SerialNumber; got != serial {
|
||||||
|
t.Fatalf("first serial changed: %q", got)
|
||||||
|
}
|
||||||
|
if got := *snap.Storage[1].SerialNumber; got != "NO_SN-00000001" {
|
||||||
|
t.Fatalf("duplicate serial mismatch: %q", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -47,12 +47,12 @@ func parseMemorySection(fields map[string]string) schema.HardwareMemory {
|
|||||||
dimm.Present = &present
|
dimm.Present = &present
|
||||||
|
|
||||||
if !present {
|
if !present {
|
||||||
status := "EMPTY"
|
status := statusEmpty
|
||||||
dimm.Status = &status
|
dimm.Status = &status
|
||||||
return dimm
|
return dimm
|
||||||
}
|
}
|
||||||
|
|
||||||
status := "OK"
|
status := statusOK
|
||||||
dimm.Status = &status
|
dimm.Status = &status
|
||||||
|
|
||||||
if mb := parseMemorySizeMB(rawSize); mb > 0 {
|
if mb := parseMemorySizeMB(rawSize); mb > 0 {
|
||||||
|
|||||||
203
audit/internal/collector/memory_telemetry.go
Normal file
203
audit/internal/collector/memory_telemetry.go
Normal file
@@ -0,0 +1,203 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
var edacBaseDir = "/sys/devices/system/edac/mc"
|
||||||
|
|
||||||
|
// edacDIMMStats holds the label and ECC error counters read for one EDAC
// DIMM directory.
type edacDIMMStats struct {
	// Label is the trimmed DIMM label text.
	Label string
	// CECount and UECount are the correctable and uncorrectable error
	// counters; each stays nil when no valid counter value was read.
	CECount, UECount *int64
}
|
||||||
|
|
||||||
|
func enrichMemoryWithTelemetry(dimms []schema.HardwareMemory, doc sensorsDoc) []schema.HardwareMemory {
|
||||||
|
if len(dimms) == 0 {
|
||||||
|
return dimms
|
||||||
|
}
|
||||||
|
|
||||||
|
tempByLabel := memoryTempsFromSensors(doc)
|
||||||
|
stats := readEDACStats()
|
||||||
|
|
||||||
|
for i := range dimms {
|
||||||
|
labelKeys := dimmMatchKeys(dimms[i].Slot, dimms[i].Location)
|
||||||
|
|
||||||
|
for _, key := range labelKeys {
|
||||||
|
if temp, ok := tempByLabel[key]; ok {
|
||||||
|
dimms[i].TemperatureC = &temp
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, key := range labelKeys {
|
||||||
|
if stat, ok := stats[key]; ok {
|
||||||
|
if stat.CECount != nil {
|
||||||
|
dimms[i].CorrectableECCErrorCount = stat.CECount
|
||||||
|
}
|
||||||
|
if stat.UECount != nil {
|
||||||
|
dimms[i].UncorrectableECCErrorCount = stat.UECount
|
||||||
|
}
|
||||||
|
if stat.UECount != nil && *stat.UECount > 0 {
|
||||||
|
dimms[i].DataLossDetected = boolPtr(true)
|
||||||
|
status := statusCritical
|
||||||
|
dimms[i].Status = &status
|
||||||
|
if dimms[i].ErrorDescription == nil {
|
||||||
|
dimms[i].ErrorDescription = stringPtr("EDAC reports uncorrectable ECC errors")
|
||||||
|
}
|
||||||
|
} else if stat.CECount != nil && *stat.CECount > 0 && (dimms[i].Status == nil || *dimms[i].Status == statusOK) {
|
||||||
|
status := statusWarning
|
||||||
|
dimms[i].Status = &status
|
||||||
|
if dimms[i].ErrorDescription == nil {
|
||||||
|
dimms[i].ErrorDescription = stringPtr("EDAC reports correctable ECC errors")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return dimms
|
||||||
|
}
|
||||||
|
|
||||||
|
func memoryTempsFromSensors(doc sensorsDoc) map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
if len(doc) == 0 {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
for chip, features := range doc {
|
||||||
|
for featureName, raw := range features {
|
||||||
|
feature, ok := raw.(map[string]any)
|
||||||
|
if !ok || classifySensorFeature(feature) != "temp" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !isLikelyMemoryTemp(chip, featureName) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
temp, ok := firstFeatureFloat(feature, "_input")
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := canonicalLabel(featureName)
|
||||||
|
if key == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, exists := out[key]; !exists {
|
||||||
|
out[key] = temp
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func readEDACStats() map[string]edacDIMMStats {
|
||||||
|
out := map[string]edacDIMMStats{}
|
||||||
|
mcDirs, err := filepath.Glob(filepath.Join(edacBaseDir, "mc*"))
|
||||||
|
if err != nil {
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
sort.Strings(mcDirs)
|
||||||
|
for _, mcDir := range mcDirs {
|
||||||
|
dimmDirs, err := filepath.Glob(filepath.Join(mcDir, "dimm*"))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sort.Strings(dimmDirs)
|
||||||
|
for _, dimmDir := range dimmDirs {
|
||||||
|
stat, ok := readEDACDIMMStats(dimmDir)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := canonicalLabel(stat.Label)
|
||||||
|
if key == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out[key] = stat
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func readEDACDIMMStats(dimmDir string) (edacDIMMStats, bool) {
|
||||||
|
labelBytes, err := os.ReadFile(filepath.Join(dimmDir, "dimm_label"))
|
||||||
|
if err != nil {
|
||||||
|
labelBytes, err = os.ReadFile(filepath.Join(dimmDir, "label"))
|
||||||
|
if err != nil {
|
||||||
|
return edacDIMMStats{}, false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
label := strings.TrimSpace(string(labelBytes))
|
||||||
|
if label == "" {
|
||||||
|
return edacDIMMStats{}, false
|
||||||
|
}
|
||||||
|
|
||||||
|
stat := edacDIMMStats{Label: label}
|
||||||
|
if value, ok := readEDACCount(dimmDir, []string{"dimm_ce_count", "ce_count"}); ok {
|
||||||
|
stat.CECount = &value
|
||||||
|
}
|
||||||
|
if value, ok := readEDACCount(dimmDir, []string{"dimm_ue_count", "ue_count"}); ok {
|
||||||
|
stat.UECount = &value
|
||||||
|
}
|
||||||
|
return stat, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// readEDACCount returns the first non-negative integer found in the named
// files under dir, trying names in order.
//
// Files that are missing, unparsable or negative are skipped. It returns
// (0, false) when no candidate yields a valid value.
func readEDACCount(dir string, names []string) (int64, bool) {
	for _, candidate := range names {
		data, readErr := os.ReadFile(filepath.Join(dir, candidate))
		if readErr != nil {
			continue
		}
		count, parseErr := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 64)
		if parseErr != nil || count < 0 {
			continue
		}
		return count, true
	}
	return 0, false
}
|
||||||
|
|
||||||
|
func dimmMatchKeys(slot, location *string) []string {
|
||||||
|
var out []string
|
||||||
|
add := func(value *string) {
|
||||||
|
key := canonicalLabel(derefString(value))
|
||||||
|
if key == "" {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, existing := range out {
|
||||||
|
if existing == key {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out = append(out, key)
|
||||||
|
}
|
||||||
|
add(slot)
|
||||||
|
add(location)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// canonicalLabel upper-cases value and strips everything except the ASCII
// letters A-Z and digits 0-9, so labels that differ only in case, spacing or
// punctuation compare equal. Blank input yields "".
func canonicalLabel(value string) string {
	upper := strings.ToUpper(strings.TrimSpace(value))
	return strings.Map(func(r rune) rune {
		switch {
		case 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
			return r
		default:
			// A negative rune tells strings.Map to drop the character.
			return -1
		}
	}, upper)
}
|
||||||
|
|
||||||
|
// isLikelyMemoryTemp reports whether a sensor looks like a DIMM temperature,
// judged by a case-insensitive search for "dimm" in the chip and feature
// names. This also covers "sodimm", which contains "dimm" as a substring, so
// no separate check is needed.
func isLikelyMemoryTemp(chip, feature string) bool {
	return strings.Contains(strings.ToLower(chip+" "+feature), "dimm")
}
|
||||||
|
|
||||||
|
// boolPtr returns a pointer to a fresh copy of value.
func boolPtr(value bool) *bool {
	ptr := new(bool)
	*ptr = value
	return ptr
}
|
||||||
61
audit/internal/collector/memory_telemetry_test.go
Normal file
61
audit/internal/collector/memory_telemetry_test.go
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEnrichMemoryWithTelemetry(t *testing.T) {
|
||||||
|
tmp := t.TempDir()
|
||||||
|
oldBase := edacBaseDir
|
||||||
|
edacBaseDir = tmp
|
||||||
|
t.Cleanup(func() { edacBaseDir = oldBase })
|
||||||
|
|
||||||
|
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_label"), "CPU0_DIMM_A1\n")
|
||||||
|
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_ce_count"), "7\n")
|
||||||
|
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm0", "dimm_ue_count"), "0\n")
|
||||||
|
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_label"), "CPU1_DIMM_B2\n")
|
||||||
|
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_ce_count"), "0\n")
|
||||||
|
mustWriteFile(t, filepath.Join(tmp, "mc0", "dimm1", "dimm_ue_count"), "2\n")
|
||||||
|
|
||||||
|
doc := sensorsDoc{
|
||||||
|
"jc42-i2c-0-18": {
|
||||||
|
"CPU0 DIMM A1": map[string]any{"temp1_input": 43.0},
|
||||||
|
"CPU1 DIMM B2": map[string]any{"temp2_input": 46.0},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
status := statusOK
|
||||||
|
slotA := "CPU0_DIMM_A1"
|
||||||
|
slotB := "CPU1_DIMM_B2"
|
||||||
|
dimms := []schema.HardwareMemory{
|
||||||
|
{Slot: &slotA, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
{Slot: &slotB, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||||
|
}
|
||||||
|
|
||||||
|
got := enrichMemoryWithTelemetry(dimms, doc)
|
||||||
|
|
||||||
|
if got[0].TemperatureC == nil || *got[0].TemperatureC != 43.0 {
|
||||||
|
t.Fatalf("dimm0 temperature mismatch: %#v", got[0].TemperatureC)
|
||||||
|
}
|
||||||
|
if got[0].CorrectableECCErrorCount == nil || *got[0].CorrectableECCErrorCount != 7 {
|
||||||
|
t.Fatalf("dimm0 ce mismatch: %#v", got[0].CorrectableECCErrorCount)
|
||||||
|
}
|
||||||
|
if got[0].Status == nil || *got[0].Status != statusWarning {
|
||||||
|
t.Fatalf("dimm0 status mismatch: %#v", got[0].Status)
|
||||||
|
}
|
||||||
|
if got[1].TemperatureC == nil || *got[1].TemperatureC != 46.0 {
|
||||||
|
t.Fatalf("dimm1 temperature mismatch: %#v", got[1].TemperatureC)
|
||||||
|
}
|
||||||
|
if got[1].UncorrectableECCErrorCount == nil || *got[1].UncorrectableECCErrorCount != 2 {
|
||||||
|
t.Fatalf("dimm1 ue mismatch: %#v", got[1].UncorrectableECCErrorCount)
|
||||||
|
}
|
||||||
|
if got[1].Status == nil || *got[1].Status != statusCritical {
|
||||||
|
t.Fatalf("dimm1 status mismatch: %#v", got[1].Status)
|
||||||
|
}
|
||||||
|
if got[1].DataLossDetected == nil || !*got[1].DataLossDetected {
|
||||||
|
t.Fatalf("dimm1 data_loss_detected mismatch: %#v", got[1].DataLossDetected)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -18,17 +18,13 @@ var (
|
|||||||
}
|
}
|
||||||
return string(out), nil
|
return string(out), nil
|
||||||
}
|
}
|
||||||
readNetStatFile = func(iface, key string) (int64, error) {
|
readNetAddressFile = func(iface string) (string, error) {
|
||||||
path := filepath.Join("/sys/class/net", iface, "statistics", key)
|
path := filepath.Join("/sys/class/net", iface, "address")
|
||||||
raw, err := os.ReadFile(path)
|
raw, err := os.ReadFile(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, err
|
return "", err
|
||||||
}
|
}
|
||||||
v, err := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
|
return strings.TrimSpace(string(raw)), nil
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
|
||||||
return v, nil
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -47,6 +43,7 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
iface := ifaces[0]
|
iface := ifaces[0]
|
||||||
|
devs[i].MacAddresses = collectInterfaceMACs(ifaces)
|
||||||
|
|
||||||
if devs[i].Firmware == nil {
|
if devs[i].Firmware == nil {
|
||||||
if out, err := ethtoolInfoQuery(iface); err == nil {
|
if out, err := ethtoolInfoQuery(iface); err == nil {
|
||||||
@@ -56,16 +53,13 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if devs[i].Telemetry == nil {
|
|
||||||
devs[i].Telemetry = map[string]any{}
|
|
||||||
}
|
|
||||||
injectNICPacketStats(devs[i].Telemetry, iface)
|
|
||||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||||
injectSFPDOMTelemetry(devs[i].Telemetry, out)
|
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||||
|
enriched++
|
||||||
|
continue
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if len(devs[i].Telemetry) == 0 {
|
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||||
devs[i].Telemetry = nil
|
|
||||||
} else {
|
|
||||||
enriched++
|
enriched++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -77,31 +71,32 @@ func isNICDevice(dev schema.HardwarePCIeDevice) bool {
|
|||||||
if dev.DeviceClass == nil {
|
if dev.DeviceClass == nil {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
c := strings.ToLower(strings.TrimSpace(*dev.DeviceClass))
|
c := strings.TrimSpace(*dev.DeviceClass)
|
||||||
return strings.Contains(c, "ethernet controller") ||
|
return isNICClass(c) || strings.EqualFold(c, "FibreChannelController")
|
||||||
strings.Contains(c, "network controller") ||
|
|
||||||
strings.Contains(c, "infiniband controller")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func injectNICPacketStats(dst map[string]any, iface string) {
|
func collectInterfaceMACs(ifaces []string) []string {
|
||||||
for _, key := range []string{"rx_packets", "tx_packets", "rx_errors", "tx_errors"} {
|
seen := map[string]struct{}{}
|
||||||
if v, err := readNetStatFile(iface, key); err == nil {
|
var out []string
|
||||||
dst[key] = v
|
for _, iface := range ifaces {
|
||||||
|
mac, err := readNetAddressFile(iface)
|
||||||
|
if err != nil || mac == "" {
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
mac = strings.ToLower(strings.TrimSpace(mac))
|
||||||
|
if _, ok := seen[mac]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[mac] = struct{}{}
|
||||||
|
out = append(out, mac)
|
||||||
}
|
}
|
||||||
}
|
return out
|
||||||
|
|
||||||
func injectSFPDOMTelemetry(dst map[string]any, raw string) {
|
|
||||||
parsed := parseSFPDOM(raw)
|
|
||||||
for k, v := range parsed {
|
|
||||||
dst[k] = v
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var floatRe = regexp.MustCompile(`[-+]?[0-9]*\.?[0-9]+`)
|
var floatRe = regexp.MustCompile(`[-+]?[0-9]*\.?[0-9]+`)
|
||||||
|
|
||||||
func parseSFPDOM(raw string) map[string]any {
|
func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||||
out := map[string]any{}
|
var changed bool
|
||||||
for _, line := range strings.Split(raw, "\n") {
|
for _, line := range strings.Split(raw, "\n") {
|
||||||
trimmed := strings.TrimSpace(line)
|
trimmed := strings.TrimSpace(line)
|
||||||
if trimmed == "" {
|
if trimmed == "" {
|
||||||
@@ -117,26 +112,55 @@ func parseSFPDOM(raw string) map[string]any {
|
|||||||
switch {
|
switch {
|
||||||
case strings.Contains(key, "module temperature"):
|
case strings.Contains(key, "module temperature"):
|
||||||
if f, ok := firstFloat(val); ok {
|
if f, ok := firstFloat(val); ok {
|
||||||
out["sfp_temperature_c"] = f
|
dev.SFPTemperatureC = &f
|
||||||
|
changed = true
|
||||||
}
|
}
|
||||||
case strings.Contains(key, "laser output power"):
|
case strings.Contains(key, "laser output power"):
|
||||||
if f, ok := dbmValue(val); ok {
|
if f, ok := dbmValue(val); ok {
|
||||||
out["sfp_tx_power_dbm"] = f
|
dev.SFPTXPowerDBM = &f
|
||||||
|
changed = true
|
||||||
}
|
}
|
||||||
case strings.Contains(key, "receiver signal"):
|
case strings.Contains(key, "receiver signal"):
|
||||||
if f, ok := dbmValue(val); ok {
|
if f, ok := dbmValue(val); ok {
|
||||||
out["sfp_rx_power_dbm"] = f
|
dev.SFPRXPowerDBM = &f
|
||||||
|
changed = true
|
||||||
}
|
}
|
||||||
case strings.Contains(key, "module voltage"):
|
case strings.Contains(key, "module voltage"):
|
||||||
if f, ok := firstFloat(val); ok {
|
if f, ok := firstFloat(val); ok {
|
||||||
out["sfp_voltage_v"] = f
|
dev.SFPVoltageV = &f
|
||||||
|
changed = true
|
||||||
}
|
}
|
||||||
case strings.Contains(key, "laser bias current"):
|
case strings.Contains(key, "laser bias current"):
|
||||||
if f, ok := firstFloat(val); ok {
|
if f, ok := firstFloat(val); ok {
|
||||||
out["sfp_bias_ma"] = f
|
dev.SFPBiasMA = &f
|
||||||
|
changed = true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return changed
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseSFPDOM(raw string) map[string]any {
|
||||||
|
dev := schema.HardwarePCIeDevice{}
|
||||||
|
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||||
|
return map[string]any{}
|
||||||
|
}
|
||||||
|
out := map[string]any{}
|
||||||
|
if dev.SFPTemperatureC != nil {
|
||||||
|
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||||
|
}
|
||||||
|
if dev.SFPTXPowerDBM != nil {
|
||||||
|
out["sfp_tx_power_dbm"] = *dev.SFPTXPowerDBM
|
||||||
|
}
|
||||||
|
if dev.SFPRXPowerDBM != nil {
|
||||||
|
out["sfp_rx_power_dbm"] = *dev.SFPRXPowerDBM
|
||||||
|
}
|
||||||
|
if dev.SFPVoltageV != nil {
|
||||||
|
out["sfp_voltage_v"] = *dev.SFPVoltageV
|
||||||
|
}
|
||||||
|
if dev.SFPBiasMA != nil {
|
||||||
|
out["sfp_bias_ma"] = *dev.SFPBiasMA
|
||||||
|
}
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ type nvidiaGPUInfo struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||||
// If the driver/tool is unavailable, NVIDIA devices get UNKNOWN status and
|
// If the driver/tool is unavailable, NVIDIA devices get Unknown status and
|
||||||
// a stable serial fallback based on board serial + slot.
|
// a stable serial fallback based on board serial + slot.
|
||||||
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice {
|
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice {
|
||||||
if !hasNVIDIADevices(devs) {
|
if !hasNVIDIADevices(devs) {
|
||||||
@@ -78,9 +78,10 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
devs[i].Firmware = &v
|
devs[i].Firmware = &v
|
||||||
}
|
}
|
||||||
|
|
||||||
status := "OK"
|
status := statusOK
|
||||||
if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
|
if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
|
||||||
status = "WARNING"
|
status = statusWarning
|
||||||
|
devs[i].ErrorDescription = stringPtr("GPU reports uncorrected ECC errors")
|
||||||
}
|
}
|
||||||
devs[i].Status = &status
|
devs[i].Status = &status
|
||||||
injectNVIDIATelemetry(&devs[i], info)
|
injectNVIDIATelemetry(&devs[i], info)
|
||||||
@@ -214,7 +215,7 @@ func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
|||||||
|
|
||||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
||||||
setPCIeFallbackSerial(dev, boardSerial)
|
setPCIeFallbackSerial(dev, boardSerial)
|
||||||
status := "UNKNOWN"
|
status := statusUnknown
|
||||||
dev.Status = &status
|
dev.Status = &status
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -233,25 +234,19 @@ func setPCIeFallbackSerial(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||||
if dev.Telemetry == nil {
|
|
||||||
dev.Telemetry = map[string]any{}
|
|
||||||
}
|
|
||||||
if info.TemperatureC != nil {
|
if info.TemperatureC != nil {
|
||||||
dev.Telemetry["temperature_c"] = *info.TemperatureC
|
dev.TemperatureC = info.TemperatureC
|
||||||
}
|
}
|
||||||
if info.PowerW != nil {
|
if info.PowerW != nil {
|
||||||
dev.Telemetry["power_w"] = *info.PowerW
|
dev.PowerW = info.PowerW
|
||||||
}
|
}
|
||||||
if info.ECCUncorrected != nil {
|
if info.ECCUncorrected != nil {
|
||||||
dev.Telemetry["ecc_uncorrected_total"] = *info.ECCUncorrected
|
dev.ECCUncorrectedTotal = info.ECCUncorrected
|
||||||
}
|
}
|
||||||
if info.ECCCorrected != nil {
|
if info.ECCCorrected != nil {
|
||||||
dev.Telemetry["ecc_corrected_total"] = *info.ECCCorrected
|
dev.ECCCorrectedTotal = info.ECCCorrected
|
||||||
}
|
}
|
||||||
if info.HWSlowdown != nil {
|
if info.HWSlowdown != nil {
|
||||||
dev.Telemetry["hw_slowdown_active"] = *info.HWSlowdown
|
dev.HWSlowdown = info.HWSlowdown
|
||||||
}
|
|
||||||
if len(dev.Telemetry) == 0 {
|
|
||||||
dev.Telemetry = nil
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -54,10 +54,10 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
|||||||
status := "OK"
|
status := "OK"
|
||||||
devices := []schema.HardwarePCIeDevice{
|
devices := []schema.HardwarePCIeDevice{
|
||||||
{
|
{
|
||||||
VendorID: &vendorID,
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
BDF: &bdf,
|
VendorID: &vendorID,
|
||||||
Manufacturer: &manufacturer,
|
BDF: &bdf,
|
||||||
Status: &status,
|
Manufacturer: &manufacturer,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -80,14 +80,14 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
|||||||
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
if out[0].Firmware == nil || *out[0].Firmware != "96.00.1F.00.02" {
|
||||||
t.Fatalf("firmware: got %v", out[0].Firmware)
|
t.Fatalf("firmware: got %v", out[0].Firmware)
|
||||||
}
|
}
|
||||||
if out[0].Status == nil || *out[0].Status != "WARNING" {
|
if out[0].Status == nil || *out[0].Status != statusWarning {
|
||||||
t.Fatalf("status: got %v", out[0].Status)
|
t.Fatalf("status: got %v", out[0].Status)
|
||||||
}
|
}
|
||||||
if out[0].Telemetry == nil {
|
if out[0].ECCUncorrectedTotal == nil || *out[0].ECCUncorrectedTotal != 2 {
|
||||||
t.Fatal("expected telemetry")
|
t.Fatalf("ecc_uncorrected_total: got %#v", out[0].ECCUncorrectedTotal)
|
||||||
}
|
}
|
||||||
if got, ok := out[0].Telemetry["ecc_uncorrected_total"].(int64); !ok || got != 2 {
|
if out[0].TemperatureC == nil || *out[0].TemperatureC != 55.5 {
|
||||||
t.Fatalf("ecc_uncorrected_total: got %#v", out[0].Telemetry["ecc_uncorrected_total"])
|
t.Fatalf("temperature_c: got %#v", out[0].TemperatureC)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -107,7 +107,7 @@ func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
|||||||
if out[0].SerialNumber == nil || *out[0].SerialNumber != "BOARD-123-PCIE-0000:17:00.0" {
|
if out[0].SerialNumber == nil || *out[0].SerialNumber != "BOARD-123-PCIE-0000:17:00.0" {
|
||||||
t.Fatalf("fallback serial: got %v", out[0].SerialNumber)
|
t.Fatalf("fallback serial: got %v", out[0].SerialNumber)
|
||||||
}
|
}
|
||||||
if out[0].Status == nil || *out[0].Status != "UNKNOWN" {
|
if out[0].Status == nil || *out[0].Status != statusUnknown {
|
||||||
t.Fatalf("fallback status: got %v", out[0].Status)
|
t.Fatalf("fallback status: got %v", out[0].Status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
dev := schema.HardwarePCIeDevice{}
|
dev := schema.HardwarePCIeDevice{}
|
||||||
present := true
|
present := true
|
||||||
dev.Present = &present
|
dev.Present = &present
|
||||||
status := "OK"
|
status := statusOK
|
||||||
dev.Status = &status
|
dev.Status = &status
|
||||||
|
|
||||||
// Slot is the BDF: "0000:00:02.0"
|
// Slot is the BDF: "0000:00:02.0"
|
||||||
@@ -93,10 +93,32 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
if deviceID != 0 {
|
if deviceID != 0 {
|
||||||
dev.DeviceID = &deviceID
|
dev.DeviceID = &deviceID
|
||||||
}
|
}
|
||||||
|
if numaNode, ok := readPCINumaNode(bdf); ok {
|
||||||
|
dev.NUMANode = &numaNode
|
||||||
|
}
|
||||||
|
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||||
|
dev.LinkWidth = &width
|
||||||
|
}
|
||||||
|
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
|
||||||
|
dev.MaxLinkWidth = &width
|
||||||
|
}
|
||||||
|
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
|
||||||
|
linkSpeed := normalizePCILinkSpeed(speed)
|
||||||
|
if linkSpeed != "" {
|
||||||
|
dev.LinkSpeed = &linkSpeed
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
|
||||||
|
linkSpeed := normalizePCILinkSpeed(speed)
|
||||||
|
if linkSpeed != "" {
|
||||||
|
dev.MaxLinkSpeed = &linkSpeed
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if v := fields["Class"]; v != "" {
|
if v := fields["Class"]; v != "" {
|
||||||
dev.DeviceClass = &v
|
class := mapPCIeDeviceClass(v)
|
||||||
|
dev.DeviceClass = &class
|
||||||
}
|
}
|
||||||
if v := fields["Vendor"]; v != "" {
|
if v := fields["Vendor"]; v != "" {
|
||||||
dev.Manufacturer = &v
|
dev.Manufacturer = &v
|
||||||
@@ -131,3 +153,55 @@ func readHexFile(path string) (int, error) {
|
|||||||
n, err := strconv.ParseInt(s, 16, 64)
|
n, err := strconv.ParseInt(s, 16, 64)
|
||||||
return int(n), err
|
return int(n), err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func readPCINumaNode(bdf string) (int, bool) {
|
||||||
|
value, ok := readPCIIntAttribute(bdf, "numa_node")
|
||||||
|
if !ok || value < 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
|
||||||
|
// readPCIIntAttribute reads /sys/bus/pci/devices/<bdf>/<attribute> and parses
// it as a decimal integer. It reports false when the file cannot be read, is
// not an integer, or holds a negative value.
//
// NOTE(review): the file is read by running "cat"; a direct file read would
// avoid spawning a process per attribute — confirm before changing.
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
	path := "/sys/bus/pci/devices/" + bdf + "/" + attribute
	raw, err := exec.Command("cat", path).Output()
	if err != nil {
		return 0, false
	}
	parsed, convErr := strconv.Atoi(strings.TrimSpace(string(raw)))
	if convErr != nil || parsed < 0 {
		return 0, false
	}
	return parsed, true
}
|
||||||
|
|
||||||
|
// readPCIStringAttribute reads /sys/bus/pci/devices/<bdf>/<attribute> and
// returns its whitespace-trimmed content. It reports false when the file
// cannot be read or is blank.
//
// NOTE(review): the file is read by running "cat"; a direct file read would
// avoid spawning a process per attribute — confirm before changing.
func readPCIStringAttribute(bdf, attribute string) (string, bool) {
	path := "/sys/bus/pci/devices/" + bdf + "/" + attribute
	raw, err := exec.Command("cat", path).Output()
	if err != nil {
		return "", false
	}
	if text := strings.TrimSpace(string(raw)); text != "" {
		return text, true
	}
	return "", false
}
|
||||||
|
|
||||||
|
// normalizePCILinkSpeed maps a PCIe link speed string, such as the content of
// the current_link_speed or max_link_speed attribute, to a generation label
// from "Gen1" to "Gen6". It returns "" when the rate is not recognized.
//
// Matching is case-insensitive. Strings carrying a decimal rate ("8.0 GT/s
// PCIe") are matched by substring. As a fallback, the number immediately
// before "GT/s" is parsed, so whole-number spellings such as "5 GT/s" and
// "8 GT/s" are recognized too.
func normalizePCILinkSpeed(raw string) string {
	raw = strings.TrimSpace(strings.ToLower(raw))

	switch {
	case strings.Contains(raw, "2.5"):
		return "Gen1"
	case strings.Contains(raw, "5.0"):
		return "Gen2"
	case strings.Contains(raw, "8.0"):
		return "Gen3"
	case strings.Contains(raw, "16.0"):
		return "Gen4"
	case strings.Contains(raw, "32.0"):
		return "Gen5"
	case strings.Contains(raw, "64.0"):
		return "Gen6"
	}

	// Fallback for rates written without a decimal part: take the token that
	// directly precedes the "gt/s" unit and compare it numerically.
	unit := strings.Index(raw, "gt/s")
	if unit < 0 {
		return ""
	}
	tokens := strings.Fields(raw[:unit])
	if len(tokens) == 0 {
		return ""
	}
	rate, err := strconv.ParseFloat(tokens[len(tokens)-1], 64)
	if err != nil {
		return ""
	}
	switch rate {
	case 2.5:
		return "Gen1"
	case 5:
		return "Gen2"
	case 8:
		return "Gen3"
	case 16:
		return "Gen4"
	case 32:
		return "Gen5"
	case 64:
		return "Gen6"
	}
	return ""
}
|
||||||
|
|||||||
@@ -35,7 +35,27 @@ func TestParseLspci_filtersExcludedClasses(t *testing.T) {
|
|||||||
if len(devs) != 1 {
|
if len(devs) != 1 {
|
||||||
t.Fatalf("expected 1 filtered device, got %d", len(devs))
|
t.Fatalf("expected 1 filtered device, got %d", len(devs))
|
||||||
}
|
}
|
||||||
if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VGA compatible controller" {
|
if devs[0].DeviceClass == nil || *devs[0].DeviceClass != "VideoController" {
|
||||||
t.Fatalf("unexpected remaining class: %v", devs[0].DeviceClass)
|
t.Fatalf("unexpected remaining class: %v", devs[0].DeviceClass)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
raw string
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{"2.5 GT/s PCIe", "Gen1"},
|
||||||
|
{"5.0 GT/s PCIe", "Gen2"},
|
||||||
|
{"8.0 GT/s PCIe", "Gen3"},
|
||||||
|
{"16.0 GT/s PCIe", "Gen4"},
|
||||||
|
{"32.0 GT/s PCIe", "Gen5"},
|
||||||
|
{"64.0 GT/s PCIe", "Gen6"},
|
||||||
|
{"unknown", ""},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
if got := normalizePCILinkSpeed(tt.raw); got != tt.want {
|
||||||
|
t.Fatalf("normalizePCILinkSpeed(%q)=%q want %q", tt.raw, got, tt.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -114,7 +114,7 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
status := "OK"
|
status := statusOK
|
||||||
psu.Status = &status
|
psu.Status = &status
|
||||||
|
|
||||||
return psu, true
|
return psu, true
|
||||||
@@ -123,9 +123,12 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
|
|||||||
type psuSDR struct {
|
type psuSDR struct {
|
||||||
slot int
|
slot int
|
||||||
status string
|
status string
|
||||||
|
reason string
|
||||||
inputPowerW *float64
|
inputPowerW *float64
|
||||||
outputPowerW *float64
|
outputPowerW *float64
|
||||||
inputVoltage *float64
|
inputVoltage *float64
|
||||||
|
temperatureC *float64
|
||||||
|
healthPct *float64
|
||||||
}
|
}
|
||||||
|
|
||||||
var psuSlotRe = regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b|\bps\s*([0-9]+)\b`)
|
var psuSlotRe = regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b|\bps\s*([0-9]+)\b`)
|
||||||
@@ -148,10 +151,11 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
|||||||
entry := out[slot]
|
entry := out[slot]
|
||||||
entry.slot = slot
|
entry.slot = slot
|
||||||
if entry.status == "" {
|
if entry.status == "" {
|
||||||
entry.status = "OK"
|
entry.status = statusOK
|
||||||
}
|
}
|
||||||
if state != "" && state != "ok" && state != "ns" {
|
if state != "" && state != "ok" && state != "ns" {
|
||||||
entry.status = "FAILED"
|
entry.status = statusCritical
|
||||||
|
entry.reason = "PSU sensor reported non-OK state: " + state
|
||||||
}
|
}
|
||||||
|
|
||||||
lowerName := strings.ToLower(name)
|
lowerName := strings.ToLower(name)
|
||||||
@@ -162,6 +166,10 @@ func parsePSUSDR(raw string) map[int]psuSDR {
|
|||||||
entry.outputPowerW = parseFloatPtr(value)
|
entry.outputPowerW = parseFloatPtr(value)
|
||||||
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
|
||||||
entry.inputVoltage = parseFloatPtr(value)
|
entry.inputVoltage = parseFloatPtr(value)
|
||||||
|
case strings.Contains(lowerName, "temp"):
|
||||||
|
entry.temperatureC = parseFloatPtr(value)
|
||||||
|
case strings.Contains(lowerName, "health"), strings.Contains(lowerName, "remaining life"), strings.Contains(lowerName, "life remaining"):
|
||||||
|
entry.healthPct = parsePercentPtr(value)
|
||||||
}
|
}
|
||||||
out[slot] = entry
|
out[slot] = entry
|
||||||
}
|
}
|
||||||
@@ -187,12 +195,23 @@ func mergePSUSDR(psus []schema.HardwarePowerSupply, sdr map[int]psuSDR) {
|
|||||||
if entry.inputVoltage != nil {
|
if entry.inputVoltage != nil {
|
||||||
psus[i].InputVoltage = entry.inputVoltage
|
psus[i].InputVoltage = entry.inputVoltage
|
||||||
}
|
}
|
||||||
|
if entry.temperatureC != nil {
|
||||||
|
psus[i].TemperatureC = entry.temperatureC
|
||||||
|
}
|
||||||
|
if entry.healthPct != nil {
|
||||||
|
psus[i].LifeRemainingPct = entry.healthPct
|
||||||
|
lifeUsed := 100 - *entry.healthPct
|
||||||
|
psus[i].LifeUsedPct = &lifeUsed
|
||||||
|
}
|
||||||
if entry.status != "" {
|
if entry.status != "" {
|
||||||
psus[i].Status = &entry.status
|
psus[i].Status = &entry.status
|
||||||
}
|
}
|
||||||
if psus[i].Status != nil && *psus[i].Status == "OK" {
|
if entry.reason != "" {
|
||||||
|
psus[i].ErrorDescription = &entry.reason
|
||||||
|
}
|
||||||
|
if psus[i].Status != nil && *psus[i].Status == statusOK {
|
||||||
if (entry.inputPowerW == nil && entry.outputPowerW == nil && entry.inputVoltage == nil) && entry.status == "" {
|
if (entry.inputPowerW == nil && entry.outputPowerW == nil && entry.inputVoltage == nil) && entry.status == "" {
|
||||||
unknown := "UNKNOWN"
|
unknown := statusUnknown
|
||||||
psus[i].Status = &unknown
|
psus[i].Status = &unknown
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ func TestParsePSUSDR(t *testing.T) {
|
|||||||
PS1 Input Power | 215 Watts | ok
|
PS1 Input Power | 215 Watts | ok
|
||||||
PS1 Output Power | 198 Watts | ok
|
PS1 Output Power | 198 Watts | ok
|
||||||
PS1 Input Voltage | 229 Volts | ok
|
PS1 Input Voltage | 229 Volts | ok
|
||||||
|
PS1 Temp | 39 C | ok
|
||||||
|
PS1 Health | 97 % | ok
|
||||||
PS2 Input Power | 0 Watts | cr
|
PS2 Input Power | 0 Watts | cr
|
||||||
`
|
`
|
||||||
|
|
||||||
@@ -14,7 +16,7 @@ PS2 Input Power | 0 Watts | cr
|
|||||||
if len(got) != 2 {
|
if len(got) != 2 {
|
||||||
t.Fatalf("len(got)=%d want 2", len(got))
|
t.Fatalf("len(got)=%d want 2", len(got))
|
||||||
}
|
}
|
||||||
if got[1].status != "OK" {
|
if got[1].status != statusOK {
|
||||||
t.Fatalf("ps1 status=%q", got[1].status)
|
t.Fatalf("ps1 status=%q", got[1].status)
|
||||||
}
|
}
|
||||||
if got[1].inputPowerW == nil || *got[1].inputPowerW != 215 {
|
if got[1].inputPowerW == nil || *got[1].inputPowerW != 215 {
|
||||||
@@ -26,7 +28,13 @@ PS2 Input Power | 0 Watts | cr
|
|||||||
if got[1].inputVoltage == nil || *got[1].inputVoltage != 229 {
|
if got[1].inputVoltage == nil || *got[1].inputVoltage != 229 {
|
||||||
t.Fatalf("ps1 input voltage=%v", got[1].inputVoltage)
|
t.Fatalf("ps1 input voltage=%v", got[1].inputVoltage)
|
||||||
}
|
}
|
||||||
if got[2].status != "FAILED" {
|
if got[1].temperatureC == nil || *got[1].temperatureC != 39 {
|
||||||
|
t.Fatalf("ps1 temperature=%v", got[1].temperatureC)
|
||||||
|
}
|
||||||
|
if got[1].healthPct == nil || *got[1].healthPct != 97 {
|
||||||
|
t.Fatalf("ps1 health=%v", got[1].healthPct)
|
||||||
|
}
|
||||||
|
if got[2].status != statusCritical {
|
||||||
t.Fatalf("ps2 status=%q", got[2].status)
|
t.Fatalf("ps2 status=%q", got[2].status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
132
audit/internal/collector/psu_telemetry.go
Normal file
132
audit/internal/collector/psu_telemetry.go
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func enrichPSUsWithTelemetry(psus []schema.HardwarePowerSupply, doc sensorsDoc) []schema.HardwarePowerSupply {
|
||||||
|
if len(psus) == 0 || len(doc) == 0 {
|
||||||
|
return psus
|
||||||
|
}
|
||||||
|
|
||||||
|
tempBySlot := psuTempsFromSensors(doc)
|
||||||
|
healthBySlot := psuHealthFromSensors(doc)
|
||||||
|
for i := range psus {
|
||||||
|
slot := derefPSUSlot(psus[i].Slot)
|
||||||
|
if slot == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if psus[i].TemperatureC == nil {
|
||||||
|
if value, ok := tempBySlot[slot]; ok {
|
||||||
|
psus[i].TemperatureC = &value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if psus[i].LifeRemainingPct == nil {
|
||||||
|
if value, ok := healthBySlot[slot]; ok {
|
||||||
|
psus[i].LifeRemainingPct = &value
|
||||||
|
used := 100 - value
|
||||||
|
psus[i].LifeUsedPct = &used
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return psus
|
||||||
|
}
|
||||||
|
|
||||||
|
func psuHealthFromSensors(doc sensorsDoc) map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
for chip, features := range doc {
|
||||||
|
for featureName, raw := range features {
|
||||||
|
feature, ok := raw.(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !isLikelyPSUHealth(chip, featureName) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, ok := firstFeaturePercent(feature)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if slot, ok := detectPSUSlot(chip, featureName); ok {
|
||||||
|
if _, exists := out[slot]; !exists {
|
||||||
|
out[slot] = value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeaturePercent(feature map[string]any) (float64, bool) {
|
||||||
|
keys := sortedFeatureKeys(feature)
|
||||||
|
for _, key := range keys {
|
||||||
|
lower := strings.ToLower(key)
|
||||||
|
if strings.HasSuffix(lower, "_alarm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.Contains(lower, "health") || strings.Contains(lower, "life") || strings.Contains(lower, "remain") {
|
||||||
|
if value, ok := floatFromAny(feature[key]); ok {
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// isLikelyPSUHealth reports whether the combined chip and feature names refer
// to a power supply ("psu" or "power supply") and also mention a health-style
// reading ("health", "life" or "remain"). Matching is case-insensitive.
func isLikelyPSUHealth(chip, feature string) bool {
	label := strings.ToLower(chip + " " + feature)
	if !strings.Contains(label, "psu") && !strings.Contains(label, "power supply") {
		return false
	}
	for _, hint := range []string{"health", "life", "remain"} {
		if strings.Contains(label, hint) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
func psuTempsFromSensors(doc sensorsDoc) map[string]float64 {
|
||||||
|
out := map[string]float64{}
|
||||||
|
for chip, features := range doc {
|
||||||
|
for featureName, raw := range features {
|
||||||
|
feature, ok := raw.(map[string]any)
|
||||||
|
if !ok || classifySensorFeature(feature) != "temp" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if !isLikelyPSUTemp(chip, featureName) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
temp, ok := firstFeatureFloat(feature, "_input")
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if slot, ok := detectPSUSlot(chip, featureName); ok {
|
||||||
|
if _, exists := out[slot]; !exists {
|
||||||
|
out[slot] = temp
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// isLikelyPSUTemp reports whether the combined chip and feature names mention
// a power supply ("psu" or "power supply"), ignoring case.
func isLikelyPSUTemp(chip, feature string) bool {
	label := strings.ToLower(chip + " " + feature)
	for _, marker := range [...]string{"psu", "power supply"} {
		if strings.Contains(label, marker) {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
func detectPSUSlot(parts ...string) (string, bool) {
|
||||||
|
for _, part := range parts {
|
||||||
|
lower := strings.ToLower(part)
|
||||||
|
matches := psuSlotRe.FindStringSubmatch(lower)
|
||||||
|
if len(matches) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, group := range matches[1:] {
|
||||||
|
if group == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, err := strconv.Atoi(group)
|
||||||
|
if err == nil && value > 0 {
|
||||||
|
return strconv.Itoa(value - 1), true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return "", false
|
||||||
|
}
|
||||||
42
audit/internal/collector/psu_telemetry_test.go
Normal file
42
audit/internal/collector/psu_telemetry_test.go
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestEnrichPSUsWithTelemetry(t *testing.T) {
|
||||||
|
slot0 := "0"
|
||||||
|
slot1 := "1"
|
||||||
|
psus := []schema.HardwarePowerSupply{
|
||||||
|
{Slot: &slot0},
|
||||||
|
{Slot: &slot1},
|
||||||
|
}
|
||||||
|
|
||||||
|
doc := sensorsDoc{
|
||||||
|
"psu-hwmon-0": {
|
||||||
|
"PSU1 Temp": map[string]any{"temp1_input": 39.5},
|
||||||
|
"PSU2 Temp": map[string]any{"temp2_input": 41.0},
|
||||||
|
"PSU1 Health": map[string]any{"health1_input": 98.0},
|
||||||
|
"PSU2 Remaining Life": map[string]any{"life2_input": 95.0},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
got := enrichPSUsWithTelemetry(psus, doc)
|
||||||
|
if got[0].TemperatureC == nil || *got[0].TemperatureC != 39.5 {
|
||||||
|
t.Fatalf("psu0 temperature mismatch: %#v", got[0].TemperatureC)
|
||||||
|
}
|
||||||
|
if got[1].TemperatureC == nil || *got[1].TemperatureC != 41.0 {
|
||||||
|
t.Fatalf("psu1 temperature mismatch: %#v", got[1].TemperatureC)
|
||||||
|
}
|
||||||
|
if got[0].LifeRemainingPct == nil || *got[0].LifeRemainingPct != 98.0 {
|
||||||
|
t.Fatalf("psu0 life remaining mismatch: %#v", got[0].LifeRemainingPct)
|
||||||
|
}
|
||||||
|
if got[0].LifeUsedPct == nil || *got[0].LifeUsedPct != 2.0 {
|
||||||
|
t.Fatalf("psu0 life used mismatch: %#v", got[0].LifeUsedPct)
|
||||||
|
}
|
||||||
|
if got[1].LifeRemainingPct == nil || *got[1].LifeRemainingPct != 95.0 {
|
||||||
|
t.Fatalf("psu1 life remaining mismatch: %#v", got[1].LifeRemainingPct)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -83,11 +83,7 @@ func isLikelyRAIDController(dev schema.HardwarePCIeDevice) bool {
|
|||||||
if dev.DeviceClass == nil {
|
if dev.DeviceClass == nil {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
c := strings.ToLower(*dev.DeviceClass)
|
return isRAIDClass(*dev.DeviceClass)
|
||||||
return strings.Contains(c, "raid") ||
|
|
||||||
strings.Contains(c, "sas") ||
|
|
||||||
strings.Contains(c, "mass storage") ||
|
|
||||||
strings.Contains(c, "serial attached scsi")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func collectStorcliDrives() []schema.HardwareStorage {
|
func collectStorcliDrives() []schema.HardwareStorage {
|
||||||
@@ -182,7 +178,10 @@ func parseSASIrcuDisplay(raw string) []schema.HardwareStorage {
|
|||||||
|
|
||||||
present := true
|
present := true
|
||||||
status := mapRAIDDriveStatus(b["State"])
|
status := mapRAIDDriveStatus(b["State"])
|
||||||
s := schema.HardwareStorage{Present: &present, Status: &status}
|
s := schema.HardwareStorage{
|
||||||
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
|
Present: &present,
|
||||||
|
}
|
||||||
|
|
||||||
enclosure := strings.TrimSpace(b["Enclosure #"])
|
enclosure := strings.TrimSpace(b["Enclosure #"])
|
||||||
slot := strings.TrimSpace(b["Slot #"])
|
slot := strings.TrimSpace(b["Slot #"])
|
||||||
@@ -281,7 +280,10 @@ func parseArcconfPhysicalDrives(raw string) []schema.HardwareStorage {
|
|||||||
for _, b := range blocks {
|
for _, b := range blocks {
|
||||||
present := true
|
present := true
|
||||||
status := mapRAIDDriveStatus(b["State"])
|
status := mapRAIDDriveStatus(b["State"])
|
||||||
s := schema.HardwareStorage{Present: &present, Status: &status}
|
s := schema.HardwareStorage{
|
||||||
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
|
Present: &present,
|
||||||
|
}
|
||||||
|
|
||||||
if v := strings.TrimSpace(b["Reported Location"]); v != "" {
|
if v := strings.TrimSpace(b["Reported Location"]); v != "" {
|
||||||
s.Slot = &v
|
s.Slot = &v
|
||||||
@@ -362,8 +364,11 @@ func parseSSACLIPhysicalDrives(raw string) []schema.HardwareStorage {
|
|||||||
if m := ssacliPhysicalDriveLine.FindStringSubmatch(trimmed); len(m) == 3 {
|
if m := ssacliPhysicalDriveLine.FindStringSubmatch(trimmed); len(m) == 3 {
|
||||||
flush()
|
flush()
|
||||||
present := true
|
present := true
|
||||||
status := "UNKNOWN"
|
status := statusUnknown
|
||||||
s := schema.HardwareStorage{Present: &present, Status: &status}
|
s := schema.HardwareStorage{
|
||||||
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
|
Present: &present,
|
||||||
|
}
|
||||||
slot := m[1]
|
slot := m[1]
|
||||||
s.Slot = &slot
|
s.Slot = &slot
|
||||||
|
|
||||||
@@ -475,8 +480,8 @@ func storcliDriveToStorage(d struct {
|
|||||||
present := true
|
present := true
|
||||||
status := mapRAIDDriveStatus(d.State)
|
status := mapRAIDDriveStatus(d.State)
|
||||||
s := schema.HardwareStorage{
|
s := schema.HardwareStorage{
|
||||||
Present: &present,
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
Status: &status,
|
Present: &present,
|
||||||
}
|
}
|
||||||
|
|
||||||
if v := strings.TrimSpace(d.EIDSlt); v != "" {
|
if v := strings.TrimSpace(d.EIDSlt); v != "" {
|
||||||
@@ -527,15 +532,15 @@ func mapRAIDDriveStatus(raw string) string {
|
|||||||
u := strings.ToUpper(strings.TrimSpace(raw))
|
u := strings.ToUpper(strings.TrimSpace(raw))
|
||||||
switch {
|
switch {
|
||||||
case strings.Contains(u, "OK"), strings.Contains(u, "OPTIMAL"), strings.Contains(u, "READY"):
|
case strings.Contains(u, "OK"), strings.Contains(u, "OPTIMAL"), strings.Contains(u, "READY"):
|
||||||
return "OK"
|
return statusOK
|
||||||
case strings.Contains(u, "ONLN"), strings.Contains(u, "ONLINE"):
|
case strings.Contains(u, "ONLN"), strings.Contains(u, "ONLINE"):
|
||||||
return "OK"
|
return statusOK
|
||||||
case strings.Contains(u, "RBLD"), strings.Contains(u, "REBUILD"):
|
case strings.Contains(u, "RBLD"), strings.Contains(u, "REBUILD"):
|
||||||
return "WARNING"
|
return statusWarning
|
||||||
case strings.Contains(u, "FAIL"), strings.Contains(u, "OFFLINE"):
|
case strings.Contains(u, "FAIL"), strings.Contains(u, "OFFLINE"):
|
||||||
return "CRITICAL"
|
return statusCritical
|
||||||
default:
|
default:
|
||||||
return "UNKNOWN"
|
return statusUnknown
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -641,8 +646,9 @@ func enrichStorageWithVROC(storage []schema.HardwareStorage, pcie []schema.Hardw
|
|||||||
storage[i].Telemetry["vroc_array"] = arr.Name
|
storage[i].Telemetry["vroc_array"] = arr.Name
|
||||||
storage[i].Telemetry["vroc_degraded"] = arr.Degraded
|
storage[i].Telemetry["vroc_degraded"] = arr.Degraded
|
||||||
if arr.Degraded {
|
if arr.Degraded {
|
||||||
status := "WARNING"
|
status := statusWarning
|
||||||
storage[i].Status = &status
|
storage[i].Status = &status
|
||||||
|
storage[i].ErrorDescription = stringPtr("VROC array is degraded")
|
||||||
}
|
}
|
||||||
updated++
|
updated++
|
||||||
}
|
}
|
||||||
@@ -659,14 +665,14 @@ func hasVROCController(pcie []schema.HardwarePCIeDevice) bool {
|
|||||||
|
|
||||||
class := ""
|
class := ""
|
||||||
if dev.DeviceClass != nil {
|
if dev.DeviceClass != nil {
|
||||||
class = strings.ToLower(*dev.DeviceClass)
|
class = strings.TrimSpace(*dev.DeviceClass)
|
||||||
}
|
}
|
||||||
model := ""
|
model := ""
|
||||||
if dev.Model != nil {
|
if dev.Model != nil {
|
||||||
model = strings.ToLower(*dev.Model)
|
model = strings.ToLower(*dev.Model)
|
||||||
}
|
}
|
||||||
|
|
||||||
if strings.Contains(class, "raid") ||
|
if isRAIDClass(class) ||
|
||||||
strings.Contains(model, "vroc") ||
|
strings.Contains(model, "vroc") ||
|
||||||
strings.Contains(model, "volume management device") ||
|
strings.Contains(model, "volume management device") ||
|
||||||
strings.Contains(model, "vmd") {
|
strings.Contains(model, "vmd") {
|
||||||
|
|||||||
334
audit/internal/collector/raid_controller_telemetry.go
Normal file
334
audit/internal/collector/raid_controller_telemetry.go
Normal file
@@ -0,0 +1,334 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"encoding/json"
|
||||||
|
"log/slog"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// raidControllerTelemetry holds the battery / cache-backup readings gathered
// for a single RAID controller from a vendor CLI. Every field is optional; nil
// means the tool did not report that value.
type raidControllerTelemetry struct {
	// Numeric readings; units follow the field name suffix (percent, °C, volts).
	BatteryChargePct    *float64
	BatteryHealthPct    *float64
	BatteryTemperatureC *float64
	BatteryVoltageV     *float64

	// BatteryReplaceRequired is true when the tool indicates the backup unit
	// must be replaced, false when it explicitly says it does not.
	BatteryReplaceRequired *bool

	// ErrorDescription carries the raw state text when it signals a problem.
	ErrorDescription *string
}
|
||||||
|
|
||||||
|
func enrichPCIeWithRAIDTelemetry(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||||
|
byVendor := collectRAIDControllerTelemetry()
|
||||||
|
if len(byVendor) == 0 {
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
positions := map[int]int{}
|
||||||
|
for i := range devs {
|
||||||
|
if devs[i].VendorID == nil || !isLikelyRAIDController(devs[i]) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
vendor := *devs[i].VendorID
|
||||||
|
list := byVendor[vendor]
|
||||||
|
if len(list) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
index := positions[vendor]
|
||||||
|
if index >= len(list) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
positions[vendor] = index + 1
|
||||||
|
applyRAIDControllerTelemetry(&devs[i], list[index])
|
||||||
|
}
|
||||||
|
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
func applyRAIDControllerTelemetry(dev *schema.HardwarePCIeDevice, tel raidControllerTelemetry) {
|
||||||
|
if tel.BatteryChargePct != nil {
|
||||||
|
dev.BatteryChargePct = tel.BatteryChargePct
|
||||||
|
}
|
||||||
|
if tel.BatteryHealthPct != nil {
|
||||||
|
dev.BatteryHealthPct = tel.BatteryHealthPct
|
||||||
|
}
|
||||||
|
if tel.BatteryTemperatureC != nil {
|
||||||
|
dev.BatteryTemperatureC = tel.BatteryTemperatureC
|
||||||
|
}
|
||||||
|
if tel.BatteryVoltageV != nil {
|
||||||
|
dev.BatteryVoltageV = tel.BatteryVoltageV
|
||||||
|
}
|
||||||
|
if tel.BatteryReplaceRequired != nil {
|
||||||
|
dev.BatteryReplaceRequired = tel.BatteryReplaceRequired
|
||||||
|
}
|
||||||
|
if tel.ErrorDescription != nil {
|
||||||
|
dev.ErrorDescription = tel.ErrorDescription
|
||||||
|
if dev.Status == nil || *dev.Status == statusOK {
|
||||||
|
status := statusWarning
|
||||||
|
dev.Status = &status
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectRAIDControllerTelemetry() map[int][]raidControllerTelemetry {
|
||||||
|
out := map[int][]raidControllerTelemetry{}
|
||||||
|
|
||||||
|
if raw, err := raidToolQuery("storcli64", "/call", "show", "all", "J"); err == nil {
|
||||||
|
list := parseStorcliControllerTelemetry(raw)
|
||||||
|
if len(list) > 0 {
|
||||||
|
out[vendorBroadcomLSI] = append(out[vendorBroadcomLSI], list...)
|
||||||
|
slog.Info("raid: storcli controller telemetry", "count", len(list))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, err := raidToolQuery("ssacli", "ctrl", "all", "show", "config", "detail"); err == nil {
|
||||||
|
list := parseSSACLIControllerTelemetry(string(raw))
|
||||||
|
if len(list) > 0 {
|
||||||
|
out[vendorHPE] = append(out[vendorHPE], list...)
|
||||||
|
slog.Info("raid: ssacli controller telemetry", "count", len(list))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if raw, err := raidToolQuery("arcconf", "getconfig", "1", "ad"); err == nil {
|
||||||
|
list := parseArcconfControllerTelemetry(string(raw))
|
||||||
|
if len(list) > 0 {
|
||||||
|
out[vendorAdaptec] = append(out[vendorAdaptec], list...)
|
||||||
|
slog.Info("raid: arcconf controller telemetry", "count", len(list))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseStorcliControllerTelemetry(raw []byte) []raidControllerTelemetry {
|
||||||
|
var doc struct {
|
||||||
|
Controllers []struct {
|
||||||
|
ResponseData map[string]any `json:"Response Data"`
|
||||||
|
} `json:"Controllers"`
|
||||||
|
}
|
||||||
|
if err := json.Unmarshal(raw, &doc); err != nil {
|
||||||
|
slog.Warn("raid: parse storcli controller telemetry failed", "err", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var out []raidControllerTelemetry
|
||||||
|
for _, ctl := range doc.Controllers {
|
||||||
|
tel := raidControllerTelemetry{}
|
||||||
|
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["BBU_Info"]))
|
||||||
|
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["BBU_Info_Details"]))
|
||||||
|
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["CV_Info"]))
|
||||||
|
mergeStorcliBatteryMap(&tel, nestedStringMap(ctl.ResponseData["CV_Info_Details"]))
|
||||||
|
if hasRAIDControllerTelemetry(tel) {
|
||||||
|
out = append(out, tel)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func nestedStringMap(raw any) map[string]string {
|
||||||
|
switch value := raw.(type) {
|
||||||
|
case map[string]any:
|
||||||
|
out := map[string]string{}
|
||||||
|
flattenStringMap("", value, out)
|
||||||
|
return out
|
||||||
|
case []any:
|
||||||
|
out := map[string]string{}
|
||||||
|
for _, item := range value {
|
||||||
|
if m, ok := item.(map[string]any); ok {
|
||||||
|
flattenStringMap("", m, out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
default:
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// flattenStringMap walks a decoded JSON object and writes every scalar leaf
// into out as a string.
//
// Keys are lower-cased and trimmed; nested keys are joined to their parent
// path with a single space ("group" + "Ratio" -> "group ratio"). Strings,
// numbers and booleans are stored; nested objects and objects inside arrays
// are flattened recursively under the current path. Everything else (nil,
// scalars inside arrays) is ignored.
func flattenStringMap(prefix string, in map[string]any, out map[string]string) {
	for name, item := range in {
		path := strings.ToLower(strings.TrimSpace(prefix + " " + name))
		switch v := item.(type) {
		case string:
			out[path] = v
		case bool:
			out[path] = strconv.FormatBool(v)
		case float64:
			out[path] = strconv.FormatFloat(v, 'f', -1, 64)
		case json.Number:
			out[path] = v.String()
		case map[string]any:
			flattenStringMap(path, v, out)
		case []any:
			for _, element := range v {
				if nested, ok := element.(map[string]any); ok {
					flattenStringMap(path, nested, out)
				}
			}
		}
	}
}
|
||||||
|
|
||||||
|
func mergeStorcliBatteryMap(tel *raidControllerTelemetry, fields map[string]string) {
|
||||||
|
if len(fields) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for key, raw := range fields {
|
||||||
|
lower := strings.ToLower(strings.TrimSpace(key))
|
||||||
|
switch {
|
||||||
|
case strings.Contains(lower, "relative state of charge"), strings.Contains(lower, "remaining capacity"), strings.Contains(lower, "charge"):
|
||||||
|
if tel.BatteryChargePct == nil {
|
||||||
|
tel.BatteryChargePct = parsePercentPtr(raw)
|
||||||
|
}
|
||||||
|
case strings.Contains(lower, "state of health"), strings.Contains(lower, "health"):
|
||||||
|
if tel.BatteryHealthPct == nil {
|
||||||
|
tel.BatteryHealthPct = parsePercentPtr(raw)
|
||||||
|
}
|
||||||
|
case strings.Contains(lower, "temperature"):
|
||||||
|
if tel.BatteryTemperatureC == nil {
|
||||||
|
tel.BatteryTemperatureC = parseFloatPtr(raw)
|
||||||
|
}
|
||||||
|
case strings.Contains(lower, "voltage"):
|
||||||
|
if tel.BatteryVoltageV == nil {
|
||||||
|
tel.BatteryVoltageV = parseFloatPtr(raw)
|
||||||
|
}
|
||||||
|
case strings.Contains(lower, "replace"), strings.Contains(lower, "replacement required"):
|
||||||
|
if tel.BatteryReplaceRequired == nil {
|
||||||
|
tel.BatteryReplaceRequired = parseReplaceRequired(raw)
|
||||||
|
}
|
||||||
|
case strings.Contains(lower, "learn cycle requested"), strings.Contains(lower, "battery state"), strings.Contains(lower, "capacitance state"):
|
||||||
|
if desc := batteryStateDescription(raw); desc != nil && tel.ErrorDescription == nil {
|
||||||
|
tel.ErrorDescription = desc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseSSACLIControllerTelemetry(raw string) []raidControllerTelemetry {
|
||||||
|
lines := strings.Split(raw, "\n")
|
||||||
|
var out []raidControllerTelemetry
|
||||||
|
var current *raidControllerTelemetry
|
||||||
|
|
||||||
|
flush := func() {
|
||||||
|
if current != nil && hasRAIDControllerTelemetry(*current) {
|
||||||
|
out = append(out, *current)
|
||||||
|
}
|
||||||
|
current = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, line := range lines {
|
||||||
|
trimmed := strings.TrimSpace(line)
|
||||||
|
if trimmed == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.HasPrefix(strings.ToLower(trimmed), "smart array") || strings.HasPrefix(strings.ToLower(trimmed), "controller ") {
|
||||||
|
flush()
|
||||||
|
current = &raidControllerTelemetry{}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if current == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if idx := strings.Index(trimmed, ":"); idx > 0 {
|
||||||
|
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||||
|
val := strings.TrimSpace(trimmed[idx+1:])
|
||||||
|
switch {
|
||||||
|
case strings.Contains(key, "capacitor temperature"), strings.Contains(key, "battery temperature"):
|
||||||
|
current.BatteryTemperatureC = parseFloatPtr(val)
|
||||||
|
case strings.Contains(key, "capacitor voltage"), strings.Contains(key, "battery voltage"):
|
||||||
|
current.BatteryVoltageV = parseFloatPtr(val)
|
||||||
|
case strings.Contains(key, "capacitor charge"), strings.Contains(key, "battery charge"):
|
||||||
|
current.BatteryChargePct = parsePercentPtr(val)
|
||||||
|
case strings.Contains(key, "capacitor health"), strings.Contains(key, "battery health"):
|
||||||
|
current.BatteryHealthPct = parsePercentPtr(val)
|
||||||
|
case strings.Contains(key, "replace") || strings.Contains(key, "failed"):
|
||||||
|
if current.BatteryReplaceRequired == nil {
|
||||||
|
current.BatteryReplaceRequired = parseReplaceRequired(val)
|
||||||
|
}
|
||||||
|
if desc := batteryStateDescription(val); desc != nil && current.ErrorDescription == nil {
|
||||||
|
current.ErrorDescription = desc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
flush()
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseArcconfControllerTelemetry(raw string) []raidControllerTelemetry {
|
||||||
|
lines := strings.Split(raw, "\n")
|
||||||
|
tel := raidControllerTelemetry{}
|
||||||
|
for _, line := range lines {
|
||||||
|
trimmed := strings.TrimSpace(line)
|
||||||
|
if idx := strings.Index(trimmed, ":"); idx > 0 {
|
||||||
|
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||||
|
val := strings.TrimSpace(trimmed[idx+1:])
|
||||||
|
switch {
|
||||||
|
case strings.Contains(key, "battery temperature"), strings.Contains(key, "capacitor temperature"):
|
||||||
|
tel.BatteryTemperatureC = parseFloatPtr(val)
|
||||||
|
case strings.Contains(key, "battery voltage"), strings.Contains(key, "capacitor voltage"):
|
||||||
|
tel.BatteryVoltageV = parseFloatPtr(val)
|
||||||
|
case strings.Contains(key, "battery charge"), strings.Contains(key, "capacitor charge"):
|
||||||
|
tel.BatteryChargePct = parsePercentPtr(val)
|
||||||
|
case strings.Contains(key, "battery health"), strings.Contains(key, "capacitor health"):
|
||||||
|
tel.BatteryHealthPct = parsePercentPtr(val)
|
||||||
|
case strings.Contains(key, "replace"), strings.Contains(key, "failed"):
|
||||||
|
if tel.BatteryReplaceRequired == nil {
|
||||||
|
tel.BatteryReplaceRequired = parseReplaceRequired(val)
|
||||||
|
}
|
||||||
|
if desc := batteryStateDescription(val); desc != nil && tel.ErrorDescription == nil {
|
||||||
|
tel.ErrorDescription = desc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if hasRAIDControllerTelemetry(tel) {
|
||||||
|
return []raidControllerTelemetry{tel}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func hasRAIDControllerTelemetry(tel raidControllerTelemetry) bool {
|
||||||
|
return tel.BatteryChargePct != nil ||
|
||||||
|
tel.BatteryHealthPct != nil ||
|
||||||
|
tel.BatteryTemperatureC != nil ||
|
||||||
|
tel.BatteryVoltageV != nil ||
|
||||||
|
tel.BatteryReplaceRequired != nil ||
|
||||||
|
tel.ErrorDescription != nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parsePercentPtr(raw string) *float64 {
|
||||||
|
raw = strings.ReplaceAll(strings.TrimSpace(raw), "%", "")
|
||||||
|
return parseFloatPtr(raw)
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseReplaceRequired interprets a vendor tool's answer to "does the
// battery/capacitor need replacing?".
//
// It returns a pointer to true when replacement is indicated, a pointer to
// false when the value says it is not, and nil when the value is empty or
// cannot be interpreted (for example "Unknown").
//
// Short answers ("no", "not", "yes", "ok", ...) are matched as whole words
// rather than substrings, and negations are checked first. Plain substring
// matching would read "Not required" or "Not failed" as true and would treat
// "unknown" or "broken" as healthy because they contain "no" and "ok".
// "true" and "false" are accepted as well, since JSON booleans are flattened
// to those strings.
func parseReplaceRequired(raw string) *bool {
	lower := strings.ToLower(strings.TrimSpace(raw))
	if lower == "" {
		return nil
	}

	// Split on anything that is not an ASCII letter or digit.
	words := strings.FieldsFunc(lower, func(r rune) bool {
		return (r < 'a' || r > 'z') && (r < '0' || r > '9')
	})
	hasWord := func(candidates ...string) bool {
		for _, word := range words {
			for _, candidate := range candidates {
				if word == candidate {
					return true
				}
			}
		}
		return false
	}
	hasAny := func(fragments ...string) bool {
		for _, fragment := range fragments {
			if strings.Contains(lower, fragment) {
				return true
			}
		}
		return false
	}
	answer := func(value bool) *bool { return &value }

	switch {
	case hasWord("no", "not", "none", "false"):
		// A negation overrides any keyword that follows it.
		return answer(false)
	case hasWord("yes", "true"), hasAny("replace", "failed", "required"):
		return answer(true)
	case hasWord("ok"), hasAny("good", "optimal", "normal"):
		return answer(false)
	default:
		return nil
	}
}
|
||||||
|
|
||||||
|
// batteryStateDescription returns the raw state text when it signals a
// problem, and nil otherwise.
//
// A state counts as a problem when, ignoring case, it contains "failed",
// "fault", "replace", "warning" or "degraded". The returned string is the
// input exactly as given, without trimming.
func batteryStateDescription(raw string) *string {
	normalized := strings.ToLower(strings.TrimSpace(raw))
	if normalized == "" {
		return nil
	}
	for _, marker := range []string{"failed", "fault", "replace", "warning", "degraded"} {
		if strings.Contains(normalized, marker) {
			return &raw
		}
	}
	return nil
}
|
||||||
@@ -1,6 +1,10 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"errors"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
func TestParseSASIrcuControllerIDs(t *testing.T) {
|
func TestParseSASIrcuControllerIDs(t *testing.T) {
|
||||||
raw := `LSI Corporation SAS2 IR Configuration Utility.
|
raw := `LSI Corporation SAS2 IR Configuration Utility.
|
||||||
@@ -90,7 +94,111 @@ physicaldrive 1I:1:2 (894 GB, SAS HDD, Failed)
|
|||||||
if drives[0].Status == nil || *drives[0].Status != "OK" {
|
if drives[0].Status == nil || *drives[0].Status != "OK" {
|
||||||
t.Fatalf("drive0 status: %v", drives[0].Status)
|
t.Fatalf("drive0 status: %v", drives[0].Status)
|
||||||
}
|
}
|
||||||
if drives[1].Status == nil || *drives[1].Status != "CRITICAL" {
|
if drives[1].Status == nil || *drives[1].Status != statusCritical {
|
||||||
t.Fatalf("drive1 status: %v", drives[1].Status)
|
t.Fatalf("drive1 status: %v", drives[1].Status)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseStorcliControllerTelemetry(t *testing.T) {
|
||||||
|
raw := []byte(`{
|
||||||
|
"Controllers": [
|
||||||
|
{
|
||||||
|
"Response Data": {
|
||||||
|
"BBU_Info": {
|
||||||
|
"State of Health": "98 %",
|
||||||
|
"Relative State of Charge": "76 %",
|
||||||
|
"Temperature": "41 C",
|
||||||
|
"Voltage": "12.3 V",
|
||||||
|
"Replacement required": "No"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
got := parseStorcliControllerTelemetry(raw)
|
||||||
|
if len(got) != 1 {
|
||||||
|
t.Fatalf("len(got)=%d want 1", len(got))
|
||||||
|
}
|
||||||
|
if got[0].BatteryHealthPct == nil || *got[0].BatteryHealthPct != 98 {
|
||||||
|
t.Fatalf("battery health=%v", got[0].BatteryHealthPct)
|
||||||
|
}
|
||||||
|
if got[0].BatteryChargePct == nil || *got[0].BatteryChargePct != 76 {
|
||||||
|
t.Fatalf("battery charge=%v", got[0].BatteryChargePct)
|
||||||
|
}
|
||||||
|
if got[0].BatteryTemperatureC == nil || *got[0].BatteryTemperatureC != 41 {
|
||||||
|
t.Fatalf("battery temperature=%v", got[0].BatteryTemperatureC)
|
||||||
|
}
|
||||||
|
if got[0].BatteryVoltageV == nil || *got[0].BatteryVoltageV != 12.3 {
|
||||||
|
t.Fatalf("battery voltage=%v", got[0].BatteryVoltageV)
|
||||||
|
}
|
||||||
|
if got[0].BatteryReplaceRequired == nil || *got[0].BatteryReplaceRequired {
|
||||||
|
t.Fatalf("battery replace=%v", got[0].BatteryReplaceRequired)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseSSACLIControllerTelemetry(t *testing.T) {
|
||||||
|
raw := `Smart Array P440ar in Slot 0
|
||||||
|
Battery/Capacitor Count: 1
|
||||||
|
Capacitor Temperature (C): 37
|
||||||
|
Capacitor Charge (%): 94
|
||||||
|
Capacitor Health (%): 96
|
||||||
|
Capacitor Voltage (V): 9.8
|
||||||
|
Capacitor Failed: No
|
||||||
|
`
|
||||||
|
got := parseSSACLIControllerTelemetry(raw)
|
||||||
|
if len(got) != 1 {
|
||||||
|
t.Fatalf("len(got)=%d want 1", len(got))
|
||||||
|
}
|
||||||
|
if got[0].BatteryTemperatureC == nil || *got[0].BatteryTemperatureC != 37 {
|
||||||
|
t.Fatalf("battery temperature=%v", got[0].BatteryTemperatureC)
|
||||||
|
}
|
||||||
|
if got[0].BatteryChargePct == nil || *got[0].BatteryChargePct != 94 {
|
||||||
|
t.Fatalf("battery charge=%v", got[0].BatteryChargePct)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnrichPCIeWithRAIDTelemetry(t *testing.T) {
|
||||||
|
orig := raidToolQuery
|
||||||
|
t.Cleanup(func() { raidToolQuery = orig })
|
||||||
|
raidToolQuery = func(name string, args ...string) ([]byte, error) {
|
||||||
|
switch name {
|
||||||
|
case "storcli64":
|
||||||
|
return []byte(`{
|
||||||
|
"Controllers": [
|
||||||
|
{
|
||||||
|
"Response Data": {
|
||||||
|
"CV_Info": {
|
||||||
|
"State of Health": "99 %",
|
||||||
|
"Relative State of Charge": "81 %",
|
||||||
|
"Temperature": "38 C",
|
||||||
|
"Voltage": "12.1 V",
|
||||||
|
"Replacement required": "No"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}`), nil
|
||||||
|
default:
|
||||||
|
return nil, errors.New("skip")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
vendor := vendorBroadcomLSI
|
||||||
|
class := "MassStorageController"
|
||||||
|
status := statusOK
|
||||||
|
devs := []schema.HardwarePCIeDevice{{
|
||||||
|
VendorID: &vendor,
|
||||||
|
DeviceClass: &class,
|
||||||
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
|
}}
|
||||||
|
out := enrichPCIeWithRAIDTelemetry(devs)
|
||||||
|
if out[0].BatteryHealthPct == nil || *out[0].BatteryHealthPct != 99 {
|
||||||
|
t.Fatalf("battery health=%v", out[0].BatteryHealthPct)
|
||||||
|
}
|
||||||
|
if out[0].BatteryChargePct == nil || *out[0].BatteryChargePct != 81 {
|
||||||
|
t.Fatalf("battery charge=%v", out[0].BatteryChargePct)
|
||||||
|
}
|
||||||
|
if out[0].BatteryVoltageV == nil || *out[0].BatteryVoltageV != 12.1 {
|
||||||
|
t.Fatalf("battery voltage=%v", out[0].BatteryVoltageV)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
373
audit/internal/collector/sensors.go
Normal file
373
audit/internal/collector/sensors.go
Normal file
@@ -0,0 +1,373 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"encoding/json"
|
||||||
|
"log/slog"
|
||||||
|
"os/exec"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// sensorsDoc is the decoded form of `sensors -j` output. The outer key is the
// chip name; the inner map holds that chip's entries keyed by feature name,
// where real features decode to a nested map[string]any of sub-feature values
// and other entries (for example "Adapter") are plain values.
type sensorsDoc map[string]map[string]any
|
||||||
|
|
||||||
|
func collectSensors() *schema.HardwareSensors {
|
||||||
|
doc, err := readSensorsJSONDoc()
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("sensors: unavailable, skipping", "err", err)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
sensors := buildSensorsFromDoc(doc)
|
||||||
|
if sensors == nil || (len(sensors.Fans) == 0 && len(sensors.Power) == 0 && len(sensors.Temperatures) == 0 && len(sensors.Other) == 0) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
slog.Info("sensors: collected",
|
||||||
|
"fans", len(sensors.Fans),
|
||||||
|
"power", len(sensors.Power),
|
||||||
|
"temperatures", len(sensors.Temperatures),
|
||||||
|
"other", len(sensors.Other),
|
||||||
|
)
|
||||||
|
return sensors
|
||||||
|
}
|
||||||
|
|
||||||
|
func readSensorsJSONDoc() (sensorsDoc, error) {
|
||||||
|
out, err := exec.Command("sensors", "-j").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var doc sensorsDoc
|
||||||
|
if err := json.Unmarshal(out, &doc); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return doc, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
||||||
|
if len(doc) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
result := &schema.HardwareSensors{}
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
|
||||||
|
chips := make([]string, 0, len(doc))
|
||||||
|
for chip := range doc {
|
||||||
|
chips = append(chips, chip)
|
||||||
|
}
|
||||||
|
sort.Strings(chips)
|
||||||
|
|
||||||
|
for _, chip := range chips {
|
||||||
|
features := doc[chip]
|
||||||
|
location := sensorLocation(chip)
|
||||||
|
|
||||||
|
keys := make([]string, 0, len(features))
|
||||||
|
for key := range features {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
|
||||||
|
for _, key := range keys {
|
||||||
|
if strings.EqualFold(key, "Adapter") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
feature, ok := features[key].(map[string]any)
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
name := strings.TrimSpace(key)
|
||||||
|
if name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch classifySensorFeature(feature) {
|
||||||
|
case "fan":
|
||||||
|
item := buildFanSensor(name, location, feature)
|
||||||
|
if item == nil || duplicateSensor(seen, "fan", item.Name) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result.Fans = append(result.Fans, *item)
|
||||||
|
case "temp":
|
||||||
|
item := buildTempSensor(name, location, feature)
|
||||||
|
if item == nil || duplicateSensor(seen, "temp", item.Name) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result.Temperatures = append(result.Temperatures, *item)
|
||||||
|
case "power":
|
||||||
|
item := buildPowerSensor(name, location, feature)
|
||||||
|
if item == nil || duplicateSensor(seen, "power", item.Name) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result.Power = append(result.Power, *item)
|
||||||
|
default:
|
||||||
|
item := buildOtherSensor(name, location, feature)
|
||||||
|
if item == nil || duplicateSensor(seen, "other", item.Name) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
result.Other = append(result.Other, *item)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseSensorsJSON(raw []byte) (*schema.HardwareSensors, error) {
|
||||||
|
var doc sensorsDoc
|
||||||
|
err := json.Unmarshal(raw, &doc)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return buildSensorsFromDoc(doc), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// duplicateSensor reports whether the (sensorType, name) pair is already
// recorded in seen. An unseen pair is recorded before returning false, so a
// second call with the same pair returns true. The two parts are joined with a
// NUL byte to form the map key.
func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
	id := sensorType + "\x00" + name
	_, known := seen[id]
	if !known {
		seen[id] = struct{}{}
	}
	return known
}
|
||||||
|
|
||||||
|
// sensorLocation returns a pointer to the whitespace-trimmed chip name, or nil
// when nothing remains after trimming.
func sensorLocation(chip string) *string {
	if trimmed := strings.TrimSpace(chip); trimmed != "" {
		return &trimmed
	}
	return nil
}
|
||||||
|
|
||||||
|
// classifySensorFeature decides which sensor category a feature belongs to by
// inspecting its sub-feature keys. It returns "fan", "temp" or "power" for the
// first key (in sorted order) that matches one of the rules below, and "other"
// when no key matches:
//
//   - contains "fan" and ends in "_input"                  -> "fan"
//   - contains "temp" and ends in "_input"                 -> "temp"
//   - contains "power" and ends in "_input" or "_average"  -> "power"
//   - contains "curr" and ends in "_input"                 -> "power"
//   - starts with "in" and ends in "_input"                -> "power"
//
// Keys are sorted before matching because Go map iteration order is
// unspecified; without sorting, a feature whose keys match more than one rule
// could be classified differently from run to run.
func classifySensorFeature(feature map[string]any) string {
	keys := make([]string, 0, len(feature))
	for key := range feature {
		keys = append(keys, key)
	}
	sort.Strings(keys)

	for _, key := range keys {
		switch {
		case strings.Contains(key, "fan") && strings.HasSuffix(key, "_input"):
			return "fan"
		case strings.Contains(key, "temp") && strings.HasSuffix(key, "_input"):
			return "temp"
		case strings.Contains(key, "power") && (strings.HasSuffix(key, "_input") || strings.HasSuffix(key, "_average")):
			return "power"
		case strings.Contains(key, "curr") && strings.HasSuffix(key, "_input"):
			return "power"
		case strings.HasPrefix(key, "in") && strings.HasSuffix(key, "_input"):
			return "power"
		}
	}
	return "other"
}
|
||||||
|
|
||||||
|
func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
|
||||||
|
rpm, ok := firstFeatureInt(feature, "_input")
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
item := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &rpm}
|
||||||
|
if status := sensorStatusFromFeature(feature); status != nil {
|
||||||
|
item.Status = status
|
||||||
|
}
|
||||||
|
return item
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||||
|
celsius, ok := firstFeatureFloat(feature, "_input")
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
item := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &celsius}
|
||||||
|
if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
|
||||||
|
item.ThresholdWarningCelsius = &warning
|
||||||
|
}
|
||||||
|
if critical, ok := firstFeatureFloatWithSuffixes(feature, []string{"_crit", "_emergency"}); ok {
|
||||||
|
item.ThresholdCriticalCelsius = &critical
|
||||||
|
}
|
||||||
|
if status := sensorStatusFromFeature(feature); status != nil {
|
||||||
|
item.Status = status
|
||||||
|
} else {
|
||||||
|
item.Status = deriveTemperatureStatus(item.Celsius, item.ThresholdWarningCelsius, item.ThresholdCriticalCelsius)
|
||||||
|
}
|
||||||
|
return item
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||||
|
item := &schema.HardwarePowerSensor{Name: name, Location: location}
|
||||||
|
if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
|
||||||
|
item.PowerW = &v
|
||||||
|
}
|
||||||
|
if v, ok := firstFeatureFloatWithPrefix(feature, "curr"); ok {
|
||||||
|
item.CurrentA = &v
|
||||||
|
}
|
||||||
|
if v, ok := firstFeatureFloatWithPrefix(feature, "in"); ok {
|
||||||
|
item.VoltageV = &v
|
||||||
|
}
|
||||||
|
if item.PowerW == nil && item.CurrentA == nil && item.VoltageV == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if status := sensorStatusFromFeature(feature); status != nil {
|
||||||
|
item.Status = status
|
||||||
|
}
|
||||||
|
return item
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||||
|
value, unit, ok := firstGenericSensorValue(feature)
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
item := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &value}
|
||||||
|
if unit != "" {
|
||||||
|
item.Unit = &unit
|
||||||
|
}
|
||||||
|
if status := sensorStatusFromFeature(feature); status != nil {
|
||||||
|
item.Status = status
|
||||||
|
}
|
||||||
|
return item
|
||||||
|
}
|
||||||
|
|
||||||
|
func sensorStatusFromFeature(feature map[string]any) *string {
|
||||||
|
for key, raw := range feature {
|
||||||
|
if !strings.HasSuffix(key, "_alarm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if number, ok := floatFromAny(raw); ok && number > 0 {
|
||||||
|
status := statusWarning
|
||||||
|
return &status
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func deriveTemperatureStatus(current, warning, critical *float64) *string {
|
||||||
|
if current == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case critical != nil && *current >= *critical:
|
||||||
|
status := statusCritical
|
||||||
|
return &status
|
||||||
|
case warning != nil && *current >= *warning:
|
||||||
|
status := statusWarning
|
||||||
|
return &status
|
||||||
|
default:
|
||||||
|
status := statusOK
|
||||||
|
return &status
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeatureInt(feature map[string]any, suffix string) (int, bool) {
|
||||||
|
for key, raw := range feature {
|
||||||
|
if strings.HasSuffix(key, suffix) {
|
||||||
|
if value, ok := floatFromAny(raw); ok {
|
||||||
|
return int(value), true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeatureFloat(feature map[string]any, suffix string) (float64, bool) {
|
||||||
|
return firstFeatureFloatWithSuffixes(feature, []string{suffix})
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeatureFloatWithSuffixes(feature map[string]any, suffixes []string) (float64, bool) {
|
||||||
|
keys := sortedFeatureKeys(feature)
|
||||||
|
for _, key := range keys {
|
||||||
|
for _, suffix := range suffixes {
|
||||||
|
if strings.HasSuffix(key, suffix) {
|
||||||
|
if value, ok := floatFromAny(feature[key]); ok {
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeatureFloatWithContains(feature map[string]any, parts []string) (float64, bool) {
|
||||||
|
keys := sortedFeatureKeys(feature)
|
||||||
|
for _, key := range keys {
|
||||||
|
matched := true
|
||||||
|
for _, part := range parts {
|
||||||
|
if !strings.Contains(key, part) {
|
||||||
|
matched = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if matched {
|
||||||
|
if value, ok := floatFromAny(feature[key]); ok {
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstFeatureFloatWithPrefix(feature map[string]any, prefix string) (float64, bool) {
|
||||||
|
keys := sortedFeatureKeys(feature)
|
||||||
|
for _, key := range keys {
|
||||||
|
if strings.HasPrefix(key, prefix) && strings.HasSuffix(key, "_input") {
|
||||||
|
if value, ok := floatFromAny(feature[key]); ok {
|
||||||
|
return value, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
func firstGenericSensorValue(feature map[string]any) (float64, string, bool) {
|
||||||
|
keys := sortedFeatureKeys(feature)
|
||||||
|
for _, key := range keys {
|
||||||
|
if strings.HasSuffix(key, "_alarm") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
value, ok := floatFromAny(feature[key])
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
unit := inferSensorUnit(key)
|
||||||
|
return value, unit, true
|
||||||
|
}
|
||||||
|
return 0, "", false
|
||||||
|
}
|
||||||
|
|
||||||
|
// inferSensorUnit guesses a display unit from a sub-feature key: "%" for keys
// containing "humidity", and the empty string for everything else.
func inferSensorUnit(key string) string {
	if strings.Contains(key, "humidity") {
		return "%"
	}
	return ""
}
|
||||||
|
|
||||||
|
// sortedFeatureKeys returns the keys of feature in ascending string order. The
// result is a non-nil, possibly empty slice.
func sortedFeatureKeys(feature map[string]any) []string {
	names := make([]string, len(feature))
	next := 0
	for name := range feature {
		names[next] = name
		next++
	}
	sort.Strings(names)
	return names
}
|
||||||
|
|
||||||
|
// floatFromAny converts a decoded JSON value to float64. It accepts float64,
// float32, int, int64, json.Number and numeric strings (surrounding whitespace
// is ignored). The boolean result is false for any other type, for empty or
// unparsable text, and for values that are not finite.
//
// Non-finite results are rejected because strconv.ParseFloat accepts spellings
// such as "NaN" and "Inf", and encoding/json refuses to marshal NaN or
// infinity, so letting one through would break the export that embeds the
// sensor readings.
func floatFromAny(raw any) (float64, bool) {
	// Largest finite float64; same value as math.MaxFloat64.
	const maxFinite = 0x1p1023 * (1 + (1 - 0x1p-52))

	var (
		number float64
		err    error
	)
	switch value := raw.(type) {
	case float64:
		number = value
	case float32:
		number = float64(value)
	case int:
		number = float64(value)
	case int64:
		number = float64(value)
	case json.Number:
		number, err = value.Float64()
	case string:
		text := strings.TrimSpace(value)
		if text == "" {
			return 0, false
		}
		number, err = strconv.ParseFloat(text, 64)
	default:
		return 0, false
	}
	if err != nil {
		return 0, false
	}
	// Both comparisons are false for NaN, so this also filters it out.
	if !(number >= -maxFinite && number <= maxFinite) {
		return 0, false
	}
	return number, true
}
|
||||||
54
audit/internal/collector/sensors_test.go
Normal file
54
audit/internal/collector/sensors_test.go
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import "testing"
|
||||||
|
|
||||||
|
func TestParseSensorsJSON(t *testing.T) {
|
||||||
|
raw := []byte(`{
|
||||||
|
"coretemp-isa-0000": {
|
||||||
|
"Adapter": "ISA adapter",
|
||||||
|
"Package id 0": {
|
||||||
|
"temp1_input": 61.5,
|
||||||
|
"temp1_max": 80.0,
|
||||||
|
"temp1_crit": 95.0
|
||||||
|
},
|
||||||
|
"fan1": {
|
||||||
|
"fan1_input": 4200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"acpitz-acpi-0": {
|
||||||
|
"Adapter": "ACPI interface",
|
||||||
|
"in0": {
|
||||||
|
"in0_input": 12.06
|
||||||
|
},
|
||||||
|
"curr1": {
|
||||||
|
"curr1_input": 0.64
|
||||||
|
},
|
||||||
|
"power1": {
|
||||||
|
"power1_average": 137.0
|
||||||
|
},
|
||||||
|
"humidity1": {
|
||||||
|
"humidity1_input": 38.5
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}`)
|
||||||
|
|
||||||
|
got, err := parseSensorsJSON(raw)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("parseSensorsJSON error: %v", err)
|
||||||
|
}
|
||||||
|
if got == nil {
|
||||||
|
t.Fatal("expected sensors")
|
||||||
|
}
|
||||||
|
if len(got.Temperatures) != 1 || got.Temperatures[0].Celsius == nil || *got.Temperatures[0].Celsius != 61.5 {
|
||||||
|
t.Fatalf("temperatures mismatch: %#v", got.Temperatures)
|
||||||
|
}
|
||||||
|
if len(got.Fans) != 1 || got.Fans[0].RPM == nil || *got.Fans[0].RPM != 4200 {
|
||||||
|
t.Fatalf("fans mismatch: %#v", got.Fans)
|
||||||
|
}
|
||||||
|
if len(got.Power) != 3 {
|
||||||
|
t.Fatalf("power sensors mismatch: %#v", got.Power)
|
||||||
|
}
|
||||||
|
if len(got.Other) != 1 || got.Other[0].Unit == nil || *got.Other[0].Unit != "%" {
|
||||||
|
t.Fatalf("other sensors mismatch: %#v", got.Other)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -26,13 +26,13 @@ func collectStorage() []schema.HardwareStorage {
|
|||||||
|
|
||||||
// lsblkDevice is a minimal lsblk JSON record.
|
// lsblkDevice is a minimal lsblk JSON record.
|
||||||
type lsblkDevice struct {
|
type lsblkDevice struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Type string `json:"type"`
|
Type string `json:"type"`
|
||||||
Size string `json:"size"`
|
Size string `json:"size"`
|
||||||
Serial string `json:"serial"`
|
Serial string `json:"serial"`
|
||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
Tran string `json:"tran"`
|
Tran string `json:"tran"`
|
||||||
Hctl string `json:"hctl"`
|
Hctl string `json:"hctl"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type lsblkRoot struct {
|
type lsblkRoot struct {
|
||||||
@@ -67,7 +67,10 @@ type smartctlInfo struct {
|
|||||||
SerialNumber string `json:"serial_number"`
|
SerialNumber string `json:"serial_number"`
|
||||||
FirmwareVer string `json:"firmware_version"`
|
FirmwareVer string `json:"firmware_version"`
|
||||||
RotationRate int `json:"rotation_rate"`
|
RotationRate int `json:"rotation_rate"`
|
||||||
SmartStatus struct {
|
Temperature struct {
|
||||||
|
Current int `json:"current"`
|
||||||
|
} `json:"temperature"`
|
||||||
|
SmartStatus struct {
|
||||||
Passed bool `json:"passed"`
|
Passed bool `json:"passed"`
|
||||||
} `json:"smart_status"`
|
} `json:"smart_status"`
|
||||||
UserCapacity struct {
|
UserCapacity struct {
|
||||||
@@ -75,9 +78,11 @@ type smartctlInfo struct {
|
|||||||
} `json:"user_capacity"`
|
} `json:"user_capacity"`
|
||||||
AtaSmartAttributes struct {
|
AtaSmartAttributes struct {
|
||||||
Table []struct {
|
Table []struct {
|
||||||
ID int `json:"id"`
|
ID int `json:"id"`
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Raw struct{ Value int64 `json:"value"` } `json:"raw"`
|
Raw struct {
|
||||||
|
Value int64 `json:"value"`
|
||||||
|
} `json:"raw"`
|
||||||
} `json:"table"`
|
} `json:"table"`
|
||||||
} `json:"ata_smart_attributes"`
|
} `json:"ata_smart_attributes"`
|
||||||
PowerOnTime struct {
|
PowerOnTime struct {
|
||||||
@@ -130,7 +135,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
var info smartctlInfo
|
var info smartctlInfo
|
||||||
if err := json.Unmarshal(out, &info); err == nil {
|
if err := json.Unmarshal(out, &info); err == nil {
|
||||||
if v := cleanDMIValue(info.ModelName); v != "" {
|
if v := cleanDMIValue(info.ModelName); v != "" {
|
||||||
s.Model = &v
|
s.Model = &v
|
||||||
@@ -152,14 +157,19 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
} else if info.RotationRate > 0 {
|
} else if info.RotationRate > 0 {
|
||||||
devType = "HDD"
|
devType = "HDD"
|
||||||
}
|
}
|
||||||
|
s.Type = &devType
|
||||||
|
|
||||||
// telemetry
|
if info.Temperature.Current > 0 {
|
||||||
tel := map[string]any{}
|
t := float64(info.Temperature.Current)
|
||||||
|
s.TemperatureC = &t
|
||||||
|
}
|
||||||
if info.PowerOnTime.Hours > 0 {
|
if info.PowerOnTime.Hours > 0 {
|
||||||
tel["power_on_hours"] = info.PowerOnTime.Hours
|
v := int64(info.PowerOnTime.Hours)
|
||||||
|
s.PowerOnHours = &v
|
||||||
}
|
}
|
||||||
if info.PowerCycleCount > 0 {
|
if info.PowerCycleCount > 0 {
|
||||||
tel["power_cycles"] = info.PowerCycleCount
|
v := int64(info.PowerCycleCount)
|
||||||
|
s.PowerCycles = &v
|
||||||
}
|
}
|
||||||
reallocated := int64(0)
|
reallocated := int64(0)
|
||||||
pending := int64(0)
|
pending := int64(0)
|
||||||
@@ -169,77 +179,79 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
switch attr.ID {
|
switch attr.ID {
|
||||||
case 5:
|
case 5:
|
||||||
reallocated = attr.Raw.Value
|
reallocated = attr.Raw.Value
|
||||||
tel["reallocated_sectors"] = attr.Raw.Value
|
s.ReallocatedSectors = &reallocated
|
||||||
case 177:
|
case 177:
|
||||||
tel["wear_leveling_pct"] = attr.Raw.Value
|
value := float64(attr.Raw.Value)
|
||||||
|
s.LifeUsedPct = &value
|
||||||
case 231:
|
case 231:
|
||||||
lifeRemaining = attr.Raw.Value
|
lifeRemaining = attr.Raw.Value
|
||||||
tel["life_remaining_pct"] = attr.Raw.Value
|
value := float64(attr.Raw.Value)
|
||||||
|
s.LifeRemainingPct = &value
|
||||||
case 241:
|
case 241:
|
||||||
tel["total_lba_written"] = attr.Raw.Value
|
value := attr.Raw.Value
|
||||||
|
s.WrittenBytes = &value
|
||||||
case 197:
|
case 197:
|
||||||
pending = attr.Raw.Value
|
pending = attr.Raw.Value
|
||||||
tel["current_pending_sectors"] = attr.Raw.Value
|
s.CurrentPendingSectors = &pending
|
||||||
case 198:
|
case 198:
|
||||||
uncorrectable = attr.Raw.Value
|
uncorrectable = attr.Raw.Value
|
||||||
tel["offline_uncorrectable"] = attr.Raw.Value
|
s.OfflineUncorrectable = &uncorrectable
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if len(tel) > 0 {
|
|
||||||
s.Telemetry = tel
|
|
||||||
}
|
|
||||||
|
|
||||||
status := storageHealthStatus{
|
status := storageHealthStatus{
|
||||||
overallPassed: info.SmartStatus.Passed,
|
overallPassed: info.SmartStatus.Passed,
|
||||||
hasOverall: true,
|
hasOverall: true,
|
||||||
reallocatedSectors: reallocated,
|
reallocatedSectors: reallocated,
|
||||||
pendingSectors: pending,
|
pendingSectors: pending,
|
||||||
offlineUncorrectable: uncorrectable,
|
offlineUncorrectable: uncorrectable,
|
||||||
lifeRemainingPct: lifeRemaining,
|
lifeRemainingPct: lifeRemaining,
|
||||||
}
|
}
|
||||||
setStorageHealthStatus(&s, status)
|
setStorageHealthStatus(&s, status)
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
s.Type = &devType
|
s.Type = &devType
|
||||||
status := "UNKNOWN"
|
status := statusUnknown
|
||||||
s.Status = &status
|
s.Status = &status
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
||||||
type nvmeSmartLog struct {
|
type nvmeSmartLog struct {
|
||||||
CriticalWarning int `json:"critical_warning"`
|
CriticalWarning int `json:"critical_warning"`
|
||||||
PercentageUsed int `json:"percentage_used"`
|
PercentageUsed int `json:"percentage_used"`
|
||||||
AvailableSpare int `json:"available_spare"`
|
AvailableSpare int `json:"available_spare"`
|
||||||
SpareThreshold int `json:"spare_thresh"`
|
SpareThreshold int `json:"spare_thresh"`
|
||||||
PowerOnHours int64 `json:"power_on_hours"`
|
Temperature int64 `json:"temperature"`
|
||||||
PowerCycles int64 `json:"power_cycles"`
|
PowerOnHours int64 `json:"power_on_hours"`
|
||||||
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
|
PowerCycles int64 `json:"power_cycles"`
|
||||||
DataUnitsWritten int64 `json:"data_units_written"`
|
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
|
||||||
ControllerBusy int64 `json:"controller_busy_time"`
|
DataUnitsRead int64 `json:"data_units_read"`
|
||||||
MediaErrors int64 `json:"media_errors"`
|
DataUnitsWritten int64 `json:"data_units_written"`
|
||||||
NumErrLogEntries int64 `json:"num_err_log_entries"`
|
ControllerBusy int64 `json:"controller_busy_time"`
|
||||||
|
MediaErrors int64 `json:"media_errors"`
|
||||||
|
NumErrLogEntries int64 `json:"num_err_log_entries"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
||||||
type nvmeIDCtrl struct {
|
type nvmeIDCtrl struct {
|
||||||
ModelNumber string `json:"mn"`
|
ModelNumber string `json:"mn"`
|
||||||
SerialNumber string `json:"sn"`
|
SerialNumber string `json:"sn"`
|
||||||
FirmwareRev string `json:"fr"`
|
FirmwareRev string `json:"fr"`
|
||||||
TotalCapacity int64 `json:"tnvmcap"`
|
TotalCapacity int64 `json:"tnvmcap"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||||
present := true
|
present := true
|
||||||
devType := "NVMe"
|
devType := "NVMe"
|
||||||
iface := "NVMe"
|
iface := "NVMe"
|
||||||
status := "OK"
|
status := statusOK
|
||||||
s := schema.HardwareStorage{
|
s := schema.HardwareStorage{
|
||||||
Present: &present,
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
||||||
Type: &devType,
|
Present: &present,
|
||||||
Interface: &iface,
|
Type: &devType,
|
||||||
Status: &status,
|
Interface: &iface,
|
||||||
}
|
}
|
||||||
|
|
||||||
devPath := "/dev/" + dev.Name
|
devPath := "/dev/" + dev.Name
|
||||||
@@ -268,100 +280,123 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
||||||
var log nvmeSmartLog
|
var log nvmeSmartLog
|
||||||
if json.Unmarshal(out, &log) == nil {
|
if json.Unmarshal(out, &log) == nil {
|
||||||
tel := map[string]any{}
|
|
||||||
if log.CriticalWarning > 0 {
|
|
||||||
tel["critical_warning"] = log.CriticalWarning
|
|
||||||
}
|
|
||||||
if log.PowerOnHours > 0 {
|
if log.PowerOnHours > 0 {
|
||||||
tel["power_on_hours"] = log.PowerOnHours
|
s.PowerOnHours = &log.PowerOnHours
|
||||||
}
|
}
|
||||||
if log.PowerCycles > 0 {
|
if log.PowerCycles > 0 {
|
||||||
tel["power_cycles"] = log.PowerCycles
|
s.PowerCycles = &log.PowerCycles
|
||||||
}
|
}
|
||||||
if log.UnsafeShutdowns > 0 {
|
if log.UnsafeShutdowns > 0 {
|
||||||
tel["unsafe_shutdowns"] = log.UnsafeShutdowns
|
s.UnsafeShutdowns = &log.UnsafeShutdowns
|
||||||
}
|
}
|
||||||
if log.PercentageUsed > 0 {
|
if log.PercentageUsed > 0 {
|
||||||
tel["percentage_used"] = log.PercentageUsed
|
v := float64(log.PercentageUsed)
|
||||||
|
s.LifeUsedPct = &v
|
||||||
|
remaining := 100 - v
|
||||||
|
s.LifeRemainingPct = &remaining
|
||||||
}
|
}
|
||||||
if log.DataUnitsWritten > 0 {
|
if log.DataUnitsWritten > 0 {
|
||||||
tel["data_units_written"] = log.DataUnitsWritten
|
v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
|
||||||
|
s.WrittenBytes = &v
|
||||||
}
|
}
|
||||||
if log.ControllerBusy > 0 {
|
if log.DataUnitsRead > 0 {
|
||||||
tel["controller_busy_time"] = log.ControllerBusy
|
v := nvmeDataUnitsToBytes(log.DataUnitsRead)
|
||||||
|
s.ReadBytes = &v
|
||||||
}
|
}
|
||||||
if log.AvailableSpare > 0 {
|
if log.AvailableSpare > 0 {
|
||||||
tel["available_spare_pct"] = log.AvailableSpare
|
v := float64(log.AvailableSpare)
|
||||||
}
|
s.AvailableSparePct = &v
|
||||||
if log.SpareThreshold > 0 {
|
|
||||||
tel["available_spare_threshold_pct"] = log.SpareThreshold
|
|
||||||
}
|
}
|
||||||
if log.MediaErrors > 0 {
|
if log.MediaErrors > 0 {
|
||||||
tel["media_errors"] = log.MediaErrors
|
s.MediaErrors = &log.MediaErrors
|
||||||
}
|
}
|
||||||
if log.NumErrLogEntries > 0 {
|
if log.NumErrLogEntries > 0 {
|
||||||
tel["error_log_entries"] = log.NumErrLogEntries
|
s.ErrorLogEntries = &log.NumErrLogEntries
|
||||||
}
|
}
|
||||||
if len(tel) > 0 {
|
if log.Temperature > 0 {
|
||||||
s.Telemetry = tel
|
v := float64(log.Temperature - 273)
|
||||||
|
s.TemperatureC = &v
|
||||||
}
|
}
|
||||||
setStorageHealthStatus(&s, storageHealthStatus{
|
setStorageHealthStatus(&s, storageHealthStatus{
|
||||||
criticalWarning: log.CriticalWarning,
|
criticalWarning: log.CriticalWarning,
|
||||||
percentageUsed: int64(log.PercentageUsed),
|
percentageUsed: int64(log.PercentageUsed),
|
||||||
availableSpare: int64(log.AvailableSpare),
|
availableSpare: int64(log.AvailableSpare),
|
||||||
spareThreshold: int64(log.SpareThreshold),
|
spareThreshold: int64(log.SpareThreshold),
|
||||||
unsafeShutdowns: log.UnsafeShutdowns,
|
unsafeShutdowns: log.UnsafeShutdowns,
|
||||||
mediaErrors: log.MediaErrors,
|
mediaErrors: log.MediaErrors,
|
||||||
errorLogEntries: log.NumErrLogEntries,
|
errorLogEntries: log.NumErrLogEntries,
|
||||||
})
|
})
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
status = "UNKNOWN"
|
status = statusUnknown
|
||||||
s.Status = &status
|
s.Status = &status
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// nvmeDataUnitsToBytes converts a data-unit counter to bytes at 512000 bytes
// per unit. Zero and negative counts yield 0. Counts large enough that the
// product would not fit in int64 are clamped to the maximum int64 value
// instead of wrapping around to a negative byte total.
func nvmeDataUnitsToBytes(units int64) int64 {
	const (
		bytesPerUnit = 512000
		maxInt64     = 1<<63 - 1
	)
	if units <= 0 {
		return 0
	}
	if units > maxInt64/bytesPerUnit {
		return maxInt64
	}
	return units * bytesPerUnit
}
|
||||||
|
|
||||||
type storageHealthStatus struct {
|
type storageHealthStatus struct {
|
||||||
hasOverall bool
|
hasOverall bool
|
||||||
overallPassed bool
|
overallPassed bool
|
||||||
reallocatedSectors int64
|
reallocatedSectors int64
|
||||||
pendingSectors int64
|
pendingSectors int64
|
||||||
offlineUncorrectable int64
|
offlineUncorrectable int64
|
||||||
lifeRemainingPct int64
|
lifeRemainingPct int64
|
||||||
criticalWarning int
|
criticalWarning int
|
||||||
percentageUsed int64
|
percentageUsed int64
|
||||||
availableSpare int64
|
availableSpare int64
|
||||||
spareThreshold int64
|
spareThreshold int64
|
||||||
unsafeShutdowns int64
|
unsafeShutdowns int64
|
||||||
mediaErrors int64
|
mediaErrors int64
|
||||||
errorLogEntries int64
|
errorLogEntries int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
|
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
|
||||||
status := "OK"
|
status := statusOK
|
||||||
|
var description *string
|
||||||
switch {
|
switch {
|
||||||
case health.hasOverall && !health.overallPassed:
|
case health.hasOverall && !health.overallPassed:
|
||||||
status = "FAILED"
|
status = statusCritical
|
||||||
|
description = stringPtr("SMART overall self-assessment failed")
|
||||||
case health.criticalWarning > 0:
|
case health.criticalWarning > 0:
|
||||||
status = "FAILED"
|
status = statusCritical
|
||||||
|
description = stringPtr("NVMe critical warning is set")
|
||||||
case health.pendingSectors > 0 || health.offlineUncorrectable > 0:
|
case health.pendingSectors > 0 || health.offlineUncorrectable > 0:
|
||||||
status = "FAILED"
|
status = statusCritical
|
||||||
|
description = stringPtr("Pending or offline uncorrectable sectors detected")
|
||||||
case health.mediaErrors > 0:
|
case health.mediaErrors > 0:
|
||||||
status = "WARNING"
|
status = statusWarning
|
||||||
|
description = stringPtr("Media errors reported")
|
||||||
case health.reallocatedSectors > 0:
|
case health.reallocatedSectors > 0:
|
||||||
status = "WARNING"
|
status = statusWarning
|
||||||
|
description = stringPtr("Reallocated sectors detected")
|
||||||
case health.errorLogEntries > 0:
|
case health.errorLogEntries > 0:
|
||||||
status = "WARNING"
|
status = statusWarning
|
||||||
|
description = stringPtr("Device error log contains entries")
|
||||||
case health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10:
|
case health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10:
|
||||||
status = "WARNING"
|
status = statusWarning
|
||||||
|
description = stringPtr("Life remaining is low")
|
||||||
case health.percentageUsed >= 95:
|
case health.percentageUsed >= 95:
|
||||||
status = "WARNING"
|
status = statusWarning
|
||||||
|
description = stringPtr("Drive wear level is high")
|
||||||
case health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold:
|
case health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold:
|
||||||
status = "WARNING"
|
status = statusWarning
|
||||||
|
description = stringPtr("Available spare is at or below threshold")
|
||||||
case health.unsafeShutdowns > 100:
|
case health.unsafeShutdowns > 100:
|
||||||
status = "WARNING"
|
status = statusWarning
|
||||||
|
description = stringPtr("Unsafe shutdown count is high")
|
||||||
}
|
}
|
||||||
s.Status = &status
|
s.Status = &status
|
||||||
|
s.ErrorDescription = description
|
||||||
|
}
|
||||||
|
|
||||||
|
func stringPtr(value string) *string {
|
||||||
|
return &value
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,37 +17,37 @@ func TestSetStorageHealthStatus(t *testing.T) {
|
|||||||
{
|
{
|
||||||
name: "smart overall failed",
|
name: "smart overall failed",
|
||||||
health: storageHealthStatus{hasOverall: true, overallPassed: false},
|
health: storageHealthStatus{hasOverall: true, overallPassed: false},
|
||||||
want: "FAILED",
|
want: statusCritical,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "nvme critical warning",
|
name: "nvme critical warning",
|
||||||
health: storageHealthStatus{criticalWarning: 1},
|
health: storageHealthStatus{criticalWarning: 1},
|
||||||
want: "FAILED",
|
want: statusCritical,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "pending sectors",
|
name: "pending sectors",
|
||||||
health: storageHealthStatus{pendingSectors: 1},
|
health: storageHealthStatus{pendingSectors: 1},
|
||||||
want: "FAILED",
|
want: statusCritical,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "media errors warning",
|
name: "media errors warning",
|
||||||
health: storageHealthStatus{mediaErrors: 2},
|
health: storageHealthStatus{mediaErrors: 2},
|
||||||
want: "WARNING",
|
want: statusWarning,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "reallocated warning",
|
name: "reallocated warning",
|
||||||
health: storageHealthStatus{reallocatedSectors: 1},
|
health: storageHealthStatus{reallocatedSectors: 1},
|
||||||
want: "WARNING",
|
want: statusWarning,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "life remaining low",
|
name: "life remaining low",
|
||||||
health: storageHealthStatus{lifeRemainingPct: 8},
|
health: storageHealthStatus{lifeRemainingPct: 8},
|
||||||
want: "WARNING",
|
want: statusWarning,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "healthy",
|
name: "healthy",
|
||||||
health: storageHealthStatus{},
|
health: storageHealthStatus{},
|
||||||
want: "OK",
|
want: statusOK,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -6,31 +6,31 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSummary {
|
func BuildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSummary {
|
||||||
summary := &schema.HardwareHealthSummary{
|
summary := &schema.HardwareHealthSummary{
|
||||||
Status: "OK",
|
Status: statusOK,
|
||||||
CollectedAt: time.Now().UTC().Format(time.RFC3339),
|
CollectedAt: time.Now().UTC().Format(time.RFC3339),
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, dimm := range snap.Memory {
|
for _, dimm := range snap.Memory {
|
||||||
switch derefString(dimm.Status) {
|
switch derefString(dimm.Status) {
|
||||||
case "WARNING":
|
case statusWarning:
|
||||||
summary.MemoryWarn++
|
summary.MemoryWarn++
|
||||||
summary.Warnings = append(summary.Warnings, formatMemorySummary(dimm))
|
summary.Warnings = append(summary.Warnings, formatMemorySummary(dimm))
|
||||||
case "FAILED":
|
case statusCritical:
|
||||||
summary.MemoryFail++
|
summary.MemoryFail++
|
||||||
summary.Failures = append(summary.Failures, formatMemorySummary(dimm))
|
summary.Failures = append(summary.Failures, formatMemorySummary(dimm))
|
||||||
case "EMPTY":
|
case statusEmpty:
|
||||||
summary.EmptyDIMMs++
|
summary.EmptyDIMMs++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, disk := range snap.Storage {
|
for _, disk := range snap.Storage {
|
||||||
switch derefString(disk.Status) {
|
switch derefString(disk.Status) {
|
||||||
case "WARNING":
|
case statusWarning:
|
||||||
summary.StorageWarn++
|
summary.StorageWarn++
|
||||||
summary.Warnings = append(summary.Warnings, formatStorageSummary(disk))
|
summary.Warnings = append(summary.Warnings, formatStorageSummary(disk))
|
||||||
case "FAILED":
|
case statusCritical:
|
||||||
summary.StorageFail++
|
summary.StorageFail++
|
||||||
summary.Failures = append(summary.Failures, formatStorageSummary(disk))
|
summary.Failures = append(summary.Failures, formatStorageSummary(disk))
|
||||||
}
|
}
|
||||||
@@ -38,10 +38,10 @@ func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSumm
|
|||||||
|
|
||||||
for _, dev := range snap.PCIeDevices {
|
for _, dev := range snap.PCIeDevices {
|
||||||
switch derefString(dev.Status) {
|
switch derefString(dev.Status) {
|
||||||
case "WARNING":
|
case statusWarning:
|
||||||
summary.PCIeWarn++
|
summary.PCIeWarn++
|
||||||
summary.Warnings = append(summary.Warnings, formatPCIeSummary(dev))
|
summary.Warnings = append(summary.Warnings, formatPCIeSummary(dev))
|
||||||
case "FAILED":
|
case statusCritical:
|
||||||
summary.PCIeFail++
|
summary.PCIeFail++
|
||||||
summary.Failures = append(summary.Failures, formatPCIeSummary(dev))
|
summary.Failures = append(summary.Failures, formatPCIeSummary(dev))
|
||||||
}
|
}
|
||||||
@@ -52,19 +52,19 @@ func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSumm
|
|||||||
summary.MissingPSUs++
|
summary.MissingPSUs++
|
||||||
}
|
}
|
||||||
switch derefString(psu.Status) {
|
switch derefString(psu.Status) {
|
||||||
case "WARNING":
|
case statusWarning:
|
||||||
summary.PSUWarn++
|
summary.PSUWarn++
|
||||||
summary.Warnings = append(summary.Warnings, formatPSUSummary(psu))
|
summary.Warnings = append(summary.Warnings, formatPSUSummary(psu))
|
||||||
case "FAILED":
|
case statusCritical:
|
||||||
summary.PSUFail++
|
summary.PSUFail++
|
||||||
summary.Failures = append(summary.Failures, formatPSUSummary(psu))
|
summary.Failures = append(summary.Failures, formatPSUSummary(psu))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(summary.Failures) > 0 || summary.StorageFail > 0 || summary.PCIeFail > 0 || summary.PSUFail > 0 || summary.MemoryFail > 0 {
|
if len(summary.Failures) > 0 || summary.StorageFail > 0 || summary.PCIeFail > 0 || summary.PSUFail > 0 || summary.MemoryFail > 0 {
|
||||||
summary.Status = "FAILED"
|
summary.Status = statusCritical
|
||||||
} else if len(summary.Warnings) > 0 || summary.StorageWarn > 0 || summary.PCIeWarn > 0 || summary.PSUWarn > 0 || summary.MemoryWarn > 0 {
|
} else if len(summary.Warnings) > 0 || summary.StorageWarn > 0 || summary.PCIeWarn > 0 || summary.PSUWarn > 0 || summary.MemoryWarn > 0 {
|
||||||
summary.Status = "WARNING"
|
summary.Status = statusWarning
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(summary.Warnings) == 0 {
|
if len(summary.Warnings) == 0 {
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
|
|||||||
func TestHasVROCController(t *testing.T) {
|
func TestHasVROCController(t *testing.T) {
|
||||||
intel := vendorIntel
|
intel := vendorIntel
|
||||||
model := "Volume Management Device NVMe RAID Controller"
|
model := "Volume Management Device NVMe RAID Controller"
|
||||||
class := "RAID bus controller"
|
class := "MassStorageController"
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
name string
|
name string
|
||||||
pcie []schema.HardwarePCIeDevice
|
pcie []schema.HardwarePCIeDevice
|
||||||
|
|||||||
@@ -5,10 +5,10 @@ package schema
|
|||||||
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
||||||
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
||||||
type HardwareIngestRequest struct {
|
type HardwareIngestRequest struct {
|
||||||
Filename *string `json:"filename"`
|
Filename *string `json:"filename,omitempty"`
|
||||||
SourceType *string `json:"source_type"`
|
SourceType *string `json:"source_type,omitempty"`
|
||||||
Protocol *string `json:"protocol"`
|
Protocol *string `json:"protocol,omitempty"`
|
||||||
TargetHost string `json:"target_host"`
|
TargetHost *string `json:"target_host,omitempty"`
|
||||||
CollectedAt string `json:"collected_at"`
|
CollectedAt string `json:"collected_at"`
|
||||||
Hardware HardwareSnapshot `json:"hardware"`
|
Hardware HardwareSnapshot `json:"hardware"`
|
||||||
}
|
}
|
||||||
@@ -21,32 +21,32 @@ type HardwareSnapshot struct {
|
|||||||
Storage []HardwareStorage `json:"storage,omitempty"`
|
Storage []HardwareStorage `json:"storage,omitempty"`
|
||||||
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
||||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||||
Summary *HardwareHealthSummary `json:"summary,omitempty"`
|
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareHealthSummary struct {
|
type HardwareHealthSummary struct {
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
Warnings []string `json:"warnings,omitempty"`
|
Warnings []string `json:"warnings,omitempty"`
|
||||||
Failures []string `json:"failures,omitempty"`
|
Failures []string `json:"failures,omitempty"`
|
||||||
StorageWarn int `json:"storage_warn,omitempty"`
|
StorageWarn int `json:"storage_warn,omitempty"`
|
||||||
StorageFail int `json:"storage_fail,omitempty"`
|
StorageFail int `json:"storage_fail,omitempty"`
|
||||||
PCIeWarn int `json:"pcie_warn,omitempty"`
|
PCIeWarn int `json:"pcie_warn,omitempty"`
|
||||||
PCIeFail int `json:"pcie_fail,omitempty"`
|
PCIeFail int `json:"pcie_fail,omitempty"`
|
||||||
PSUWarn int `json:"psu_warn,omitempty"`
|
PSUWarn int `json:"psu_warn,omitempty"`
|
||||||
PSUFail int `json:"psu_fail,omitempty"`
|
PSUFail int `json:"psu_fail,omitempty"`
|
||||||
MemoryWarn int `json:"memory_warn,omitempty"`
|
MemoryWarn int `json:"memory_warn,omitempty"`
|
||||||
MemoryFail int `json:"memory_fail,omitempty"`
|
MemoryFail int `json:"memory_fail,omitempty"`
|
||||||
EmptyDIMMs int `json:"empty_dimms,omitempty"`
|
EmptyDIMMs int `json:"empty_dimms,omitempty"`
|
||||||
MissingPSUs int `json:"missing_psus,omitempty"`
|
MissingPSUs int `json:"missing_psus,omitempty"`
|
||||||
CollectedAt string `json:"collected_at,omitempty"`
|
CollectedAt string `json:"collected_at,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareBoard struct {
|
type HardwareBoard struct {
|
||||||
Manufacturer *string `json:"manufacturer"`
|
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||||
ProductName *string `json:"product_name"`
|
ProductName *string `json:"product_name,omitempty"`
|
||||||
SerialNumber string `json:"serial_number"`
|
SerialNumber string `json:"serial_number"`
|
||||||
PartNumber *string `json:"part_number"`
|
PartNumber *string `json:"part_number,omitempty"`
|
||||||
UUID *string `json:"uuid"`
|
UUID *string `json:"uuid,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareFirmwareRecord struct {
|
type HardwareFirmwareRecord struct {
|
||||||
@@ -55,77 +55,183 @@ type HardwareFirmwareRecord struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type HardwareCPU struct {
|
type HardwareCPU struct {
|
||||||
Socket *int `json:"socket"`
|
HardwareComponentStatus
|
||||||
Model *string `json:"model"`
|
Socket *int `json:"socket,omitempty"`
|
||||||
Manufacturer *string `json:"manufacturer"`
|
Model *string `json:"model,omitempty"`
|
||||||
Status *string `json:"status"`
|
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||||
SerialNumber *string `json:"serial_number"`
|
SerialNumber *string `json:"serial_number,omitempty"`
|
||||||
Firmware *string `json:"firmware"`
|
Firmware *string `json:"firmware,omitempty"`
|
||||||
Cores *int `json:"cores"`
|
Cores *int `json:"cores,omitempty"`
|
||||||
Threads *int `json:"threads"`
|
Threads *int `json:"threads,omitempty"`
|
||||||
FrequencyMHz *int `json:"frequency_mhz"`
|
FrequencyMHz *int `json:"frequency_mhz,omitempty"`
|
||||||
MaxFrequencyMHz *int `json:"max_frequency_mhz"`
|
MaxFrequencyMHz *int `json:"max_frequency_mhz,omitempty"`
|
||||||
|
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||||
|
PowerW *float64 `json:"power_w,omitempty"`
|
||||||
|
Throttled *bool `json:"throttled,omitempty"`
|
||||||
|
CorrectableErrorCount *int64 `json:"correctable_error_count,omitempty"`
|
||||||
|
UncorrectableErrorCount *int64 `json:"uncorrectable_error_count,omitempty"`
|
||||||
|
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||||
|
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||||
|
Present *bool `json:"present,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareMemory struct {
|
type HardwareMemory struct {
|
||||||
Slot *string `json:"slot"`
|
HardwareComponentStatus
|
||||||
Location *string `json:"location"`
|
Slot *string `json:"slot,omitempty"`
|
||||||
Present *bool `json:"present"`
|
Location *string `json:"location,omitempty"`
|
||||||
SizeMB *int `json:"size_mb"`
|
Present *bool `json:"present,omitempty"`
|
||||||
Type *string `json:"type"`
|
SizeMB *int `json:"size_mb,omitempty"`
|
||||||
MaxSpeedMHz *int `json:"max_speed_mhz"`
|
Type *string `json:"type,omitempty"`
|
||||||
CurrentSpeedMHz *int `json:"current_speed_mhz"`
|
MaxSpeedMHz *int `json:"max_speed_mhz,omitempty"`
|
||||||
Manufacturer *string `json:"manufacturer"`
|
CurrentSpeedMHz *int `json:"current_speed_mhz,omitempty"`
|
||||||
SerialNumber *string `json:"serial_number"`
|
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||||
PartNumber *string `json:"part_number"`
|
SerialNumber *string `json:"serial_number,omitempty"`
|
||||||
Status *string `json:"status"`
|
PartNumber *string `json:"part_number,omitempty"`
|
||||||
|
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||||
|
CorrectableECCErrorCount *int64 `json:"correctable_ecc_error_count,omitempty"`
|
||||||
|
UncorrectableECCErrorCount *int64 `json:"uncorrectable_ecc_error_count,omitempty"`
|
||||||
|
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||||
|
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||||
|
SpareBlocksRemainingPct *float64 `json:"spare_blocks_remaining_pct,omitempty"`
|
||||||
|
PerformanceDegraded *bool `json:"performance_degraded,omitempty"`
|
||||||
|
DataLossDetected *bool `json:"data_loss_detected,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareStorage struct {
|
type HardwareStorage struct {
|
||||||
Slot *string `json:"slot"`
|
HardwareComponentStatus
|
||||||
Type *string `json:"type"`
|
Slot *string `json:"slot,omitempty"`
|
||||||
Model *string `json:"model"`
|
Type *string `json:"type,omitempty"`
|
||||||
SizeGB *int `json:"size_gb"`
|
Model *string `json:"model,omitempty"`
|
||||||
SerialNumber *string `json:"serial_number"`
|
SizeGB *int `json:"size_gb,omitempty"`
|
||||||
Manufacturer *string `json:"manufacturer"`
|
SerialNumber *string `json:"serial_number,omitempty"`
|
||||||
Firmware *string `json:"firmware"`
|
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||||
Interface *string `json:"interface"`
|
Firmware *string `json:"firmware,omitempty"`
|
||||||
Present *bool `json:"present"`
|
Interface *string `json:"interface,omitempty"`
|
||||||
Status *string `json:"status"`
|
Present *bool `json:"present,omitempty"`
|
||||||
Telemetry map[string]any `json:"telemetry,omitempty"`
|
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||||
|
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||||
|
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||||
|
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||||
|
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||||
|
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||||
|
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||||
|
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||||
|
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||||
|
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||||
|
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||||
|
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||||
|
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||||
|
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||||
|
Telemetry map[string]any `json:"-"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwarePCIeDevice struct {
|
type HardwarePCIeDevice struct {
|
||||||
Slot *string `json:"slot"`
|
HardwareComponentStatus
|
||||||
VendorID *int `json:"vendor_id"`
|
Slot *string `json:"slot,omitempty"`
|
||||||
DeviceID *int `json:"device_id"`
|
VendorID *int `json:"vendor_id,omitempty"`
|
||||||
BDF *string `json:"bdf"`
|
DeviceID *int `json:"device_id,omitempty"`
|
||||||
DeviceClass *string `json:"device_class"`
|
NUMANode *int `json:"numa_node,omitempty"`
|
||||||
Manufacturer *string `json:"manufacturer"`
|
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||||
Model *string `json:"model"`
|
PowerW *float64 `json:"power_w,omitempty"`
|
||||||
LinkWidth *int `json:"link_width"`
|
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||||
LinkSpeed *string `json:"link_speed"`
|
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||||
MaxLinkWidth *int `json:"max_link_width"`
|
ECCCorrectedTotal *int64 `json:"ecc_corrected_total,omitempty"`
|
||||||
MaxLinkSpeed *string `json:"max_link_speed"`
|
ECCUncorrectedTotal *int64 `json:"ecc_uncorrected_total,omitempty"`
|
||||||
SerialNumber *string `json:"serial_number"`
|
HWSlowdown *bool `json:"hw_slowdown,omitempty"`
|
||||||
Firmware *string `json:"firmware"`
|
BatteryChargePct *float64 `json:"battery_charge_pct,omitempty"`
|
||||||
Present *bool `json:"present"`
|
BatteryHealthPct *float64 `json:"battery_health_pct,omitempty"`
|
||||||
Status *string `json:"status"`
|
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||||
Telemetry map[string]any `json:"telemetry,omitempty"`
|
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||||
|
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||||
|
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||||
|
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||||
|
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||||
|
SFPVoltageV *float64 `json:"sfp_voltage_v,omitempty"`
|
||||||
|
SFPBiasMA *float64 `json:"sfp_bias_ma,omitempty"`
|
||||||
|
BDF *string `json:"bdf,omitempty"`
|
||||||
|
DeviceClass *string `json:"device_class,omitempty"`
|
||||||
|
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||||
|
Model *string `json:"model,omitempty"`
|
||||||
|
LinkWidth *int `json:"link_width,omitempty"`
|
||||||
|
LinkSpeed *string `json:"link_speed,omitempty"`
|
||||||
|
MaxLinkWidth *int `json:"max_link_width,omitempty"`
|
||||||
|
MaxLinkSpeed *string `json:"max_link_speed,omitempty"`
|
||||||
|
SerialNumber *string `json:"serial_number,omitempty"`
|
||||||
|
Firmware *string `json:"firmware,omitempty"`
|
||||||
|
MacAddresses []string `json:"mac_addresses,omitempty"`
|
||||||
|
Present *bool `json:"present,omitempty"`
|
||||||
|
Telemetry map[string]any `json:"-"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwarePowerSupply struct {
|
type HardwarePowerSupply struct {
|
||||||
Slot *string `json:"slot"`
|
HardwareComponentStatus
|
||||||
Present *bool `json:"present"`
|
Slot *string `json:"slot,omitempty"`
|
||||||
Model *string `json:"model"`
|
Present *bool `json:"present,omitempty"`
|
||||||
Vendor *string `json:"vendor"`
|
Model *string `json:"model,omitempty"`
|
||||||
WattageW *int `json:"wattage_w"`
|
Vendor *string `json:"vendor,omitempty"`
|
||||||
SerialNumber *string `json:"serial_number"`
|
WattageW *int `json:"wattage_w,omitempty"`
|
||||||
PartNumber *string `json:"part_number"`
|
SerialNumber *string `json:"serial_number,omitempty"`
|
||||||
Firmware *string `json:"firmware"`
|
PartNumber *string `json:"part_number,omitempty"`
|
||||||
Status *string `json:"status"`
|
Firmware *string `json:"firmware,omitempty"`
|
||||||
InputType *string `json:"input_type"`
|
InputType *string `json:"input_type,omitempty"`
|
||||||
InputPowerW *float64 `json:"input_power_w"`
|
InputPowerW *float64 `json:"input_power_w,omitempty"`
|
||||||
OutputPowerW *float64 `json:"output_power_w"`
|
OutputPowerW *float64 `json:"output_power_w,omitempty"`
|
||||||
InputVoltage *float64 `json:"input_voltage"`
|
InputVoltage *float64 `json:"input_voltage,omitempty"`
|
||||||
|
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||||
|
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||||
|
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareComponentStatus struct {
|
||||||
|
Status *string `json:"status,omitempty"`
|
||||||
|
StatusCheckedAt *string `json:"status_checked_at,omitempty"`
|
||||||
|
StatusChangedAt *string `json:"status_changed_at,omitempty"`
|
||||||
|
StatusHistory []HardwareStatusHistory `json:"status_history,omitempty"`
|
||||||
|
ErrorDescription *string `json:"error_description,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareStatusHistory struct {
|
||||||
|
Status string `json:"status"`
|
||||||
|
ChangedAt string `json:"changed_at"`
|
||||||
|
Details *string `json:"details,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareSensors struct {
|
||||||
|
Fans []HardwareFanSensor `json:"fans,omitempty"`
|
||||||
|
Power []HardwarePowerSensor `json:"power,omitempty"`
|
||||||
|
Temperatures []HardwareTemperatureSensor `json:"temperatures,omitempty"`
|
||||||
|
Other []HardwareOtherSensor `json:"other,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareFanSensor struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Location *string `json:"location,omitempty"`
|
||||||
|
RPM *int `json:"rpm,omitempty"`
|
||||||
|
Status *string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwarePowerSensor struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Location *string `json:"location,omitempty"`
|
||||||
|
VoltageV *float64 `json:"voltage_v,omitempty"`
|
||||||
|
CurrentA *float64 `json:"current_a,omitempty"`
|
||||||
|
PowerW *float64 `json:"power_w,omitempty"`
|
||||||
|
Status *string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareTemperatureSensor struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Location *string `json:"location,omitempty"`
|
||||||
|
Celsius *float64 `json:"celsius,omitempty"`
|
||||||
|
ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"`
|
||||||
|
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
|
||||||
|
Status *string `json:"status,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type HardwareOtherSensor struct {
|
||||||
|
Name string `json:"name"`
|
||||||
|
Location *string `json:"location,omitempty"`
|
||||||
|
Value *float64 `json:"value,omitempty"`
|
||||||
|
Unit *string `json:"unit,omitempty"`
|
||||||
|
Status *string `json:"status,omitempty"`
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,4 +9,5 @@ Generic engineering rules live in `bible/rules/patterns/`.
|
|||||||
|---|---|
|
|---|---|
|
||||||
| `architecture/system-overview.md` | What bee does, scope, tech stack |
|
| `architecture/system-overview.md` | What bee does, scope, tech stack |
|
||||||
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
|
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
|
||||||
|
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
||||||
| `decisions/` | Architectural decision log |
|
| `decisions/` | Architectural decision log |
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
Hardware audit LiveCD. Boots on a server via BMC virtual media or USB.
|
Hardware audit LiveCD. Boots on a server via BMC virtual media or USB.
|
||||||
Collects hardware inventory at OS level (not through BMC/Redfish).
|
Collects hardware inventory at OS level (not through BMC/Redfish).
|
||||||
Produces `HardwareIngestRequest` JSON compatible with core/reanimator.
|
Produces `HardwareIngestRequest` JSON compatible with the contract in `bible-local/docs/hardware-ingest-contract.md`.
|
||||||
|
|
||||||
## Why it exists
|
## Why it exists
|
||||||
|
|
||||||
@@ -46,6 +46,16 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
- Anything requiring persistent storage on the audited machine
|
- Anything requiring persistent storage on the audited machine
|
||||||
- Windows support
|
- Windows support
|
||||||
- Any functionality requiring internet access at boot
|
- Any functionality requiring internet access at boot
|
||||||
|
- Component lifecycle/history across multiple snapshots
|
||||||
|
- Status transition history (`status_history`, `status_changed_at`) derived from previous exports
|
||||||
|
- Replacement detection between two or more audit runs
|
||||||
|
|
||||||
|
## Contract boundary
|
||||||
|
|
||||||
|
- `bee` is responsible for the current hardware snapshot only.
|
||||||
|
- `bee` should populate current component state, hardware inventory, telemetry, and `status_checked_at`.
|
||||||
|
- Historical status transitions and component replacement logic belong to the centralized ingest/lifecycle system, not to `bee`.
|
||||||
|
- Contract fields that have no honest local source on a generic Linux host may remain empty.
|
||||||
|
|
||||||
## Tech stack
|
## Tech stack
|
||||||
|
|
||||||
|
|||||||
@@ -18,3 +18,51 @@
|
|||||||
- точнее классифицировать vendor-specific self-test outputs в `storage SAT`
|
- точнее классифицировать vendor-specific self-test outputs в `storage SAT`
|
||||||
- подобрать дефолты `memtester` по объёму RAM на целевых машинах
|
- подобрать дефолты `memtester` по объёму RAM на целевых машинах
|
||||||
- при необходимости расширить `bee-gpu-stress` по длительности/нагрузке
|
- при необходимости расширить `bee-gpu-stress` по длительности/нагрузке
|
||||||
|
|
||||||
|
## Hardware Contract backlog
|
||||||
|
|
||||||
|
**Статус:** уточнён, сокращён до `bee`-only snapshot scope.
|
||||||
|
|
||||||
|
### Не backlog для `bee`
|
||||||
|
|
||||||
|
Эти задачи не должны реализовываться в `bee`, потому что относятся к централизованному ingest/lifecycle слою:
|
||||||
|
- `status_history`
|
||||||
|
- `status_changed_at`
|
||||||
|
- определение замены компонента между snapshot'ами
|
||||||
|
- timeline/lifecycle/history по diff между экспортами
|
||||||
|
|
||||||
|
`bee` отвечает только за текущий snapshot железа и `status_checked_at`.
|
||||||
|
|
||||||
|
### Реализуемо инкрементально
|
||||||
|
|
||||||
|
Эти поля можно развивать дальше по мере появления реальных sample outputs и vendor-specific parser'ов:
|
||||||
|
- `cpus.correctable_error_count`
|
||||||
|
- `cpus.uncorrectable_error_count`
|
||||||
|
- `power_supplies.life_remaining_pct`
|
||||||
|
- `power_supplies.life_used_pct`
|
||||||
|
- `pcie_devices.battery_charge_pct`
|
||||||
|
- `pcie_devices.battery_health_pct`
|
||||||
|
- `pcie_devices.battery_temperature_c`
|
||||||
|
- `pcie_devices.battery_voltage_v`
|
||||||
|
- `pcie_devices.battery_replace_required`
|
||||||
|
|
||||||
|
### Vendor/platform-specific, часто пустые
|
||||||
|
|
||||||
|
Эти поля допустимо оставлять пустыми на части платформ даже после реализации parser'ов:
|
||||||
|
- `power_supplies.life_remaining_pct`
|
||||||
|
- `power_supplies.life_used_pct`
|
||||||
|
- часть `pcie_devices.battery_*` для неподдержанных RAID/NIC/GPU вендоров
|
||||||
|
|
||||||
|
### Unsupported в `bee`
|
||||||
|
|
||||||
|
Эти поля считаются нереалистичными для общего OS-level hardware snapshotter без synthetic/fake data:
|
||||||
|
- `cpus.life_remaining_pct`
|
||||||
|
- `cpus.life_used_pct`
|
||||||
|
- `memory.life_remaining_pct`
|
||||||
|
- `memory.life_used_pct`
|
||||||
|
- `memory.spare_blocks_remaining_pct`
|
||||||
|
- `memory.performance_degraded`
|
||||||
|
|
||||||
|
Причина: при аудите типичного Linux-хоста, как правило, нет честного vendor-neutral runtime source для этих метрик.
|
||||||
|
|
||||||
|
Эти поля считаются исключёнными из backlog `bee` и не должны возвращаться в план работ без появления нового доказуемого локального источника данных на целевых машинах.
|
||||||
|
|||||||
730
bible-local/docs/hardware-ingest-contract.md
Normal file
730
bible-local/docs/hardware-ingest-contract.md
Normal file
@@ -0,0 +1,730 @@
|
|||||||
|
---
|
||||||
|
title: Hardware Ingest JSON Contract
|
||||||
|
version: "2.4"
|
||||||
|
updated: "2026-03-15"
|
||||||
|
maintainer: Reanimator Core
|
||||||
|
audience: external-integrators, ai-agents
|
||||||
|
language: ru
|
||||||
|
---
|
||||||
|
|
||||||
|
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
|
||||||
|
|
||||||
|
Версия: **2.4** · Дата: **2026-03-15**
|
||||||
|
|
||||||
|
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
|
||||||
|
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
|
||||||
|
|
||||||
|
> Актуальная версия документа: https://git.mchus.pro/reanimator/core/src/branch/main/bible-local/docs/hardware-ingest-contract.md
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Changelog
|
||||||
|
|
||||||
|
| Версия | Дата | Изменения |
|
||||||
|
|--------|------|-----------|
|
||||||
|
| 2.4 | 2026-03-15 | Добавлена первая волна component telemetry: health/life поля для `cpus`, `memory`, `storage`, `pcie_devices`, `power_supplies` |
|
||||||
|
| 2.3 | 2026-03-15 | Добавлены component telemetry поля: `pcie_devices.temperature_c`, `pcie_devices.power_w`, `power_supplies.temperature_c` |
|
||||||
|
| 2.2 | 2026-03-15 | Добавлено поле `numa_node` у `pcie_devices` для topology/affinity |
|
||||||
|
| 2.1 | 2026-03-15 | Добавлена секция `sensors` (fans, power, temperatures, other); поле `mac_addresses` у `pcie_devices`; расширен список значений `device_class` |
|
||||||
|
| 2.0 | 2026-02-01 | История статусов (`status_history`, `status_changed_at`); поля telemetry у PSU; async job response |
|
||||||
|
| 1.0 | 2026-01-01 | Начальная версия контракта |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Принципы
|
||||||
|
|
||||||
|
1. **Snapshot** — JSON описывает состояние сервера на момент сбора. Может включать историю изменений статуса компонентов.
|
||||||
|
2. **Идемпотентность** — повторная отправка идентичного payload не создаёт дублей (дедупликация по хешу).
|
||||||
|
3. **Частичность** — можно передавать только те секции, данные по которым доступны. Пустой массив и отсутствие секции эквивалентны.
|
||||||
|
4. **Строгая схема** — endpoint использует строгий JSON-декодер; неизвестные поля приводят к `400 Bad Request`.
|
||||||
|
5. **Event-driven** — импорт создаёт события в timeline (LOG_COLLECTED, INSTALLED, REMOVED, FIRMWARE_CHANGED и др.).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Endpoint
|
||||||
|
|
||||||
|
```
|
||||||
|
POST /ingest/hardware
|
||||||
|
Content-Type: application/json
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ответ при приёме (202 Accepted):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "accepted",
|
||||||
|
"job_id": "job_01J..."
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Импорт выполняется асинхронно. Результат доступен по:
|
||||||
|
```
|
||||||
|
GET /ingest/hardware/jobs/{job_id}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ответ при успехе задачи:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "success",
|
||||||
|
"bundle_id": "lb_01J...",
|
||||||
|
"asset_id": "mach_01J...",
|
||||||
|
"collected_at": "2026-02-10T15:30:00Z",
|
||||||
|
"duplicate": false,
|
||||||
|
"summary": {
|
||||||
|
"parts_observed": 15,
|
||||||
|
"parts_created": 2,
|
||||||
|
"parts_updated": 13,
|
||||||
|
"installations_created": 2,
|
||||||
|
"installations_closed": 1,
|
||||||
|
"timeline_events_created": 9,
|
||||||
|
"failure_events_created": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ответ при дубликате:**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "success",
|
||||||
|
"duplicate": true,
|
||||||
|
"message": "LogBundle with this content hash already exists"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Ответ при ошибке (400 Bad Request):**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"status": "error",
|
||||||
|
"error": "validation_failed",
|
||||||
|
"details": {
|
||||||
|
"field": "hardware.board.serial_number",
|
||||||
|
"message": "serial_number is required"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Частые причины `400`:
|
||||||
|
- Неверный формат `collected_at` (требуется RFC3339).
|
||||||
|
- Пустой `hardware.board.serial_number`.
|
||||||
|
- Наличие неизвестного JSON-поля на любом уровне.
|
||||||
|
- Тело запроса превышает допустимый размер.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Структура верхнего уровня
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"filename": "redfish://10.10.10.103",
|
||||||
|
"source_type": "api",
|
||||||
|
"protocol": "redfish",
|
||||||
|
"target_host": "10.10.10.103",
|
||||||
|
"collected_at": "2026-02-10T15:30:00Z",
|
||||||
|
"hardware": {
|
||||||
|
"board": { ... },
|
||||||
|
"firmware": [ ... ],
|
||||||
|
"cpus": [ ... ],
|
||||||
|
"memory": [ ... ],
|
||||||
|
"storage": [ ... ],
|
||||||
|
"pcie_devices": [ ... ],
|
||||||
|
"power_supplies": [ ... ],
|
||||||
|
"sensors": { ... }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Поля верхнего уровня
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `collected_at` | string RFC3339 | **да** | Время сбора данных |
|
||||||
|
| `hardware` | object | **да** | Аппаратный снапшот |
|
||||||
|
| `hardware.board.serial_number` | string | **да** | Серийный номер платы/сервера |
|
||||||
|
| `target_host` | string | нет | IP или hostname |
|
||||||
|
| `source_type` | string | нет | Тип источника: `api`, `logfile`, `manual` |
|
||||||
|
| `protocol` | string | нет | Протокол: `redfish`, `ipmi`, `snmp`, `ssh` |
|
||||||
|
| `filename` | string | нет | Идентификатор источника |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Общие поля статуса компонентов
|
||||||
|
|
||||||
|
Применяются ко всем компонентным секциям (`cpus`, `memory`, `storage`, `pcie_devices`, `power_supplies`).
|
||||||
|
|
||||||
|
| Поле | Тип | Описание |
|
||||||
|
|------|-----|----------|
|
||||||
|
| `status` | string | Текущий статус: `OK`, `Warning`, `Critical`, `Unknown`, `Empty` |
|
||||||
|
| `status_checked_at` | string RFC3339 | Время последней проверки статуса |
|
||||||
|
| `status_changed_at` | string RFC3339 | Время последнего изменения статуса |
|
||||||
|
| `status_history` | array | История переходов статусов (см. ниже) |
|
||||||
|
| `error_description` | string | Текст ошибки/диагностики |
|
||||||
|
|
||||||
|
**Объект `status_history[]`:**
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `status` | string | **да** | Статус в этот момент |
|
||||||
|
| `changed_at` | string RFC3339 | **да** | Время перехода (без этого поля запись игнорируется) |
|
||||||
|
| `details` | string | нет | Пояснение к переходу |
|
||||||
|
|
||||||
|
**Правила приоритета времени события:**
|
||||||
|
|
||||||
|
1. `status_changed_at`
|
||||||
|
2. Последняя запись `status_history` с совпадающим статусом
|
||||||
|
3. Последняя корректно разбираемая запись `status_history`
|
||||||
|
4. `status_checked_at`
|
||||||
|
|
||||||
|
**Правила передачи статусов:**
|
||||||
|
- Передавайте `status` как текущее состояние компонента в snapshot.
|
||||||
|
- Если источник хранит историю — передавайте `status_history` отсортированным по `changed_at` по возрастанию.
|
||||||
|
- Не включайте записи `status_history` без `changed_at`.
|
||||||
|
- Все даты — RFC3339, рекомендуется UTC (`Z`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Секции hardware
|
||||||
|
|
||||||
|
### board
|
||||||
|
|
||||||
|
Основная информация о сервере. Обязательная секция.
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `serial_number` | string | **да** | Серийный номер (ключ идентификации Asset) |
|
||||||
|
| `manufacturer` | string | нет | Производитель |
|
||||||
|
| `product_name` | string | нет | Модель |
|
||||||
|
| `part_number` | string | нет | Партномер |
|
||||||
|
| `uuid` | string | нет | UUID системы |
|
||||||
|
|
||||||
|
Значения `"NULL"` в строковых полях трактуются как отсутствие данных.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"board": {
|
||||||
|
"manufacturer": "Supermicro",
|
||||||
|
"product_name": "X12DPG-QT6",
|
||||||
|
"serial_number": "21D634101",
|
||||||
|
"part_number": "X12DPG-QT6-REV1.01",
|
||||||
|
"uuid": "d7ef2fe5-2fd0-11f0-910a-346f11040868"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### firmware
|
||||||
|
|
||||||
|
Версии прошивок системных компонентов (BIOS, BMC, CPLD и др.).
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `device_name` | string | **да** | Название устройства (`BIOS`, `BMC`, `CPLD`, …) |
|
||||||
|
| `version` | string | **да** | Версия прошивки |
|
||||||
|
|
||||||
|
Записи с пустым `device_name` или `version` игнорируются.
|
||||||
|
Изменение версии создаёт событие `FIRMWARE_CHANGED` для Asset.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"firmware": [
|
||||||
|
{ "device_name": "BIOS", "version": "06.08.05" },
|
||||||
|
{ "device_name": "BMC", "version": "5.17.00" },
|
||||||
|
{ "device_name": "CPLD", "version": "01.02.03" }
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### cpus
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `socket` | int | **да** | Номер сокета (используется для генерации serial) |
|
||||||
|
| `model` | string | нет | Модель процессора |
|
||||||
|
| `manufacturer` | string | нет | Производитель |
|
||||||
|
| `cores` | int | нет | Количество ядер |
|
||||||
|
| `threads` | int | нет | Количество потоков |
|
||||||
|
| `frequency_mhz` | int | нет | Текущая частота |
|
||||||
|
| `max_frequency_mhz` | int | нет | Максимальная частота |
|
||||||
|
| `temperature_c` | float | нет | Температура CPU, °C (telemetry) |
|
||||||
|
| `power_w` | float | нет | Текущая мощность CPU, Вт (telemetry) |
|
||||||
|
| `throttled` | bool | нет | Зафиксирован thermal/power throttling |
|
||||||
|
| `correctable_error_count` | int | нет | Количество корректируемых ошибок CPU |
|
||||||
|
| `uncorrectable_error_count` | int | нет | Количество некорректируемых ошибок CPU |
|
||||||
|
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
|
||||||
|
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
|
||||||
|
| `serial_number` | string | нет | Серийный номер (если доступен) |
|
||||||
|
| `firmware` | string | нет | Версия микрокода |
|
||||||
|
| `present` | bool | нет | Наличие (по умолчанию `true`) |
|
||||||
|
| + общие поля статуса | | | см. раздел выше |
|
||||||
|
|
||||||
|
**Генерация serial_number при отсутствии:** `{board_serial}-CPU-{socket}`
|
||||||
|
|
||||||
|
```json
|
||||||
|
"cpus": [
|
||||||
|
{
|
||||||
|
"socket": 0,
|
||||||
|
"model": "INTEL(R) XEON(R) GOLD 6530",
|
||||||
|
"cores": 32,
|
||||||
|
"threads": 64,
|
||||||
|
"frequency_mhz": 2100,
|
||||||
|
"max_frequency_mhz": 4000,
|
||||||
|
"temperature_c": 61.5,
|
||||||
|
"power_w": 182.0,
|
||||||
|
"throttled": false,
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"status": "OK",
|
||||||
|
"status_checked_at": "2026-02-10T15:28:00Z"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### memory
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `slot` | string | нет | Идентификатор слота |
|
||||||
|
| `location` | string | нет | Физическое расположение |
|
||||||
|
| `present` | bool | нет | Наличие модуля (по умолчанию `true`) |
|
||||||
|
| `serial_number` | string | нет | Серийный номер |
|
||||||
|
| `part_number` | string | нет | Партномер (используется как модель) |
|
||||||
|
| `manufacturer` | string | нет | Производитель |
|
||||||
|
| `size_mb` | int | нет | Объём в МБ |
|
||||||
|
| `type` | string | нет | Тип: `DDR3`, `DDR4`, `DDR5`, … |
|
||||||
|
| `max_speed_mhz` | int | нет | Максимальная частота |
|
||||||
|
| `current_speed_mhz` | int | нет | Текущая частота |
|
||||||
|
| `temperature_c` | float | нет | Температура DIMM/модуля, °C (telemetry) |
|
||||||
|
| `correctable_ecc_error_count` | int | нет | Количество корректируемых ECC-ошибок |
|
||||||
|
| `uncorrectable_ecc_error_count` | int | нет | Количество некорректируемых ECC-ошибок |
|
||||||
|
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
|
||||||
|
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
|
||||||
|
| `spare_blocks_remaining_pct` | float | нет | Остаток spare blocks, % |
|
||||||
|
| `performance_degraded` | bool | нет | Зафиксирована деградация производительности |
|
||||||
|
| `data_loss_detected` | bool | нет | Источник сигнализирует риск/факт потери данных |
|
||||||
|
| + общие поля статуса | | | см. раздел выше |
|
||||||
|
|
||||||
|
Модуль без `serial_number` игнорируется. Модуль с `present=false` или `status=Empty` игнорируется.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"memory": [
|
||||||
|
{
|
||||||
|
"slot": "CPU0_C0D0",
|
||||||
|
"present": true,
|
||||||
|
"size_mb": 32768,
|
||||||
|
"type": "DDR5",
|
||||||
|
"max_speed_mhz": 4800,
|
||||||
|
"current_speed_mhz": 4800,
|
||||||
|
"temperature_c": 43.0,
|
||||||
|
"correctable_ecc_error_count": 0,
|
||||||
|
"manufacturer": "Hynix",
|
||||||
|
"serial_number": "80AD032419E17CEEC1",
|
||||||
|
"part_number": "HMCG88AGBRA191N",
|
||||||
|
"status": "OK"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### storage
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `slot` | string | нет | Идентификатор слота |
|
||||||
|
| `serial_number` | string | нет | Серийный номер |
|
||||||
|
| `model` | string | нет | Модель |
|
||||||
|
| `manufacturer` | string | нет | Производитель |
|
||||||
|
| `type` | string | нет | Тип: `NVMe`, `SSD`, `HDD` |
|
||||||
|
| `interface` | string | нет | Интерфейс: `NVMe`, `SATA`, `SAS` |
|
||||||
|
| `size_gb` | int | нет | Размер в ГБ |
|
||||||
|
| `temperature_c` | float | нет | Температура накопителя, °C (telemetry) |
|
||||||
|
| `power_on_hours` | int64 | нет | Время работы, часы |
|
||||||
|
| `power_cycles` | int64 | нет | Количество циклов питания |
|
||||||
|
| `unsafe_shutdowns` | int64 | нет | Нештатные выключения |
|
||||||
|
| `media_errors` | int64 | нет | Ошибки носителя / media errors |
|
||||||
|
| `error_log_entries` | int64 | нет | Количество записей в error log |
|
||||||
|
| `written_bytes` | int64 | нет | Всего записано байт |
|
||||||
|
| `read_bytes` | int64 | нет | Всего прочитано байт |
|
||||||
|
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
|
||||||
|
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
|
||||||
|
| `available_spare_pct` | float | нет | Доступный spare, % |
|
||||||
|
| `reallocated_sectors` | int64 | нет | Переназначенные сектора |
|
||||||
|
| `current_pending_sectors` | int64 | нет | Сектора в ожидании ремапа |
|
||||||
|
| `offline_uncorrectable` | int64 | нет | Некорректируемые ошибки offline scan |
|
||||||
|
| `firmware` | string | нет | Версия прошивки |
|
||||||
|
| `present` | bool | нет | Наличие (по умолчанию `true`) |
|
||||||
|
| + общие поля статуса | | | см. раздел выше |
|
||||||
|
|
||||||
|
Диск без `serial_number` игнорируется. Изменение `firmware` создаёт событие `FIRMWARE_CHANGED`.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"storage": [
|
||||||
|
{
|
||||||
|
"slot": "OB01",
|
||||||
|
"type": "NVMe",
|
||||||
|
"model": "INTEL SSDPF2KX076T1",
|
||||||
|
"size_gb": 7680,
|
||||||
|
"temperature_c": 38.5,
|
||||||
|
"power_on_hours": 12450,
|
||||||
|
"unsafe_shutdowns": 3,
|
||||||
|
"written_bytes": 9876543210,
|
||||||
|
"life_remaining_pct": 91.0,
|
||||||
|
"serial_number": "BTAX41900GF87P6DGN",
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"firmware": "9CV10510",
|
||||||
|
"interface": "NVMe",
|
||||||
|
"present": true,
|
||||||
|
"status": "OK"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### pcie_devices
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `slot` | string | нет | Идентификатор слота |
|
||||||
|
| `vendor_id` | int | нет | PCI Vendor ID (decimal) |
|
||||||
|
| `device_id` | int | нет | PCI Device ID (decimal) |
|
||||||
|
| `numa_node` | int | нет | NUMA node / CPU affinity устройства |
|
||||||
|
| `temperature_c` | float | нет | Температура устройства, °C (telemetry) |
|
||||||
|
| `power_w` | float | нет | Текущее энергопотребление устройства, Вт (telemetry) |
|
||||||
|
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
|
||||||
|
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
|
||||||
|
| `ecc_corrected_total` | int64 | нет | Всего корректируемых ECC-ошибок |
|
||||||
|
| `ecc_uncorrected_total` | int64 | нет | Всего некорректируемых ECC-ошибок |
|
||||||
|
| `hw_slowdown` | bool | нет | Устройство вошло в hardware slowdown / protective mode |
|
||||||
|
| `battery_charge_pct` | float | нет | Заряд батареи / supercap, % |
|
||||||
|
| `battery_health_pct` | float | нет | Состояние батареи / supercap, % |
|
||||||
|
| `battery_temperature_c` | float | нет | Температура батареи / supercap, °C |
|
||||||
|
| `battery_voltage_v` | float | нет | Напряжение батареи / supercap, В |
|
||||||
|
| `battery_replace_required` | bool | нет | Требуется замена батареи / supercap |
|
||||||
|
| `sfp_temperature_c` | float | нет | Температура SFP/optic, °C |
|
||||||
|
| `sfp_tx_power_dbm` | float | нет | TX optical power, dBm |
|
||||||
|
| `sfp_rx_power_dbm` | float | нет | RX optical power, dBm |
|
||||||
|
| `sfp_voltage_v` | float | нет | Напряжение SFP, В |
|
||||||
|
| `sfp_bias_ma` | float | нет | Bias current SFP, мА |
|
||||||
|
| `bdf` | string | нет | Bus:Device.Function, например `0000:18:00.0` |
|
||||||
|
| `device_class` | string | нет | Класс устройства (см. список ниже) |
|
||||||
|
| `manufacturer` | string | нет | Производитель |
|
||||||
|
| `model` | string | нет | Модель |
|
||||||
|
| `serial_number` | string | нет | Серийный номер |
|
||||||
|
| `firmware` | string | нет | Версия прошивки |
|
||||||
|
| `link_width` | int | нет | Текущая ширина линка |
|
||||||
|
| `link_speed` | string | нет | Текущая скорость: `Gen3`, `Gen4`, `Gen5` |
|
||||||
|
| `max_link_width` | int | нет | Максимальная ширина линка |
|
||||||
|
| `max_link_speed` | string | нет | Максимальная скорость |
|
||||||
|
| `mac_addresses` | string[] | нет | MAC-адреса портов (для сетевых устройств) |
|
||||||
|
| `present` | bool | нет | Наличие (по умолчанию `true`) |
|
||||||
|
| + общие поля статуса | | | см. раздел выше |
|
||||||
|
|
||||||
|
`numa_node` передавайте для NIC / InfiniBand / RAID / GPU, когда источник знает CPU/NUMA affinity. Поле сохраняется в snapshot-атрибутах PCIe-компонента и дублируется в telemetry для topology use cases.
|
||||||
|
Поля `temperature_c` и `power_w` используйте для device-level telemetry GPU / accelerator / smart PCIe devices. Они не влияют на идентификацию компонента.
|
||||||
|
|
||||||
|
**Генерация serial_number при отсутствии или `"N/A"`:** `{board_serial}-PCIE-{slot}`
|
||||||
|
|
||||||
|
**Значения `device_class`:**
|
||||||
|
|
||||||
|
| Значение | Назначение |
|
||||||
|
|----------|------------|
|
||||||
|
| `MassStorageController` | RAID-контроллеры |
|
||||||
|
| `StorageController` | HBA, SAS-контроллеры |
|
||||||
|
| `NetworkController` | Сетевые адаптеры (InfiniBand, общий) |
|
||||||
|
| `EthernetController` | Ethernet NIC |
|
||||||
|
| `FibreChannelController` | Fibre Channel HBA |
|
||||||
|
| `VideoController` | GPU, видеокарты |
|
||||||
|
| `ProcessingAccelerator` | Вычислительные ускорители (AI/ML) |
|
||||||
|
| `DisplayController` | Контроллеры дисплея (BMC VGA) |
|
||||||
|
|
||||||
|
Список открытый: допускаются произвольные строки для нестандартных классов.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"pcie_devices": [
|
||||||
|
{
|
||||||
|
"slot": "PCIeCard2",
|
||||||
|
"vendor_id": 5555,
|
||||||
|
"device_id": 4401,
|
||||||
|
"numa_node": 0,
|
||||||
|
"temperature_c": 48.5,
|
||||||
|
"power_w": 18.2,
|
||||||
|
"sfp_temperature_c": 36.2,
|
||||||
|
"sfp_tx_power_dbm": -1.8,
|
||||||
|
"sfp_rx_power_dbm": -2.1,
|
||||||
|
"bdf": "0000:3b:00.0",
|
||||||
|
"device_class": "EthernetController",
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"model": "X710 10GbE",
|
||||||
|
"serial_number": "K65472-003",
|
||||||
|
"firmware": "9.20 0x8000d4ae",
|
||||||
|
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
|
||||||
|
"status": "OK"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### power_supplies
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `slot` | string | нет | Идентификатор слота |
|
||||||
|
| `present` | bool | нет | Наличие (по умолчанию `true`) |
|
||||||
|
| `serial_number` | string | нет | Серийный номер |
|
||||||
|
| `part_number` | string | нет | Партномер |
|
||||||
|
| `model` | string | нет | Модель |
|
||||||
|
| `vendor` | string | нет | Производитель |
|
||||||
|
| `wattage_w` | int | нет | Мощность в ваттах |
|
||||||
|
| `firmware` | string | нет | Версия прошивки |
|
||||||
|
| `input_type` | string | нет | Тип входа (например `ACWideRange`) |
|
||||||
|
| `input_voltage` | float | нет | Входное напряжение, В (telemetry) |
|
||||||
|
| `input_power_w` | float | нет | Входная мощность, Вт (telemetry) |
|
||||||
|
| `output_power_w` | float | нет | Выходная мощность, Вт (telemetry) |
|
||||||
|
| `temperature_c` | float | нет | Температура PSU, °C (telemetry) |
|
||||||
|
| `life_remaining_pct` | float | нет | Остаточный ресурс / health, % |
|
||||||
|
| `life_used_pct` | float | нет | Использованный ресурс / wear, % |
|
||||||
|
| + общие поля статуса | | | см. раздел выше |
|
||||||
|
|
||||||
|
Поля telemetry (`input_voltage`, `input_power_w`, `output_power_w`, `temperature_c`, `life_remaining_pct`, `life_used_pct`) сохраняются в атрибутах компонента и не влияют на его идентификацию.
|
||||||
|
|
||||||
|
PSU без `serial_number` игнорируется.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"power_supplies": [
|
||||||
|
{
|
||||||
|
"slot": "0",
|
||||||
|
"present": true,
|
||||||
|
"model": "GW-CRPS3000LW",
|
||||||
|
"vendor": "Great Wall",
|
||||||
|
"wattage_w": 3000,
|
||||||
|
"serial_number": "2P06C102610",
|
||||||
|
"firmware": "00.03.05",
|
||||||
|
"status": "OK",
|
||||||
|
"input_type": "ACWideRange",
|
||||||
|
"input_power_w": 137,
|
||||||
|
"output_power_w": 104,
|
||||||
|
"input_voltage": 215.25,
|
||||||
|
"temperature_c": 39.5,
|
||||||
|
"life_remaining_pct": 97.0
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### sensors
|
||||||
|
|
||||||
|
Показания сенсоров сервера. Секция опциональная, не привязана к компонентам.
|
||||||
|
Данные хранятся как последнее известное значение (last-known-value) на уровне Asset.
|
||||||
|
|
||||||
|
```json
|
||||||
|
"sensors": {
|
||||||
|
"fans": [ ... ],
|
||||||
|
"power": [ ... ],
|
||||||
|
"temperatures": [ ... ],
|
||||||
|
"other": [ ... ]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### sensors.fans
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `name` | string | **да** | Уникальное имя сенсора в рамках секции |
|
||||||
|
| `location` | string | нет | Физическое расположение |
|
||||||
|
| `rpm` | int | нет | Обороты, RPM |
|
||||||
|
| `status` | string | нет | Статус: `OK`, `Warning`, `Critical`, `Unknown` |
|
||||||
|
|
||||||
|
#### sensors.power
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `name` | string | **да** | Уникальное имя сенсора |
|
||||||
|
| `location` | string | нет | Физическое расположение |
|
||||||
|
| `voltage_v` | float | нет | Напряжение, В |
|
||||||
|
| `current_a` | float | нет | Ток, А |
|
||||||
|
| `power_w` | float | нет | Мощность, Вт |
|
||||||
|
| `status` | string | нет | Статус |
|
||||||
|
|
||||||
|
#### sensors.temperatures
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `name` | string | **да** | Уникальное имя сенсора |
|
||||||
|
| `location` | string | нет | Физическое расположение |
|
||||||
|
| `celsius` | float | нет | Температура, °C |
|
||||||
|
| `threshold_warning_celsius` | float | нет | Порог Warning, °C |
|
||||||
|
| `threshold_critical_celsius` | float | нет | Порог Critical, °C |
|
||||||
|
| `status` | string | нет | Статус |
|
||||||
|
|
||||||
|
#### sensors.other
|
||||||
|
|
||||||
|
| Поле | Тип | Обязательно | Описание |
|
||||||
|
|------|-----|-------------|----------|
|
||||||
|
| `name` | string | **да** | Уникальное имя сенсора |
|
||||||
|
| `location` | string | нет | Физическое расположение |
|
||||||
|
| `value` | float | нет | Значение |
|
||||||
|
| `unit` | string | нет | Единица измерения |
|
||||||
|
| `status` | string | нет | Статус |
|
||||||
|
|
||||||
|
**Правила sensors:**
|
||||||
|
- Идентификатор сенсора: пара `(sensor_type, name)`. Дубли в одном payload — берётся первое вхождение.
|
||||||
|
- Сенсоры без `name` игнорируются.
|
||||||
|
- При каждом импорте значения перезаписываются (upsert по ключу).
|
||||||
|
|
||||||
|
```json
|
||||||
|
"sensors": {
|
||||||
|
"fans": [
|
||||||
|
{ "name": "FAN1", "location": "Front", "rpm": 4200, "status": "OK" },
|
||||||
|
{ "name": "FAN_CPU0", "location": "CPU0", "rpm": 5600, "status": "OK" }
|
||||||
|
],
|
||||||
|
"power": [
|
||||||
|
{ "name": "12V Rail", "location": "Mainboard", "voltage_v": 12.06, "status": "OK" },
|
||||||
|
{ "name": "PSU0 Input", "location": "PSU0", "voltage_v": 215.25, "current_a": 0.64, "power_w": 137.0, "status": "OK" }
|
||||||
|
],
|
||||||
|
"temperatures": [
|
||||||
|
{ "name": "CPU0 Temp", "location": "CPU0", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" },
|
||||||
|
{ "name": "Inlet Temp", "location": "Front", "celsius": 22.0, "threshold_warning_celsius": 40.0, "threshold_critical_celsius": 50.0, "status": "OK" }
|
||||||
|
],
|
||||||
|
"other": [
|
||||||
|
{ "name": "System Humidity", "value": 38.5, "unit": "%", "status": "OK" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Обработка статусов компонентов
|
||||||
|
|
||||||
|
| Статус | Поведение |
|
||||||
|
|--------|-----------|
|
||||||
|
| `OK` | Нормальная обработка |
|
||||||
|
| `Warning` | Создаётся событие `COMPONENT_WARNING` |
|
||||||
|
| `Critical` | Создаётся событие `COMPONENT_FAILED` + запись в `failure_events` |
|
||||||
|
| `Unknown` | Компонент считается рабочим, создаётся событие `COMPONENT_UNKNOWN` |
|
||||||
|
| `Empty` | Компонент не создаётся/не обновляется |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Обработка отсутствующих serial_number
|
||||||
|
|
||||||
|
| Тип | Поведение |
|
||||||
|
|-----|-----------|
|
||||||
|
| CPU | Генерируется: `{board_serial}-CPU-{socket}` |
|
||||||
|
| PCIe | Генерируется: `{board_serial}-PCIE-{slot}` (если serial = `"N/A"` или пустой) |
|
||||||
|
| Memory | Компонент игнорируется |
|
||||||
|
| Storage | Компонент игнорируется |
|
||||||
|
| PSU | Компонент игнорируется |
|
||||||
|
|
||||||
|
Если `serial_number` не уникален внутри одного payload для того же `model`:
|
||||||
|
- Первое вхождение сохраняет оригинальный серийный номер.
|
||||||
|
- Каждое следующее дублирующее вхождение получает placeholder: `NO_SN-XXXXXXXX`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Минимальный валидный пример
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"collected_at": "2026-02-10T15:30:00Z",
|
||||||
|
"target_host": "192.168.1.100",
|
||||||
|
"hardware": {
|
||||||
|
"board": {
|
||||||
|
"serial_number": "SRV-001"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Полный пример с историей статусов
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"filename": "redfish://10.10.10.103",
|
||||||
|
"source_type": "api",
|
||||||
|
"protocol": "redfish",
|
||||||
|
"target_host": "10.10.10.103",
|
||||||
|
"collected_at": "2026-02-10T15:30:00Z",
|
||||||
|
"hardware": {
|
||||||
|
"board": {
|
||||||
|
"manufacturer": "Supermicro",
|
||||||
|
"product_name": "X12DPG-QT6",
|
||||||
|
"serial_number": "21D634101"
|
||||||
|
},
|
||||||
|
"firmware": [
|
||||||
|
{ "device_name": "BIOS", "version": "06.08.05" },
|
||||||
|
{ "device_name": "BMC", "version": "5.17.00" }
|
||||||
|
],
|
||||||
|
"cpus": [
|
||||||
|
{
|
||||||
|
"socket": 0,
|
||||||
|
"model": "INTEL(R) XEON(R) GOLD 6530",
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"cores": 32,
|
||||||
|
"threads": 64,
|
||||||
|
"status": "OK"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"storage": [
|
||||||
|
{
|
||||||
|
"slot": "OB01",
|
||||||
|
"type": "NVMe",
|
||||||
|
"model": "INTEL SSDPF2KX076T1",
|
||||||
|
"size_gb": 7680,
|
||||||
|
"serial_number": "BTAX41900GF87P6DGN",
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"firmware": "9CV10510",
|
||||||
|
"present": true,
|
||||||
|
"status": "OK",
|
||||||
|
"status_changed_at": "2026-02-10T15:22:00Z",
|
||||||
|
"status_history": [
|
||||||
|
{ "status": "Critical", "changed_at": "2026-02-10T15:10:00Z", "details": "I/O timeout on NVMe queue 3" },
|
||||||
|
{ "status": "OK", "changed_at": "2026-02-10T15:22:00Z", "details": "Recovered after controller reset" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"pcie_devices": [
|
||||||
|
{
|
||||||
|
"slot": "PCIeCard1",
|
||||||
|
"device_class": "EthernetController",
|
||||||
|
"manufacturer": "Intel",
|
||||||
|
"model": "X710 10GbE",
|
||||||
|
"serial_number": "K65472-003",
|
||||||
|
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
|
||||||
|
"status": "OK"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"power_supplies": [
|
||||||
|
{
|
||||||
|
"slot": "0",
|
||||||
|
"present": true,
|
||||||
|
"model": "GW-CRPS3000LW",
|
||||||
|
"vendor": "Great Wall",
|
||||||
|
"wattage_w": 3000,
|
||||||
|
"serial_number": "2P06C102610",
|
||||||
|
"firmware": "00.03.05",
|
||||||
|
"status": "OK",
|
||||||
|
"input_power_w": 137,
|
||||||
|
"output_power_w": 104,
|
||||||
|
"input_voltage": 215.25
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"sensors": {
|
||||||
|
"fans": [
|
||||||
|
{ "name": "FAN1", "location": "Front", "rpm": 4200, "status": "OK" }
|
||||||
|
],
|
||||||
|
"power": [
|
||||||
|
{ "name": "12V Rail", "voltage_v": 12.06, "status": "OK" }
|
||||||
|
],
|
||||||
|
"temperatures": [
|
||||||
|
{ "name": "CPU0 Temp", "celsius": 46.0, "threshold_warning_celsius": 80.0, "threshold_critical_celsius": 95.0, "status": "OK" }
|
||||||
|
],
|
||||||
|
"other": [
|
||||||
|
{ "name": "System Humidity", "value": 38.5, "unit": "%" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
@@ -12,6 +12,7 @@ iproute2
|
|||||||
isc-dhcp-client
|
isc-dhcp-client
|
||||||
iputils-ping
|
iputils-ping
|
||||||
ethtool
|
ethtool
|
||||||
|
lm-sensors
|
||||||
qemu-guest-agent
|
qemu-guest-agent
|
||||||
|
|
||||||
# SSH
|
# SSH
|
||||||
|
|||||||
Reference in New Issue
Block a user