Add health verdicts and acceptance tests

This commit is contained in:
Mikhail Chusavitin
2026-03-14 17:53:58 +03:00
parent 17f0bda45e
commit b483e2ce35
28 changed files with 1688 additions and 82 deletions

View File

@@ -35,6 +35,7 @@ func Run(runtimeMode runtimeenv.Mode) schema.HardwareIngestRequest {
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
snap.PowerSupplies = collectPSUs()
snap.Summary = buildHealthSummary(snap)
// remaining collectors added in steps 1.8 1.10

View File

@@ -4,6 +4,7 @@ import (
"bee/audit/internal/schema"
"log/slog"
"os/exec"
"regexp"
"strconv"
"strings"
)
@@ -16,6 +17,9 @@ func collectPSUs() []schema.HardwarePowerSupply {
return nil
}
psus := parseFRU(string(out))
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
mergePSUSDR(psus, parsePSUSDR(string(sdrOut)))
}
slog.Info("psu: collected", "count", len(psus))
return psus
}
@@ -116,6 +120,135 @@ func parseFRUBlock(block string, slotIdx int) (schema.HardwarePowerSupply, bool)
return psu, true
}
type psuSDR struct {
slot int
status string
inputPowerW *float64
outputPowerW *float64
inputVoltage *float64
}
var psuSlotRe = regexp.MustCompile(`(?i)\bpsu?\s*([0-9]+)\b|\bps\s*([0-9]+)\b`)
func parsePSUSDR(raw string) map[int]psuSDR {
out := map[int]psuSDR{}
for _, line := range strings.Split(raw, "\n") {
fields := splitSDRFields(line)
if len(fields) < 3 {
continue
}
name := fields[0]
value := fields[1]
state := strings.ToLower(fields[2])
slot, ok := parsePSUSlot(name)
if !ok {
continue
}
entry := out[slot]
entry.slot = slot
if entry.status == "" {
entry.status = "OK"
}
if state != "" && state != "ok" && state != "ns" {
entry.status = "FAILED"
}
lowerName := strings.ToLower(name)
switch {
case strings.Contains(lowerName, "input power"):
entry.inputPowerW = parseFloatPtr(value)
case strings.Contains(lowerName, "output power"):
entry.outputPowerW = parseFloatPtr(value)
case strings.Contains(lowerName, "input voltage"), strings.Contains(lowerName, "ac input"):
entry.inputVoltage = parseFloatPtr(value)
}
out[slot] = entry
}
return out
}
func mergePSUSDR(psus []schema.HardwarePowerSupply, sdr map[int]psuSDR) {
for i := range psus {
slotIdx, err := strconv.Atoi(derefPSUSlot(psus[i].Slot))
if err != nil {
continue
}
entry, ok := sdr[slotIdx+1]
if !ok {
continue
}
if entry.inputPowerW != nil {
psus[i].InputPowerW = entry.inputPowerW
}
if entry.outputPowerW != nil {
psus[i].OutputPowerW = entry.outputPowerW
}
if entry.inputVoltage != nil {
psus[i].InputVoltage = entry.inputVoltage
}
if entry.status != "" {
psus[i].Status = &entry.status
}
if psus[i].Status != nil && *psus[i].Status == "OK" {
if (entry.inputPowerW == nil && entry.outputPowerW == nil && entry.inputVoltage == nil) && entry.status == "" {
unknown := "UNKNOWN"
psus[i].Status = &unknown
}
}
}
}
func splitSDRFields(line string) []string {
parts := strings.Split(line, "|")
out := make([]string, 0, len(parts))
for _, part := range parts {
part = strings.TrimSpace(part)
if part != "" {
out = append(out, part)
}
}
return out
}
func parsePSUSlot(name string) (int, bool) {
m := psuSlotRe.FindStringSubmatch(strings.ToLower(name))
if len(m) == 0 {
return 0, false
}
for _, group := range m[1:] {
if group == "" {
continue
}
n, err := strconv.Atoi(group)
if err == nil && n > 0 {
return n, true
}
}
return 0, false
}
func parseFloatPtr(raw string) *float64 {
raw = strings.TrimSpace(raw)
if raw == "" || strings.EqualFold(raw, "na") {
return nil
}
for _, field := range strings.Fields(raw) {
n, err := strconv.ParseFloat(strings.TrimSpace(field), 64)
if err == nil {
return &n
}
}
return nil
}
func derefPSUSlot(slot *string) string {
if slot == nil {
return ""
}
return *slot
}
// parseWattage extracts wattage from strings like "PSU 800W", "1200W PLATINUM".
func parseWattage(s string) int {
s = strings.ToUpper(s)

View File

@@ -0,0 +1,32 @@
package collector
import "testing"
func TestParsePSUSDR(t *testing.T) {
raw := `
PS1 Input Power | 215 Watts | ok
PS1 Output Power | 198 Watts | ok
PS1 Input Voltage | 229 Volts | ok
PS2 Input Power | 0 Watts | cr
`
got := parsePSUSDR(raw)
if len(got) != 2 {
t.Fatalf("len(got)=%d want 2", len(got))
}
if got[1].status != "OK" {
t.Fatalf("ps1 status=%q", got[1].status)
}
if got[1].inputPowerW == nil || *got[1].inputPowerW != 215 {
t.Fatalf("ps1 input power=%v", got[1].inputPowerW)
}
if got[1].outputPowerW == nil || *got[1].outputPowerW != 198 {
t.Fatalf("ps1 output power=%v", got[1].outputPowerW)
}
if got[1].inputVoltage == nil || *got[1].inputVoltage != 229 {
t.Fatalf("ps1 input voltage=%v", got[1].inputVoltage)
}
if got[2].status != "FAILED" {
t.Fatalf("ps2 status=%q", got[2].status)
}
}

View File

@@ -67,6 +67,9 @@ type smartctlInfo struct {
SerialNumber string `json:"serial_number"`
FirmwareVer string `json:"firmware_version"`
RotationRate int `json:"rotation_rate"`
SmartStatus struct {
Passed bool `json:"passed"`
} `json:"smart_status"`
UserCapacity struct {
Bytes int64 `json:"bytes"`
} `json:"user_capacity"`
@@ -127,7 +130,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
return s
}
var info smartctlInfo
var info smartctlInfo
if err := json.Unmarshal(out, &info); err == nil {
if v := cleanDMIValue(info.ModelName); v != "" {
s.Model = &v
@@ -158,37 +161,65 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
if info.PowerCycleCount > 0 {
tel["power_cycles"] = info.PowerCycleCount
}
reallocated := int64(0)
pending := int64(0)
uncorrectable := int64(0)
lifeRemaining := int64(0)
for _, attr := range info.AtaSmartAttributes.Table {
switch attr.ID {
case 5:
reallocated = attr.Raw.Value
tel["reallocated_sectors"] = attr.Raw.Value
case 177:
tel["wear_leveling_pct"] = attr.Raw.Value
case 231:
lifeRemaining = attr.Raw.Value
tel["life_remaining_pct"] = attr.Raw.Value
case 241:
tel["total_lba_written"] = attr.Raw.Value
case 197:
pending = attr.Raw.Value
tel["current_pending_sectors"] = attr.Raw.Value
case 198:
uncorrectable = attr.Raw.Value
tel["offline_uncorrectable"] = attr.Raw.Value
}
}
if len(tel) > 0 {
s.Telemetry = tel
}
status := storageHealthStatus{
overallPassed: info.SmartStatus.Passed,
hasOverall: true,
reallocatedSectors: reallocated,
pendingSectors: pending,
offlineUncorrectable: uncorrectable,
lifeRemainingPct: lifeRemaining,
}
setStorageHealthStatus(&s, status)
return s
}
s.Type = &devType
status := "OK"
status := "UNKNOWN"
s.Status = &status
return s
}
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
type nvmeSmartLog struct {
PercentageUsed int `json:"percentage_used"`
PowerOnHours int64 `json:"power_on_hours"`
PowerCycles int64 `json:"power_cycles"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
DataUnitsWritten int64 `json:"data_units_written"`
ControllerBusy int64 `json:"controller_busy_time"`
CriticalWarning int `json:"critical_warning"`
PercentageUsed int `json:"percentage_used"`
AvailableSpare int `json:"available_spare"`
SpareThreshold int `json:"spare_thresh"`
PowerOnHours int64 `json:"power_on_hours"`
PowerCycles int64 `json:"power_cycles"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
DataUnitsWritten int64 `json:"data_units_written"`
ControllerBusy int64 `json:"controller_busy_time"`
MediaErrors int64 `json:"media_errors"`
NumErrLogEntries int64 `json:"num_err_log_entries"`
}
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
@@ -238,6 +269,9 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
var log nvmeSmartLog
if json.Unmarshal(out, &log) == nil {
tel := map[string]any{}
if log.CriticalWarning > 0 {
tel["critical_warning"] = log.CriticalWarning
}
if log.PowerOnHours > 0 {
tel["power_on_hours"] = log.PowerOnHours
}
@@ -256,11 +290,78 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
if log.ControllerBusy > 0 {
tel["controller_busy_time"] = log.ControllerBusy
}
if log.AvailableSpare > 0 {
tel["available_spare_pct"] = log.AvailableSpare
}
if log.SpareThreshold > 0 {
tel["available_spare_threshold_pct"] = log.SpareThreshold
}
if log.MediaErrors > 0 {
tel["media_errors"] = log.MediaErrors
}
if log.NumErrLogEntries > 0 {
tel["error_log_entries"] = log.NumErrLogEntries
}
if len(tel) > 0 {
s.Telemetry = tel
}
setStorageHealthStatus(&s, storageHealthStatus{
criticalWarning: log.CriticalWarning,
percentageUsed: int64(log.PercentageUsed),
availableSpare: int64(log.AvailableSpare),
spareThreshold: int64(log.SpareThreshold),
unsafeShutdowns: log.UnsafeShutdowns,
mediaErrors: log.MediaErrors,
errorLogEntries: log.NumErrLogEntries,
})
return s
}
}
status = "UNKNOWN"
s.Status = &status
return s
}
type storageHealthStatus struct {
hasOverall bool
overallPassed bool
reallocatedSectors int64
pendingSectors int64
offlineUncorrectable int64
lifeRemainingPct int64
criticalWarning int
percentageUsed int64
availableSpare int64
spareThreshold int64
unsafeShutdowns int64
mediaErrors int64
errorLogEntries int64
}
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
status := "OK"
switch {
case health.hasOverall && !health.overallPassed:
status = "FAILED"
case health.criticalWarning > 0:
status = "FAILED"
case health.pendingSectors > 0 || health.offlineUncorrectable > 0:
status = "FAILED"
case health.mediaErrors > 0:
status = "WARNING"
case health.reallocatedSectors > 0:
status = "WARNING"
case health.errorLogEntries > 0:
status = "WARNING"
case health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10:
status = "WARNING"
case health.percentageUsed >= 95:
status = "WARNING"
case health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold:
status = "WARNING"
case health.unsafeShutdowns > 100:
status = "WARNING"
}
s.Status = &status
}

View File

@@ -0,0 +1,63 @@
package collector
import (
"testing"
"bee/audit/internal/schema"
)
func TestSetStorageHealthStatus(t *testing.T) {
t.Parallel()
tests := []struct {
name string
health storageHealthStatus
want string
}{
{
name: "smart overall failed",
health: storageHealthStatus{hasOverall: true, overallPassed: false},
want: "FAILED",
},
{
name: "nvme critical warning",
health: storageHealthStatus{criticalWarning: 1},
want: "FAILED",
},
{
name: "pending sectors",
health: storageHealthStatus{pendingSectors: 1},
want: "FAILED",
},
{
name: "media errors warning",
health: storageHealthStatus{mediaErrors: 2},
want: "WARNING",
},
{
name: "reallocated warning",
health: storageHealthStatus{reallocatedSectors: 1},
want: "WARNING",
},
{
name: "life remaining low",
health: storageHealthStatus{lifeRemainingPct: 8},
want: "WARNING",
},
{
name: "healthy",
health: storageHealthStatus{},
want: "OK",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
var disk schema.HardwareStorage
setStorageHealthStatus(&disk, tt.health)
if disk.Status == nil || *disk.Status != tt.want {
t.Fatalf("status=%v want %q", disk.Status, tt.want)
}
})
}
}

View File

@@ -0,0 +1,114 @@
package collector
import (
"bee/audit/internal/schema"
"fmt"
"time"
)
func buildHealthSummary(snap schema.HardwareSnapshot) *schema.HardwareHealthSummary {
summary := &schema.HardwareHealthSummary{
Status: "OK",
CollectedAt: time.Now().UTC().Format(time.RFC3339),
}
for _, dimm := range snap.Memory {
switch derefString(dimm.Status) {
case "WARNING":
summary.MemoryWarn++
summary.Warnings = append(summary.Warnings, formatMemorySummary(dimm))
case "FAILED":
summary.MemoryFail++
summary.Failures = append(summary.Failures, formatMemorySummary(dimm))
case "EMPTY":
summary.EmptyDIMMs++
}
}
for _, disk := range snap.Storage {
switch derefString(disk.Status) {
case "WARNING":
summary.StorageWarn++
summary.Warnings = append(summary.Warnings, formatStorageSummary(disk))
case "FAILED":
summary.StorageFail++
summary.Failures = append(summary.Failures, formatStorageSummary(disk))
}
}
for _, dev := range snap.PCIeDevices {
switch derefString(dev.Status) {
case "WARNING":
summary.PCIeWarn++
summary.Warnings = append(summary.Warnings, formatPCIeSummary(dev))
case "FAILED":
summary.PCIeFail++
summary.Failures = append(summary.Failures, formatPCIeSummary(dev))
}
}
for _, psu := range snap.PowerSupplies {
if psu.Present != nil && !*psu.Present {
summary.MissingPSUs++
}
switch derefString(psu.Status) {
case "WARNING":
summary.PSUWarn++
summary.Warnings = append(summary.Warnings, formatPSUSummary(psu))
case "FAILED":
summary.PSUFail++
summary.Failures = append(summary.Failures, formatPSUSummary(psu))
}
}
if len(summary.Failures) > 0 || summary.StorageFail > 0 || summary.PCIeFail > 0 || summary.PSUFail > 0 || summary.MemoryFail > 0 {
summary.Status = "FAILED"
} else if len(summary.Warnings) > 0 || summary.StorageWarn > 0 || summary.PCIeWarn > 0 || summary.PSUWarn > 0 || summary.MemoryWarn > 0 {
summary.Status = "WARNING"
}
if len(summary.Warnings) == 0 {
summary.Warnings = nil
}
if len(summary.Failures) == 0 {
summary.Failures = nil
}
return summary
}
func derefString(value *string) string {
if value == nil {
return ""
}
return *value
}
func preferredName(model, serial, slot *string) string {
switch {
case model != nil && *model != "":
return *model
case serial != nil && *serial != "":
return *serial
case slot != nil && *slot != "":
return *slot
default:
return "unknown"
}
}
func formatStorageSummary(disk schema.HardwareStorage) string {
return fmt.Sprintf("storage %s status=%s", preferredName(disk.Model, disk.SerialNumber, disk.Slot), derefString(disk.Status))
}
func formatPCIeSummary(dev schema.HardwarePCIeDevice) string {
return fmt.Sprintf("pcie %s status=%s", preferredName(dev.Model, dev.SerialNumber, dev.BDF), derefString(dev.Status))
}
func formatPSUSummary(psu schema.HardwarePowerSupply) string {
return fmt.Sprintf("psu %s status=%s", preferredName(psu.Model, psu.SerialNumber, psu.Slot), derefString(psu.Status))
}
func formatMemorySummary(dimm schema.HardwareMemory) string {
return fmt.Sprintf("memory %s status=%s", preferredName(dimm.PartNumber, dimm.SerialNumber, dimm.Slot), derefString(dimm.Status))
}