519 lines
13 KiB
Go
519 lines
13 KiB
Go
package collector
|
|
|
|
import (
|
|
"bee/audit/internal/schema"
|
|
"encoding/json"
|
|
"log/slog"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
func collectStorage() []schema.HardwareStorage {
|
|
devs := discoverStorageDevices()
|
|
result := make([]schema.HardwareStorage, 0, len(devs))
|
|
for _, dev := range devs {
|
|
var s schema.HardwareStorage
|
|
if strings.HasPrefix(dev.Name, "nvme") {
|
|
s = enrichWithNVMe(dev)
|
|
} else {
|
|
s = enrichWithSmartctl(dev)
|
|
}
|
|
result = append(result, s)
|
|
}
|
|
slog.Info("storage: collected", "count", len(result))
|
|
return result
|
|
}
|
|
|
|
// lsblkDevice is the minimal per-device record decoded from `lsblk -J`.
// The same shape is reused for devices reported by `nvme list`, so both
// sources can be merged by Name.
type lsblkDevice struct {
	Name   string `json:"name"`   // device name without the "/dev/" prefix
	Type   string `json:"type"`   // lsblk device type; only "disk" entries are kept
	Size   string `json:"size"`   // size as reported by the source tool
	Serial string `json:"serial"` // serial number, may be empty
	Model  string `json:"model"`  // model string, may be empty
	Tran   string `json:"tran"`   // transport, e.g. "sata", "sas", "usb", "nvme"
	Hctl   string `json:"hctl"`   // host:channel:target:lun address, used as slot
}
|
|
|
|
type lsblkRoot struct {
|
|
Blockdevices []lsblkDevice `json:"blockdevices"`
|
|
}
|
|
|
|
type nvmeListRoot struct {
|
|
Devices []nvmeListDevice `json:"Devices"`
|
|
}
|
|
|
|
// nvmeListDevice is the subset of one `nvme list -o json` device entry that
// nvmeListDevices converts into an lsblkDevice.
type nvmeListDevice struct {
	DevicePath   string `json:"DevicePath"`   // device node path; its base name becomes the device name
	ModelNumber  string `json:"ModelNumber"`  // model string (whitespace-trimmed by the consumer)
	SerialNumber string `json:"SerialNumber"` // serial number (whitespace-trimmed by the consumer)
	Firmware     string `json:"Firmware"`     // firmware revision
	PhysicalSize int64  `json:"PhysicalSize"` // device size; formatted as a decimal string by the consumer
}
|
|
|
|
func discoverStorageDevices() []lsblkDevice {
|
|
merged := map[string]lsblkDevice{}
|
|
for _, dev := range lsblkDevices() {
|
|
if dev.Name == "" {
|
|
continue
|
|
}
|
|
merged[dev.Name] = dev
|
|
}
|
|
for _, dev := range nvmeListDevices() {
|
|
if dev.Name == "" {
|
|
continue
|
|
}
|
|
current := merged[dev.Name]
|
|
merged[dev.Name] = mergeStorageDevice(current, dev)
|
|
}
|
|
|
|
disks := make([]lsblkDevice, 0, len(merged))
|
|
for _, dev := range merged {
|
|
if dev.Type == "" {
|
|
dev.Type = "disk"
|
|
}
|
|
if dev.Type != "disk" {
|
|
continue
|
|
}
|
|
disks = append(disks, dev)
|
|
}
|
|
return disks
|
|
}
|
|
|
|
func lsblkDevices() []lsblkDevice {
|
|
out, err := exec.Command("lsblk", "-J", "-d",
|
|
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
|
if err != nil {
|
|
slog.Warn("storage: lsblk failed", "err", err)
|
|
return nil
|
|
}
|
|
var root lsblkRoot
|
|
if err := json.Unmarshal(out, &root); err != nil {
|
|
slog.Warn("storage: lsblk parse failed", "err", err)
|
|
return nil
|
|
}
|
|
var disks []lsblkDevice
|
|
for _, d := range root.Blockdevices {
|
|
if d.Type == "disk" {
|
|
disks = append(disks, d)
|
|
}
|
|
}
|
|
return disks
|
|
}
|
|
|
|
func nvmeListDevices() []lsblkDevice {
|
|
out, err := exec.Command("nvme", "list", "-o", "json").Output()
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var root nvmeListRoot
|
|
if err := json.Unmarshal(out, &root); err != nil {
|
|
slog.Warn("storage: nvme list parse failed", "err", err)
|
|
return nil
|
|
}
|
|
devices := make([]lsblkDevice, 0, len(root.Devices))
|
|
for _, dev := range root.Devices {
|
|
name := filepath.Base(strings.TrimSpace(dev.DevicePath))
|
|
if name == "" {
|
|
continue
|
|
}
|
|
devices = append(devices, lsblkDevice{
|
|
Name: name,
|
|
Type: "disk",
|
|
Size: strconv.FormatInt(dev.PhysicalSize, 10),
|
|
Serial: strings.TrimSpace(dev.SerialNumber),
|
|
Model: strings.TrimSpace(dev.ModelNumber),
|
|
Tran: "nvme",
|
|
})
|
|
}
|
|
return devices
|
|
}
|
|
|
|
func mergeStorageDevice(existing, incoming lsblkDevice) lsblkDevice {
|
|
if existing.Name == "" {
|
|
return incoming
|
|
}
|
|
if existing.Type == "" {
|
|
existing.Type = incoming.Type
|
|
}
|
|
if strings.TrimSpace(existing.Size) == "" {
|
|
existing.Size = incoming.Size
|
|
}
|
|
if strings.TrimSpace(existing.Serial) == "" {
|
|
existing.Serial = incoming.Serial
|
|
}
|
|
if strings.TrimSpace(existing.Model) == "" {
|
|
existing.Model = incoming.Model
|
|
}
|
|
if strings.TrimSpace(existing.Tran) == "" {
|
|
existing.Tran = incoming.Tran
|
|
}
|
|
if strings.TrimSpace(existing.Hctl) == "" {
|
|
existing.Hctl = incoming.Hctl
|
|
}
|
|
return existing
|
|
}
|
|
|
|
// smartctlInfo is the subset of `smartctl -j -a` JSON output consumed by
// enrichWithSmartctl. Keys absent from the output leave the corresponding
// field at its zero value.
type smartctlInfo struct {
	ModelFamily  string `json:"model_family"`
	ModelName    string `json:"model_name"`
	SerialNumber string `json:"serial_number"`
	FirmwareVer  string `json:"firmware_version"`
	// RotationRate distinguishes drive kinds downstream: 0 is treated as
	// SSD, any positive value as HDD.
	RotationRate int `json:"rotation_rate"`
	Temperature  struct {
		Current int `json:"current"`
	} `json:"temperature"`
	// SmartStatus carries the overall SMART self-assessment result.
	SmartStatus struct {
		Passed bool `json:"passed"`
	} `json:"smart_status"`
	// UserCapacity.Bytes is divided by 1e9 downstream to obtain gigabytes.
	UserCapacity struct {
		Bytes int64 `json:"bytes"`
	} `json:"user_capacity"`
	// AtaSmartAttributes.Table lists the SMART attributes; consumers select
	// entries by numeric ID and read the raw value.
	AtaSmartAttributes struct {
		Table []struct {
			ID   int    `json:"id"`
			Name string `json:"name"`
			Raw  struct {
				Value int64 `json:"value"`
			} `json:"raw"`
		} `json:"table"`
	} `json:"ata_smart_attributes"`
	PowerOnTime struct {
		Hours int `json:"hours"`
	} `json:"power_on_time"`
	PowerCycleCount int `json:"power_cycle_count"`
}
|
|
|
|
func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|
present := true
|
|
s := schema.HardwareStorage{Present: &present}
|
|
|
|
tran := strings.ToLower(dev.Tran)
|
|
devPath := "/dev/" + dev.Name
|
|
|
|
// determine device type (refined by smartctl rotation_rate below)
|
|
var devType string
|
|
switch {
|
|
case strings.HasPrefix(dev.Name, "nvme"):
|
|
devType = "NVMe"
|
|
case tran == "usb":
|
|
devType = "USB"
|
|
case tran == "sata" || tran == "sas":
|
|
devType = "HDD" // refined to SSD below if rotation_rate==0
|
|
default:
|
|
devType = "Unknown"
|
|
}
|
|
|
|
iface := strings.ToUpper(tran)
|
|
if iface != "" {
|
|
s.Interface = &iface
|
|
}
|
|
|
|
// slot from HCTL (host:channel:target:lun)
|
|
if dev.Hctl != "" {
|
|
s.Slot = &dev.Hctl
|
|
}
|
|
|
|
// run smartctl
|
|
out, err := exec.Command("smartctl", "-j", "-a", devPath).Output()
|
|
if err != nil {
|
|
// still fill what lsblk gave us
|
|
if v := strings.TrimSpace(dev.Model); v != "" {
|
|
s.Model = &v
|
|
}
|
|
if v := strings.TrimSpace(dev.Serial); v != "" {
|
|
s.SerialNumber = &v
|
|
}
|
|
s.Type = &devType
|
|
return s
|
|
}
|
|
|
|
var info smartctlInfo
|
|
if err := json.Unmarshal(out, &info); err == nil {
|
|
if v := cleanDMIValue(info.ModelName); v != "" {
|
|
s.Model = &v
|
|
}
|
|
if v := cleanDMIValue(info.SerialNumber); v != "" {
|
|
s.SerialNumber = &v
|
|
}
|
|
if v := cleanDMIValue(info.FirmwareVer); v != "" {
|
|
s.Firmware = &v
|
|
}
|
|
if info.UserCapacity.Bytes > 0 {
|
|
gb := int(info.UserCapacity.Bytes / 1_000_000_000)
|
|
s.SizeGB = &gb
|
|
}
|
|
|
|
// refine type from rotation_rate
|
|
if info.RotationRate == 0 && devType != "NVMe" && devType != "USB" {
|
|
devType = "SSD"
|
|
} else if info.RotationRate > 0 {
|
|
devType = "HDD"
|
|
}
|
|
s.Type = &devType
|
|
|
|
if info.Temperature.Current > 0 {
|
|
t := float64(info.Temperature.Current)
|
|
s.TemperatureC = &t
|
|
}
|
|
if info.PowerOnTime.Hours > 0 {
|
|
v := int64(info.PowerOnTime.Hours)
|
|
s.PowerOnHours = &v
|
|
}
|
|
if info.PowerCycleCount > 0 {
|
|
v := int64(info.PowerCycleCount)
|
|
s.PowerCycles = &v
|
|
}
|
|
reallocated := int64(0)
|
|
pending := int64(0)
|
|
uncorrectable := int64(0)
|
|
lifeRemaining := int64(0)
|
|
for _, attr := range info.AtaSmartAttributes.Table {
|
|
switch attr.ID {
|
|
case 5:
|
|
reallocated = attr.Raw.Value
|
|
s.ReallocatedSectors = &reallocated
|
|
case 177:
|
|
value := float64(attr.Raw.Value)
|
|
s.LifeUsedPct = &value
|
|
case 231:
|
|
lifeRemaining = attr.Raw.Value
|
|
value := float64(attr.Raw.Value)
|
|
s.LifeRemainingPct = &value
|
|
case 241:
|
|
value := attr.Raw.Value
|
|
s.WrittenBytes = &value
|
|
case 197:
|
|
pending = attr.Raw.Value
|
|
s.CurrentPendingSectors = &pending
|
|
case 198:
|
|
uncorrectable = attr.Raw.Value
|
|
s.OfflineUncorrectable = &uncorrectable
|
|
}
|
|
}
|
|
|
|
status := storageHealthStatus{
|
|
overallPassed: info.SmartStatus.Passed,
|
|
hasOverall: true,
|
|
reallocatedSectors: reallocated,
|
|
pendingSectors: pending,
|
|
offlineUncorrectable: uncorrectable,
|
|
lifeRemainingPct: lifeRemaining,
|
|
}
|
|
setStorageHealthStatus(&s, status)
|
|
return s
|
|
}
|
|
|
|
s.Type = &devType
|
|
status := statusUnknown
|
|
s.Status = &status
|
|
return s
|
|
}
|
|
|
|
// nvmeSmartLog is the subset of `nvme smart-log -o json` output consumed by
// enrichWithNVMe. Keys absent from the output leave the field at zero.
type nvmeSmartLog struct {
	CriticalWarning  int   `json:"critical_warning"` // any non-zero value is reported as critical
	PercentageUsed   int   `json:"percentage_used"`  // wear estimate in percent
	AvailableSpare   int   `json:"available_spare"`  // remaining spare capacity in percent
	SpareThreshold   int   `json:"spare_thresh"`     // spare level at or below which a warning is raised
	Temperature      int64 `json:"temperature"`      // consumer subtracts 273, i.e. treats it as Kelvin
	PowerOnHours     int64 `json:"power_on_hours"`
	PowerCycles      int64 `json:"power_cycles"`
	UnsafeShutdowns  int64 `json:"unsafe_shutdowns"`
	DataUnitsRead    int64 `json:"data_units_read"`    // converted to bytes by nvmeDataUnitsToBytes
	DataUnitsWritten int64 `json:"data_units_written"` // converted to bytes by nvmeDataUnitsToBytes
	ControllerBusy   int64 `json:"controller_busy_time"`
	MediaErrors      int64 `json:"media_errors"`
	NumErrLogEntries int64 `json:"num_err_log_entries"`
}
|
|
|
|
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output consumed by
// enrichWithNVMe.
type nvmeIDCtrl struct {
	ModelNumber   string `json:"mn"`      // controller model number
	SerialNumber  string `json:"sn"`      // controller serial number
	FirmwareRev   string `json:"fr"`      // firmware revision
	TotalCapacity int64  `json:"tnvmcap"` // total capacity; divided by 1e9 downstream to obtain GB
}
|
|
|
|
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
|
present := true
|
|
devType := "NVMe"
|
|
iface := "NVMe"
|
|
status := statusOK
|
|
s := schema.HardwareStorage{
|
|
HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
|
|
Present: &present,
|
|
Type: &devType,
|
|
Interface: &iface,
|
|
}
|
|
|
|
devPath := "/dev/" + dev.Name
|
|
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
|
|
s.Model = &v
|
|
}
|
|
if v := cleanDMIValue(strings.TrimSpace(dev.Serial)); v != "" {
|
|
s.SerialNumber = &v
|
|
}
|
|
if size := parseStorageBytes(dev.Size); size > 0 {
|
|
gb := int(size / 1_000_000_000)
|
|
if gb > 0 {
|
|
s.SizeGB = &gb
|
|
}
|
|
}
|
|
|
|
// id-ctrl: model, serial, firmware, capacity
|
|
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
|
var ctrl nvmeIDCtrl
|
|
if json.Unmarshal(out, &ctrl) == nil {
|
|
if v := cleanDMIValue(strings.TrimSpace(ctrl.ModelNumber)); v != "" {
|
|
s.Model = &v
|
|
}
|
|
if v := cleanDMIValue(strings.TrimSpace(ctrl.SerialNumber)); v != "" {
|
|
s.SerialNumber = &v
|
|
}
|
|
if v := cleanDMIValue(strings.TrimSpace(ctrl.FirmwareRev)); v != "" {
|
|
s.Firmware = &v
|
|
}
|
|
if ctrl.TotalCapacity > 0 {
|
|
gb := int(ctrl.TotalCapacity / 1_000_000_000)
|
|
s.SizeGB = &gb
|
|
}
|
|
}
|
|
}
|
|
|
|
// smart-log: wear telemetry
|
|
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
|
var log nvmeSmartLog
|
|
if json.Unmarshal(out, &log) == nil {
|
|
if log.PowerOnHours > 0 {
|
|
s.PowerOnHours = &log.PowerOnHours
|
|
}
|
|
if log.PowerCycles > 0 {
|
|
s.PowerCycles = &log.PowerCycles
|
|
}
|
|
if log.UnsafeShutdowns > 0 {
|
|
s.UnsafeShutdowns = &log.UnsafeShutdowns
|
|
}
|
|
if log.PercentageUsed > 0 {
|
|
v := float64(log.PercentageUsed)
|
|
s.LifeUsedPct = &v
|
|
remaining := 100 - v
|
|
s.LifeRemainingPct = &remaining
|
|
}
|
|
if log.DataUnitsWritten > 0 {
|
|
v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
|
|
s.WrittenBytes = &v
|
|
}
|
|
if log.DataUnitsRead > 0 {
|
|
v := nvmeDataUnitsToBytes(log.DataUnitsRead)
|
|
s.ReadBytes = &v
|
|
}
|
|
if log.AvailableSpare > 0 {
|
|
v := float64(log.AvailableSpare)
|
|
s.AvailableSparePct = &v
|
|
}
|
|
if log.MediaErrors > 0 {
|
|
s.MediaErrors = &log.MediaErrors
|
|
}
|
|
if log.NumErrLogEntries > 0 {
|
|
s.ErrorLogEntries = &log.NumErrLogEntries
|
|
}
|
|
if log.Temperature > 0 {
|
|
v := float64(log.Temperature - 273)
|
|
s.TemperatureC = &v
|
|
}
|
|
setStorageHealthStatus(&s, storageHealthStatus{
|
|
criticalWarning: log.CriticalWarning,
|
|
percentageUsed: int64(log.PercentageUsed),
|
|
availableSpare: int64(log.AvailableSpare),
|
|
spareThreshold: int64(log.SpareThreshold),
|
|
unsafeShutdowns: log.UnsafeShutdowns,
|
|
mediaErrors: log.MediaErrors,
|
|
errorLogEntries: log.NumErrLogEntries,
|
|
})
|
|
return s
|
|
}
|
|
}
|
|
|
|
status = statusUnknown
|
|
s.Status = &status
|
|
return s
|
|
}
|
|
|
|
// parseStorageBytes parses raw as a base-10 integer byte count, ignoring
// surrounding whitespace. It returns 0 when raw is not a plain integer or
// when the value is not positive.
func parseStorageBytes(raw string) int64 {
	n, err := strconv.ParseInt(strings.TrimSpace(raw), 10, 64)
	if err != nil || n <= 0 {
		return 0
	}
	return n
}
|
|
|
|
// nvmeDataUnitsToBytes converts an NVMe SMART "data units" counter to bytes,
// using 512,000 bytes per unit. Non-positive counters yield 0. Counters so
// large that the product would overflow int64 saturate at the maximum int64
// value instead of wrapping around to a meaningless number.
func nvmeDataUnitsToBytes(units int64) int64 {
	const (
		bytesPerUnit = 512_000
		maxInt64     = 1<<63 - 1
		maxUnits     = maxInt64 / bytesPerUnit
	)
	if units <= 0 {
		return 0
	}
	if units > maxUnits {
		return maxInt64
	}
	return units * bytesPerUnit
}
|
|
|
|
// storageHealthStatus collects the health indicators that
// setStorageHealthStatus evaluates. Callers fill only the fields their data
// source provides; zero values mean "not reported / nothing wrong".
type storageHealthStatus struct {
	// Overall SMART self-assessment; overallPassed is only consulted when
	// hasOverall is true.
	hasOverall    bool
	overallPassed bool

	// Indicators filled from ATA SMART attributes.
	reallocatedSectors   int64
	pendingSectors       int64
	offlineUncorrectable int64
	lifeRemainingPct     int64

	// Indicators filled from the NVMe SMART log.
	criticalWarning int
	percentageUsed  int64
	availableSpare  int64
	spareThreshold  int64
	unsafeShutdowns int64
	mediaErrors     int64
	errorLogEntries int64
}
|
|
|
|
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
|
|
status := statusOK
|
|
var description *string
|
|
switch {
|
|
case health.hasOverall && !health.overallPassed:
|
|
status = statusCritical
|
|
description = stringPtr("SMART overall self-assessment failed")
|
|
case health.criticalWarning > 0:
|
|
status = statusCritical
|
|
description = stringPtr("NVMe critical warning is set")
|
|
case health.pendingSectors > 0 || health.offlineUncorrectable > 0:
|
|
status = statusCritical
|
|
description = stringPtr("Pending or offline uncorrectable sectors detected")
|
|
case health.mediaErrors > 0:
|
|
status = statusWarning
|
|
description = stringPtr("Media errors reported")
|
|
case health.reallocatedSectors > 0:
|
|
status = statusWarning
|
|
description = stringPtr("Reallocated sectors detected")
|
|
case health.errorLogEntries > 0:
|
|
status = statusWarning
|
|
description = stringPtr("Device error log contains entries")
|
|
case health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10:
|
|
status = statusWarning
|
|
description = stringPtr("Life remaining is low")
|
|
case health.percentageUsed >= 95:
|
|
status = statusWarning
|
|
description = stringPtr("Drive wear level is high")
|
|
case health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold:
|
|
status = statusWarning
|
|
description = stringPtr("Available spare is at or below threshold")
|
|
case health.unsafeShutdowns > 100:
|
|
status = statusWarning
|
|
description = stringPtr("Unsafe shutdown count is high")
|
|
}
|
|
s.Status = &status
|
|
s.ErrorDescription = description
|
|
}
|
|
|
|
// stringPtr returns a pointer to a fresh copy of value, for populating
// optional (*string) fields from literals.
func stringPtr(value string) *string {
	p := new(string)
	*p = value
	return p
}
|