368 lines
9.7 KiB
Go
368 lines
9.7 KiB
Go
package collector
|
|
|
|
import (
|
|
"bee/audit/internal/schema"
|
|
"encoding/json"
|
|
"log/slog"
|
|
"os/exec"
|
|
"strings"
|
|
)
|
|
|
|
func collectStorage() []schema.HardwareStorage {
|
|
devs := lsblkDevices()
|
|
result := make([]schema.HardwareStorage, 0, len(devs))
|
|
for _, dev := range devs {
|
|
var s schema.HardwareStorage
|
|
if strings.HasPrefix(dev.Name, "nvme") {
|
|
s = enrichWithNVMe(dev)
|
|
} else {
|
|
s = enrichWithSmartctl(dev)
|
|
}
|
|
result = append(result, s)
|
|
}
|
|
slog.Info("storage: collected", "count", len(result))
|
|
return result
|
|
}
|
|
|
|
// lsblkDevice holds the columns requested from `lsblk -J` for one block
// device. Every column is decoded as a plain string.
type lsblkDevice struct {
	Name   string `json:"name"`   // NAME column; appended to "/dev/" to form the device path
	Type   string `json:"type"`   // TYPE column; only "disk" entries are kept by lsblkDevices
	Size   string `json:"size"`   // SIZE column
	Serial string `json:"serial"` // SERIAL column
	Model  string `json:"model"`  // MODEL column
	Tran   string `json:"tran"`   // TRAN column (transport), compared case-insensitively by callers
	Hctl   string `json:"hctl"`   // HCTL column (host:channel:target:lun), used as the slot
}
|
|
|
|
type lsblkRoot struct {
|
|
Blockdevices []lsblkDevice `json:"blockdevices"`
|
|
}
|
|
|
|
func lsblkDevices() []lsblkDevice {
|
|
out, err := exec.Command("lsblk", "-J", "-d",
|
|
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
|
if err != nil {
|
|
slog.Warn("storage: lsblk failed", "err", err)
|
|
return nil
|
|
}
|
|
var root lsblkRoot
|
|
if err := json.Unmarshal(out, &root); err != nil {
|
|
slog.Warn("storage: lsblk parse failed", "err", err)
|
|
return nil
|
|
}
|
|
var disks []lsblkDevice
|
|
for _, d := range root.Blockdevices {
|
|
if d.Type == "disk" {
|
|
disks = append(disks, d)
|
|
}
|
|
}
|
|
return disks
|
|
}
|
|
|
|
// smartctlInfo mirrors the parts of the `smartctl -j -a` JSON document that
// the collector reads. Keys that are absent from the document leave the
// corresponding field at its zero value.
type smartctlInfo struct {
	ModelFamily  string `json:"model_family"`
	ModelName    string `json:"model_name"`
	SerialNumber string `json:"serial_number"`
	FirmwareVer  string `json:"firmware_version"`
	RotationRate int    `json:"rotation_rate"` // 0 is treated as solid state by enrichWithSmartctl

	// Overall health verdict.
	SmartStatus struct {
		Passed bool `json:"passed"`
	} `json:"smart_status"`

	// Capacity in bytes.
	UserCapacity struct {
		Bytes int64 `json:"bytes"`
	} `json:"user_capacity"`

	// ATA attribute table; entries are matched by numeric ID.
	AtaSmartAttributes struct {
		Table []struct {
			ID   int    `json:"id"`
			Name string `json:"name"`
			Raw  struct {
				Value int64 `json:"value"`
			} `json:"raw"`
		} `json:"table"`
	} `json:"ata_smart_attributes"`

	// Accumulated power-on time.
	PowerOnTime struct {
		Hours int `json:"hours"`
	} `json:"power_on_time"`

	PowerCycleCount int `json:"power_cycle_count"`
}
|
|
|
|
func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|
present := true
|
|
s := schema.HardwareStorage{Present: &present}
|
|
|
|
tran := strings.ToLower(dev.Tran)
|
|
devPath := "/dev/" + dev.Name
|
|
|
|
// determine device type (refined by smartctl rotation_rate below)
|
|
var devType string
|
|
switch {
|
|
case strings.HasPrefix(dev.Name, "nvme"):
|
|
devType = "NVMe"
|
|
case tran == "usb":
|
|
devType = "USB"
|
|
case tran == "sata" || tran == "sas":
|
|
devType = "HDD" // refined to SSD below if rotation_rate==0
|
|
default:
|
|
devType = "Unknown"
|
|
}
|
|
|
|
iface := strings.ToUpper(tran)
|
|
if iface != "" {
|
|
s.Interface = &iface
|
|
}
|
|
|
|
// slot from HCTL (host:channel:target:lun)
|
|
if dev.Hctl != "" {
|
|
s.Slot = &dev.Hctl
|
|
}
|
|
|
|
// run smartctl
|
|
out, err := exec.Command("smartctl", "-j", "-a", devPath).Output()
|
|
if err != nil {
|
|
// still fill what lsblk gave us
|
|
if v := strings.TrimSpace(dev.Model); v != "" {
|
|
s.Model = &v
|
|
}
|
|
if v := strings.TrimSpace(dev.Serial); v != "" {
|
|
s.SerialNumber = &v
|
|
}
|
|
s.Type = &devType
|
|
return s
|
|
}
|
|
|
|
var info smartctlInfo
|
|
if err := json.Unmarshal(out, &info); err == nil {
|
|
if v := cleanDMIValue(info.ModelName); v != "" {
|
|
s.Model = &v
|
|
}
|
|
if v := cleanDMIValue(info.SerialNumber); v != "" {
|
|
s.SerialNumber = &v
|
|
}
|
|
if v := cleanDMIValue(info.FirmwareVer); v != "" {
|
|
s.Firmware = &v
|
|
}
|
|
if info.UserCapacity.Bytes > 0 {
|
|
gb := int(info.UserCapacity.Bytes / 1_000_000_000)
|
|
s.SizeGB = &gb
|
|
}
|
|
|
|
// refine type from rotation_rate
|
|
if info.RotationRate == 0 && devType != "NVMe" && devType != "USB" {
|
|
devType = "SSD"
|
|
} else if info.RotationRate > 0 {
|
|
devType = "HDD"
|
|
}
|
|
|
|
// telemetry
|
|
tel := map[string]any{}
|
|
if info.PowerOnTime.Hours > 0 {
|
|
tel["power_on_hours"] = info.PowerOnTime.Hours
|
|
}
|
|
if info.PowerCycleCount > 0 {
|
|
tel["power_cycles"] = info.PowerCycleCount
|
|
}
|
|
reallocated := int64(0)
|
|
pending := int64(0)
|
|
uncorrectable := int64(0)
|
|
lifeRemaining := int64(0)
|
|
for _, attr := range info.AtaSmartAttributes.Table {
|
|
switch attr.ID {
|
|
case 5:
|
|
reallocated = attr.Raw.Value
|
|
tel["reallocated_sectors"] = attr.Raw.Value
|
|
case 177:
|
|
tel["wear_leveling_pct"] = attr.Raw.Value
|
|
case 231:
|
|
lifeRemaining = attr.Raw.Value
|
|
tel["life_remaining_pct"] = attr.Raw.Value
|
|
case 241:
|
|
tel["total_lba_written"] = attr.Raw.Value
|
|
case 197:
|
|
pending = attr.Raw.Value
|
|
tel["current_pending_sectors"] = attr.Raw.Value
|
|
case 198:
|
|
uncorrectable = attr.Raw.Value
|
|
tel["offline_uncorrectable"] = attr.Raw.Value
|
|
}
|
|
}
|
|
if len(tel) > 0 {
|
|
s.Telemetry = tel
|
|
}
|
|
|
|
status := storageHealthStatus{
|
|
overallPassed: info.SmartStatus.Passed,
|
|
hasOverall: true,
|
|
reallocatedSectors: reallocated,
|
|
pendingSectors: pending,
|
|
offlineUncorrectable: uncorrectable,
|
|
lifeRemainingPct: lifeRemaining,
|
|
}
|
|
setStorageHealthStatus(&s, status)
|
|
return s
|
|
}
|
|
|
|
s.Type = &devType
|
|
status := "UNKNOWN"
|
|
s.Status = &status
|
|
return s
|
|
}
|
|
|
|
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
//
// The struct tags carry the long key names; UnmarshalJSON additionally
// accepts the abbreviated keys "avail_spare" and "percent_used" that nvme-cli
// emits in JSON mode, so both spellings populate the same fields.
type nvmeSmartLog struct {
	CriticalWarning  int   `json:"critical_warning"`
	PercentageUsed   int   `json:"percentage_used"`
	AvailableSpare   int   `json:"available_spare"`
	SpareThreshold   int   `json:"spare_thresh"`
	PowerOnHours     int64 `json:"power_on_hours"`
	PowerCycles      int64 `json:"power_cycles"`
	UnsafeShutdowns  int64 `json:"unsafe_shutdowns"`
	DataUnitsWritten int64 `json:"data_units_written"`
	ControllerBusy   int64 `json:"controller_busy_time"`
	MediaErrors      int64 `json:"media_errors"`
	NumErrLogEntries int64 `json:"num_err_log_entries"`
}

// UnmarshalJSON decodes a smart-log document into l. Besides the tagged key
// names it recognises "percent_used" and "avail_spare"; when an abbreviated
// key is present its value takes precedence over the long-named one. Any
// decoding error is returned unchanged.
func (l *nvmeSmartLog) UnmarshalJSON(data []byte) error {
	// plain has the same fields but no methods, so decoding into it does not
	// re-enter this method.
	type plain nvmeSmartLog
	doc := struct {
		*plain
		PercentUsed *int `json:"percent_used"`
		AvailSpare  *int `json:"avail_spare"`
	}{plain: (*plain)(l)}

	if err := json.Unmarshal(data, &doc); err != nil {
		return err
	}
	if doc.PercentUsed != nil {
		l.PercentageUsed = *doc.PercentUsed
	}
	if doc.AvailSpare != nil {
		l.AvailableSpare = *doc.AvailSpare
	}
	return nil
}
|
|
|
|
// nvmeIDCtrl holds the fields read from `nvme id-ctrl -o json`.
type nvmeIDCtrl struct {
	ModelNumber   string `json:"mn"`      // trimmed and cleaned before use
	SerialNumber  string `json:"sn"`      // trimmed and cleaned before use
	FirmwareRev   string `json:"fr"`      // trimmed and cleaned before use
	TotalCapacity int64  `json:"tnvmcap"` // divided by 1e9 to obtain SizeGB
}
|
|
|
|
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
|
present := true
|
|
devType := "NVMe"
|
|
iface := "NVMe"
|
|
status := "OK"
|
|
s := schema.HardwareStorage{
|
|
Present: &present,
|
|
Type: &devType,
|
|
Interface: &iface,
|
|
Status: &status,
|
|
}
|
|
|
|
devPath := "/dev/" + dev.Name
|
|
|
|
// id-ctrl: model, serial, firmware, capacity
|
|
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
|
var ctrl nvmeIDCtrl
|
|
if json.Unmarshal(out, &ctrl) == nil {
|
|
if v := cleanDMIValue(strings.TrimSpace(ctrl.ModelNumber)); v != "" {
|
|
s.Model = &v
|
|
}
|
|
if v := cleanDMIValue(strings.TrimSpace(ctrl.SerialNumber)); v != "" {
|
|
s.SerialNumber = &v
|
|
}
|
|
if v := cleanDMIValue(strings.TrimSpace(ctrl.FirmwareRev)); v != "" {
|
|
s.Firmware = &v
|
|
}
|
|
if ctrl.TotalCapacity > 0 {
|
|
gb := int(ctrl.TotalCapacity / 1_000_000_000)
|
|
s.SizeGB = &gb
|
|
}
|
|
}
|
|
}
|
|
|
|
// smart-log: wear telemetry
|
|
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
|
var log nvmeSmartLog
|
|
if json.Unmarshal(out, &log) == nil {
|
|
tel := map[string]any{}
|
|
if log.CriticalWarning > 0 {
|
|
tel["critical_warning"] = log.CriticalWarning
|
|
}
|
|
if log.PowerOnHours > 0 {
|
|
tel["power_on_hours"] = log.PowerOnHours
|
|
}
|
|
if log.PowerCycles > 0 {
|
|
tel["power_cycles"] = log.PowerCycles
|
|
}
|
|
if log.UnsafeShutdowns > 0 {
|
|
tel["unsafe_shutdowns"] = log.UnsafeShutdowns
|
|
}
|
|
if log.PercentageUsed > 0 {
|
|
tel["percentage_used"] = log.PercentageUsed
|
|
}
|
|
if log.DataUnitsWritten > 0 {
|
|
tel["data_units_written"] = log.DataUnitsWritten
|
|
}
|
|
if log.ControllerBusy > 0 {
|
|
tel["controller_busy_time"] = log.ControllerBusy
|
|
}
|
|
if log.AvailableSpare > 0 {
|
|
tel["available_spare_pct"] = log.AvailableSpare
|
|
}
|
|
if log.SpareThreshold > 0 {
|
|
tel["available_spare_threshold_pct"] = log.SpareThreshold
|
|
}
|
|
if log.MediaErrors > 0 {
|
|
tel["media_errors"] = log.MediaErrors
|
|
}
|
|
if log.NumErrLogEntries > 0 {
|
|
tel["error_log_entries"] = log.NumErrLogEntries
|
|
}
|
|
if len(tel) > 0 {
|
|
s.Telemetry = tel
|
|
}
|
|
setStorageHealthStatus(&s, storageHealthStatus{
|
|
criticalWarning: log.CriticalWarning,
|
|
percentageUsed: int64(log.PercentageUsed),
|
|
availableSpare: int64(log.AvailableSpare),
|
|
spareThreshold: int64(log.SpareThreshold),
|
|
unsafeShutdowns: log.UnsafeShutdowns,
|
|
mediaErrors: log.MediaErrors,
|
|
errorLogEntries: log.NumErrLogEntries,
|
|
})
|
|
return s
|
|
}
|
|
}
|
|
|
|
status = "UNKNOWN"
|
|
s.Status = &status
|
|
return s
|
|
}
|
|
|
|
// storageHealthStatus gathers the health indicators that
// setStorageHealthStatus evaluates. The ATA-style fields are filled by
// enrichWithSmartctl and the NVMe-style fields by enrichWithNVMe; fields that
// a caller leaves at zero do not trigger any condition.
type storageHealthStatus struct {
	// Overall verdict; overallPassed is only consulted when hasOverall is true.
	hasOverall    bool
	overallPassed bool

	// Values taken from the ATA SMART attribute table.
	reallocatedSectors   int64
	pendingSectors       int64
	offlineUncorrectable int64
	lifeRemainingPct     int64

	// Values taken from the NVMe smart-log.
	criticalWarning int
	percentageUsed  int64
	availableSpare  int64
	spareThreshold  int64
	unsafeShutdowns int64
	mediaErrors     int64
	errorLogEntries int64
}
|
|
|
|
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
|
|
status := "OK"
|
|
switch {
|
|
case health.hasOverall && !health.overallPassed:
|
|
status = "FAILED"
|
|
case health.criticalWarning > 0:
|
|
status = "FAILED"
|
|
case health.pendingSectors > 0 || health.offlineUncorrectable > 0:
|
|
status = "FAILED"
|
|
case health.mediaErrors > 0:
|
|
status = "WARNING"
|
|
case health.reallocatedSectors > 0:
|
|
status = "WARNING"
|
|
case health.errorLogEntries > 0:
|
|
status = "WARNING"
|
|
case health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10:
|
|
status = "WARNING"
|
|
case health.percentageUsed >= 95:
|
|
status = "WARNING"
|
|
case health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold:
|
|
status = "WARNING"
|
|
case health.unsafeShutdowns > 100:
|
|
status = "WARNING"
|
|
}
|
|
s.Status = &status
|
|
}
|