Files
bee/audit/internal/collector/storage.go
2026-03-16 18:20:26 +03:00

519 lines
13 KiB
Go

package collector
import (
"bee/audit/internal/schema"
"encoding/json"
"log/slog"
"os/exec"
"path/filepath"
"strconv"
"strings"
)
// collectStorage discovers the block devices of the host and turns each one
// into a schema.HardwareStorage record. Devices whose name starts with "nvme"
// are enriched through nvme-cli, everything else through smartctl. The number
// of collected records is logged at info level.
func collectStorage() []schema.HardwareStorage {
	devices := discoverStorageDevices()
	collected := make([]schema.HardwareStorage, 0, len(devices))

	for i := range devices {
		// Pick the enrichment backend by device name.
		enrich := enrichWithSmartctl
		if strings.HasPrefix(devices[i].Name, "nvme") {
			enrich = enrichWithNVMe
		}
		collected = append(collected, enrich(devices[i]))
	}

	slog.Info("storage: collected", "count", len(collected))
	return collected
}
// lsblkDevice is a minimal lsblk JSON record.
// It doubles as the common device record of this file: nvmeListDevices builds
// the same struct from `nvme list` entries so both sources can be merged.
type lsblkDevice struct {
// Name is the device name without the "/dev/" prefix; the enrich functions
// prepend "/dev/" to build the device path.
Name string `json:"name"`
// Type is the lsblk device type; discovery keeps only "disk" entries.
Type string `json:"type"`
// Size is kept as text. nvmeListDevices stores a decimal byte count here.
// NOTE(review): lsblk is invoked without a byte-units flag, so its SIZE is
// presumably human-readable and will not parse in parseStorageBytes — confirm.
Size string `json:"size"`
// Serial and Model are the identity strings reported by the discovery tool;
// the enrich functions use them when the SMART tools report nothing better.
Serial string `json:"serial"`
Model string `json:"model"`
// Tran is the transport name; enrichWithSmartctl lower-cases it to classify
// the device and upper-cases it for the reported interface.
Tran string `json:"tran"`
// Hctl is the host:channel:target:lun address, reported as the slot.
Hctl string `json:"hctl"`
}
// lsblkRoot is the top-level object that lsblkDevices decodes from the
// `lsblk -J` output; only the "blockdevices" array is read.
type lsblkRoot struct {
Blockdevices []lsblkDevice `json:"blockdevices"`
}
// nvmeListRoot is the top-level object that nvmeListDevices decodes from the
// `nvme list -o json` output; only the "Devices" array is read.
type nvmeListRoot struct {
Devices []nvmeListDevice `json:"Devices"`
}
// nvmeListDevice is one entry of the "Devices" array in `nvme list -o json`
// output, reduced to the fields this file decodes.
type nvmeListDevice struct {
// DevicePath is the device node path; nvmeListDevices keeps only its base
// name as the device name.
DevicePath string `json:"DevicePath"`
ModelNumber string `json:"ModelNumber"`
SerialNumber string `json:"SerialNumber"`
// Firmware is decoded but not copied into the lsblkDevice record by
// nvmeListDevices.
Firmware string `json:"Firmware"`
// PhysicalSize is formatted as a decimal string into lsblkDevice.Size, which
// enrichWithNVMe later parses and divides by 1e9 to get gigabytes.
PhysicalSize int64 `json:"PhysicalSize"`
}
// discoverStorageDevices returns the disks known to lsblk and `nvme list`,
// merged by device name.
//
// lsblk records take precedence; an `nvme list` record only fills the fields
// lsblk left empty, or adds the device when lsblk did not report it. Records
// with an empty name are ignored, an empty type is treated as "disk", and
// anything that is not a disk is dropped.
//
// The result is ordered by first appearance (lsblk order first, then devices
// seen only in `nvme list`). Ranging over the merge map directly would yield a
// different order on every run because Go randomises map iteration.
func discoverStorageDevices() []lsblkDevice {
	merged := map[string]lsblkDevice{}
	var order []string // device names in first-seen order

	for _, dev := range lsblkDevices() {
		if dev.Name == "" {
			continue
		}
		if _, seen := merged[dev.Name]; !seen {
			order = append(order, dev.Name)
		}
		merged[dev.Name] = dev
	}

	for _, dev := range nvmeListDevices() {
		if dev.Name == "" {
			continue
		}
		current, seen := merged[dev.Name]
		if !seen {
			order = append(order, dev.Name)
		}
		merged[dev.Name] = mergeStorageDevice(current, dev)
	}

	disks := make([]lsblkDevice, 0, len(order))
	for _, name := range order {
		dev := merged[name]
		if dev.Type == "" {
			dev.Type = "disk"
		}
		if dev.Type != "disk" {
			continue
		}
		disks = append(disks, dev)
	}
	return disks
}
// lsblkDevices runs lsblk in JSON mode (one record per device, no children)
// and returns the records whose type is "disk". A failure to run lsblk or to
// decode its output is logged as a warning and yields nil.
func lsblkDevices() []lsblkDevice {
	cmd := exec.Command("lsblk", "-J", "-d",
		"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL")
	raw, err := cmd.Output()
	if err != nil {
		slog.Warn("storage: lsblk failed", "err", err)
		return nil
	}

	var parsed lsblkRoot
	if err = json.Unmarshal(raw, &parsed); err != nil {
		slog.Warn("storage: lsblk parse failed", "err", err)
		return nil
	}

	// Left nil when nothing qualifies, matching what callers have always seen.
	var disks []lsblkDevice
	for i := range parsed.Blockdevices {
		if parsed.Blockdevices[i].Type != "disk" {
			continue
		}
		disks = append(disks, parsed.Blockdevices[i])
	}
	return disks
}
// nvmeListDevices runs `nvme list -o json` and converts every reported device
// into an lsblkDevice record (type "disk", transport "nvme", size as a decimal
// string of PhysicalSize) so it can be merged with the lsblk results.
//
// The command failing is not treated as a problem worth a warning; it is only
// logged at debug level and yields nil. Undecodable output is logged as a
// warning and also yields nil.
func nvmeListDevices() []lsblkDevice {
	out, err := exec.Command("nvme", "list", "-o", "json").Output()
	if err != nil {
		slog.Debug("storage: nvme list failed", "err", err)
		return nil
	}

	var root nvmeListRoot
	if err := json.Unmarshal(out, &root); err != nil {
		slog.Warn("storage: nvme list parse failed", "err", err)
		return nil
	}

	devices := make([]lsblkDevice, 0, len(root.Devices))
	for _, dev := range root.Devices {
		// The path must be checked before taking its base name:
		// filepath.Base("") is ".", never "", so testing the base name for
		// emptiness would let entries without a path through as device ".".
		path := strings.TrimSpace(dev.DevicePath)
		if path == "" {
			continue
		}
		name := filepath.Base(path)
		if name == "." || name == string(filepath.Separator) {
			continue
		}
		devices = append(devices, lsblkDevice{
			Name:   name,
			Type:   "disk",
			Size:   strconv.FormatInt(dev.PhysicalSize, 10),
			Serial: strings.TrimSpace(dev.SerialNumber),
			Model:  strings.TrimSpace(dev.ModelNumber),
			Tran:   "nvme",
		})
	}
	return devices
}
// mergeStorageDevice combines two records describing the same device.
// If existing has no name it is considered absent and incoming is returned
// unchanged. Otherwise existing wins and incoming only supplies the fields
// existing lacks: Type when it is exactly empty, and Size, Serial, Model, Tran
// and Hctl when they are empty or whitespace-only.
func mergeStorageDevice(existing, incoming lsblkDevice) lsblkDevice {
	if existing.Name == "" {
		return incoming
	}

	// Type is compared verbatim, unlike the other fields.
	if existing.Type == "" {
		existing.Type = incoming.Type
	}

	fallbacks := []struct {
		field *string
		value string
	}{
		{&existing.Size, incoming.Size},
		{&existing.Serial, incoming.Serial},
		{&existing.Model, incoming.Model},
		{&existing.Tran, incoming.Tran},
		{&existing.Hctl, incoming.Hctl},
	}
	for _, fb := range fallbacks {
		if strings.TrimSpace(*fb.field) == "" {
			*fb.field = fb.value
		}
	}
	return existing
}
// smartctlInfo is the subset of smartctl -j -a output we care about.
// It is decoded by enrichWithSmartctl; fields absent from the JSON keep their
// zero value, which the consumer generally treats as "not reported".
type smartctlInfo struct {
// ModelFamily is decoded but not read by the code in this file.
ModelFamily string `json:"model_family"`
ModelName string `json:"model_name"`
SerialNumber string `json:"serial_number"`
FirmwareVer string `json:"firmware_version"`
// RotationRate refines the device type: 0 is classified as SSD (except for
// NVMe/USB devices), a positive value as HDD.
RotationRate int `json:"rotation_rate"`
// Temperature.Current is reported as the temperature in °C when positive.
Temperature struct {
Current int `json:"current"`
} `json:"temperature"`
// SmartStatus.Passed feeds the overall pass/fail input of the health status.
SmartStatus struct {
Passed bool `json:"passed"`
} `json:"smart_status"`
// UserCapacity.Bytes is divided by 1e9 to obtain the size in gigabytes.
UserCapacity struct {
Bytes int64 `json:"bytes"`
} `json:"user_capacity"`
// AtaSmartAttributes.Table holds the ATA attribute rows; enrichWithSmartctl
// reads the raw values of IDs 5, 177, 197, 198, 231 and 241.
AtaSmartAttributes struct {
Table []struct {
ID int `json:"id"`
Name string `json:"name"`
Raw struct {
Value int64 `json:"value"`
} `json:"raw"`
} `json:"table"`
} `json:"ata_smart_attributes"`
PowerOnTime struct {
Hours int `json:"hours"`
} `json:"power_on_time"`
PowerCycleCount int `json:"power_cycle_count"`
}
// enrichWithSmartctl builds the storage record for a block device by combining
// its discovery record with the output of `smartctl -j -a`.
//
// Model and serial are seeded from the discovery record and overridden by
// smartctl when it reports them. smartctl's exit status is a bit mask, so a
// non-zero exit does not by itself make the JSON unusable: only a command-line
// error (bit 0) or a failed device open (bit 1) is treated as "no SMART data".
// Discarding the output on any non-zero exit would throw away the report for
// exactly the drives smartctl flags as failing.
//
// When no usable SMART data is available (smartctl could not run, its output
// could not be decoded, or it carries neither an overall verdict nor an
// attribute table) the record keeps the discovery-derived fields and its
// status is statusUnknown.
func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
	present := true
	s := schema.HardwareStorage{Present: &present}

	tran := strings.ToLower(dev.Tran)
	devPath := "/dev/" + dev.Name

	// Initial classification from name/transport, refined from rotation_rate
	// below. s.Type points at devType, so later assignments to devType are
	// visible through the record.
	var devType string
	switch {
	case strings.HasPrefix(dev.Name, "nvme"):
		devType = "NVMe"
	case tran == "usb":
		devType = "USB"
	case tran == "sata" || tran == "sas":
		devType = "HDD" // refined to SSD below if rotation_rate==0
	default:
		devType = "Unknown"
	}
	s.Type = &devType

	if iface := strings.ToUpper(tran); iface != "" {
		s.Interface = &iface
	}
	// slot from HCTL (host:channel:target:lun)
	if dev.Hctl != "" {
		s.Slot = &dev.Hctl
	}

	// Seed identity from the discovery record so it survives every failure
	// path; smartctl values replace these when present.
	if v := strings.TrimSpace(dev.Model); v != "" {
		s.Model = &v
	}
	if v := strings.TrimSpace(dev.Serial); v != "" {
		s.SerialNumber = &v
	}

	out, err := exec.Command("smartctl", "-j", "-a", devPath).Output()
	if err != nil && !smartctlOutputUsable(err, out) {
		slog.Debug("storage: smartctl failed", "device", devPath, "err", err)
		status := statusUnknown
		s.Status = &status
		return s
	}

	var info smartctlInfo
	if err := json.Unmarshal(out, &info); err != nil {
		slog.Warn("storage: smartctl parse failed", "device", devPath, "err", err)
		status := statusUnknown
		s.Status = &status
		return s
	}

	if v := cleanDMIValue(info.ModelName); v != "" {
		s.Model = &v
	}
	if v := cleanDMIValue(info.SerialNumber); v != "" {
		s.SerialNumber = &v
	}
	if v := cleanDMIValue(info.FirmwareVer); v != "" {
		s.Firmware = &v
	}
	if info.UserCapacity.Bytes > 0 {
		gb := int(info.UserCapacity.Bytes / 1_000_000_000)
		s.SizeGB = &gb
	}

	// refine type from rotation_rate
	if info.RotationRate == 0 && devType != "NVMe" && devType != "USB" {
		devType = "SSD"
	} else if info.RotationRate > 0 {
		devType = "HDD"
	}

	if info.Temperature.Current > 0 {
		t := float64(info.Temperature.Current)
		s.TemperatureC = &t
	}
	if info.PowerOnTime.Hours > 0 {
		v := int64(info.PowerOnTime.Hours)
		s.PowerOnHours = &v
	}
	if info.PowerCycleCount > 0 {
		v := int64(info.PowerCycleCount)
		s.PowerCycles = &v
	}

	health := applyATAAttributes(&s, &info)
	// A missing smart_status must not read as "self-assessment failed".
	health.hasOverall = smartctlHasVerdict(out)
	health.overallPassed = info.SmartStatus.Passed

	if !health.hasOverall && len(info.AtaSmartAttributes.Table) == 0 {
		// Nothing to base a health statement on.
		status := statusUnknown
		s.Status = &status
		return s
	}
	setStorageHealthStatus(&s, health)
	return s
}

// smartctlOutputUsable reports whether the output of a smartctl run that ended
// with err is still worth decoding: the process must have exited on its own,
// produced output, and have neither exit bit 0 (command line did not parse)
// nor bit 1 (device open failed) set.
func smartctlOutputUsable(err error, out []byte) bool {
	// exec's Output returns *exec.ExitError unwrapped, so a plain type
	// assertion is enough here.
	exitErr, ok := err.(*exec.ExitError)
	if !ok || len(out) == 0 {
		return false
	}
	const fatalBits = 1<<0 | 1<<1
	// ExitCode is -1 for a signalled process, which has both bits set.
	return exitErr.ExitCode()&fatalBits == 0
}

// smartctlHasVerdict reports whether the smartctl JSON in out contains an
// explicit smart_status.passed value.
func smartctlHasVerdict(out []byte) bool {
	var probe struct {
		SmartStatus struct {
			Passed *bool `json:"passed"`
		} `json:"smart_status"`
	}
	if err := json.Unmarshal(out, &probe); err != nil {
		return false
	}
	return probe.SmartStatus.Passed != nil
}

// applyATAAttributes copies the raw values of the ATA SMART attributes this
// collector tracks (5, 177, 197, 198, 231, 241) into s and returns the
// sector/wear inputs for the health evaluation.
func applyATAAttributes(s *schema.HardwareStorage, info *smartctlInfo) storageHealthStatus {
	var health storageHealthStatus
	for _, attr := range info.AtaSmartAttributes.Table {
		raw := attr.Raw.Value
		switch attr.ID {
		case 5:
			health.reallocatedSectors = raw
			s.ReallocatedSectors = &raw
		case 177:
			pct := float64(raw)
			s.LifeUsedPct = &pct
		case 231:
			health.lifeRemainingPct = raw
			pct := float64(raw)
			s.LifeRemainingPct = &pct
		case 241:
			s.WrittenBytes = &raw
		case 197:
			health.pendingSectors = raw
			s.CurrentPendingSectors = &raw
		case 198:
			health.offlineUncorrectable = raw
			s.OfflineUncorrectable = &raw
		}
	}
	return health
}
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
// It is decoded by enrichWithNVMe; counters are only copied into the storage
// record when they are greater than zero.
type nvmeSmartLog struct {
// CriticalWarning: any value above zero marks the device as critical.
CriticalWarning int `json:"critical_warning"`
// PercentageUsed is reported as life used; life remaining is derived as
// 100 minus this value.
PercentageUsed int `json:"percentage_used"`
// AvailableSpare and SpareThreshold are compared to raise a warning when the
// spare is at or below the threshold (both must be non-zero).
AvailableSpare int `json:"available_spare"`
SpareThreshold int `json:"spare_thresh"`
// Temperature has 273 subtracted before being reported in °C, i.e. the raw
// value is treated as Kelvin.
Temperature int64 `json:"temperature"`
PowerOnHours int64 `json:"power_on_hours"`
PowerCycles int64 `json:"power_cycles"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
// DataUnitsRead and DataUnitsWritten are converted to bytes with
// nvmeDataUnitsToBytes.
DataUnitsRead int64 `json:"data_units_read"`
DataUnitsWritten int64 `json:"data_units_written"`
// ControllerBusy is decoded but not read by the code in this file.
ControllerBusy int64 `json:"controller_busy_time"`
MediaErrors int64 `json:"media_errors"`
NumErrLogEntries int64 `json:"num_err_log_entries"`
}
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
// enrichWithNVMe uses these values to override what discovery reported.
type nvmeIDCtrl struct {
ModelNumber string `json:"mn"`
SerialNumber string `json:"sn"`
FirmwareRev string `json:"fr"`
// TotalCapacity is divided by 1e9 to obtain the size in gigabytes when it
// is greater than zero.
TotalCapacity int64 `json:"tnvmcap"`
}
// enrichWithNVMe builds the storage record for an NVMe device.
//
// Model, serial and size are seeded from the discovery record, then overridden
// by `nvme id-ctrl` when that command succeeds. Wear and error telemetry come
// from `nvme smart-log`; when the smart-log cannot be read or decoded the
// record's status is statusUnknown, otherwise it is whatever
// setStorageHealthStatus derives from the log.
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
	present := true
	devType := "NVMe"
	iface := "NVMe"
	status := statusOK
	s := schema.HardwareStorage{
		HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status},
		Present:                 &present,
		Type:                    &devType,
		Interface:               &iface,
	}
	devPath := "/dev/" + dev.Name

	if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
		s.Model = &v
	}
	if v := cleanDMIValue(strings.TrimSpace(dev.Serial)); v != "" {
		s.SerialNumber = &v
	}
	if size := parseStorageBytes(dev.Size); size > 0 {
		gb := int(size / 1_000_000_000)
		if gb > 0 {
			s.SizeGB = &gb
		}
	}

	applyNVMeIDCtrl(&s, devPath)
	if applyNVMeSmartLog(&s, devPath) {
		return s
	}

	status = statusUnknown
	s.Status = &status
	return s
}

// applyNVMeIDCtrl overrides model, serial, firmware and size in s with the
// values from `nvme id-ctrl`. It is best effort: if the command fails or its
// output cannot be decoded, s is left untouched.
func applyNVMeIDCtrl(s *schema.HardwareStorage, devPath string) {
	out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output()
	if err != nil {
		return
	}
	var ctrl nvmeIDCtrl
	if json.Unmarshal(out, &ctrl) != nil {
		return
	}
	if v := cleanDMIValue(strings.TrimSpace(ctrl.ModelNumber)); v != "" {
		s.Model = &v
	}
	if v := cleanDMIValue(strings.TrimSpace(ctrl.SerialNumber)); v != "" {
		s.SerialNumber = &v
	}
	if v := cleanDMIValue(strings.TrimSpace(ctrl.FirmwareRev)); v != "" {
		s.Firmware = &v
	}
	if ctrl.TotalCapacity > 0 {
		gb := int(ctrl.TotalCapacity / 1_000_000_000)
		s.SizeGB = &gb
	}
}

// applyNVMeSmartLog copies the wear telemetry from `nvme smart-log` into s and
// sets its health status. It returns false, leaving s untouched, when the
// command fails or its output cannot be decoded.
func applyNVMeSmartLog(s *schema.HardwareStorage, devPath string) bool {
	out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output()
	if err != nil {
		return false
	}
	var smart nvmeSmartLog
	if json.Unmarshal(out, &smart) != nil {
		return false
	}

	if smart.PowerOnHours > 0 {
		s.PowerOnHours = &smart.PowerOnHours
	}
	if smart.PowerCycles > 0 {
		s.PowerCycles = &smart.PowerCycles
	}
	if smart.UnsafeShutdowns > 0 {
		s.UnsafeShutdowns = &smart.UnsafeShutdowns
	}
	if smart.PercentageUsed > 0 {
		used := float64(smart.PercentageUsed)
		s.LifeUsedPct = &used
		// percentage_used may exceed 100 on a drive past its rated
		// endurance; remaining life must not go negative.
		remaining := 100 - used
		if remaining < 0 {
			remaining = 0
		}
		s.LifeRemainingPct = &remaining
	}
	if smart.DataUnitsWritten > 0 {
		v := nvmeDataUnitsToBytes(smart.DataUnitsWritten)
		s.WrittenBytes = &v
	}
	if smart.DataUnitsRead > 0 {
		v := nvmeDataUnitsToBytes(smart.DataUnitsRead)
		s.ReadBytes = &v
	}
	if smart.AvailableSpare > 0 {
		v := float64(smart.AvailableSpare)
		s.AvailableSparePct = &v
	}
	if smart.MediaErrors > 0 {
		s.MediaErrors = &smart.MediaErrors
	}
	if smart.NumErrLogEntries > 0 {
		s.ErrorLogEntries = &smart.NumErrLogEntries
	}
	if smart.Temperature > 0 {
		// The raw value is treated as Kelvin.
		v := float64(smart.Temperature - 273)
		s.TemperatureC = &v
	}

	setStorageHealthStatus(s, storageHealthStatus{
		criticalWarning: smart.CriticalWarning,
		percentageUsed:  int64(smart.PercentageUsed),
		availableSpare:  int64(smart.AvailableSpare),
		spareThreshold:  int64(smart.SpareThreshold),
		unsafeShutdowns: smart.UnsafeShutdowns,
		mediaErrors:     smart.MediaErrors,
		errorLogEntries: smart.NumErrLogEntries,
	})
	return true
}
// parseStorageBytes interprets raw as a base-10 byte count, ignoring
// surrounding whitespace. It returns 0 for anything that is not a positive
// integer fitting in int64.
func parseStorageBytes(raw string) int64 {
	trimmed := strings.TrimSpace(raw)
	n, err := strconv.ParseInt(trimmed, 10, 64)
	if err != nil || n <= 0 {
		return 0
	}
	return n
}
// nvmeDataUnitsToBytes converts an NVMe smart-log data-unit counter to bytes.
// One data unit is 1000 blocks of 512 bytes. Non-positive counters yield 0,
// and counters too large to express in int64 bytes saturate at the maximum
// int64 instead of wrapping around to a meaningless (possibly negative) value.
func nvmeDataUnitsToBytes(units int64) int64 {
	const (
		bytesPerUnit = 1000 * 512
		maxInt64     = 1<<63 - 1
	)
	if units <= 0 {
		return 0
	}
	if units > maxInt64/bytesPerUnit {
		return maxInt64
	}
	return units * bytesPerUnit
}
// storageHealthStatus carries the health inputs that setStorageHealthStatus
// evaluates. enrichWithSmartctl fills the overall/ATA fields and
// enrichWithNVMe fills the NVMe fields; numeric fields left at zero do not
// trigger any finding.
type storageHealthStatus struct {
// hasOverall says whether overallPassed is meaningful: a false
// overallPassed only counts as a failure when hasOverall is set.
hasOverall bool
overallPassed bool
// Raw values of ATA SMART attributes 5, 197 and 198.
reallocatedSectors int64
pendingSectors int64
offlineUncorrectable int64
// lifeRemainingPct is the raw value of ATA attribute 231; it only raises a
// warning in the range 1..10, so zero behaves as "not reported".
lifeRemainingPct int64
// Values taken from the NVMe smart-log.
criticalWarning int
percentageUsed int64
availableSpare int64
spareThreshold int64
unsafeShutdowns int64
mediaErrors int64
errorLogEntries int64
}
// setStorageHealthStatus derives the status and error description of s from
// the collected health inputs. The checks are ordered from most to least
// severe and only the first one that applies is reported; when none applies
// the status is statusOK and the description is cleared.
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
	findings := []struct {
		hit      bool
		critical bool // false means warning
		reason   string
	}{
		{health.hasOverall && !health.overallPassed, true, "SMART overall self-assessment failed"},
		{health.criticalWarning > 0, true, "NVMe critical warning is set"},
		{health.pendingSectors > 0 || health.offlineUncorrectable > 0, true, "Pending or offline uncorrectable sectors detected"},
		{health.mediaErrors > 0, false, "Media errors reported"},
		{health.reallocatedSectors > 0, false, "Reallocated sectors detected"},
		{health.errorLogEntries > 0, false, "Device error log contains entries"},
		{health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10, false, "Life remaining is low"},
		{health.percentageUsed >= 95, false, "Drive wear level is high"},
		{health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold, false, "Available spare is at or below threshold"},
		{health.unsafeShutdowns > 100, false, "Unsafe shutdown count is high"},
	}

	status := statusOK
	var description *string
	for _, f := range findings {
		if !f.hit {
			continue
		}
		if f.critical {
			status = statusCritical
		} else {
			status = statusWarning
		}
		description = stringPtr(f.reason)
		break
	}

	s.Status = &status
	s.ErrorDescription = description
}
// stringPtr returns a pointer to a fresh copy of value, for populating
// optional string fields.
func stringPtr(value string) *string {
	p := new(string)
	*p = value
	return p
}