Add health verdicts and acceptance tests
This commit is contained in:
@@ -67,6 +67,9 @@ type smartctlInfo struct {
|
||||
SerialNumber string `json:"serial_number"`
|
||||
FirmwareVer string `json:"firmware_version"`
|
||||
RotationRate int `json:"rotation_rate"`
|
||||
SmartStatus struct {
|
||||
Passed bool `json:"passed"`
|
||||
} `json:"smart_status"`
|
||||
UserCapacity struct {
|
||||
Bytes int64 `json:"bytes"`
|
||||
} `json:"user_capacity"`
|
||||
@@ -127,7 +130,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
return s
|
||||
}
|
||||
|
||||
var info smartctlInfo
|
||||
var info smartctlInfo
|
||||
if err := json.Unmarshal(out, &info); err == nil {
|
||||
if v := cleanDMIValue(info.ModelName); v != "" {
|
||||
s.Model = &v
|
||||
@@ -158,37 +161,65 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
if info.PowerCycleCount > 0 {
|
||||
tel["power_cycles"] = info.PowerCycleCount
|
||||
}
|
||||
reallocated := int64(0)
|
||||
pending := int64(0)
|
||||
uncorrectable := int64(0)
|
||||
lifeRemaining := int64(0)
|
||||
for _, attr := range info.AtaSmartAttributes.Table {
|
||||
switch attr.ID {
|
||||
case 5:
|
||||
reallocated = attr.Raw.Value
|
||||
tel["reallocated_sectors"] = attr.Raw.Value
|
||||
case 177:
|
||||
tel["wear_leveling_pct"] = attr.Raw.Value
|
||||
case 231:
|
||||
lifeRemaining = attr.Raw.Value
|
||||
tel["life_remaining_pct"] = attr.Raw.Value
|
||||
case 241:
|
||||
tel["total_lba_written"] = attr.Raw.Value
|
||||
case 197:
|
||||
pending = attr.Raw.Value
|
||||
tel["current_pending_sectors"] = attr.Raw.Value
|
||||
case 198:
|
||||
uncorrectable = attr.Raw.Value
|
||||
tel["offline_uncorrectable"] = attr.Raw.Value
|
||||
}
|
||||
}
|
||||
if len(tel) > 0 {
|
||||
s.Telemetry = tel
|
||||
}
|
||||
|
||||
status := storageHealthStatus{
|
||||
overallPassed: info.SmartStatus.Passed,
|
||||
hasOverall: true,
|
||||
reallocatedSectors: reallocated,
|
||||
pendingSectors: pending,
|
||||
offlineUncorrectable: uncorrectable,
|
||||
lifeRemainingPct: lifeRemaining,
|
||||
}
|
||||
setStorageHealthStatus(&s, status)
|
||||
return s
|
||||
}
|
||||
|
||||
s.Type = &devType
|
||||
status := "OK"
|
||||
status := "UNKNOWN"
|
||||
s.Status = &status
|
||||
return s
|
||||
}
|
||||
|
||||
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
//
// Every field is declared exactly once; the JSON tags are the key names
// decoded from the command output. Counters are int64, while the warning
// bitmask and percentage-style values are plain ints.
type nvmeSmartLog struct {
	CriticalWarning  int   `json:"critical_warning"`
	PercentageUsed   int   `json:"percentage_used"`
	AvailableSpare   int   `json:"available_spare"`
	SpareThreshold   int   `json:"spare_thresh"`
	PowerOnHours     int64 `json:"power_on_hours"`
	PowerCycles      int64 `json:"power_cycles"`
	UnsafeShutdowns  int64 `json:"unsafe_shutdowns"`
	DataUnitsWritten int64 `json:"data_units_written"`
	ControllerBusy   int64 `json:"controller_busy_time"`
	MediaErrors      int64 `json:"media_errors"`
	NumErrLogEntries int64 `json:"num_err_log_entries"`
}
|
||||
|
||||
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
||||
@@ -238,6 +269,9 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
var log nvmeSmartLog
|
||||
if json.Unmarshal(out, &log) == nil {
|
||||
tel := map[string]any{}
|
||||
if log.CriticalWarning > 0 {
|
||||
tel["critical_warning"] = log.CriticalWarning
|
||||
}
|
||||
if log.PowerOnHours > 0 {
|
||||
tel["power_on_hours"] = log.PowerOnHours
|
||||
}
|
||||
@@ -256,11 +290,78 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
if log.ControllerBusy > 0 {
|
||||
tel["controller_busy_time"] = log.ControllerBusy
|
||||
}
|
||||
if log.AvailableSpare > 0 {
|
||||
tel["available_spare_pct"] = log.AvailableSpare
|
||||
}
|
||||
if log.SpareThreshold > 0 {
|
||||
tel["available_spare_threshold_pct"] = log.SpareThreshold
|
||||
}
|
||||
if log.MediaErrors > 0 {
|
||||
tel["media_errors"] = log.MediaErrors
|
||||
}
|
||||
if log.NumErrLogEntries > 0 {
|
||||
tel["error_log_entries"] = log.NumErrLogEntries
|
||||
}
|
||||
if len(tel) > 0 {
|
||||
s.Telemetry = tel
|
||||
}
|
||||
setStorageHealthStatus(&s, storageHealthStatus{
|
||||
criticalWarning: log.CriticalWarning,
|
||||
percentageUsed: int64(log.PercentageUsed),
|
||||
availableSpare: int64(log.AvailableSpare),
|
||||
spareThreshold: int64(log.SpareThreshold),
|
||||
unsafeShutdowns: log.UnsafeShutdowns,
|
||||
mediaErrors: log.MediaErrors,
|
||||
errorLogEntries: log.NumErrLogEntries,
|
||||
})
|
||||
return s
|
||||
}
|
||||
}
|
||||
|
||||
status = "UNKNOWN"
|
||||
s.Status = &status
|
||||
return s
|
||||
}
|
||||
|
||||
// storageHealthStatus collects the health indicators that
// setStorageHealthStatus evaluates to pick a drive's status string.
//
// The zero value carries no indicators: hasOverall is false and every counter
// is zero, so no failure or warning rule can match it.
type storageHealthStatus struct {
	// Overall SMART self-assessment. overallPassed is only consulted when
	// hasOverall is true.
	hasOverall    bool
	overallPassed bool

	// Sector counters and remaining-life percentage, populated from the
	// smartctl ATA attribute table.
	reallocatedSectors   int64
	pendingSectors       int64
	offlineUncorrectable int64
	lifeRemainingPct     int64

	// Values populated from the NVMe smart-log.
	criticalWarning int
	percentageUsed  int64
	availableSpare  int64
	spareThreshold  int64
	unsafeShutdowns int64
	mediaErrors     int64
	errorLogEntries int64
}
|
||||
|
||||
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
|
||||
status := "OK"
|
||||
switch {
|
||||
case health.hasOverall && !health.overallPassed:
|
||||
status = "FAILED"
|
||||
case health.criticalWarning > 0:
|
||||
status = "FAILED"
|
||||
case health.pendingSectors > 0 || health.offlineUncorrectable > 0:
|
||||
status = "FAILED"
|
||||
case health.mediaErrors > 0:
|
||||
status = "WARNING"
|
||||
case health.reallocatedSectors > 0:
|
||||
status = "WARNING"
|
||||
case health.errorLogEntries > 0:
|
||||
status = "WARNING"
|
||||
case health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10:
|
||||
status = "WARNING"
|
||||
case health.percentageUsed >= 95:
|
||||
status = "WARNING"
|
||||
case health.availableSpare > 0 && health.spareThreshold > 0 && health.availableSpare <= health.spareThreshold:
|
||||
status = "WARNING"
|
||||
case health.unsafeShutdowns > 100:
|
||||
status = "WARNING"
|
||||
}
|
||||
s.Status = &status
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user