Add health verdicts and acceptance tests

This commit is contained in:
Mikhail Chusavitin
2026-03-14 17:53:58 +03:00
parent 17f0bda45e
commit b483e2ce35
28 changed files with 1688 additions and 82 deletions

View File

@@ -67,6 +67,9 @@ type smartctlInfo struct {
SerialNumber string `json:"serial_number"`
FirmwareVer string `json:"firmware_version"`
RotationRate int `json:"rotation_rate"`
SmartStatus struct {
Passed bool `json:"passed"`
} `json:"smart_status"`
UserCapacity struct {
Bytes int64 `json:"bytes"`
} `json:"user_capacity"`
@@ -127,7 +130,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
return s
}
var info smartctlInfo
var info smartctlInfo
if err := json.Unmarshal(out, &info); err == nil {
if v := cleanDMIValue(info.ModelName); v != "" {
s.Model = &v
@@ -158,37 +161,65 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
if info.PowerCycleCount > 0 {
tel["power_cycles"] = info.PowerCycleCount
}
reallocated := int64(0)
pending := int64(0)
uncorrectable := int64(0)
lifeRemaining := int64(0)
for _, attr := range info.AtaSmartAttributes.Table {
switch attr.ID {
case 5:
reallocated = attr.Raw.Value
tel["reallocated_sectors"] = attr.Raw.Value
case 177:
tel["wear_leveling_pct"] = attr.Raw.Value
case 231:
lifeRemaining = attr.Raw.Value
tel["life_remaining_pct"] = attr.Raw.Value
case 241:
tel["total_lba_written"] = attr.Raw.Value
case 197:
pending = attr.Raw.Value
tel["current_pending_sectors"] = attr.Raw.Value
case 198:
uncorrectable = attr.Raw.Value
tel["offline_uncorrectable"] = attr.Raw.Value
}
}
if len(tel) > 0 {
s.Telemetry = tel
}
status := storageHealthStatus{
overallPassed: info.SmartStatus.Passed,
hasOverall: true,
reallocatedSectors: reallocated,
pendingSectors: pending,
offlineUncorrectable: uncorrectable,
lifeRemainingPct: lifeRemaining,
}
setStorageHealthStatus(&s, status)
return s
}
s.Type = &devType
status := "OK"
status := "UNKNOWN"
s.Status = &status
return s
}
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
type nvmeSmartLog struct {
PercentageUsed int `json:"percentage_used"`
PowerOnHours int64 `json:"power_on_hours"`
PowerCycles int64 `json:"power_cycles"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
DataUnitsWritten int64 `json:"data_units_written"`
ControllerBusy int64 `json:"controller_busy_time"`
CriticalWarning int `json:"critical_warning"`
PercentageUsed int `json:"percentage_used"`
AvailableSpare int `json:"available_spare"`
SpareThreshold int `json:"spare_thresh"`
PowerOnHours int64 `json:"power_on_hours"`
PowerCycles int64 `json:"power_cycles"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
DataUnitsWritten int64 `json:"data_units_written"`
ControllerBusy int64 `json:"controller_busy_time"`
MediaErrors int64 `json:"media_errors"`
NumErrLogEntries int64 `json:"num_err_log_entries"`
}
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
@@ -238,6 +269,9 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
var log nvmeSmartLog
if json.Unmarshal(out, &log) == nil {
tel := map[string]any{}
if log.CriticalWarning > 0 {
tel["critical_warning"] = log.CriticalWarning
}
if log.PowerOnHours > 0 {
tel["power_on_hours"] = log.PowerOnHours
}
@@ -256,11 +290,78 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
if log.ControllerBusy > 0 {
tel["controller_busy_time"] = log.ControllerBusy
}
if log.AvailableSpare > 0 {
tel["available_spare_pct"] = log.AvailableSpare
}
if log.SpareThreshold > 0 {
tel["available_spare_threshold_pct"] = log.SpareThreshold
}
if log.MediaErrors > 0 {
tel["media_errors"] = log.MediaErrors
}
if log.NumErrLogEntries > 0 {
tel["error_log_entries"] = log.NumErrLogEntries
}
if len(tel) > 0 {
s.Telemetry = tel
}
setStorageHealthStatus(&s, storageHealthStatus{
criticalWarning: log.CriticalWarning,
percentageUsed: int64(log.PercentageUsed),
availableSpare: int64(log.AvailableSpare),
spareThreshold: int64(log.SpareThreshold),
unsafeShutdowns: log.UnsafeShutdowns,
mediaErrors: log.MediaErrors,
errorLogEntries: log.NumErrLogEntries,
})
return s
}
}
status = "UNKNOWN"
s.Status = &status
return s
}
// storageHealthStatus gathers the raw health indicators that
// setStorageHealthStatus turns into a single status string. Fields that a
// given collector does not fill in stay at their zero value.
type storageHealthStatus struct {
	// hasOverall gates overallPassed: the overall self-assessment is only
	// consulted when hasOverall is true.
	hasOverall bool
	// overallPassed is the drive's own pass/fail self-assessment.
	overallPassed bool

	// Sector counters and remaining-life figure, filled from the ATA SMART
	// attribute table by the smartctl collector.
	reallocatedSectors   int64
	pendingSectors       int64
	offlineUncorrectable int64
	lifeRemainingPct     int64

	// Counters filled from the NVMe smart-log by the NVMe collector.
	criticalWarning int
	percentageUsed  int64
	availableSpare  int64
	spareThreshold  int64
	unsafeShutdowns int64
	mediaErrors     int64
	errorLogEntries int64
}
// setStorageHealthStatus reduces the indicators in health to one verdict and
// stores a pointer to it in s.Status.
//
// The verdict is "FAILED" when the overall self-assessment is present and not
// passed, when the critical warning value is non-zero, or when any pending or
// offline-uncorrectable sectors are reported. Otherwise it is "WARNING" when
// any softer indicator trips: media errors, reallocated sectors, error-log
// entries, remaining life in (0, 10], percentage used >= 95, available spare
// at or below its threshold (both non-zero), or more than 100 unsafe
// shutdowns. If nothing trips, the verdict is "OK".
func setStorageHealthStatus(s *schema.HardwareStorage, health storageHealthStatus) {
	// Hard failures take precedence over every warning-level indicator.
	selfTestFailed := health.hasOverall && !health.overallPassed
	sectorsAtRisk := health.pendingSectors > 0 || health.offlineUncorrectable > 0
	failed := selfTestFailed || health.criticalWarning > 0 || sectorsAtRisk

	// A zero lifeRemainingPct / availableSpare / spareThreshold is treated as
	// "no data" and never raises a warning by itself.
	lifeNearlyGone := health.lifeRemainingPct > 0 && health.lifeRemainingPct <= 10
	spareExhausted := health.availableSpare > 0 &&
		health.spareThreshold > 0 &&
		health.availableSpare <= health.spareThreshold
	degraded := health.mediaErrors > 0 ||
		health.reallocatedSectors > 0 ||
		health.errorLogEntries > 0 ||
		lifeNearlyGone ||
		health.percentageUsed >= 95 ||
		spareExhausted ||
		health.unsafeShutdowns > 100

	verdict := "OK"
	if failed {
		verdict = "FAILED"
	} else if degraded {
		verdict = "WARNING"
	}
	s.Status = &verdict
}