nvme-cli emits smart-log counters as JSON strings and uses field names avail_spare / percent_used instead of the prose names in the NVMe spec. The nvmeSmartLog struct had int64 fields with wrong JSON tags — Unmarshal returned an error and the whole health block was skipped, leaving every NVMe drive with status=Unknown. Fix: switch all numeric fields to jsonInt64 (already used for lsblk block sizes) which accepts both bare numbers and quoted strings, and correct the avail_spare / percent_used tag names. Also fix validateIsVendorGPU for NVIDIA: previously counted any NVIDIA PCIe device (including NVSwitch bridges) as a GPU, producing wrong estimates (12 instead of 8 on an HGX H100 system). Now requires device_class to be videocontroller or processingaccelerator, matching the existing AMD filter logic. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
118 lines
2.9 KiB
Go
118 lines
2.9 KiB
Go
package collector
|
|
|
|
import (
|
|
"encoding/json"
|
|
"testing"
|
|
|
|
"bee/audit/internal/schema"
|
|
)
|
|
|
|
// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most
|
|
// counters are quoted strings and field names differ from NVMe spec prose)
|
|
// is correctly parsed into nvmeSmartLog.
|
|
func TestNVMeSmartLogUnmarshal(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
// Real nvme-cli output: counters are JSON strings, spare is "avail_spare",
|
|
// percentage used is "percent_used".
|
|
raw := `{
|
|
"critical_warning": 0,
|
|
"temperature": 310,
|
|
"avail_spare": 100,
|
|
"spare_thresh": 5,
|
|
"percent_used": 0,
|
|
"data_units_read": "10925415",
|
|
"data_units_written": "8497672",
|
|
"controller_busy_time": "305",
|
|
"power_cycles": "53",
|
|
"power_on_hours": "49",
|
|
"unsafe_shutdowns": "22",
|
|
"media_errors": "0",
|
|
"num_err_log_entries": "0"
|
|
}`
|
|
var log nvmeSmartLog
|
|
if err := json.Unmarshal([]byte(raw), &log); err != nil {
|
|
t.Fatalf("json.Unmarshal failed: %v", err)
|
|
}
|
|
if log.PowerOnHours != 49 {
|
|
t.Errorf("PowerOnHours=%d want 49", log.PowerOnHours)
|
|
}
|
|
if log.PowerCycles != 53 {
|
|
t.Errorf("PowerCycles=%d want 53", log.PowerCycles)
|
|
}
|
|
if log.AvailableSpare != 100 {
|
|
t.Errorf("AvailableSpare=%d want 100", log.AvailableSpare)
|
|
}
|
|
if log.SpareThreshold != 5 {
|
|
t.Errorf("SpareThreshold=%d want 5", log.SpareThreshold)
|
|
}
|
|
if log.PercentageUsed != 0 {
|
|
t.Errorf("PercentageUsed=%d want 0", log.PercentageUsed)
|
|
}
|
|
if log.Temperature != 310 {
|
|
t.Errorf("Temperature=%d want 310", log.Temperature)
|
|
}
|
|
if log.MediaErrors != 0 {
|
|
t.Errorf("MediaErrors=%d want 0", log.MediaErrors)
|
|
}
|
|
if log.UnsafeShutdowns != 22 {
|
|
t.Errorf("UnsafeShutdowns=%d want 22", log.UnsafeShutdowns)
|
|
}
|
|
}
|
|
|
|
func TestSetStorageHealthStatus(t *testing.T) {
|
|
t.Parallel()
|
|
|
|
tests := []struct {
|
|
name string
|
|
health storageHealthStatus
|
|
want string
|
|
}{
|
|
{
|
|
name: "smart overall failed",
|
|
health: storageHealthStatus{hasOverall: true, overallPassed: false},
|
|
want: statusCritical,
|
|
},
|
|
{
|
|
name: "nvme critical warning",
|
|
health: storageHealthStatus{criticalWarning: 1},
|
|
want: statusCritical,
|
|
},
|
|
{
|
|
name: "pending sectors",
|
|
health: storageHealthStatus{pendingSectors: 1},
|
|
want: statusCritical,
|
|
},
|
|
{
|
|
name: "media errors warning",
|
|
health: storageHealthStatus{mediaErrors: 2},
|
|
want: statusWarning,
|
|
},
|
|
{
|
|
name: "reallocated warning",
|
|
health: storageHealthStatus{reallocatedSectors: 1},
|
|
want: statusWarning,
|
|
},
|
|
{
|
|
name: "life remaining low",
|
|
health: storageHealthStatus{lifeRemainingPct: 8},
|
|
want: statusWarning,
|
|
},
|
|
{
|
|
name: "healthy",
|
|
health: storageHealthStatus{},
|
|
want: statusOK,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
var disk schema.HardwareStorage
|
|
setStorageHealthStatus(&disk, tt.health)
|
|
if disk.Status == nil || *disk.Status != tt.want {
|
|
t.Fatalf("status=%v want %q", disk.Status, tt.want)
|
|
}
|
|
})
|
|
}
|
|
}
|