Files
bee/audit/internal/collector/storage_health_test.go
Michael Chus e169a7722c Fix NVMe SMART status always Unknown; fix GPU count including NVSwitches
nvme-cli emits smart-log counters as JSON strings and uses field names
avail_spare / percent_used instead of the prose names in the NVMe spec.
The nvmeSmartLog struct had int64 fields with wrong JSON tags — Unmarshal
returned an error and the whole health block was skipped, leaving every
NVMe drive with status=Unknown.

Fix: switch all numeric fields to jsonInt64 (already used for lsblk
block sizes) which accepts both bare numbers and quoted strings, and
correct the avail_spare / percent_used tag names.

Also fix validateIsVendorGPU for NVIDIA: previously counted any NVIDIA
PCIe device (including NVSwitch bridges) as a GPU, producing wrong
estimates (12 instead of 8 on an HGX H100 system). Now requires
device_class to be videocontroller or processingaccelerator, matching
the existing AMD filter logic.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-04 18:06:32 +03:00

118 lines
2.9 KiB
Go

package collector
import (
"encoding/json"
"testing"
"bee/audit/internal/schema"
)
// TestNVMeSmartLogUnmarshal decodes a smart-log document shaped like nvme-cli
// JSON output into nvmeSmartLog and checks the resulting field values. The
// sample uses the keys "avail_spare" and "percent_used" and carries several
// counters as quoted strings rather than bare numbers.
func TestNVMeSmartLogUnmarshal(t *testing.T) {
	t.Parallel()

	// Sample payload: critical_warning, temperature and the spare/wear fields
	// are bare numbers, while the remaining counters are quoted strings.
	const payload = `{
"critical_warning": 0,
"temperature": 310,
"avail_spare": 100,
"spare_thresh": 5,
"percent_used": 0,
"data_units_read": "10925415",
"data_units_written": "8497672",
"controller_busy_time": "305",
"power_cycles": "53",
"power_on_hours": "49",
"unsafe_shutdowns": "22",
"media_errors": "0",
"num_err_log_entries": "0"
}`

	var smart nvmeSmartLog
	err := json.Unmarshal([]byte(payload), &smart)
	if err != nil {
		// Nothing below is meaningful if decoding failed.
		t.Fatalf("json.Unmarshal failed: %v", err)
	}

	// Each check uses Errorf so that one wrong field does not hide the others.
	if got := smart.PowerOnHours; got != 49 {
		t.Errorf("PowerOnHours=%d want 49", got)
	}
	if got := smart.PowerCycles; got != 53 {
		t.Errorf("PowerCycles=%d want 53", got)
	}
	if got := smart.AvailableSpare; got != 100 {
		t.Errorf("AvailableSpare=%d want 100", got)
	}
	if got := smart.SpareThreshold; got != 5 {
		t.Errorf("SpareThreshold=%d want 5", got)
	}
	if got := smart.PercentageUsed; got != 0 {
		t.Errorf("PercentageUsed=%d want 0", got)
	}
	if got := smart.Temperature; got != 310 {
		t.Errorf("Temperature=%d want 310", got)
	}
	if got := smart.MediaErrors; got != 0 {
		t.Errorf("MediaErrors=%d want 0", got)
	}
	if got := smart.UnsafeShutdowns; got != 22 {
		t.Errorf("UnsafeShutdowns=%d want 22", got)
	}
}
// TestSetStorageHealthStatus runs setStorageHealthStatus against a table of
// storageHealthStatus inputs and checks the status string it stores on a
// schema.HardwareStorage. Each table entry runs as its own subtest.
func TestSetStorageHealthStatus(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name   string              // subtest name
		health storageHealthStatus // input passed to setStorageHealthStatus
		want   string              // expected value of *disk.Status
	}{
		{
			name:   "smart overall failed",
			health: storageHealthStatus{hasOverall: true, overallPassed: false},
			want:   statusCritical,
		},
		{
			name:   "nvme critical warning",
			health: storageHealthStatus{criticalWarning: 1},
			want:   statusCritical,
		},
		{
			name:   "pending sectors",
			health: storageHealthStatus{pendingSectors: 1},
			want:   statusCritical,
		},
		{
			name:   "media errors warning",
			health: storageHealthStatus{mediaErrors: 2},
			want:   statusWarning,
		},
		{
			name:   "reallocated warning",
			health: storageHealthStatus{reallocatedSectors: 1},
			want:   statusWarning,
		},
		{
			name:   "life remaining low",
			health: storageHealthStatus{lifeRemainingPct: 8},
			want:   statusWarning,
		},
		{
			name:   "healthy",
			health: storageHealthStatus{},
			want:   statusOK,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			var disk schema.HardwareStorage
			setStorageHealthStatus(&disk, tt.health)

			// disk.Status is a pointer: formatting it directly would print an
			// address. Report nil explicitly and dereference otherwise so a
			// failure shows the actual status text.
			if disk.Status == nil {
				t.Fatalf("status=<nil> want %q", tt.want)
			}
			if got := *disk.Status; got != tt.want {
				t.Fatalf("status=%q want %q", got, tt.want)
			}
		})
	}
}