From e169a7722c794404a1b105e4e6c7f36faac03fab Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Thu, 4 Jun 2026 18:06:32 +0300 Subject: [PATCH] Fix NVMe SMART status always Unknown; fix GPU count including NVSwitches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nvme-cli emits smart-log counters as JSON strings and uses field names avail_spare / percent_used instead of the prose names in the NVMe spec. The nvmeSmartLog struct had int64 fields with wrong JSON tags — Unmarshal returned an error and the whole health block was skipped, leaving every NVMe drive with status=Unknown. Fix: switch all numeric fields to jsonInt64 (already used for lsblk block sizes) which accepts both bare numbers and quoted strings, and correct the avail_spare / percent_used tag names. Also fix validateIsVendorGPU for NVIDIA: previously counted any NVIDIA PCIe device (including NVSwitch bridges) as a GPU, producing wrong estimates (12 instead of 8 on an HGX H100 system). Now requires device_class to be videocontroller or processingaccelerator, matching the existing AMD filter logic. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/collector/storage.go | 56 +++++++++++-------- .../internal/collector/storage_health_test.go | 54 ++++++++++++++++++ audit/internal/webui/page_validate.go | 4 +- 3 files changed, 89 insertions(+), 25 deletions(-) diff --git a/audit/internal/collector/storage.go b/audit/internal/collector/storage.go index f89bc2c..5336a03 100644 --- a/audit/internal/collector/storage.go +++ b/audit/internal/collector/storage.go @@ -413,20 +413,23 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage { } // nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about. +// nvme-cli emits most counters as JSON strings (e.g. "power_on_hours":"49"), +// so all numeric fields use jsonInt64 which accepts both bare numbers and +// quoted strings. Field names match nvme-cli JSON output, not NVMe spec prose. type nvmeSmartLog struct { - CriticalWarning int `json:"critical_warning"` - PercentageUsed int `json:"percentage_used"` - AvailableSpare int `json:"available_spare"` - SpareThreshold int `json:"spare_thresh"` - Temperature int64 `json:"temperature"` - PowerOnHours int64 `json:"power_on_hours"` - PowerCycles int64 `json:"power_cycles"` - UnsafeShutdowns int64 `json:"unsafe_shutdowns"` - DataUnitsRead int64 `json:"data_units_read"` - DataUnitsWritten int64 `json:"data_units_written"` - ControllerBusy int64 `json:"controller_busy_time"` - MediaErrors int64 `json:"media_errors"` - NumErrLogEntries int64 `json:"num_err_log_entries"` + CriticalWarning jsonInt64 `json:"critical_warning"` + PercentageUsed jsonInt64 `json:"percent_used"` + AvailableSpare jsonInt64 `json:"avail_spare"` + SpareThreshold jsonInt64 `json:"spare_thresh"` + Temperature jsonInt64 `json:"temperature"` + PowerOnHours jsonInt64 `json:"power_on_hours"` + PowerCycles jsonInt64 `json:"power_cycles"` + UnsafeShutdowns jsonInt64 `json:"unsafe_shutdowns"` + DataUnitsRead jsonInt64 `json:"data_units_read"` + DataUnitsWritten jsonInt64 `json:"data_units_written"` + ControllerBusy jsonInt64 `json:"controller_busy_time"` + MediaErrors jsonInt64 `json:"media_errors"` + NumErrLogEntries jsonInt64 `json:"num_err_log_entries"` } // nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output. @@ -491,13 +494,16 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage { var log nvmeSmartLog if json.Unmarshal(out, &log) == nil { if log.PowerOnHours > 0 { - s.PowerOnHours = &log.PowerOnHours + v := int64(log.PowerOnHours) + s.PowerOnHours = &v } if log.PowerCycles > 0 { - s.PowerCycles = &log.PowerCycles + v := int64(log.PowerCycles) + s.PowerCycles = &v } if log.UnsafeShutdowns > 0 { - s.UnsafeShutdowns = &log.UnsafeShutdowns + v := int64(log.UnsafeShutdowns) + s.UnsafeShutdowns = &v } if log.PercentageUsed > 0 { v := float64(log.PercentageUsed) @@ -506,11 +512,11 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage { s.LifeRemainingPct = &remaining } if log.DataUnitsWritten > 0 { - v := nvmeDataUnitsToBytes(log.DataUnitsWritten) + v := nvmeDataUnitsToBytes(int64(log.DataUnitsWritten)) s.WrittenBytes = &v } if log.DataUnitsRead > 0 { - v := nvmeDataUnitsToBytes(log.DataUnitsRead) + v := nvmeDataUnitsToBytes(int64(log.DataUnitsRead)) s.ReadBytes = &v } if log.AvailableSpare > 0 { @@ -518,23 +524,25 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage { s.AvailableSparePct = &v } if log.MediaErrors > 0 { - s.MediaErrors = &log.MediaErrors + v := int64(log.MediaErrors) + s.MediaErrors = &v } if log.NumErrLogEntries > 0 { - s.ErrorLogEntries = &log.NumErrLogEntries + v := int64(log.NumErrLogEntries) + s.ErrorLogEntries = &v } if log.Temperature > 0 { v := float64(log.Temperature - 273) s.TemperatureC = &v } setStorageHealthStatus(&s, storageHealthStatus{ - criticalWarning: log.CriticalWarning, + criticalWarning: int(log.CriticalWarning), percentageUsed: int64(log.PercentageUsed), availableSpare: int64(log.AvailableSpare), spareThreshold: int64(log.SpareThreshold), - unsafeShutdowns: log.UnsafeShutdowns, - mediaErrors: log.MediaErrors, - errorLogEntries: log.NumErrLogEntries, + unsafeShutdowns: int64(log.UnsafeShutdowns), + mediaErrors: int64(log.MediaErrors), + errorLogEntries: int64(log.NumErrLogEntries), }) return s } diff --git a/audit/internal/collector/storage_health_test.go b/audit/internal/collector/storage_health_test.go index a056fa2..c8dfc31 100644 --- a/audit/internal/collector/storage_health_test.go +++ b/audit/internal/collector/storage_health_test.go @@ -1,11 +1,65 @@ package collector import ( + "encoding/json" "testing" "bee/audit/internal/schema" ) +// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most +// counters are quoted strings and field names differ from NVMe spec prose) +// is correctly parsed into nvmeSmartLog. +func TestNVMeSmartLogUnmarshal(t *testing.T) { + t.Parallel() + + // Real nvme-cli output: counters are JSON strings, spare is "avail_spare", + // percentage used is "percent_used". + raw := `{ + "critical_warning": 0, + "temperature": 310, + "avail_spare": 100, + "spare_thresh": 5, + "percent_used": 0, + "data_units_read": "10925415", + "data_units_written": "8497672", + "controller_busy_time": "305", + "power_cycles": "53", + "power_on_hours": "49", + "unsafe_shutdowns": "22", + "media_errors": "0", + "num_err_log_entries": "0" + }` + var log nvmeSmartLog + if err := json.Unmarshal([]byte(raw), &log); err != nil { + t.Fatalf("json.Unmarshal failed: %v", err) + } + if log.PowerOnHours != 49 { + t.Errorf("PowerOnHours=%d want 49", log.PowerOnHours) + } + if log.PowerCycles != 53 { + t.Errorf("PowerCycles=%d want 53", log.PowerCycles) + } + if log.AvailableSpare != 100 { + t.Errorf("AvailableSpare=%d want 100", log.AvailableSpare) + } + if log.SpareThreshold != 5 { + t.Errorf("SpareThreshold=%d want 5", log.SpareThreshold) + } + if log.PercentageUsed != 0 { + t.Errorf("PercentageUsed=%d want 0", log.PercentageUsed) + } + if log.Temperature != 310 { + t.Errorf("Temperature=%d want 310", log.Temperature) + } + if log.MediaErrors != 0 { + t.Errorf("MediaErrors=%d want 0", log.MediaErrors) + } + if log.UnsafeShutdowns != 22 { + t.Errorf("UnsafeShutdowns=%d want 22", log.UnsafeShutdowns) + } +} + func TestSetStorageHealthStatus(t *testing.T) { t.Parallel() diff --git a/audit/internal/webui/page_validate.go b/audit/internal/webui/page_validate.go index 4c2ac74..90fcace 100644 --- a/audit/internal/webui/page_validate.go +++ b/audit/internal/webui/page_validate.go @@ -642,7 +642,9 @@ func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool { } switch vendor { case "nvidia": - return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia") + isNVIDIAVendor := strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia") + isGPUClass := class == "videocontroller" || class == "processingaccelerator" || class == "displaycontroller" + return isNVIDIAVendor && isGPUClass case "amd": isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller" isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")