feat: sync with hardware ingest contract v2.10

- PCIeDevice: add model, firmware, present, iommu_group, and telemetry fields
  (temperature_c, power_w, ecc_corrected_total, ecc_uncorrected_total,
  hw_slowdown); these fields were silently dropped on JSON parse, which broke
  the bee audit display
- buildDevicesFromLegacy: use pcie.Model as fallback (PartNumber > Model >
  Description), copy MACAddresses/Present/Firmware, propagate telemetry into
  Details so convertPCIeFromDevices picks them up
- Storage: add logical_block_size_bytes, physical_block_size_bytes,
  metadata_bytes_per_block (contract v2.10, 2026-04-29) to models, exporter
  struct and converter pipeline
- ReanimatorHardware: add platform_config map[string]any (contract v2.9)
- Update internal/chart submodule to v2.0 (contract v2.10 viewer support:
  event_logs section, platform_config section, storage block size columns)
- Update bible submodule

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Mikhail Chusavitin
2026-04-30 15:53:47 +03:00
parent 88e4e8dd49
commit 78d0e26fd0
5 changed files with 110 additions and 66 deletions

2
bible

Submodule bible updated: 52444350c1...d2600f1279

View File

@@ -159,6 +159,16 @@ func buildDevicesFromLegacy(hw *models.HardwareConfig) []models.HardwareDevice {
}
for _, stor := range hw.Storage {
present := stor.Present
storDetails := mergeDetailMaps(nil, stor.Details)
if stor.LogicalBlockSizeBytes != 0 {
storDetails = mergeDetailMaps(storDetails, map[string]any{"logical_block_size_bytes": stor.LogicalBlockSizeBytes})
}
if stor.PhysicalBlockSizeBytes != 0 {
storDetails = mergeDetailMaps(storDetails, map[string]any{"physical_block_size_bytes": stor.PhysicalBlockSizeBytes})
}
if stor.MetadataBytesPerBlock != 0 {
storDetails = mergeDetailMaps(storDetails, map[string]any{"metadata_bytes_per_block": stor.MetadataBytesPerBlock})
}
appendDevice(models.HardwareDevice{
Kind: models.DeviceKindStorage,
Slot: stor.Slot,
@@ -177,27 +187,38 @@ func buildDevicesFromLegacy(hw *models.HardwareConfig) []models.HardwareDevice {
StatusAtCollect: stor.StatusAtCollect,
StatusHistory: stor.StatusHistory,
ErrorDescription: stor.ErrorDescription,
Details: mergeDetailMaps(nil, stor.Details),
Details: storDetails,
})
}
for _, pcie := range hw.PCIeDevices {
// Use PartNumber as model when available; fall back to chip description.
// Description contains the chip/product name (e.g. "BCM57414 NetXtreme-E …")
// while PartNumber is a part/product code. Prefer PartNumber when set.
pcieModel := pcie.PartNumber
if pcieModel == "" {
pcieModel = pcie.Description
}
// Priority: PartNumber (vendor P/N) > Model (product name) > Description (chip label).
pcieModel := firstNonEmptyString(pcie.PartNumber, pcie.Model, pcie.Description)
details := mergeDetailMaps(nil, pcie.Details)
pcieFirmware := stringFromDetailMap(details, "firmware")
// Firmware: prefer direct field, fall back to details, then NVSwitch lookup.
pcieFirmware := firstNonEmptyString(pcie.Firmware, stringFromDetailMap(details, "firmware"))
if pcieFirmware == "" && isNVSwitchPCIeDevice(pcie) {
pcieFirmware = nvswitchFirmwareBySlot[normalizeNVSwitchSlotForLookup(pcie.Slot)]
if pcieFirmware != "" {
details = mergeDetailMaps(details, map[string]any{
"firmware": pcieFirmware,
})
}
}
if pcieFirmware != "" {
details = mergeDetailMaps(details, map[string]any{"firmware": pcieFirmware})
}
// Telemetry fields: put into details so convertPCIeFromDevices can pick them up.
if pcie.TemperatureC != nil {
details = mergeDetailMaps(details, map[string]any{"temperature_c": *pcie.TemperatureC})
}
if pcie.PowerW != nil {
details = mergeDetailMaps(details, map[string]any{"power_w": *pcie.PowerW})
}
if pcie.ECCCorrectedTotal != nil {
details = mergeDetailMaps(details, map[string]any{"ecc_corrected_total": *pcie.ECCCorrectedTotal})
}
if pcie.ECCUncorrectedTotal != nil {
details = mergeDetailMaps(details, map[string]any{"ecc_uncorrected_total": *pcie.ECCUncorrectedTotal})
}
if pcie.HWSlowdown != nil {
details = mergeDetailMaps(details, map[string]any{"hw_slowdown": *pcie.HWSlowdown})
}
present := pcie.Present
appendDevice(models.HardwareDevice{
Kind: models.DeviceKindPCIe,
Slot: pcie.Slot,
@@ -209,11 +230,13 @@ func buildDevicesFromLegacy(hw *models.HardwareConfig) []models.HardwareDevice {
PartNumber: pcie.PartNumber,
Manufacturer: pcie.Manufacturer,
SerialNumber: pcie.SerialNumber,
MACAddresses: append([]string(nil), pcie.MACAddresses...),
LinkWidth: pcie.LinkWidth,
LinkSpeed: pcie.LinkSpeed,
MaxLinkWidth: pcie.MaxLinkWidth,
MaxLinkSpeed: pcie.MaxLinkSpeed,
NUMANode: pcie.NUMANode,
Present: present,
Status: pcie.Status,
StatusCheckedAt: pcie.StatusCheckedAt,
StatusChangedAt: pcie.StatusChangedAt,
@@ -738,36 +761,39 @@ func convertStorageFromDevices(devices []models.HardwareDevice, collectedAt stri
meta := buildStatusMeta(status, d.StatusCheckedAt, d.StatusChangedAt, d.StatusHistory, d.ErrorDescription, collectedAt)
presentValue := present
result = append(result, ReanimatorStorage{
Slot: d.Slot,
Type: d.Type,
Model: d.Model,
SizeGB: d.SizeGB,
SerialNumber: d.SerialNumber,
Manufacturer: d.Manufacturer,
Firmware: d.Firmware,
Interface: d.Interface,
Present: &presentValue,
TemperatureC: floatFromDetailMap(d.Details, "temperature_c"),
PowerOnHours: int64FromDetailMap(d.Details, "power_on_hours"),
PowerCycles: int64FromDetailMap(d.Details, "power_cycles"),
UnsafeShutdowns: int64FromDetailMap(d.Details, "unsafe_shutdowns"),
MediaErrors: int64FromDetailMap(d.Details, "media_errors"),
ErrorLogEntries: int64FromDetailMap(d.Details, "error_log_entries"),
WrittenBytes: int64FromDetailMap(d.Details, "written_bytes"),
ReadBytes: int64FromDetailMap(d.Details, "read_bytes"),
LifeUsedPct: floatFromDetailMap(d.Details, "life_used_pct"),
RemainingEndurancePct: d.RemainingEndurancePct,
LifeRemainingPct: floatFromDetailMap(d.Details, "life_remaining_pct"),
AvailableSparePct: floatFromDetailMap(d.Details, "available_spare_pct"),
ReallocatedSectors: int64FromDetailMap(d.Details, "reallocated_sectors"),
CurrentPendingSectors: int64FromDetailMap(d.Details, "current_pending_sectors"),
OfflineUncorrectable: int64FromDetailMap(d.Details, "offline_uncorrectable"),
Status: status,
StatusCheckedAt: meta.StatusCheckedAt,
StatusChangedAt: meta.StatusChangedAt,
ManufacturedYearWeek: manufacturedYearWeekFromDetails(d.Details),
StatusHistory: meta.StatusHistory,
ErrorDescription: meta.ErrorDescription,
Slot: d.Slot,
Type: d.Type,
Model: d.Model,
SizeGB: d.SizeGB,
SerialNumber: d.SerialNumber,
Manufacturer: d.Manufacturer,
Firmware: d.Firmware,
Interface: d.Interface,
Present: &presentValue,
LogicalBlockSizeBytes: int64FromDetailMap(d.Details, "logical_block_size_bytes"),
PhysicalBlockSizeBytes: int64FromDetailMap(d.Details, "physical_block_size_bytes"),
MetadataBytesPerBlock: int64FromDetailMap(d.Details, "metadata_bytes_per_block"),
TemperatureC: floatFromDetailMap(d.Details, "temperature_c"),
PowerOnHours: int64FromDetailMap(d.Details, "power_on_hours"),
PowerCycles: int64FromDetailMap(d.Details, "power_cycles"),
UnsafeShutdowns: int64FromDetailMap(d.Details, "unsafe_shutdowns"),
MediaErrors: int64FromDetailMap(d.Details, "media_errors"),
ErrorLogEntries: int64FromDetailMap(d.Details, "error_log_entries"),
WrittenBytes: int64FromDetailMap(d.Details, "written_bytes"),
ReadBytes: int64FromDetailMap(d.Details, "read_bytes"),
LifeUsedPct: floatFromDetailMap(d.Details, "life_used_pct"),
RemainingEndurancePct: d.RemainingEndurancePct,
LifeRemainingPct: floatFromDetailMap(d.Details, "life_remaining_pct"),
AvailableSparePct: floatFromDetailMap(d.Details, "available_spare_pct"),
ReallocatedSectors: int64FromDetailMap(d.Details, "reallocated_sectors"),
CurrentPendingSectors: int64FromDetailMap(d.Details, "current_pending_sectors"),
OfflineUncorrectable: int64FromDetailMap(d.Details, "offline_uncorrectable"),
Status: status,
StatusCheckedAt: meta.StatusCheckedAt,
StatusChangedAt: meta.StatusChangedAt,
ManufacturedYearWeek: manufacturedYearWeekFromDetails(d.Details),
StatusHistory: meta.StatusHistory,
ErrorDescription: meta.ErrorDescription,
})
}
return result

View File

@@ -12,15 +12,16 @@ type ReanimatorExport struct {
// ReanimatorHardware contains all hardware components
type ReanimatorHardware struct {
Board ReanimatorBoard `json:"board"`
Firmware []ReanimatorFirmware `json:"firmware,omitempty"`
CPUs []ReanimatorCPU `json:"cpus,omitempty"`
Memory []ReanimatorMemory `json:"memory,omitempty"`
Storage []ReanimatorStorage `json:"storage,omitempty"`
PCIeDevices []ReanimatorPCIe `json:"pcie_devices,omitempty"`
PowerSupplies []ReanimatorPSU `json:"power_supplies,omitempty"`
Sensors *ReanimatorSensors `json:"sensors,omitempty"`
EventLogs []ReanimatorEventLog `json:"event_logs,omitempty"`
Board ReanimatorBoard `json:"board"`
Firmware []ReanimatorFirmware `json:"firmware,omitempty"`
CPUs []ReanimatorCPU `json:"cpus,omitempty"`
Memory []ReanimatorMemory `json:"memory,omitempty"`
Storage []ReanimatorStorage `json:"storage,omitempty"`
PCIeDevices []ReanimatorPCIe `json:"pcie_devices,omitempty"`
PowerSupplies []ReanimatorPSU `json:"power_supplies,omitempty"`
Sensors *ReanimatorSensors `json:"sensors,omitempty"`
EventLogs []ReanimatorEventLog `json:"event_logs,omitempty"`
PlatformConfig map[string]any `json:"platform_config,omitempty"`
}
// ReanimatorBoard represents motherboard/server information
@@ -101,17 +102,20 @@ type ReanimatorMemory struct {
// ReanimatorStorage represents a storage device
type ReanimatorStorage struct {
Slot string `json:"slot"`
Type string `json:"type,omitempty"`
Model string `json:"model"`
SizeGB int `json:"size_gb,omitempty"`
SerialNumber string `json:"serial_number"`
Manufacturer string `json:"manufacturer,omitempty"`
Firmware string `json:"firmware,omitempty"`
Interface string `json:"interface,omitempty"`
Present *bool `json:"present,omitempty"`
TemperatureC float64 `json:"temperature_c,omitempty"`
PowerOnHours int64 `json:"power_on_hours,omitempty"`
Slot string `json:"slot"`
Type string `json:"type,omitempty"`
Model string `json:"model"`
SizeGB int `json:"size_gb,omitempty"`
SerialNumber string `json:"serial_number"`
Manufacturer string `json:"manufacturer,omitempty"`
Firmware string `json:"firmware,omitempty"`
Interface string `json:"interface,omitempty"`
Present *bool `json:"present,omitempty"`
LogicalBlockSizeBytes int64 `json:"logical_block_size_bytes,omitempty"`
PhysicalBlockSizeBytes int64 `json:"physical_block_size_bytes,omitempty"`
MetadataBytesPerBlock int64 `json:"metadata_bytes_per_block,omitempty"`
TemperatureC float64 `json:"temperature_c,omitempty"`
PowerOnHours int64 `json:"power_on_hours,omitempty"`
PowerCycles int64 `json:"power_cycles,omitempty"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns,omitempty"`
MediaErrors int64 `json:"media_errors,omitempty"`

View File

@@ -245,6 +245,9 @@ type Storage struct {
Location string `json:"location,omitempty"` // Front/Rear
BackplaneID int `json:"backplane_id,omitempty"`
RemainingEndurancePct *int `json:"remaining_endurance_pct,omitempty"` // 0-100 %; nil = not reported
LogicalBlockSizeBytes int64 `json:"logical_block_size_bytes,omitempty"`
PhysicalBlockSizeBytes int64 `json:"physical_block_size_bytes,omitempty"`
MetadataBytesPerBlock int64 `json:"metadata_bytes_per_block,omitempty"`
Status string `json:"status,omitempty"`
Details map[string]any `json:"details,omitempty"`
@@ -278,6 +281,8 @@ type PCIeDevice struct {
BDF string `json:"bdf"`
DeviceClass string `json:"device_class"`
Manufacturer string `json:"manufacturer,omitempty"`
Model string `json:"model,omitempty"`
Firmware string `json:"firmware,omitempty"`
LinkWidth int `json:"link_width"`
LinkSpeed string `json:"link_speed"`
MaxLinkWidth int `json:"max_link_width"`
@@ -286,8 +291,17 @@ type PCIeDevice struct {
SerialNumber string `json:"serial_number,omitempty"`
MACAddresses []string `json:"mac_addresses,omitempty"`
NUMANode int `json:"numa_node,omitempty"` // 0 = not reported/N/A
Present *bool `json:"present,omitempty"`
IOMMUGroup *int `json:"iommu_group,omitempty"`
Status string `json:"status,omitempty"`
// GPU telemetry fields (populated by bee audit for GPU devices)
TemperatureC *float64 `json:"temperature_c,omitempty"`
PowerW *float64 `json:"power_w,omitempty"`
ECCCorrectedTotal *int64 `json:"ecc_corrected_total,omitempty"`
ECCUncorrectedTotal *int64 `json:"ecc_uncorrected_total,omitempty"`
HWSlowdown *bool `json:"hw_slowdown,omitempty"`
StatusCheckedAt *time.Time `json:"status_checked_at,omitempty"`
StatusChangedAt *time.Time `json:"status_changed_at,omitempty"`
StatusAtCollect *StatusAtCollection `json:"status_at_collection,omitempty"`