- storage: add jsonInt64 dual-format unmarshaler to handle lsblk output change in util-linux 2.38 (LOG-SEC/PHY-SEC now emitted as JSON integers, not quoted strings); fixes SATA disks invisible on Debian 12 - pcie: detect NVLink bridge mezzanine CX-7 cards (Mellanox x2, no host net ifaces, DeviceName contains "NVLINK" in lspci -v) and mark them with device_class="NVLinkBridge"; escalate PCIe link speed downgrade to Critical for these cards (Gen3 on a fixed internal connector = hardware fault, not a transient warning) - pcie: cross-reference nvidia-smi topo to capture NVLink bond counts and active status for all NVLink bridge cards - packages: add infiniband-diags to ISO; provides ibstat required by nvidia-fabricmanager-start.sh to enumerate IB devices before FM launch (absence causes CUDA_ERROR_SYSTEM_NOT_READY) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
71 lines
2.5 KiB
Go
71 lines
2.5 KiB
Go
// Package collector runs all hardware collectors and merges results
|
||
// into a single HardwareSnapshot. Each sub-collector is independent:
|
||
// a failure in one does not abort the others.
|
||
package collector
|
||
|
||
import (
|
||
"bee/audit/internal/runtimeenv"
|
||
"bee/audit/internal/schema"
|
||
"log/slog"
|
||
"os"
|
||
"time"
|
||
)
|
||
|
||
// Run executes all collectors and returns the combined snapshot.
|
||
// Partial failures are logged as warnings; collection always completes.
|
||
func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||
start := time.Now()
|
||
collectedAt := time.Now().UTC().Format(time.RFC3339)
|
||
slog.Info("audit started")
|
||
|
||
snap := schema.HardwareSnapshot{}
|
||
|
||
board, biosFW := collectBoard()
|
||
snap.Board = board
|
||
snap.Firmware = append(snap.Firmware, biosFW...)
|
||
snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...)
|
||
|
||
snap.CPUs = collectCPUs()
|
||
|
||
snap.Memory = collectMemory()
|
||
sensorDoc, err := readSensorsJSONDoc()
|
||
if err != nil {
|
||
slog.Info("sensors: unavailable for enrichment", "err", err)
|
||
}
|
||
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
|
||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||
bestEffortRescanHotplugStorage()
|
||
snap.Storage = collectStorage()
|
||
snap.PCIeDevices = collectPCIe()
|
||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||
snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
|
||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
||
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
||
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
|
||
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
|
||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||
finalizeSnapshot(&snap, collectedAt)
|
||
|
||
// remaining collectors added in steps 1.8 – 1.10
|
||
|
||
slog.Info("audit completed", "duration", time.Since(start).Round(time.Millisecond))
|
||
|
||
sourceType := "manual"
|
||
var targetHost *string
|
||
if hostname, err := os.Hostname(); err == nil && hostname != "" {
|
||
targetHost = &hostname
|
||
}
|
||
return schema.HardwareIngestRequest{
|
||
SourceType: &sourceType,
|
||
TargetHost: targetHost,
|
||
CollectedAt: collectedAt,
|
||
Hardware: snap,
|
||
}
|
||
}
|