Files
bee/audit/internal/collector/pcie.go
Mikhail Chusavitin 05c1fde233 Warn on PCIe link speed degradation and collect lspci -vvv in techdump
- collector/pcie: add applyPCIeLinkSpeedWarning that sets status=Warning
  and ErrorDescription when current link speed is below maximum negotiated
  speed (e.g. Gen1 running on a Gen5 slot)
- collector/pcie: add pcieLinkSpeedRank helper for Gen string comparison
- collector/pcie_filter_test: cover degraded and healthy link speed cases
- platform/techdump: collect lspci -vvv → lspci-vvv.txt for LnkCap/LnkSta

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-12 12:42:17 +03:00

302 lines
7.3 KiB
Go

package collector
import (
	"fmt"
	"log/slog"
	"os"
	"os/exec"
	"strconv"
	"strings"

	"bee/audit/internal/schema"
)
// collectPCIe runs `lspci -vmm -D` and turns its output into the PCIe device
// inventory. When lspci cannot be run or exits with an error, a warning is
// logged and nil is returned.
func collectPCIe() []schema.HardwarePCIeDevice {
	raw, runErr := exec.Command("lspci", "-vmm", "-D").Output()
	if runErr != nil {
		slog.Warn("pcie: lspci failed", "err", runErr)
		return nil
	}

	devices := parseLspci(string(raw))
	slog.Info("pcie: collected", "count", len(devices))
	return devices
}
// parseLspci converts the text printed by `lspci -vmm -D` into device records.
//
// The output is a sequence of records separated by blank lines; every line of
// a record has the form "Key:\tValue". Lines without that separator are
// ignored, and records rejected by shouldIncludePCIeDevice are dropped.
func parseLspci(output string) []schema.HardwarePCIeDevice {
	var result []schema.HardwarePCIeDevice
	for _, record := range strings.Split(output, "\n\n") {
		record = strings.TrimSpace(record)
		if record == "" {
			continue
		}

		// Collect the record's key/value pairs; a repeated key keeps its last value.
		attrs := map[string]string{}
		for _, row := range strings.Split(record, "\n") {
			name, value, found := strings.Cut(row, ":\t")
			if !found {
				continue
			}
			attrs[strings.TrimSpace(name)] = strings.TrimSpace(value)
		}

		if shouldIncludePCIeDevice(attrs["Class"], attrs["Vendor"], attrs["Device"]) {
			result = append(result, parseLspciDevice(attrs))
		}
	}
	return result
}
// shouldIncludePCIeDevice reports whether a device with the given lspci class,
// vendor and device strings belongs in the inventory.
//
// All three inputs are compared case-insensitively after trimming whitespace,
// using substring matches. A device is rejected when:
//   - its class matches one of the chipset/virtual "noise" classes,
//   - it is a VGA/display/3D device whose name matches a BMC video chip pattern,
//   - its vendor is AMD and its name matches an internal AMD function pattern.
//
// A device with an empty class is always kept.
func shouldIncludePCIeDevice(class, vendor, device string) bool {
	fold := func(s string) string {
		return strings.ToLower(strings.TrimSpace(s))
	}
	hasAny := func(text string, needles []string) bool {
		for _, needle := range needles {
			if strings.Contains(text, needle) {
				return true
			}
		}
		return false
	}

	classKey, vendorKey, deviceKey := fold(class), fold(vendor), fold(device)

	// Nothing to filter on without a class, so the device is kept.
	if classKey == "" {
		return true
	}

	// The inventory targets useful replaceable components, not chipset or
	// virtual functions.
	noiseClasses := []string{
		"host bridge",
		"isa bridge",
		"pci bridge",
		"co-processor",
		"performance counter",
		"performance counters",
		"ram memory",
		"system peripheral",
		"communication controller",
		"signal processing controller",
		"usb controller",
		"smbus",
		"audio device",
		"serial bus controller",
		"unassigned class",
		"non-essential instrumentation",
	}
	if hasAny(classKey, noiseClasses) {
		return false
	}

	// BMC/management video adapters (e.g. iBMC, iDRAC, iLO VGA) are firmware
	// video chips rather than real GPUs and would pollute the GPU inventory.
	graphicsClasses := []string{"vga", "display", "3d"}
	bmcVideoChips := []string{
		"management system chip",
		"management controller",
		"ibmc",
		"idrac",
		"ilo vga",
		"aspeed",
		"matrox",
	}
	if hasAny(classKey, graphicsClasses) && hasAny(deviceKey, bmcVideoChips) {
		return false
	}

	// AMD exposes several internal functions as PCI devices; drop those.
	amdVendors := []string{"advanced micro devices", "[amd]"}
	amdInternalFunctions := []string{
		"dummy function",
		"reserved spp",
		"ptdma",
		"cryptographic coprocessor pspcpp",
		"pspcpp",
	}
	if hasAny(vendorKey, amdVendors) && hasAny(deviceKey, amdInternalFunctions) {
		return false
	}

	return true
}
// parseLspciDevice builds a schema.HardwarePCIeDevice from the key/value
// fields of a single lspci -vmm record.
//
// The device starts out present with status OK. When the record has a "Slot"
// (the BDF, e.g. "0000:00:02.0"), that address is stored as both Slot and BDF
// and used to look up vendor/device IDs, NUMA node, link widths and link
// speeds; each of those fields is set only when its lookup succeeds. Class,
// vendor and device names are taken from the lspci fields. The link-speed
// check runs last and may change the status.
func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
	isPresent := true
	health := statusOK
	dev := schema.HardwarePCIeDevice{
		Present: &isPresent,
		Status:  &health,
	}

	if address := fields["Slot"]; address != "" {
		dev.Slot = &address
		dev.BDF = &address

		// Numeric IDs are looked up by address; zero means "not available".
		vid, did := readPCIIDs(address)
		if vid != 0 {
			dev.VendorID = &vid
		}
		if did != 0 {
			dev.DeviceID = &did
		}

		// Prefer the per-device attribute; fall back to lspci's NUMANode field.
		node, haveNode := readPCINumaNode(address)
		if !haveNode {
			node, haveNode = parsePCINumaNode(fields["NUMANode"])
		}
		if haveNode {
			dev.NUMANode = &node
		}

		if curWidth, ok := readPCIIntAttribute(address, "current_link_width"); ok {
			dev.LinkWidth = &curWidth
		}
		if maxWidth, ok := readPCIIntAttribute(address, "max_link_width"); ok {
			dev.MaxLinkWidth = &maxWidth
		}

		// readGen returns the normalized generation for a speed attribute, or
		// nil when the attribute is unreadable or not recognised.
		readGen := func(attribute string) *string {
			rawSpeed, ok := readPCIStringAttribute(address, attribute)
			if !ok {
				return nil
			}
			gen := normalizePCILinkSpeed(rawSpeed)
			if gen == "" {
				return nil
			}
			return &gen
		}
		if gen := readGen("current_link_speed"); gen != nil {
			dev.LinkSpeed = gen
		}
		if gen := readGen("max_link_speed"); gen != nil {
			dev.MaxLinkSpeed = gen
		}
	}

	if rawClass := fields["Class"]; rawClass != "" {
		mapped := mapPCIeDeviceClass(rawClass)
		dev.DeviceClass = &mapped
	}
	if vendor := fields["Vendor"]; vendor != "" {
		dev.Manufacturer = &vendor
	}
	if model := fields["Device"]; model != "" {
		dev.Model = &model
	}
	// SVendor/SDevice are present in lspci output but have no schema field.

	// Flag a link that runs slower than the maximum speed reported for it.
	applyPCIeLinkSpeedWarning(&dev)
	return dev
}
// readPCIIDs looks up the numeric vendor and device IDs of the PCI device at
// the given BDF from its sysfs directory. An ID whose file cannot be read or
// parsed is returned as 0.
func readPCIIDs(bdf string) (vendorID, deviceID int) {
	dir := "/sys/bus/pci/devices/" + bdf
	readID := func(name string) int {
		id, err := readHexFile(dir + "/" + name)
		if err != nil {
			return 0
		}
		return id
	}
	return readID("vendor"), readID("device")
}
// readHexFile reads the file at path and parses its content as a single
// hexadecimal number, with or without a leading "0x" (for example "0x8086\n").
// Surrounding whitespace is ignored.
//
// It returns the parsed value, or 0 and a non-nil error when the file cannot
// be read or its content is not a valid hexadecimal number.
func readHexFile(path string) (int, error) {
	// Read the file directly instead of spawning a `cat` process per call.
	data, err := os.ReadFile(path)
	if err != nil {
		return 0, err
	}

	// Trim whitespace first so the "0x" prefix is found even when the content
	// has leading blanks.
	text := strings.TrimPrefix(strings.TrimSpace(string(data)), "0x")
	n, err := strconv.ParseInt(text, 16, 64)
	if err != nil {
		return 0, fmt.Errorf("parsing %s as hex: %w", path, err)
	}
	return int(n), nil
}
// readPCINumaNode returns the value of the "numa_node" attribute of the PCI
// device at bdf. The boolean is false when the attribute could not be read as
// a non-negative integer.
func readPCINumaNode(bdf string) (int, bool) {
	if node, ok := readPCIIntAttribute(bdf, "numa_node"); ok && node >= 0 {
		return node, true
	}
	return 0, false
}
// parsePCINumaNode parses a NUMA node number from raw, ignoring surrounding
// whitespace. It returns (0, false) for empty, non-numeric or negative input.
func parsePCINumaNode(raw string) (int, bool) {
	// strconv.Atoi rejects the empty string, so no separate emptiness check is
	// needed.
	node, err := strconv.Atoi(strings.TrimSpace(raw))
	if err == nil && node >= 0 {
		return node, true
	}
	return 0, false
}
// readPCIIntAttribute reads /sys/bus/pci/devices/<bdf>/<attribute> and parses
// its content as a decimal integer, ignoring surrounding whitespace.
//
// It returns (0, false) when the file cannot be read, is not an integer, or
// holds a negative value.
func readPCIIntAttribute(bdf, attribute string) (int, bool) {
	// Read the file directly instead of spawning a `cat` process per attribute.
	data, err := os.ReadFile("/sys/bus/pci/devices/" + bdf + "/" + attribute)
	if err != nil {
		return 0, false
	}
	value, err := strconv.Atoi(strings.TrimSpace(string(data)))
	if err != nil || value < 0 {
		return 0, false
	}
	return value, true
}
// readPCIStringAttribute reads /sys/bus/pci/devices/<bdf>/<attribute> and
// returns its content with surrounding whitespace removed.
//
// It returns ("", false) when the file cannot be read or is empty after
// trimming.
func readPCIStringAttribute(bdf, attribute string) (string, bool) {
	// Read the file directly instead of spawning a `cat` process per attribute.
	data, err := os.ReadFile("/sys/bus/pci/devices/" + bdf + "/" + attribute)
	if err != nil {
		return "", false
	}
	value := strings.TrimSpace(string(data))
	if value == "" {
		return "", false
	}
	return value, true
}
// applyPCIeLinkSpeedWarning sets the device status to Warning and fills
// ErrorDescription when the current PCIe link speed (LinkSpeed) ranks below
// the maximum link speed recorded for the device (MaxLinkSpeed).
//
// The device is left untouched when dev is nil, when either speed is missing,
// or when either speed is not a recognised "GenN" string: an unknown speed
// ranks 0 and must not be mistaken for a degraded link.
//
// NOTE(review): some devices may lower their link speed while idle to save
// power — confirm that this does not produce spurious warnings in practice.
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
	if dev == nil || dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
		return
	}

	current := pcieLinkSpeedRank(*dev.LinkSpeed)
	maximum := pcieLinkSpeedRank(*dev.MaxLinkSpeed)
	// Rank 0 means "unrecognised"; comparing it would flag a healthy link.
	if current == 0 || maximum == 0 {
		return
	}
	if current >= maximum {
		return
	}

	warn := statusWarning
	dev.Status = &warn
	desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
	dev.ErrorDescription = &desc
}
// pcieLinkSpeedRank maps a normalized generation string to its number
// ("Gen1" → 1 … "Gen6" → 6). The match is exact and case-sensitive; any other
// value yields 0, which callers should treat as "unknown".
func pcieLinkSpeedRank(gen string) int {
	generations := [...]string{"Gen1", "Gen2", "Gen3", "Gen4", "Gen5", "Gen6"}
	for i, name := range generations {
		if gen == name {
			return i + 1
		}
	}
	return 0
}
// normalizePCILinkSpeed converts a raw PCIe link speed string into a
// generation label:
//
//	2.5 GT/s → "Gen1"    16 GT/s → "Gen4"
//	5 GT/s   → "Gen2"    32 GT/s → "Gen5"
//	8 GT/s   → "Gen3"    64 GT/s → "Gen6"
//
// Matching is case-insensitive. Both the decimal spelling ("8.0 GT/s PCIe")
// and the integer spelling ("8 GT/s", "8GT/s") are accepted; the integer
// spelling is only recognised when it directly precedes "GT/s". Anything else
// (e.g. "Unknown") yields "".
func normalizePCILinkSpeed(raw string) string {
	raw = strings.TrimSpace(strings.ToLower(raw))

	// Decimal spelling: match on the number anywhere in the string.
	switch {
	case strings.Contains(raw, "2.5"):
		return "Gen1"
	case strings.Contains(raw, "5.0"):
		return "Gen2"
	case strings.Contains(raw, "8.0"):
		return "Gen3"
	case strings.Contains(raw, "16.0"):
		return "Gen4"
	case strings.Contains(raw, "32.0"):
		return "Gen5"
	case strings.Contains(raw, "64.0"):
		return "Gen6"
	}

	// Integer spelling: take the token immediately before "gt/s" so that
	// unrelated digits elsewhere in the string cannot match.
	before, _, found := strings.Cut(raw, "gt/s")
	if !found {
		return ""
	}
	tokens := strings.Fields(before)
	if len(tokens) == 0 {
		return ""
	}
	switch tokens[len(tokens)-1] {
	case "5":
		return "Gen2"
	case "8":
		return "Gen3"
	case "16":
		return "Gen4"
	case "32":
		return "Gen5"
	case "64":
		return "Gen6"
	default:
		return ""
	}
}