package collector

import (
	"encoding/csv"
	"fmt"
	"log/slog"
	"os/exec"
	"strconv"
	"strings"

	"bee/audit/internal/schema"
)

// nvidiaVendorID is the PCI vendor ID used to recognise NVIDIA devices.
const nvidiaVendorID = 0x10de

// nvidiaGPUInfo holds the per-GPU values parsed from one nvidia-smi CSV row.
// Pointer fields are nil when nvidia-smi reported a placeholder or a value
// that could not be parsed.
type nvidiaGPUInfo struct {
	BDF            string
	Serial         string
	VBIOS          string
	TemperatureC   *float64
	PowerW         *float64
	ECCUncorrected *int64
	ECCCorrected   *int64
	HWSlowdown     *bool
}

// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
// If the driver/tool is unavailable, NVIDIA devices get Unknown status.
// The input slice is modified in place and returned.
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
	if !hasNVIDIADevices(devs) {
		return devs
	}
	gpuByBDF, err := queryNVIDIAGPUs()
	if err != nil {
		slog.Info("nvidia: enrichment skipped", "err", err)
		return enrichPCIeWithNVIDIAData(devs, nil, false)
	}
	return enrichPCIeWithNVIDIAData(devs, gpuByBDF, true)
}

// hasNVIDIADevices reports whether at least one device in devs is recognised
// as an NVIDIA device by isNVIDIADevice.
func hasNVIDIADevices(devs []schema.HardwarePCIeDevice) bool {
	for _, dev := range devs {
		if isNVIDIADevice(dev) {
			return true
		}
	}
	return false
}

// enrichPCIeWithNVIDIAData applies the parsed nvidia-smi data to every NVIDIA
// device in devs, matching on the normalized BDF. Devices that cannot be
// matched, or all NVIDIA devices when driverLoaded is false, receive the
// fallback (Unknown) status. Non-NVIDIA devices are left untouched.
// The slice is modified in place and returned.
func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[string]nvidiaGPUInfo, driverLoaded bool) []schema.HardwarePCIeDevice {
	enriched := 0
	for i := range devs {
		if !isNVIDIADevice(devs[i]) {
			continue
		}
		if !driverLoaded {
			setPCIeFallback(&devs[i])
			continue
		}

		bdf := ""
		if devs[i].BDF != nil {
			bdf = normalizePCIeBDF(*devs[i].BDF)
		}
		info, ok := gpuByBDF[bdf]
		if !ok {
			setPCIeFallback(&devs[i])
			continue
		}

		// Placeholders such as "[N/A]" must not be recorded as a real serial
		// number or firmware version, so they are filtered here as well as in
		// the parser (the map may have been built by another caller).
		if v := cleanNVIDIASMIString(info.Serial); v != "" {
			devs[i].SerialNumber = &v
		}
		if v := cleanNVIDIASMIString(info.VBIOS); v != "" {
			devs[i].Firmware = &v
		}

		status := statusOK
		if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
			status = statusWarning
			devs[i].ErrorDescription = stringPtr("GPU reports uncorrected ECC errors")
		}
		devs[i].Status = &status
		injectNVIDIATelemetry(&devs[i], info)
		enriched++
	}
	if driverLoaded {
		slog.Info("nvidia: enriched", "count", enriched)
	}
	return devs
}

// queryNVIDIAGPUs runs nvidia-smi in CSV query mode and returns the parsed
// per-GPU information keyed by normalized BDF. An error is returned when the
// command fails or its output cannot be parsed.
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
	out, err := exec.Command(
		"nvidia-smi",
		"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		return nil, fmt.Errorf("run nvidia-smi: %w", err)
	}
	return parseNVIDIASMIQuery(string(out))
}

// parseNVIDIASMIQuery parses the CSV text produced by the query issued in
// queryNVIDIAGPUs. Each record must have at least 9 columns, in the order
// requested by that query; a shorter record is an error. Records whose bus ID
// normalizes to an empty string are skipped. The result is keyed by
// normalized BDF.
func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
	r := csv.NewReader(strings.NewReader(raw))
	r.TrimLeadingSpace = true
	// Accept a variable number of fields so that the column count check below
	// produces the error message instead of the csv package.
	r.FieldsPerRecord = -1
	records, err := r.ReadAll()
	if err != nil {
		return nil, fmt.Errorf("parse nvidia-smi output: %w", err)
	}

	result := make(map[string]nvidiaGPUInfo, len(records))
	for _, rec := range records {
		if len(rec) == 0 {
			continue
		}
		if len(rec) < 9 {
			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
		}
		bdf := normalizePCIeBDF(rec[1])
		if bdf == "" {
			continue
		}
		info := nvidiaGPUInfo{
			BDF:            bdf,
			Serial:         cleanNVIDIASMIString(rec[2]),
			VBIOS:          cleanNVIDIASMIString(rec[3]),
			TemperatureC:   parseMaybeFloat(rec[4]),
			PowerW:         parseMaybeFloat(rec[5]),
			ECCUncorrected: parseMaybeInt64(rec[6]),
			ECCCorrected:   parseMaybeInt64(rec[7]),
			HWSlowdown:     parseMaybeBool(rec[8]),
		}
		result[bdf] = info
	}
	return result, nil
}

// isNVIDIASMIPlaceholder reports whether v, after trimming and lower-casing,
// is empty or one of the "value not available" markers handled by this file:
// "n/a", "[n/a]", "not supported" or "[not supported]".
func isNVIDIASMIPlaceholder(v string) bool {
	switch strings.ToLower(strings.TrimSpace(v)) {
	case "", "n/a", "[n/a]", "not supported", "[not supported]":
		return true
	default:
		return false
	}
}

// cleanNVIDIASMIString trims v and returns "" when it is a placeholder
// according to isNVIDIASMIPlaceholder; otherwise the trimmed value.
func cleanNVIDIASMIString(v string) string {
	v = strings.TrimSpace(v)
	if isNVIDIASMIPlaceholder(v) {
		return ""
	}
	return v
}

// parseMaybeFloat parses v as a float64. It returns nil for placeholder
// values and for anything strconv.ParseFloat rejects.
func parseMaybeFloat(v string) *float64 {
	v = strings.TrimSpace(v)
	if isNVIDIASMIPlaceholder(v) {
		return nil
	}
	n, err := strconv.ParseFloat(v, 64)
	if err != nil {
		return nil
	}
	return &n
}

// parseMaybeInt64 parses v as a base-10 int64. It returns nil for placeholder
// values and for anything strconv.ParseInt rejects.
func parseMaybeInt64(v string) *int64 {
	v = strings.TrimSpace(v)
	if isNVIDIASMIPlaceholder(v) {
		return nil
	}
	n, err := strconv.ParseInt(v, 10, 64)
	if err != nil {
		return nil
	}
	return &n
}

// parseMaybeBool maps the textual states "active"/"enabled"/"true"/"1" to
// true and "not active"/"disabled"/"false"/"0" to false, case-insensitively.
// Any other value yields nil.
func parseMaybeBool(v string) *bool {
	v = strings.TrimSpace(strings.ToLower(v))
	switch v {
	case "active", "enabled", "true", "1":
		b := true
		return &b
	case "not active", "disabled", "false", "0":
		b := false
		return &b
	default:
		return nil
	}
}

// normalizePCIeBDF lower-cases and trims bdf and brings it to the form
// "dddd:bb:dd.f" where possible:
//   - three colon-separated parts: the domain is cut to its last four
//     characters when longer (e.g. "00000000:01:00.0" -> "0000:01:00.0");
//   - two parts: the domain "0000" is prepended;
//   - anything else is returned as is (lower-cased and trimmed).
//
// An empty or whitespace-only input yields "".
func normalizePCIeBDF(bdf string) string {
	bdf = strings.TrimSpace(strings.ToLower(bdf))
	if bdf == "" {
		return ""
	}
	parts := strings.Split(bdf, ":")
	switch len(parts) {
	case 3:
		domain := parts[0]
		if len(domain) > 4 {
			domain = domain[len(domain)-4:]
		}
		return domain + ":" + parts[1] + ":" + parts[2]
	case 2:
		return "0000:" + parts[0] + ":" + parts[1]
	default:
		return bdf
	}
}

// isNVIDIADevice reports whether dev is an NVIDIA device, judged either by
// its vendor ID or by a manufacturer string containing "nvidia"
// (case-insensitive).
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
	if dev.VendorID != nil && *dev.VendorID == nvidiaVendorID {
		return true
	}
	if dev.Manufacturer != nil && strings.Contains(strings.ToLower(*dev.Manufacturer), "nvidia") {
		return true
	}
	return false
}

// setPCIeFallback marks dev with the Unknown status; used when no nvidia-smi
// data is available for the device.
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
	status := statusUnknown
	dev.Status = &status
}

// injectNVIDIATelemetry copies every non-nil telemetry value from info onto
// dev. Fields that are nil in info leave the corresponding dev field as is.
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
	if info.TemperatureC != nil {
		dev.TemperatureC = info.TemperatureC
	}
	if info.PowerW != nil {
		dev.PowerW = info.PowerW
	}
	if info.ECCUncorrected != nil {
		dev.ECCUncorrectedTotal = info.ECCUncorrected
	}
	if info.ECCCorrected != nil {
		dev.ECCCorrectedTotal = info.ECCCorrected
	}
	if info.HWSlowdown != nil {
		dev.HWSlowdown = info.HWSlowdown
	}
}