Files
bee/audit/internal/collector/nvidia.go
2026-03-15 23:03:38 +03:00

235 lines
5.3 KiB
Go

package collector
import (
"bee/audit/internal/schema"
"encoding/csv"
"fmt"
"log/slog"
"os/exec"
"strconv"
"strings"
)
// nvidiaVendorID is the PCI vendor ID that isNVIDIADevice compares a device's
// VendorID against to recognize NVIDIA hardware.
const nvidiaVendorID = 0x10de
// nvidiaGPUInfo holds the values parsed from one row of nvidia-smi query
// output for a single GPU. The pointer fields are nil when the corresponding
// column could not be parsed into a value.
type nvidiaGPUInfo struct {
// BDF is the GPU's PCI address as returned by normalizePCIeBDF for the
// pci.bus_id column; the same string is the GPU's key in the lookup map.
BDF string
// Serial is taken from the serial column.
Serial string
// VBIOS is taken from the vbios_version column.
VBIOS string
// TemperatureC is parsed from the temperature.gpu column.
TemperatureC *float64
// PowerW is parsed from the power.draw column.
PowerW *float64
// ECCUncorrected is parsed from ecc.errors.uncorrected.aggregate.total.
ECCUncorrected *int64
// ECCCorrected is parsed from ecc.errors.corrected.aggregate.total.
ECCCorrected *int64
// HWSlowdown is parsed from clocks_throttle_reasons.hw_slowdown.
HWSlowdown *bool
}
// enrichPCIeWithNVIDIA augments the NVIDIA entries of devs with the data that
// nvidia-smi reports for them and returns the same slice.
//
// When devs holds no NVIDIA device the slice is returned untouched and
// nvidia-smi is never invoked. When the nvidia-smi query fails, the failure is
// logged and every NVIDIA device receives the fallback (unknown) status.
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
	if !hasNVIDIADevices(devs) {
		return devs
	}

	var gpus map[string]nvidiaGPUInfo
	queried, queryErr := queryNVIDIAGPUs()
	driverLoaded := queryErr == nil
	if driverLoaded {
		gpus = queried
	} else {
		// Leave gpus nil: without a working query there is nothing to match.
		slog.Info("nvidia: enrichment skipped", "err", queryErr)
	}
	return enrichPCIeWithNVIDIAData(devs, gpus, driverLoaded)
}
// hasNVIDIADevices reports whether at least one entry of devs is recognized
// by isNVIDIADevice. An empty or nil slice yields false.
func hasNVIDIADevices(devs []schema.HardwarePCIeDevice) bool {
	found := false
	// Stop scanning as soon as the first NVIDIA device is seen.
	for i := 0; i < len(devs) && !found; i++ {
		found = isNVIDIADevice(devs[i])
	}
	return found
}
// enrichPCIeWithNVIDIAData applies already-collected nvidia-smi data to the
// NVIDIA entries of devs, modifying them in place, and returns devs.
//
// Non-NVIDIA devices are left untouched. An NVIDIA device receives the
// fallback status when driverLoaded is false or when its normalized BDF has no
// entry in gpuByBDF (a device without a BDF is looked up under the empty key).
// A matched device gets its serial number and firmware set when the reported
// values are non-blank, a status of OK (or Warning plus an error description
// when the uncorrected ECC count is above zero), and the available telemetry.
// The number of matched devices is logged only when driverLoaded is true.
func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[string]nvidiaGPUInfo, driverLoaded bool) []schema.HardwarePCIeDevice {
	matched := 0
	for idx := range devs {
		dev := &devs[idx]
		if !isNVIDIADevice(*dev) {
			continue
		}

		// Only consult the map when the query succeeded; otherwise found
		// stays false and the device takes the fallback path below.
		var (
			gpu   nvidiaGPUInfo
			found bool
		)
		if driverLoaded {
			key := ""
			if dev.BDF != nil {
				key = normalizePCIeBDF(*dev.BDF)
			}
			gpu, found = gpuByBDF[key]
		}
		if !found {
			setPCIeFallback(dev)
			continue
		}

		serial := strings.TrimSpace(gpu.Serial)
		if serial != "" {
			dev.SerialNumber = &serial
		}
		firmware := strings.TrimSpace(gpu.VBIOS)
		if firmware != "" {
			dev.Firmware = &firmware
		}

		health := statusOK
		if uncorrected := gpu.ECCUncorrected; uncorrected != nil && *uncorrected > 0 {
			health = statusWarning
			dev.ErrorDescription = stringPtr("GPU reports uncorrected ECC errors")
		}
		dev.Status = &health

		injectNVIDIATelemetry(dev, gpu)
		matched++
	}

	if driverLoaded {
		slog.Info("nvidia: enriched", "count", matched)
	}
	return devs
}
// queryNVIDIAGPUs runs nvidia-smi and returns the GPUs it reports, keyed by
// normalized PCI address.
//
// The query asks for, in this order: index, pci.bus_id, serial, vbios_version,
// temperature.gpu, power.draw, the aggregate uncorrected and corrected ECC
// totals, and the hw_slowdown throttle reason, formatted as CSV without header
// or units. parseNVIDIASMIQuery depends on that column order.
//
// An error is returned when nvidia-smi cannot be started, exits with a failure
// status, or produces output that cannot be parsed. Execution errors are
// wrapped with %w, so errors.Is / errors.As still reach the underlying cause,
// and carry whatever nvidia-smi wrote to stderr.
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
	out, err := exec.Command(
		"nvidia-smi",
		"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
		"--format=csv,noheader,nounits",
	).Output()
	if err != nil {
		// Output stores the command's stderr in ExitError.Stderr; a bare
		// "exit status N" is not enough to diagnose a failed query.
		if exitErr, ok := err.(*exec.ExitError); ok {
			if msg := strings.TrimSpace(string(exitErr.Stderr)); msg != "" {
				return nil, fmt.Errorf("running nvidia-smi: %w: %s", err, msg)
			}
		}
		return nil, fmt.Errorf("running nvidia-smi: %w", err)
	}
	return parseNVIDIASMIQuery(string(out))
}
// parseNVIDIASMIQuery parses the header-less CSV produced by the nvidia-smi
// query issued in queryNVIDIAGPUs and returns one nvidiaGPUInfo per GPU,
// keyed by the normalized PCI address from the second column.
//
// Expected columns, by index: 0 index (ignored), 1 pci.bus_id, 2 serial,
// 3 vbios_version, 4 temperature, 5 power draw, 6 uncorrected ECC total,
// 7 corrected ECC total, 8 hardware slowdown flag. Extra trailing columns are
// ignored. Rows whose bus id normalizes to "" are skipped. Numeric and boolean
// columns that cannot be parsed become nil. Serial and VBIOS become "" when
// nvidia-smi printed a placeholder such as "[N/A]" instead of a value, so a
// placeholder is never mistaken for a real serial number or firmware version.
//
// An error is returned when the input is not valid CSV or when any row has
// fewer than nine columns; no partial result is returned in that case.
func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
	const wantColumns = 9

	// text trims a string column and maps nvidia-smi's "value unavailable"
	// markers to the empty string. Unavailable fields are printed in square
	// brackets ("[N/A]", "[Not Supported]", ...); the unbracketed spellings
	// are the same ones the numeric parsers in this file already recognize.
	text := func(field string) string {
		field = strings.TrimSpace(field)
		if strings.EqualFold(field, "n/a") || strings.EqualFold(field, "not supported") {
			return ""
		}
		if strings.HasPrefix(field, "[") && strings.HasSuffix(field, "]") {
			return ""
		}
		return field
	}

	r := csv.NewReader(strings.NewReader(raw))
	r.TrimLeadingSpace = true // nvidia-smi separates columns with ", "
	r.FieldsPerRecord = -1    // the column count is validated per row below
	records, err := r.ReadAll()
	if err != nil {
		return nil, fmt.Errorf("parsing nvidia-smi output: %w", err)
	}

	result := make(map[string]nvidiaGPUInfo, len(records))
	for _, rec := range records {
		if len(rec) == 0 {
			continue
		}
		if len(rec) < wantColumns {
			return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want %d", len(rec), wantColumns)
		}
		bdf := normalizePCIeBDF(rec[1])
		if bdf == "" {
			continue
		}
		result[bdf] = nvidiaGPUInfo{
			BDF:            bdf,
			Serial:         text(rec[2]),
			VBIOS:          text(rec[3]),
			TemperatureC:   parseMaybeFloat(rec[4]),
			PowerW:         parseMaybeFloat(rec[5]),
			ECCUncorrected: parseMaybeInt64(rec[6]),
			ECCCorrected:   parseMaybeInt64(rec[7]),
			HWSlowdown:     parseMaybeBool(rec[8]),
		}
	}
	return result, nil
}
// parseMaybeFloat converts one textual column into a float64.
//
// Surrounding whitespace is ignored. The result is nil when the field is
// empty, is one of the "value unavailable" markers ("n/a", "not supported",
// "[not supported]", compared case-insensitively), or is not accepted by
// strconv.ParseFloat.
func parseMaybeFloat(v string) *float64 {
	field := strings.TrimSpace(v)
	if field == "" {
		return nil
	}
	for _, placeholder := range []string{"n/a", "not supported", "[not supported]"} {
		if strings.EqualFold(field, placeholder) {
			return nil
		}
	}
	if parsed, parseErr := strconv.ParseFloat(field, 64); parseErr == nil {
		return &parsed
	}
	return nil
}
// parseMaybeInt64 converts one textual column into an int64.
//
// Surrounding whitespace is ignored. The result is nil when the field is
// empty, is one of the "value unavailable" markers ("n/a", "not supported",
// "[not supported]", compared case-insensitively), or is not a base-10
// integer that fits in 64 bits.
func parseMaybeInt64(v string) *int64 {
	field := strings.TrimSpace(v)
	switch {
	case field == "",
		strings.EqualFold(field, "n/a"),
		strings.EqualFold(field, "not supported"),
		strings.EqualFold(field, "[not supported]"):
		return nil
	}

	parsed, parseErr := strconv.ParseInt(field, 10, 64)
	if parseErr == nil {
		return &parsed
	}
	return nil
}
// parseMaybeBool converts one textual column into a bool.
//
// The field is lower-cased and trimmed first. "active", "enabled", "true" and
// "1" yield true; "not active", "disabled", "false" and "0" yield false; any
// other text yields nil.
func parseMaybeBool(v string) *bool {
	state := strings.TrimSpace(strings.ToLower(v))

	on := state == "active" || state == "enabled" || state == "true" || state == "1"
	off := state == "not active" || state == "disabled" || state == "false" || state == "0"
	if !on && !off {
		return nil
	}
	// Exactly one of on/off holds here, so on alone carries the answer.
	return &on
}
// normalizePCIeBDF canonicalizes a PCI address so that addresses written in
// different styles compare equal as strings.
//
// The input is lower-cased and trimmed. A three-part address
// (domain:bus:device.function) has its domain rewritten to at least four
// digits: leading zeros are stripped and the remainder is left-padded with
// zeros, so "00000000:01:00.0" and "0:01:00.0" both become "0000:01:00.0".
// Significant domain digits are never discarded, so "00010000:01:00.0" becomes
// "10000:01:00.0" rather than colliding with domain 0000. A two-part address
// (bus:device.function) is given the domain "0000". Blank input yields "";
// any other shape is returned lower-cased and trimmed but otherwise unchanged.
func normalizePCIeBDF(bdf string) string {
	bdf = strings.TrimSpace(strings.ToLower(bdf))
	if bdf == "" {
		return ""
	}

	parts := strings.Split(bdf, ":")
	switch len(parts) {
	case 3:
		// Strip the padding, then pad back to four digits; this keeps any
		// non-zero high digits that a fixed-width cut would silently drop.
		domain := strings.TrimLeft(parts[0], "0")
		if pad := 4 - len(domain); pad > 0 {
			domain = strings.Repeat("0", pad) + domain
		}
		return domain + ":" + parts[1] + ":" + parts[2]
	case 2:
		return "0000:" + parts[0] + ":" + parts[1]
	default:
		return bdf
	}
}
// isNVIDIADevice reports whether dev is an NVIDIA device: either its VendorID
// is set and equals nvidiaVendorID, or its Manufacturer is set and contains
// "nvidia" in any letter case.
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
	vendorMatches := dev.VendorID != nil && *dev.VendorID == nvidiaVendorID
	if vendorMatches {
		return true
	}
	if dev.Manufacturer == nil {
		return false
	}
	maker := strings.ToLower(*dev.Manufacturer)
	return strings.Contains(maker, "nvidia")
}
// setPCIeFallback marks dev with the unknown status. It is used for NVIDIA
// devices for which no nvidia-smi data could be applied. No other field of
// dev is modified.
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
	// Take the address of a fresh local so every device gets its own value.
	unknown := statusUnknown
	dev.Status = &unknown
}
// injectNVIDIATelemetry copies the telemetry readings present in info
// (temperature, power, ECC totals, hardware slowdown) onto dev. A reading that
// is nil in info leaves the matching field of dev as it was.
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
	if temp := info.TemperatureC; temp != nil {
		dev.TemperatureC = temp
	}
	if power := info.PowerW; power != nil {
		dev.PowerW = power
	}
	if uncorrected := info.ECCUncorrected; uncorrected != nil {
		dev.ECCUncorrectedTotal = uncorrected
	}
	if corrected := info.ECCCorrected; corrected != nil {
		dev.ECCCorrectedTotal = corrected
	}
	if slowdown := info.HWSlowdown; slowdown != nil {
		dev.HWSlowdown = slowdown
	}
}