235 lines
5.3 KiB
Go
235 lines
5.3 KiB
Go
package collector
|
|
|
|
import (
|
|
"bee/audit/internal/schema"
|
|
"encoding/csv"
|
|
"fmt"
|
|
"log/slog"
|
|
"os/exec"
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// nvidiaVendorID is the PCI vendor ID that isNVIDIADevice compares a device's
// VendorID against to recognise NVIDIA hardware.
const nvidiaVendorID = 0x10de
|
|
|
|
// nvidiaGPUInfo holds the per-GPU values parsed from one row of nvidia-smi CSV
// output. Pointer fields are nil when the corresponding column did not contain
// a parseable value.
type nvidiaGPUInfo struct {
	BDF    string // normalized PCI address from "pci.bus_id"; used as the lookup key
	Serial string // "serial" column, whitespace-trimmed
	VBIOS  string // "vbios_version" column, whitespace-trimmed

	TemperatureC   *float64 // "temperature.gpu" column
	PowerW         *float64 // "power.draw" column
	ECCUncorrected *int64   // "ecc.errors.uncorrected.aggregate.total" column
	ECCCorrected   *int64   // "ecc.errors.corrected.aggregate.total" column
	HWSlowdown     *bool    // "clocks_throttle_reasons.hw_slowdown" column
}
|
|
|
|
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
|
// If the driver/tool is unavailable, NVIDIA devices get Unknown status.
|
|
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
|
if !hasNVIDIADevices(devs) {
|
|
return devs
|
|
}
|
|
gpuByBDF, err := queryNVIDIAGPUs()
|
|
if err != nil {
|
|
slog.Info("nvidia: enrichment skipped", "err", err)
|
|
return enrichPCIeWithNVIDIAData(devs, nil, false)
|
|
}
|
|
return enrichPCIeWithNVIDIAData(devs, gpuByBDF, true)
|
|
}
|
|
|
|
func hasNVIDIADevices(devs []schema.HardwarePCIeDevice) bool {
|
|
for _, dev := range devs {
|
|
if isNVIDIADevice(dev) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[string]nvidiaGPUInfo, driverLoaded bool) []schema.HardwarePCIeDevice {
|
|
enriched := 0
|
|
for i := range devs {
|
|
if !isNVIDIADevice(devs[i]) {
|
|
continue
|
|
}
|
|
|
|
if !driverLoaded {
|
|
setPCIeFallback(&devs[i])
|
|
continue
|
|
}
|
|
|
|
bdf := ""
|
|
if devs[i].BDF != nil {
|
|
bdf = normalizePCIeBDF(*devs[i].BDF)
|
|
}
|
|
info, ok := gpuByBDF[bdf]
|
|
if !ok {
|
|
setPCIeFallback(&devs[i])
|
|
continue
|
|
}
|
|
|
|
if v := strings.TrimSpace(info.Serial); v != "" {
|
|
devs[i].SerialNumber = &v
|
|
}
|
|
if v := strings.TrimSpace(info.VBIOS); v != "" {
|
|
devs[i].Firmware = &v
|
|
}
|
|
|
|
status := statusOK
|
|
if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
|
|
status = statusWarning
|
|
devs[i].ErrorDescription = stringPtr("GPU reports uncorrected ECC errors")
|
|
}
|
|
devs[i].Status = &status
|
|
injectNVIDIATelemetry(&devs[i], info)
|
|
enriched++
|
|
}
|
|
|
|
if driverLoaded {
|
|
slog.Info("nvidia: enriched", "count", enriched)
|
|
}
|
|
return devs
|
|
}
|
|
|
|
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
|
out, err := exec.Command(
|
|
"nvidia-smi",
|
|
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
|
"--format=csv,noheader,nounits",
|
|
).Output()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return parseNVIDIASMIQuery(string(out))
|
|
}
|
|
|
|
func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|
r := csv.NewReader(strings.NewReader(raw))
|
|
r.TrimLeadingSpace = true
|
|
r.FieldsPerRecord = -1
|
|
|
|
records, err := r.ReadAll()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
result := make(map[string]nvidiaGPUInfo)
|
|
for _, rec := range records {
|
|
if len(rec) == 0 {
|
|
continue
|
|
}
|
|
if len(rec) < 9 {
|
|
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
|
}
|
|
|
|
bdf := normalizePCIeBDF(rec[1])
|
|
if bdf == "" {
|
|
continue
|
|
}
|
|
|
|
info := nvidiaGPUInfo{
|
|
BDF: bdf,
|
|
Serial: strings.TrimSpace(rec[2]),
|
|
VBIOS: strings.TrimSpace(rec[3]),
|
|
TemperatureC: parseMaybeFloat(rec[4]),
|
|
PowerW: parseMaybeFloat(rec[5]),
|
|
ECCUncorrected: parseMaybeInt64(rec[6]),
|
|
ECCCorrected: parseMaybeInt64(rec[7]),
|
|
HWSlowdown: parseMaybeBool(rec[8]),
|
|
}
|
|
result[bdf] = info
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// parseMaybeFloat converts a textual field to a float64.
//
// It returns nil when the whitespace-trimmed text is empty, equals one of the
// placeholders "n/a", "not supported" or "[not supported]" (compared
// case-insensitively), or is rejected by strconv.ParseFloat.
func parseMaybeFloat(v string) *float64 {
	text := strings.TrimSpace(v)
	if text == "" {
		return nil
	}
	for _, placeholder := range [...]string{"n/a", "not supported", "[not supported]"} {
		if strings.EqualFold(text, placeholder) {
			return nil
		}
	}

	parsed, err := strconv.ParseFloat(text, 64)
	if err != nil {
		return nil
	}
	return &parsed
}
|
|
|
|
// parseMaybeInt64 converts a textual field to a base-10 int64.
//
// It returns nil when the whitespace-trimmed text is empty, equals one of the
// placeholders "n/a", "not supported" or "[not supported]" (compared
// case-insensitively), or is rejected by strconv.ParseInt (non-numeric text,
// fractional values, out-of-range values).
func parseMaybeInt64(v string) *int64 {
	text := strings.TrimSpace(v)
	switch {
	case text == "",
		strings.EqualFold(text, "n/a"),
		strings.EqualFold(text, "not supported"),
		strings.EqualFold(text, "[not supported]"):
		return nil
	}

	if parsed, err := strconv.ParseInt(text, 10, 64); err == nil {
		return &parsed
	}
	return nil
}
|
|
|
|
// parseMaybeBool converts a textual on/off field to a bool.
//
// After lower-casing and trimming whitespace, "active", "enabled", "true" and
// "1" yield true; "not active", "disabled", "false" and "0" yield false. Any
// other text yields nil.
func parseMaybeBool(v string) *bool {
	var result bool
	switch strings.TrimSpace(strings.ToLower(v)) {
	case "active", "enabled", "true", "1":
		result = true
	case "not active", "disabled", "false", "0":
		result = false
	default:
		return nil
	}
	return &result
}
|
|
|
|
// normalizePCIeBDF brings a PCI address into a lower-case
// "dddd:bb:dd.f"-style form so addresses from different sources can be
// compared as map keys.
//
// The input is lower-cased and trimmed, then split on ':':
//   - three segments: a domain longer than four characters is cut down to its
//     last four (e.g. "00000000:3B:00.0" becomes "0000:3b:00.0"); shorter
//     domains are kept as they are;
//   - two segments: the domain "0000" is prepended;
//   - anything else (including the empty string) is returned unchanged.
func normalizePCIeBDF(bdf string) string {
	addr := strings.TrimSpace(strings.ToLower(bdf))
	if addr == "" {
		return ""
	}

	segs := strings.Split(addr, ":")
	switch len(segs) {
	case 3:
		domain := segs[0]
		if extra := len(domain) - 4; extra > 0 {
			domain = domain[extra:]
		}
		return strings.Join([]string{domain, segs[1], segs[2]}, ":")
	case 2:
		// addr is exactly "bus:dev.fn" here, so only the domain is missing.
		return "0000:" + addr
	default:
		return addr
	}
}
|
|
|
|
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
|
if dev.VendorID != nil && *dev.VendorID == nvidiaVendorID {
|
|
return true
|
|
}
|
|
if dev.Manufacturer != nil && strings.Contains(strings.ToLower(*dev.Manufacturer), "nvidia") {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
|
status := statusUnknown
|
|
dev.Status = &status
|
|
}
|
|
|
|
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
|
if info.TemperatureC != nil {
|
|
dev.TemperatureC = info.TemperatureC
|
|
}
|
|
if info.PowerW != nil {
|
|
dev.PowerW = info.PowerW
|
|
}
|
|
if info.ECCUncorrected != nil {
|
|
dev.ECCUncorrectedTotal = info.ECCUncorrected
|
|
}
|
|
if info.ECCCorrected != nil {
|
|
dev.ECCCorrectedTotal = info.ECCCorrected
|
|
}
|
|
if info.HWSlowdown != nil {
|
|
dev.HWSlowdown = info.HWSlowdown
|
|
}
|
|
}
|