Implement audit enrichments, TUI workflows, and production ISO scaffold
This commit is contained in:
245
audit/internal/collector/nvidia.go
Normal file
245
audit/internal/collector/nvidia.go
Normal file
@@ -0,0 +1,245 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
// If the driver/tool is unavailable, NVIDIA devices get UNKNOWN status and
|
||||
// a stable serial fallback based on board serial + slot.
|
||||
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice {
|
||||
gpuByBDF, err := queryNVIDIAGPUs()
|
||||
if err != nil {
|
||||
slog.Info("nvidia: enrichment skipped", "err", err)
|
||||
return enrichPCIeWithNVIDIAData(devs, nil, boardSerial, false)
|
||||
}
|
||||
return enrichPCIeWithNVIDIAData(devs, gpuByBDF, boardSerial, true)
|
||||
}
|
||||
|
||||
func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[string]nvidiaGPUInfo, boardSerial string, driverLoaded bool) []schema.HardwarePCIeDevice {
|
||||
enriched := 0
|
||||
for i := range devs {
|
||||
if !isNVIDIADevice(devs[i]) {
|
||||
continue
|
||||
}
|
||||
|
||||
if !driverLoaded {
|
||||
setPCIeFallback(&devs[i], boardSerial)
|
||||
continue
|
||||
}
|
||||
|
||||
bdf := ""
|
||||
if devs[i].BDF != nil {
|
||||
bdf = normalizePCIeBDF(*devs[i].BDF)
|
||||
}
|
||||
info, ok := gpuByBDF[bdf]
|
||||
if !ok {
|
||||
setPCIeFallback(&devs[i], boardSerial)
|
||||
continue
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||
devs[i].SerialNumber = &v
|
||||
} else {
|
||||
setPCIeFallbackSerial(&devs[i], boardSerial)
|
||||
}
|
||||
if v := strings.TrimSpace(info.VBIOS); v != "" {
|
||||
devs[i].Firmware = &v
|
||||
}
|
||||
|
||||
status := "OK"
|
||||
if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
|
||||
status = "WARNING"
|
||||
}
|
||||
devs[i].Status = &status
|
||||
injectNVIDIATelemetry(&devs[i], info)
|
||||
enriched++
|
||||
}
|
||||
|
||||
if driverLoaded {
|
||||
slog.Info("nvidia: enriched", "count", enriched)
|
||||
}
|
||||
return devs
|
||||
}
|
||||
|
||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return parseNVIDIASMIQuery(string(out))
|
||||
}
|
||||
|
||||
func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
r := csv.NewReader(strings.NewReader(raw))
|
||||
r.TrimLeadingSpace = true
|
||||
r.FieldsPerRecord = -1
|
||||
|
||||
records, err := r.ReadAll()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result := make(map[string]nvidiaGPUInfo)
|
||||
for _, rec := range records {
|
||||
if len(rec) == 0 {
|
||||
continue
|
||||
}
|
||||
if len(rec) < 9 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
||||
}
|
||||
|
||||
bdf := normalizePCIeBDF(rec[1])
|
||||
if bdf == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
info := nvidiaGPUInfo{
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
}
|
||||
result[bdf] = info
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func parseMaybeFloat(v string) *float64 {
|
||||
v = strings.TrimSpace(v)
|
||||
if v == "" || strings.EqualFold(v, "n/a") || strings.EqualFold(v, "not supported") || strings.EqualFold(v, "[not supported]") {
|
||||
return nil
|
||||
}
|
||||
n, err := strconv.ParseFloat(v, 64)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return &n
|
||||
}
|
||||
|
||||
func parseMaybeInt64(v string) *int64 {
|
||||
v = strings.TrimSpace(v)
|
||||
if v == "" || strings.EqualFold(v, "n/a") || strings.EqualFold(v, "not supported") || strings.EqualFold(v, "[not supported]") {
|
||||
return nil
|
||||
}
|
||||
n, err := strconv.ParseInt(v, 10, 64)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return &n
|
||||
}
|
||||
|
||||
func parseMaybeBool(v string) *bool {
|
||||
v = strings.TrimSpace(strings.ToLower(v))
|
||||
switch v {
|
||||
case "active", "enabled", "true", "1":
|
||||
b := true
|
||||
return &b
|
||||
case "not active", "disabled", "false", "0":
|
||||
b := false
|
||||
return &b
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
func normalizePCIeBDF(bdf string) string {
|
||||
bdf = strings.TrimSpace(strings.ToLower(bdf))
|
||||
if bdf == "" {
|
||||
return ""
|
||||
}
|
||||
parts := strings.Split(bdf, ":")
|
||||
if len(parts) == 3 {
|
||||
domain := parts[0]
|
||||
if len(domain) > 4 {
|
||||
domain = domain[len(domain)-4:]
|
||||
}
|
||||
return domain + ":" + parts[1] + ":" + parts[2]
|
||||
}
|
||||
if len(parts) == 2 {
|
||||
return "0000:" + parts[0] + ":" + parts[1]
|
||||
}
|
||||
return bdf
|
||||
}
|
||||
|
||||
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.VendorID != nil && *dev.VendorID == nvidiaVendorID {
|
||||
return true
|
||||
}
|
||||
if dev.Manufacturer != nil && strings.Contains(strings.ToLower(*dev.Manufacturer), "nvidia") {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
||||
setPCIeFallbackSerial(dev, boardSerial)
|
||||
status := "UNKNOWN"
|
||||
dev.Status = &status
|
||||
}
|
||||
|
||||
func setPCIeFallbackSerial(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
||||
if strings.TrimSpace(boardSerial) == "" || dev.SerialNumber != nil {
|
||||
return
|
||||
}
|
||||
slot := "unknown"
|
||||
if dev.BDF != nil && strings.TrimSpace(*dev.BDF) != "" {
|
||||
slot = strings.TrimSpace(*dev.BDF)
|
||||
} else if dev.Slot != nil && strings.TrimSpace(*dev.Slot) != "" {
|
||||
slot = strings.TrimSpace(*dev.Slot)
|
||||
}
|
||||
fb := fmt.Sprintf("%s-PCIE-%s", boardSerial, slot)
|
||||
dev.SerialNumber = &fb
|
||||
}
|
||||
|
||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if dev.Telemetry == nil {
|
||||
dev.Telemetry = map[string]any{}
|
||||
}
|
||||
if info.TemperatureC != nil {
|
||||
dev.Telemetry["temperature_c"] = *info.TemperatureC
|
||||
}
|
||||
if info.PowerW != nil {
|
||||
dev.Telemetry["power_w"] = *info.PowerW
|
||||
}
|
||||
if info.ECCUncorrected != nil {
|
||||
dev.Telemetry["ecc_uncorrected_total"] = *info.ECCUncorrected
|
||||
}
|
||||
if info.ECCCorrected != nil {
|
||||
dev.Telemetry["ecc_corrected_total"] = *info.ECCCorrected
|
||||
}
|
||||
if info.HWSlowdown != nil {
|
||||
dev.Telemetry["hw_slowdown_active"] = *info.HWSlowdown
|
||||
}
|
||||
if len(dev.Telemetry) == 0 {
|
||||
dev.Telemetry = nil
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user