253 lines
5.8 KiB
Go
253 lines
5.8 KiB
Go
package collector
|
|
|
|
import (
|
|
"encoding/csv"
|
|
"log/slog"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"bee/audit/internal/schema"
|
|
)
|
|
|
|
// Function variables through which this file reaches the process-execution,
// PATH-lookup and globbing primitives it needs to locate and run rocm-smi.
// NOTE(review): presumably kept as variables so tests can substitute fakes —
// confirm against the package's test files.
var (
	// amdSMIExecCommand constructs the command used to invoke rocm-smi.
	amdSMIExecCommand = exec.Command
	// amdSMILookPath resolves an executable name through PATH.
	amdSMILookPath = exec.LookPath
	// amdSMIGlob expands the fallback install-location patterns.
	amdSMIGlob = filepath.Glob
)
|
|
|
|
// amdSMIExecutableGlobs lists, in probing order, the glob patterns searched
// for a rocm-smi binary when it cannot be found through PATH.
var amdSMIExecutableGlobs = []string{
	"/opt/rocm/bin/rocm-smi",   // default ROCm prefix
	"/opt/rocm-*/bin/rocm-smi", // suffixed ROCm prefixes (e.g. versioned installs)
	"/usr/local/bin/rocm-smi",
}
|
|
|
|
// amdGPUInfo holds the per-GPU attributes gathered from rocm-smi for a single
// card. Optional numeric readings are pointers so that "not reported" stays
// distinguishable from zero.
type amdGPUInfo struct {
	// BDF is the normalized PCIe address, taken from "--showbus".
	BDF string
	// Serial is the value reported by "--showserial".
	Serial string
	// Product is the value reported by "--showproductname".
	Product string
	// Firmware is the value reported by "--showvbios".
	Firmware string
	// PowerW is the first number found in the "--showpower" output
	// (presumably watts — confirm against rocm-smi); nil when unavailable.
	PowerW *float64
	// TempC is the first number found in the "--showtemp" output
	// (presumably degrees Celsius — confirm against rocm-smi); nil when unavailable.
	TempC *float64
}
|
|
|
|
func enrichPCIeWithAMD(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
|
if !hasAMDGPUDevices(devs) {
|
|
return devs
|
|
}
|
|
infoByBDF, err := queryAMDGPUs()
|
|
if err != nil {
|
|
slog.Info("amdgpu: enrichment skipped", "err", err)
|
|
return devs
|
|
}
|
|
enriched := 0
|
|
for i := range devs {
|
|
if !isAMDGPUDevice(devs[i]) || devs[i].BDF == nil {
|
|
continue
|
|
}
|
|
info, ok := infoByBDF[normalizePCIeBDF(*devs[i].BDF)]
|
|
if !ok {
|
|
continue
|
|
}
|
|
if strings.TrimSpace(info.Serial) != "" {
|
|
devs[i].SerialNumber = &info.Serial
|
|
}
|
|
if strings.TrimSpace(info.Firmware) != "" {
|
|
devs[i].Firmware = &info.Firmware
|
|
}
|
|
if strings.TrimSpace(info.Product) != "" && devs[i].Model == nil {
|
|
devs[i].Model = &info.Product
|
|
}
|
|
if info.PowerW != nil {
|
|
devs[i].PowerW = info.PowerW
|
|
}
|
|
if info.TempC != nil {
|
|
devs[i].TemperatureC = info.TempC
|
|
}
|
|
enriched++
|
|
}
|
|
if enriched > 0 {
|
|
slog.Info("amdgpu: enriched", "count", enriched)
|
|
}
|
|
return devs
|
|
}
|
|
|
|
func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
|
for _, dev := range devs {
|
|
if isAMDGPUDevice(dev) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
|
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
|
return false
|
|
}
|
|
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
|
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
|
}
|
|
|
|
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
|
busByCard, err := queryAMDField("--showbus")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
infoByCard := map[string]amdGPUInfo{}
|
|
for card, bus := range busByCard {
|
|
bdf := normalizePCIeBDF(bus)
|
|
if bdf == "" {
|
|
continue
|
|
}
|
|
infoByCard[card] = amdGPUInfo{BDF: bdf}
|
|
}
|
|
if len(infoByCard) == 0 {
|
|
return map[string]amdGPUInfo{}, nil
|
|
}
|
|
mergeAMDField(infoByCard, "--showserial", func(info *amdGPUInfo, value string) { info.Serial = value })
|
|
mergeAMDField(infoByCard, "--showproductname", func(info *amdGPUInfo, value string) { info.Product = value })
|
|
mergeAMDField(infoByCard, "--showvbios", func(info *amdGPUInfo, value string) { info.Firmware = value })
|
|
mergeAMDNumericField(infoByCard, "--showpower", func(info *amdGPUInfo, value float64) { info.PowerW = &value })
|
|
mergeAMDNumericField(infoByCard, "--showtemp", func(info *amdGPUInfo, value float64) { info.TempC = &value })
|
|
|
|
result := make(map[string]amdGPUInfo, len(infoByCard))
|
|
for _, info := range infoByCard {
|
|
if info.BDF == "" {
|
|
continue
|
|
}
|
|
result[info.BDF] = info
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
func mergeAMDField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, string)) {
|
|
values, err := queryAMDField(flag)
|
|
if err != nil {
|
|
return
|
|
}
|
|
for card, value := range values {
|
|
info, ok := infoByCard[card]
|
|
if !ok {
|
|
continue
|
|
}
|
|
value = strings.TrimSpace(value)
|
|
if value == "" {
|
|
continue
|
|
}
|
|
apply(&info, value)
|
|
infoByCard[card] = info
|
|
}
|
|
}
|
|
|
|
func mergeAMDNumericField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, float64)) {
|
|
values, err := queryAMDNumericField(flag)
|
|
if err != nil {
|
|
return
|
|
}
|
|
for card, value := range values {
|
|
info, ok := infoByCard[card]
|
|
if !ok {
|
|
continue
|
|
}
|
|
apply(&info, value)
|
|
infoByCard[card] = info
|
|
}
|
|
}
|
|
|
|
func queryAMDField(flag string) (map[string]string, error) {
|
|
cmd, err := resolveAMDSMICmd(flag, "--csv")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
out, err := amdSMIExecCommand(cmd[0], cmd[1:]...).CombinedOutput()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return parseROCmSingleValueCSV(string(out)), nil
|
|
}
|
|
|
|
func queryAMDNumericField(flag string) (map[string]float64, error) {
|
|
values, err := queryAMDField(flag)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
out := map[string]float64{}
|
|
for card, raw := range values {
|
|
if value, ok := firstFloat(raw); ok {
|
|
out[card] = value
|
|
}
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func resolveAMDSMICmd(args ...string) ([]string, error) {
|
|
if path, err := amdSMILookPath("rocm-smi"); err == nil {
|
|
return append([]string{path}, args...), nil
|
|
}
|
|
for _, pattern := range amdSMIExecutableGlobs {
|
|
matches, err := amdSMIGlob(pattern)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
sort.Strings(matches)
|
|
for _, match := range matches {
|
|
return append([]string{match}, args...), nil
|
|
}
|
|
}
|
|
return nil, exec.ErrNotFound
|
|
}
|
|
|
|
func parseROCmSingleValueCSV(raw string) map[string]string {
|
|
rows := map[string]string{}
|
|
reader := csv.NewReader(strings.NewReader(raw))
|
|
reader.FieldsPerRecord = -1
|
|
records, err := reader.ReadAll()
|
|
if err != nil {
|
|
return rows
|
|
}
|
|
for _, rec := range records {
|
|
if len(rec) < 2 {
|
|
continue
|
|
}
|
|
card := normalizeROCmCardKey(rec[0])
|
|
if card == "" {
|
|
continue
|
|
}
|
|
value := strings.TrimSpace(strings.Join(rec[1:], ","))
|
|
if value == "" || looksLikeCSVHeaderValue(value) {
|
|
continue
|
|
}
|
|
rows[card] = value
|
|
}
|
|
return rows
|
|
}
|
|
|
|
// normalizeROCmCardKey converts the first CSV column of a rocm-smi row into a
// canonical "cardN" key.
//
// The input is trimmed, lower-cased and stripped of surrounding double quotes.
// Keys already starting with "card" are returned as-is, bare integers are
// prefixed with "card", and everything else — including the header cells
// "device", "gpu" and "card" — yields "".
func normalizeROCmCardKey(raw string) string {
	key := strings.Trim(strings.ToLower(strings.TrimSpace(raw)), "\"")
	switch {
	case key == "", key == "device", key == "gpu", key == "card":
		// Empty cell or a header label rather than a card identifier.
		return ""
	case strings.HasPrefix(key, "card"):
		return key
	}
	if _, err := strconv.Atoi(key); err != nil {
		return ""
	}
	return "card" + key
}
|
|
|
|
// looksLikeCSVHeaderValue reports whether value appears to be a column header
// rather than data: after trimming and lower-casing it contains one of
// "product", "serial", "vbios" or "bus".
func looksLikeCSVHeaderValue(value string) bool {
	lowered := strings.ToLower(strings.TrimSpace(value))
	for _, marker := range [...]string{"product", "serial", "vbios", "bus"} {
		if strings.Contains(lowered, marker) {
			return true
		}
	}
	return false
}
|