diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go
index 447c2b8..914e5f6 100644
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -105,6 +105,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
 		}
 	}
 	result := collector.Run(runtimeMode)
+	applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
 	if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
 		result.Runtime = &health
 	}
diff --git a/audit/internal/app/sat_overlay.go b/audit/internal/app/sat_overlay.go
new file mode 100644
index 0000000..6b88b80
--- /dev/null
+++ b/audit/internal/app/sat_overlay.go
@@ -0,0 +1,270 @@
+package app
+
+import (
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+
+	"bee/audit/internal/schema"
+)
+
+// applyLatestSATStatuses overlays the results of the latest SAT runs found
+// under baseDir onto snap. For every known run prefix the lexically last
+// "<prefix>*/summary.txt" is loaded and merged into the matching components.
+// A nil snap or a blank baseDir is a no-op.
+func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
+	if snap == nil || strings.TrimSpace(baseDir) == "" {
+		return
+	}
+	if summary, ok := loadLatestSATSummary(baseDir, "gpu-amd-"); ok {
+		applyGPUVendorSAT(snap.PCIeDevices, "amd", summary)
+	}
+	if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
+		applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
+	}
+	if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
+		applyMemorySAT(snap.Memory, summary)
+	}
+	if summary, ok := loadLatestSATSummary(baseDir, "cpu-"); ok {
+		applyCPUSAT(snap.CPUs, summary)
+	}
+	if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
+		applyStorageSAT(snap.Storage, summary)
+	}
+}
+
+// satSummary is the parsed content of one SAT run's summary.txt.
+type satSummary struct {
+	runAtUTC string            // trimmed "run_at_utc" value
+	overall  string            // trimmed, upper-cased "overall_status" value
+	kv       map[string]string // all key/value pairs of the summary
+}
+
+// loadLatestSATSummary reads summary.txt from the lexically last run directory
+// under baseDir whose name starts with prefix; that is the newest run only if
+// run directory names sort chronologically. It reports false when no summary
+// matches or the file cannot be read.
+func loadLatestSATSummary(baseDir, prefix string) (satSummary, bool) {
+	matches, err := filepath.Glob(filepath.Join(baseDir, prefix+"*/summary.txt"))
+	if err != nil || len(matches) == 0 {
+		return satSummary{}, false
+	}
+	sort.Strings(matches)
+	raw, err := os.ReadFile(matches[len(matches)-1])
+	if err != nil {
+		return satSummary{}, false
+	}
+	kv := parseKeyValueSummary(string(raw))
+	return satSummary{
+		runAtUTC: strings.TrimSpace(kv["run_at_utc"]),
+		overall:  strings.ToUpper(strings.TrimSpace(kv["overall_status"])),
+		kv:       kv,
+	}, true
+}
+
+// applyGPUVendorSAT merges the overall status of a vendor GPU SAT run into
+// every PCIe device that matchesGPUVendor accepts for vendor.
+func applyGPUVendorSAT(devs []schema.HardwarePCIeDevice, vendor string, summary satSummary) {
+	status, description, ok := satSummaryStatus(summary, vendor+" GPU SAT")
+	if !ok {
+		return
+	}
+	for i := range devs {
+		if !matchesGPUVendor(devs[i], vendor) {
+			continue
+		}
+		mergeComponentStatus(&devs[i].HardwareComponentStatus, summary.runAtUTC, status, description)
+	}
+}
+
+// applyMemorySAT merges the overall memory SAT status into every DIMM.
+func applyMemorySAT(dimms []schema.HardwareMemory, summary satSummary) {
+	status, description, ok := satSummaryStatus(summary, "memory SAT")
+	if !ok {
+		return
+	}
+	for i := range dimms {
+		mergeComponentStatus(&dimms[i].HardwareComponentStatus, summary.runAtUTC, status, description)
+	}
+}
+
+// applyCPUSAT merges the overall CPU SAT status into every CPU.
+func applyCPUSAT(cpus []schema.HardwareCPU, summary satSummary) {
+	status, description, ok := satSummaryStatus(summary, "CPU SAT")
+	if !ok {
+		return
+	}
+	for i := range cpus {
+		mergeComponentStatus(&cpus[i].HardwareComponentStatus, summary.runAtUTC, status, description)
+	}
+}
+
+// applyStorageSAT merges per-device storage SAT results into the disks whose
+// "linux_device" telemetry entry names the same block device.
+func applyStorageSAT(disks []schema.HardwareStorage, summary satSummary) {
+	byDevice := parseStorageSATStatus(summary)
+	for i := range disks {
+		devPath, _ := disks[i].Telemetry["linux_device"].(string)
+		devPath = strings.TrimSpace(devPath)
+		if devPath == "" {
+			// filepath.Base("") is ".", so reject missing paths up front.
+			continue
+		}
+		result, ok := byDevice[filepath.Base(devPath)]
+		if !ok {
+			continue
+		}
+		mergeComponentStatus(&disks[i].HardwareComponentStatus, summary.runAtUTC, result.status, result.description)
+	}
+}
+
+// satStatusResult is the most severe step status recorded for one device.
+type satStatusResult struct {
+	status      string
+	description string
+	ok          bool
+}
+
+// parseStorageSATStatus reduces "<device>_<step>_status" summary keys to the
+// most severe status per device. Keys are visited in sorted order so that the
+// description chosen among equally severe steps does not depend on Go's
+// randomized map iteration order.
+func parseStorageSATStatus(summary satSummary) map[string]satStatusResult {
+	keys := make([]string, 0, len(summary.kv))
+	for key := range summary.kv {
+		if strings.HasSuffix(key, "_status") && key != "overall_status" {
+			keys = append(keys, key)
+		}
+	}
+	sort.Strings(keys)
+
+	result := map[string]satStatusResult{}
+	for _, key := range keys {
+		base := strings.TrimSuffix(key, "_status")
+		idx := strings.Index(base, "_")
+		if idx <= 0 {
+			continue
+		}
+		devName := base[:idx]
+		step := strings.ReplaceAll(base[idx+1:], "_", "-")
+		stepStatus, desc, ok := satKeyStatus(summary.kv[key], "storage "+step)
+		if !ok {
+			continue
+		}
+		current := result[devName]
+		if !current.ok || statusSeverity(stepStatus) > statusSeverity(current.status) {
+			result[devName] = satStatusResult{status: stepStatus, description: desc, ok: true}
+		}
+	}
+	return result
+}
+
+// satSummaryStatus maps a summary's overall status to a component status.
+func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
+	return satKeyStatus(summary.overall, label)
+}
+
+// satKeyStatus translates a raw SAT status (case and surrounding whitespace
+// are ignored) into a component status and a "<label> ..." description. It
+// reports false for statuses it does not recognize.
+func satKeyStatus(rawStatus, label string) (string, string, bool) {
+	switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
+	case "OK":
+		return "OK", label + " passed", true
+	case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
+		return "Warning", label + " incomplete", true
+	case "FAILED":
+		return "Critical", label + " failed", true
+	default:
+		return "", "", false
+	}
+}
+
+// mergeComponentStatus applies satStatus to component when the component has
+// no status, is "Unknown", or satStatus is more severe than the current
+// status; a SAT result never downgrades an existing status. A non-blank
+// description replaces ErrorDescription, and a non-blank changedAt updates
+// StatusChangedAt and appends a StatusHistory entry.
+func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
+	if component == nil || satStatus == "" {
+		return
+	}
+	current := strings.TrimSpace(ptrString(component.Status))
+	if current != "" && current != "Unknown" && statusSeverity(satStatus) <= statusSeverity(current) {
+		return
+	}
+	component.Status = appStringPtr(satStatus)
+	if strings.TrimSpace(description) != "" {
+		component.ErrorDescription = appStringPtr(description)
+	}
+	if strings.TrimSpace(changedAt) != "" {
+		component.StatusChangedAt = appStringPtr(changedAt)
+		component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
+			Status:    satStatus,
+			ChangedAt: changedAt,
+			Details:   appStringPtr(description),
+		})
+	}
+}
+
+// statusSeverity ranks component statuses: Critical (3) > Warning (2) > OK (1).
+// Any other value ranks 0.
+func statusSeverity(status string) int {
+	switch strings.TrimSpace(status) {
+	case "Critical":
+		return 3
+	case "Warning":
+		return 2
+	case "OK":
+		return 1
+	default:
+		return 0
+	}
+}
+
+// matchesGPUVendor reports whether dev is treated as a GPU made by vendor
+// ("amd" or "nvidia"): its device class must contain one of the accepted
+// class keywords and its manufacturer must name the vendor.
+//
+// NOTE(review): the "Controller" keyword is kept for compatibility, but it
+// also accepts any non-GPU class whose name contains it. Confirm the class
+// values the collector emits before narrowing the list.
+func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
+	if dev.DeviceClass == nil {
+		return false
+	}
+	class := strings.TrimSpace(*dev.DeviceClass)
+	classOK := false
+	for _, keyword := range []string{"Controller", "Accelerator", "Display", "Video"} {
+		if strings.Contains(class, keyword) {
+			classOK = true
+			break
+		}
+	}
+	if !classOK {
+		return false
+	}
+	manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
+	switch vendor {
+	case "amd":
+		return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
+	case "nvidia":
+		return strings.Contains(manufacturer, "nvidia")
+	default:
+		return false
+	}
+}
+
+// ptrString returns the string v points to, or "" when v is nil.
+func ptrString(v *string) string {
+	if v == nil {
+		return ""
+	}
+	return *v
+}
+
+// appStringPtr returns a pointer to a copy of value.
+func appStringPtr(value string) *string {
+	return &value
+}
diff --git a/audit/internal/app/sat_overlay_test.go b/audit/internal/app/sat_overlay_test.go
new file mode 100644
index 0000000..defe09a
--- /dev/null
+++ b/audit/internal/app/sat_overlay_test.go
@@ -0,0 +1,61 @@
+package app
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"bee/audit/internal/schema"
+)
+
+func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
+	baseDir := t.TempDir()
+	runDir := filepath.Join(baseDir, "storage-20260325-161151")
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	raw := "run_at_utc=2026-03-25T16:11:51Z\nnvme0n1_nvme_smart_log_status=OK\nsda_smartctl_health_status=FAILED\noverall_status=FAILED\n"
+	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	nvme := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/nvme0n1"}}
+	usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
+	snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
+
+	applyLatestSATStatuses(&snap, baseDir)
+
+	if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
+		t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
+	}
+	if snap.Storage[1].Status == nil || *snap.Storage[1].Status != "Critical" {
+		t.Fatalf("sda status=%v want Critical", snap.Storage[1].Status)
+	}
+}
+
+func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
+	baseDir := t.TempDir()
+	runDir := filepath.Join(baseDir, "gpu-amd-20260325-161436")
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		t.Fatal(err)
+	}
+	raw := "run_at_utc=2026-03-25T16:14:36Z\noverall_status=FAILED\n"
+	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(raw), 0644); err != nil {
+		t.Fatal(err)
+	}
+
+	class := "DisplayController"
+	manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
+	snap := schema.HardwareSnapshot{
+		PCIeDevices: []schema.HardwarePCIeDevice{{
+			DeviceClass:  &class,
+			Manufacturer: &manufacturer,
+		}},
+	}
+
+	applyLatestSATStatuses(&snap, baseDir)
+
+	if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
+		t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
+	}
+}
diff --git a/audit/internal/collector/amdgpu.go b/audit/internal/collector/amdgpu.go
new file mode 100644
index 0000000..59ad6af
--- /dev/null
+++ b/audit/internal/collector/amdgpu.go
@@ -0,0 +1,290 @@
+package collector
+
+import (
+	"encoding/csv"
+	"log/slog"
+	"os/exec"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+
+	"bee/audit/internal/schema"
+)
+
+// Indirections over os/exec and path/filepath so tests can substitute stubs.
+var (
+	amdSMIExecCommand = exec.Command
+	amdSMILookPath    = exec.LookPath
+	amdSMIGlob        = filepath.Glob
+)
+
+// amdSMIExecutableGlobs lists the fallback locations probed for rocm-smi, in
+// order, when it is not found on PATH.
+var amdSMIExecutableGlobs = []string{
+	"/opt/rocm/bin/rocm-smi",
+	"/opt/rocm-*/bin/rocm-smi",
+	"/usr/local/bin/rocm-smi",
+}
+
+// amdGPUInfo holds the rocm-smi values collected for one GPU.
+type amdGPUInfo struct {
+	BDF      string
+	Serial   string
+	Product  string
+	Firmware string
+	PowerW   *float64
+	TempC    *float64
+}
+
+// enrichPCIeWithAMD fills serial number, firmware, model, power and
+// temperature of the AMD GPUs in devs from rocm-smi, matched by PCIe BDF. It
+// is best effort: without AMD GPUs or when rocm-smi fails, devs is returned
+// unchanged. An already known Model is never overwritten.
+func enrichPCIeWithAMD(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
+	if !hasAMDGPUDevices(devs) {
+		return devs
+	}
+	infoByBDF, err := queryAMDGPUs()
+	if err != nil {
+		slog.Info("amdgpu: enrichment skipped", "err", err)
+		return devs
+	}
+	enriched := 0
+	for i := range devs {
+		if !isAMDGPUDevice(devs[i]) || devs[i].BDF == nil {
+			continue
+		}
+		info, ok := infoByBDF[normalizePCIeBDF(*devs[i].BDF)]
+		if !ok {
+			continue
+		}
+		if strings.TrimSpace(info.Serial) != "" {
+			devs[i].SerialNumber = &info.Serial
+		}
+		if strings.TrimSpace(info.Firmware) != "" {
+			devs[i].Firmware = &info.Firmware
+		}
+		if strings.TrimSpace(info.Product) != "" && devs[i].Model == nil {
+			devs[i].Model = &info.Product
+		}
+		if info.PowerW != nil {
+			devs[i].PowerW = info.PowerW
+		}
+		if info.TempC != nil {
+			devs[i].TemperatureC = info.TempC
+		}
+		enriched++
+	}
+	if enriched > 0 {
+		slog.Info("amdgpu: enriched", "count", enriched)
+	}
+	return devs
+}
+
+// hasAMDGPUDevices reports whether devs contains at least one AMD GPU.
+func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
+	for _, dev := range devs {
+		if isAMDGPUDevice(dev) {
+			return true
+		}
+	}
+	return false
+}
+
+// isAMDGPUDevice reports whether dev's manufacturer contains "advanced micro
+// devices" (case-insensitive) and its device class is accepted by isGPUClass.
+func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
+	if dev.Manufacturer == nil || dev.DeviceClass == nil {
+		return false
+	}
+	manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
+	return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
+}
+
+// queryAMDGPUs runs rocm-smi once per field and returns the collected values
+// keyed by normalized PCIe BDF. Only a failure of the initial --showbus query
+// is returned as an error; the remaining fields are optional and skipped when
+// their query fails.
+func queryAMDGPUs() (map[string]amdGPUInfo, error) {
+	busByCard, err := queryAMDField("--showbus")
+	if err != nil {
+		return nil, err
+	}
+	infoByCard := map[string]amdGPUInfo{}
+	for card, bus := range busByCard {
+		bdf := normalizePCIeBDF(bus)
+		if bdf == "" {
+			continue
+		}
+		infoByCard[card] = amdGPUInfo{BDF: bdf}
+	}
+	if len(infoByCard) == 0 {
+		return map[string]amdGPUInfo{}, nil
+	}
+	mergeAMDField(infoByCard, "--showserial", func(info *amdGPUInfo, value string) { info.Serial = value })
+	mergeAMDField(infoByCard, "--showproductname", func(info *amdGPUInfo, value string) { info.Product = value })
+	mergeAMDField(infoByCard, "--showvbios", func(info *amdGPUInfo, value string) { info.Firmware = value })
+	mergeAMDNumericField(infoByCard, "--showpower", func(info *amdGPUInfo, value float64) { info.PowerW = &value })
+	mergeAMDNumericField(infoByCard, "--showtemp", func(info *amdGPUInfo, value float64) { info.TempC = &value })
+
+	result := make(map[string]amdGPUInfo, len(infoByCard))
+	for _, info := range infoByCard {
+		if info.BDF == "" {
+			continue
+		}
+		result[info.BDF] = info
+	}
+	return result, nil
+}
+
+// mergeAMDField queries the string field selected by flag and applies every
+// non-blank value to the entry of the same card in infoByCard. Cards unknown
+// to infoByCard and query errors are ignored.
+func mergeAMDField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, string)) {
+	values, err := queryAMDField(flag)
+	if err != nil {
+		return
+	}
+	for card, value := range values {
+		info, ok := infoByCard[card]
+		if !ok {
+			continue
+		}
+		value = strings.TrimSpace(value)
+		if value == "" {
+			continue
+		}
+		apply(&info, value)
+		infoByCard[card] = info
+	}
+}
+
+// mergeAMDNumericField is the numeric counterpart of mergeAMDField; values
+// that queryAMDNumericField could not parse are simply absent.
+func mergeAMDNumericField(infoByCard map[string]amdGPUInfo, flag string, apply func(*amdGPUInfo, float64)) {
+	values, err := queryAMDNumericField(flag)
+	if err != nil {
+		return
+	}
+	for card, value := range values {
+		info, ok := infoByCard[card]
+		if !ok {
+			continue
+		}
+		apply(&info, value)
+		infoByCard[card] = info
+	}
+}
+
+// queryAMDField runs "rocm-smi <flag> --csv" and returns the parsed value of
+// each card, keyed by normalized card name.
+func queryAMDField(flag string) (map[string]string, error) {
+	cmd, err := resolveAMDSMICmd(flag, "--csv")
+	if err != nil {
+		return nil, err
+	}
+	// Parse stdout only: diagnostics written to stderr are not CSV and must
+	// not be mixed into the records.
+	out, err := amdSMIExecCommand(cmd[0], cmd[1:]...).Output()
+	if err != nil {
+		return nil, err
+	}
+	return parseROCmSingleValueCSV(string(out)), nil
+}
+
+// queryAMDNumericField is queryAMDField with each value reduced to the number
+// firstFloat extracts from it; values without one are dropped.
+func queryAMDNumericField(flag string) (map[string]float64, error) {
+	values, err := queryAMDField(flag)
+	if err != nil {
+		return nil, err
+	}
+	out := map[string]float64{}
+	for card, raw := range values {
+		if value, ok := firstFloat(raw); ok {
+			out[card] = value
+		}
+	}
+	return out, nil
+}
+
+// resolveAMDSMICmd returns the rocm-smi command line for args. The binary is
+// taken from PATH, otherwise from the first match (in sorted order) of the
+// first matching amdSMIExecutableGlobs pattern; else exec.ErrNotFound.
+func resolveAMDSMICmd(args ...string) ([]string, error) {
+	if path, err := amdSMILookPath("rocm-smi"); err == nil {
+		return append([]string{path}, args...), nil
+	}
+	for _, pattern := range amdSMIExecutableGlobs {
+		matches, err := amdSMIGlob(pattern)
+		if err != nil {
+			continue
+		}
+		sort.Strings(matches)
+		if len(matches) > 0 {
+			return append([]string{matches[0]}, args...), nil
+		}
+	}
+	return nil, exec.ErrNotFound
+}
+
+// parseROCmSingleValueCSV parses two-column rocm-smi CSV output into a map of
+// card name to value. Extra columns are re-joined with "," into the value.
+// Header rows, rows without a recognizable card key and blank values are
+// skipped; input that is not valid CSV yields an empty map.
+func parseROCmSingleValueCSV(raw string) map[string]string {
+	rows := map[string]string{}
+	reader := csv.NewReader(strings.NewReader(raw))
+	reader.FieldsPerRecord = -1
+	records, err := reader.ReadAll()
+	if err != nil {
+		return rows
+	}
+	for _, rec := range records {
+		if len(rec) < 2 {
+			continue
+		}
+		card := normalizeROCmCardKey(rec[0])
+		if card == "" {
+			continue
+		}
+		value := strings.TrimSpace(strings.Join(rec[1:], ","))
+		if value == "" || looksLikeCSVHeaderValue(value) {
+			continue
+		}
+		rows[card] = value
+	}
+	return rows
+}
+
+// normalizeROCmCardKey lower-cases and unquotes a first-column cell. Values
+// starting with "card" are kept, bare integers get the "card" prefix; the
+// header words "device", "gpu" and "card" and anything else yield "".
+func normalizeROCmCardKey(raw string) string {
+	raw = strings.ToLower(strings.TrimSpace(raw))
+	raw = strings.Trim(raw, "\"")
+	if raw == "" {
+		return ""
+	}
+	if raw == "device" || raw == "gpu" || raw == "card" {
+		return ""
+	}
+	if strings.HasPrefix(raw, "card") {
+		return raw
+	}
+	if _, err := strconv.Atoi(raw); err == nil {
+		return "card" + raw
+	}
+	return ""
+}
+
+// looksLikeCSVHeaderValue reports whether value contains one of the header
+// words "product", "serial", "vbios" or "bus" (case-insensitive).
+func looksLikeCSVHeaderValue(value string) bool {
+	value = strings.ToLower(strings.TrimSpace(value))
+	return strings.Contains(value, "product") ||
+		strings.Contains(value, "serial") ||
+		strings.Contains(value, "vbios") ||
+		strings.Contains(value, "bus")
+}
diff --git a/audit/internal/collector/amdgpu_test.go b/audit/internal/collector/amdgpu_test.go
new file mode 100644
index 0000000..dc3417a
--- /dev/null
+++ b/audit/internal/collector/amdgpu_test.go
@@ -0,0 +1,56 @@
+package collector
+
+import (
+	"os/exec"
+	"testing"
+)
+
+func TestParseROCmSingleValueCSV(t *testing.T) {
+	raw := "device,Serial Number\ncard0,ABC123\ncard1,XYZ789\n"
+	got := parseROCmSingleValueCSV(raw)
+	if got["card0"] != "ABC123" {
+		t.Fatalf("card0=%q want ABC123", got["card0"])
+	}
+	if got["card1"] != "XYZ789" {
+		t.Fatalf("card1=%q want XYZ789", got["card1"])
+	}
+}
+
+func TestQueryAMDNumericFieldParsesUnits(t *testing.T) {
+	origExec := amdSMIExecCommand
+	origLookPath := amdSMILookPath
+	t.Cleanup(func() {
+		amdSMIExecCommand = origExec
+		amdSMILookPath = origLookPath
+	})
+
+	amdSMILookPath = func(string) (string, error) { return "/usr/bin/rocm-smi", nil }
+	amdSMIExecCommand = func(name string, args ...string) *exec.Cmd {
+		return exec.Command("sh", "-c", "printf 'device,Temperature\\ncard0,45.5c\\ncard1,67.0c\\n'")
+	}
+
+	got, err := queryAMDNumericField("--showtemp")
+	if err != nil {
+		t.Fatalf("queryAMDNumericField: %v", err)
+	}
+	if got["card0"] != 45.5 {
+		t.Fatalf("card0=%v want 45.5", got["card0"])
+	}
+	if got["card1"] != 67.0 {
+		t.Fatalf("card1=%v want 67.0", got["card1"])
+	}
+}
+
+func TestNormalizeROCmCardKey(t *testing.T) {
+	tests := map[string]string{
+		"0":      "card0",
+		"card1":  "card1",
+		"Device": "",
+		"":       "",
+	}
+	for input, want := range tests {
+		if got := normalizeROCmCardKey(input); got != want {
+			t.Fatalf("normalizeROCmCardKey(%q)=%q want %q", input, got, want)
+		}
+	}
+}
diff --git a/audit/internal/collector/collector.go b/audit/internal/collector/collector.go
index c91e866..f9990fc 100644
--- a/audit/internal/collector/collector.go
+++ b/audit/internal/collector/collector.go
@@ -36,6 +36,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
 	snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
 	snap.Storage = collectStorage()
 	snap.PCIeDevices = collectPCIe()
+	snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
diff --git a/audit/internal/collector/storage.go b/audit/internal/collector/storage.go
index 5dcd03d..efdb2fc 100644
--- a/audit/internal/collector/storage.go
+++ b/audit/internal/collector/storage.go
@@ -190,6 +190,7 @@ type smartctlInfo struct {
 func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
 	present := true
 	s := schema.HardwareStorage{Present: &present}
+	s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
 
 	tran := strings.ToLower(dev.Tran)
 	devPath := "/dev/" + dev.Name
@@ -348,6 +349,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
 		Present:   &present,
 		Type:      &devType,
 		Interface: &iface,
+		Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
 	}
 
 	devPath := "/dev/" + dev.Name
diff --git a/internal/chart b/internal/chart
index ac8120c..05db699 160000
--- a/internal/chart
+++ b/internal/chart
@@ -1 +1 @@
-Subproject commit ac8120c8ab800bb3067efcada50bc4272dc8f76a
+Subproject commit 05db6994d4a77bc95cc9d96892f81875f2f9fa01