package app

import (
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"bee/audit/internal/schema"
)

// applyLatestSATStatuses overlays the most recent SAT results found under
// baseDir onto the component statuses in snap, then overlays the records
// from db.
//
// For each run prefix (gpu-amd-, gpu-nvidia-, memory-, cpu-, storage-) the
// latest summary.txt is loaded and merged into the matching components.
// Nothing is applied when snap is nil or baseDir is blank.
//
// NOTE(review): the blank-baseDir guard also skips the db overlay, although
// the overlay itself does not read baseDir — confirm this is intended.
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
	if snap == nil || strings.TrimSpace(baseDir) == "" {
		return
	}
	if summary, ok := loadLatestSATSummary(baseDir, "gpu-amd-"); ok {
		applyGPUVendorSAT(snap.PCIeDevices, "amd", summary)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "gpu-nvidia-"); ok {
		applyGPUVendorSAT(snap.PCIeDevices, "nvidia", summary)
		applyNvidiaPerGPUStatus(snap.PCIeDevices, baseDir)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "memory-"); ok {
		applyMemorySAT(snap.Memory, summary)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "cpu-"); ok {
		applyCPUSAT(snap.CPUs, summary)
	}
	if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
		applyStorageSAT(snap.Storage, summary)
	}
	// Apply unified component status DB — overlaid last so it can only upgrade severity.
	applyComponentStatusDB(snap, db)
}

// nvidiaPerGPUStatus is the per-GPU outcome read from one gpu-*-status.txt file.
type nvidiaPerGPUStatus struct {
	runStatus string // upper-cased "run_status" value
	reason    string // trimmed "reason" value, may be empty
}

// applyNvidiaPerGPUStatus merges per-GPU results of the latest NVIDIA SAT run
// into the devices whose telemetry carries a matching "nvidia_gpu_index".
// Devices without telemetry, without an index, or without a per-GPU record
// are left untouched.
func applyNvidiaPerGPUStatus(devs []schema.HardwarePCIeDevice, baseDir string) {
	statusByIndex, ts, ok := loadLatestNvidiaPerGPUStatus(baseDir)
	if !ok {
		return
	}
	for i := range devs {
		if devs[i].Telemetry == nil {
			continue
		}
		rawIdx, ok := devs[i].Telemetry["nvidia_gpu_index"]
		if !ok {
			continue
		}
		idx, ok := telemetryInt(rawIdx)
		if !ok {
			continue
		}
		st, ok := statusByIndex[idx]
		if !ok {
			continue
		}
		label := firstNonEmpty(strings.TrimSpace(st.reason), "nvidia GPU SAT")
		status, description, ok := satKeyStatus(st.runStatus, label)
		if !ok {
			continue
		}
		mergeComponentStatusPreferDetail(&devs[i].HardwareComponentStatus, ts, status, description)
	}
}

// loadLatestNvidiaPerGPUStatus reads the gpu-*-status.txt files of the latest
// gpu-nvidia-* run under baseDir. It returns the statuses keyed by
// "gpu_index", the run's "run_at_utc" value, and false when no usable data
// exists.
//
// The run is selected with the same glob and ordering as
// loadLatestSATSummary (latest summary.txt path in lexical order), so the
// per-GPU data always comes from the run whose overall summary was applied.
// Stray files or runs without a summary.txt are never picked.
func loadLatestNvidiaPerGPUStatus(baseDir string) (map[int]nvidiaPerGPUStatus, string, bool) {
	summaries, err := filepath.Glob(filepath.Join(baseDir, "gpu-nvidia-*", "summary.txt"))
	if err != nil || len(summaries) == 0 {
		return nil, "", false
	}
	sort.Strings(summaries)
	summaryPath := summaries[len(summaries)-1]
	runDir := filepath.Dir(summaryPath)

	summaryRaw, err := os.ReadFile(summaryPath)
	if err != nil {
		return nil, "", false
	}
	summaryKV := parseKeyValueSummary(string(summaryRaw))
	runAtUTC := strings.TrimSpace(summaryKV["run_at_utc"])

	files, err := filepath.Glob(filepath.Join(runDir, "gpu-*-status.txt"))
	if err != nil || len(files) == 0 {
		return nil, "", false
	}
	out := make(map[int]nvidiaPerGPUStatus, len(files))
	for _, file := range files {
		raw, err := os.ReadFile(file)
		if err != nil {
			// Best effort: an unreadable per-GPU file must not hide the others.
			continue
		}
		kv := parseKeyValueSummary(string(raw))
		idx, err := strconv.Atoi(strings.TrimSpace(kv["gpu_index"]))
		if err != nil {
			continue
		}
		out[idx] = nvidiaPerGPUStatus{
			runStatus: strings.ToUpper(strings.TrimSpace(kv["run_status"])),
			reason:    strings.TrimSpace(kv["reason"]),
		}
	}
	if len(out) == 0 {
		return nil, "", false
	}
	return out, runAtUTC, true
}

// telemetryInt converts a telemetry value to int. It accepts int, int32,
// int64, float64 and decimal strings (surrounding whitespace ignored).
// A float64 is accepted only when it holds an exact integer; fractional,
// NaN and infinite values are rejected instead of being silently truncated
// into a wrong index.
func telemetryInt(v any) (int, bool) {
	switch value := v.(type) {
	case int:
		return value, true
	case int32:
		return int(value), true
	case int64:
		return int(value), true
	case float64:
		n := int(value)
		if float64(n) != value {
			return 0, false
		}
		return n, true
	case string:
		n, err := strconv.Atoi(strings.TrimSpace(value))
		if err != nil {
			return 0, false
		}
		return n, true
	default:
		return 0, false
	}
}

// satSummary is the parsed content of one SAT run's summary.txt.
type satSummary struct {
	runAtUTC string            // trimmed "run_at_utc" value
	overall  string            // upper-cased "overall_status" value
	kv       map[string]string // every key/value pair of the summary
}

// loadLatestSATSummary loads the summary.txt of the latest run directory
// under baseDir whose name starts with prefix. "Latest" is the last path in
// lexical order. It returns false when no summary exists or it cannot be read.
func loadLatestSATSummary(baseDir, prefix string) (satSummary, bool) {
	matches, err := filepath.Glob(filepath.Join(baseDir, prefix+"*/summary.txt"))
	if err != nil || len(matches) == 0 {
		return satSummary{}, false
	}
	sort.Strings(matches)
	raw, err := os.ReadFile(matches[len(matches)-1])
	if err != nil {
		return satSummary{}, false
	}
	kv := parseKeyValueSummary(string(raw))
	return satSummary{
		runAtUTC: strings.TrimSpace(kv["run_at_utc"]),
		overall:  strings.ToUpper(strings.TrimSpace(kv["overall_status"])),
		kv:       kv,
	}, true
}

// applyGPUVendorSAT merges the overall status of a GPU SAT run into every
// PCIe device accepted by matchesGPUVendor for vendor.
func applyGPUVendorSAT(devs []schema.HardwarePCIeDevice, vendor string, summary satSummary) {
	status, description, ok := satSummaryStatus(summary, vendor+" GPU SAT")
	if !ok {
		return
	}
	for i := range devs {
		if !matchesGPUVendor(devs[i], vendor) {
			continue
		}
		mergeComponentStatus(&devs[i].HardwareComponentStatus, summary.runAtUTC, status, description)
	}
}

// applyMemorySAT merges the overall status of a memory SAT run into every DIMM.
func applyMemorySAT(dimms []schema.HardwareMemory, summary satSummary) {
	status, description, ok := satSummaryStatus(summary, "memory SAT")
	if !ok {
		return
	}
	for i := range dimms {
		mergeComponentStatus(&dimms[i].HardwareComponentStatus, summary.runAtUTC, status, description)
	}
}

// applyCPUSAT merges the overall status of a CPU SAT run into every CPU.
func applyCPUSAT(cpus []schema.HardwareCPU, summary satSummary) {
	status, description, ok := satSummaryStatus(summary, "CPU SAT")
	if !ok {
		return
	}
	for i := range cpus {
		mergeComponentStatus(&cpus[i].HardwareComponentStatus, summary.runAtUTC, status, description)
	}
}

// applyStorageSAT merges per-device storage SAT results into the disks whose
// "linux_device" telemetry base name matches a device in the summary.
// Disks without a "linux_device" value are skipped.
func applyStorageSAT(disks []schema.HardwareStorage, summary satSummary) {
	byDevice := parseStorageSATStatus(summary)
	for i := range disks {
		devPath, _ := disks[i].Telemetry["linux_device"].(string)
		devPath = strings.TrimSpace(devPath)
		// Check before filepath.Base: Base("") returns ".", never "".
		if devPath == "" {
			continue
		}
		result, ok := byDevice[filepath.Base(devPath)]
		if !ok {
			continue
		}
		mergeComponentStatus(&disks[i].HardwareComponentStatus, summary.runAtUTC, result.status, result.description)
	}
}

// satStatusResult is the aggregated SAT outcome for one storage device.
type satStatusResult struct {
	status      string
	description string
	ok          bool // true once a recognised step status has been recorded
}

// parseStorageSATStatus collects per-device results from summary keys of the
// form "<device>_<step>_status" ("overall_status" is ignored) and keeps, per
// device, the most severe step result.
//
// Keys are visited in sorted order so that, when several steps share the
// highest severity, the reported description is deterministic (the first key
// in lexical order wins) instead of depending on map iteration order.
func parseStorageSATStatus(summary satSummary) map[string]satStatusResult {
	keys := make([]string, 0, len(summary.kv))
	for key := range summary.kv {
		if strings.HasSuffix(key, "_status") && key != "overall_status" {
			keys = append(keys, key)
		}
	}
	sort.Strings(keys)

	result := make(map[string]satStatusResult)
	for _, key := range keys {
		base := strings.TrimSuffix(key, "_status")
		devName, stepRaw, found := strings.Cut(base, "_")
		if !found || devName == "" {
			continue
		}
		step := strings.ReplaceAll(stepRaw, "_", "-")
		stepStatus, desc, ok := satKeyStatus(summary.kv[key], "storage "+step)
		if !ok {
			continue
		}
		current := result[devName]
		if !current.ok || statusSeverity(stepStatus) > statusSeverity(current.status) {
			result[devName] = satStatusResult{status: stepStatus, description: desc, ok: true}
		}
	}
	return result
}

// satSummaryStatus maps the overall status of summary via satKeyStatus.
func satSummaryStatus(summary satSummary, label string) (string, string, bool) {
	return satKeyStatus(summary.overall, label)
}

// satKeyStatus maps a raw SAT status (case-insensitive, whitespace ignored) to
// a component status and an error description. The final result is false
// when rawStatus is not a recognised value.
func satKeyStatus(rawStatus, label string) (string, string, bool) {
	switch strings.ToUpper(strings.TrimSpace(rawStatus)) {
	case "OK":
		// No error description on success — error_description is for problems only.
		return "OK", "", true
	case "PARTIAL", "UNSUPPORTED", "CANCELED", "CANCELLED":
		// Tool couldn't run or test was incomplete — we can't assert hardware health.
		return "Unknown", "", true
	case "FAILED":
		return "Critical", label + " failed", true
	default:
		return "", "", false
	}
}

// mergeComponentStatus overwrites the status of component with satStatus when
// the current status is empty, "Unknown", or strictly less severe.
// ErrorDescription is replaced only by a non-blank description; the change
// timestamp and a history entry are recorded only when changedAt is non-blank.
func mergeComponentStatus(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
	if component == nil || satStatus == "" {
		return
	}
	current := strings.TrimSpace(ptrString(component.Status))
	if current != "" && current != "Unknown" && statusSeverity(satStatus) <= statusSeverity(current) {
		return
	}
	component.Status = appStringPtr(satStatus)
	if strings.TrimSpace(description) != "" {
		component.ErrorDescription = appStringPtr(description)
	}
	if strings.TrimSpace(changedAt) != "" {
		component.StatusChangedAt = appStringPtr(changedAt)
		component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
			Status:    satStatus,
			ChangedAt: changedAt,
			Details:   appStringPtr(description),
		})
	}
}

// mergeComponentStatusPreferDetail behaves like mergeComponentStatus, and in
// addition lets a result of equal severity replace the current one when it
// carries a non-blank description.
func mergeComponentStatusPreferDetail(component *schema.HardwareComponentStatus, changedAt, satStatus, description string) {
	if component == nil || satStatus == "" {
		return
	}
	current := strings.TrimSpace(ptrString(component.Status))
	newSeverity := statusSeverity(satStatus)
	currentSeverity := statusSeverity(current)
	if current == "" || current == "Unknown" || newSeverity > currentSeverity {
		mergeComponentStatus(component, changedAt, satStatus, description)
		return
	}
	if newSeverity != currentSeverity || strings.TrimSpace(description) == "" {
		return
	}
	component.Status = appStringPtr(satStatus)
	component.ErrorDescription = appStringPtr(description)
	if strings.TrimSpace(changedAt) != "" {
		component.StatusChangedAt = appStringPtr(changedAt)
		component.StatusHistory = append(component.StatusHistory, schema.HardwareStatusHistory{
			Status:    satStatus,
			ChangedAt: changedAt,
			Details:   appStringPtr(description),
		})
	}
}

// statusSeverity ranks a component status; unrecognised values rank lowest.
func statusSeverity(status string) int {
	switch strings.TrimSpace(status) {
	case "Critical":
		return 3
	case "Warning":
		return 2
	case "OK":
		return 1
	case "Unknown":
		return 1 // same as OK — does not override OK from another source
	default:
		return 0
	}
}

// matchesGPUVendor reports whether dev has a device class containing
// "Controller", "Accelerator", "Display" or "Video" and a manufacturer string
// matching vendor ("amd" or "nvidia"). Devices without a class never match.
//
// NOTE(review): "Controller" is a broad marker; whether it can also match
// non-GPU devices of the same manufacturer depends on the class naming used
// by the snapshot producer — verify.
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
	if dev.DeviceClass == nil {
		return false
	}
	class := strings.TrimSpace(*dev.DeviceClass)
	classOK := strings.Contains(class, "Controller") ||
		strings.Contains(class, "Accelerator") ||
		strings.Contains(class, "Display") ||
		strings.Contains(class, "Video")
	if !classOK {
		return false
	}
	manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
	switch vendor {
	case "amd":
		return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
	case "nvidia":
		return strings.Contains(manufacturer, "nvidia")
	default:
		return false
	}
}

// applyComponentStatusDB merges every record of db into the components of
// snap selected by the record's key prefix:
//
//   - "pcie:[gpu:]<bdf>"  — PCIe devices whose BDF matches after normalisation
//   - "storage:all"       — every storage device
//   - "storage:<name>"    — storage devices whose "linux_device" base name matches
//   - "memory:…"          — every DIMM
//   - "cpu:…"             — every CPU
//
// Records with an unrecognised status or key prefix are ignored.
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
	if snap == nil || db == nil {
		return
	}
	for _, rec := range db.All() {
		key := rec.ComponentKey
		status := dbStatusToSATStatus(rec.Status)
		if status == "" {
			continue
		}
		detail := rec.ErrorSummary
		ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
		switch {
		case strings.HasPrefix(key, "pcie:"):
			bdf := strings.TrimPrefix(key, "pcie:")
			bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
			// The remainder may be empty or unusable — skip BDF matching then.
			normalized := sanitizeBDFForLookup(bdf)
			if normalized == "" {
				break
			}
			for i := range snap.PCIeDevices {
				if snap.PCIeDevices[i].BDF == nil {
					continue
				}
				if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
					mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
				}
			}
		case strings.HasPrefix(key, "storage:"):
			devName := strings.TrimPrefix(key, "storage:")
			for i := range snap.Storage {
				if devName != "all" {
					linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
					if filepath.Base(strings.TrimSpace(linuxDev)) != devName {
						continue
					}
				}
				mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
			}
		case strings.HasPrefix(key, "memory:"):
			for i := range snap.Memory {
				mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
			}
		case strings.HasPrefix(key, "cpu:"):
			for i := range snap.CPUs {
				mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
			}
		}
	}
}

// dbStatusToSATStatus converts ComponentStatusDB status strings to the format
// expected by mergeComponentStatus (which uses "OK", "Warning", "Critical", "Unknown").
// Surrounding whitespace is stripped from the returned value; any other
// status yields "".
func dbStatusToSATStatus(s string) string {
	switch trimmed := strings.TrimSpace(s); trimmed {
	case "OK", "Warning", "Critical", "Unknown":
		return trimmed
	default:
		return ""
	}
}

// sanitizeBDFForLookup normalises a PCIe BDF address to a canonical lower-case form
// suitable for comparison. "c8:00.0" → "0000:c8:00.0"; already-full BDFs are left as-is.
func sanitizeBDFForLookup(bdf string) string {
	addr := strings.ToLower(strings.TrimSpace(bdf))
	switch {
	case addr == "", addr == "gpu":
		// Nothing usable to compare against.
		return ""
	case strings.ContainsAny(addr, " \t"):
		// Embedded blanks mean this is not a single address token.
		return ""
	}
	if strings.Count(addr, ":") != 1 {
		return addr
	}
	// Exactly one colon: short "bus:device.function" form, so prepend the
	// default domain.
	return "0000:" + addr
}

// ptrString dereferences v, yielding "" for a nil pointer.
func ptrString(v *string) string {
	if v != nil {
		return *v
	}
	return ""
}

// appStringPtr returns a pointer to a fresh copy of value.
func appStringPtr(value string) *string {
	out := value
	return &out
}