package collector

import (
	"fmt"
	"strings"

	"git.mchus.pro/mchus/logpile/internal/collector/redfishprofile"
	"git.mchus.pro/mchus/logpile/internal/models"
)

// collectGPUs gathers GPU inventory from the PCIeDevices, Accelerators and
// GraphicsControllers collections under every system path, and from the
// PCIeDevices and Accelerators collections under every chassis path.
//
// Collection members rejected by looksLikeGPU are ignored. Every remaining
// member is parsed together with its linked PCIe functions and the
// EnvironmentMetrics/Metrics documents linked from the member and from each
// of its functions. Results are de-duplicated by gpuDocDedupKey; members with
// an empty key are dropped. When
// plan.Directives.EnableGenericGraphicsControllerDedup is set, candidates
// rejected by shouldSkipGenericGPUDuplicate are skipped and the final list is
// passed through dropModelOnlyGPUPlaceholders.
func (r redfishSnapshotReader) collectGPUs(systemPaths, chassisPaths []string, plan redfishprofile.ResolvedAnalysisPlan) []models.GPU {
	collections := make([]string, 0, len(systemPaths)*3+len(chassisPaths)*2)
	for _, systemPath := range systemPaths {
		collections = append(collections, joinPath(systemPath, "/PCIeDevices"))
		collections = append(collections, joinPath(systemPath, "/Accelerators"))
		collections = append(collections, joinPath(systemPath, "/GraphicsControllers"))
	}
	for _, chassisPath := range chassisPaths {
		collections = append(collections, joinPath(chassisPath, "/PCIeDevices"))
		collections = append(collections, joinPath(chassisPath, "/Accelerators"))
	}

	var out []models.GPU
	seen := make(map[string]struct{})
	idx := 1
	for _, collectionPath := range collections {
		memberDocs, err := r.getCollectionMembers(collectionPath)
		if err != nil || len(memberDocs) == 0 {
			// Best effort: a missing or empty collection is not an error here.
			continue
		}
		for _, doc := range memberDocs {
			functionDocs := r.getLinkedPCIeFunctions(doc)
			if !looksLikeGPU(doc, functionDocs) {
				continue
			}
			supplementalDocs := r.getLinkedSupplementalDocs(doc, "EnvironmentMetrics", "Metrics")
			for _, fn := range functionDocs {
				supplementalDocs = append(supplementalDocs, r.getLinkedSupplementalDocs(fn, "EnvironmentMetrics", "Metrics")...)
			}
			gpu := parseGPUWithSupplementalDocs(doc, functionDocs, supplementalDocs, idx)
			// idx advances for every parsed candidate, including ones dropped
			// below, so the index handed to the parser may have gaps.
			idx++
			if plan.Directives.EnableGenericGraphicsControllerDedup && shouldSkipGenericGPUDuplicate(out, gpu) {
				continue
			}
			key := gpuDocDedupKey(doc, gpu)
			if key == "" {
				continue
			}
			if _, ok := seen[key]; ok {
				continue
			}
			seen[key] = struct{}{}
			out = append(out, gpu)
		}
	}
	if plan.Directives.EnableGenericGraphicsControllerDedup {
		return dropModelOnlyGPUPlaceholders(out)
	}
	return out
}

// msiGhostGPUFilter returns true when the GPU chassis for gpuID shows a temperature
// of 0 on a powered-on host, which is the reliable MSI/AMI signal that the GPU is
// no longer physically installed (stale BMC inventory cache).
// It only filters when the system PowerState is "On" — when the host is off, all
// temperature readings are 0 and we cannot distinguish absent from idle.
// If no system document can be read, the power state is unknown and the
// function returns false rather than risk dropping an installed GPU.
func (r redfishSnapshotReader) msiGhostGPUFilter(systemPaths []string, gpuID, chassisPath string) bool {
	// Require host powered on. Only the first readable system document is
	// consulted; unreadable ones are skipped.
	poweredOn := false
	for _, sp := range systemPaths {
		doc, err := r.getJSON(sp)
		if err != nil {
			continue
		}
		poweredOn = strings.EqualFold(strings.TrimSpace(asString(doc["PowerState"])), "on")
		break
	}
	if !poweredOn {
		// Either the host is not "On" or its state could not be confirmed.
		return false
	}

	// Read the temperature sensor for this GPU chassis.
	sensorPath := joinPath(chassisPath, "/Sensors/"+gpuID+"_Temperature")
	sensorDoc, err := r.getJSON(sensorPath)
	if err != nil || len(sensorDoc) == 0 {
		return false
	}
	reading, ok := sensorDoc["Reading"]
	if !ok {
		return false
	}
	// NOTE(review): float64 is presumably what getJSON yields for JSON numbers;
	// the integer cases cover documents built some other way — confirm.
	// Any other type (including nil) is treated as "not a ghost".
	switch v := reading.(type) {
	case float64:
		return v == 0
	case int:
		return v == 0
	case int64:
		return v == 0
	}
	return false
}

// collectGPUsFromProcessors finds GPUs that some BMCs (e.g. MSI) expose as
// Processor entries with ProcessorType=GPU rather than as PCIe devices.
// It supplements the existing gpus slice (already found via PCIe path),
// skipping entries already present by UUID or SerialNumber.
// Serial numbers are looked up from Chassis members named after each GPU Id.
func (r redfishSnapshotReader) collectGPUsFromProcessors(systemPaths, chassisPaths []string, existing []models.GPU, plan redfishprofile.ResolvedAnalysisPlan) []models.GPU { if !plan.Directives.EnableProcessorGPUFallback { return append([]models.GPU{}, existing...) } chassisByID := make(map[string]map[string]interface{}) chassisPathByID := make(map[string]string) for _, cp := range chassisPaths { doc, err := r.getJSON(cp) if err != nil || len(doc) == 0 { continue } id := strings.TrimSpace(asString(doc["Id"])) if id != "" { chassisByID[strings.ToUpper(id)] = doc chassisPathByID[strings.ToUpper(id)] = cp } } seenUUID := make(map[string]struct{}) seenSerial := make(map[string]struct{}) for _, g := range existing { if u := strings.ToUpper(strings.TrimSpace(g.UUID)); u != "" { seenUUID[u] = struct{}{} } if s := strings.ToUpper(strings.TrimSpace(g.SerialNumber)); s != "" { seenSerial[s] = struct{}{} } } out := append([]models.GPU{}, existing...) idx := len(existing) + 1 for _, systemPath := range systemPaths { procDocs, err := r.getCollectionMembers(joinPath(systemPath, "/Processors")) if err != nil { continue } for _, doc := range procDocs { if !strings.EqualFold(strings.TrimSpace(asString(doc["ProcessorType"])), "GPU") { continue } gpuID := strings.TrimSpace(asString(doc["Id"])) serial := findFirstNormalizedStringByKeys(doc, "SerialNumber") if serial == "" { serial = resolveProcessorGPUChassisSerial(chassisByID, gpuID, plan) } if plan.Directives.EnableMSIGhostGPUFilter { chassisPath := resolveProcessorGPUChassisPath(chassisPathByID, gpuID, plan) if chassisPath != "" && r.msiGhostGPUFilter(systemPaths, gpuID, chassisPath) { continue } } uuid := strings.TrimSpace(asString(doc["UUID"])) uuidKey := strings.ToUpper(uuid) serialKey := strings.ToUpper(serial) if uuidKey != "" { if _, dup := seenUUID[uuidKey]; dup { continue } seenUUID[uuidKey] = struct{}{} } if serialKey != "" { if _, dup := seenSerial[serialKey]; dup { continue } seenSerial[serialKey] = struct{}{} } slotLabel 
:= firstNonEmpty( redfishLocationLabel(doc["Location"]), redfishLocationLabel(doc["PhysicalLocation"]), ) if slotLabel == "" && gpuID != "" { slotLabel = gpuID } if slotLabel == "" { slotLabel = fmt.Sprintf("GPU%d", idx) } out = append(out, models.GPU{ Slot: slotLabel, Model: firstNonEmpty(asString(doc["Model"]), asString(doc["Name"])), Manufacturer: asString(doc["Manufacturer"]), PartNumber: asString(doc["PartNumber"]), SerialNumber: serial, UUID: uuid, Status: mapStatus(doc["Status"]), }) idx++ } } return out }