Align hardware export with ingest contract

This commit is contained in:
Mikhail Chusavitin
2026-03-15 21:04:53 +03:00
parent b8c235b5ac
commit ab5a4be7ac
37 changed files with 3304 additions and 354 deletions

View File

@@ -24,7 +24,7 @@ type nvidiaGPUInfo struct {
}
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
// If the driver/tool is unavailable, NVIDIA devices get UNKNOWN status and
// If the driver/tool is unavailable, NVIDIA devices get Unknown status and
// a stable serial fallback based on board serial + slot.
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice {
if !hasNVIDIADevices(devs) {
@@ -78,9 +78,10 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
devs[i].Firmware = &v
}
status := "OK"
status := statusOK
if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
status = "WARNING"
status = statusWarning
devs[i].ErrorDescription = stringPtr("GPU reports uncorrected ECC errors")
}
devs[i].Status = &status
injectNVIDIATelemetry(&devs[i], info)
@@ -214,7 +215,7 @@ func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
// setPCIeFallback applies the fallback state to dev: it calls
// setPCIeFallbackSerial(dev, boardSerial) and then points dev.Status at a
// local variable holding the "unknown" status value.
//
// NOTE(review): the two consecutive `status :=` lines below look like the
// removed and added sides of a diff hunk rendered without +/- markers (the
// literal "UNKNOWN" versus the statusUnknown identifier). Both cannot coexist
// in compilable Go, since the second `:=` declares no new variable — confirm
// against the real file which one is current.
func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) {
setPCIeFallbackSerial(dev, boardSerial)
status := "UNKNOWN"
status := statusUnknown
// Status is assigned the address of the local, so each call gets its own value.
dev.Status = &status
}
@@ -233,25 +234,19 @@ func setPCIeFallbackSerial(dev *schema.HardwarePCIeDevice, boardSerial string) {
}
// injectNVIDIATelemetry copies the optional readings carried by info
// (temperature, power draw, uncorrected/corrected ECC counters and the HW
// slowdown flag) onto dev. Every reading is guarded by a nil check on the
// corresponding info pointer.
//
// NOTE(review): this span appears to interleave the removed and added sides
// of a diff without +/- markers. Two variants are visible for each reading:
// one stores the dereferenced value in the dev.Telemetry map under a string
// key, the other assigns the pointer itself to a typed field on dev
// (TemperatureC, PowerW, ECCUncorrectedTotal, ECCCorrectedTotal, HWSlowdown).
// Presumably the map-based lines are the old side and the typed-field lines
// the new side — confirm against the real file before relying on either.
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
// Create the map up front so the keyed writes below never hit a nil map.
if dev.Telemetry == nil {
dev.Telemetry = map[string]any{}
}
if info.TemperatureC != nil {
dev.Telemetry["temperature_c"] = *info.TemperatureC
dev.TemperatureC = info.TemperatureC
}
if info.PowerW != nil {
dev.Telemetry["power_w"] = *info.PowerW
dev.PowerW = info.PowerW
}
if info.ECCUncorrected != nil {
dev.Telemetry["ecc_uncorrected_total"] = *info.ECCUncorrected
dev.ECCUncorrectedTotal = info.ECCUncorrected
}
if info.ECCCorrected != nil {
dev.Telemetry["ecc_corrected_total"] = *info.ECCCorrected
dev.ECCCorrectedTotal = info.ECCCorrected
}
if info.HWSlowdown != nil {
dev.Telemetry["hw_slowdown_active"] = *info.HWSlowdown
}
// Reset the map to nil when no reading was stored in it.
// NOTE(review): as rendered, the HWSlowdown field assignment sits inside this
// emptiness check; it most likely belongs under the `info.HWSlowdown != nil`
// guard above, with the nesting being an artifact of the missing diff
// markers — verify.
if len(dev.Telemetry) == 0 {
dev.Telemetry = nil
dev.HWSlowdown = info.HWSlowdown
}
}