Align hardware export with ingest contract
This commit is contained in:
@@ -24,7 +24,7 @@ type nvidiaGPUInfo struct {
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
// If the driver/tool is unavailable, NVIDIA devices get UNKNOWN status and
|
||||
// If the driver/tool is unavailable, NVIDIA devices get Unknown status and
|
||||
// a stable serial fallback based on board serial + slot.
|
||||
func enrichPCIeWithNVIDIA(devs []schema.HardwarePCIeDevice, boardSerial string) []schema.HardwarePCIeDevice {
|
||||
if !hasNVIDIADevices(devs) {
|
||||
@@ -78,9 +78,10 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
devs[i].Firmware = &v
|
||||
}
|
||||
|
||||
status := "OK"
|
||||
status := statusOK
|
||||
if info.ECCUncorrected != nil && *info.ECCUncorrected > 0 {
|
||||
status = "WARNING"
|
||||
status = statusWarning
|
||||
devs[i].ErrorDescription = stringPtr("GPU reports uncorrected ECC errors")
|
||||
}
|
||||
devs[i].Status = &status
|
||||
injectNVIDIATelemetry(&devs[i], info)
|
||||
@@ -214,7 +215,7 @@ func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
||||
|
||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
||||
setPCIeFallbackSerial(dev, boardSerial)
|
||||
status := "UNKNOWN"
|
||||
status := statusUnknown
|
||||
dev.Status = &status
|
||||
}
|
||||
|
||||
@@ -233,25 +234,19 @@ func setPCIeFallbackSerial(dev *schema.HardwarePCIeDevice, boardSerial string) {
|
||||
}
|
||||
|
||||
func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if dev.Telemetry == nil {
|
||||
dev.Telemetry = map[string]any{}
|
||||
}
|
||||
if info.TemperatureC != nil {
|
||||
dev.Telemetry["temperature_c"] = *info.TemperatureC
|
||||
dev.TemperatureC = info.TemperatureC
|
||||
}
|
||||
if info.PowerW != nil {
|
||||
dev.Telemetry["power_w"] = *info.PowerW
|
||||
dev.PowerW = info.PowerW
|
||||
}
|
||||
if info.ECCUncorrected != nil {
|
||||
dev.Telemetry["ecc_uncorrected_total"] = *info.ECCUncorrected
|
||||
dev.ECCUncorrectedTotal = info.ECCUncorrected
|
||||
}
|
||||
if info.ECCCorrected != nil {
|
||||
dev.Telemetry["ecc_corrected_total"] = *info.ECCCorrected
|
||||
dev.ECCCorrectedTotal = info.ECCCorrected
|
||||
}
|
||||
if info.HWSlowdown != nil {
|
||||
dev.Telemetry["hw_slowdown_active"] = *info.HWSlowdown
|
||||
}
|
||||
if len(dev.Telemetry) == 0 {
|
||||
dev.Telemetry = nil
|
||||
dev.HWSlowdown = info.HWSlowdown
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user