From a714c45f10a9c68c9f395fdc927ee94ce6348aa6 Mon Sep 17 00:00:00 2001 From: Michael Chus Date: Sun, 29 Mar 2026 11:10:13 +0300 Subject: [PATCH] fix(metrics): parse rocm-smi CSV by header keywords, not column position MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MI250X outputs 7 temperature columns before power/use%; positional parsing read junction temp (~40°C) as GPU utilisation. Switch to header-based colIdx() lookup so the correct fields are read regardless of column order or rocm-smi version. Co-Authored-By: Claude Sonnet 4.6 --- audit/internal/platform/gpu_metrics.go | 78 ++++++++++++++------------ 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/audit/internal/platform/gpu_metrics.go b/audit/internal/platform/gpu_metrics.go index c10af5e..0bec459 100644 --- a/audit/internal/platform/gpu_metrics.go +++ b/audit/internal/platform/gpu_metrics.go @@ -78,48 +78,56 @@ func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) { // sampleAMDGPUMetrics queries rocm-smi for live GPU metrics. func sampleAMDGPUMetrics() ([]GPUMetricRow, error) { - // --showtemp --showuse --showpower --csv — one row per GPU out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv") if err != nil { return nil, err } - var rows []GPUMetricRow - for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { - line = strings.TrimSpace(line) - if line == "" || strings.HasPrefix(strings.ToLower(line), "device") { - continue - } - // CSV format: device,temp_c,gpu_use%,mem_use%,power_w (order may vary by rocm-smi version) - // We parse by column header from the first line. - parts := strings.Split(line, ",") - if len(parts) < 2 { - continue - } - idx := len(rows) - row := GPUMetricRow{GPUIndex: idx} - // rocm-smi CSV columns vary; extract what we can - for i, p := range parts { - p = strings.TrimSpace(p) - switch { - case i == 0: - // device index like "card0" or "0" - case strings.Contains(strings.ToLower(p), "n/a"): - // skip N/A - default: - // Try to match by position heuristic: temp, use%, memuse%, power - v := parseGPUFloat(p) - switch { - case i == 1 && row.TempC == 0: - row.TempC = v - case i == 2 && row.UsagePct == 0: - row.UsagePct = v - case i == 3 && row.MemUsagePct == 0: - row.MemUsagePct = v - case i == 4 && row.PowerW == 0: - row.PowerW = v + lines := strings.Split(strings.TrimSpace(string(out)), "\n") + if len(lines) < 2 { + return nil, fmt.Errorf("rocm-smi: insufficient output") + } + + // Parse header to find column indices by name. + headers := strings.Split(lines[0], ",") + colIdx := func(keywords ...string) int { + for i, h := range headers { + hl := strings.ToLower(strings.TrimSpace(h)) + for _, kw := range keywords { + if strings.Contains(hl, kw) { + return i } } } + return -1 + } + idxTemp := colIdx("sensor edge", "temperature (c)", "temp") + idxUse := colIdx("gpu use (%)") + idxMem := colIdx("vram%", "memory allocated") + idxPow := colIdx("average graphics package power", "power (w)") + + var rows []GPUMetricRow + for _, line := range lines[1:] { + line = strings.TrimSpace(line) + if line == "" { + continue + } + parts := strings.Split(line, ",") + idx := len(rows) + row := GPUMetricRow{GPUIndex: idx} + get := func(i int) float64 { + if i < 0 || i >= len(parts) { + return 0 + } + v := strings.TrimSpace(parts[i]) + if strings.EqualFold(v, "n/a") { + return 0 + } + return parseGPUFloat(v) + } + row.TempC = get(idxTemp) + row.UsagePct = get(idxUse) + row.MemUsagePct = get(idxMem) + row.PowerW = get(idxPow) rows = append(rows, row) } if len(rows) == 0 {