fix(metrics): parse rocm-smi CSV by header keywords, not column position

MI250X outputs 7 temperature columns before power/use%; positional parsing
read junction temp (~40°C) as GPU utilisation. Switch to header-based
colIdx() lookup so the correct fields are read regardless of column order
or rocm-smi version.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-29 11:10:13 +03:00
parent 349e026cfa
commit a714c45f10

View File

@@ -78,48 +78,56 @@ func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
// --showtemp --showuse --showpower --csv — one row per GPU
out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
if err != nil {
return nil, err
}
var rows []GPUMetricRow
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
line = strings.TrimSpace(line)
if line == "" || strings.HasPrefix(strings.ToLower(line), "device") {
continue
}
// CSV format: device,temp_c,gpu_use%,mem_use%,power_w (order may vary by rocm-smi version)
// We parse by column header from the first line.
parts := strings.Split(line, ",")
if len(parts) < 2 {
continue
}
idx := len(rows)
row := GPUMetricRow{GPUIndex: idx}
// rocm-smi CSV columns vary; extract what we can
for i, p := range parts {
p = strings.TrimSpace(p)
switch {
case i == 0:
// device index like "card0" or "0"
case strings.Contains(strings.ToLower(p), "n/a"):
// skip N/A
default:
// Try to match by position heuristic: temp, use%, memuse%, power
v := parseGPUFloat(p)
switch {
case i == 1 && row.TempC == 0:
row.TempC = v
case i == 2 && row.UsagePct == 0:
row.UsagePct = v
case i == 3 && row.MemUsagePct == 0:
row.MemUsagePct = v
case i == 4 && row.PowerW == 0:
row.PowerW = v
lines := strings.Split(strings.TrimSpace(string(out)), "\n")
if len(lines) < 2 {
return nil, fmt.Errorf("rocm-smi: insufficient output")
}
// Parse header to find column indices by name.
headers := strings.Split(lines[0], ",")
colIdx := func(keywords ...string) int {
for i, h := range headers {
hl := strings.ToLower(strings.TrimSpace(h))
for _, kw := range keywords {
if strings.Contains(hl, kw) {
return i
}
}
}
return -1
}
idxTemp := colIdx("sensor edge", "temperature (c)", "temp")
idxUse := colIdx("gpu use (%)")
idxMem := colIdx("vram%", "memory allocated")
idxPow := colIdx("average graphics package power", "power (w)")
var rows []GPUMetricRow
for _, line := range lines[1:] {
line = strings.TrimSpace(line)
if line == "" {
continue
}
parts := strings.Split(line, ",")
idx := len(rows)
row := GPUMetricRow{GPUIndex: idx}
get := func(i int) float64 {
if i < 0 || i >= len(parts) {
return 0
}
v := strings.TrimSpace(parts[i])
if strings.EqualFold(v, "n/a") {
return 0
}
return parseGPUFloat(v)
}
row.TempC = get(idxTemp)
row.UsagePct = get(idxUse)
row.MemUsagePct = get(idxMem)
row.PowerW = get(idxPow)
rows = append(rows, row)
}
if len(rows) == 0 {