fix(metrics): parse rocm-smi CSV by header keywords, not column position
MI250X outputs 7 temperature columns before the power and GPU use (%) columns; positional parsing therefore read the junction temperature (~40°C) as GPU utilisation. Switch to a header-based colIdx() lookup so the correct fields are read regardless of column order or rocm-smi version.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -78,48 +78,56 @@ func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
|
|||||||
|
|
||||||
// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
|
// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
|
||||||
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
|
||||||
// --showtemp --showuse --showpower --csv — one row per GPU
|
|
||||||
out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
|
out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
var rows []GPUMetricRow
|
lines := strings.Split(strings.TrimSpace(string(out)), "\n")
|
||||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
if len(lines) < 2 {
|
||||||
line = strings.TrimSpace(line)
|
return nil, fmt.Errorf("rocm-smi: insufficient output")
|
||||||
if line == "" || strings.HasPrefix(strings.ToLower(line), "device") {
|
}
|
||||||
continue
|
|
||||||
}
|
// Parse header to find column indices by name.
|
||||||
// CSV format: device,temp_c,gpu_use%,mem_use%,power_w (order may vary by rocm-smi version)
|
headers := strings.Split(lines[0], ",")
|
||||||
// We parse by column header from the first line.
|
colIdx := func(keywords ...string) int {
|
||||||
parts := strings.Split(line, ",")
|
for i, h := range headers {
|
||||||
if len(parts) < 2 {
|
hl := strings.ToLower(strings.TrimSpace(h))
|
||||||
continue
|
for _, kw := range keywords {
|
||||||
}
|
if strings.Contains(hl, kw) {
|
||||||
idx := len(rows)
|
return i
|
||||||
row := GPUMetricRow{GPUIndex: idx}
|
|
||||||
// rocm-smi CSV columns vary; extract what we can
|
|
||||||
for i, p := range parts {
|
|
||||||
p = strings.TrimSpace(p)
|
|
||||||
switch {
|
|
||||||
case i == 0:
|
|
||||||
// device index like "card0" or "0"
|
|
||||||
case strings.Contains(strings.ToLower(p), "n/a"):
|
|
||||||
// skip N/A
|
|
||||||
default:
|
|
||||||
// Try to match by position heuristic: temp, use%, memuse%, power
|
|
||||||
v := parseGPUFloat(p)
|
|
||||||
switch {
|
|
||||||
case i == 1 && row.TempC == 0:
|
|
||||||
row.TempC = v
|
|
||||||
case i == 2 && row.UsagePct == 0:
|
|
||||||
row.UsagePct = v
|
|
||||||
case i == 3 && row.MemUsagePct == 0:
|
|
||||||
row.MemUsagePct = v
|
|
||||||
case i == 4 && row.PowerW == 0:
|
|
||||||
row.PowerW = v
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
idxTemp := colIdx("sensor edge", "temperature (c)", "temp")
|
||||||
|
idxUse := colIdx("gpu use (%)")
|
||||||
|
idxMem := colIdx("vram%", "memory allocated")
|
||||||
|
idxPow := colIdx("average graphics package power", "power (w)")
|
||||||
|
|
||||||
|
var rows []GPUMetricRow
|
||||||
|
for _, line := range lines[1:] {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
parts := strings.Split(line, ",")
|
||||||
|
idx := len(rows)
|
||||||
|
row := GPUMetricRow{GPUIndex: idx}
|
||||||
|
get := func(i int) float64 {
|
||||||
|
if i < 0 || i >= len(parts) {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
v := strings.TrimSpace(parts[i])
|
||||||
|
if strings.EqualFold(v, "n/a") {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return parseGPUFloat(v)
|
||||||
|
}
|
||||||
|
row.TempC = get(idxTemp)
|
||||||
|
row.UsagePct = get(idxUse)
|
||||||
|
row.MemUsagePct = get(idxMem)
|
||||||
|
row.PowerW = get(idxPow)
|
||||||
rows = append(rows, row)
|
rows = append(rows, row)
|
||||||
}
|
}
|
||||||
if len(rows) == 0 {
|
if len(rows) == 0 {
|
||||||
|
|||||||
Reference in New Issue
Block a user