package platform

import (
	"bytes"
	"fmt"
	"math"
	"os"
	"os/exec"
	"strconv"
	"strings"
	"time"
)

// GPUMetricRow is one telemetry sample from nvidia-smi during a stress test.
//
// Stage and ElapsedSec are not filled in by the samplers in this file; they
// are left at their zero values for the caller to set.
type GPUMetricRow struct {
	Stage                 string  `json:"stage,omitempty"`
	ElapsedSec            float64 `json:"elapsed_sec"`
	GPUIndex              int     `json:"index"`
	TempC                 float64 `json:"temp_c"`
	UsagePct              float64 `json:"usage_pct"`
	MemUsagePct           float64 `json:"mem_usage_pct"`
	PowerW                float64 `json:"power_w"`
	ClockMHz              float64 `json:"clock_mhz"`
	MemClockMHz           float64 `json:"mem_clock_mhz"`
	FanAvgRPM             float64 `json:"fan_avg_rpm,omitempty"`
	FanDutyCyclePct       float64 `json:"fan_duty_cycle_pct,omitempty"`
	FanDutyCycleAvailable bool    `json:"fan_duty_cycle_available,omitempty"`
}

// sampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
//
// When gpuIndices is non-empty the query is restricted to those devices with
// --id; otherwise no --id flag is passed. Output lines that do not carry all
// seven queried fields, or whose index field is not an integer, are skipped.
// Individual metric fields that are missing or unparseable become 0 (see
// parseGPUFloat). The error from running nvidia-smi is returned unchanged.
func sampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
	args := make([]string, 0, 3)
	if len(gpuIndices) > 0 {
		ids := make([]string, len(gpuIndices))
		for i, idx := range gpuIndices {
			ids[i] = strconv.Itoa(idx)
		}
		args = append(args, "--id="+strings.Join(ids, ","))
	}
	args = append(args,
		"--query-gpu=index,temperature.gpu,utilization.gpu,utilization.memory,power.draw,clocks.current.graphics,clocks.current.memory",
		"--format=csv,noheader,nounits",
	)

	out, err := exec.Command("nvidia-smi", args...).Output()
	if err != nil {
		return nil, err
	}

	var rows []GPUMetricRow
	for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		// Split on the bare comma and trim each field later, so the parse does
		// not depend on the exact ", " separator spacing.
		fields := strings.Split(line, ",")
		if len(fields) < 7 {
			continue
		}
		idx, convErr := strconv.Atoi(strings.TrimSpace(fields[0]))
		if convErr != nil {
			// Not a data row; keeping it would misattribute the sample to GPU 0.
			continue
		}
		rows = append(rows, GPUMetricRow{
			GPUIndex:    idx,
			TempC:       parseGPUFloat(fields[1]),
			UsagePct:    parseGPUFloat(fields[2]),
			MemUsagePct: parseGPUFloat(fields[3]),
			PowerW:      parseGPUFloat(fields[4]),
			ClockMHz:    parseGPUFloat(fields[5]),
			MemClockMHz: parseGPUFloat(fields[6]),
		})
	}
	return rows, nil
}

// parseGPUFloat converts one SMI output field to a float64.
//
// Surrounding whitespace is ignored. The placeholders "N/A" and
// "[Not Supported]", the empty string, anything strconv.ParseFloat rejects,
// and non-finite results (NaN, ±Inf) all yield 0, so a single bad field can
// never inject a non-finite value into the chart scaling code.
func parseGPUFloat(s string) float64 {
	s = strings.TrimSpace(s)
	switch s {
	case "", "N/A", "[Not Supported]":
		return 0
	}
	v, err := strconv.ParseFloat(s, 64)
	if err != nil || math.IsNaN(v) || math.IsInf(v, 0) {
		return 0
	}
	return v
}

// SampleGPUMetrics runs nvidia-smi once and returns current metrics for each GPU.
// It is the exported entry point for sampleGPUMetrics.
func SampleGPUMetrics(gpuIndices []int) ([]GPUMetricRow, error) {
	return sampleGPUMetrics(gpuIndices)
}

// sampleAMDGPUMetrics queries rocm-smi for live GPU metrics.
//
// Columns are located by case-insensitive substring match against the CSV
// header row, and one row is produced per non-blank data line. A column that
// is not found, is out of range for a line, or holds "n/a" yields 0.
// GPUIndex is the ordinal position of the data line, not a value parsed from
// the output. Only TempC, UsagePct, MemUsagePct and PowerW are populated.
func sampleAMDGPUMetrics() ([]GPUMetricRow, error) {
	out, err := runROCmSMI("--showtemp", "--showuse", "--showpower", "--showmemuse", "--csv")
	if err != nil {
		return nil, err
	}
	lines := strings.Split(strings.TrimSpace(string(out)), "\n")
	if len(lines) < 2 {
		return nil, fmt.Errorf("rocm-smi: insufficient output")
	}

	// Parse header to find column indices by name. The first header (in column
	// order) containing any of the keywords wins.
	headers := strings.Split(lines[0], ",")
	colIdx := func(keywords ...string) int {
		for i, h := range headers {
			hl := strings.ToLower(strings.TrimSpace(h))
			for _, kw := range keywords {
				if strings.Contains(hl, kw) {
					return i
				}
			}
		}
		return -1
	}
	idxTemp := colIdx("sensor edge", "temperature (c)", "temp")
	idxUse := colIdx("gpu use (%)")
	idxMem := colIdx("vram%", "memory allocated")
	idxPow := colIdx("average graphics package power", "power (w)")

	var rows []GPUMetricRow
	for _, line := range lines[1:] {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		parts := strings.Split(line, ",")
		// field returns column i of the current line as a number, or 0 when
		// the column is absent or not numeric.
		field := func(i int) float64 {
			if i < 0 || i >= len(parts) {
				return 0
			}
			v := strings.TrimSpace(parts[i])
			if strings.EqualFold(v, "n/a") {
				return 0
			}
			return parseGPUFloat(v)
		}
		rows = append(rows, GPUMetricRow{
			GPUIndex:    len(rows),
			TempC:       field(idxTemp),
			UsagePct:    field(idxUse),
			MemUsagePct: field(idxMem),
			PowerW:      field(idxPow),
		})
	}
	if len(rows) == 0 {
		return nil, fmt.Errorf("rocm-smi: no GPU rows parsed")
	}
	return rows, nil
}

// WriteGPUMetricsCSV writes collected rows as a CSV file.
//
// The file starts with a fixed header line followed by one line per row. The
// stage name is trimmed and always wrapped in double quotes, with embedded
// double quotes doubled as CSV requires. FanDutyCycleAvailable is written as
// 0 or 1. The file is created or truncated at path with mode 0644.
func WriteGPUMetricsCSV(path string, rows []GPUMetricRow) error {
	var b bytes.Buffer
	b.WriteString("stage,elapsed_sec,gpu_index,temperature_c,usage_pct,mem_usage_pct,power_w,clock_mhz,mem_clock_mhz,fan_avg_rpm,fan_duty_cycle_pct,fan_duty_cycle_available\n")
	for _, r := range rows {
		dutyAvail := 0
		if r.FanDutyCycleAvailable {
			dutyAvail = 1
		}
		// CSV escapes a quote by doubling it; Go-style backslash escapes would
		// be read back literally (or rejected) by CSV parsers.
		stage := `"` + strings.ReplaceAll(strings.TrimSpace(r.Stage), `"`, `""`) + `"`
		fmt.Fprintf(&b, "%s,%.1f,%d,%.1f,%.1f,%.1f,%.1f,%.0f,%.0f,%.0f,%.1f,%d\n",
			stage,
			r.ElapsedSec, r.GPUIndex, r.TempC, r.UsagePct, r.MemUsagePct, r.PowerW,
			r.ClockMHz, r.MemClockMHz, r.FanAvgRPM, r.FanDutyCyclePct, dutyAvail)
	}
	return os.WriteFile(path, b.Bytes(), 0644)
}

// gpuMetricStageSpan is a contiguous run of samples that share a stage name.
// Start and End are elapsed-seconds values taken from the rows in the run.
type gpuMetricStageSpan struct {
	Name  string
	Start float64
	End   float64
}

// WriteGPUMetricsHTML writes a standalone HTML file with one SVG chart per GPU.
func WriteGPUMetricsHTML(path string, rows []GPUMetricRow) error { // Group by GPU index preserving order. seen := make(map[int]bool) var order []int gpuMap := make(map[int][]GPUMetricRow) for _, r := range rows { if !seen[r.GPUIndex] { seen[r.GPUIndex] = true order = append(order, r.GPUIndex) } gpuMap[r.GPUIndex] = append(gpuMap[r.GPUIndex], r) } stageSpans := buildGPUMetricStageSpans(rows) stageColorByName := make(map[string]string, len(stageSpans)) for i, span := range stageSpans { stageColorByName[span.Name] = gpuMetricStagePalette[i%len(gpuMetricStagePalette)] } var legend strings.Builder if len(stageSpans) > 0 { legend.WriteString(`
`) for _, span := range stageSpans { fmt.Fprintf(&legend, `%s`, stageColorByName[span.Name], gpuHTMLEscape(span.Name)) } legend.WriteString(`
`) } var svgs strings.Builder for _, gpuIdx := range order { svgs.WriteString(drawGPUChartSVG(gpuMap[gpuIdx], gpuIdx, stageSpans, stageColorByName)) svgs.WriteString("\n") } ts := time.Now().UTC().Format("2006-01-02 15:04:05 UTC") html := fmt.Sprintf(` GPU Stress Test Metrics
GPU Stress Test Metrics

GPU Stress Test Metrics

Generated %s

%s
%s
`, ts, legend.String(), svgs.String()) return os.WriteFile(path, []byte(html), 0644) } // drawGPUChartSVG generates a self-contained SVG chart for one GPU. func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int, stageSpans []gpuMetricStageSpan, stageColorByName map[string]string) string { // Layout const W, H = 960, 520 const plotX1 = 120 // usage axis / chart left border const plotX2 = 840 // power axis / chart right border const plotY1 = 70 // top const plotY2 = 465 // bottom (PH = 395) const PW = plotX2 - plotX1 const PH = plotY2 - plotY1 // Outer axes const tempAxisX = 60 // temp axis line const clockAxisX = 900 // clock axis line colors := [4]string{"#e74c3c", "#3498db", "#2ecc71", "#f39c12"} seriesLabel := [4]string{ fmt.Sprintf("GPU %d Temp (°C)", gpuIdx), fmt.Sprintf("GPU %d Usage (%%)", gpuIdx), fmt.Sprintf("GPU %d Power (W)", gpuIdx), fmt.Sprintf("GPU %d Clock (MHz)", gpuIdx), } axisLabel := [4]string{"Temperature (°C)", "GPU Usage (%)", "Power (W)", "Clock (MHz)"} // Extract series t := make([]float64, len(rows)) vals := [4][]float64{} for i := range vals { vals[i] = make([]float64, len(rows)) } for i, r := range rows { t[i] = r.ElapsedSec vals[0][i] = r.TempC vals[1][i] = r.UsagePct vals[2][i] = r.PowerW vals[3][i] = r.ClockMHz } tMin, tMax := gpuMinMax(t) type axisScale struct { ticks []float64 min, max float64 } var axes [4]axisScale for i := 0; i < 4; i++ { mn, mx := gpuMinMax(vals[i]) tks := gpuNiceTicks(mn, mx, 8) axes[i] = axisScale{ticks: tks, min: tks[0], max: tks[len(tks)-1]} } xv := func(tv float64) float64 { if tMax == tMin { return float64(plotX1) } return float64(plotX1) + (tv-tMin)/(tMax-tMin)*float64(PW) } yv := func(v float64, ai int) float64 { a := axes[ai] if a.max == a.min { return float64(plotY1 + PH/2) } return float64(plotY2) - (v-a.min)/(a.max-a.min)*float64(PH) } var b strings.Builder fmt.Fprintf(&b, ``+"\n", W, H) // Title fmt.Fprintf(&b, `GPU Stress Test Metrics — GPU %d`+"\n", plotX1+PW/2, gpuIdx) // Horizontal grid (align to 
temp axis ticks) b.WriteString(`` + "\n") for _, tick := range axes[0].ticks { y := yv(tick, 0) if y < float64(plotY1) || y > float64(plotY2) { continue } fmt.Fprintf(&b, ``+"\n", plotX1, y, plotX2, y) } // Vertical grid xTicks := gpuNiceTicks(tMin, tMax, 10) for _, tv := range xTicks { x := xv(tv) if x < float64(plotX1) || x > float64(plotX2) { continue } fmt.Fprintf(&b, ``+"\n", x, plotY1, x, plotY2) } b.WriteString("\n") // Stage backgrounds for _, span := range stageSpans { x1 := xv(span.Start) x2 := xv(span.End) if x2 < x1 { x1, x2 = x2, x1 } if x2-x1 < 1 { x2 = x1 + 1 } color := stageColorByName[span.Name] fmt.Fprintf(&b, ``+"\n", x1, plotY1, x2-x1, PH, color) fmt.Fprintf(&b, `%s`+"\n", x1+(x2-x1)/2, plotY1+12, gpuHTMLEscape(span.Name)) } // Chart border fmt.Fprintf(&b, ``+"\n", plotX1, plotY1, PW, PH) // X axis ticks and labels b.WriteString(`` + "\n") for _, tv := range xTicks { x := xv(tv) if x < float64(plotX1) || x > float64(plotX2) { continue } fmt.Fprintf(&b, `%s`+"\n", x, plotY2+18, gpuFormatTick(tv)) fmt.Fprintf(&b, ``+"\n", x, plotY2, x, plotY2+4) } b.WriteString("\n") fmt.Fprintf(&b, `Time (seconds)`+"\n", plotX1+PW/2, plotY2+38) // Y axes: [tempAxisX, plotX1, plotX2, clockAxisX] axisLineX := [4]int{tempAxisX, plotX1, plotX2, clockAxisX} axisRight := [4]bool{false, false, true, true} // Label x positions (for rotated vertical text) axisLabelX := [4]int{10, 68, 868, 950} for i := 0; i < 4; i++ { ax := axisLineX[i] right := axisRight[i] color := colors[i] // Axis line fmt.Fprintf(&b, ``+"\n", ax, plotY1, ax, plotY2, color) // Ticks and tick labels fmt.Fprintf(&b, ``+"\n", color) for _, tick := range axes[i].ticks { y := yv(tick, i) if y < float64(plotY1) || y > float64(plotY2) { continue } dx := -5 textX := ax - 8 anchor := "end" if right { dx = 5 textX = ax + 8 anchor = "start" } fmt.Fprintf(&b, ``+"\n", ax, y, ax+dx, y, color) fmt.Fprintf(&b, `%s`+"\n", textX, y, anchor, gpuFormatTick(tick)) } b.WriteString("\n") // Axis label (rotated) lx := 
axisLabelX[i] fmt.Fprintf(&b, `%s`+"\n", lx, plotY1+PH/2, color, axisLabel[i]) } // Data lines for i := 0; i < 4; i++ { var pts strings.Builder for j := range rows { x := xv(t[j]) y := yv(vals[i][j], i) if j == 0 { fmt.Fprintf(&pts, "%.1f,%.1f", x, y) } else { fmt.Fprintf(&pts, " %.1f,%.1f", x, y) } } fmt.Fprintf(&b, ``+"\n", pts.String(), colors[i]) } // Legend const legendY = 42 for i := 0; i < 4; i++ { lx := plotX1 + i*(PW/4) + 10 fmt.Fprintf(&b, ``+"\n", lx, legendY, lx+20, legendY, colors[i]) fmt.Fprintf(&b, `%s`+"\n", lx+25, legendY+4, seriesLabel[i]) } b.WriteString("\n") return b.String() } func gpuMinMax(vals []float64) (float64, float64) { if len(vals) == 0 { return 0, 1 } mn, mx := vals[0], vals[0] for _, v := range vals[1:] { if v < mn { mn = v } if v > mx { mx = v } } return mn, mx } func gpuNiceTicks(mn, mx float64, targetCount int) []float64 { if mn == mx { mn -= 1 mx += 1 } r := mx - mn step := math.Pow(10, math.Floor(math.Log10(r/float64(targetCount)))) for _, f := range []float64{1, 2, 5, 10} { if r/(f*step) <= float64(targetCount)*1.5 { step = f * step break } } lo := math.Floor(mn/step) * step hi := math.Ceil(mx/step) * step var ticks []float64 for v := lo; v <= hi+step*0.001; v += step { ticks = append(ticks, math.Round(v*1e9)/1e9) } return ticks } func gpuFormatTick(v float64) string { if v == math.Trunc(v) { return strconv.Itoa(int(v)) } return strconv.FormatFloat(v, 'f', 1, 64) } var gpuMetricStagePalette = []string{ "#d95c5c", "#2185d0", "#21ba45", "#f2c037", "#6435c9", "#00b5ad", "#a5673f", } func buildGPUMetricStageSpans(rows []GPUMetricRow) []gpuMetricStageSpan { var spans []gpuMetricStageSpan for _, row := range rows { name := strings.TrimSpace(row.Stage) if name == "" { name = "run" } if len(spans) == 0 || spans[len(spans)-1].Name != name { spans = append(spans, gpuMetricStageSpan{Name: name, Start: row.ElapsedSec, End: row.ElapsedSec}) continue } spans[len(spans)-1].End = row.ElapsedSec } for i := range spans { if spans[i].End <= 
spans[i].Start { spans[i].End = spans[i].Start + 1 } } return spans } var gpuHTMLReplacer = strings.NewReplacer( "&", "&", "<", "<", ">", ">", `"`, """, "'", "'", ) func gpuHTMLEscape(s string) string { return gpuHTMLReplacer.Replace(s) }