- Dockerfile: install cuda-nvcc-13-0 from the NVIDIA repo for compilation
- build-nccl-tests.sh: download libnccl-dev for nccl.h and build all_reduce_perf
- build.sh: run the nccl-tests build and inject the binary into /usr/local/bin/
- platform: RunNCCLTests() auto-detects the GPU count and runs all_reduce_perf
- TUI: add an NCCL bandwidth test entry to the Burn-in Tests screen ([N] hotkey)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
137 lines
3.0 KiB
Go
137 lines
3.0 KiB
Go
package tui
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
|
|
tea "github.com/charmbracelet/bubbletea"
|
|
)
|
|
|
|
// Cursor positions on the Burn-in Tests screen, listed in the order the rows
// are rendered from top to bottom. The values are consecutive so that
// navigation can step through them with ++/-- and so that a mode row maps to
// its mode index by subtracting burnCurModeQuick.
const (
	burnCurGPUStress = iota // 0: GPU platform stress test button
	burnCurModeQuick        // 1: "Quick" mode radio row
	burnCurModeStd          // 2: "Standard" mode radio row
	burnCurModeExpr         // 3: "Express" mode radio row
	burnCurRun              // 4: "run selected" button
	burnCurNCCLTests        // 5: NCCL bandwidth test button

	// burnCurTotal is the number of selectable rows. It must stay last so
	// that it keeps tracking the row count when rows are added above it.
	burnCurTotal
)
|
|
|
|
func (m model) enterBurnInTests() (tea.Model, tea.Cmd) {
|
|
m.screen = screenBurnInTests
|
|
m.cursor = 0
|
|
if !m.burnInitialized {
|
|
m.burnMode = 0
|
|
m.burnCursor = 0
|
|
m.burnInitialized = true
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
func (m model) updateBurnInTests(msg tea.KeyMsg) (tea.Model, tea.Cmd) {
|
|
switch msg.String() {
|
|
case "up", "k":
|
|
if m.burnCursor > 0 {
|
|
m.burnCursor--
|
|
}
|
|
case "down", "j":
|
|
if m.burnCursor < burnCurTotal-1 {
|
|
m.burnCursor++
|
|
}
|
|
case " ":
|
|
switch m.burnCursor {
|
|
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
|
|
m.burnMode = m.burnCursor - burnCurModeQuick
|
|
}
|
|
case "enter":
|
|
switch m.burnCursor {
|
|
case burnCurGPUStress, burnCurRun:
|
|
return m.burnRunSelected()
|
|
case burnCurModeQuick, burnCurModeStd, burnCurModeExpr:
|
|
m.burnMode = m.burnCursor - burnCurModeQuick
|
|
case burnCurNCCLTests:
|
|
return m.burnRunNCCL()
|
|
}
|
|
case "f", "F", "r", "R":
|
|
return m.burnRunSelected()
|
|
case "n", "N":
|
|
return m.burnRunNCCL()
|
|
case "1":
|
|
m.burnMode = 0
|
|
case "2":
|
|
m.burnMode = 1
|
|
case "3":
|
|
m.burnMode = 2
|
|
case "esc":
|
|
m.screen = screenMain
|
|
m.cursor = 1
|
|
case "q", "ctrl+c":
|
|
return m, tea.Quit
|
|
}
|
|
return m, nil
|
|
}
|
|
|
|
func (m model) burnRunSelected() (tea.Model, tea.Cmd) {
|
|
return m.hcRunFanStress()
|
|
}
|
|
|
|
func (m model) burnRunNCCL() (tea.Model, tea.Cmd) {
|
|
m.pendingAction = actionRunNCCLTests
|
|
m.screen = screenConfirm
|
|
m.cursor = 0
|
|
return m, nil
|
|
}
|
|
|
|
func renderBurnInTests(m model) string {
|
|
var b strings.Builder
|
|
|
|
fmt.Fprintln(&b, "BURN-IN TESTS")
|
|
fmt.Fprintln(&b)
|
|
fmt.Fprintln(&b, " Stress tests:")
|
|
fmt.Fprintln(&b)
|
|
|
|
pfx := " "
|
|
if m.burnCursor == burnCurGPUStress {
|
|
pfx = "> "
|
|
}
|
|
fmt.Fprintf(&b, "%s[ GPU PLATFORM STRESS TEST [F] ] (thermal cycling, fan lag, throttle check)\n", pfx)
|
|
|
|
fmt.Fprintln(&b)
|
|
fmt.Fprintln(&b, " Mode:")
|
|
modes := []struct{ label, key string }{
|
|
{"Quick", "1"},
|
|
{"Standard", "2"},
|
|
{"Express", "3"},
|
|
}
|
|
for i, mode := range modes {
|
|
pfx := " "
|
|
if m.burnCursor == burnCurModeQuick+i {
|
|
pfx = "> "
|
|
}
|
|
radio := "( )"
|
|
if m.burnMode == i {
|
|
radio = "(*)"
|
|
}
|
|
fmt.Fprintf(&b, "%s%s %-10s [%s]\n", pfx, radio, mode.label, mode.key)
|
|
}
|
|
|
|
fmt.Fprintln(&b)
|
|
pfx = " "
|
|
if m.burnCursor == burnCurRun {
|
|
pfx = "> "
|
|
}
|
|
fmt.Fprintf(&b, "%s[ RUN SELECTED [R] ]\n", pfx)
|
|
|
|
fmt.Fprintln(&b)
|
|
pfx = " "
|
|
if m.burnCursor == burnCurNCCLTests {
|
|
pfx = "> "
|
|
}
|
|
fmt.Fprintf(&b, "%s[ NCCL BANDWIDTH TEST [N] ] (all_reduce_perf, NVLink/PCIe bandwidth)\n", pfx)
|
|
|
|
fmt.Fprintln(&b)
|
|
fmt.Fprintln(&b, "─────────────────────────────────────────────────────────────────")
|
|
fmt.Fprint(&b, "[↑↓] move [space/enter] select [1/2/3] mode [R/F] run [N] nccl [Esc] back")
|
|
return b.String()
|
|
}
|