Benchmark: parallel GPU mode, resilient inventory query, server model in results
- Add parallel GPU mode (checkbox, off by default): runs all selected GPUs simultaneously via a single bee-gpu-burn invocation instead of sequentially; per-GPU telemetry, throttle counters, TOPS, and scoring are preserved.
- Make queryBenchmarkGPUInfo resilient: it falls back to a base field set when the extended fields (attribute.multiprocessor_count, power.default_limit) cause exit status 2, which prevents lgc normalization from being silently skipped.
- Log an explicit "graphics clock lock skipped" note when inventory is unavailable.
- Collect the server model from DMI (/sys/class/dmi/id/product_name) and store it in the result JSON; benchmark history columns now show "Server Model (N× GPU Model)", grouped by server + GPU type rather than by individual GPU index.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -105,7 +105,9 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
BenchmarkVersion: benchmarkVersion,
|
BenchmarkVersion: benchmarkVersion,
|
||||||
GeneratedAt: time.Now().UTC(),
|
GeneratedAt: time.Now().UTC(),
|
||||||
Hostname: hostname,
|
Hostname: hostname,
|
||||||
|
ServerModel: readServerModel(),
|
||||||
BenchmarkProfile: spec.Name,
|
BenchmarkProfile: spec.Name,
|
||||||
|
ParallelGPUs: opts.ParallelGPUs,
|
||||||
SelectedGPUIndices: append([]int(nil), selected...),
|
SelectedGPUIndices: append([]int(nil), selected...),
|
||||||
Normalization: BenchmarkNormalization{
|
Normalization: BenchmarkNormalization{
|
||||||
Status: "full",
|
Status: "full",
|
||||||
@@ -143,6 +145,10 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
if opts.ParallelGPUs {
|
||||||
|
runNvidiaBenchmarkParallel(ctx, verboseLog, runDir, selected, infoByIndex, opts, spec, logFunc, &result, &serverIdleW, &serverLoadedWSum, &serverIdleOK, &serverLoadedOK, &serverLoadedSamples)
|
||||||
|
} else {
|
||||||
|
|
||||||
for _, idx := range selected {
|
for _, idx := range selected {
|
||||||
gpuResult := BenchmarkGPUResult{
|
gpuResult := BenchmarkGPUResult{
|
||||||
Index: idx,
|
Index: idx,
|
||||||
@@ -285,6 +291,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
|
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(gpuResult))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} // end sequential path
|
||||||
|
|
||||||
if len(selected) > 1 && opts.RunNCCL {
|
if len(selected) > 1 && opts.RunNCCL {
|
||||||
result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
|
result.Interconnect = runBenchmarkInterconnect(ctx, verboseLog, runDir, selected, spec, logFunc)
|
||||||
if result.Interconnect != nil && result.Interconnect.Supported {
|
if result.Interconnect != nil && result.Interconnect.Supported {
|
||||||
@@ -362,60 +370,87 @@ func resolveBenchmarkProfile(profile string) benchmarkProfileSpec {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
// benchmarkGPUInfoQuery describes a nvidia-smi --query-gpu field set to try.
|
||||||
args := []string{
|
// Fields are tried in order; the first successful query wins. Extended fields
|
||||||
"--query-gpu=index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
|
// (attribute.multiprocessor_count, power.default_limit) are not supported on
|
||||||
"--format=csv,noheader,nounits",
|
// all driver versions, so we fall back to the base set if the full query fails.
|
||||||
}
|
var benchmarkGPUInfoQueries = []struct {
|
||||||
if len(gpuIndices) > 0 {
|
fields string
|
||||||
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
|
extended bool // whether this query includes optional extended fields
|
||||||
}
|
}{
|
||||||
out, err := satExecCommand("nvidia-smi", args...).Output()
|
{
|
||||||
if err != nil {
|
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics,attribute.multiprocessor_count,power.default_limit",
|
||||||
return nil, fmt.Errorf("nvidia-smi gpu info: %w", err)
|
extended: true,
|
||||||
}
|
},
|
||||||
|
{
|
||||||
r := csv.NewReader(strings.NewReader(string(out)))
|
fields: "index,uuid,name,pci.bus_id,vbios_version,power.limit,clocks.max.graphics,clocks.max.memory,clocks.base.graphics",
|
||||||
r.TrimLeadingSpace = true
|
extended: false,
|
||||||
r.FieldsPerRecord = -1
|
},
|
||||||
rows, err := r.ReadAll()
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("parse nvidia-smi gpu info: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
|
||||||
for _, row := range rows {
|
|
||||||
if len(row) < 9 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
info := benchmarkGPUInfo{
|
|
||||||
Index: idx,
|
|
||||||
UUID: strings.TrimSpace(row[1]),
|
|
||||||
Name: strings.TrimSpace(row[2]),
|
|
||||||
BusID: strings.TrimSpace(row[3]),
|
|
||||||
VBIOS: strings.TrimSpace(row[4]),
|
|
||||||
PowerLimitW: parseBenchmarkFloat(row[5]),
|
|
||||||
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
|
|
||||||
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
|
|
||||||
}
|
|
||||||
if len(row) >= 9 {
|
|
||||||
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
|
||||||
}
|
|
||||||
if len(row) >= 10 {
|
|
||||||
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
|
||||||
}
|
|
||||||
if len(row) >= 11 {
|
|
||||||
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
|
||||||
}
|
|
||||||
infoByIndex[idx] = info
|
|
||||||
}
|
|
||||||
return infoByIndex, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func queryBenchmarkGPUInfo(gpuIndices []int) (map[int]benchmarkGPUInfo, error) {
|
||||||
|
var lastErr error
|
||||||
|
for _, q := range benchmarkGPUInfoQueries {
|
||||||
|
args := []string{
|
||||||
|
"--query-gpu=" + q.fields,
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
}
|
||||||
|
if len(gpuIndices) > 0 {
|
||||||
|
args = append([]string{"--id=" + joinIndexList(gpuIndices)}, args...)
|
||||||
|
}
|
||||||
|
out, err := satExecCommand("nvidia-smi", args...).Output()
|
||||||
|
if err != nil {
|
||||||
|
lastErr = fmt.Errorf("nvidia-smi gpu info (%s): %w", q.fields[:min(len(q.fields), 40)], err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
r := csv.NewReader(strings.NewReader(string(out)))
|
||||||
|
r.TrimLeadingSpace = true
|
||||||
|
r.FieldsPerRecord = -1
|
||||||
|
rows, err := r.ReadAll()
|
||||||
|
if err != nil {
|
||||||
|
lastErr = fmt.Errorf("parse nvidia-smi gpu info: %w", err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
infoByIndex := make(map[int]benchmarkGPUInfo, len(rows))
|
||||||
|
for _, row := range rows {
|
||||||
|
if len(row) < 9 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(strings.TrimSpace(row[0]))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
info := benchmarkGPUInfo{
|
||||||
|
Index: idx,
|
||||||
|
UUID: strings.TrimSpace(row[1]),
|
||||||
|
Name: strings.TrimSpace(row[2]),
|
||||||
|
BusID: strings.TrimSpace(row[3]),
|
||||||
|
VBIOS: strings.TrimSpace(row[4]),
|
||||||
|
PowerLimitW: parseBenchmarkFloat(row[5]),
|
||||||
|
MaxGraphicsClockMHz: parseBenchmarkFloat(row[6]),
|
||||||
|
MaxMemoryClockMHz: parseBenchmarkFloat(row[7]),
|
||||||
|
}
|
||||||
|
if len(row) >= 9 {
|
||||||
|
info.BaseGraphicsClockMHz = parseBenchmarkFloat(row[8])
|
||||||
|
}
|
||||||
|
if q.extended {
|
||||||
|
if len(row) >= 10 {
|
||||||
|
info.MultiprocessorCount = int(parseBenchmarkFloat(row[9]))
|
||||||
|
}
|
||||||
|
if len(row) >= 11 {
|
||||||
|
info.DefaultPowerLimitW = parseBenchmarkFloat(row[10])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
infoByIndex[idx] = info
|
||||||
|
}
|
||||||
|
return infoByIndex, nil
|
||||||
|
}
|
||||||
|
return nil, lastErr
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
|
func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndices []int, infoByIndex map[int]benchmarkGPUInfo, result *NvidiaBenchmarkResult) []benchmarkRestoreAction {
|
||||||
if os.Geteuid() != 0 {
|
if os.Geteuid() != 0 {
|
||||||
result.Normalization.Status = "partial"
|
result.Normalization.Status = "partial"
|
||||||
@@ -454,6 +489,10 @@ func applyBenchmarkNormalization(ctx context.Context, verboseLog string, gpuIndi
|
|||||||
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
|
_, _ = runSATCommandCtx(context.Background(), verboseLog, fmt.Sprintf("restore-gpu-%d-rgc", idxCopy), []string{"nvidia-smi", "-i", strconv.Itoa(idxCopy), "-rgc"}, nil, nil)
|
||||||
}})
|
}})
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
rec.GPUClockLockStatus = "skipped"
|
||||||
|
rec.Notes = append(rec.Notes, "graphics clock lock skipped: gpu inventory unavailable or MaxGraphicsClockMHz=0")
|
||||||
|
result.Normalization.Status = "partial"
|
||||||
}
|
}
|
||||||
|
|
||||||
if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
|
if info, ok := infoByIndex[idx]; ok && info.MaxMemoryClockMHz > 0 {
|
||||||
@@ -1209,3 +1248,246 @@ func characterizeServerPower(idleW, loadedW, gpuReportedSumW float64, ipmiAvaila
|
|||||||
}
|
}
|
||||||
return sp
|
return sp
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// readServerModel reports the system product name published in the DMI sysfs
// entry /sys/class/dmi/id/product_name, with surrounding whitespace removed.
// It returns the empty string whenever that file cannot be read (for example
// because it does not exist on this host).
func readServerModel() string {
	const dmiProductNamePath = "/sys/class/dmi/id/product_name"

	raw, readErr := os.ReadFile(dmiProductNamePath)
	if readErr == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||||
|
|
||||||
|
// filterRowsByGPU returns only the metric rows for a specific GPU index.
|
||||||
|
func filterRowsByGPU(rows []GPUMetricRow, gpuIndex int) []GPUMetricRow {
|
||||||
|
var out []GPUMetricRow
|
||||||
|
for _, r := range rows {
|
||||||
|
if r.GPUIndex == gpuIndex {
|
||||||
|
out = append(out, r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseBenchmarkBurnLogByGPU splits a multi-GPU bee-gpu-burn output by [gpu N] prefix
|
||||||
|
// and returns a per-GPU parse result map.
|
||||||
|
func parseBenchmarkBurnLogByGPU(raw string) map[int]benchmarkBurnParseResult {
|
||||||
|
gpuLines := make(map[int][]string)
|
||||||
|
for _, line := range strings.Split(strings.ReplaceAll(raw, "\r\n", "\n"), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if !strings.HasPrefix(line, "[gpu ") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
end := strings.Index(line, "] ")
|
||||||
|
if end < 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpuIdx, err := strconv.Atoi(strings.TrimSpace(line[5:end]))
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpuLines[gpuIdx] = append(gpuLines[gpuIdx], line[end+2:])
|
||||||
|
}
|
||||||
|
results := make(map[int]benchmarkBurnParseResult, len(gpuLines))
|
||||||
|
for gpuIdx, lines := range gpuLines {
|
||||||
|
// Lines are already stripped of the [gpu N] prefix; parseBenchmarkBurnLog
|
||||||
|
// calls stripBenchmarkPrefix which is a no-op on already-stripped lines.
|
||||||
|
results[gpuIdx] = parseBenchmarkBurnLog(strings.Join(lines, "\n"))
|
||||||
|
}
|
||||||
|
return results
|
||||||
|
}
|
||||||
|
|
||||||
|
// runNvidiaBenchmarkParallel runs warmup and steady compute on all selected GPUs
|
||||||
|
// simultaneously using a single bee-gpu-burn invocation per phase.
|
||||||
|
func runNvidiaBenchmarkParallel(
|
||||||
|
ctx context.Context,
|
||||||
|
verboseLog, runDir string,
|
||||||
|
selected []int,
|
||||||
|
infoByIndex map[int]benchmarkGPUInfo,
|
||||||
|
opts NvidiaBenchmarkOptions,
|
||||||
|
spec benchmarkProfileSpec,
|
||||||
|
logFunc func(string),
|
||||||
|
result *NvidiaBenchmarkResult,
|
||||||
|
serverIdleW *float64, serverLoadedWSum *float64,
|
||||||
|
serverIdleOK *bool, serverLoadedOK *bool, serverLoadedSamples *int,
|
||||||
|
) {
|
||||||
|
allDevices := joinIndexList(selected)
|
||||||
|
|
||||||
|
// Build per-GPU result stubs.
|
||||||
|
gpuResults := make(map[int]*BenchmarkGPUResult, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
r := &BenchmarkGPUResult{Index: idx, Status: "FAILED"}
|
||||||
|
if info, ok := infoByIndex[idx]; ok {
|
||||||
|
r.UUID = info.UUID
|
||||||
|
r.Name = info.Name
|
||||||
|
r.BusID = info.BusID
|
||||||
|
r.VBIOS = info.VBIOS
|
||||||
|
r.PowerLimitW = info.PowerLimitW
|
||||||
|
r.MultiprocessorCount = info.MultiprocessorCount
|
||||||
|
r.DefaultPowerLimitW = info.DefaultPowerLimitW
|
||||||
|
r.MaxGraphicsClockMHz = info.MaxGraphicsClockMHz
|
||||||
|
r.BaseGraphicsClockMHz = info.BaseGraphicsClockMHz
|
||||||
|
r.MaxMemoryClockMHz = info.MaxMemoryClockMHz
|
||||||
|
}
|
||||||
|
if norm := findBenchmarkNormalization(result.Normalization.GPUs, idx); norm != nil {
|
||||||
|
r.LockedGraphicsClockMHz = norm.GPUClockLockMHz
|
||||||
|
r.LockedMemoryClockMHz = norm.MemoryClockLockMHz
|
||||||
|
}
|
||||||
|
gpuResults[idx] = r
|
||||||
|
}
|
||||||
|
|
||||||
|
// Baseline: sample all GPUs together.
|
||||||
|
baselineRows, err := collectBenchmarkSamples(ctx, spec.BaselineSec, selected)
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "baseline sampling failed: "+err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, idx := range selected {
|
||||||
|
perGPU := filterRowsByGPU(baselineRows, idx)
|
||||||
|
gpuResults[idx].Baseline = summarizeBenchmarkTelemetry(perGPU)
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-baseline", idx), perGPU)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sample server idle power once.
|
||||||
|
if !*serverIdleOK {
|
||||||
|
if w, ok := sampleIPMIPowerSeries(ctx, maxInt(spec.BaselineSec, 10)); ok {
|
||||||
|
*serverIdleW = w
|
||||||
|
*serverIdleOK = true
|
||||||
|
logFunc(fmt.Sprintf("server idle power (IPMI): %.0f W", w))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warmup: all GPUs simultaneously.
|
||||||
|
warmupCmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", strconv.Itoa(spec.WarmupSec),
|
||||||
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
|
"--devices", allDevices,
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf("GPUs %s: parallel warmup (%ds)", allDevices, spec.WarmupSec))
|
||||||
|
warmupOut, warmupRows, warmupErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-warmup.log", warmupCmd, nil, selected, runDir, "gpu-all-warmup", logFunc)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-warmup.log"), warmupOut, 0644)
|
||||||
|
for _, idx := range selected {
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-warmup", idx), filterRowsByGPU(warmupRows, idx))
|
||||||
|
}
|
||||||
|
if warmupErr != nil {
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel warmup failed: "+warmupErr.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Snapshot throttle counters before steady.
|
||||||
|
beforeThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
beforeThrottle[idx], _ = queryThrottleCounters(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Steady: all GPUs simultaneously.
|
||||||
|
steadyCmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", strconv.Itoa(spec.SteadySec),
|
||||||
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
|
"--devices", allDevices,
|
||||||
|
}
|
||||||
|
logFunc(fmt.Sprintf("GPUs %s: parallel steady compute (%ds)", allDevices, spec.SteadySec))
|
||||||
|
|
||||||
|
// Sample server power via IPMI in parallel with steady phase.
|
||||||
|
ipmiStopCh := make(chan struct{})
|
||||||
|
ipmiResultCh := make(chan float64, 1)
|
||||||
|
go func() {
|
||||||
|
defer close(ipmiResultCh)
|
||||||
|
var samples []float64
|
||||||
|
ticker := time.NewTicker(5 * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
select {
|
||||||
|
case <-ipmiStopCh:
|
||||||
|
return
|
||||||
|
case <-time.After(15 * time.Second):
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
if w, err := queryIPMIServerPowerW(); err == nil {
|
||||||
|
samples = append(samples, w)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ipmiStopCh:
|
||||||
|
if len(samples) > 0 {
|
||||||
|
var sum float64
|
||||||
|
for _, w := range samples {
|
||||||
|
sum += w
|
||||||
|
}
|
||||||
|
ipmiResultCh <- sum / float64(len(samples))
|
||||||
|
}
|
||||||
|
return
|
||||||
|
case <-ticker.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
steadyOut, steadyRows, steadyErr := runBenchmarkCommandWithMetrics(ctx, verboseLog, "gpu-all-steady.log", steadyCmd, nil, selected, runDir, "gpu-all-steady", logFunc)
|
||||||
|
close(ipmiStopCh)
|
||||||
|
if loadedW, ok := <-ipmiResultCh; ok {
|
||||||
|
*serverLoadedWSum += loadedW
|
||||||
|
(*serverLoadedSamples)++
|
||||||
|
*serverLoadedOK = true
|
||||||
|
logFunc(fmt.Sprintf("GPUs %s: server loaded power (IPMI): %.0f W", allDevices, loadedW))
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "gpu-all-steady.log"), steadyOut, 0644)
|
||||||
|
|
||||||
|
afterThrottle := make(map[int]BenchmarkThrottleCounters, len(selected))
|
||||||
|
for _, idx := range selected {
|
||||||
|
afterThrottle[idx], _ = queryThrottleCounters(idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
parseResults := parseBenchmarkBurnLogByGPU(string(steadyOut))
|
||||||
|
|
||||||
|
for _, idx := range selected {
|
||||||
|
perGPU := filterRowsByGPU(steadyRows, idx)
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-steady", idx), perGPU)
|
||||||
|
gpuResults[idx].Steady = summarizeBenchmarkTelemetry(perGPU)
|
||||||
|
gpuResults[idx].Throttle = diffThrottleCounters(beforeThrottle[idx], afterThrottle[idx])
|
||||||
|
|
||||||
|
if pr, ok := parseResults[idx]; ok {
|
||||||
|
gpuResults[idx].ComputeCapability = pr.ComputeCapability
|
||||||
|
gpuResults[idx].Backend = pr.Backend
|
||||||
|
gpuResults[idx].PrecisionResults = pr.Profiles
|
||||||
|
if pr.Fallback {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "benchmark used driver PTX fallback; tensor throughput score is not comparable")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if steadyErr != nil {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "parallel steady compute failed: "+steadyErr.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cooldown: all GPUs together.
|
||||||
|
cooldownRows, err := collectBenchmarkSamples(ctx, spec.CooldownSec, selected)
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpuResults[idx].Notes = append(gpuResults[idx].Notes, "cooldown sampling failed: "+err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, idx := range selected {
|
||||||
|
perGPU := filterRowsByGPU(cooldownRows, idx)
|
||||||
|
gpuResults[idx].Cooldown = summarizeBenchmarkTelemetry(perGPU)
|
||||||
|
writeBenchmarkMetricsFiles(runDir, fmt.Sprintf("gpu-%d-cooldown", idx), perGPU)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Score and finalize each GPU.
|
||||||
|
for _, idx := range selected {
|
||||||
|
r := gpuResults[idx]
|
||||||
|
r.Scores = scoreBenchmarkGPUResult(*r)
|
||||||
|
r.DegradationReasons = detectBenchmarkDegradationReasons(*r, result.Normalization.Status)
|
||||||
|
pr := parseResults[idx]
|
||||||
|
switch {
|
||||||
|
case steadyErr != nil:
|
||||||
|
r.Status = classifySATErrorStatus(steadyOut, steadyErr)
|
||||||
|
case pr.Fallback:
|
||||||
|
r.Status = "PARTIAL"
|
||||||
|
default:
|
||||||
|
r.Status = "OK"
|
||||||
|
}
|
||||||
|
result.GPUs = append(result.GPUs, finalizeBenchmarkGPUResult(*r))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -14,13 +14,17 @@ type NvidiaBenchmarkOptions struct {
|
|||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
RunNCCL bool
|
RunNCCL bool
|
||||||
|
ParallelGPUs bool // run all selected GPUs simultaneously instead of sequentially
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
type NvidiaBenchmarkResult struct {
|
type NvidiaBenchmarkResult struct {
|
||||||
BenchmarkVersion string `json:"benchmark_version"`
|
BenchmarkVersion string `json:"benchmark_version"`
|
||||||
GeneratedAt time.Time `json:"generated_at"`
|
GeneratedAt time.Time `json:"generated_at"`
|
||||||
Hostname string `json:"hostname,omitempty"`
|
Hostname string `json:"hostname,omitempty"`
|
||||||
|
ServerModel string `json:"server_model,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile"`
|
BenchmarkProfile string `json:"benchmark_profile"`
|
||||||
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
OverallStatus string `json:"overall_status"`
|
OverallStatus string `json:"overall_status"`
|
||||||
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
SelectedGPUIndices []int `json:"selected_gpu_indices"`
|
||||||
Findings []string `json:"findings,omitempty"`
|
Findings []string `json:"findings,omitempty"`
|
||||||
|
|||||||
@@ -470,6 +470,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||||
RunNCCL *bool `json:"run_nccl"`
|
RunNCCL *bool `json:"run_nccl"`
|
||||||
|
ParallelGPUs *bool `json:"parallel_gpus"`
|
||||||
DisplayName string `json:"display_name"`
|
DisplayName string `json:"display_name"`
|
||||||
}
|
}
|
||||||
if r.Body != nil {
|
if r.Body != nil {
|
||||||
@@ -483,6 +484,10 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
if body.RunNCCL != nil {
|
if body.RunNCCL != nil {
|
||||||
runNCCL = *body.RunNCCL
|
runNCCL = *body.RunNCCL
|
||||||
}
|
}
|
||||||
|
parallelGPUs := false
|
||||||
|
if body.ParallelGPUs != nil {
|
||||||
|
parallelGPUs = *body.ParallelGPUs
|
||||||
|
}
|
||||||
name := taskDisplayName("nvidia-benchmark", "", "")
|
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
name = body.DisplayName
|
name = body.DisplayName
|
||||||
@@ -493,6 +498,7 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
SizeMB: body.SizeMB,
|
SizeMB: body.SizeMB,
|
||||||
BenchmarkProfile: body.Profile,
|
BenchmarkProfile: body.Profile,
|
||||||
RunNCCL: runNCCL,
|
RunNCCL: runNCCL,
|
||||||
|
ParallelGPUs: parallelGPUs,
|
||||||
DisplayName: body.DisplayName,
|
DisplayName: body.DisplayName,
|
||||||
}, name, h.opts.App, "benchmark-nvidia")
|
}, name, h.opts.App, "benchmark-nvidia")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -1625,6 +1625,10 @@ func renderBenchmark(opts HandlerOptions) string {
|
|||||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
<label class="benchmark-cb-row">
|
||||||
|
<input type="checkbox" id="benchmark-parallel-gpus">
|
||||||
|
<span>Run all selected GPUs simultaneously (parallel mode)</span>
|
||||||
|
</label>
|
||||||
<label class="benchmark-cb-row">
|
<label class="benchmark-cb-row">
|
||||||
<input type="checkbox" id="benchmark-run-nccl" checked>
|
<input type="checkbox" id="benchmark-run-nccl" checked>
|
||||||
<span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
|
<span>Run multi-GPU interconnect step (NCCL) only on the selected GPUs</span>
|
||||||
@@ -1750,10 +1754,12 @@ function runNvidiaBenchmark() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
if (benchmarkES) { benchmarkES.close(); benchmarkES = null; }
|
||||||
|
const parallelGPUs = !!document.getElementById('benchmark-parallel-gpus').checked;
|
||||||
const body = {
|
const body = {
|
||||||
profile: document.getElementById('benchmark-profile').value || 'standard',
|
profile: document.getElementById('benchmark-profile').value || 'standard',
|
||||||
gpu_indices: selected,
|
gpu_indices: selected,
|
||||||
run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
|
run_nccl: !!document.getElementById('benchmark-run-nccl').checked,
|
||||||
|
parallel_gpus: parallelGPUs,
|
||||||
display_name: 'NVIDIA Benchmark'
|
display_name: 'NVIDIA Benchmark'
|
||||||
};
|
};
|
||||||
document.getElementById('benchmark-output').style.display = 'block';
|
document.getElementById('benchmark-output').style.display = 'block';
|
||||||
@@ -1887,19 +1893,31 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
|||||||
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
cells: make(map[string]benchmarkHistoryCell),
|
cells: make(map[string]benchmarkHistoryCell),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Count how many GPUs of each model appear in this run (for the label).
|
||||||
|
gpuModelCount := make(map[string]int)
|
||||||
for _, gpu := range result.GPUs {
|
for _, gpu := range result.GPUs {
|
||||||
key := benchmarkHistoryColumnKey(gpu.Name, gpu.Index)
|
gpuModelCount[strings.TrimSpace(gpu.Name)]++
|
||||||
|
}
|
||||||
|
|
||||||
|
// Track best composite score per column key within this run.
|
||||||
|
runBest := make(map[string]float64)
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
key := benchmarkHistoryColumnKey(result.ServerModel, gpu.Name)
|
||||||
|
count := gpuModelCount[strings.TrimSpace(gpu.Name)]
|
||||||
columnByKey[key] = benchmarkHistoryColumn{
|
columnByKey[key] = benchmarkHistoryColumn{
|
||||||
key: key,
|
key: key,
|
||||||
label: benchmarkHistoryColumnLabel(gpu.Name, gpu.Index),
|
label: benchmarkHistoryColumnLabel(result.ServerModel, gpu.Name, count),
|
||||||
name: strings.TrimSpace(gpu.Name),
|
name: strings.TrimSpace(gpu.Name),
|
||||||
index: gpu.Index,
|
index: gpu.Index,
|
||||||
}
|
}
|
||||||
run.cells[key] = benchmarkHistoryCell{
|
if gpu.Scores.CompositeScore > runBest[key] {
|
||||||
score: gpu.Scores.CompositeScore,
|
runBest[key] = gpu.Scores.CompositeScore
|
||||||
present: true,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for key, score := range runBest {
|
||||||
|
run.cells[key] = benchmarkHistoryCell{score: score, present: true}
|
||||||
|
}
|
||||||
runs = append(runs, run)
|
runs = append(runs, run)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1908,13 +1926,10 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
|||||||
columns = append(columns, col)
|
columns = append(columns, col)
|
||||||
}
|
}
|
||||||
sort.Slice(columns, func(i, j int) bool {
|
sort.Slice(columns, func(i, j int) bool {
|
||||||
leftName := strings.ToLower(strings.TrimSpace(columns[i].name))
|
li := strings.ToLower(columns[i].label)
|
||||||
rightName := strings.ToLower(strings.TrimSpace(columns[j].name))
|
lj := strings.ToLower(columns[j].label)
|
||||||
if leftName != rightName {
|
if li != lj {
|
||||||
return leftName < rightName
|
return li < lj
|
||||||
}
|
|
||||||
if columns[i].index != columns[j].index {
|
|
||||||
return columns[i].index < columns[j].index
|
|
||||||
}
|
}
|
||||||
return columns[i].key < columns[j].key
|
return columns[i].key < columns[j].key
|
||||||
})
|
})
|
||||||
@@ -1924,16 +1939,25 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
|||||||
return columns, runs
|
return columns, runs
|
||||||
}
|
}
|
||||||
|
|
||||||
func benchmarkHistoryColumnKey(name string, index int) string {
|
// benchmarkHistoryColumnKey groups results by server model + GPU model so that
|
||||||
return strings.TrimSpace(name) + "|" + strconv.Itoa(index)
|
// runs on the same hardware produce one column regardless of individual GPU index.
|
||||||
|
func benchmarkHistoryColumnKey(serverModel, gpuName string) string {
|
||||||
|
return strings.TrimSpace(serverModel) + "|" + strings.TrimSpace(gpuName)
|
||||||
}
|
}
|
||||||
|
|
||||||
func benchmarkHistoryColumnLabel(name string, index int) string {
|
// benchmarkHistoryColumnLabel formats the column header as
|
||||||
name = strings.TrimSpace(name)
|
// "Server Model (N× GPU Model)" or "GPU Model" when server info is missing.
|
||||||
if name == "" {
|
func benchmarkHistoryColumnLabel(serverModel, gpuName string, count int) string {
|
||||||
return fmt.Sprintf("GPU %d", index)
|
serverModel = strings.TrimSpace(serverModel)
|
||||||
|
gpuName = strings.TrimSpace(gpuName)
|
||||||
|
if gpuName == "" {
|
||||||
|
gpuName = "Unknown GPU"
|
||||||
}
|
}
|
||||||
return fmt.Sprintf("%s / GPU %d", name, index)
|
gpuPart := fmt.Sprintf("%d× %s", count, gpuName)
|
||||||
|
if serverModel == "" {
|
||||||
|
return gpuPart
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%s (%s)", serverModel, gpuPart)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
@@ -123,6 +123,7 @@ type taskParams struct {
|
|||||||
BurnProfile string `json:"burn_profile,omitempty"`
|
BurnProfile string `json:"burn_profile,omitempty"`
|
||||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||||
RunNCCL bool `json:"run_nccl,omitempty"`
|
RunNCCL bool `json:"run_nccl,omitempty"`
|
||||||
|
ParallelGPUs bool `json:"parallel_gpus,omitempty"`
|
||||||
DisplayName string `json:"display_name,omitempty"`
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
Device string `json:"device,omitempty"` // for install
|
Device string `json:"device,omitempty"` // for install
|
||||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||||
@@ -585,6 +586,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
GPUIndices: t.params.GPUIndices,
|
GPUIndices: t.params.GPUIndices,
|
||||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
RunNCCL: t.params.RunNCCL,
|
RunNCCL: t.params.RunNCCL,
|
||||||
|
ParallelGPUs: t.params.ParallelGPUs,
|
||||||
}, j.append)
|
}, j.append)
|
||||||
case "nvidia-compute":
|
case "nvidia-compute":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
|
|||||||
Reference in New Issue
Block a user