Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2354ae367d | ||
|
|
0d0e1f55a7 | ||
|
|
35f4c53887 | ||
|
|
981315e6fd | ||
|
|
fc5c100a29 | ||
| 6e94216f3b |
@@ -679,7 +679,10 @@ func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gp
|
|||||||
"-g", strconv.Itoa(len(gpuIndices)),
|
"-g", strconv.Itoa(len(gpuIndices)),
|
||||||
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
|
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
|
||||||
}
|
}
|
||||||
env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
|
env := []string{
|
||||||
|
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||||
|
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||||
|
}
|
||||||
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
|
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
|
||||||
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
|
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
|
||||||
|
|||||||
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||||
job,
|
job,
|
||||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaStressArchivePrefix(loader string) string {
|
func nvidiaStressArchivePrefix(loader string) string {
|
||||||
|
|||||||
@@ -278,13 +278,13 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
|||||||
if gpuCount < 1 {
|
if gpuCount < 1 {
|
||||||
gpuCount = 1
|
gpuCount = 1
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||||
}},
|
}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
@@ -296,18 +296,18 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||||
{
|
satJob{
|
||||||
name: "03-dcgmproftester.log",
|
name: "03-dcgmproftester.log",
|
||||||
cmd: profCmd,
|
cmd: profCmd,
|
||||||
env: nvidiaVisibleDevicesEnv(selected),
|
env: nvidiaVisibleDevicesEnv(selected),
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
@@ -315,16 +315,16 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{
|
satJob{
|
||||||
name: "02-dcgmi-targeted-power.log",
|
name: "02-dcgmi-targeted-power.log",
|
||||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
@@ -332,16 +332,16 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{
|
satJob{
|
||||||
name: "02-dcgmi-pulse-test.log",
|
name: "02-dcgmi-pulse-test.log",
|
||||||
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
@@ -349,16 +349,16 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{
|
satJob{
|
||||||
name: "02-dcgmi-nvbandwidth.log",
|
name: "02-dcgmi-nvbandwidth.log",
|
||||||
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
@@ -389,16 +389,16 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
|
|||||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{
|
satJob{
|
||||||
name: "02-dcgmi-targeted-stress.log",
|
name: "02-dcgmi-targeted-stress.log",
|
||||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
}, logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||||
@@ -568,14 +568,24 @@ type satStats struct {
|
|||||||
Unsupported int
|
Unsupported int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
|
||||||
|
out := make([]satJob, 0, len(jobs)+1)
|
||||||
|
out = append(out, satJob{
|
||||||
|
name: "00-nvidia-smi-persistence-mode.log",
|
||||||
|
cmd: []string{"nvidia-smi", "-pm", "1"},
|
||||||
|
})
|
||||||
|
out = append(out, jobs...)
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
func nvidiaSATJobs() []satJob {
|
func nvidiaSATJobs() []satJob {
|
||||||
return []satJob{
|
return withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||||
}
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||||
@@ -590,12 +600,12 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
|||||||
}
|
}
|
||||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||||
}
|
}
|
||||||
return []satJob{
|
return withNvidiaPersistenceMode(
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||||
}
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||||||
@@ -620,7 +630,10 @@ func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
|||||||
if len(gpuIndices) == 0 {
|
if len(gpuIndices) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
|
return []string{
|
||||||
|
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||||
|
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||||
@@ -661,6 +674,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||||
return "", writeErr
|
return "", writeErr
|
||||||
}
|
}
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return "", ctx.Err()
|
||||||
|
}
|
||||||
status, rc := classifySATResult(job.name, out, err)
|
status, rc := classifySATResult(job.name, out, err)
|
||||||
stats.Add(status)
|
stats.Add(status)
|
||||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||||
|
|||||||
@@ -1,12 +1,14 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestStorageSATCommands(t *testing.T) {
|
func TestStorageSATCommands(t *testing.T) {
|
||||||
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
|||||||
|
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
|
|
||||||
if len(jobs) != 5 {
|
if len(jobs) != 6 {
|
||||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
t.Fatalf("jobs=%d want 6", len(jobs))
|
||||||
}
|
}
|
||||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
if got := jobs[0].cmd[0]; got != "nvidia-smi" {
|
||||||
|
t.Fatalf("preflight command=%q want nvidia-smi", got)
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||||
|
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||||
|
}
|
||||||
|
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
|
||||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||||
}
|
}
|
||||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
if got := jobs[4].cmd[1]; got != "--output-file" {
|
||||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
|||||||
|
|
||||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
got := jobs[4].cmd
|
got := jobs[5].cmd
|
||||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||||
if len(got) != len(want) {
|
if len(got) != len(want) {
|
||||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||||
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
|
||||||
|
jobs := nvidiaDCGMJobs(3, []int{2, 0})
|
||||||
|
if len(jobs) != 5 {
|
||||||
|
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||||
|
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||||
|
}
|
||||||
|
if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
|
||||||
|
t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -234,11 +255,14 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
|||||||
|
|
||||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||||
if len(env) != 1 {
|
if len(env) != 2 {
|
||||||
t.Fatalf("env len=%d want 1 (%v)", len(env), env)
|
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
|
||||||
}
|
}
|
||||||
if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
|
||||||
t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
|
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
|
||||||
|
}
|
||||||
|
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||||
|
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -331,6 +355,38 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
t.Cleanup(cancel)
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
cancel()
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
|
||||||
|
{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
|
||||||
|
}, nil)
|
||||||
|
<-done
|
||||||
|
|
||||||
|
if !errors.Is(err, context.Canceled) {
|
||||||
|
t.Fatalf("err=%v want context.Canceled", err)
|
||||||
|
}
|
||||||
|
if archive != "" {
|
||||||
|
t.Fatalf("archive=%q want empty", archive)
|
||||||
|
}
|
||||||
|
matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
|
||||||
|
if globErr != nil {
|
||||||
|
t.Fatalf("Glob error: %v", globErr)
|
||||||
|
}
|
||||||
|
if len(matches) != 0 {
|
||||||
|
t.Fatalf("archives=%v want none", matches)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import (
|
|||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"syscall"
|
"syscall"
|
||||||
@@ -21,13 +22,238 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
||||||
|
var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
if a == nil {
|
||||||
|
return nil, fmt.Errorf("app not configured")
|
||||||
|
}
|
||||||
|
return a.ListNvidiaGPUs()
|
||||||
|
}
|
||||||
|
|
||||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
var jobCounter atomic.Uint64
|
var jobCounter atomic.Uint64
|
||||||
|
|
||||||
func newJobID(prefix string) string {
|
func newJobID(_ string) string {
|
||||||
return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
|
start := int((jobCounter.Add(1) - 1) % 1000)
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
for offset := 0; offset < 1000; offset++ {
|
||||||
|
n := (start + offset) % 1000
|
||||||
|
id := fmt.Sprintf("TASK-%03d", n)
|
||||||
|
if !taskIDInUseLocked(id) {
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("TASK-%03d", start)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskIDInUseLocked(id string) bool {
|
||||||
|
for _, t := range globalQueue.tasks {
|
||||||
|
if t != nil && t.ID == id {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// taskRunResponse is the JSON body written by writeTaskRunResponse after one
// or more tasks have been created. Every field is omitted from the JSON
// output when it holds its zero value.
type taskRunResponse struct {
	// TaskID is the ID of the first created task.
	TaskID string `json:"task_id,omitempty"`
	// JobID carries the same value as TaskID under the "job_id" key.
	JobID string `json:"job_id,omitempty"`
	// TaskIDs lists the IDs of all created tasks.
	TaskIDs []string `json:"task_ids,omitempty"`
	// JobIDs carries the same list as TaskIDs under the "job_ids" key.
	JobIDs []string `json:"job_ids,omitempty"`
	// TaskCount is the number of IDs reported in TaskIDs.
	TaskCount int `json:"task_count,omitempty"`
}
|
||||||
|
|
||||||
|
// nvidiaTaskSelection describes one set of NVIDIA GPUs that is scheduled as
// its own task.
type nvidiaTaskSelection struct {
	// GPUIndices holds the indices of the GPUs covered by this selection.
	GPUIndices []int
	// Label is the human-readable description of the selection that is
	// appended to the task's display name.
	Label string
}
|
||||||
|
|
||||||
|
func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
|
||||||
|
if len(tasks) == 0 {
|
||||||
|
writeJSON(w, taskRunResponse{})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
ids := make([]string, 0, len(tasks))
|
||||||
|
for _, t := range tasks {
|
||||||
|
if t == nil || strings.TrimSpace(t.ID) == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ids = append(ids, t.ID)
|
||||||
|
}
|
||||||
|
resp := taskRunResponse{TaskCount: len(ids)}
|
||||||
|
if len(ids) > 0 {
|
||||||
|
resp.TaskID = ids[0]
|
||||||
|
resp.JobID = ids[0]
|
||||||
|
resp.TaskIDs = ids
|
||||||
|
resp.JobIDs = ids
|
||||||
|
}
|
||||||
|
writeJSON(w, resp)
|
||||||
|
}
|
||||||
|
|
||||||
|
// shouldSplitHomogeneousNvidiaTarget reports whether target, after trimming
// surrounding whitespace, is one of the NVIDIA targets listed below. The
// comparison is exact and case-sensitive.
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
	splittable := [...]string{
		"nvidia",
		"nvidia-targeted-stress",
		"nvidia-benchmark",
		"nvidia-compute",
		"nvidia-targeted-power",
		"nvidia-pulse",
		"nvidia-interconnect",
		"nvidia-bandwidth",
		"nvidia-stress",
	}

	name := strings.TrimSpace(target)
	for _, candidate := range splittable {
		if name == candidate {
			return true
		}
	}
	return false
}
|
||||||
|
|
||||||
|
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
||||||
|
if len(gpus) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
||||||
|
}
|
||||||
|
indexed := make(map[int]platform.NvidiaGPU, len(gpus))
|
||||||
|
allIndices := make([]int, 0, len(gpus))
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
indexed[gpu.Index] = gpu
|
||||||
|
allIndices = append(allIndices, gpu.Index)
|
||||||
|
}
|
||||||
|
sort.Ints(allIndices)
|
||||||
|
|
||||||
|
selected := allIndices
|
||||||
|
if len(include) > 0 {
|
||||||
|
selected = make([]int, 0, len(include))
|
||||||
|
seen := make(map[int]struct{}, len(include))
|
||||||
|
for _, idx := range include {
|
||||||
|
if _, ok := indexed[idx]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, dup := seen[idx]; dup {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[idx] = struct{}{}
|
||||||
|
selected = append(selected, idx)
|
||||||
|
}
|
||||||
|
sort.Ints(selected)
|
||||||
|
}
|
||||||
|
if len(exclude) > 0 {
|
||||||
|
skip := make(map[int]struct{}, len(exclude))
|
||||||
|
for _, idx := range exclude {
|
||||||
|
skip[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
filtered := selected[:0]
|
||||||
|
for _, idx := range selected {
|
||||||
|
if _, ok := skip[idx]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
filtered = append(filtered, idx)
|
||||||
|
}
|
||||||
|
selected = filtered
|
||||||
|
}
|
||||||
|
if len(selected) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs selected")
|
||||||
|
}
|
||||||
|
|
||||||
|
modelGroups := make(map[string][]platform.NvidiaGPU)
|
||||||
|
modelOrder := make([]string, 0)
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpu := indexed[idx]
|
||||||
|
model := strings.TrimSpace(gpu.Name)
|
||||||
|
if model == "" {
|
||||||
|
model = fmt.Sprintf("GPU %d", gpu.Index)
|
||||||
|
}
|
||||||
|
if _, ok := modelGroups[model]; !ok {
|
||||||
|
modelOrder = append(modelOrder, model)
|
||||||
|
}
|
||||||
|
modelGroups[model] = append(modelGroups[model], gpu)
|
||||||
|
}
|
||||||
|
sort.Slice(modelOrder, func(i, j int) bool {
|
||||||
|
left := modelGroups[modelOrder[i]]
|
||||||
|
right := modelGroups[modelOrder[j]]
|
||||||
|
if len(left) == 0 || len(right) == 0 {
|
||||||
|
return modelOrder[i] < modelOrder[j]
|
||||||
|
}
|
||||||
|
return left[0].Index < right[0].Index
|
||||||
|
})
|
||||||
|
|
||||||
|
var groups []nvidiaTaskSelection
|
||||||
|
var singles []nvidiaTaskSelection
|
||||||
|
for _, model := range modelOrder {
|
||||||
|
group := modelGroups[model]
|
||||||
|
sort.Slice(group, func(i, j int) bool { return group[i].Index < group[j].Index })
|
||||||
|
indices := make([]int, 0, len(group))
|
||||||
|
for _, gpu := range group {
|
||||||
|
indices = append(indices, gpu.Index)
|
||||||
|
}
|
||||||
|
if len(indices) >= 2 {
|
||||||
|
groups = append(groups, nvidiaTaskSelection{
|
||||||
|
GPUIndices: indices,
|
||||||
|
Label: fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)),
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpu := group[0]
|
||||||
|
singles = append(singles, nvidiaTaskSelection{
|
||||||
|
GPUIndices: []int{gpu.Index},
|
||||||
|
Label: fmt.Sprintf("GPU %d — %s", gpu.Index, model),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return append(groups, singles...), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// joinTaskIndices renders the indices in decimal, separated by commas and
// without spaces (for example "0,2,4"). An empty or nil slice yields "".
func joinTaskIndices(indices []int) string {
	var joined strings.Builder
	for pos, idx := range indices {
		if pos > 0 {
			joined.WriteString(",")
		}
		fmt.Fprintf(&joined, "%d", idx)
	}
	return joined.String()
}
|
||||||
|
|
||||||
|
// formatSplitTaskName combines a base task name with a selection label as
// "base (label)". Both inputs are trimmed first; when either one is empty the
// other is returned on its own, and two empty inputs yield "".
func formatSplitTaskName(baseName, selectionLabel string) string {
	base := strings.TrimSpace(baseName)
	label := strings.TrimSpace(selectionLabel)
	switch {
	case base == "":
		return label
	case label == "":
		return base
	default:
		return base + " (" + label + ")"
	}
}
|
||||||
|
|
||||||
|
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
|
||||||
|
if !shouldSplitHomogeneousNvidiaTarget(target) {
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID(idPrefix),
|
||||||
|
Name: baseName,
|
||||||
|
Target: target,
|
||||||
|
Priority: priority,
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: createdAt,
|
||||||
|
params: params,
|
||||||
|
}
|
||||||
|
return []*Task{t}, nil
|
||||||
|
}
|
||||||
|
gpus, err := apiListNvidiaGPUs(appRef)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tasks := make([]*Task, 0, len(selections))
|
||||||
|
for _, selection := range selections {
|
||||||
|
taskParamsCopy := params
|
||||||
|
taskParamsCopy.GPUIndices = append([]int(nil), selection.GPUIndices...)
|
||||||
|
taskParamsCopy.ExcludeGPUIndices = nil
|
||||||
|
displayName := formatSplitTaskName(baseName, selection.Label)
|
||||||
|
taskParamsCopy.DisplayName = displayName
|
||||||
|
tasks = append(tasks, &Task{
|
||||||
|
ID: newJobID(idPrefix),
|
||||||
|
Name: displayName,
|
||||||
|
Target: target,
|
||||||
|
Priority: priority,
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: createdAt,
|
||||||
|
params: taskParamsCopy,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return tasks, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
||||||
@@ -207,28 +433,28 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
name := taskDisplayName(target, body.Profile, body.Loader)
|
name := taskDisplayName(target, body.Profile, body.Loader)
|
||||||
t := &Task{
|
|
||||||
ID: newJobID("sat-" + target),
|
|
||||||
Name: name,
|
|
||||||
Target: target,
|
|
||||||
Status: TaskPending,
|
|
||||||
CreatedAt: time.Now(),
|
|
||||||
params: taskParams{
|
|
||||||
Duration: body.Duration,
|
|
||||||
DiagLevel: body.DiagLevel,
|
|
||||||
GPUIndices: body.GPUIndices,
|
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
|
||||||
Loader: body.Loader,
|
|
||||||
BurnProfile: body.Profile,
|
|
||||||
DisplayName: body.DisplayName,
|
|
||||||
PlatformComponents: body.PlatformComponents,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
t.Name = body.DisplayName
|
name = body.DisplayName
|
||||||
}
|
}
|
||||||
globalQueue.enqueue(t)
|
params := taskParams{
|
||||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
Duration: body.Duration,
|
||||||
|
DiagLevel: body.DiagLevel,
|
||||||
|
GPUIndices: body.GPUIndices,
|
||||||
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
|
Loader: body.Loader,
|
||||||
|
BurnProfile: body.Profile,
|
||||||
|
DisplayName: body.DisplayName,
|
||||||
|
PlatformComponents: body.PlatformComponents,
|
||||||
|
}
|
||||||
|
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, t := range tasks {
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
}
|
||||||
|
writeTaskRunResponse(w, tasks)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -257,27 +483,26 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
if body.RunNCCL != nil {
|
if body.RunNCCL != nil {
|
||||||
runNCCL = *body.RunNCCL
|
runNCCL = *body.RunNCCL
|
||||||
}
|
}
|
||||||
t := &Task{
|
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||||
ID: newJobID("benchmark-nvidia"),
|
|
||||||
Name: taskDisplayName("nvidia-benchmark", "", ""),
|
|
||||||
Target: "nvidia-benchmark",
|
|
||||||
Priority: 15,
|
|
||||||
Status: TaskPending,
|
|
||||||
CreatedAt: time.Now(),
|
|
||||||
params: taskParams{
|
|
||||||
GPUIndices: body.GPUIndices,
|
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
|
||||||
SizeMB: body.SizeMB,
|
|
||||||
BenchmarkProfile: body.Profile,
|
|
||||||
RunNCCL: runNCCL,
|
|
||||||
DisplayName: body.DisplayName,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
t.Name = body.DisplayName
|
name = body.DisplayName
|
||||||
}
|
}
|
||||||
globalQueue.enqueue(t)
|
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
|
||||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
GPUIndices: body.GPUIndices,
|
||||||
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
|
SizeMB: body.SizeMB,
|
||||||
|
BenchmarkProfile: body.Profile,
|
||||||
|
RunNCCL: runNCCL,
|
||||||
|
DisplayName: body.DisplayName,
|
||||||
|
}, name, h.opts.App, "benchmark-nvidia")
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, t := range tasks {
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
}
|
||||||
|
writeTaskRunResponse(w, tasks)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
@@ -74,6 +75,14 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
|||||||
globalQueue.tasks = originalTasks
|
globalQueue.tasks = originalTasks
|
||||||
globalQueue.mu.Unlock()
|
globalQueue.mu.Unlock()
|
||||||
})
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 3, Name: "NVIDIA H100 PCIe"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||||
@@ -101,6 +110,97 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var resp taskRunResponse
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||||
|
t.Fatalf("decode response: %v", err)
|
||||||
|
}
|
||||||
|
if len(resp.TaskIDs) != 2 {
|
||||||
|
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 2 {
|
||||||
|
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||||
|
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 2 {
|
||||||
|
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||||
|
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
h := &handler{}
|
h := &handler{}
|
||||||
h.pushFanRings([]platform.FanReading{
|
h.pushFanRings([]platform.FanReading{
|
||||||
|
|||||||
@@ -8,9 +8,12 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -161,7 +164,7 @@ func renderPage(page string, opts HandlerOptions) string {
|
|||||||
case "benchmark":
|
case "benchmark":
|
||||||
pageID = "benchmark"
|
pageID = "benchmark"
|
||||||
title = "Benchmark"
|
title = "Benchmark"
|
||||||
body = renderBenchmark()
|
body = renderBenchmark(opts)
|
||||||
case "tasks":
|
case "tasks":
|
||||||
pageID = "tasks"
|
pageID = "tasks"
|
||||||
title = "Tasks"
|
title = "Tasks"
|
||||||
@@ -1072,14 +1075,14 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Select which NVIDIA GPUs to include in Validate. The same selection is used by both NVIDIA GPU cards below and by Validate one by one.`,
|
`Select which NVIDIA GPUs to include in Validate. The same selection is used by both NVIDIA GPU cards below and by Validate one by one.`,
|
||||||
`<code>nvidia-smi --query-gpu=index,name,memory.total</code>`,
|
`<code>nvidia-smi --query-gpu=index,name,memory.total</code>`,
|
||||||
`<div id="sat-gpu-list"><p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs…</p></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:8px"><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectAllGPUs()">Select all</button><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectNoGPUs()">Clear</button></div><div id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin-top:8px"></div>`,
|
`<div id="sat-gpu-list"><p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs…</p></div><div style="display:flex;gap:8px;flex-wrap:wrap;margin-top:8px"><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectAllGPUs()">Select all</button><button type="button" class="btn btn-sm btn-secondary" onclick="satSelectNoGPUs()">Clear</button></div><div id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin-top:8px"></div>`,
|
||||||
)) +
|
|
||||||
renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
|
||||||
inv.NVIDIA,
|
|
||||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
|
||||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
|
||||||
`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
|
|
||||||
)) +
|
)) +
|
||||||
|
renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||||
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
|
`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
|
||||||
|
)) +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
||||||
@@ -1569,7 +1572,25 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
|
|||||||
|
|
||||||
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func renderBenchmark() string {
|
// Model for the saved benchmark results table: one column per GPU, one row
// per saved run, one cell per (run, GPU) pair.
type (
	// benchmarkHistoryColumn describes one GPU column of the results table.
	benchmarkHistoryColumn struct {
		key   string // lookup key into benchmarkHistoryRun.cells
		label string // header text shown for the column
		name  string // GPU name, used for sorting columns
		index int    // GPU index, used as the secondary sort key
	}

	// benchmarkHistoryCell holds the score shown for one GPU in one run.
	benchmarkHistoryCell struct {
		score   float64 // value rendered in the cell
		present bool    // false means the cell is rendered as "-"
	}

	// benchmarkHistoryRun is one row of the results table.
	benchmarkHistoryRun struct {
		generatedAt time.Time                       // timestamp used to order rows
		displayTime string                          // preformatted timestamp shown in the row
		cells       map[string]benchmarkHistoryCell // keyed by benchmarkHistoryColumn.key
	}
)
|
||||||
|
|
||||||
|
func renderBenchmark(opts HandlerOptions) string {
|
||||||
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
return `<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
<div class="grid2">
|
<div class="grid2">
|
||||||
@@ -1618,6 +1639,8 @@ func renderBenchmark() string {
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
` + renderBenchmarkResultsCard(opts.ExportDir) + `
|
||||||
|
|
||||||
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
<div id="benchmark-output" style="display:none;margin-top:16px" class="card">
|
||||||
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
<div class="card-head">Benchmark Output <span id="benchmark-title"></span></div>
|
||||||
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
<div class="card-body"><div id="benchmark-terminal" class="terminal"></div></div>
|
||||||
@@ -1633,6 +1656,12 @@ func renderBenchmark() string {
|
|||||||
<script>
|
<script>
|
||||||
let benchmarkES = null;
|
let benchmarkES = null;
|
||||||
|
|
||||||
|
function benchmarkTaskIDs(payload) {
|
||||||
|
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||||
|
if (payload && payload.task_id) return [payload.task_id];
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
function benchmarkSelectedGPUIndices() {
|
function benchmarkSelectedGPUIndices() {
|
||||||
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||||||
.filter(function(el) { return el.checked && !el.disabled; })
|
.filter(function(el) { return el.checked && !el.disabled; })
|
||||||
@@ -1732,17 +1761,37 @@ function runNvidiaBenchmark() {
|
|||||||
return payload;
|
return payload;
|
||||||
});
|
});
|
||||||
}).then(function(d) {
|
}).then(function(d) {
|
||||||
status.textContent = 'Task ' + d.task_id + ' queued.';
|
const taskIds = benchmarkTaskIDs(d);
|
||||||
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
|
if (!taskIds.length) throw new Error('No benchmark task was queued.');
|
||||||
benchmarkES = new EventSource('/api/tasks/' + d.task_id + '/stream');
|
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
|
||||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
const streamNext = function(idx, failures) {
|
||||||
benchmarkES.addEventListener('done', function(e) {
|
if (idx >= taskIds.length) {
|
||||||
benchmarkES.close();
|
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
|
||||||
benchmarkES = null;
|
return;
|
||||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
}
|
||||||
term.scrollTop = term.scrollHeight;
|
const taskId = taskIds[idx];
|
||||||
status.textContent = e.data ? 'Failed.' : 'Completed.';
|
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
|
||||||
});
|
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
benchmarkES.addEventListener('done', function(e) {
|
||||||
|
benchmarkES.close();
|
||||||
|
benchmarkES = null;
|
||||||
|
if (e.data) failures += 1;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
streamNext(idx + 1, failures);
|
||||||
|
});
|
||||||
|
benchmarkES.onerror = function() {
|
||||||
|
if (benchmarkES) {
|
||||||
|
benchmarkES.close();
|
||||||
|
benchmarkES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
streamNext(idx + 1, failures + 1);
|
||||||
|
};
|
||||||
|
};
|
||||||
|
streamNext(0, 0);
|
||||||
}).catch(function(err) {
|
}).catch(function(err) {
|
||||||
status.textContent = 'Error.';
|
status.textContent = 'Error.';
|
||||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||||
@@ -1754,6 +1803,129 @@ benchmarkLoadGPUs();
|
|||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkResultsCard(exportDir string) string {
|
||||||
|
columns, runs := loadBenchmarkHistory(exportDir)
|
||||||
|
return renderBenchmarkResultsCardFromRuns(
|
||||||
|
"Benchmark Results",
|
||||||
|
"Composite score by saved benchmark run and GPU.",
|
||||||
|
"No saved benchmark runs yet.",
|
||||||
|
columns,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, columns []benchmarkHistoryColumn, runs []benchmarkHistoryRun) string {
|
||||||
|
if len(runs) == 0 {
|
||||||
|
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
|
||||||
|
}
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
|
||||||
|
if strings.TrimSpace(description) != "" {
|
||||||
|
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`<div style="overflow-x:auto">`)
|
||||||
|
b.WriteString(`<table><thead><tr><th>Test</th><th>Time</th>`)
|
||||||
|
for _, col := range columns {
|
||||||
|
b.WriteString(`<th>` + html.EscapeString(col.label) + `</th>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tr></thead><tbody>`)
|
||||||
|
for i, run := range runs {
|
||||||
|
b.WriteString(`<tr>`)
|
||||||
|
b.WriteString(`<td>#` + strconv.Itoa(i+1) + `</td>`)
|
||||||
|
b.WriteString(`<td>` + html.EscapeString(run.displayTime) + `</td>`)
|
||||||
|
for _, col := range columns {
|
||||||
|
cell, ok := run.cells[col.key]
|
||||||
|
if !ok || !cell.present {
|
||||||
|
b.WriteString(`<td style="color:var(--muted)">-</td>`)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
b.WriteString(`<td>` + fmt.Sprintf("%.2f", cell.score) + `</td>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tr>`)
|
||||||
|
}
|
||||||
|
b.WriteString(`</tbody></table></div></div></div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadBenchmarkHistory(exportDir string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
|
||||||
|
baseDir := app.DefaultBenchmarkBaseDir
|
||||||
|
if strings.TrimSpace(exportDir) != "" {
|
||||||
|
baseDir = filepath.Join(exportDir, "bee-benchmark")
|
||||||
|
}
|
||||||
|
paths, err := filepath.Glob(filepath.Join(baseDir, "gpu-benchmark-*", "result.json"))
|
||||||
|
if err != nil || len(paths) == 0 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
sort.Strings(paths)
|
||||||
|
return loadBenchmarkHistoryFromPaths(paths)
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
|
||||||
|
columnByKey := make(map[string]benchmarkHistoryColumn)
|
||||||
|
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||||
|
for _, path := range paths {
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
var result platform.NvidiaBenchmarkResult
|
||||||
|
if err := json.Unmarshal(raw, &result); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
run := benchmarkHistoryRun{
|
||||||
|
generatedAt: result.GeneratedAt,
|
||||||
|
displayTime: result.GeneratedAt.Local().Format("2006-01-02 15:04:05"),
|
||||||
|
cells: make(map[string]benchmarkHistoryCell),
|
||||||
|
}
|
||||||
|
for _, gpu := range result.GPUs {
|
||||||
|
key := benchmarkHistoryColumnKey(gpu.Name, gpu.Index)
|
||||||
|
columnByKey[key] = benchmarkHistoryColumn{
|
||||||
|
key: key,
|
||||||
|
label: benchmarkHistoryColumnLabel(gpu.Name, gpu.Index),
|
||||||
|
name: strings.TrimSpace(gpu.Name),
|
||||||
|
index: gpu.Index,
|
||||||
|
}
|
||||||
|
run.cells[key] = benchmarkHistoryCell{
|
||||||
|
score: gpu.Scores.CompositeScore,
|
||||||
|
present: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
runs = append(runs, run)
|
||||||
|
}
|
||||||
|
|
||||||
|
columns := make([]benchmarkHistoryColumn, 0, len(columnByKey))
|
||||||
|
for _, col := range columnByKey {
|
||||||
|
columns = append(columns, col)
|
||||||
|
}
|
||||||
|
sort.Slice(columns, func(i, j int) bool {
|
||||||
|
leftName := strings.ToLower(strings.TrimSpace(columns[i].name))
|
||||||
|
rightName := strings.ToLower(strings.TrimSpace(columns[j].name))
|
||||||
|
if leftName != rightName {
|
||||||
|
return leftName < rightName
|
||||||
|
}
|
||||||
|
if columns[i].index != columns[j].index {
|
||||||
|
return columns[i].index < columns[j].index
|
||||||
|
}
|
||||||
|
return columns[i].key < columns[j].key
|
||||||
|
})
|
||||||
|
sort.Slice(runs, func(i, j int) bool {
|
||||||
|
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||||
|
})
|
||||||
|
return columns, runs
|
||||||
|
}
|
||||||
|
|
||||||
|
// benchmarkHistoryColumnKey returns the map key identifying a GPU column:
// the whitespace-trimmed name and the decimal index joined by "|".
func benchmarkHistoryColumnKey(name string, index int) string {
	parts := []string{strings.TrimSpace(name), strconv.Itoa(index)}
	return strings.Join(parts, "|")
}
|
||||||
|
|
||||||
|
// benchmarkHistoryColumnLabel returns the header text for a GPU column:
// "<name> / GPU <index>", or just "GPU <index>" when name is blank after
// trimming whitespace.
func benchmarkHistoryColumnLabel(name string, index int) string {
	switch trimmed := strings.TrimSpace(name); trimmed {
	case "":
		return fmt.Sprintf("GPU %d", index)
	default:
		return fmt.Sprintf("%s / GPU %d", trimmed, index)
	}
}
|
||||||
|
|
||||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func renderBurn() string {
|
func renderBurn() string {
|
||||||
@@ -1873,6 +2045,12 @@ func renderBurn() string {
|
|||||||
<script>
|
<script>
|
||||||
let biES = null;
|
let biES = null;
|
||||||
|
|
||||||
|
function burnTaskIDs(payload) {
|
||||||
|
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||||
|
if (payload && payload.task_id) return [payload.task_id];
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
function burnProfile() {
|
function burnProfile() {
|
||||||
const selected = document.querySelector('input[name="burn-profile"]:checked');
|
const selected = document.querySelector('input[name="burn-profile"]:checked');
|
||||||
return selected ? selected.value : 'smoke';
|
return selected ? selected.value : 'smoke';
|
||||||
@@ -1974,6 +2152,9 @@ function streamTask(taskId, label) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
function streamBurnTask(taskId, label, resetTerminal) {
|
function streamBurnTask(taskId, label, resetTerminal) {
|
||||||
|
return streamBurnTaskSet([taskId], label, resetTerminal);
|
||||||
|
}
|
||||||
|
function streamBurnTaskSet(taskIds, label, resetTerminal) {
|
||||||
if (biES) { biES.close(); biES = null; }
|
if (biES) { biES.close(); biES = null; }
|
||||||
document.getElementById('bi-output').style.display = 'block';
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||||
@@ -1981,27 +2162,40 @@ function streamBurnTask(taskId, label, resetTerminal) {
|
|||||||
if (resetTerminal) {
|
if (resetTerminal) {
|
||||||
term.textContent = '';
|
term.textContent = '';
|
||||||
}
|
}
|
||||||
term.textContent += 'Task ' + taskId + ' queued. Streaming...\n';
|
if (!Array.isArray(taskIds) || !taskIds.length) {
|
||||||
return new Promise(function(resolve) {
|
term.textContent += 'ERROR: no tasks queued.\n';
|
||||||
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
return Promise.resolve({ok:false, error:'no tasks queued'});
|
||||||
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
}
|
||||||
biES.addEventListener('done', function(e) {
|
const streamNext = function(idx, failures) {
|
||||||
biES.close();
|
if (idx >= taskIds.length) {
|
||||||
biES = null;
|
return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
|
||||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
}
|
||||||
term.scrollTop = term.scrollHeight;
|
const taskId = taskIds[idx];
|
||||||
resolve({ok: !e.data, error: e.data || ''});
|
term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
|
||||||
});
|
return new Promise(function(resolve) {
|
||||||
biES.onerror = function() {
|
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
if (biES) {
|
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
biES.addEventListener('done', function(e) {
|
||||||
biES.close();
|
biES.close();
|
||||||
biES = null;
|
biES = null;
|
||||||
}
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
term.textContent += '\nERROR: stream disconnected.\n';
|
term.scrollTop = term.scrollHeight;
|
||||||
term.scrollTop = term.scrollHeight;
|
resolve(failures + (e.data ? 1 : 0));
|
||||||
resolve({ok: false, error: 'stream disconnected'});
|
});
|
||||||
};
|
biES.onerror = function() {
|
||||||
});
|
if (biES) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve(failures + 1);
|
||||||
|
};
|
||||||
|
}).then(function(nextFailures) {
|
||||||
|
return streamNext(idx + 1, nextFailures);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return streamNext(0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
function runBurnTaskSet(tasks, statusElId) {
|
function runBurnTaskSet(tasks, statusElId) {
|
||||||
@@ -2029,7 +2223,7 @@ function runBurnTaskSet(tasks, statusElId) {
|
|||||||
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
|
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
|
||||||
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
||||||
.then(function(d) {
|
.then(function(d) {
|
||||||
return streamBurnTask(d.task_id, t.label, false);
|
return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
|
||||||
})
|
})
|
||||||
.then(function() {
|
.then(function() {
|
||||||
return runNext(idx + 1);
|
return runNext(idx + 1);
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"os"
|
"os"
|
||||||
@@ -636,6 +637,66 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
exportDir := filepath.Join(dir, "export")
|
||||||
|
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
result := platform.NvidiaBenchmarkResult{
|
||||||
|
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||||
|
BenchmarkProfile: "standard",
|
||||||
|
OverallStatus: "OK",
|
||||||
|
GPUs: []platform.BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1176.25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Index: 1,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1168.50,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
raw, err := json.Marshal(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "result.json"), raw, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d", rec.Code)
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Benchmark Results`,
|
||||||
|
`Composite score by saved benchmark run and GPU.`,
|
||||||
|
`NVIDIA H100 PCIe / GPU 0`,
|
||||||
|
`NVIDIA H100 PCIe / GPU 1`,
|
||||||
|
`#1`,
|
||||||
|
wantTime,
|
||||||
|
`1176.25`,
|
||||||
|
`1168.50`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
|
|||||||
@@ -97,10 +97,13 @@ func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
|||||||
body.WriteString(`</div></div>`)
|
body.WriteString(`</div></div>`)
|
||||||
}
|
}
|
||||||
|
|
||||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
if task.Status == TaskRunning {
|
||||||
body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
|
body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
|
||||||
body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
|
body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
|
||||||
body.WriteString(`</div></div>`)
|
body.WriteString(`</div></div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||||
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
|
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
|
||||||
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
|
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
|
||||||
body.WriteString(`</div></div>`)
|
body.WriteString(`</div></div>`)
|
||||||
|
|||||||
@@ -230,6 +230,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
|||||||
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
||||||
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
||||||
b.WriteString(`</div></div></div>`)
|
b.WriteString(`</div></div></div>`)
|
||||||
|
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||||
|
b.WriteString(benchmarkCard)
|
||||||
|
}
|
||||||
|
|
||||||
if len(report.Charts) > 0 {
|
if len(report.Charts) > 0 {
|
||||||
for _, chart := range report.Charts {
|
for _, chart := range report.Charts {
|
||||||
@@ -247,6 +250,57 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
|||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||||
|
if strings.TrimSpace(target) != "nvidia-benchmark" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
resultPath := taskBenchmarkResultPath(logText)
|
||||||
|
if strings.TrimSpace(resultPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
|
||||||
|
if len(runs) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return renderBenchmarkResultsCardFromRuns(
|
||||||
|
"Benchmark Results",
|
||||||
|
"Composite score for this benchmark task.",
|
||||||
|
"No benchmark results were saved for this task.",
|
||||||
|
columns,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskBenchmarkResultPath(logText string) string {
|
||||||
|
archivePath := taskArchivePathFromLog(logText)
|
||||||
|
if archivePath == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
if runDir == archivePath {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return filepath.Join(runDir, "result.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskArchivePathFromLog(logText string) string {
|
||||||
|
lines := strings.Split(logText, "\n")
|
||||||
|
for i := len(lines) - 1; i >= 0; i-- {
|
||||||
|
line := strings.TrimSpace(lines[i])
|
||||||
|
if line == "" || !strings.HasPrefix(line, "Archive:") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
path := strings.TrimSpace(strings.TrimPrefix(line, "Archive:"))
|
||||||
|
if strings.HasPrefix(path, "Archive written to ") {
|
||||||
|
path = strings.TrimSpace(strings.TrimPrefix(path, "Archive written to "))
|
||||||
|
}
|
||||||
|
if strings.HasSuffix(path, ".tar.gz") {
|
||||||
|
return path
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
func renderTaskStatusBadge(status string) string {
|
func renderTaskStatusBadge(status string) string {
|
||||||
className := map[string]string{
|
className := map[string]string{
|
||||||
TaskRunning: "badge-ok",
|
TaskRunning: "badge-ok",
|
||||||
|
|||||||
@@ -1149,7 +1149,32 @@ func taskArtifactsDir(root string, t *Task, status string) string {
|
|||||||
if strings.TrimSpace(root) == "" || t == nil {
|
if strings.TrimSpace(root) == "" || t == nil {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", t.ID, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
|
prefix := taskFolderNumberPrefix(t.ID)
|
||||||
|
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", prefix, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskFolderNumberPrefix(taskID string) string {
|
||||||
|
taskID = strings.TrimSpace(taskID)
|
||||||
|
if strings.HasPrefix(taskID, "TASK-") && len(taskID) >= len("TASK-000") {
|
||||||
|
num := strings.TrimSpace(strings.TrimPrefix(taskID, "TASK-"))
|
||||||
|
if len(num) == 3 {
|
||||||
|
allDigits := true
|
||||||
|
for _, r := range num {
|
||||||
|
if r < '0' || r > '9' {
|
||||||
|
allDigits = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if allDigits {
|
||||||
|
return num
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fallback := sanitizeTaskFolderPart(taskID)
|
||||||
|
if fallback == "" {
|
||||||
|
return "000"
|
||||||
|
}
|
||||||
|
return fallback
|
||||||
}
|
}
|
||||||
|
|
||||||
func ensureTaskReportPaths(t *Task) {
|
func ensureTaskReportPaths(t *Task) {
|
||||||
|
|||||||
@@ -163,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
origCounter := jobCounter.Load()
|
||||||
|
jobCounter.Store(0)
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
jobCounter.Store(origCounter)
|
||||||
|
})
|
||||||
|
|
||||||
|
if got := newJobID("ignored"); got != "TASK-000" {
|
||||||
|
t.Fatalf("id=%q want TASK-000", got)
|
||||||
|
}
|
||||||
|
if got := newJobID("ignored"); got != "TASK-001" {
|
||||||
|
t.Fatalf("id=%q want TASK-001", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
|
||||||
|
root := t.TempDir()
|
||||||
|
task := &Task{
|
||||||
|
ID: "TASK-007",
|
||||||
|
Name: "NVIDIA Benchmark",
|
||||||
|
}
|
||||||
|
got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
|
||||||
|
if !strings.HasPrefix(got, "007_") {
|
||||||
|
t.Fatalf("artifacts dir=%q want prefix 007_", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
logPath := filepath.Join(dir, "task.log")
|
logPath := filepath.Join(dir, "task.log")
|
||||||
@@ -325,6 +359,78 @@ func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||||
|
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
result := platform.NvidiaBenchmarkResult{
|
||||||
|
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||||
|
BenchmarkProfile: "standard",
|
||||||
|
OverallStatus: "OK",
|
||||||
|
GPUs: []platform.BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1176.25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
raw, err := json.Marshal(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
|
||||||
|
if err := os.MkdirAll(artifactsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-bench",
|
||||||
|
Name: "NVIDIA Benchmark",
|
||||||
|
Target: "nvidia-benchmark",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||||
|
ArtifactsDir: artifactsDir,
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(task)
|
||||||
|
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
|
||||||
|
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := writeTaskReportArtifacts(task); err != nil {
|
||||||
|
t.Fatalf("writeTaskReportArtifacts: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
body, err := os.ReadFile(task.ReportHTMLPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile(report.html): %v", err)
|
||||||
|
}
|
||||||
|
html := string(body)
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Benchmark Results`,
|
||||||
|
`Composite score for this benchmark task.`,
|
||||||
|
`NVIDIA H100 PCIe / GPU 0`,
|
||||||
|
`1176.25`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(html, needle) {
|
||||||
|
t.Fatalf("report missing %q: %s", needle, html)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
||||||
var lines []string
|
var lines []string
|
||||||
prev := taskSerialWriteLine
|
prev := taskSerialWriteLine
|
||||||
|
|||||||
@@ -62,6 +62,8 @@ done
|
|||||||
echo "loader=bee-gpu-burn"
|
echo "loader=bee-gpu-burn"
|
||||||
echo "selected_gpus=${FINAL}"
|
echo "selected_gpus=${FINAL}"
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
|
||||||
TMP_DIR=$(mktemp -d)
|
TMP_DIR=$(mktemp -d)
|
||||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||||
|
|
||||||
@@ -78,7 +80,8 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
||||||
"${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
CUDA_VISIBLE_DEVICES="${id}" \
|
||||||
|
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||||
pid=$!
|
pid=$!
|
||||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||||
done
|
done
|
||||||
|
|||||||
@@ -152,14 +152,19 @@ done
|
|||||||
|
|
||||||
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
export CUDA_VISIBLE_DEVICES="${FINAL}"
|
||||||
|
|
||||||
JOHN_DEVICES=""
|
JOHN_DEVICES=""
|
||||||
|
local_id=1
|
||||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||||
opencl_id=$((id + 1))
|
opencl_id="${local_id}"
|
||||||
if [ -z "${JOHN_DEVICES}" ]; then
|
if [ -z "${JOHN_DEVICES}" ]; then
|
||||||
JOHN_DEVICES="${opencl_id}"
|
JOHN_DEVICES="${opencl_id}"
|
||||||
else
|
else
|
||||||
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
|
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
|
||||||
fi
|
fi
|
||||||
|
local_id=$((local_id + 1))
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "loader=john"
|
echo "loader=john"
|
||||||
|
|||||||
@@ -70,6 +70,8 @@ echo "gpu_count=${GPU_COUNT}"
|
|||||||
echo "range=${MIN_BYTES}..${MAX_BYTES}"
|
echo "range=${MIN_BYTES}..${MAX_BYTES}"
|
||||||
echo "iters=${ITERS}"
|
echo "iters=${ITERS}"
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
|
||||||
deadline=$(( $(date +%s) + SECONDS ))
|
deadline=$(( $(date +%s) + SECONDS ))
|
||||||
round=0
|
round=0
|
||||||
|
|
||||||
|
|||||||
@@ -209,6 +209,18 @@ fi
|
|||||||
ldconfig 2>/dev/null || true
|
ldconfig 2>/dev/null || true
|
||||||
log "ldconfig refreshed"
|
log "ldconfig refreshed"
|
||||||
|
|
||||||
|
# Keep persistence mode enabled across the session so dcgmi / stress tools do
|
||||||
|
# not fail with deployment warnings on otherwise healthy GPUs.
|
||||||
|
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||||
|
if nvidia-smi -pm 1 >/dev/null 2>&1; then
|
||||||
|
log "enabled NVIDIA persistence mode"
|
||||||
|
else
|
||||||
|
log "WARN: failed to enable NVIDIA persistence mode"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||||
|
fi
|
||||||
|
|
||||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||||
|
|||||||
Reference in New Issue
Block a user