diff --git a/audit/internal/platform/nvidia_stress.go b/audit/internal/platform/nvidia_stress.go index 6ef735e..8089a8d 100644 --- a/audit/internal/platform/nvidia_stress.go +++ b/audit/internal/platform/nvidia_stress.go @@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N return "", err } - return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{ - {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, - {name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}}, + return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode( + satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, + satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}}, job, - {name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, - }, logFunc) + satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, + ), logFunc) } func nvidiaStressArchivePrefix(loader string) string { diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index 29c4803..0a41525 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -278,13 +278,13 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func( if gpuCount < 1 { gpuCount = 1 } - return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{ - {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, - {name: "02-all-reduce-perf.log", cmd: []string{ + return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode( + satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, + satJob{name: "02-all-reduce-perf.log", cmd: []string{ "all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2", "-g", strconv.Itoa(gpuCount), "--iters", "20", }}, - }, logFunc) + ), logFunc) } func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { @@ -296,18 +296,18 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin if err != nil { return "", err } - return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{ - {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, - {name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}}, - { + return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode( + satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, + satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}}, + satJob{ name: "03-dcgmproftester.log", cmd: profCmd, env: nvidiaVisibleDevicesEnv(selected), collectGPU: true, gpuIndices: selected, }, - {name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, - }, logFunc) + satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, + ), logFunc) } func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { @@ -315,16 +315,16 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, if err != nil { return "", err } - return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{ - {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, - { + return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode( + satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, + satJob{ name: "02-dcgmi-targeted-power.log", cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected), collectGPU: true, gpuIndices: selected, }, - {name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, - }, logFunc) + satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, + ), logFunc) } func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { @@ -332,16 +332,16 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur if err != nil { return "", err } - return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{ - {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, - { + return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode( + satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, + satJob{ name: "02-dcgmi-pulse-test.log", cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected), collectGPU: true, gpuIndices: selected, }, - {name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, - }, logFunc) + satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, + ), logFunc) } func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) { @@ -349,16 +349,16 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu if err != nil { return "", err } - return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{ - {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, - { + return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode( + satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, + satJob{ name: "02-dcgmi-nvbandwidth.log", cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected), collectGPU: true, gpuIndices: selected, }, - {name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, - }, logFunc) + satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, + ), logFunc) } func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) { @@ -389,16 +389,16 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name)) } } - return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{ - {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, - { + return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode( + satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, + satJob{ name: "02-dcgmi-targeted-stress.log", cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected), collectGPU: true, gpuIndices: selected, }, - {name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, - }, logFunc) + satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, + ), logFunc) } func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) { @@ -568,14 +568,24 @@ type satStats struct { Unsupported int } +func withNvidiaPersistenceMode(jobs ...satJob) []satJob { + out := make([]satJob, 0, len(jobs)+1) + out = append(out, satJob{ + name: "00-nvidia-smi-persistence-mode.log", + cmd: []string{"nvidia-smi", "-pm", "1"}, + }) + out = append(out, jobs...) + return out +} + func nvidiaSATJobs() []satJob { - return []satJob{ - {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, - {name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, - {name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, - {name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}}, - {name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}}, - } + return withNvidiaPersistenceMode( + satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, + satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, + satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, + satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}}, + satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}}, + ) } func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob { @@ -590,12 +600,12 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob { } diagArgs = append(diagArgs, "-i", strings.Join(ids, ",")) } - return []satJob{ - {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, - {name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, - {name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, - {name: "04-dcgmi-diag.log", cmd: diagArgs}, - } + return withNvidiaPersistenceMode( + satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, + satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}}, + satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}}, + satJob{name: "04-dcgmi-diag.log", cmd: diagArgs}, + ) } func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string { diff --git a/audit/internal/platform/sat_test.go b/audit/internal/platform/sat_test.go index f46f311..5aab064 100644 --- a/audit/internal/platform/sat_test.go +++ b/audit/internal/platform/sat_test.go @@ -28,13 +28,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) { jobs := nvidiaSATJobs() - if len(jobs) != 5 { - t.Fatalf("jobs=%d want 5", len(jobs)) + if len(jobs) != 6 { + t.Fatalf("jobs=%d want 6", len(jobs)) } - if got := jobs[4].cmd[0]; got != "bee-gpu-burn" { + if got := jobs[0].cmd[0]; got != "nvidia-smi" { + t.Fatalf("preflight command=%q want nvidia-smi", got) + } + if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" { + t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1") + } + if got := jobs[5].cmd[0]; got != "bee-gpu-burn" { t.Fatalf("gpu stress command=%q want bee-gpu-burn", got) } - if got := jobs[3].cmd[1]; got != "--output-file" { + if got := jobs[4].cmd[1]; got != "--output-file" { t.Fatalf("bug report flag=%q want --output-file", got) } } @@ -82,7 +88,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) { func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) { jobs := nvidiaSATJobs() - got := jobs[4].cmd + got := jobs[5].cmd want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"} if len(got) != len(want) { t.Fatalf("cmd len=%d want %d", len(got), len(want)) @@ -94,6 +100,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) { } } +func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) { + jobs := nvidiaDCGMJobs(3, []int{2, 0}) + if len(jobs) != 5 { + t.Fatalf("jobs=%d want 5", len(jobs)) + } + if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" { + t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1") + } + if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" { + t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0") + } +} + func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) { t.Parallel() diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index b70f05b..a0117f9 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -8,9 +8,12 @@ import ( "os" "path/filepath" "sort" + "strconv" "strings" + "time" "bee/audit/internal/app" + "bee/audit/internal/platform" "bee/audit/internal/schema" ) @@ -161,7 +164,7 @@ func renderPage(page string, opts HandlerOptions) string { case "benchmark": pageID = "benchmark" title = "Benchmark" - body = renderBenchmark() + body = renderBenchmark(opts) case "tasks": pageID = "tasks" title = "Tasks" @@ -1072,14 +1075,14 @@ func renderValidate(opts HandlerOptions) string { inv.NVIDIA, `Select which NVIDIA GPUs to include in Validate. The same selection is used by both NVIDIA GPU cards below and by Validate one by one.`, `nvidia-smi --query-gpu=index,name,memory.total`, - `

Loading NVIDIA GPUs…

`, - )) + - renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody( - inv.NVIDIA, - `Runs NVIDIA diagnostics and board inventory checks.`, - `nvidia-smi, dmidecode, dcgmi diag`, - `Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`, + `

Loading NVIDIA GPUs…

`, )) + + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody( + inv.NVIDIA, + `Runs NVIDIA diagnostics and board inventory checks.`, + `nvidia-smi, dmidecode, dcgmi diag`, + `Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`, + )) + renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody( inv.NVIDIA, `Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`, @@ -1569,7 +1572,25 @@ func renderSATCard(id, label, runAction, headerActions, body string) string { // ── Benchmark ───────────────────────────────────────────────────────────────── -func renderBenchmark() string { +type benchmarkHistoryColumn struct { + key string + label string + name string + index int +} + +type benchmarkHistoryCell struct { + score float64 + present bool +} + +type benchmarkHistoryRun struct { + generatedAt time.Time + displayTime string + cells map[string]benchmarkHistoryCell +} + +func renderBenchmark(opts HandlerOptions) string { return `

Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in Tasks.

@@ -1618,6 +1639,8 @@ func renderBenchmark() string {
+` + renderBenchmarkResultsCard(opts.ExportDir) + ` +