diff --git a/audit/internal/platform/nvidia_stress.go b/audit/internal/platform/nvidia_stress.go
index 6ef735e..8089a8d 100644
--- a/audit/internal/platform/nvidia_stress.go
+++ b/audit/internal/platform/nvidia_stress.go
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
return "", err
}
- return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
- {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
- {name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
+ return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
+ satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+ satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
job,
- {name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
- }, logFunc)
+ satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+ ), logFunc)
}
func nvidiaStressArchivePrefix(loader string) string {
diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go
index 29c4803..0a41525 100644
--- a/audit/internal/platform/sat.go
+++ b/audit/internal/platform/sat.go
@@ -278,13 +278,13 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
if gpuCount < 1 {
gpuCount = 1
}
- return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
- {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
- {name: "02-all-reduce-perf.log", cmd: []string{
+ return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
+ satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+ satJob{name: "02-all-reduce-perf.log", cmd: []string{
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
"-g", strconv.Itoa(gpuCount), "--iters", "20",
}},
- }, logFunc)
+ ), logFunc)
}
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -296,18 +296,18 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
if err != nil {
return "", err
}
- return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
- {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
- {name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
- {
+ return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
+ satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+ satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
+ satJob{
name: "03-dcgmproftester.log",
cmd: profCmd,
env: nvidiaVisibleDevicesEnv(selected),
collectGPU: true,
gpuIndices: selected,
},
- {name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
- }, logFunc)
+ satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+ ), logFunc)
}
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -315,16 +315,16 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
if err != nil {
return "", err
}
- return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
- {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
- {
+ return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
+ satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+ satJob{
name: "02-dcgmi-targeted-power.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
- {name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
- }, logFunc)
+ satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+ ), logFunc)
}
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -332,16 +332,16 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
if err != nil {
return "", err
}
- return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
- {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
- {
+ return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
+ satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+ satJob{
name: "02-dcgmi-pulse-test.log",
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
- {name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
- }, logFunc)
+ satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+ ), logFunc)
}
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -349,16 +349,16 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
if err != nil {
return "", err
}
- return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
- {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
- {
+ return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
+ satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+ satJob{
name: "02-dcgmi-nvbandwidth.log",
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
collectGPU: true,
gpuIndices: selected,
},
- {name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
- }, logFunc)
+ satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+ ), logFunc)
}
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
@@ -389,16 +389,16 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
}
}
- return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
- {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
- {
+ return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
+ satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+ satJob{
name: "02-dcgmi-targeted-stress.log",
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
collectGPU: true,
gpuIndices: selected,
},
- {name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
- }, logFunc)
+ satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
+ ), logFunc)
}
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
@@ -568,14 +568,24 @@ type satStats struct {
Unsupported int
}
+func withNvidiaPersistenceMode(jobs ...satJob) []satJob { // withNvidiaPersistenceMode returns a new slice holding an "nvidia-smi -pm 1" job followed by jobs.
+ out := make([]satJob, 0, len(jobs)+1) // capacity for the caller's jobs plus the one prepended job
+ out = append(out, satJob{
+ name: "00-nvidia-smi-persistence-mode.log", // numbered "00-" so it is ahead of the callers' "01-…" logs
+ cmd: []string{"nvidia-smi", "-pm", "1"}, // -pm 1 asks nvidia-smi to enable persistence mode
+ })
+ out = append(out, jobs...) // caller-supplied jobs follow in their original order
+ return out
+}
+
func nvidiaSATJobs() []satJob {
- return []satJob{
- {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
- {name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
- {name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
- {name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
- {name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
- }
+ return withNvidiaPersistenceMode( // prepends the persistence-mode job, so the jobs listed here land at indices 1-5
+ satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+ satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
+ satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
+ satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
+ satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
+ )
}
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
@@ -590,12 +600,12 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
}
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
}
- return []satJob{
- {name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
- {name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
- {name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
- {name: "04-dcgmi-diag.log", cmd: diagArgs},
- }
+ return withNvidiaPersistenceMode(
+ satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
+ satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
+ satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
+ satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
+ )
}
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
diff --git a/audit/internal/platform/sat_test.go b/audit/internal/platform/sat_test.go
index f46f311..5aab064 100644
--- a/audit/internal/platform/sat_test.go
+++ b/audit/internal/platform/sat_test.go
@@ -28,13 +28,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
jobs := nvidiaSATJobs()
- if len(jobs) != 5 {
- t.Fatalf("jobs=%d want 5", len(jobs))
+ if len(jobs) != 6 {
+ t.Fatalf("jobs=%d want 6", len(jobs))
}
- if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
+ if got := jobs[0].cmd[0]; got != "nvidia-smi" {
+ t.Fatalf("preflight command=%q want nvidia-smi", got)
+ }
+ if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
+ t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
+ }
+ if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
}
- if got := jobs[3].cmd[1]; got != "--output-file" {
+ if got := jobs[4].cmd[1]; got != "--output-file" {
t.Fatalf("bug report flag=%q want --output-file", got)
}
}
@@ -82,7 +88,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
jobs := nvidiaSATJobs()
- got := jobs[4].cmd
+ got := jobs[5].cmd
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
if len(got) != len(want) {
t.Fatalf("cmd len=%d want %d", len(got), len(want))
@@ -94,6 +100,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
}
}
+func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) { // Checks that nvidiaDCGMJobs puts "nvidia-smi -pm 1" first and the dcgmi diag job last.
+ jobs := nvidiaDCGMJobs(3, []int{2, 0}) // diagLevel 3, GPU indices 2 and 0
+ if len(jobs) != 5 { // the 4 jobs listed in nvidiaDCGMJobs plus the prepended persistence-mode job
+ t.Fatalf("jobs=%d want 5", len(jobs))
+ }
+ if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
+ t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
+ }
+ if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" { // expects the indices in the order passed (2,0)
+ t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
+ }
+}
+
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
t.Parallel()
diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go
index b70f05b..a0117f9 100644
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -8,9 +8,12 @@ import (
"os"
"path/filepath"
"sort"
+ "strconv"
"strings"
+ "time"
"bee/audit/internal/app"
+ "bee/audit/internal/platform"
"bee/audit/internal/schema"
)
@@ -161,7 +164,7 @@ func renderPage(page string, opts HandlerOptions) string {
case "benchmark":
pageID = "benchmark"
title = "Benchmark"
- body = renderBenchmark()
+ body = renderBenchmark(opts)
case "tasks":
pageID = "tasks"
title = "Tasks"
@@ -1072,14 +1075,14 @@ func renderValidate(opts HandlerOptions) string {
inv.NVIDIA,
`Select which NVIDIA GPUs to include in Validate. The same selection is used by both NVIDIA GPU cards below and by Validate one by one.`,
`nvidia-smi --query-gpu=index,name,memory.total`,
- `
Loading NVIDIA GPUs…
nvidia-smi, dmidecode, dcgmi diag`,
- `Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
+ `Loading NVIDIA GPUs…
nvidia-smi, dmidecode, dcgmi diag`,
+ `Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
+ )) +
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
inv.NVIDIA,
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
@@ -1569,7 +1572,25 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
// ── Benchmark ─────────────────────────────────────────────────────────────────
-func renderBenchmark() string {
+type benchmarkHistoryColumn struct { // NOTE(review): presumably one column of a benchmark history table; no use is visible here — confirm.
+ key string // NOTE(review): possibly the lookup key into benchmarkHistoryRun.cells — confirm
+ label string
+ name string
+ index int
+}
+
+type benchmarkHistoryCell struct { // benchmarkHistoryCell is the value type of benchmarkHistoryRun.cells.
+ score float64
+ present bool // NOTE(review): presumably distinguishes a missing score from a zero score — confirm
+}
+
+type benchmarkHistoryRun struct { // NOTE(review): presumably one benchmark run shown as a history row — confirm against renderBenchmark.
+ generatedAt time.Time
+ displayTime string // NOTE(review): presumably generatedAt pre-formatted for display — confirm
+ cells map[string]benchmarkHistoryCell // NOTE(review): string-keyed; presumably by benchmarkHistoryColumn.key — confirm
+}
+
+func renderBenchmark(opts HandlerOptions) string {
return `Benchmark runs generate a human-readable TXT report and machine-readable result bundle. Tasks continue in the background — view progress in Tasks.