Add HPL (LINPACK) benchmark as validate/stress task

HPL 2.3 from netlib, compiled against OpenBLAS with a minimal single-process MPI stub — no MPI package required in the ISO. Matrix size is auto-sized to 80% of total RAM at runtime.

Build:
- VERSIONS: HPL_VERSION=2.3, HPL_SHA256=32c5c17d…
- build-hpl.sh: downloads HPL + OpenBLAS from the Debian 12 repo, compiles xhpl with a self-contained mpi_stub.c
- build.sh: step 80-hpl, injects xhpl + libopenblas into overlay

Runtime:
- bee-hpl: generates HPL.dat (N auto from /proc/meminfo, NB=256, P=1 Q=1), runs xhpl, prints standard WR... Gflops output
- platform/hpl.go: RunHPL(), parses WR line → GFlops + PASSED/FAILED
- tasks.go: target "hpl"
- pages.go: LINPACK (HPL) card in validate/stress grid (stress-only)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 07:08:18 +03:00
parent b2f8626fee
commit 16e7ae00e7
10 changed files with 541 additions and 6 deletions
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1143,6 +1143,16 @@ func renderValidate(opts HandlerOptions) string {
 		`</div>` +
 		`</div>
 <div class="grid3" style="margin-top:16px">
+` + `<div id="sat-card-hpl">` +
+		renderSATCard("hpl", "LINPACK (HPL)", "runSAT('hpl')", "", renderValidateCardBody(
+			``,
+			`Standard High Performance LINPACK benchmark. Measures sustained FP64 GFLOPS and memory bandwidth of the CPU subsystem. Uses 80% of available RAM. Pass/fail based on HPL residual check.`,
+			`<code>xhpl</code> (HPL 2.3, OpenBLAS)`,
+			`Skipped in Validate mode. Runs in Stress mode only. Runtime scales with RAM — expect 5–30 min.<p id="sat-hpl-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`</div>
+<div class="grid3" style="margin-top:16px">
 ` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
 		inv.AMD,
 		`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
@@ -1178,6 +1188,7 @@ function satModeChanged() {
    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
    {card: 'sat-card-nvidia-interconnect',    hint: 'sat-ni-mode-hint'},
    {card: 'sat-card-nvidia-bandwidth',       hint: 'sat-nb-mode-hint'},
+    {card: 'sat-card-hpl',                    hint: 'sat-hpl-mode-hint'},
  ].forEach(function(item) {
    const card = document.getElementById(item.card);
    if (card) {
@@ -1188,7 +1199,7 @@ function satModeChanged() {
  });
 }
 function satLabels() {
-  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
+  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', hpl:'LINPACK (HPL)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
 }
 let satNvidiaGPUsPromise = null;
 function loadSatNvidiaGPUs() {
@@ -1437,8 +1448,8 @@ function runAllSAT() {
  const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';
-  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
-  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth', 'hpl'];
+  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','hpl','memory','storage','cpu'].concat(selectedAMDValidateTargets());
  const activeTargets = baseTargets.filter(target => {
    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
    const btn = document.getElementById('sat-btn-' + target);
@@ -2082,7 +2093,7 @@ func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) strin

 func renderBurn() string {
 	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
-<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics and ` + "targeted_stress" + ` remain in <a href="/validate">Validate</a>. Burn exposes official NVIDIA load recipes by test goal plus separate custom stress tools.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

 <div class="card" style="margin-bottom:16px">
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -741,8 +741,8 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	for _, needle := range []string{
 		`NVIDIA Max Compute Load`,
 		`dcgmproftester`,
-		`targeted_stress remain in <a href="/validate">Validate</a>`,
-		`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
+		`NCCL`,
+		`Validate → Stress mode`,
 		`id="burn-gpu-list"`,
 	} {
 		if !strings.Contains(body, needle) {
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -39,6 +39,7 @@ var taskNames = map[string]string{
 	"nvidia-interconnect":    "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
 	"nvidia-bandwidth":       "NVIDIA Bandwidth Test (NVBandwidth)",
 	"nvidia-stress":          "NVIDIA GPU Stress",
+	"hpl":                    "LINPACK (HPL)",
 	"memory":                 "Memory SAT",
 	"storage":                "Storage SAT",
 	"cpu":                    "CPU SAT",
@@ -739,6 +740,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
+	case "hpl":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		opts := platform.HPLOptions{
+			MemFraction: 0.80,
+			NB:          256,
+		}
+		archive, err = func() (string, error) {
+			path, _, runErr := a.RunHPL(ctx, "", opts, j.append)
+			return path, runErr
+		}()
 	case "platform-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")