Add HPL (LINPACK) benchmark as validate/stress task

HPL 2.3 from netlib, compiled against OpenBLAS with a minimal single-process MPI stub — no MPI package is required in the ISO. The matrix size is auto-sized to 80% of total RAM at runtime.

Build:
- VERSIONS: HPL_VERSION=2.3, HPL_SHA256=32c5c17d…
- build-hpl.sh: downloads HPL + OpenBLAS from Debian 12 repo, compiles xhpl with a self-contained mpi_stub.c
- build.sh: step 80-hpl, injects xhpl + libopenblas into the overlay

Runtime:
- bee-hpl: generates HPL.dat (N auto from /proc/meminfo, NB=256, P=1 Q=1), runs xhpl, prints the standard WR... Gflops output
- platform/hpl.go: RunHPL(), parses the WR line → GFlops + PASSED/FAILED
- tasks.go: target "hpl"
- pages.go: LINPACK (HPL) card in the validate/stress grid (stress-only)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 07:08:18 +03:00
parent b2f8626fee
commit 16e7ae00e7
10 changed files with 541 additions and 6 deletions
--- a/audit/internal/app/app.go
+++ b/audit/internal/app/app.go
@@ -139,6 +139,7 @@ type satRunner interface {
 	RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
 	RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
 	RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
+	RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error)
 }

 type runtimeChecker interface {
@@ -737,6 +738,13 @@ func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
 	return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
 }

+// RunHPL hands an HPL (LINPACK) benchmark request to the configured SAT
+// runner and passes its three return values back untouched. Calling it on a
+// nil *App yields an "app not configured" error.
+func (a *App) RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error) {
+	if a != nil {
+		return a.sat.RunHPL(ctx, baseDir, opts, logFunc)
+	}
+	return "", nil, fmt.Errorf("app not configured")
+}
+
 func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
 	path, err := a.RunFanStressTest(ctx, "", opts)
 	body := formatFanStressResult(path)
--- a/audit/internal/app/app_test.go
+++ b/audit/internal/app/app_test.go
@@ -282,6 +282,9 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
 func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
 	return "", nil
 }
+func (f fakeSAT) RunHPL(_ context.Context, _ string, _ platform.HPLOptions, _ func(string)) (string, *platform.HPLResult, error) {
+	return "", nil, nil
+}

 func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
 	t.Parallel()
--- a/audit/internal/platform/hpl.go
+++ b/audit/internal/platform/hpl.go
@@ -0,0 +1,142 @@
+package platform
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+)
+
+// HPLOptions carries the tunables for one HPL (LINPACK) benchmark run.
+// Unset or out-of-range values are replaced by applyHPLDefaults.
+type HPLOptions struct {
+	// MemFraction is the fraction of RAM to use (default 0.80).
+	MemFraction float64
+	// NB is the block size (default 256).
+	NB int
+}
+
+// HPLResult is the structured form of one HPL run, as extracted from the
+// benchmark's text output.
+type HPLResult struct {
+	// N is the matrix dimension.
+	N int
+	// NB is the block size.
+	NB int
+	// P is the number of process grid rows.
+	P int
+	// Q is the number of process grid columns.
+	Q int
+	// TimeSec is the wall time in seconds.
+	TimeSec float64
+	// GFlops is the achieved performance.
+	GFlops float64
+	// Residual is the backward error residual taken from the HPL
+	// verification line.
+	Residual float64
+	// Status is "PASSED" or "FAILED".
+	Status string
+	// RawOutput is the full captured benchmark output.
+	RawOutput string
+}
+
+// applyHPLDefaults replaces unset or invalid options in place.
+//
+// MemFraction must lie in (0, 1]; any other value — including NaN, which
+// fails every ordered comparison and would otherwise slip through — is reset
+// to 0.80. A non-positive NB is reset to 256. opts must be non-nil.
+func applyHPLDefaults(opts *HPLOptions) {
+	// Written as a negated range test (rather than "<= 0 || > 1") so that NaN
+	// is rejected too.
+	if !(opts.MemFraction > 0 && opts.MemFraction <= 1) {
+		opts.MemFraction = 0.80
+	}
+	if opts.NB <= 0 {
+		opts.NB = 256
+	}
+}
+
+// RunHPL runs bee-hpl and returns an artifact path, the parsed result, and an
+// error.
+//
+// Steps:
+//  1. Defaults are applied to opts and a run directory "hpl-<UTC timestamp>"
+//     is created under baseDir ("/var/log/bee-sat" when baseDir is empty).
+//  2. bee-hpl is started through runSATCommandCtx with --mem-fraction and --nb.
+//  3. The captured output is written to hpl.log and parsed with parseHPLOutput;
+//     a one-line summary is written to summary.txt.
+//  4. The run directory is archived as "hpl-<timestamp>.tar.gz" next to it.
+//
+// Return values:
+//   - The path is the archive, or the run directory if archiving failed, or ""
+//     when the command failed or produced no Gflops figure.
+//   - The result is non-nil whenever the command was started.
+//   - The error is non-nil when the command failed, when no Gflops line was
+//     found, when the residual check did not report PASSED, or when the run
+//     was canceled (the cancellation error is passed through so partial
+//     artifacts are still archived).
+//
+// logFunc may be nil.
+func (s *System) RunHPL(ctx context.Context, baseDir string, opts HPLOptions, logFunc func(string)) (string, *HPLResult, error) {
+	applyHPLDefaults(&opts)
+
+	// logf tolerates a nil logFunc so the call sites below stay unconditional.
+	logf := func(format string, args ...interface{}) {
+		if logFunc != nil {
+			logFunc(fmt.Sprintf(format, args...))
+		}
+	}
+
+	if baseDir == "" {
+		baseDir = "/var/log/bee-sat"
+	}
+	ts := time.Now().UTC().Format("20060102-150405")
+	runDir := filepath.Join(baseDir, "hpl-"+ts)
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		return "", nil, fmt.Errorf("mkdir %s: %w", runDir, err)
+	}
+
+	cmd := []string{
+		"bee-hpl",
+		"--mem-fraction", strconv.FormatFloat(opts.MemFraction, 'f', 2, 64),
+		"--nb", strconv.Itoa(opts.NB),
+	}
+
+	logf("HPL: N will be auto-sized to %.0f%% of RAM, NB=%d", opts.MemFraction*100, opts.NB)
+
+	out, err := runSATCommandCtx(ctx, "", "hpl", cmd, nil, logFunc)
+	// Persisting artifacts is best-effort: report a failure, but never let it
+	// mask the benchmark outcome.
+	if werr := os.WriteFile(filepath.Join(runDir, "hpl.log"), out, 0644); werr != nil {
+		logf("HPL: could not write hpl.log: %v", werr)
+	}
+
+	result := parseHPLOutput(string(out))
+	result.RawOutput = string(out)
+
+	// errors.Is also recognises a cancellation that was wrapped on the way up.
+	if err != nil && !errors.Is(err, context.Canceled) {
+		return "", result, fmt.Errorf("bee-hpl failed: %w", err)
+	}
+	if err == nil && result.GFlops <= 0 {
+		return "", result, fmt.Errorf("HPL completed but no Gflops result found in output")
+	}
+
+	// Write summary
+	summary := fmt.Sprintf("N=%d NB=%d time=%.2fs gflops=%.3f status=%s\n",
+		result.N, result.NB, result.TimeSec, result.GFlops, result.Status)
+	if werr := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); werr != nil {
+		logf("HPL: could not write summary.txt: %v", werr)
+	}
+
+	logf("HPL result: N=%d NB=%d %.2fs %.3f Gflops %s",
+		result.N, result.NB, result.TimeSec, result.GFlops, result.Status)
+
+	// A run that produced a Gflops figure but did not pass the residual check
+	// is a failed run. The error is set only here so the artifacts above and
+	// the archive below are still produced for inspection.
+	if err == nil && result.Status != "PASSED" {
+		err = fmt.Errorf("HPL residual check did not pass (status %s)", result.Status)
+	}
+
+	// Reuse the run timestamp so the archive name matches the directory it
+	// was built from.
+	archive := filepath.Join(baseDir, "hpl-"+ts+".tar.gz")
+	if archErr := createTarGz(archive, runDir); archErr != nil {
+		logf("HPL: could not create %s: %v; keeping %s", archive, archErr, runDir)
+		return runDir, result, err
+	}
+	return archive, result, err
+}
+
+// parseHPLOutput extracts N, NB, P, Q, time, Gflops, the residual, and the
+// pass/fail status from standard HPL output.
+//
+// HPL prints a header row followed by a result row, and later a verification
+// row, of the form:
+//
+//	T/V                N    NB     P     Q               Time                 Gflops
+//	WR00L2L2       45312   256     1     1            1234.56              5.678e+01
+//	||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=   0.0033665 ...... PASSED
+//
+// Rules:
+//   - A "WR…" row is accepted only if all six numeric columns parse, so an
+//     unrelated line that happens to start with "WR" cannot wipe good values.
+//     When several rows are present, the last valid one wins.
+//   - The residual is the nearest number to the left of a PASSED or FAILED
+//     token. The token directly before it is the "......" filler, so that
+//     filler is skipped; a value glued to the preceding "=" is handled too.
+//   - Status is "PASSED" only if some line contains PASSED and no line carries
+//     a FAILED token; otherwise it is "FAILED".
+//
+// The returned pointer is never nil. RawOutput is left empty for the caller.
+func parseHPLOutput(output string) *HPLResult {
+	result := &HPLResult{Status: "FAILED"}
+	sawPassed, sawFailed := false, false
+
+	for _, line := range strings.Split(output, "\n") {
+		fields := strings.Fields(line)
+		if len(fields) == 0 {
+			continue
+		}
+
+		// Result row: WR00L2L2  N  NB  P  Q  Time  Gflops
+		if len(fields) >= 7 && strings.HasPrefix(fields[0], "WR") {
+			n, errN := strconv.Atoi(fields[1])
+			nb, errNB := strconv.Atoi(fields[2])
+			p, errP := strconv.Atoi(fields[3])
+			q, errQ := strconv.Atoi(fields[4])
+			secs, errT := strconv.ParseFloat(fields[5], 64)
+			gflops, errG := strconv.ParseFloat(fields[6], 64)
+			if errN == nil && errNB == nil && errP == nil && errQ == nil && errT == nil && errG == nil {
+				result.N, result.NB, result.P, result.Q = n, nb, p, q
+				result.TimeSec, result.GFlops = secs, gflops
+			}
+		}
+
+		if strings.Contains(line, "PASSED") {
+			sawPassed = true
+		}
+
+		// Verification row: "...*N)=   <residual> ...... PASSED|FAILED"
+		for i, f := range fields {
+			if f != "PASSED" && f != "FAILED" {
+				continue
+			}
+			if f == "FAILED" {
+				sawFailed = true
+			}
+			// Walk left past the "......" filler to the nearest number.
+			for j := i - 1; j >= 0; j-- {
+				tok := fields[j]
+				if k := strings.LastIndex(tok, "="); k >= 0 {
+					tok = tok[k+1:]
+				}
+				if v, err := strconv.ParseFloat(tok, 64); err == nil {
+					result.Residual = v
+					break
+				}
+			}
+		}
+	}
+
+	if sawPassed && !sawFailed {
+		result.Status = "PASSED"
+	}
+	return result
+}
+
+// hplAvailable reports whether the HPL toolchain can be run: the bee-hpl
+// wrapper must resolve via exec.LookPath, and /usr/local/lib/bee/xhpl must be
+// a regular file with at least one execute permission bit set. A bare
+// existence check is not enough, because a directory or a non-executable file
+// at that path would otherwise be reported as available.
+func hplAvailable() bool {
+	if _, err := exec.LookPath("bee-hpl"); err != nil {
+		return false
+	}
+	info, err := os.Stat("/usr/local/lib/bee/xhpl")
+	if err != nil {
+		return false
+	}
+	mode := info.Mode()
+	return mode.IsRegular() && mode.Perm()&0111 != 0
+}
--- a/audit/internal/webui/pages.go
+++ b/audit/internal/webui/pages.go
@@ -1143,6 +1143,16 @@ func renderValidate(opts HandlerOptions) string {
 		`</div>` +
 		`</div>
 <div class="grid3" style="margin-top:16px">
+` + `<div id="sat-card-hpl">` +
+		renderSATCard("hpl", "LINPACK (HPL)", "runSAT('hpl')", "", renderValidateCardBody(
+			``,
+			`Standard High Performance LINPACK benchmark. Measures sustained FP64 GFLOPS and memory bandwidth of the CPU subsystem. Uses 80% of available RAM. Pass/fail based on HPL residual check.`,
+			`<code>xhpl</code> (HPL 2.3, OpenBLAS)`,
+			`Skipped in Validate mode. Runs in Stress mode only. Runtime scales with RAM — expect 5–30 min.<p id="sat-hpl-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
+		)) +
+		`</div>` +
+		`</div>
+<div class="grid3" style="margin-top:16px">
 ` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
 		inv.AMD,
 		`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
@@ -1178,6 +1188,7 @@ function satModeChanged() {
    {card: 'sat-card-nvidia-pulse',           hint: 'sat-pt-mode-hint'},
    {card: 'sat-card-nvidia-interconnect',    hint: 'sat-ni-mode-hint'},
    {card: 'sat-card-nvidia-bandwidth',       hint: 'sat-nb-mode-hint'},
+    {card: 'sat-card-hpl',                    hint: 'sat-hpl-mode-hint'},
  ].forEach(function(item) {
    const card = document.getElementById(item.card);
    if (card) {
@@ -1188,7 +1199,7 @@ function satModeChanged() {
  });
 }
 function satLabels() {
-  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
+  return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', hpl:'LINPACK (HPL)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
 }
 let satNvidiaGPUsPromise = null;
 function loadSatNvidiaGPUs() {
@@ -1437,8 +1448,8 @@ function runAllSAT() {
  const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';
-  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
-  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
+  const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth', 'hpl'];
+  const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','hpl','memory','storage','cpu'].concat(selectedAMDValidateTargets());
  const activeTargets = baseTargets.filter(target => {
    if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
    const btn = document.getElementById('sat-btn-' + target);
@@ -2082,7 +2093,7 @@ func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) strin

 func renderBurn() string {
 	return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
-<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics and ` + "targeted_stress" + ` remain in <a href="/validate">Validate</a>. Burn exposes official NVIDIA load recipes by test goal plus separate custom stress tools.</div>
+<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
 <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>

 <div class="card" style="margin-bottom:16px">
--- a/audit/internal/webui/server_test.go
+++ b/audit/internal/webui/server_test.go
@@ -741,8 +741,8 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
 	for _, needle := range []string{
 		`NVIDIA Max Compute Load`,
 		`dcgmproftester`,
-		`targeted_stress remain in <a href="/validate">Validate</a>`,
-		`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
+		`NCCL`,
+		`Validate → Stress mode`,
 		`id="burn-gpu-list"`,
 	} {
 		if !strings.Contains(body, needle) {
--- a/audit/internal/webui/tasks.go
+++ b/audit/internal/webui/tasks.go
@@ -39,6 +39,7 @@ var taskNames = map[string]string{
 	"nvidia-interconnect":    "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
 	"nvidia-bandwidth":       "NVIDIA Bandwidth Test (NVBandwidth)",
 	"nvidia-stress":          "NVIDIA GPU Stress",
+	"hpl":                    "LINPACK (HPL)",
 	"memory":                 "Memory SAT",
 	"storage":                "Storage SAT",
 	"cpu":                    "CPU SAT",
@@ -739,6 +740,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
 		}
 		archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
+	case "hpl":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		opts := platform.HPLOptions{
+			MemFraction: 0.80,
+			NB:          256,
+		}
+		archive, err = func() (string, error) {
+			path, _, runErr := a.RunHPL(ctx, "", opts, j.append)
+			return path, runErr
+		}()
 	case "platform-stress":
 		if a == nil {
 			err = fmt.Errorf("app not configured")