Remove HPL from build and audit flows

This commit is contained in:
Mikhail Chusavitin
2026-04-08 10:00:23 +03:00
parent 13899aa864
commit e0d94d7f47
9 changed files with 1 additions and 639 deletions

View File

@@ -139,7 +139,6 @@ type satRunner interface {
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error)
}
type runtimeChecker interface {
@@ -738,13 +737,6 @@ func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
}
// RunHPL forwards an HPL (LINPACK) run request to the configured SAT runner.
//
// It returns whatever the runner returns: an artifact path, the parsed
// result, and an error. A nil receiver yields an "app not configured" error
// instead of a panic.
func (a *App) RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error) {
	if a == nil {
		notConfigured := fmt.Errorf("app not configured")
		return "", nil, notConfigured
	}
	artifact, result, runErr := a.sat.RunHPL(ctx, baseDir, opts, logFunc)
	return artifact, result, runErr
}
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
path, err := a.RunFanStressTest(ctx, "", opts)
body := formatFanStressResult(path)

View File

@@ -282,9 +282,6 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
// RunNCCLTests is a no-op test double: it ignores every argument and reports
// success with an empty artifact path.
func (f fakeSAT) RunNCCLTests(context.Context, string, func(string)) (string, error) {
	return "", nil
}
// RunHPL is a no-op test double: it ignores every argument and reports
// success with an empty artifact path and a nil result.
func (f fakeSAT) RunHPL(context.Context, string, platform.HPLOptions, func(string)) (string, *platform.HPLResult, error) {
	return "", nil, nil
}
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
t.Parallel()

View File

@@ -1,142 +0,0 @@
package platform
import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"time"
)
// HPLOptions configures the HPL (LINPACK) benchmark run.
//
// Unset or out-of-range fields are replaced with defaults by
// applyHPLDefaults before the benchmark is started.
type HPLOptions struct {
	// MemFraction is the fraction of RAM to use. Values that are not in
	// (0, 1] are replaced with the default of 0.80.
	MemFraction float64
	// NB is the block size. Non-positive values are replaced with the
	// default of 256.
	NB int
}
// HPLResult holds the parsed result of an HPL run.
//
// parseHPLOutput fills N, NB, P, Q, TimeSec and GFlops from the "WR..."
// result line and Status/Residual from the verification line; RunHPL then
// stores the captured command output in RawOutput.
type HPLResult struct {
	N         int     // matrix dimension
	NB        int     // block size
	P         int     // process grid rows
	Q         int     // process grid cols
	TimeSec   float64 // wall time in seconds
	GFlops    float64 // achieved performance
	Residual  float64 // backward error residual (from HPL verification line)
	Status    string  // "PASSED" or "FAILED"; parseHPLOutput starts from "FAILED"
	RawOutput string  // full captured output of the benchmark command
}
// applyHPLDefaults replaces unset or invalid option values in place.
//
// MemFraction falls back to 0.80 unless it lies in (0, 1]; NB falls back to
// 256 unless it is positive. A nil opts is ignored.
func applyHPLDefaults(opts *HPLOptions) {
	if opts == nil {
		return
	}
	// Expressed as a negated in-range test so that NaN, which fails every
	// ordered comparison, is also replaced by the default.
	if !(opts.MemFraction > 0 && opts.MemFraction <= 1) {
		opts.MemFraction = 0.80
	}
	if opts.NB <= 0 {
		opts.NB = 256
	}
}
// RunHPL runs bee-hpl and returns an artifact path, the parsed result, and
// an error.
//
// baseDir is the directory under which a per-run "hpl-<timestamp>" directory
// is created; it defaults to /var/log/bee-sat when empty. opts is normalised
// with applyHPLDefaults. logFunc, when non-nil, receives progress messages.
//
// Returned path:
//   - "" when the run directory cannot be created, when bee-hpl fails with an
//     error other than context.Canceled, or when no Gflops value was parsed;
//   - the run directory when the archive could not be created;
//   - the tar.gz archive otherwise.
//
// The error is non-nil when bee-hpl fails, when no Gflops value is found, or
// when the run completed but the HPL residual check did not report PASSED.
// On cancellation the partial artifacts are still written and the
// cancellation error is returned alongside them.
func (s *System) RunHPL(ctx context.Context, baseDir string, opts HPLOptions, logFunc func(string)) (string, *HPLResult, error) {
	applyHPLDefaults(&opts)
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}

	// logf tolerates a nil logFunc so call sites need no guard.
	logf := func(format string, args ...interface{}) {
		if logFunc != nil {
			logFunc(fmt.Sprintf(format, args...))
		}
	}

	ts := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "hpl-"+ts)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", nil, fmt.Errorf("mkdir %s: %w", runDir, err)
	}
	logPath := filepath.Join(runDir, "hpl.log")
	cmd := []string{
		"bee-hpl",
		"--mem-fraction", strconv.FormatFloat(opts.MemFraction, 'f', 2, 64),
		"--nb", strconv.Itoa(opts.NB),
	}
	logf("HPL: N will be auto-sized to %.0f%% of RAM, NB=%d", opts.MemFraction*100, opts.NB)

	out, err := runSATCommandCtx(ctx, "", "hpl", cmd, nil, logFunc)
	// Persisting the raw log is best-effort: a write failure is reported
	// but must not hide the benchmark outcome.
	if writeErr := os.WriteFile(logPath, out, 0644); writeErr != nil {
		logf("HPL: write %s: %v", logPath, writeErr)
	}
	result := parseHPLOutput(string(out))
	result.RawOutput = string(out)

	// NOTE(review): direct comparison assumes runSATCommandCtx returns
	// context.Canceled unwrapped — confirm against that helper.
	if err != nil && err != context.Canceled {
		return "", result, fmt.Errorf("bee-hpl failed: %w", err)
	}
	if err == nil && result.GFlops <= 0 {
		return "", result, fmt.Errorf("HPL completed but no Gflops result found in output")
	}

	// Write summary
	summary := fmt.Sprintf("N=%d NB=%d time=%.2fs gflops=%.3f status=%s\n",
		result.N, result.NB, result.TimeSec, result.GFlops, result.Status)
	summaryPath := filepath.Join(runDir, "summary.txt")
	if writeErr := os.WriteFile(summaryPath, []byte(summary), 0644); writeErr != nil {
		logf("HPL: write %s: %v", summaryPath, writeErr)
	}
	logf("HPL result: N=%d NB=%d %.2fs %.3f Gflops %s",
		result.N, result.NB, result.TimeSec, result.GFlops, result.Status)

	// A completed run whose residual verification did not pass is a test
	// failure; surface it as an error while still returning the artifacts.
	if err == nil && result.Status != "PASSED" {
		err = fmt.Errorf("HPL residual check did not pass (status=%s)", result.Status)
	}

	ts2 := time.Now().UTC().Format("20060102-150405")
	archive := filepath.Join(baseDir, "hpl-"+ts2+".tar.gz")
	if archErr := createTarGz(archive, runDir); archErr != nil {
		// Fall back to the uncompressed run directory, but say why.
		logf("HPL: create archive %s: %v", archive, archErr)
		return runDir, result, err
	}
	return archive, result, err
}
// parseHPLOutput extracts N, NB, P, Q, time, Gflops, the residual and the
// pass/fail verdict from standard HPL output.
//
// HPL prints a result line of the form:
//
//	WR00L2L2       45312   256     1     1    1234.56    5.678e+01
//	T/V                N    NB     P     Q       Time       Gflops
//
// followed by a verification line of the form:
//
//	||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)=   0.0051555 ...... PASSED
//
// The returned result is never nil. Status is "PASSED" only when at least
// one line reports PASSED and no line carries a FAILED verdict; otherwise it
// is "FAILED". When several result lines are present the last well-formed
// one wins.
func parseHPLOutput(output string) *HPLResult {
	result := &HPLResult{Status: "FAILED"}
	sawPassed, sawFailed := false, false

	for _, line := range strings.Split(output, "\n") {
		fields := strings.Fields(line)
		if len(fields) == 0 {
			continue
		}

		// Result line: WR00L2L2 N NB P Q Time Gflops
		if strings.HasPrefix(fields[0], "WR") && len(fields) >= 7 {
			applyHPLResultFields(result, fields)
		}

		if strings.Contains(line, "PASSED") {
			sawPassed = true
		}
		for i, f := range fields {
			switch f {
			case "FAILED":
				sawFailed = true
			case "PASSED":
			default:
				continue
			}
			if v, ok := hplResidualBefore(fields[:i]); ok {
				result.Residual = v
			}
		}
	}

	if sawPassed && !sawFailed {
		result.Status = "PASSED"
	}
	return result
}

// applyHPLResultFields copies N, NB, P, Q, Time and Gflops from a split
// result line into result. Nothing is written unless every column parses, so
// an unrelated line that merely starts with "WR" cannot wipe earlier values.
func applyHPLResultFields(result *HPLResult, fields []string) {
	n, errN := strconv.Atoi(fields[1])
	nb, errNB := strconv.Atoi(fields[2])
	p, errP := strconv.Atoi(fields[3])
	q, errQ := strconv.Atoi(fields[4])
	secs, errT := strconv.ParseFloat(fields[5], 64)
	gflops, errG := strconv.ParseFloat(fields[6], 64)
	if errN != nil || errNB != nil || errP != nil || errQ != nil || errT != nil || errG != nil {
		return
	}
	result.N, result.NB, result.P, result.Q = n, nb, p, q
	result.TimeSec, result.GFlops = secs, gflops
}

// hplResidualBefore returns the numeric value that precedes a verdict token.
// HPL separates the residual from the verdict with a run of dots, so
// dot-only tokens are skipped; text up to a trailing '=' is stripped in case
// the number is fused with the formula.
func hplResidualBefore(fields []string) (float64, bool) {
	for j := len(fields) - 1; j >= 0; j-- {
		tok := fields[j]
		if strings.Trim(tok, ".") == "" {
			continue
		}
		if eq := strings.LastIndex(tok, "="); eq >= 0 {
			tok = tok[eq+1:]
		}
		v, err := strconv.ParseFloat(tok, 64)
		return v, err == nil
	}
	return 0, false
}
// hplAvailable returns true if bee-hpl and xhpl are present and executable.
//
// bee-hpl is resolved through PATH; xhpl is expected at the fixed location
// /usr/local/lib/bee/xhpl and must be a regular file (after following
// symlinks) with at least one execute permission bit set.
func hplAvailable() bool {
	if _, err := exec.LookPath("bee-hpl"); err != nil {
		return false
	}
	info, err := os.Stat("/usr/local/lib/bee/xhpl")
	if err != nil {
		return false
	}
	// Existence alone is not enough: a directory or a file without the
	// execute bit cannot be launched.
	mode := info.Mode()
	return mode.IsRegular() && mode.Perm()&0111 != 0
}

View File

@@ -1143,16 +1143,6 @@ func renderValidate(opts HandlerOptions) string {
`</div>` +
`</div>
<div class="grid3" style="margin-top:16px">
` + `<div id="sat-card-hpl">` +
renderSATCard("hpl", "LINPACK (HPL)", "runSAT('hpl')", "", renderValidateCardBody(
``,
`Standard High Performance LINPACK benchmark. Measures sustained FP64 GFLOPS and memory bandwidth of the CPU subsystem. Uses 80% of available RAM. Pass/fail based on HPL residual check.`,
`<code>xhpl</code> (HPL 2.3, OpenBLAS)`,
`Skipped in Validate mode. Runs in Stress mode only. Runtime scales with RAM — expect 530 min.<p id="sat-hpl-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
)) +
`</div>` +
`</div>
<div class="grid3" style="margin-top:16px">
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
inv.AMD,
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
@@ -1188,7 +1178,6 @@ function satModeChanged() {
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
{card: 'sat-card-hpl', hint: 'sat-hpl-mode-hint'},
].forEach(function(item) {
const card = document.getElementById(item.card);
if (card) {
@@ -1199,7 +1188,7 @@ function satModeChanged() {
});
}
function satLabels() {
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', hpl:'LINPACK (HPL)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
}
let satNvidiaGPUsPromise = null;
function loadSatNvidiaGPUs() {

View File

@@ -39,7 +39,6 @@ var taskNames = map[string]string{
"nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
"nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
"nvidia-stress": "NVIDIA GPU Stress",
"hpl": "LINPACK (HPL)",
"memory": "Memory SAT",
"storage": "Storage SAT",
"cpu": "CPU SAT",
@@ -740,19 +739,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
case "hpl":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
opts := platform.HPLOptions{
MemFraction: 0.80,
NB: 256,
}
archive, err = func() (string, error) {
path, _, runErr := a.RunHPL(ctx, "", opts, j.append)
return path, runErr
}()
case "platform-stress":
if a == nil {
err = fmt.Errorf("app not configured")