Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f | ||
| 025548ab3c | |||
|
|
e0d94d7f47 | ||
|
|
13899aa864 | ||
|
|
f345d8a89d | ||
|
|
4715059ac0 | ||
|
|
0660a40287 | ||
|
|
67369d9b7b |
@@ -117,7 +117,7 @@ type satRunner interface {
|
|||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
@@ -139,7 +139,6 @@ type satRunner interface {
|
|||||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type runtimeChecker interface {
|
type runtimeChecker interface {
|
||||||
@@ -567,11 +566,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
|||||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
if strings.TrimSpace(baseDir) == "" {
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
baseDir = DefaultSATBaseDir
|
baseDir = DefaultSATBaseDir
|
||||||
}
|
}
|
||||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
@@ -738,13 +737,6 @@ func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
|||||||
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (a *App) RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error) {
|
|
||||||
if a == nil {
|
|
||||||
return "", nil, fmt.Errorf("app not configured")
|
|
||||||
}
|
|
||||||
return a.sat.RunHPL(ctx, baseDir, opts, logFunc)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||||
body := formatFanStressResult(path)
|
body := formatFanStressResult(path)
|
||||||
|
|||||||
@@ -161,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||||
if f.runNvidiaComputeFn != nil {
|
if f.runNvidiaComputeFn != nil {
|
||||||
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||||
}
|
}
|
||||||
@@ -282,9 +282,6 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
|||||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
func (f fakeSAT) RunHPL(_ context.Context, _ string, _ platform.HPLOptions, _ func(string)) (string, *platform.HPLResult, error) {
|
|
||||||
return "", nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
@@ -545,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -583,8 +578,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
tmp := t.TempDir()
|
tmp := t.TempDir()
|
||||||
oldExportDir := DefaultExportDir
|
oldExportDir := DefaultExportDir
|
||||||
DefaultExportDir = tmp
|
DefaultExportDir = tmp
|
||||||
@@ -646,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||||
t.Parallel()
|
|
||||||
|
|
||||||
oldSATBaseDir := DefaultSATBaseDir
|
oldSATBaseDir := DefaultSATBaseDir
|
||||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||||
|
|||||||
@@ -54,7 +54,7 @@ if ! command -v lspci >/dev/null 2>&1; then
|
|||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
found=0
|
found=0
|
||||||
for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
|
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||||
found=1
|
found=1
|
||||||
echo "=== GPU $gpu ==="
|
echo "=== GPU $gpu ==="
|
||||||
lspci -s "$gpu" -vv 2>&1 || true
|
lspci -s "$gpu" -vv 2>&1 || true
|
||||||
@@ -74,6 +74,11 @@ fi
|
|||||||
for d in /sys/bus/pci/devices/*/; do
|
for d in /sys/bus/pci/devices/*/; do
|
||||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||||
[ "$vendor" = "0x10de" ] || continue
|
[ "$vendor" = "0x10de" ] || continue
|
||||||
|
class=$(cat "$d/class" 2>/dev/null)
|
||||||
|
case "$class" in
|
||||||
|
0x030000|0x030200) ;;
|
||||||
|
*) continue ;;
|
||||||
|
esac
|
||||||
dev=$(basename "$d")
|
dev=$(basename "$d")
|
||||||
echo "=== $dev ==="
|
echo "=== $dev ==="
|
||||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||||
@@ -192,7 +197,7 @@ var supportBundleOptionalFiles = []struct {
|
|||||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||||
}
|
}
|
||||||
|
|
||||||
const supportBundleGlob = "bee-support-*.tar.gz"
|
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||||
|
|
||||||
func BuildSupportBundle(exportDir string) (string, error) {
|
func BuildSupportBundle(exportDir string) (string, error) {
|
||||||
exportDir = strings.TrimSpace(exportDir)
|
exportDir = strings.TrimSpace(exportDir)
|
||||||
@@ -206,9 +211,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
host := sanitizeFilename(hostnameOr("unknown"))
|
now := time.Now().UTC()
|
||||||
ts := time.Now().UTC().Format("20060102-150405")
|
date := now.Format("2006-01-02")
|
||||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
|
tod := now.Format("15:04:05")
|
||||||
|
ver := bundleVersion()
|
||||||
|
model := serverModelForBundle()
|
||||||
|
sn := serverSerialForBundle()
|
||||||
|
|
||||||
|
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -240,7 +250,8 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
|||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
|
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
||||||
|
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
@@ -397,6 +408,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
|||||||
return os.WriteFile(dst, []byte(body.String()), 0644)
|
return os.WriteFile(dst, []byte(body.String()), 0644)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func bundleVersion() string {
|
||||||
|
v := buildVersion()
|
||||||
|
v = strings.TrimPrefix(v, "v")
|
||||||
|
v = strings.TrimPrefix(v, "V")
|
||||||
|
if v == "" || v == "unknown" {
|
||||||
|
return "0.0"
|
||||||
|
}
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
|
||||||
|
// serverModelForBundle returns the "Product Name" reported by
// `dmidecode -t 1`, sanitized for use as one field of the support-bundle
// archive name. It returns "unknown" when dmidecode fails or the field is
// missing or empty.
func serverModelForBundle() string {
	return dmiSystemFieldForBundle("Product Name")
}

// serverSerialForBundle returns the "Serial Number" reported by
// `dmidecode -t 1`, sanitized the same way as the model. It returns "unknown"
// when dmidecode fails or the field is missing or empty.
func serverSerialForBundle() string {
	return dmiSystemFieldForBundle("Serial Number")
}

// dmiSystemFieldForBundle runs `dmidecode -t 1`, looks up key in its output
// and returns the sanitized value, or "unknown" when nothing usable is found.
func dmiSystemFieldForBundle(key string) string {
	raw, err := exec.Command("dmidecode", "-t", "1").Output()
	if err != nil {
		return "unknown"
	}
	val := parseDMIField(string(raw), key)
	if val == "" {
		return "unknown"
	}
	return sanitizeBundleField(val)
}

// parseDMIField returns the trimmed value of the first "key: value" line in
// output whose key equals key, or "" when there is no such line or its value
// is blank.
func parseDMIField(output, key string) string {
	for _, line := range strings.Split(output, "\n") {
		k, v, ok := strings.Cut(strings.TrimSpace(line), ": ")
		if !ok {
			continue
		}
		if strings.TrimSpace(k) == key {
			return strings.TrimSpace(v)
		}
	}
	return ""
}

// sanitizeBundleField makes val safe to embed in the archive file name.
// Path separators would turn the name into a path, and spaces would make the
// space-separated name fields ambiguous, so those (and control characters)
// are replaced with "_".
func sanitizeBundleField(val string) string {
	return strings.Map(func(r rune) rune {
		switch {
		case r == '/', r == '\\', r == ' ', r < 0x20, r == 0x7f:
			return '_'
		}
		return r
	}, val)
}
|
||||||
|
|
||||||
func buildVersion() string {
|
func buildVersion() string {
|
||||||
raw, err := exec.Command("bee", "version").CombinedOutput()
|
raw, err := exec.Command("bee", "version").CombinedOutput()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -383,10 +383,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
ansiRed = "\033[31m"
|
ansiAmber = "\033[38;5;214m"
|
||||||
ansiBlue = "\033[34m"
|
|
||||||
ansiGreen = "\033[32m"
|
|
||||||
ansiYellow = "\033[33m"
|
|
||||||
ansiReset = "\033[0m"
|
ansiReset = "\033[0m"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -415,10 +412,10 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
|||||||
fn func(GPUMetricRow) float64
|
fn func(GPUMetricRow) float64
|
||||||
}
|
}
|
||||||
defs := []seriesDef{
|
defs := []seriesDef{
|
||||||
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
|
{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||||
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||||
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
|
{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||||
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||||
}
|
}
|
||||||
|
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
|
|||||||
@@ -1,142 +0,0 @@
|
|||||||
package platform
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"fmt"
|
|
||||||
"os"
|
|
||||||
"os/exec"
|
|
||||||
"path/filepath"
|
|
||||||
"strconv"
|
|
||||||
"strings"
|
|
||||||
"time"
|
|
||||||
)
|
|
||||||
|
|
||||||
// HPLOptions carries the tunables for an HPL (LINPACK) benchmark run.
type HPLOptions struct {
	// MemFraction is the share of RAM the run should use; applyHPLDefaults
	// substitutes 0.80 for values outside (0, 1].
	MemFraction float64
	// NB is the block size; applyHPLDefaults substitutes 256 for
	// non-positive values.
	NB int
}

// HPLResult is what parseHPLOutput extracts from one HPL run.
type HPLResult struct {
	N         int     // matrix dimension
	NB        int     // block size
	P         int     // process grid rows
	Q         int     // process grid columns
	TimeSec   float64 // wall time, seconds
	GFlops    float64 // achieved performance
	Residual  float64 // value parsed from the token preceding "PASSED" on the verification line
	Status    string  // "PASSED" or "FAILED"
	RawOutput string  // complete captured output of the run
}

// applyHPLDefaults fills in defaults in place: MemFraction becomes 0.80 when
// it is <= 0 or > 1, and NB becomes 256 when it is not positive.
func applyHPLDefaults(opts *HPLOptions) {
	switch {
	case opts.MemFraction <= 0, opts.MemFraction > 1:
		opts.MemFraction = 0.80
	}
	if opts.NB < 1 {
		opts.NB = 256
	}
}
|
|
||||||
|
|
||||||
// RunHPL runs bee-hpl and returns parsed results plus a tar.gz artifact path.
|
|
||||||
func (s *System) RunHPL(ctx context.Context, baseDir string, opts HPLOptions, logFunc func(string)) (string, *HPLResult, error) {
|
|
||||||
applyHPLDefaults(&opts)
|
|
||||||
|
|
||||||
if baseDir == "" {
|
|
||||||
baseDir = "/var/log/bee-sat"
|
|
||||||
}
|
|
||||||
ts := time.Now().UTC().Format("20060102-150405")
|
|
||||||
runDir := filepath.Join(baseDir, "hpl-"+ts)
|
|
||||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
|
||||||
return "", nil, fmt.Errorf("mkdir %s: %w", runDir, err)
|
|
||||||
}
|
|
||||||
|
|
||||||
logPath := filepath.Join(runDir, "hpl.log")
|
|
||||||
|
|
||||||
cmd := []string{
|
|
||||||
"bee-hpl",
|
|
||||||
"--mem-fraction", strconv.FormatFloat(opts.MemFraction, 'f', 2, 64),
|
|
||||||
"--nb", strconv.Itoa(opts.NB),
|
|
||||||
}
|
|
||||||
|
|
||||||
if logFunc != nil {
|
|
||||||
logFunc(fmt.Sprintf("HPL: N will be auto-sized to %.0f%% of RAM, NB=%d", opts.MemFraction*100, opts.NB))
|
|
||||||
}
|
|
||||||
|
|
||||||
out, err := runSATCommandCtx(ctx, "", "hpl", cmd, nil, logFunc)
|
|
||||||
_ = os.WriteFile(logPath, out, 0644)
|
|
||||||
|
|
||||||
result := parseHPLOutput(string(out))
|
|
||||||
result.RawOutput = string(out)
|
|
||||||
|
|
||||||
if err != nil && err != context.Canceled {
|
|
||||||
return "", result, fmt.Errorf("bee-hpl failed: %w", err)
|
|
||||||
}
|
|
||||||
if err == nil && result.GFlops <= 0 {
|
|
||||||
return "", result, fmt.Errorf("HPL completed but no Gflops result found in output")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write summary
|
|
||||||
summary := fmt.Sprintf("N=%d NB=%d time=%.2fs gflops=%.3f status=%s\n",
|
|
||||||
result.N, result.NB, result.TimeSec, result.GFlops, result.Status)
|
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
|
||||||
|
|
||||||
if logFunc != nil {
|
|
||||||
logFunc(fmt.Sprintf("HPL result: N=%d NB=%d %.2fs %.3f Gflops %s",
|
|
||||||
result.N, result.NB, result.TimeSec, result.GFlops, result.Status))
|
|
||||||
}
|
|
||||||
|
|
||||||
ts2 := time.Now().UTC().Format("20060102-150405")
|
|
||||||
archive := filepath.Join(baseDir, "hpl-"+ts2+".tar.gz")
|
|
||||||
if archErr := createTarGz(archive, runDir); archErr != nil {
|
|
||||||
return runDir, result, err
|
|
||||||
}
|
|
||||||
return archive, result, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// parseHPLOutput extracts N, NB, time, and Gflops from standard HPL output.
|
|
||||||
//
|
|
||||||
// HPL prints a result line of the form:
|
|
||||||
//
|
|
||||||
// WR00L2L2 45312 256 1 1 1234.56 5.678e+01
|
|
||||||
// T/V N NB P Q Time Gflops
|
|
||||||
func parseHPLOutput(output string) *HPLResult {
|
|
||||||
result := &HPLResult{Status: "FAILED"}
|
|
||||||
for _, line := range strings.Split(output, "\n") {
|
|
||||||
line = strings.TrimSpace(line)
|
|
||||||
// Result line starts with WR
|
|
||||||
if strings.HasPrefix(line, "WR") {
|
|
||||||
fields := strings.Fields(line)
|
|
||||||
// WR00L2L2 N NB P Q Time Gflops
|
|
||||||
if len(fields) >= 7 {
|
|
||||||
result.N, _ = strconv.Atoi(fields[1])
|
|
||||||
result.NB, _ = strconv.Atoi(fields[2])
|
|
||||||
result.P, _ = strconv.Atoi(fields[3])
|
|
||||||
result.Q, _ = strconv.Atoi(fields[4])
|
|
||||||
result.TimeSec, _ = strconv.ParseFloat(fields[5], 64)
|
|
||||||
result.GFlops, _ = strconv.ParseFloat(fields[6], 64)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Verification line: "||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ... PASSED"
|
|
||||||
if strings.Contains(line, "PASSED") {
|
|
||||||
result.Status = "PASSED"
|
|
||||||
fields := strings.Fields(line)
|
|
||||||
for i, f := range fields {
|
|
||||||
if f == "PASSED" && i > 0 {
|
|
||||||
result.Residual, _ = strconv.ParseFloat(fields[i-1], 64)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
// hplAvailable reports whether bee-hpl can be found on PATH and
// /usr/local/lib/bee/xhpl exists.
func hplAvailable() bool {
	_, lookErr := exec.LookPath("bee-hpl")
	if lookErr != nil {
		return false
	}
	if _, statErr := os.Stat("/usr/local/lib/bee/xhpl"); statErr != nil {
		return false
	}
	return true
}
|
|
||||||
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
|||||||
"bee-john-gpu-stress",
|
"bee-john-gpu-stress",
|
||||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
}
|
}
|
||||||
|
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||||
|
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||||
|
}
|
||||||
if len(selected) > 0 {
|
if len(selected) > 0 {
|
||||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -384,22 +384,36 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
|||||||
), logFunc)
|
), logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
var (
|
||||||
|
profCmd []string
|
||||||
|
profEnv []string
|
||||||
|
)
|
||||||
|
if staggerSec > 0 && len(selected) > 1 {
|
||||||
|
profCmd = []string{
|
||||||
|
"bee-dcgmproftester-staggered",
|
||||||
|
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||||
|
"--stagger-seconds", strconv.Itoa(staggerSec),
|
||||||
|
"--devices", joinIndexList(selected),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||||
|
}
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||||
satJob{
|
satJob{
|
||||||
name: "03-dcgmproftester.log",
|
name: "03-dcgmproftester.log",
|
||||||
cmd: profCmd,
|
cmd: profCmd,
|
||||||
env: nvidiaVisibleDevicesEnv(selected),
|
env: profEnv,
|
||||||
collectGPU: true,
|
collectGPU: true,
|
||||||
gpuIndices: selected,
|
gpuIndices: selected,
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
|
|||||||
Loader string
|
Loader string
|
||||||
GPUIndices []int
|
GPUIndices []int
|
||||||
ExcludeGPUIndices []int
|
ExcludeGPUIndices []int
|
||||||
|
StaggerSeconds int
|
||||||
}
|
}
|
||||||
|
|
||||||
func New() *System {
|
func New() *System {
|
||||||
|
|||||||
@@ -487,6 +487,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
StressMode bool `json:"stress_mode"`
|
StressMode bool `json:"stress_mode"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||||
|
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||||
Loader string `json:"loader"`
|
Loader string `json:"loader"`
|
||||||
Profile string `json:"profile"`
|
Profile string `json:"profile"`
|
||||||
DisplayName string `json:"display_name"`
|
DisplayName string `json:"display_name"`
|
||||||
@@ -508,6 +509,7 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
StressMode: body.StressMode,
|
StressMode: body.StressMode,
|
||||||
GPUIndices: body.GPUIndices,
|
GPUIndices: body.GPUIndices,
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
|
StaggerGPUStart: body.StaggerGPUStart,
|
||||||
Loader: body.Loader,
|
Loader: body.Loader,
|
||||||
BurnProfile: body.Profile,
|
BurnProfile: body.Profile,
|
||||||
DisplayName: body.DisplayName,
|
DisplayName: body.DisplayName,
|
||||||
@@ -1376,107 +1378,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Display / Screen Resolution ───────────────────────────────────────────────
|
|
||||||
|
|
||||||
// displayMode is one resolution offered by an output.
type displayMode struct {
	Output  string `json:"output"`
	Mode    string `json:"mode"`
	Current bool   `json:"current"`
}

// displayInfo groups the modes listed under one connected output.
type displayInfo struct {
	Output  string        `json:"output"`
	Modes   []displayMode `json:"modes"`
	Current string        `json:"current"`
}

// Patterns used by parseXrandrOutput: an "<name> connected" header line, a
// mode line indented by exactly three whitespace characters, and the "*"
// marker xrandr places on the active mode.
var (
	xrandrOutputRE  = regexp.MustCompile(`^(\S+)\s+connected`)
	xrandrModeRE    = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
	xrandrCurrentRE = regexp.MustCompile(`\*`)
)

// parseXrandrOutput turns xrandr's textual listing into one displayInfo per
// connected output, in order of appearance. Mode lines seen before the first
// connected output are dropped; every other mode line is attributed to the
// most recent connected output.
func parseXrandrOutput(out string) []displayInfo {
	var (
		displays []displayInfo
		active   *displayInfo
	)
	// flush stores the output currently being collected, if any.
	flush := func() {
		if active != nil {
			displays = append(displays, *active)
		}
	}

	for _, row := range strings.Split(out, "\n") {
		if header := xrandrOutputRE.FindStringSubmatch(row); header != nil {
			flush()
			active = &displayInfo{Output: header[1]}
			continue
		}
		if active == nil {
			continue
		}
		hit := xrandrModeRE.FindStringSubmatch(row)
		if hit == nil {
			continue
		}
		starred := xrandrCurrentRE.MatchString(row)
		active.Modes = append(active.Modes, displayMode{
			Output:  active.Output,
			Mode:    hit[1],
			Current: starred,
		})
		if starred {
			active.Current = hit[1]
		}
	}
	flush()
	return displays
}
|
|
||||||
|
|
||||||
// xrandrCommand builds an `xrandr` command with the given arguments. Its
// environment is a copy of the current process environment, with
// DISPLAY=:0 and XAUTHORITY=/home/bee/.Xauthority appended whenever the
// respective variable is absent or set to the empty string.
func xrandrCommand(args ...string) *exec.Cmd {
	environ := append([]string{}, os.Environ()...)

	// isSet reports whether environ holds a non-empty value for the
	// variable whose "NAME=" prefix is given.
	isSet := func(prefix string) bool {
		for _, entry := range environ {
			if strings.HasPrefix(entry, prefix) && strings.TrimPrefix(entry, prefix) != "" {
				return true
			}
		}
		return false
	}
	displaySet := isSet("DISPLAY=")
	xauthSet := isSet("XAUTHORITY=")

	if !displaySet {
		environ = append(environ, "DISPLAY=:0")
	}
	if !xauthSet {
		environ = append(environ, "XAUTHORITY=/home/bee/.Xauthority")
	}

	command := exec.Command("xrandr", args...)
	command.Env = environ
	return command
}
|
|
||||||
|
|
||||||
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
|
|
||||||
out, err := xrandrCommand().Output()
|
|
||||||
if err != nil {
|
|
||||||
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
|
|
||||||
return
|
|
||||||
}
|
|
||||||
writeJSON(w, parseXrandrOutput(string(out)))
|
|
||||||
}
|
|
||||||
|
|
||||||
func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
|
|
||||||
var req struct {
|
|
||||||
Output string `json:"output"`
|
|
||||||
Mode string `json:"mode"`
|
|
||||||
}
|
|
||||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
|
|
||||||
writeError(w, http.StatusBadRequest, "output and mode are required")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// Validate mode looks like WxH to prevent injection
|
|
||||||
if !regexp.MustCompile(`^\d+x\d+$`).MatchString(req.Mode) {
|
|
||||||
writeError(w, http.StatusBadRequest, "invalid mode format")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// Validate output name (no special chars)
|
|
||||||
if !regexp.MustCompile(`^[A-Za-z0-9_\-]+$`).MatchString(req.Output) {
|
|
||||||
writeError(w, http.StatusBadRequest, "invalid output name")
|
|
||||||
return
|
|
||||||
}
|
|
||||||
if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
|
||||||
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
|
|
||||||
return
|
|
||||||
}
|
|
||||||
writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -10,30 +10,6 @@ import (
|
|||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
|
||||||
t.Setenv("DISPLAY", "")
|
|
||||||
t.Setenv("XAUTHORITY", "")
|
|
||||||
|
|
||||||
cmd := xrandrCommand("--query")
|
|
||||||
|
|
||||||
var hasDisplay bool
|
|
||||||
var hasXAuthority bool
|
|
||||||
for _, kv := range cmd.Env {
|
|
||||||
if kv == "DISPLAY=:0" {
|
|
||||||
hasDisplay = true
|
|
||||||
}
|
|
||||||
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
|
||||||
hasXAuthority = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !hasDisplay {
|
|
||||||
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
|
||||||
}
|
|
||||||
if !hasXAuthority {
|
|
||||||
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||||
globalQueue.mu.Lock()
|
globalQueue.mu.Lock()
|
||||||
originalTasks := globalQueue.tasks
|
originalTasks := globalQueue.tasks
|
||||||
|
|||||||
@@ -1036,20 +1036,21 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
<div class="card-body validate-profile-body">
|
<div class="card-body validate-profile-body">
|
||||||
<div class="validate-profile-col">
|
<div class="validate-profile-col">
|
||||||
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
||||||
|
</div>
|
||||||
|
<div class="validate-profile-col">
|
||||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="validate-profile-col validate-profile-action">
|
<div class="validate-profile-col validate-profile-action">
|
||||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
||||||
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||||
</div>
|
<div style="margin-top:12px">
|
||||||
<div class="validate-profile-col"></div>
|
|
||||||
</div>
|
|
||||||
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
|
|
||||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="grid3">
|
<div class="grid3">
|
||||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||||
@@ -1143,16 +1144,6 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
`</div>` +
|
`</div>` +
|
||||||
`</div>
|
`</div>
|
||||||
<div class="grid3" style="margin-top:16px">
|
<div class="grid3" style="margin-top:16px">
|
||||||
` + `<div id="sat-card-hpl">` +
|
|
||||||
renderSATCard("hpl", "LINPACK (HPL)", "runSAT('hpl')", "", renderValidateCardBody(
|
|
||||||
``,
|
|
||||||
`Standard High Performance LINPACK benchmark. Measures sustained FP64 GFLOPS and memory bandwidth of the CPU subsystem. Uses 80% of available RAM. Pass/fail based on HPL residual check.`,
|
|
||||||
`<code>xhpl</code> (HPL 2.3, OpenBLAS)`,
|
|
||||||
`Skipped in Validate mode. Runs in Stress mode only. Runtime scales with RAM — expect 5–30 min.<p id="sat-hpl-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
|
||||||
)) +
|
|
||||||
`</div>` +
|
|
||||||
`</div>
|
|
||||||
<div class="grid3" style="margin-top:16px">
|
|
||||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||||
inv.AMD,
|
inv.AMD,
|
||||||
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||||||
@@ -1188,7 +1179,6 @@ function satModeChanged() {
|
|||||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||||
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
|
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
|
||||||
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
|
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
|
||||||
{card: 'sat-card-hpl', hint: 'sat-hpl-mode-hint'},
|
|
||||||
].forEach(function(item) {
|
].forEach(function(item) {
|
||||||
const card = document.getElementById(item.card);
|
const card = document.getElementById(item.card);
|
||||||
if (card) {
|
if (card) {
|
||||||
@@ -1199,7 +1189,7 @@ function satModeChanged() {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
function satLabels() {
|
function satLabels() {
|
||||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', hpl:'LINPACK (HPL)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
}
|
}
|
||||||
let satNvidiaGPUsPromise = null;
|
let satNvidiaGPUsPromise = null;
|
||||||
function loadSatNvidiaGPUs() {
|
function loadSatNvidiaGPUs() {
|
||||||
@@ -2106,11 +2096,11 @@ func renderBurn() string {
|
|||||||
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
|
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
|
||||||
</div>
|
</div>
|
||||||
<div class="burn-profile-col burn-profile-action">
|
<div class="burn-profile-col burn-profile-action">
|
||||||
<button class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||||
<p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
|
<p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
|
||||||
</div>
|
</div>
|
||||||
<div class="burn-profile-col burn-profile-action">
|
<div class="burn-profile-col burn-profile-action">
|
||||||
<button class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||||
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
|
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -2131,6 +2121,10 @@ func renderBurn() string {
|
|||||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
</div>
|
</div>
|
||||||
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||||
|
<label class="cb-row" style="margin-top:10px">
|
||||||
|
<input type="checkbox" id="burn-stagger-nvidia">
|
||||||
|
<span>Ramp selected NVIDIA GPUs one by one before full-load hold. Uses a 3-minute stabilization window per GPU, then keeps all selected GPUs under load for the chosen Burn Profile duration.</span>
|
||||||
|
</label>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -2158,10 +2152,6 @@ func renderBurn() string {
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div class="burn-section">GPU-Specific Tests</div>
|
|
||||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||||
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||||
@@ -2210,6 +2200,11 @@ function burnSelectedGPUIndices() {
|
|||||||
.sort(function(a, b) { return a - b; });
|
.sort(function(a, b) { return a - b; });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function burnUseNvidiaRampUp() {
|
||||||
|
const el = document.getElementById('burn-stagger-nvidia');
|
||||||
|
return !!(el && el.checked);
|
||||||
|
}
|
||||||
|
|
||||||
function burnUpdateSelectionNote() {
|
function burnUpdateSelectionNote() {
|
||||||
const note = document.getElementById('burn-selection-note');
|
const note = document.getElementById('burn-selection-note');
|
||||||
const selected = burnSelectedGPUIndices();
|
const selected = burnSelectedGPUIndices();
|
||||||
@@ -2269,6 +2264,9 @@ function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
|
|||||||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||||
}
|
}
|
||||||
body.gpu_indices = selected;
|
body.gpu_indices = selected;
|
||||||
|
if (burnUseNvidiaRampUp() && selected.length > 1) {
|
||||||
|
body.stagger_gpu_start = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return fetch('/api/sat/' + target + '/run', {
|
return fetch('/api/sat/' + target + '/run', {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
@@ -2860,55 +2858,6 @@ usbRefresh();
|
|||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Display Resolution ────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
// renderDisplayInline returns the HTML and inline JavaScript for the
// "Display Resolution" card body.
//
// The script fetches GET /api/display/resolutions, renders one row per
// display (output name, current mode, a mode <select> and an Apply button),
// and exposes window.applyResolution(output), which POSTs the selected mode
// to /api/display/set and then reloads the list.
//
// Every server-provided string is HTML-escaped before being placed into
// markup, and the output name reaches the click handler through a data
// attribute rather than being concatenated into JavaScript source.
func renderDisplayInline() string {
	return `<div id="display-status" style="color:var(--muted);font-size:13px;margin-bottom:12px">Loading displays...</div>
<div id="display-controls"></div>
<script>
(function(){
// Escape a value for safe use in HTML text and double-quoted attributes.
function esc(v) {
  return String(v == null ? '' : v).replace(/[&<>"']/g, function(c) {
    return {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];
  });
}
function loadDisplays() {
  fetch('/api/display/resolutions').then(r=>r.json()).then(displays => {
    const status = document.getElementById('display-status');
    const ctrl = document.getElementById('display-controls');
    if (!displays || displays.length === 0) {
      status.textContent = 'No connected displays found or xrandr not available.';
      return;
    }
    status.textContent = '';
    ctrl.innerHTML = displays.map(d => {
      const name = esc(d.output);
      const opts = (d.modes||[]).map(m =>
        '<option value="'+esc(m.mode)+'"'+(m.current?' selected':'')+'>'+esc(m.mode)+(m.current?' (current)':'')+'</option>'
      ).join('');
      return '<div style="margin-bottom:12px">'
        +'<span style="font-weight:600;margin-right:8px">'+name+'</span>'
        +'<span style="color:var(--muted);font-size:12px;margin-right:12px">Current: '+esc(d.current)+'</span>'
        +'<select id="res-sel-'+name+'" style="margin-right:8px">'+opts+'</select>'
        +'<button class="btn btn-sm btn-primary" data-output="'+name+'" onclick="applyResolution(this.dataset.output)">Apply</button>'
        +'</div>';
    }).join('');
  }).catch(()=>{
    document.getElementById('display-status').textContent = 'xrandr not available on this system.';
  });
}
window.applyResolution = function(output) {
  const sel = document.getElementById('res-sel-'+output);
  if (!sel) return;
  const mode = sel.value;
  const btn = sel.nextElementSibling;
  btn.disabled = true;
  btn.textContent = 'Applying...';
  fetch('/api/display/set', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify({output:output,mode:mode})})
    .then(r=>r.json()).then(d=>{
      if (d.error) { alert('Error: '+d.error); }
      loadDisplays();
    }).catch(e=>{ alert('Error: '+e); })
    .finally(()=>{ btn.disabled=false; btn.textContent='Apply'; });
};
loadDisplays();
})();
</script>`
}
|
|
||||||
|
|
||||||
func renderNvidiaSelfHealInline() string {
|
func renderNvidiaSelfHealInline() string {
|
||||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
|
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
|
||||||
@@ -3097,8 +3046,6 @@ function installToRAM() {
|
|||||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||||
renderServicesInline() + `</div></div>
|
renderServicesInline() + `</div></div>
|
||||||
|
|
||||||
<div class="card"><div class="card-head">Display Resolution</div><div class="card-body">` +
|
|
||||||
renderDisplayInline() + `</div></div>
|
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
function checkTools() {
|
function checkTools() {
|
||||||
|
|||||||
@@ -295,10 +295,6 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// Tools
|
// Tools
|
||||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||||
|
|
||||||
// Display
|
|
||||||
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
|
||||||
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
|
||||||
|
|
||||||
// GPU presence / tools
|
// GPU presence / tools
|
||||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||||
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
||||||
|
|||||||
@@ -39,7 +39,6 @@ var taskNames = map[string]string{
|
|||||||
"nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
|
"nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
|
||||||
"nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
|
"nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
|
||||||
"nvidia-stress": "NVIDIA GPU Stress",
|
"nvidia-stress": "NVIDIA GPU Stress",
|
||||||
"hpl": "LINPACK (HPL)",
|
|
||||||
"memory": "Memory SAT",
|
"memory": "Memory SAT",
|
||||||
"storage": "Storage SAT",
|
"storage": "Storage SAT",
|
||||||
"cpu": "CPU SAT",
|
"cpu": "CPU SAT",
|
||||||
@@ -119,6 +118,7 @@ type taskParams struct {
|
|||||||
StressMode bool `json:"stress_mode,omitempty"`
|
StressMode bool `json:"stress_mode,omitempty"`
|
||||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||||
|
StaggerGPUStart bool `json:"stagger_gpu_start,omitempty"`
|
||||||
SizeMB int `json:"size_mb,omitempty"`
|
SizeMB int `json:"size_mb,omitempty"`
|
||||||
Passes int `json:"passes,omitempty"`
|
Passes int `json:"passes,omitempty"`
|
||||||
Loader string `json:"loader,omitempty"`
|
Loader string `json:"loader,omitempty"`
|
||||||
@@ -163,6 +163,13 @@ func resolveBurnPreset(profile string) burnPreset {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// boolToNvidiaStaggerSeconds maps the stagger-start flag to a per-GPU delay
// in seconds. It returns 180 when staggering is enabled and more than one GPU
// index is selected, and 0 in every other case.
func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
	const perGPUSeconds = 180

	// Staggering is meaningless with fewer than two selected GPUs.
	if !enabled || len(selected) <= 1 {
		return 0
	}
	return perGPUSeconds
}
|
||||||
|
|
||||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||||
acceptanceCycles := []platform.PlatformStressCycle{
|
acceptanceCycles := []platform.PlatformStressCycle{
|
||||||
{LoadSec: 85, IdleSec: 5},
|
{LoadSec: 85, IdleSec: 5},
|
||||||
@@ -602,7 +609,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||||
|
if staggerSec > 0 {
|
||||||
|
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
|
||||||
|
}
|
||||||
|
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
|
||||||
case "nvidia-targeted-power":
|
case "nvidia-targeted-power":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
@@ -657,6 +668,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
Loader: t.params.Loader,
|
Loader: t.params.Loader,
|
||||||
GPUIndices: t.params.GPUIndices,
|
GPUIndices: t.params.GPUIndices,
|
||||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
|
||||||
}, j.append)
|
}, j.append)
|
||||||
case "memory":
|
case "memory":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
@@ -740,19 +752,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
case "hpl":
|
|
||||||
if a == nil {
|
|
||||||
err = fmt.Errorf("app not configured")
|
|
||||||
break
|
|
||||||
}
|
|
||||||
opts := platform.HPLOptions{
|
|
||||||
MemFraction: 0.80,
|
|
||||||
NB: 256,
|
|
||||||
}
|
|
||||||
archive, err = func() (string, error) {
|
|
||||||
path, _, runErr := a.RunHPL(ctx, "", opts, j.append)
|
|
||||||
return path, runErr
|
|
||||||
}()
|
|
||||||
case "platform-stress":
|
case "platform-stress":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
|
|||||||
@@ -19,7 +19,5 @@ ROCRAND_VERSION=3.2.0.60304-76~22.04
|
|||||||
HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
|
HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
|
||||||
HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
||||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||||
HPL_VERSION=2.3
|
|
||||||
HPL_SHA256=32c5c17d22330e6f2337b681aded51637fb6008d3f0eb7c277b163fadd612830
|
|
||||||
GO_VERSION=1.24.0
|
GO_VERSION=1.24.0
|
||||||
AUDIT_VERSION=1.0.0
|
AUDIT_VERSION=1.0.0
|
||||||
|
|||||||
@@ -1,331 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
# build-hpl.sh — build HPL (High Performance LINPACK) for the bee LiveCD.
|
|
||||||
#
|
|
||||||
# Downloads HPL 2.3 from netlib, downloads OpenBLAS runtime from the Debian 12
|
|
||||||
# apt repo, and compiles xhpl using a minimal single-process MPI stub so that
|
|
||||||
# no MPI package is required inside the ISO.
|
|
||||||
#
|
|
||||||
# The resulting xhpl binary is a standard HPL binary whose output is compatible
|
|
||||||
# with the accepted HPL format (WR... Gflops lines).
|
|
||||||
#
|
|
||||||
# Output:
|
|
||||||
# $CACHE_DIR/bin/xhpl
|
|
||||||
# $CACHE_DIR/lib/libopenblas.so* (runtime, injected into ISO /usr/lib/)
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
HPL_VERSION="$1"
|
|
||||||
HPL_SHA256="$2"
|
|
||||||
DIST_DIR="$3"
|
|
||||||
|
|
||||||
[ -n "$HPL_VERSION" ] || { echo "usage: $0 <hpl-version> <sha256> <dist-dir>"; exit 1; }
|
|
||||||
[ -n "$HPL_SHA256" ] || { echo "usage: $0 <hpl-version> <sha256> <dist-dir>"; exit 1; }
|
|
||||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <hpl-version> <sha256> <dist-dir>"; exit 1; }
|
|
||||||
|
|
||||||
echo "=== HPL ${HPL_VERSION} ==="
|
|
||||||
|
|
||||||
CACHE_DIR="${DIST_DIR}/hpl-${HPL_VERSION}"
|
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
|
||||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/hpl-downloads"
|
|
||||||
|
|
||||||
if [ -x "${CACHE_DIR}/bin/xhpl" ]; then
|
|
||||||
echo "=== HPL cached, skipping build ==="
|
|
||||||
echo "binary: ${CACHE_DIR}/bin/xhpl"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/bin" "${CACHE_DIR}/lib"
|
|
||||||
|
|
||||||
# ── download HPL source ────────────────────────────────────────────────────────
|
|
||||||
HPL_TAR="${DOWNLOAD_CACHE_DIR}/hpl-${HPL_VERSION}.tar.gz"
|
|
||||||
DEFAULT_HPL_URLS="
|
|
||||||
https://www.netlib.org/benchmark/hpl/hpl-${HPL_VERSION}.tar.gz
|
|
||||||
https://fossies.org/linux/privat/hpl-${HPL_VERSION}.tar.gz
|
|
||||||
"
|
|
||||||
HPL_GIT_URL="${HPL_GIT_URL:-https://github.com/icl-utk-edu/hpl.git}"
|
|
||||||
DEFAULT_HPL_GIT_REFS="v${HPL_VERSION} ${HPL_VERSION} main"
|
|
||||||
HPL_SOURCE_MODE="tarball"
|
|
||||||
|
|
||||||
# download_to_file URL DEST
# Fetch URL into DEST, using curl when it is installed and wget otherwise.
# Returns the exit status of whichever downloader ran.
#
# The positional parameters are used directly on purpose: POSIX sh has no
# function-local variables, so assigning url=/out= here would overwrite the
# caller's variables of the same name (download_hpl_tarball keeps its final
# destination path in "out").
download_to_file() {
    if command -v curl >/dev/null 2>&1; then
        curl -fL \
            --connect-timeout 15 \
            --max-time 180 \
            --retry 2 \
            --retry-delay 2 \
            --output "$2" \
            "$1"
        return $?
    fi

    wget \
        --show-progress \
        --tries=2 \
        --timeout=30 \
        -O "$2" \
        "$1"
}
|
|
||||||
|
|
||||||
# download_hpl_tarball DEST
# Try each URL in HPL_URLS (default: DEFAULT_HPL_URLS) in order, downloading
# to DEST.part and moving it to DEST on the first success.
# Returns 0 once DEST is in place, 1 if every URL failed.
#
# Variable names carry a function-specific prefix because POSIX sh has no
# function-local scope: a helper called inside the loop that assigns a
# generically named variable (e.g. "out") would otherwise change the
# destination path used by the mv below.
download_hpl_tarball() {
    _hpl_tar_dest="$1"
    _hpl_tar_part="${_hpl_tar_dest}.part"
    _hpl_tar_urls="${HPL_URLS:-$DEFAULT_HPL_URLS}"

    rm -f "${_hpl_tar_part}"
    for _hpl_tar_url in ${_hpl_tar_urls}; do
        [ -n "${_hpl_tar_url}" ] || continue
        echo "=== trying HPL source: ${_hpl_tar_url} ==="
        if download_to_file "${_hpl_tar_url}" "${_hpl_tar_part}"; then
            # Return mv's status so a failed move is not reported as success.
            mv "${_hpl_tar_part}" "${_hpl_tar_dest}"
            return $?
        fi
        rm -f "${_hpl_tar_part}"
        echo "=== failed: ${_hpl_tar_url} ==="
    done

    echo "ERROR: failed to download HPL ${HPL_VERSION} from all configured URLs" >&2
    return 1
}
|
|
||||||
|
|
||||||
# download_hpl_from_git_archive DEST
# Fallback source: shallow-clone HPL_GIT_URL at each ref in HPL_GIT_REFS
# (default: DEFAULT_HPL_GIT_REFS) and pack the first ref that clones into a
# hpl-${HPL_VERSION} tarball at DEST. On success sets HPL_SOURCE_MODE="git"
# and returns 0; if no ref clones, prints diagnostics and returns 1.
download_hpl_from_git_archive() {
    out="$1"
    refs="${HPL_GIT_REFS:-$DEFAULT_HPL_GIT_REFS}"
    tmp_root="$(mktemp -d)"
    repo_dir="${tmp_root}/repo"
    archive_dir="${tmp_root}/hpl-${HPL_VERSION}"
    archive_tmp="${out}.part"

    for ref in ${refs}; do
        if [ -z "${ref}" ]; then
            continue
        fi

        echo "=== trying HPL git source: ${HPL_GIT_URL} ref ${ref} ==="
        # Start every attempt from a clean scratch area.
        rm -rf "${repo_dir}" "${archive_dir}" "${archive_tmp}"

        if ! git clone --depth 1 --branch "${ref}" "${HPL_GIT_URL}" "${repo_dir}"; then
            echo "=== failed git ref: ${ref} ==="
            continue
        fi

        # Rename the checkout so the archive's top-level directory is
        # hpl-${HPL_VERSION}.
        mv "${repo_dir}" "${archive_dir}"
        tar czf "${archive_tmp}" -C "${tmp_root}" "hpl-${HPL_VERSION}"
        mv "${archive_tmp}" "${out}"
        rm -rf "${tmp_root}"
        HPL_SOURCE_MODE="git"
        return 0
    done

    rm -rf "${tmp_root}" "${archive_tmp}"
    echo "ERROR: failed to obtain HPL ${HPL_VERSION} from all configured sources" >&2
    echo "  looked for cache: ${out}" >&2
    echo "  tarball mirrors: ${HPL_URLS:-$DEFAULT_HPL_URLS}" >&2
    echo "  git fallback: ${HPL_GIT_URL} refs ${refs}" >&2
    echo "  override mirrors with HPL_URLS=\"https://mirror1/...\"" >&2
    echo "  override git refs with HPL_GIT_REFS=\"v${HPL_VERSION} ${HPL_VERSION} main\"" >&2
    return 1
}
|
|
||||||
|
|
||||||
if [ ! -f "${HPL_TAR}" ]; then
|
|
||||||
echo "=== downloading HPL ${HPL_VERSION} ==="
|
|
||||||
download_hpl_tarball "${HPL_TAR}" || download_hpl_from_git_archive "${HPL_TAR}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "${HPL_SOURCE_MODE}" = "tarball" ]; then
|
|
||||||
actual_sha="$(sha256sum "${HPL_TAR}" | awk '{print $1}')"
|
|
||||||
if [ "${actual_sha}" != "${HPL_SHA256}" ]; then
|
|
||||||
echo "ERROR: sha256 mismatch for hpl-${HPL_VERSION}.tar.gz" >&2
|
|
||||||
echo " expected: ${HPL_SHA256}" >&2
|
|
||||||
echo " actual: ${actual_sha}" >&2
|
|
||||||
rm -f "${HPL_TAR}"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "sha256 OK: hpl-${HPL_VERSION}.tar.gz"
|
|
||||||
else
|
|
||||||
echo "=== HPL source obtained from git fallback; skipping tarball sha256 check ==="
|
|
||||||
fi
|
|
||||||
|
|
||||||
# ── download OpenBLAS from Debian 12 apt repo ─────────────────────────────────
|
|
||||||
REPO_BASE="https://deb.debian.org/debian/pool/main/o/openblas"
|
|
||||||
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
|
|
||||||
OPENBLAS_PKG="libopenblas0-openmp"
|
|
||||||
|
|
||||||
echo "=== fetching Debian 12 Packages.gz ==="
|
|
||||||
wget -q -O "${PACKAGES_GZ}" \
|
|
||||||
"https://deb.debian.org/debian/dists/bookworm/main/binary-amd64/Packages.gz"
|
|
||||||
|
|
||||||
# lookup_deb PKG
# Scan the gzip-compressed package index at ${PACKAGES_GZ} for the stanza
# whose "Package:" field equals PKG and print exactly one line,
# "<Filename> <SHA256>". Prints nothing when PKG is not in the index.
lookup_deb() {
    pkg="$1"
    gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" '
        /^Package: /  { cur=$2 }
        /^Filename: / { file=$2 }
        /^SHA256: /   { sha=$2 }
        /^$/ {
            # A blank line ends a stanza. "exit" still runs the END rule, so
            # the match is printed there, once, not here as well.
            if (cur == pkg) exit
            cur=""; file=""; sha=""
        }
        END {
            # Reached after the early exit above, or at end of input when the
            # last stanza has no trailing blank line.
            if (cur == pkg) print file " " sha
        }'
}
|
|
||||||
|
|
||||||
meta="$(lookup_deb "${OPENBLAS_PKG}")"
|
|
||||||
[ -n "$meta" ] || { echo "ERROR: ${OPENBLAS_PKG} not found in Packages.gz"; exit 1; }
|
|
||||||
repo_file="$(printf '%s' "$meta" | awk '{print $1}')"
|
|
||||||
repo_sha="$(printf '%s' "$meta" | awk '{print $2}')"
|
|
||||||
|
|
||||||
OPENBLAS_DEB="${DOWNLOAD_CACHE_DIR}/$(basename "${repo_file}")"
|
|
||||||
if [ -f "${OPENBLAS_DEB}" ]; then
|
|
||||||
actual="$(sha256sum "${OPENBLAS_DEB}" | awk '{print $1}')"
|
|
||||||
[ "$actual" = "$repo_sha" ] || rm -f "${OPENBLAS_DEB}"
|
|
||||||
fi
|
|
||||||
if [ ! -f "${OPENBLAS_DEB}" ]; then
|
|
||||||
echo "=== downloading ${OPENBLAS_PKG} ==="
|
|
||||||
wget --show-progress -O "${OPENBLAS_DEB}" "https://deb.debian.org/debian/${repo_file}"
|
|
||||||
actual="$(sha256sum "${OPENBLAS_DEB}" | awk '{print $1}')"
|
|
||||||
[ "$actual" = "$repo_sha" ] || { echo "ERROR: sha256 mismatch for ${OPENBLAS_PKG}"; rm -f "${OPENBLAS_DEB}"; exit 1; }
|
|
||||||
fi
|
|
||||||
|
|
||||||
# extract libopenblas shared libs
|
|
||||||
TMP_DEB=$(mktemp -d)
|
|
||||||
trap 'rm -rf "${TMP_DEB}" "${BUILD_TMP:-}"' EXIT INT TERM
|
|
||||||
(
|
|
||||||
cd "${TMP_DEB}"
|
|
||||||
ar x "${OPENBLAS_DEB}"
|
|
||||||
tar xf data.tar.*
|
|
||||||
)
|
|
||||||
find "${TMP_DEB}" \( -name 'libopenblas*.so*' \) \( -type f -o -type l \) \
|
|
||||||
-exec cp -a {} "${CACHE_DIR}/lib/" \;
|
|
||||||
echo "=== OpenBLAS libs: $(ls "${CACHE_DIR}/lib/" | wc -l) files ==="
|
|
||||||
|
|
||||||
# No libopenblas-dev headers are needed to build; linking only requires unversioned libopenblas.so / libblas.so symlinks, created below.
|
|
||||||
OPENBLAS_SO="$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libopenblas.so.*' -type f | sort | head -1)"
|
|
||||||
[ -n "${OPENBLAS_SO}" ] || { echo "ERROR: libopenblas.so not extracted"; exit 1; }
|
|
||||||
SONAME="$(basename "${OPENBLAS_SO}")"
|
|
||||||
ln -sf "${SONAME}" "${CACHE_DIR}/lib/libopenblas.so" 2>/dev/null || true
|
|
||||||
ln -sf "${SONAME}" "${CACHE_DIR}/lib/libblas.so" 2>/dev/null || true
|
|
||||||
|
|
||||||
# ── build HPL ─────────────────────────────────────────────────────────────────
|
|
||||||
BUILD_TMP=$(mktemp -d)
|
|
||||||
|
|
||||||
cd "${BUILD_TMP}"
|
|
||||||
tar xf "${HPL_TAR}"
|
|
||||||
SRC_DIR="$(find . -maxdepth 1 -type d -name 'hpl-*' | head -1)"
|
|
||||||
[ -n "${SRC_DIR}" ] || { echo "ERROR: HPL source dir not found"; exit 1; }
|
|
||||||
cd "${SRC_DIR}"
|
|
||||||
|
|
||||||
# Write a minimal single-process MPI stub so we don't need an MPI package.
|
|
||||||
# HPL only needs these functions for single-process execution.
|
|
||||||
cat > "${BUILD_TMP}/mpi_stub.c" <<'MPISTUB'
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <sys/time.h>
|
|
||||||
|
|
||||||
/*
 * Single-process stand-in for the handful of MPI entry points listed below.
 * There is exactly one rank (rank 0, communicator size 1), so:
 *   - broadcasts and point-to-point calls are no-ops that report success;
 *   - reductions degenerate to copying the send buffer into the receive
 *     buffer (the reduction operator is irrelevant with one contributor).
 * All handle types are plain ints; their values are never interpreted except
 * for MPI_DOUBLE / MPI_INT, which select the element size in MPI_Reduce.
 */
typedef int MPI_Comm;
typedef int MPI_Datatype;
typedef int MPI_Op;
typedef int MPI_Status;
typedef int MPI_Request;

#define MPI_COMM_WORLD 0
#define MPI_SUCCESS 0
#define MPI_DOUBLE 6
#define MPI_INT 5
#define MPI_SUM 0
#define MPI_MAX 1
#define MPI_MIN 2
#define MPI_BYTE 1
/* Parenthesized so the negative literals stay intact inside larger expressions. */
#define MPI_ANY_SOURCE (-1)
#define MPI_ANY_TAG (-1)
#define MPI_STATUS_IGNORE ((MPI_Status*)0)

/* Start-up: nothing to initialise; the arguments are ignored. */
int MPI_Init(int *argc, char ***argv) { (void)argc; (void)argv; return MPI_SUCCESS; }

/* Shutdown: nothing to release. */
int MPI_Finalize(void) { return MPI_SUCCESS; }

/* The only process is always rank 0. */
int MPI_Comm_rank(MPI_Comm c, int *rank) { (void)c; *rank = 0; return MPI_SUCCESS; }

/* Every communicator contains exactly one process. */
int MPI_Comm_size(MPI_Comm c, int *size) { (void)c; *size = 1; return MPI_SUCCESS; }

/* Broadcast with a single rank: the root already holds the data. */
int MPI_Bcast(void *b, int n, MPI_Datatype t, int r, MPI_Comm c)
{ (void)b;(void)n;(void)t;(void)r;(void)c; return MPI_SUCCESS; }

/*
 * Reduction with a single contributor: copy n elements from s to r.
 * Element size is sizeof(double) for MPI_DOUBLE, sizeof(int) for MPI_INT and
 * one byte for every other datatype value.
 * A non-positive count or an in-place call (s == r) copies nothing; memmove
 * is used so partially overlapping buffers are also well defined.
 * Always returns MPI_SUCCESS.
 */
int MPI_Reduce(const void *s, void *r, int n, MPI_Datatype t, MPI_Op op, int root, MPI_Comm c) {
    size_t sz;

    (void)op;(void)root;(void)c;
    if (n <= 0 || s == r)
        return MPI_SUCCESS;
    sz = (t==MPI_DOUBLE)?sizeof(double):(t==MPI_INT)?sizeof(int):1;
    memmove(r, s, (size_t)n * sz);
    return MPI_SUCCESS;
}

/* All-reduce is identical to a reduce rooted at rank 0 when there is one rank. */
int MPI_Allreduce(const void *s, void *r, int n, MPI_Datatype t, MPI_Op op, MPI_Comm c)
{ return MPI_Reduce(s,r,n,t,op,0,c); }

/* Point-to-point send: accepted and discarded. */
int MPI_Send(const void *b, int n, MPI_Datatype t, int d, int tag, MPI_Comm c)
{ (void)b;(void)n;(void)t;(void)d;(void)tag;(void)c; return MPI_SUCCESS; }

/* Point-to-point receive: reports success without touching the buffer or status. */
int MPI_Recv(void *b, int n, MPI_Datatype t, int s, int tag, MPI_Comm c, MPI_Status *st)
{ (void)b;(void)n;(void)t;(void)s;(void)tag;(void)c;(void)st; return MPI_SUCCESS; }

/*
 * Combined send/receive: reports success without moving any data.
 * NOTE(review): a self-directed sendrecv would normally copy sb into rb;
 * this stub assumes the caller never relies on that — confirm against HPL's
 * single-process code paths.
 */
int MPI_Sendrecv(const void *sb, int sn, MPI_Datatype st2, int dest, int stag,
                 void *rb, int rn, MPI_Datatype rt, int src, int rtag,
                 MPI_Comm c, MPI_Status *status)
{ (void)sb;(void)sn;(void)st2;(void)dest;(void)stag;
  (void)rb;(void)rn;(void)rt;(void)src;(void)rtag;(void)c;(void)status;
  return MPI_SUCCESS; }

/* Non-blocking receive: reports success; the request handle is left untouched. */
int MPI_Irecv(void *b, int n, MPI_Datatype t, int s, int tag, MPI_Comm c, MPI_Request *req)
{ (void)b;(void)n;(void)t;(void)s;(void)tag;(void)c;(void)req; return MPI_SUCCESS; }

/* Completion wait: nothing is ever pending, so return immediately. */
int MPI_Wait(MPI_Request *req, MPI_Status *st)
{ (void)req;(void)st; return MPI_SUCCESS; }

/* Abort: terminate the process with the supplied exit code (never returns). */
int MPI_Abort(MPI_Comm c, int code) { (void)c; exit(code); }

/* Wall-clock time in seconds (microsecond resolution) from gettimeofday(). */
double MPI_Wtime(void) {
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
}
|
|
||||||
MPISTUB
|
|
||||||
|
|
||||||
# Write Make.bee — HPL makefile configuration
|
|
||||||
cat > Make.bee <<MAKEFILE
|
|
||||||
SHELL = /bin/sh
|
|
||||||
CD = cd
|
|
||||||
CP = cp
|
|
||||||
LN_S = ln -s
|
|
||||||
MKDIR = mkdir -p
|
|
||||||
RM = /bin/rm -f
|
|
||||||
TOUCH = touch
|
|
||||||
ARCH = bee
|
|
||||||
|
|
||||||
# Directories
|
|
||||||
TOPdir = \$(shell pwd)
|
|
||||||
INCdir = \$(TOPdir)/include
|
|
||||||
BINdir = \$(TOPdir)/bin/\$(ARCH)
|
|
||||||
LIBdir = \$(TOPdir)/lib/\$(ARCH)
|
|
||||||
HPLlib = \$(LIBdir)/libhpl.a
|
|
||||||
|
|
||||||
# Compiler
|
|
||||||
CC = gcc
|
|
||||||
CCNOOPT = \$(HPL_DEFS)
|
|
||||||
CCFLAGS = \$(HPL_DEFS) -O3 -march=native -funroll-loops -fomit-frame-pointer
|
|
||||||
|
|
||||||
# Linker
|
|
||||||
LINKER = gcc
|
|
||||||
LINKFLAGS = \$(CCFLAGS)
|
|
||||||
|
|
||||||
# MPI (single-process stub — no actual MPI needed)
|
|
||||||
MPdir =
|
|
||||||
MPinc = -I${BUILD_TMP}
|
|
||||||
MPlib = ${BUILD_TMP}/mpi_stub.o
|
|
||||||
|
|
||||||
# BLAS (OpenBLAS)
|
|
||||||
LAdir = ${CACHE_DIR}/lib
|
|
||||||
LAinc =
|
|
||||||
LAlib = -L\$(LAdir) -Wl,-rpath,/usr/lib -lopenblas
|
|
||||||
|
|
||||||
HPL_OPTS =
|
|
||||||
HPL_DEFS = \$(HPL_OPTS) -DHPL_CALL_CBLAS
|
|
||||||
MAKEFILE
|
|
||||||
echo "=== Make.bee written ==="
|
|
||||||
|
|
||||||
# compile MPI stub
|
|
||||||
gcc -O2 -c -o "${BUILD_TMP}/mpi_stub.o" "${BUILD_TMP}/mpi_stub.c"
|
|
||||||
|
|
||||||
# build HPL
|
|
||||||
echo "=== building HPL ${HPL_VERSION} ==="
|
|
||||||
make -j"$(nproc)" arch=bee 2>&1 | tail -20
|
|
||||||
|
|
||||||
XHPL_BIN="bin/bee/xhpl"
|
|
||||||
[ -x "${XHPL_BIN}" ] || { echo "ERROR: xhpl not found after build"; exit 1; }
|
|
||||||
|
|
||||||
cp "${XHPL_BIN}" "${CACHE_DIR}/bin/xhpl"
|
|
||||||
chmod +x "${CACHE_DIR}/bin/xhpl"
|
|
||||||
echo "=== HPL build complete ==="
|
|
||||||
echo "binary: ${CACHE_DIR}/bin/xhpl"
|
|
||||||
echo "libs: $(ls "${CACHE_DIR}/lib/")"
|
|
||||||
@@ -1148,19 +1148,6 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
echo "=== john injected ==="
|
echo "=== john injected ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- build HPL (CPU LINPACK) — runs on all variants ---
|
|
||||||
run_step "build HPL ${HPL_VERSION}" "80-hpl" \
|
|
||||||
sh "${BUILDER_DIR}/build-hpl.sh" "${HPL_VERSION}" "${HPL_SHA256}" "${DIST_DIR}"
|
|
||||||
|
|
||||||
HPL_CACHE="${DIST_DIR}/hpl-${HPL_VERSION}"
|
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee"
|
|
||||||
cp "${HPL_CACHE}/bin/xhpl" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/xhpl"
|
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/xhpl"
|
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-hpl" 2>/dev/null || true
|
|
||||||
# Inject OpenBLAS runtime libs needed by xhpl
|
|
||||||
cp "${HPL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
|
||||||
echo "=== HPL injected: xhpl + $(ls "${HPL_CACHE}/lib/" | wc -l) OpenBLAS libs ==="
|
|
||||||
|
|
||||||
# --- embed build metadata ---
|
# --- embed build metadata ---
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||||
@@ -1193,7 +1180,6 @@ BUILD_DATE=${BUILD_DATE}
|
|||||||
GIT_COMMIT=${GIT_COMMIT}
|
GIT_COMMIT=${GIT_COMMIT}
|
||||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||||
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||||
HPL_VERSION=${HPL_VERSION}
|
|
||||||
${GPU_VERSION_LINE}
|
${GPU_VERSION_LINE}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
set color_normal=light-gray/black
|
set color_normal=light-gray/black
|
||||||
set color_highlight=white/dark-gray
|
set color_highlight=yellow/black
|
||||||
|
|
||||||
if [ -e /boot/grub/splash.png ]; then
|
if [ -e /boot/grub/splash.png ]; then
|
||||||
set theme=/boot/grub/live-theme/theme.txt
|
set theme=/boot/grub/live-theme/theme.txt
|
||||||
else
|
else
|
||||||
set menu_color_normal=cyan/black
|
set menu_color_normal=yellow/black
|
||||||
set menu_color_highlight=white/dark-gray
|
set menu_color_highlight=white/brown
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -82,16 +82,22 @@ glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
|||||||
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||||
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||||
|
|
||||||
font_logo = load_font(MONO_FONT_CANDIDATES, 64)
|
TARGET_LOGO_W = 400
|
||||||
|
max_chars = max(len(line) for line in ASCII_ART)
|
||||||
|
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
|
||||||
|
_probe_cw, _ = mono_metrics(_probe_font)
|
||||||
|
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
|
||||||
|
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
|
||||||
char_w, char_h = mono_metrics(font_logo)
|
char_w, char_h = mono_metrics(font_logo)
|
||||||
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 8)
|
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
|
||||||
logo_w, logo_h = logo_mask.size
|
logo_w, logo_h = logo_mask.size
|
||||||
logo_x = (W - logo_w) // 2
|
logo_x = (W - logo_w) // 2
|
||||||
logo_y = 270
|
logo_y = 380
|
||||||
|
|
||||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
|
sh_off = max(1, font_size_logo // 6)
|
||||||
img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
|
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
|
||||||
img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
|
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
|
||||||
|
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
|
||||||
img.paste(FG, (logo_x, logo_y), logo_mask)
|
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||||
|
|
||||||
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
||||||
|
|||||||
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
@@ -0,0 +1,110 @@
|
|||||||
|
#!/bin/sh
# bee-dcgmproftester-staggered — start one dcgmproftester worker per selected
# NVIDIA GPU, launching the workers STAGGER_SECONDS apart.
#
# Usage: bee-dcgmproftester-staggered [--seconds N] [--stagger-seconds N]
#                                     [--devices 0,1] [--exclude 2,3]
#
#   --seconds, -t      run time of the LAST worker started (default 300)
#   --stagger-seconds  delay between consecutive worker launches (default 180)
#   --devices          comma-separated GPU indices to use (default: all GPUs
#                      reported by nvidia-smi)
#   --exclude          comma-separated GPU indices to drop from the selection
#
# Earlier workers are given STAGGER_SECONDS * (remaining launches) of extra
# run time, so every worker is scheduled to finish at the same moment.
# Each worker's output is buffered in a temp file and replayed, prefixed with
# "[gpu N] ", once that worker has exited. Exit status: 0 if every worker
# succeeded, 1 if any failed or nothing could be run, 2 on usage errors.
set -eu

# Not named SECONDS on purpose: bash and ksh treat SECONDS as a special,
# auto-incrementing variable, which would inflate the duration of every worker
# launched after a stagger sleep whenever /bin/sh is one of those shells.
DURATION_SEC=300
STAGGER_SECONDS=180
DEVICES=""
EXCLUDE=""
TMP_DIR=""
WORKER_PIDS=""
NAP_PID=""

# Print the command-line synopsis to stderr and exit with status 2.
usage() {
    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
}

# Succeed only when $1 is a non-empty string of decimal digits.
is_uint() {
    case "${1:-}" in
        ''|*[!0-9]*) return 1 ;;
    esac
    return 0
}

# Canonicalise a comma-separated list: strip whitespace, drop empty entries,
# sort numerically, remove duplicates, and re-join with commas.
normalize_list() {
    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
}

# Succeed when $1 is an element of the comma-separated list $2.
# The quoted pattern makes the comparison literal (no regex/glob matching).
contains_csv() {
    case ",${2:-}," in
        *",$1,"*) return 0 ;;
    esac
    return 1
}

# Print the path of the first dcgmproftester binary found in PATH
# (unversioned name first, then 13, 12, 11); fail if none exists.
resolve_dcgmproftester() {
    for candidate in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
        if command -v "${candidate}" >/dev/null 2>&1; then
            command -v "${candidate}"
            return 0
        fi
    done
    return 1
}

# Stop anything still running and remove the temp directory. Runs on every
# exit path, including INT/TERM, so no worker keeps loading a GPU after the
# script itself has gone away.
cleanup() {
    for stale_pid in ${NAP_PID} ${WORKER_PIDS}; do
        kill "${stale_pid}" 2>/dev/null || true
    done
    [ -z "${TMP_DIR}" ] || rm -rf "${TMP_DIR}"
}

# Sleep for $1 seconds in a way that a trapped signal can interrupt at once
# (the shell defers traps while it waits on a foreground child).
nap() {
    sleep "$1" &
    NAP_PID=$!
    wait "${NAP_PID}" || true
    NAP_PID=""
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
done

# Both values feed shell arithmetic below; reject anything but plain integers.
is_uint "${DURATION_SEC}" || { echo "invalid --seconds value: ${DURATION_SEC}" >&2; usage; }
is_uint "${STAGGER_SECONDS}" || { echo "invalid --stagger-seconds value: ${STAGGER_SECONDS}" >&2; usage; }

PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
fi

# FINAL = SELECTED minus EXCLUDE, order preserved.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

echo "loader=dcgmproftester-staggered"
echo "selected_gpus=${FINAL}"
echo "stagger_seconds=${STAGGER_SECONDS}"

# Make CUDA number devices in PCI bus order so the index placed in
# CUDA_VISIBLE_DEVICES refers to the same GPU as the nvidia-smi index
# (bee-gpu-burn exports the same setting).
export CUDA_DEVICE_ORDER="PCI_BUS_ID"

TMP_DIR=$(mktemp -d)
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
gpu_pos=0
WORKERS=""
for id in $(echo "${FINAL}" | tr ',' ' '); do
    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    # Workers started earlier run longer by one stagger interval per launch
    # still to come, so all of them end together.
    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
    gpu_seconds=$(( DURATION_SEC + extra_sec ))
    echo "starting gpu ${id} seconds=${gpu_seconds}"
    # NOTE(review): "-t 1004" presumably selects the DCGM profiling field to
    # generate load for — confirm against the dcgmproftester documentation.
    CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
    pid=$!
    WORKER_PIDS="${WORKER_PIDS} ${pid}"
    WORKERS="${WORKERS} ${pid}:${id}"
    # No delay after the last launch.
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
        nap "${STAGGER_SECONDS}"
    fi
done

status=0
for spec in ${WORKERS}; do
    pid=${spec%%:*}
    id=${spec#*:}
    log="${TMP_DIR}/gpu-${id}.log"
    if wait "${pid}"; then
        echo "gpu ${id} finished: OK"
    else
        rc=$?
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi
    sed "s/^/[gpu ${id}] /" "${log}" || true
done

# Every worker has been reaped; leave nothing for cleanup to signal.
WORKER_PIDS=""

exit "${status}"
|
||||||
17
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
17
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
@@ -2,13 +2,14 @@
|
|||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
SECONDS=5
|
SECONDS=5
|
||||||
|
STAGGER_SECONDS=0
|
||||||
SIZE_MB=0
|
SIZE_MB=0
|
||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||||
exit 2
|
exit 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -25,6 +26,7 @@ contains_csv() {
|
|||||||
while [ "$#" -gt 0 ]; do
|
while [ "$#" -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||||
|
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||||
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
||||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
@@ -61,14 +63,18 @@ done
|
|||||||
|
|
||||||
echo "loader=bee-gpu-burn"
|
echo "loader=bee-gpu-burn"
|
||||||
echo "selected_gpus=${FINAL}"
|
echo "selected_gpus=${FINAL}"
|
||||||
|
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||||
|
|
||||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
|
||||||
TMP_DIR=$(mktemp -d)
|
TMP_DIR=$(mktemp -d)
|
||||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||||
|
|
||||||
|
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||||
|
gpu_pos=0
|
||||||
WORKERS=""
|
WORKERS=""
|
||||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||||
|
gpu_pos=$((gpu_pos + 1))
|
||||||
log="${TMP_DIR}/gpu-${id}.log"
|
log="${TMP_DIR}/gpu-${id}.log"
|
||||||
gpu_size_mb="${SIZE_MB}"
|
gpu_size_mb="${SIZE_MB}"
|
||||||
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
||||||
@@ -79,11 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
|||||||
gpu_size_mb=512
|
gpu_size_mb=512
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
|
||||||
|
gpu_seconds=$(( SECONDS + extra_sec ))
|
||||||
|
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
|
||||||
CUDA_VISIBLE_DEVICES="${id}" \
|
CUDA_VISIBLE_DEVICES="${id}" \
|
||||||
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||||
pid=$!
|
pid=$!
|
||||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||||
|
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||||
|
sleep "${STAGGER_SECONDS}"
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
status=0
|
status=0
|
||||||
|
|||||||
@@ -1,97 +0,0 @@
|
|||||||
#!/bin/sh
# bee-hpl — run HPL (High Performance LINPACK) with auto-sized problem.
#
# Generates HPL.dat based on available RAM, runs xhpl, and prints standard
# HPL output. The WR... line with Gflops is parsed by the bee audit tool.
#
# Usage: bee-hpl [--mem-fraction 0.80] [--nb 256] [--seconds N]
#
#   --mem-fraction  fraction of total RAM to use for the matrix (default 0.80);
#                   must be greater than 0 and at most 1
#   --nb            block size; 256 is good for modern CPUs (default 256);
#                   must be a positive integer
#   --seconds       ignored — HPL runtime is determined by problem size; kept
#                   for interface compatibility with other bee stress tools
#
# Exit status: xhpl's own status on a completed run, 1 when xhpl or MemTotal
# is unavailable, 2 on usage or argument errors.

set -eu

XHPL="/usr/local/lib/bee/xhpl"
MEM_FRACTION="0.80"
NB=256

# Print the command-line synopsis to stderr and exit with status 2.
usage() {
    echo "usage: $0 [--mem-fraction 0.80] [--nb 256] [--seconds N]" >&2
    exit 2
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --mem-fraction) [ "$#" -ge 2 ] || usage; MEM_FRACTION="$2"; shift 2 ;;
        --nb) [ "$#" -ge 2 ] || usage; NB="$2"; shift 2 ;;
        --seconds) [ "$#" -ge 2 ] || usage; shift 2 ;; # accepted, ignored
        *) usage ;;
    esac
done

# NB is used as a divisor when sizing the matrix: it must be a positive integer.
case "${NB}" in
    ''|*[!0-9]*) echo "ERROR: --nb must be a positive integer (got '${NB}')" >&2; exit 2 ;;
esac
[ "${NB}" -gt 0 ] || { echo "ERROR: --nb must be a positive integer (got '${NB}')" >&2; exit 2; }

# MEM_FRACTION must be a plain decimal number in (0, 1].
awk -v f="${MEM_FRACTION}" 'BEGIN { exit !(f ~ /^[0-9]*[.]?[0-9]+$/ && f + 0 > 0 && f + 0 <= 1) }' \
    || { echo "ERROR: --mem-fraction must be a number in (0, 1] (got '${MEM_FRACTION}')" >&2; exit 2; }

[ -x "${XHPL}" ] || { echo "ERROR: xhpl not found at ${XHPL}" >&2; exit 1; }

# Detect total RAM in bytes (MemTotal in /proc/meminfo is reported in kB).
TOTAL_KB=$(awk '/^MemTotal:/ { print $2; exit }' /proc/meminfo)
[ -n "${TOTAL_KB}" ] || { echo "ERROR: cannot read MemTotal from /proc/meminfo" >&2; exit 1; }
TOTAL_BYTES=$(( TOTAL_KB * 1024 ))

# N = floor(sqrt(fraction * total_bytes / 8)) rounded down to a multiple of NB,
# with a floor of one block. 8 is the size in bytes of one double-precision
# matrix element. awk is used for the floating-point sqrt.
N=$(awk -v total="${TOTAL_BYTES}" -v frac="${MEM_FRACTION}" -v nb="${NB}" '
BEGIN {
    raw = int(sqrt(total * frac / 8.0))
    n = int(raw / nb) * nb
    if (n < nb) n = nb
    print n
}')
[ -n "${N}" ] || { echo "ERROR: could not compute HPL problem size" >&2; exit 1; }

echo "loader=bee-hpl"
echo "total_ram_mb=$(( TOTAL_KB / 1024 ))"
echo "matrix_n=${N}"
echo "block_nb=${NB}"
echo "mem_fraction=${MEM_FRACTION}"

# Generate HPL.dat in a temp directory and run from there.
RUNDIR=$(mktemp -d)
# Remove the run directory on every exit path; INT/TERM are turned into a real
# exit so the script does not keep going after its handler has run.
trap 'rm -rf "${RUNDIR}"' EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Single problem size, single block size, 1x1 process grid.
cat > "${RUNDIR}/HPL.dat" <<DAT
HPLinpack benchmark input file
Innovative Computing Laboratory, University of Tennessee
HPL.out output file name (if any)
6 device out (6=stdout, 7=stderr, file)
1 # of problems sizes (N)
${N} Ns
1 # of NBs
${NB} NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
1 Ps
1 Qs
16.0 threshold
1 # of panel fact
2 PFACTs (0=left, 1=Crout, 2=Right)
1 # of recursive stopping criterium
4 NBMINs (>= 1)
1 # of panels in recursion
2 NDIVs
1 # of recursive panel fact.
1 RFACTs (0=left, 1=Crout, 2=Right)
1 # of broadcast
1 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
1 # of lookahead depth
1 DEPTHs (>=0)
2 SWAP (0=bin-exch,1=long,2=mix)
64 swapping threshold
0 L1 in (0=transposed,1=no-transposed) form
0 U in (0=transposed,1=no-transposed) form
1 Equilibration (0=no,1=yes)
8 memory alignment in double (> 0)
DAT

# xhpl is run from RUNDIR, where the generated HPL.dat lives.
cd "${RUNDIR}"
echo "---"
"${XHPL}"
|
|
||||||
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
@@ -2,6 +2,7 @@
|
|||||||
set -eu
|
set -eu
|
||||||
|
|
||||||
DURATION_SEC=300
|
DURATION_SEC=300
|
||||||
|
STAGGER_SECONDS=0
|
||||||
DEVICES=""
|
DEVICES=""
|
||||||
EXCLUDE=""
|
EXCLUDE=""
|
||||||
FORMAT=""
|
FORMAT=""
|
||||||
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
|||||||
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||||
exit 2
|
exit 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -118,6 +119,7 @@ ensure_opencl_ready() {
|
|||||||
while [ "$#" -gt 0 ]; do
|
while [ "$#" -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
||||||
|
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||||
@@ -170,6 +172,7 @@ done
|
|||||||
echo "loader=john"
|
echo "loader=john"
|
||||||
echo "selected_gpus=${FINAL}"
|
echo "selected_gpus=${FINAL}"
|
||||||
echo "john_devices=${JOHN_DEVICES}"
|
echo "john_devices=${JOHN_DEVICES}"
|
||||||
|
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||||
|
|
||||||
cd "${JOHN_DIR}"
|
cd "${JOHN_DIR}"
|
||||||
|
|
||||||
@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
|
|||||||
echo "format=${CHOSEN_FORMAT}"
|
echo "format=${CHOSEN_FORMAT}"
|
||||||
echo "target_seconds=${DURATION_SEC}"
|
echo "target_seconds=${DURATION_SEC}"
|
||||||
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
||||||
DEADLINE=$(( $(date +%s) + DURATION_SEC ))
|
TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||||
_first=1
|
_first=1
|
||||||
|
pos=0
|
||||||
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||||
|
pos=$((pos + 1))
|
||||||
[ "${_first}" = "1" ] || sleep 3
|
[ "${_first}" = "1" ] || sleep 3
|
||||||
_first=0
|
_first=0
|
||||||
run_john_loop "${opencl_id}" "${DEADLINE}" &
|
extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
|
||||||
|
deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
|
||||||
|
run_john_loop "${opencl_id}" "${deadline}" &
|
||||||
pid=$!
|
pid=$!
|
||||||
PIDS="${PIDS} ${pid}"
|
PIDS="${PIDS} ${pid}"
|
||||||
|
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
|
||||||
|
sleep "${STAGGER_SECONDS}"
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
FAIL=0
|
FAIL=0
|
||||||
for pid in ${PIDS}; do
|
for pid in ${PIDS}; do
|
||||||
|
|||||||
@@ -21,8 +21,13 @@ read_nvidia_modules_flavor() {
|
|||||||
|
|
||||||
log "kernel: $(uname -r)"
|
log "kernel: $(uname -r)"
|
||||||
|
|
||||||
# Skip if no NVIDIA GPU present (PCI vendor 10de)
|
# Skip if no NVIDIA display/compute GPU is present.
|
||||||
if ! lspci -nn 2>/dev/null | grep -qi '10de:'; then
|
# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
|
||||||
|
have_nvidia_gpu() {
|
||||||
|
lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||||
|
}
|
||||||
|
|
||||||
|
if ! have_nvidia_gpu; then
|
||||||
log "no NVIDIA GPU detected — skipping module load"
|
log "no NVIDIA GPU detected — skipping module load"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ log() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
have_nvidia_gpu() {
|
have_nvidia_gpu() {
|
||||||
lspci -nn 2>/dev/null | grep -qi '10de:'
|
lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||||
}
|
}
|
||||||
|
|
||||||
service_active() {
|
service_active() {
|
||||||
|
|||||||
Reference in New Issue
Block a user