Compare commits
20 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f | ||
| 025548ab3c | |||
|
|
e0d94d7f47 | ||
|
|
13899aa864 | ||
|
|
f345d8a89d | ||
|
|
4715059ac0 | ||
|
|
0660a40287 | ||
|
|
67369d9b7b | ||
|
|
3f41a026ca | ||
|
|
0ee4f46537 | ||
| 8db40b098a | |||
| 16e7ae00e7 | |||
| b2f8626fee | |||
| dd26e03b2d | |||
| 6937a4c6ec | |||
| b9be93c213 | |||
| d1a22d782d |
@@ -382,9 +382,9 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||
}
|
||||
case "memory":
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", 256, 1, logLine)
|
||||
case "storage":
|
||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", logLine)
|
||||
archive, err = application.RunStorageAcceptancePackCtx(context.Background(), "", false, logLine)
|
||||
case "cpu":
|
||||
dur := *duration
|
||||
if dur <= 0 {
|
||||
|
||||
@@ -117,15 +117,15 @@ type satRunner interface {
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error)
|
||||
ResetNvidiaGPU(index int) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
DetectGPUVendor() string
|
||||
@@ -566,11 +566,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
@@ -602,14 +602,14 @@ func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts p
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, logFunc)
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
@@ -634,14 +634,14 @@ func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (Actio
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, logFunc)
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
|
||||
@@ -161,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaComputeFn != nil {
|
||||
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
@@ -217,11 +217,11 @@ func (f fakeSAT) ResetNvidiaGPU(index int) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunMemoryAcceptancePack(_ context.Context, baseDir string, _, _ int, _ func(string)) (string, error) {
|
||||
return f.runMemoryFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ bool, _ func(string)) (string, error) {
|
||||
return f.runStorageFn(baseDir)
|
||||
}
|
||||
|
||||
@@ -542,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
@@ -580,8 +578,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
@@ -643,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
|
||||
@@ -54,7 +54,7 @@ if ! command -v lspci >/dev/null 2>&1; then
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
|
||||
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||
found=1
|
||||
echo "=== GPU $gpu ==="
|
||||
lspci -s "$gpu" -vv 2>&1 || true
|
||||
@@ -73,8 +73,13 @@ fi
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
dev=$(basename "$d")
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
class=$(cat "$d/class" 2>/dev/null)
|
||||
case "$class" in
|
||||
0x030000|0x030200) ;;
|
||||
*) continue ;;
|
||||
esac
|
||||
dev=$(basename "$d")
|
||||
echo "=== $dev ==="
|
||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
@@ -192,7 +197,7 @@ var supportBundleOptionalFiles = []struct {
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
}
|
||||
|
||||
const supportBundleGlob = "bee-support-*.tar.gz"
|
||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||
|
||||
func BuildSupportBundle(exportDir string) (string, error) {
|
||||
exportDir = strings.TrimSpace(exportDir)
|
||||
@@ -206,9 +211,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
host := sanitizeFilename(hostnameOr("unknown"))
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
|
||||
now := time.Now().UTC()
|
||||
date := now.Format("2006-01-02")
|
||||
tod := now.Format("15:04:05")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -240,7 +250,8 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
|
||||
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
||||
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -397,6 +408,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
return os.WriteFile(dst, []byte(body.String()), 0644)
|
||||
}
|
||||
|
||||
func bundleVersion() string {
|
||||
v := buildVersion()
|
||||
v = strings.TrimPrefix(v, "v")
|
||||
v = strings.TrimPrefix(v, "V")
|
||||
if v == "" || v == "unknown" {
|
||||
return "0.0"
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// serverModelForBundle returns the "Product Name" reported by
// `dmidecode -t 1`, prepared for use as one component of the support-bundle
// archive file name: spaces become underscores, and path separators and
// control characters are replaced with underscores so the value can never
// introduce an extra directory level into the archive path.
//
// It returns "unknown" when dmidecode cannot be run or reports no usable
// product name.
func serverModelForBundle() string {
	return strings.ReplaceAll(dmiSystemFieldForBundle("Product Name"), " ", "_")
}

// serverSerialForBundle returns the "Serial Number" reported by
// `dmidecode -t 1`, with path separators and control characters replaced with
// underscores so it is safe inside the support-bundle archive file name.
// Spaces are kept as reported.
//
// It returns "unknown" when dmidecode cannot be run or reports no usable
// serial number.
func serverSerialForBundle() string {
	return dmiSystemFieldForBundle("Serial Number")
}

// dmiSystemFieldForBundle runs `dmidecode -t 1` and extracts the value of the
// given key from its output. Any failure to run the command yields "unknown";
// bundle creation is best-effort with respect to hardware identity.
func dmiSystemFieldForBundle(key string) string {
	raw, err := exec.Command("dmidecode", "-t", "1").Output()
	if err != nil {
		return "unknown"
	}
	return parseDMIFieldForBundle(string(raw), key)
}

// parseDMIFieldForBundle scans "Key: Value" lines of dmidecode output and
// returns the file-name-safe value of the first line whose key equals key.
// It returns "unknown" when the key is absent or its value is blank.
func parseDMIFieldForBundle(output, key string) string {
	for _, line := range strings.Split(output, "\n") {
		k, v, ok := strings.Cut(strings.TrimSpace(line), ": ")
		if !ok || strings.TrimSpace(k) != key {
			continue
		}
		v = strings.TrimSpace(v)
		if v == "" {
			return "unknown"
		}
		return bundleNameSafe(v)
	}
	return "unknown"
}

// bundleNameSafe replaces characters that must not appear inside a single
// file-name component (path separators and ASCII control characters) with
// underscores. Characters are replaced rather than dropped so a non-empty
// input always yields a non-empty result.
func bundleNameSafe(s string) string {
	return strings.Map(func(r rune) rune {
		if r == '/' || r == '\\' || r < 0x20 || r == 0x7f {
			return '_'
		}
		return r
	}, s)
}
|
||||
|
||||
func buildVersion() string {
|
||||
raw, err := exec.Command("bee", "version").CombinedOutput()
|
||||
if err != nil {
|
||||
|
||||
@@ -326,8 +326,8 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
}
|
||||
|
||||
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
|
||||
if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
|
||||
return "", fmt.Errorf("write report.txt: %w", err)
|
||||
if err := os.WriteFile(filepath.Join(runDir, "report.md"), []byte(report), 0644); err != nil {
|
||||
return "", fmt.Errorf("write report.md: %w", err)
|
||||
}
|
||||
|
||||
summary := renderBenchmarkSummary(result)
|
||||
@@ -1183,18 +1183,8 @@ func queryIPMIServerPowerW() (float64, error) {
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("ipmitool dcmi power reading: %w", err)
|
||||
}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if strings.Contains(line, "Current Power") {
|
||||
parts := strings.SplitN(line, ":", 2)
|
||||
if len(parts) == 2 {
|
||||
val := strings.TrimSpace(strings.TrimSuffix(strings.TrimSpace(parts[1]), "Watts"))
|
||||
val = strings.TrimSpace(val)
|
||||
w, err := strconv.ParseFloat(val, 64)
|
||||
if err == nil && w > 0 {
|
||||
return w, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
if w := parseDCMIPowerReading(string(out)); w > 0 {
|
||||
return w, nil
|
||||
}
|
||||
return 0, fmt.Errorf("could not parse ipmitool dcmi power reading output")
|
||||
}
|
||||
|
||||
@@ -22,18 +22,53 @@ var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
||||
|
||||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
|
||||
fmt.Fprintf(&b, "===========================\n\n")
|
||||
fmt.Fprintf(&b, "Generated: %s\n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
fmt.Fprintf(&b, "Host: %s\n", result.Hostname)
|
||||
fmt.Fprintf(&b, "Profile: %s\n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "Overall status: %s\n", result.OverallStatus)
|
||||
fmt.Fprintf(&b, "Selected GPUs: %s\n", joinIndexList(result.SelectedGPUIndices))
|
||||
fmt.Fprintf(&b, "Normalization: %s\n\n", result.Normalization.Status)
|
||||
|
||||
// ── Header ────────────────────────────────────────────────────────────────
|
||||
b.WriteString("# Bee NVIDIA Benchmark Report\n\n")
|
||||
|
||||
// System identity block
|
||||
if result.ServerModel != "" {
|
||||
fmt.Fprintf(&b, "**Server:** %s \n", result.ServerModel)
|
||||
}
|
||||
if result.Hostname != "" {
|
||||
fmt.Fprintf(&b, "**Host:** %s \n", result.Hostname)
|
||||
}
|
||||
// GPU models summary
|
||||
if len(result.GPUs) > 0 {
|
||||
modelCount := make(map[string]int)
|
||||
var modelOrder []string
|
||||
for _, g := range result.GPUs {
|
||||
m := strings.TrimSpace(g.Name)
|
||||
if m == "" {
|
||||
m = "Unknown GPU"
|
||||
}
|
||||
if modelCount[m] == 0 {
|
||||
modelOrder = append(modelOrder, m)
|
||||
}
|
||||
modelCount[m]++
|
||||
}
|
||||
var parts []string
|
||||
for _, m := range modelOrder {
|
||||
if modelCount[m] == 1 {
|
||||
parts = append(parts, m)
|
||||
} else {
|
||||
parts = append(parts, fmt.Sprintf("%d× %s", modelCount[m], m))
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "**GPU(s):** %s \n", strings.Join(parts, ", "))
|
||||
}
|
||||
fmt.Fprintf(&b, "**Profile:** %s \n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "**App version:** %s \n", result.BenchmarkVersion)
|
||||
fmt.Fprintf(&b, "**Generated:** %s \n", result.GeneratedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
if result.ParallelGPUs {
|
||||
fmt.Fprintf(&b, "**Mode:** parallel (all GPUs simultaneously) \n")
|
||||
}
|
||||
fmt.Fprintf(&b, "**Overall status:** %s \n", result.OverallStatus)
|
||||
b.WriteString("\n")
|
||||
|
||||
// ── Executive Summary ─────────────────────────────────────────────────────
|
||||
if len(result.Findings) > 0 {
|
||||
fmt.Fprintf(&b, "Executive Summary\n")
|
||||
fmt.Fprintf(&b, "-----------------\n")
|
||||
b.WriteString("## Executive Summary\n\n")
|
||||
for _, finding := range result.Findings {
|
||||
fmt.Fprintf(&b, "- %s\n", finding)
|
||||
}
|
||||
@@ -41,149 +76,206 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
}
|
||||
|
||||
if len(result.Warnings) > 0 {
|
||||
fmt.Fprintf(&b, "Warnings\n")
|
||||
fmt.Fprintf(&b, "--------\n")
|
||||
b.WriteString("## Warnings\n\n")
|
||||
for _, warning := range result.Warnings {
|
||||
fmt.Fprintf(&b, "- %s\n", warning)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "Per GPU Scorecard\n")
|
||||
fmt.Fprintf(&b, "-----------------\n")
|
||||
// ── Scorecard table ───────────────────────────────────────────────────────
|
||||
b.WriteString("## Scorecard\n\n")
|
||||
b.WriteString("| GPU | Status | Composite | Compute | TOPS/SM/GHz | Power Sustain | Thermal Sustain | Stability | Interconnect |\n")
|
||||
b.WriteString("|-----|--------|-----------|---------|-------------|---------------|-----------------|-----------|-------------|\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
fmt.Fprintf(&b, "GPU %d %s\n", gpu.Index, gpu.Name)
|
||||
fmt.Fprintf(&b, " Status: %s\n", gpu.Status)
|
||||
fmt.Fprintf(&b, " Composite score: %.2f\n", gpu.Scores.CompositeScore)
|
||||
fmt.Fprintf(&b, " Compute score: %.2f\n", gpu.Scores.ComputeScore)
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
fmt.Fprintf(&b, " Compute efficiency: %.3f TOPS/SM/GHz\n", gpu.Scores.TOPSPerSMPerGHz)
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown"
|
||||
}
|
||||
fmt.Fprintf(&b, " Power sustain: %.1f\n", gpu.Scores.PowerSustainScore)
|
||||
fmt.Fprintf(&b, " Thermal sustain: %.1f\n", gpu.Scores.ThermalSustainScore)
|
||||
fmt.Fprintf(&b, " Stability: %.1f\n", gpu.Scores.StabilityScore)
|
||||
interconnect := "-"
|
||||
if gpu.Scores.InterconnectScore > 0 {
|
||||
fmt.Fprintf(&b, " Interconnect: %.1f\n", gpu.Scores.InterconnectScore)
|
||||
interconnect = fmt.Sprintf("%.1f", gpu.Scores.InterconnectScore)
|
||||
}
|
||||
if len(gpu.DegradationReasons) > 0 {
|
||||
fmt.Fprintf(&b, " Degradation reasons: %s\n", strings.Join(gpu.DegradationReasons, ", "))
|
||||
topsPerSM := "-"
|
||||
if gpu.Scores.TOPSPerSMPerGHz > 0 {
|
||||
topsPerSM = fmt.Sprintf("%.3f", gpu.Scores.TOPSPerSMPerGHz)
|
||||
}
|
||||
fmt.Fprintf(&b, " Avg power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.AvgPowerW, gpu.Steady.AvgTempC, gpu.Steady.AvgGraphicsClockMHz)
|
||||
fmt.Fprintf(&b, " P95 power/temp/clock: %.1f W / %.1f C / %.0f MHz\n", gpu.Steady.P95PowerW, gpu.Steady.P95TempC, gpu.Steady.P95GraphicsClockMHz)
|
||||
if len(gpu.PrecisionResults) > 0 {
|
||||
fmt.Fprintf(&b, " Precision results:\n")
|
||||
for _, precision := range gpu.PrecisionResults {
|
||||
if precision.Supported {
|
||||
fmt.Fprintf(&b, " - %s: %.2f TOPS lanes=%d iterations=%d\n", precision.Name, precision.TeraOpsPerSec, precision.Lanes, precision.Iterations)
|
||||
} else {
|
||||
fmt.Fprintf(&b, " - %s: unsupported (%s)\n", precision.Name, precision.Notes)
|
||||
}
|
||||
}
|
||||
fmt.Fprintf(&b, "| GPU %d %s | %s | **%.2f** | %.2f | %s | %.1f | %.1f | %.1f | %s |\n",
|
||||
gpu.Index, name,
|
||||
gpu.Status,
|
||||
gpu.Scores.CompositeScore,
|
||||
gpu.Scores.ComputeScore,
|
||||
topsPerSM,
|
||||
gpu.Scores.PowerSustainScore,
|
||||
gpu.Scores.ThermalSustainScore,
|
||||
gpu.Scores.StabilityScore,
|
||||
interconnect,
|
||||
)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// ── Per GPU detail ────────────────────────────────────────────────────────
|
||||
b.WriteString("## Per-GPU Details\n\n")
|
||||
for _, gpu := range result.GPUs {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown GPU"
|
||||
}
|
||||
fmt.Fprintf(&b, " Throttle: %s\n", formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec))
|
||||
if len(gpu.Notes) > 0 {
|
||||
fmt.Fprintf(&b, " Notes:\n")
|
||||
for _, note := range gpu.Notes {
|
||||
fmt.Fprintf(&b, " - %s\n", note)
|
||||
}
|
||||
fmt.Fprintf(&b, "### GPU %d — %s\n\n", gpu.Index, name)
|
||||
|
||||
// Identity
|
||||
if gpu.BusID != "" {
|
||||
fmt.Fprintf(&b, "- **Bus ID:** %s\n", gpu.BusID)
|
||||
}
|
||||
if gpu.VBIOS != "" {
|
||||
fmt.Fprintf(&b, "- **vBIOS:** %s\n", gpu.VBIOS)
|
||||
}
|
||||
if gpu.ComputeCapability != "" {
|
||||
fmt.Fprintf(&b, "- **Compute capability:** %s\n", gpu.ComputeCapability)
|
||||
}
|
||||
if gpu.MultiprocessorCount > 0 {
|
||||
fmt.Fprintf(&b, "- **SMs:** %d\n", gpu.MultiprocessorCount)
|
||||
}
|
||||
if gpu.PowerLimitW > 0 {
|
||||
fmt.Fprintf(&b, "- **Power limit:** %.0f W (default %.0f W)\n", gpu.PowerLimitW, gpu.DefaultPowerLimitW)
|
||||
}
|
||||
if gpu.LockedGraphicsClockMHz > 0 {
|
||||
fmt.Fprintf(&b, "- **Locked clocks:** GPU %.0f MHz / Mem %.0f MHz\n", gpu.LockedGraphicsClockMHz, gpu.LockedMemoryClockMHz)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
|
||||
// Steady-state telemetry
|
||||
fmt.Fprintf(&b, "**Steady-state telemetry** (%ds):\n\n", int(gpu.Steady.DurationSec))
|
||||
b.WriteString("| | Avg | P95 |\n|---|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Power | %.1f W | %.1f W |\n", gpu.Steady.AvgPowerW, gpu.Steady.P95PowerW)
|
||||
fmt.Fprintf(&b, "| Temperature | %.1f °C | %.1f °C |\n", gpu.Steady.AvgTempC, gpu.Steady.P95TempC)
|
||||
fmt.Fprintf(&b, "| GPU clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgGraphicsClockMHz, gpu.Steady.P95GraphicsClockMHz)
|
||||
fmt.Fprintf(&b, "| Memory clock | %.0f MHz | %.0f MHz |\n", gpu.Steady.AvgMemoryClockMHz, gpu.Steady.P95MemoryClockMHz)
|
||||
fmt.Fprintf(&b, "| GPU utilisation | %.1f %% | — |\n", gpu.Steady.AvgUsagePct)
|
||||
b.WriteString("\n")
|
||||
|
||||
// Throttle
|
||||
throttle := formatThrottleLine(gpu.Throttle, gpu.Steady.DurationSec)
|
||||
if throttle != "none" {
|
||||
fmt.Fprintf(&b, "**Throttle:** %s\n\n", throttle)
|
||||
}
|
||||
|
||||
// Precision results
|
||||
if len(gpu.PrecisionResults) > 0 {
|
||||
b.WriteString("**Precision results:**\n\n")
|
||||
b.WriteString("| Precision | TOPS | Lanes | Iterations |\n|-----------|------|-------|------------|\n")
|
||||
for _, p := range gpu.PrecisionResults {
|
||||
if p.Supported {
|
||||
fmt.Fprintf(&b, "| %s | %.2f | %d | %d |\n", p.Name, p.TeraOpsPerSec, p.Lanes, p.Iterations)
|
||||
} else {
|
||||
fmt.Fprintf(&b, "| %s | — (unsupported) | — | — |\n", p.Name)
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
// Degradation / Notes
|
||||
if len(gpu.DegradationReasons) > 0 {
|
||||
fmt.Fprintf(&b, "**Degradation reasons:** %s\n\n", strings.Join(gpu.DegradationReasons, ", "))
|
||||
}
|
||||
if len(gpu.Notes) > 0 {
|
||||
b.WriteString("**Notes:**\n\n")
|
||||
for _, note := range gpu.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Interconnect ──────────────────────────────────────────────────────────
|
||||
if result.Interconnect != nil {
|
||||
fmt.Fprintf(&b, "Interconnect\n")
|
||||
fmt.Fprintf(&b, "------------\n")
|
||||
fmt.Fprintf(&b, "Status: %s\n", result.Interconnect.Status)
|
||||
b.WriteString("## Interconnect (NCCL)\n\n")
|
||||
fmt.Fprintf(&b, "**Status:** %s\n\n", result.Interconnect.Status)
|
||||
if result.Interconnect.Supported {
|
||||
fmt.Fprintf(&b, "Avg algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.AvgBusBWGBps)
|
||||
fmt.Fprintf(&b, "Max algbw / busbw: %.1f / %.1f GB/s\n", result.Interconnect.MaxAlgBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||
b.WriteString("| Metric | Avg | Max |\n|--------|-----|-----|\n")
|
||||
fmt.Fprintf(&b, "| Alg BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgAlgBWGBps, result.Interconnect.MaxAlgBWGBps)
|
||||
fmt.Fprintf(&b, "| Bus BW | %.1f GB/s | %.1f GB/s |\n", result.Interconnect.AvgBusBWGBps, result.Interconnect.MaxBusBWGBps)
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range result.Interconnect.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
if len(result.Interconnect.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Server Power (IPMI) ───────────────────────────────────────────────────
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
b.WriteString("## Server Power (IPMI)\n\n")
|
||||
if !sp.Available {
|
||||
b.WriteString("IPMI power measurement unavailable.\n\n")
|
||||
} else {
|
||||
b.WriteString("| | Value |\n|---|---|\n")
|
||||
fmt.Fprintf(&b, "| Server idle | %.0f W |\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, "| Server under load | %.0f W |\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, "| Server delta (load − idle) | %.0f W |\n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, "| GPU-reported sum | %.0f W |\n", sp.GPUReportedSumW)
|
||||
if sp.ReportingRatio > 0 {
|
||||
fmt.Fprintf(&b, "| Reporting ratio | %.2f (1.0 = accurate, <0.75 = GPU over-reports) |\n", sp.ReportingRatio)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, "- %s\n", note)
|
||||
}
|
||||
if len(sp.Notes) > 0 {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Terminal charts (steady-state only) ───────────────────────────────────
|
||||
if len(charts) > 0 {
|
||||
fmt.Fprintf(&b, "Terminal Charts\n")
|
||||
fmt.Fprintf(&b, "---------------\n")
|
||||
b.WriteString("## Steady-State Charts\n\n")
|
||||
for _, chart := range charts {
|
||||
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
||||
if content == "" {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, "%s\n", chart.Title)
|
||||
fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title)))
|
||||
fmt.Fprintf(&b, "%s\n\n", content)
|
||||
fmt.Fprintf(&b, "### %s\n\n```\n%s\n```\n\n", chart.Title, content)
|
||||
}
|
||||
}
|
||||
|
||||
if sp := result.ServerPower; sp != nil {
|
||||
fmt.Fprintf(&b, "Server Power (IPMI)\n")
|
||||
fmt.Fprintf(&b, "-------------------\n")
|
||||
if !sp.Available {
|
||||
fmt.Fprintf(&b, "Unavailable\n")
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Server idle: %.0f W\n", sp.IdleW)
|
||||
fmt.Fprintf(&b, " Server under load: %.0f W\n", sp.LoadedW)
|
||||
fmt.Fprintf(&b, " Server delta: %.0f W\n", sp.DeltaW)
|
||||
fmt.Fprintf(&b, " GPU reported (sum): %.0f W\n", sp.GPUReportedSumW)
|
||||
if sp.ReportingRatio > 0 {
|
||||
fmt.Fprintf(&b, " Reporting ratio: %.2f (1.0 = accurate, <0.75 = GPU over-reports)\n", sp.ReportingRatio)
|
||||
}
|
||||
}
|
||||
for _, note := range sp.Notes {
|
||||
fmt.Fprintf(&b, " Note: %s\n", note)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
// ── Methodology ───────────────────────────────────────────────────────────
|
||||
b.WriteString("## Methodology\n\n")
|
||||
fmt.Fprintf(&b, "- Profile `%s` uses standardized baseline → warmup → steady-state → interconnect → cooldown phases.\n", result.BenchmarkProfile)
|
||||
b.WriteString("- Single-GPU compute score from bee-gpu-burn cuBLASLt when available.\n")
|
||||
b.WriteString("- Thermal and power limitations inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
||||
b.WriteString("- `result.json` is the canonical machine-readable source for this benchmark run.\n\n")
|
||||
|
||||
fmt.Fprintf(&b, "Methodology\n")
|
||||
fmt.Fprintf(&b, "-----------\n")
|
||||
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
||||
fmt.Fprintf(&b, "- Single-GPU compute score comes from bee-gpu-burn cuBLASLt output when available.\n")
|
||||
fmt.Fprintf(&b, "- Thermal and power limitations are inferred from NVIDIA clock event reason counters and sustained telemetry.\n")
|
||||
fmt.Fprintf(&b, "- result.json is the canonical machine-readable source for this benchmark run.\n\n")
|
||||
|
||||
fmt.Fprintf(&b, "Raw Files\n")
|
||||
fmt.Fprintf(&b, "---------\n")
|
||||
fmt.Fprintf(&b, "- result.json\n")
|
||||
fmt.Fprintf(&b, "- report.txt\n")
|
||||
fmt.Fprintf(&b, "- summary.txt\n")
|
||||
fmt.Fprintf(&b, "- verbose.log\n")
|
||||
fmt.Fprintf(&b, "- gpu-*-baseline-metrics.csv/html/term.txt\n")
|
||||
fmt.Fprintf(&b, "- gpu-*-warmup.log\n")
|
||||
fmt.Fprintf(&b, "- gpu-*-steady.log\n")
|
||||
fmt.Fprintf(&b, "- gpu-*-steady-metrics.csv/html/term.txt\n")
|
||||
fmt.Fprintf(&b, "- gpu-*-cooldown-metrics.csv/html/term.txt\n")
|
||||
// ── Raw files ─────────────────────────────────────────────────────────────
|
||||
b.WriteString("## Raw Files\n\n")
|
||||
b.WriteString("- `result.json`\n- `report.md`\n- `summary.txt`\n- `verbose.log`\n")
|
||||
b.WriteString("- `gpu-*-baseline-metrics.csv/html/term.txt`\n")
|
||||
b.WriteString("- `gpu-*-warmup.log`\n")
|
||||
b.WriteString("- `gpu-*-steady.log`\n")
|
||||
b.WriteString("- `gpu-*-steady-metrics.csv/html/term.txt`\n")
|
||||
b.WriteString("- `gpu-*-cooldown-metrics.csv/html/term.txt`\n")
|
||||
if result.Interconnect != nil {
|
||||
fmt.Fprintf(&b, "- nccl-all-reduce.log\n")
|
||||
b.WriteString("- `nccl-all-reduce.log`\n")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// loadBenchmarkReportCharts loads only steady-state terminal charts (baseline and
|
||||
// cooldown charts are not useful for human review).
|
||||
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
||||
phases := []struct {
|
||||
name string
|
||||
label string
|
||||
}{
|
||||
{name: "baseline", label: "Baseline"},
|
||||
{name: "steady", label: "Steady State"},
|
||||
{name: "cooldown", label: "Cooldown"},
|
||||
}
|
||||
var charts []benchmarkReportChart
|
||||
for _, idx := range gpuIndices {
|
||||
for _, phase := range phases {
|
||||
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-%s-metrics-term.txt", idx, phase.name))
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil || len(raw) == 0 {
|
||||
continue
|
||||
}
|
||||
charts = append(charts, benchmarkReportChart{
|
||||
Title: fmt.Sprintf("GPU %d %s", idx, phase.label),
|
||||
Content: string(raw),
|
||||
})
|
||||
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-steady-metrics-term.txt", idx))
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil || len(raw) == 0 {
|
||||
continue
|
||||
}
|
||||
charts = append(charts, benchmarkReportChart{
|
||||
Title: fmt.Sprintf("GPU %d — Steady State", idx),
|
||||
Content: string(raw),
|
||||
})
|
||||
}
|
||||
return charts
|
||||
}
|
||||
|
||||
@@ -137,8 +137,9 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
for _, needle := range []string{
|
||||
"Executive Summary",
|
||||
"GPU 0 spent measurable time under SW power cap.",
|
||||
"Composite score: 1176.00",
|
||||
"fp16_tensor: 700.00 TOPS",
|
||||
"1176.00",
|
||||
"fp16_tensor",
|
||||
"700.00",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
@@ -164,7 +165,7 @@ func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
|
||||
})
|
||||
|
||||
for _, needle := range []string{
|
||||
"Terminal Charts",
|
||||
"Steady-State Charts",
|
||||
"GPU 0 Steady State",
|
||||
"GPU 0 chart",
|
||||
"42┤───",
|
||||
|
||||
@@ -383,10 +383,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
}
|
||||
|
||||
const (
|
||||
ansiRed = "\033[31m"
|
||||
ansiBlue = "\033[34m"
|
||||
ansiGreen = "\033[32m"
|
||||
ansiYellow = "\033[33m"
|
||||
ansiAmber = "\033[38;5;214m"
|
||||
ansiReset = "\033[0m"
|
||||
)
|
||||
|
||||
@@ -415,10 +412,10 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||
fn func(GPUMetricRow) float64
|
||||
}
|
||||
defs := []seriesDef{
|
||||
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||
{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||
{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||
{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||
{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
|
||||
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"bee-john-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
|
||||
@@ -384,25 +384,39 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
var (
|
||||
profCmd []string
|
||||
profEnv []string
|
||||
)
|
||||
if staggerSec > 0 && len(selected) > 1 {
|
||||
profCmd = []string{
|
||||
"bee-dcgmproftester-staggered",
|
||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||
"--stagger-seconds", strconv.Itoa(staggerSec),
|
||||
"--devices", joinIndexList(selected),
|
||||
}
|
||||
} else {
|
||||
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||
satJob{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: nvidiaVisibleDevicesEnv(selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: profEnv,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
@@ -531,9 +545,13 @@ func memoryStressSizeArg() string {
|
||||
return fmt.Sprintf("%dM", targetMB)
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if sizeMB <= 0 {
|
||||
sizeMB = 256
|
||||
}
|
||||
if passes <= 0 {
|
||||
passes = 1
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-memtester.log", cmd: []string{"memtester", fmt.Sprintf("%dM", sizeMB), fmt.Sprintf("%d", passes)}},
|
||||
@@ -590,7 +608,7 @@ func (s *System) RunCPUAcceptancePack(ctx context.Context, baseDir string, durat
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
@@ -622,7 +640,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, l
|
||||
break
|
||||
}
|
||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||
commands := storageSATCommands(devPath)
|
||||
commands := storageSATCommands(devPath, extended)
|
||||
for cmdIndex, job := range commands {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
@@ -1086,17 +1104,25 @@ func listStorageDevices() ([]string, error) {
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
func storageSATCommands(devPath string) []satJob {
|
||||
func storageSATCommands(devPath string, extended bool) []satJob {
|
||||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||
selfTestLevel := "1"
|
||||
if extended {
|
||||
selfTestLevel = "2"
|
||||
}
|
||||
return []satJob{
|
||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||||
}
|
||||
}
|
||||
smartTestType := "short"
|
||||
if extended {
|
||||
smartTestType = "long"
|
||||
}
|
||||
return []satJob{
|
||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}},
|
||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ type FanStressOptions struct {
|
||||
Phase1DurSec int // first load phase duration in seconds (default 300)
|
||||
PauseSec int // pause between the two load phases (default 60)
|
||||
Phase2DurSec int // second load phase duration in seconds (default 300)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (default 64)
|
||||
SizeMB int // GPU memory to allocate per GPU during stress (0 = auto: 95% of VRAM)
|
||||
GPUIndices []int // which GPU indices to stress (empty = all detected)
|
||||
}
|
||||
|
||||
@@ -243,9 +243,8 @@ func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
if opts.Phase2DurSec <= 0 {
|
||||
opts.Phase2DurSec = 300
|
||||
}
|
||||
if opts.SizeMB <= 0 {
|
||||
opts.SizeMB = 64
|
||||
}
|
||||
// SizeMB == 0 means "auto" (worker picks 95% of GPU VRAM for maximum power draw).
|
||||
// Leave at 0 to avoid passing a too-small size that starves the tensor-core path.
|
||||
}
|
||||
|
||||
// sampleFanStressRow collects all metrics for one telemetry sample.
|
||||
|
||||
@@ -14,12 +14,12 @@ import (
|
||||
func TestStorageSATCommands(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvme := storageSATCommands("/dev/nvme0n1")
|
||||
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||
}
|
||||
|
||||
sata := storageSATCommands("/dev/sda")
|
||||
sata := storageSATCommands("/dev/sda", false)
|
||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||
}
|
||||
|
||||
@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
|
||||
Loader string
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
StaggerSeconds int
|
||||
}
|
||||
|
||||
func New() *System {
|
||||
|
||||
@@ -222,7 +222,21 @@ func formatSplitTaskName(baseName, selectionLabel string) string {
|
||||
}
|
||||
|
||||
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
|
||||
if !shouldSplitHomogeneousNvidiaTarget(target) {
|
||||
if !shouldSplitHomogeneousNvidiaTarget(target) || params.ParallelGPUs {
|
||||
// Parallel mode (or non-splittable target): one task for all selected GPUs.
|
||||
if params.ParallelGPUs && shouldSplitHomogeneousNvidiaTarget(target) {
|
||||
// Resolve the selected GPU indices so ExcludeGPUIndices is applied.
|
||||
gpus, err := apiListNvidiaGPUs(appRef)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
resolved, err := expandSelectedGPUIndices(gpus, params.GPUIndices, params.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
params.GPUIndices = resolved
|
||||
params.ExcludeGPUIndices = nil
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID(idPrefix),
|
||||
Name: baseName,
|
||||
@@ -262,6 +276,53 @@ func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params
|
||||
return tasks, nil
|
||||
}
|
||||
|
||||
// expandSelectedGPUIndices returns the sorted list of selected GPU indices after
|
||||
// applying include/exclude filters, without splitting by model.
|
||||
func expandSelectedGPUIndices(gpus []platform.NvidiaGPU, include, exclude []int) ([]int, error) {
|
||||
indexed := make(map[int]struct{}, len(gpus))
|
||||
allIndices := make([]int, 0, len(gpus))
|
||||
for _, gpu := range gpus {
|
||||
indexed[gpu.Index] = struct{}{}
|
||||
allIndices = append(allIndices, gpu.Index)
|
||||
}
|
||||
sort.Ints(allIndices)
|
||||
|
||||
selected := allIndices
|
||||
if len(include) > 0 {
|
||||
selected = make([]int, 0, len(include))
|
||||
seen := make(map[int]struct{}, len(include))
|
||||
for _, idx := range include {
|
||||
if _, ok := indexed[idx]; !ok {
|
||||
continue
|
||||
}
|
||||
if _, dup := seen[idx]; dup {
|
||||
continue
|
||||
}
|
||||
seen[idx] = struct{}{}
|
||||
selected = append(selected, idx)
|
||||
}
|
||||
sort.Ints(selected)
|
||||
}
|
||||
if len(exclude) > 0 {
|
||||
skip := make(map[int]struct{}, len(exclude))
|
||||
for _, idx := range exclude {
|
||||
skip[idx] = struct{}{}
|
||||
}
|
||||
filtered := selected[:0]
|
||||
for _, idx := range selected {
|
||||
if _, ok := skip[idx]; ok {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, idx)
|
||||
}
|
||||
selected = filtered
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs selected")
|
||||
}
|
||||
return selected, nil
|
||||
}
|
||||
|
||||
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
func sseWrite(w http.ResponseWriter, event, data string) bool {
|
||||
@@ -421,12 +482,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
DiagLevel int `json:"diag_level"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
Loader string `json:"loader"`
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
StressMode bool `json:"stress_mode"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||
Loader string `json:"loader"`
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
PlatformComponents []string `json:"platform_components"`
|
||||
@@ -442,12 +504,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
}
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
StressMode: body.StressMode,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
StaggerGPUStart: body.StaggerGPUStart,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
@@ -1315,107 +1378,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Display / Screen Resolution ───────────────────────────────────────────────
|
||||
|
||||
// displayMode is one resolution offered by an output, as reported by xrandr.
type displayMode struct {
	Output  string `json:"output"`  // name of the output this mode belongs to
	Mode    string `json:"mode"`    // resolution in WxH form, e.g. "1920x1080"
	Current bool   `json:"current"` // true when the xrandr line carries a '*' marker
}

// displayInfo groups all modes parsed for one connected output.
type displayInfo struct {
	Output  string        `json:"output"`
	Modes   []displayMode `json:"modes"`
	Current string        `json:"current"` // mode marked current, empty if none
}

var xrandrOutputRE = regexp.MustCompile(`^(\S+)\s+connected`)
var xrandrModeRE = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
var xrandrCurrentRE = regexp.MustCompile(`\*`)

// parseXrandrOutput parses the text printed by xrandr into one displayInfo per
// connected output, in the order the outputs appear.
//
// A line matching "<name> connected" opens a new output. Mode lines (three
// leading whitespace characters followed by WxH) are attached to the output
// opened most recently; a '*' anywhere on the mode line marks it as current.
// Any other non-indented line (for example "Screen 0: ..." or
// "<name> disconnected ...") closes the open output, so mode lines that
// follow it are not attributed to the previous connected output.
//
// The result is nil when no connected output is found.
func parseXrandrOutput(out string) []displayInfo {
	var infos []displayInfo
	var cur *displayInfo

	// flush stores the output currently being collected, if any.
	flush := func() {
		if cur != nil {
			infos = append(infos, *cur)
			cur = nil
		}
	}

	for _, line := range strings.Split(out, "\n") {
		if m := xrandrOutputRE.FindStringSubmatch(line); m != nil {
			flush()
			cur = &displayInfo{Output: m[1]}
			continue
		}
		if line != "" && line[0] != ' ' && line[0] != '\t' {
			// A header that is not a connected output ends the current section.
			flush()
			continue
		}
		if cur == nil {
			continue
		}
		m := xrandrModeRE.FindStringSubmatch(line)
		if m == nil {
			continue
		}
		isCurrent := xrandrCurrentRE.MatchString(line)
		cur.Modes = append(cur.Modes, displayMode{Output: cur.Output, Mode: m[1], Current: isCurrent})
		if isCurrent {
			cur.Current = m[1]
		}
	}
	flush()
	return infos
}
|
||||
|
||||
// xrandrCommand builds an xrandr invocation with the given arguments.
//
// The command environment is a copy of the current process environment.
// "DISPLAY=:0" and "XAUTHORITY=/home/bee/.Xauthority" are appended whenever
// the corresponding variable is absent or set to an empty value.
func xrandrCommand(args ...string) *exec.Cmd {
	inherited := os.Environ()
	env := make([]string, 0, len(inherited)+2)
	env = append(env, inherited...)

	// nonEmpty reports whether key is present in the inherited environment
	// with a non-empty value.
	nonEmpty := func(key string) bool {
		prefix := key + "="
		for _, kv := range inherited {
			if strings.HasPrefix(kv, prefix) && len(kv) > len(prefix) {
				return true
			}
		}
		return false
	}

	if !nonEmpty("DISPLAY") {
		env = append(env, "DISPLAY=:0")
	}
	if !nonEmpty("XAUTHORITY") {
		env = append(env, "XAUTHORITY=/home/bee/.Xauthority")
	}

	cmd := exec.Command("xrandr", args...)
	cmd.Env = env
	return cmd
}
|
||||
|
||||
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
|
||||
out, err := xrandrCommand().Output()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, parseXrandrOutput(string(out)))
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Output string `json:"output"`
|
||||
Mode string `json:"mode"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
|
||||
writeError(w, http.StatusBadRequest, "output and mode are required")
|
||||
return
|
||||
}
|
||||
// Validate mode looks like WxH to prevent injection
|
||||
if !regexp.MustCompile(`^\d+x\d+$`).MatchString(req.Mode) {
|
||||
writeError(w, http.StatusBadRequest, "invalid mode format")
|
||||
return
|
||||
}
|
||||
// Validate output name (no special chars)
|
||||
if !regexp.MustCompile(`^[A-Za-z0-9_\-]+$`).MatchString(req.Output) {
|
||||
writeError(w, http.StatusBadRequest, "invalid output name")
|
||||
return
|
||||
}
|
||||
if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
|
||||
}
|
||||
|
||||
@@ -10,30 +10,6 @@ import (
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||
t.Setenv("DISPLAY", "")
|
||||
t.Setenv("XAUTHORITY", "")
|
||||
|
||||
cmd := xrandrCommand("--query")
|
||||
|
||||
var hasDisplay bool
|
||||
var hasXAuthority bool
|
||||
for _, kv := range cmd.Env {
|
||||
if kv == "DISPLAY=:0" {
|
||||
hasDisplay = true
|
||||
}
|
||||
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
||||
hasXAuthority = true
|
||||
}
|
||||
}
|
||||
if !hasDisplay {
|
||||
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
||||
}
|
||||
if !hasXAuthority {
|
||||
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
|
||||
@@ -1031,42 +1031,45 @@ func renderValidate(opts HandlerOptions) string {
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Validate Profile</div>
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Diag level</label><select id="sat-profile-nvidia-level" style="width:100%"><option value="1">Level 1 — Quick</option><option value="2">Level 2 — Standard</option><option value="3">Level 3 — Extended</option><option value="4">Level 4 — Full</option></select></div>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count. NVIDIA <code>dcgmi diag</code> uses the selected diag level from this profile.</p>
|
||||
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
</div>
|
||||
<div class="validate-profile-col"></div>
|
||||
</div>
|
||||
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Validate Profile</div>
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
||||
</div>
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
<div style="margin-top:12px">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
`Duration is taken from Validate Profile diag level: Level 1 = 60s, Level 2 = 5m, Level 3 = 1h, Level 4 = 1h.`,
|
||||
`60s in Validate, 30 min in Stress.`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a short RAM validation pass and records memory state around the test.`,
|
||||
`Runs a RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
`No extra settings.`,
|
||||
`256 MB / 1 pass in Validate, 1 GB / 3 passes in Stress.`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`No extra settings.`,
|
||||
`Short self-test in Validate, extended self-test in Stress.`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
@@ -1083,6 +1086,12 @@ func renderValidate(opts HandlerOptions) string {
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA validate tasks.</p>
|
||||
<div style="margin-top:10px;padding-top:10px;border-top:1px solid var(--border)">
|
||||
<label class="sat-gpu-row" title="When checked, multi-GPU tests (PSU Pulse, NCCL, NVBandwidth) run on ALL GPUs in the system regardless of the selection above.">
|
||||
<input type="checkbox" id="sat-multi-gpu-all" checked onchange="satUpdateGPUSelectionNote()">
|
||||
<span><strong>Multi-GPU tests</strong> — use all GPUs <span style="font-size:11px;color:var(--muted)">(PSU Pulse, NCCL, NVBandwidth)</span></span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -1091,14 +1100,48 @@ func renderValidate(opts HandlerOptions) string {
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks.`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
`Runs one GPU at a time on the selected NVIDIA GPUs. Diag level is taken from Validate Profile.`,
|
||||
`Level 2 in Validate, Level 3 in Stress. Runs one GPU at a time on the selected NVIDIA GPUs.`,
|
||||
)) +
|
||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load in Validate to check stability under moderate stress.`,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
`Runs one GPU at a time on the selected NVIDIA GPUs with the fixed DCGM targeted stress recipe.`,
|
||||
`Skipped in Validate mode. Runs after dcgmi diag in Stress mode. Runs one GPU at a time on the selected NVIDIA GPUs.<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-targeted-power">` +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs one GPU at a time.<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-pulse">` +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs all selected GPUs simultaneously — synchronous pulsing is required to stress the PSU.<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-interconnect">` +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously (requires ≥2).<p id="sat-ni-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-bandwidth">` +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runs across all selected GPUs simultaneously.<p id="sat-nb-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
@@ -1125,17 +1168,28 @@ func renderValidate(opts HandlerOptions) string {
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satDiagLevel() {
|
||||
return parseInt(document.getElementById('sat-profile-nvidia-level').value) || 1;
|
||||
function satStressMode() {
|
||||
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||
}
|
||||
function satCPUDurationFromDiagLevel() {
|
||||
const level = satDiagLevel();
|
||||
if (level === 1) return 60;
|
||||
if (level === 2) return 5 * 60;
|
||||
return 60 * 60;
|
||||
function satModeChanged() {
|
||||
const stress = satStressMode();
|
||||
[
|
||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
|
||||
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
card.style.opacity = stress ? '1' : '0.5';
|
||||
const hint = document.getElementById(item.hint);
|
||||
if (hint) hint.style.display = stress ? 'none' : '';
|
||||
}
|
||||
});
|
||||
}
|
||||
function satLabels() {
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
let satNvidiaGPUsPromise = null;
|
||||
function loadSatNvidiaGPUs() {
|
||||
@@ -1156,6 +1210,10 @@ function satSelectedGPUIndices() {
|
||||
.filter(function(v) { return !Number.isNaN(v); })
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
function satMultiGPUAll() {
|
||||
const cb = document.getElementById('sat-multi-gpu-all');
|
||||
return cb ? cb.checked : true;
|
||||
}
|
||||
function satUpdateGPUSelectionNote() {
|
||||
const note = document.getElementById('sat-gpu-selection-note');
|
||||
if (!note) return;
|
||||
@@ -1164,7 +1222,8 @@ function satUpdateGPUSelectionNote() {
|
||||
note.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA validate tasks.';
|
||||
return;
|
||||
}
|
||||
note.textContent = 'Selected NVIDIA GPUs: ' + selected.join(', ') + '.';
|
||||
const multiAll = satMultiGPUAll();
|
||||
note.textContent = 'Selected GPUs: ' + selected.join(', ') + '. Multi-GPU tests: ' + (multiAll ? 'all GPUs in system' : 'selected GPUs only') + '.';
|
||||
}
|
||||
function satRenderGPUList(gpus) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
@@ -1211,9 +1270,8 @@ function satRequestBody(target, overrides) {
|
||||
const body = {};
|
||||
const labels = satLabels();
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
if (target === 'nvidia') body.diag_level = satDiagLevel();
|
||||
if (target === 'nvidia-targeted-stress') body.duration = 300;
|
||||
if (target === 'cpu') body.duration = satCPUDurationFromDiagLevel();
|
||||
body.stress_mode = satStressMode();
|
||||
if (target === 'cpu') body.duration = satStressMode() ? 1800 : 60;
|
||||
if (overrides) {
|
||||
Object.keys(overrides).forEach(key => { body[key] = overrides[key]; });
|
||||
}
|
||||
@@ -1275,8 +1333,28 @@ function runSATWithOverrides(target, overrides) {
|
||||
return enqueueSATTarget(target, overrides)
|
||||
.then(d => streamSATTask(d.task_id, title, false));
|
||||
}
|
||||
const nvidiaPerGPUTargets = ['nvidia', 'nvidia-targeted-stress', 'nvidia-targeted-power'];
|
||||
// pulse_test and fabric tests run on all selected GPUs simultaneously
|
||||
const nvidiaAllGPUTargets = ['nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
function satAllGPUIndicesForMulti() {
|
||||
// If "Multi-GPU tests — all GPUs" is checked, return all detected GPUs.
|
||||
// Otherwise fall back to the per-GPU selection.
|
||||
if (satMultiGPUAll()) {
|
||||
return loadSatNvidiaGPUs().then(function(gpus) {
|
||||
return gpus.map(function(g) { return Number(g.index); });
|
||||
});
|
||||
}
|
||||
const sel = satSelectedGPUIndices();
|
||||
return Promise.resolve(sel);
|
||||
}
|
||||
function expandSATTarget(target) {
|
||||
if (target !== 'nvidia' && target !== 'nvidia-targeted-stress') {
|
||||
if (nvidiaAllGPUTargets.indexOf(target) >= 0) {
|
||||
return satAllGPUIndicesForMulti().then(function(indices) {
|
||||
if (!indices.length) return Promise.reject(new Error('No NVIDIA GPUs available.'));
|
||||
return [{target: target, overrides: {gpu_indices: indices, display_name: satLabels()[target] || target}}];
|
||||
});
|
||||
}
|
||||
if (nvidiaPerGPUTargets.indexOf(target) < 0) {
|
||||
return Promise.resolve([{target: target}]);
|
||||
}
|
||||
const selected = satSelectedGPUIndices();
|
||||
@@ -1292,6 +1370,12 @@ function expandSATTarget(target) {
|
||||
label: satGPUDisplayName(gpu)
|
||||
})));
|
||||
}
|
||||
function runNvidiaFabricValidate(target) {
|
||||
satAllGPUIndicesForMulti().then(function(indices) {
|
||||
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||
});
|
||||
}
|
||||
function runNvidiaValidateSet(target) {
|
||||
return loadSatNvidiaGPUs().then(gpus => {
|
||||
const selected = satSelectedGPUIndices();
|
||||
@@ -1354,8 +1438,10 @@ function runAllSAT() {
|
||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth', 'hpl'];
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','hpl','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const activeTargets = baseTargets.filter(target => {
|
||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||
const btn = document.getElementById('sat-btn-' + target);
|
||||
return !(btn && btn.disabled);
|
||||
});
|
||||
@@ -1390,6 +1476,10 @@ function runAllSAT() {
|
||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-stress', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-targeted-power', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-pulse', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-interconnect', 'No NVIDIA GPU detected');
|
||||
if (!gp.nvidia) disableSATCard('nvidia-bandwidth', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATAMDOptions('No AMD GPU detected');
|
||||
});
|
||||
@@ -1583,10 +1673,11 @@ func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||
// ── Benchmark ─────────────────────────────────────────────────────────────────
|
||||
|
||||
type benchmarkHistoryColumn struct {
|
||||
key string
|
||||
label string
|
||||
name string
|
||||
index int
|
||||
key string
|
||||
label string
|
||||
name string
|
||||
index int
|
||||
parallel bool
|
||||
}
|
||||
|
||||
type benchmarkHistoryCell struct {
|
||||
@@ -1894,29 +1985,43 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
||||
cells: make(map[string]benchmarkHistoryCell),
|
||||
}
|
||||
|
||||
// Count how many GPUs of each model appear in this run (for the label).
|
||||
gpuModelCount := make(map[string]int)
|
||||
for _, gpu := range result.GPUs {
|
||||
gpuModelCount[strings.TrimSpace(gpu.Name)]++
|
||||
}
|
||||
|
||||
// Track best composite score per column key within this run.
|
||||
runBest := make(map[string]float64)
|
||||
for _, gpu := range result.GPUs {
|
||||
key := benchmarkHistoryColumnKey(result.ServerModel, gpu.Name)
|
||||
count := gpuModelCount[strings.TrimSpace(gpu.Name)]
|
||||
columnByKey[key] = benchmarkHistoryColumn{
|
||||
key: key,
|
||||
label: benchmarkHistoryColumnLabel(result.ServerModel, gpu.Name, count),
|
||||
name: strings.TrimSpace(gpu.Name),
|
||||
index: gpu.Index,
|
||||
if result.ParallelGPUs {
|
||||
// All GPUs ran simultaneously — one column per server, score = avg composite.
|
||||
gpuModelCount := make(map[string]int)
|
||||
for _, gpu := range result.GPUs {
|
||||
gpuModelCount[strings.TrimSpace(gpu.Name)]++
|
||||
}
|
||||
if gpu.Scores.CompositeScore > runBest[key] {
|
||||
runBest[key] = gpu.Scores.CompositeScore
|
||||
scoreSum := make(map[string]float64)
|
||||
scoreCnt := make(map[string]int)
|
||||
for _, gpu := range result.GPUs {
|
||||
key := "parallel|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name)
|
||||
scoreSum[key] += gpu.Scores.CompositeScore
|
||||
scoreCnt[key]++
|
||||
count := gpuModelCount[strings.TrimSpace(gpu.Name)]
|
||||
columnByKey[key] = benchmarkHistoryColumn{
|
||||
key: key,
|
||||
label: benchmarkHistoryParallelLabel(result.ServerModel, gpu.Name, count),
|
||||
name: strings.TrimSpace(gpu.Name),
|
||||
index: -1,
|
||||
parallel: true,
|
||||
}
|
||||
}
|
||||
for key, sum := range scoreSum {
|
||||
run.cells[key] = benchmarkHistoryCell{score: sum / float64(scoreCnt[key]), present: true}
|
||||
}
|
||||
} else {
|
||||
// Each GPU ran independently — one column per GPU index.
|
||||
for _, gpu := range result.GPUs {
|
||||
key := "gpu|" + strings.TrimSpace(result.ServerModel) + "|" + strings.TrimSpace(gpu.Name) + "|" + strconv.Itoa(gpu.Index)
|
||||
columnByKey[key] = benchmarkHistoryColumn{
|
||||
key: key,
|
||||
label: benchmarkHistoryPerGPULabel(gpu.Name, gpu.Index),
|
||||
name: strings.TrimSpace(gpu.Name),
|
||||
index: gpu.Index,
|
||||
parallel: false,
|
||||
}
|
||||
run.cells[key] = benchmarkHistoryCell{score: gpu.Scores.CompositeScore, present: true}
|
||||
}
|
||||
}
|
||||
for key, score := range runBest {
|
||||
run.cells[key] = benchmarkHistoryCell{score: score, present: true}
|
||||
}
|
||||
runs = append(runs, run)
|
||||
}
|
||||
@@ -1925,13 +2030,24 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
||||
for _, col := range columnByKey {
|
||||
columns = append(columns, col)
|
||||
}
|
||||
// Sequential GPU columns first (sorted by GPU index), then parallel server columns.
|
||||
sort.Slice(columns, func(i, j int) bool {
|
||||
li := strings.ToLower(columns[i].label)
|
||||
lj := strings.ToLower(columns[j].label)
|
||||
if li != lj {
|
||||
return li < lj
|
||||
if columns[i].parallel != columns[j].parallel {
|
||||
return !columns[i].parallel // sequential first
|
||||
}
|
||||
return columns[i].key < columns[j].key
|
||||
if columns[i].parallel {
|
||||
li := strings.ToLower(columns[i].label)
|
||||
lj := strings.ToLower(columns[j].label)
|
||||
if li != lj {
|
||||
return li < lj
|
||||
}
|
||||
return columns[i].key < columns[j].key
|
||||
}
|
||||
// Sequential: sort by GPU index, then name.
|
||||
if columns[i].index != columns[j].index {
|
||||
return columns[i].index < columns[j].index
|
||||
}
|
||||
return strings.ToLower(columns[i].name) < strings.ToLower(columns[j].name)
|
||||
})
|
||||
sort.Slice(runs, func(i, j int) bool {
|
||||
return runs[i].generatedAt.After(runs[j].generatedAt)
|
||||
@@ -1939,32 +2055,35 @@ func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []
|
||||
return columns, runs
|
||||
}
|
||||
|
||||
// benchmarkHistoryColumnKey groups results by server model + GPU model so that
|
||||
// runs on the same hardware produce one column regardless of individual GPU index.
|
||||
func benchmarkHistoryColumnKey(serverModel, gpuName string) string {
|
||||
return strings.TrimSpace(serverModel) + "|" + strings.TrimSpace(gpuName)
|
||||
// benchmarkHistoryPerGPULabel formats a label for a single-GPU column: "GPU #N — ModelName".
|
||||
func benchmarkHistoryPerGPULabel(gpuName string, index int) string {
|
||||
gpuName = strings.TrimSpace(gpuName)
|
||||
if gpuName == "" {
|
||||
gpuName = "Unknown GPU"
|
||||
}
|
||||
return fmt.Sprintf("GPU #%d — %s", index, gpuName)
|
||||
}
|
||||
|
||||
// benchmarkHistoryColumnLabel formats the column header as
|
||||
// "Server Model (N× GPU Model)" or "GPU Model" when server info is missing.
|
||||
func benchmarkHistoryColumnLabel(serverModel, gpuName string, count int) string {
|
||||
// benchmarkHistoryParallelLabel formats a label for an all-GPU parallel column:
|
||||
// "ServerModel — N× ModelName (All GPUs)" or "N× ModelName (All GPUs)" if no server.
|
||||
func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) string {
|
||||
serverModel = strings.TrimSpace(serverModel)
|
||||
gpuName = strings.TrimSpace(gpuName)
|
||||
if gpuName == "" {
|
||||
gpuName = "Unknown GPU"
|
||||
}
|
||||
gpuPart := fmt.Sprintf("%d× %s", count, gpuName)
|
||||
gpuPart := fmt.Sprintf("%d× %s (All GPUs)", count, gpuName)
|
||||
if serverModel == "" {
|
||||
return gpuPart
|
||||
}
|
||||
return fmt.Sprintf("%s (%s)", serverModel, gpuPart)
|
||||
return fmt.Sprintf("%s — %s", serverModel, gpuPart)
|
||||
}
|
||||
|
||||
// ── Burn ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderBurn() string {
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics and ` + "targeted_stress" + ` remain in <a href="/validate">Validate</a>. Burn exposes official NVIDIA load recipes by test goal plus separate custom stress tools.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
@@ -1977,11 +2096,11 @@ func renderBurn() string {
|
||||
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
|
||||
</div>
|
||||
<div class="burn-profile-col burn-profile-action">
|
||||
<button class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||
<p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
|
||||
</div>
|
||||
<div class="burn-profile-col burn-profile-action">
|
||||
<button class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
|
||||
</div>
|
||||
</div>
|
||||
@@ -1998,12 +2117,16 @@ func renderBurn() string {
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
|
||||
</div>
|
||||
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||
</div>
|
||||
</div>
|
||||
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||
<label class="cb-row" style="margin-top:10px">
|
||||
<input type="checkbox" id="burn-stagger-nvidia">
|
||||
<span>Ramp selected NVIDIA GPUs one by one before full-load hold. Uses a 3-minute stabilization window per GPU, then keeps all selected GPUs under load for the chosen Burn Profile duration.</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="burn-section">Core Burn Paths</div>
|
||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||
@@ -2029,27 +2152,6 @@ func renderBurn() string {
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="burn-section">GPU-Specific Tests</div>
|
||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||
<div class="card burn-card">
|
||||
<div class="card-head card-head-actions"><span>Power Delivery / Power Budget</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},{id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true}])">Run</button></div>
|
||||
<div class="card-body burn-card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA power-oriented recipes. ` + "targeted_power" + ` checks sustained delivery; ` + "pulse_test" + ` checks transient behavior.</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-power" disabled><span>NVIDIA Targeted Power (dcgmi diag targeted_power) <span class="cb-note" id="note-nvidia-power"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-pulse" disabled><span>NVIDIA Pulse Test (dcgmi diag pulse_test) <span class="cb-note" id="note-nvidia-pulse"></span></span></label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card burn-card">
|
||||
<div class="card-head card-head-actions"><span>Interconnect / Bandwidth</span><button class="btn btn-primary btn-sm" onclick="runBurnTaskSet([{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true}])">Run</button></div>
|
||||
<div class="card-body burn-card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Official NVIDIA fabric paths. NCCL is interconnect-only and is not a compute burn. NVBandwidth validates copy and bandwidth paths.</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-interconnect" disabled><span>NVIDIA Interconnect Test (NCCL all_reduce_perf) <span class="cb-note" id="note-nvidia-interconnect"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-nvidia-bandwidth" disabled><span>NVIDIA Bandwidth Test (NVBandwidth) <span class="cb-note" id="note-nvidia-bandwidth"></span></span></label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||
@@ -2098,6 +2200,11 @@ function burnSelectedGPUIndices() {
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
|
||||
function burnUseNvidiaRampUp() {
|
||||
const el = document.getElementById('burn-stagger-nvidia');
|
||||
return !!(el && el.checked);
|
||||
}
|
||||
|
||||
function burnUpdateSelectionNote() {
|
||||
const note = document.getElementById('burn-selection-note');
|
||||
const selected = burnSelectedGPUIndices();
|
||||
@@ -2157,6 +2264,9 @@ function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
|
||||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||
}
|
||||
body.gpu_indices = selected;
|
||||
if (burnUseNvidiaRampUp() && selected.length > 1) {
|
||||
body.stagger_gpu_start = true;
|
||||
}
|
||||
}
|
||||
return fetch('/api/sat/' + target + '/run', {
|
||||
method: 'POST',
|
||||
@@ -2299,10 +2409,6 @@ function runAllBurnTasks() {
|
||||
const status = document.getElementById('burn-all-status');
|
||||
const all = [
|
||||
{id:'burn-nvidia-compute',target:'nvidia-compute',label:'NVIDIA Max Compute Load (dcgmproftester)',nvidia:true},
|
||||
{id:'burn-nvidia-power',target:'nvidia-targeted-power',label:'NVIDIA Targeted Power (dcgmi diag targeted_power)',nvidia:true},
|
||||
{id:'burn-nvidia-pulse',target:'nvidia-pulse',label:'NVIDIA Pulse Test (dcgmi diag pulse_test)',nvidia:true},
|
||||
{id:'burn-nvidia-interconnect',target:'nvidia-interconnect',label:'NVIDIA Interconnect Test (NCCL all_reduce_perf)',nvidia:true},
|
||||
{id:'burn-nvidia-bandwidth',target:'nvidia-bandwidth',label:'NVIDIA Bandwidth Test (NVBandwidth)',nvidia:true},
|
||||
{id:'burn-gpu-bee',target:'nvidia-stress',label:'GPU Burn (bee-gpu-burn)',nvidia:true,extra:{loader:'builtin'}},
|
||||
{id:'burn-gpu-john',target:'nvidia-stress',label:'John GPU Stress (john/OpenCL)',nvidia:true,extra:{loader:'john'}},
|
||||
{id:'burn-gpu-rvs',target:'amd-stress',label:'AMD GPU Stress (rvs gst)'},
|
||||
@@ -2317,10 +2423,6 @@ function runAllBurnTasks() {
|
||||
fetch('/api/gpu/tools').then(function(r) { return r.json(); }).then(function(tools) {
|
||||
const map = {
|
||||
'nvidia-compute': {cb:'burn-nvidia-compute', note:'note-nvidia-compute', reason:'dcgmproftester not available or NVIDIA driver not running'},
|
||||
'nvidia-targeted-power': {cb:'burn-nvidia-power', note:'note-nvidia-power', reason:'dcgmi not available or NVIDIA driver not running'},
|
||||
'nvidia-pulse': {cb:'burn-nvidia-pulse', note:'note-nvidia-pulse', reason:'dcgmi not available or NVIDIA driver not running'},
|
||||
'nvidia-interconnect': {cb:'burn-nvidia-interconnect', note:'note-nvidia-interconnect', reason:'NCCL interconnect tool not available or NVIDIA driver not running'},
|
||||
'nvidia-bandwidth': {cb:'burn-nvidia-bandwidth', note:'note-nvidia-bandwidth', reason:'nvbandwidth or dcgmi not available or NVIDIA driver not running'},
|
||||
'bee-gpu-burn': {cb:'burn-gpu-bee', note:'note-bee', reason:'bee-gpu-burn not available or NVIDIA driver not running'},
|
||||
'john': {cb:'burn-gpu-john', note:'note-john', reason:'bee-john-gpu-stress not available or NVIDIA driver not running'},
|
||||
'rvs': {cb:'burn-gpu-rvs', note:'note-rvs', reason:'AMD driver not running'},
|
||||
@@ -2756,55 +2858,6 @@ usbRefresh();
|
||||
</script>`
|
||||
}
|
||||
|
||||
// ── Display Resolution ────────────────────────────────────────────────────────
|
||||
|
||||
func renderDisplayInline() string {
|
||||
return `<div id="display-status" style="color:var(--muted);font-size:13px;margin-bottom:12px">Loading displays...</div>
|
||||
<div id="display-controls"></div>
|
||||
<script>
|
||||
(function(){
|
||||
function loadDisplays() {
|
||||
fetch('/api/display/resolutions').then(r=>r.json()).then(displays => {
|
||||
const status = document.getElementById('display-status');
|
||||
const ctrl = document.getElementById('display-controls');
|
||||
if (!displays || displays.length === 0) {
|
||||
status.textContent = 'No connected displays found or xrandr not available.';
|
||||
return;
|
||||
}
|
||||
status.textContent = '';
|
||||
ctrl.innerHTML = displays.map(d => {
|
||||
const opts = (d.modes||[]).map(m =>
|
||||
'<option value="'+m.mode+'"'+(m.current?' selected':'')+'>'+m.mode+(m.current?' (current)':'')+'</option>'
|
||||
).join('');
|
||||
return '<div style="margin-bottom:12px">'
|
||||
+'<span style="font-weight:600;margin-right:8px">'+d.output+'</span>'
|
||||
+'<span style="color:var(--muted);font-size:12px;margin-right:12px">Current: '+d.current+'</span>'
|
||||
+'<select id="res-sel-'+d.output+'" style="margin-right:8px">'+opts+'</select>'
|
||||
+'<button class="btn btn-sm btn-primary" onclick="applyResolution(\''+d.output+'\')">Apply</button>'
|
||||
+'</div>';
|
||||
}).join('');
|
||||
}).catch(()=>{
|
||||
document.getElementById('display-status').textContent = 'xrandr not available on this system.';
|
||||
});
|
||||
}
|
||||
window.applyResolution = function(output) {
|
||||
const sel = document.getElementById('res-sel-'+output);
|
||||
if (!sel) return;
|
||||
const mode = sel.value;
|
||||
const btn = sel.nextElementSibling;
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Applying...';
|
||||
fetch('/api/display/set', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify({output:output,mode:mode})})
|
||||
.then(r=>r.json()).then(d=>{
|
||||
if (d.error) { alert('Error: '+d.error); }
|
||||
loadDisplays();
|
||||
}).catch(e=>{ alert('Error: '+e); })
|
||||
.finally(()=>{ btn.disabled=false; btn.textContent='Apply'; });
|
||||
};
|
||||
loadDisplays();
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderNvidiaSelfHealInline() string {
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
|
||||
@@ -2993,8 +3046,6 @@ function installToRAM() {
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Display Resolution</div><div class="card-body">` +
|
||||
renderDisplayInline() + `</div></div>
|
||||
|
||||
<script>
|
||||
function checkTools() {
|
||||
|
||||
@@ -295,10 +295,6 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Tools
|
||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||
|
||||
// Display
|
||||
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
||||
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
||||
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
||||
|
||||
@@ -693,8 +693,8 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Composite score by saved benchmark run and GPU.`,
|
||||
`NVIDIA H100 PCIe / GPU 0`,
|
||||
`NVIDIA H100 PCIe / GPU 1`,
|
||||
`GPU #0 — NVIDIA H100 PCIe`,
|
||||
`GPU #1 — NVIDIA H100 PCIe`,
|
||||
`#1`,
|
||||
wantTime,
|
||||
`1176.25`,
|
||||
@@ -741,8 +741,8 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
for _, needle := range []string{
|
||||
`NVIDIA Max Compute Load`,
|
||||
`dcgmproftester`,
|
||||
`targeted_stress remain in <a href="/validate">Validate</a>`,
|
||||
`NVIDIA Interconnect Test (NCCL all_reduce_perf)`,
|
||||
`NCCL`,
|
||||
`Validate → Stress mode`,
|
||||
`id="burn-gpu-list"`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
|
||||
@@ -115,10 +115,12 @@ type Task struct {
|
||||
// taskParams holds optional parameters parsed from the run request.
|
||||
type taskParams struct {
|
||||
Duration int `json:"duration,omitempty"`
|
||||
DiagLevel int `json:"diag_level,omitempty"`
|
||||
StressMode bool `json:"stress_mode,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start,omitempty"`
|
||||
SizeMB int `json:"size_mb,omitempty"`
|
||||
Passes int `json:"passes,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
BenchmarkProfile string `json:"benchmark_profile,omitempty"`
|
||||
@@ -161,6 +163,13 @@ func resolveBurnPreset(profile string) burnPreset {
|
||||
}
|
||||
}
|
||||
|
||||
func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
|
||||
if enabled && len(selected) > 1 {
|
||||
return 180
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||
acceptanceCycles := []platform.PlatformStressCycle{
|
||||
{LoadSec: 85, IdleSec: 5},
|
||||
@@ -215,11 +224,11 @@ var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||
const maxTaskHistory = 50
|
||||
|
||||
var (
|
||||
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
runMemoryAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(ctx, baseDir, sizeMB, passes, logFunc)
|
||||
}
|
||||
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
@@ -552,7 +561,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
diagLevel := t.params.DiagLevel
|
||||
diagLevel := 2
|
||||
if t.params.StressMode {
|
||||
diagLevel = 3
|
||||
}
|
||||
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||
result, e := a.RunNvidiaAcceptancePackWithOptions(
|
||||
ctx, "", diagLevel, t.params.GPUIndices, j.append,
|
||||
@@ -588,7 +600,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
RunNCCL: t.params.RunNCCL,
|
||||
ParallelGPUs: t.params.ParallelGPUs,
|
||||
}, j.append)
|
||||
case "nvidia-compute":
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
@@ -597,7 +609,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if staggerSec > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
|
||||
case "nvidia-targeted-power":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -647,24 +663,29 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
}, j.append)
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
|
||||
}, j.append)
|
||||
case "memory":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
||||
sizeMB, passes := 256, 1
|
||||
if t.params.StressMode {
|
||||
sizeMB, passes = 1024, 3
|
||||
}
|
||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||
case "storage":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
|
||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||
case "cpu":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -675,7 +696,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
if t.params.StressMode {
|
||||
dur = 1800
|
||||
} else {
|
||||
dur = 60
|
||||
}
|
||||
}
|
||||
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||
|
||||
@@ -422,7 +422,7 @@ func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Composite score for this benchmark task.`,
|
||||
`NVIDIA H100 PCIe / GPU 0`,
|
||||
`GPU #0 — NVIDIA H100 PCIe`,
|
||||
`1176.25`,
|
||||
} {
|
||||
if !strings.Contains(html, needle) {
|
||||
|
||||
2
bible
2
bible
Submodule bible updated: 688b87e98d...1d89a4918e
@@ -36,7 +36,6 @@ typedef void *CUstream;
|
||||
#define MAX_CUBLAS_PROFILES 5
|
||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||
#define STRESS_LAUNCH_DEPTH 8
|
||||
|
||||
static const char *ptx_source =
|
||||
".version 6.0\n"
|
||||
@@ -344,7 +343,6 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
unsigned long iterations = 0;
|
||||
int mp_count = 0;
|
||||
int stream_count = 1;
|
||||
int launches_per_wave = 0;
|
||||
|
||||
memset(report, 0, sizeof(*report));
|
||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||
@@ -419,44 +417,42 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
|
||||
unsigned int threads = 256;
|
||||
|
||||
double start = now_seconds();
|
||||
double deadline = start + (double)seconds;
|
||||
double deadline = now_seconds() + (double)seconds;
|
||||
double next_sync = now_seconds() + 1.0;
|
||||
while (now_seconds() < deadline) {
|
||||
launches_per_wave = 0;
|
||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||
int launched_this_batch = 0;
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||
if (!check_rc(api,
|
||||
"cuLaunchKernel",
|
||||
api->cuLaunchKernel(kernel,
|
||||
blocks,
|
||||
1,
|
||||
1,
|
||||
threads,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
streams[lane],
|
||||
params[lane],
|
||||
NULL))) {
|
||||
goto fail;
|
||||
}
|
||||
launches_per_wave++;
|
||||
launched_this_batch++;
|
||||
}
|
||||
if (launched_this_batch <= 0) {
|
||||
break;
|
||||
int launched = 0;
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||
if (!check_rc(api,
|
||||
"cuLaunchKernel",
|
||||
api->cuLaunchKernel(kernel,
|
||||
blocks,
|
||||
1,
|
||||
1,
|
||||
threads,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
streams[lane],
|
||||
params[lane],
|
||||
NULL))) {
|
||||
goto fail;
|
||||
}
|
||||
launched++;
|
||||
iterations++;
|
||||
}
|
||||
if (launches_per_wave <= 0) {
|
||||
if (launched <= 0) {
|
||||
goto fail;
|
||||
}
|
||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||
goto fail;
|
||||
double now = now_seconds();
|
||||
if (now >= next_sync || now >= deadline) {
|
||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||
goto fail;
|
||||
}
|
||||
next_sync = now + 1.0;
|
||||
}
|
||||
iterations += (unsigned long)launches_per_wave;
|
||||
}
|
||||
api->cuCtxSynchronize();
|
||||
|
||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||
goto fail;
|
||||
@@ -468,11 +464,10 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
report->iterations = iterations;
|
||||
snprintf(report->details,
|
||||
sizeof(report->details),
|
||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
|
||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
STRESS_LAUNCH_DEPTH,
|
||||
bytes_per_stream[0] / (1024u * 1024u),
|
||||
iterations);
|
||||
|
||||
@@ -1140,7 +1135,6 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int stream_count = 1;
|
||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
||||
int prepared_count = 0;
|
||||
int wave_launches = 0;
|
||||
size_t requested_budget = 0;
|
||||
size_t total_budget = 0;
|
||||
size_t per_profile_budget = 0;
|
||||
@@ -1207,11 +1201,10 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
STRESS_LAUNCH_DEPTH,
|
||||
mp_count,
|
||||
per_profile_budget / (1024u * 1024u));
|
||||
|
||||
@@ -1260,50 +1253,55 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Keep the GPU queue continuously full by submitting kernels without
|
||||
* synchronizing after every wave. A sync barrier after each small batch
|
||||
* creates CPU↔GPU ping-pong gaps that prevent full TDP utilisation,
|
||||
* especially when individual kernels are short. Instead we sync at most
|
||||
* once per second (for error detection) and once at the very end. */
|
||||
double deadline = now_seconds() + (double)seconds;
|
||||
double next_sync = now_seconds() + 1.0;
|
||||
while (now_seconds() < deadline) {
|
||||
wave_launches = 0;
|
||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||
int launched_this_batch = 0;
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
}
|
||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=FAILED runtime\n",
|
||||
prepared[i].desc.name);
|
||||
for (int j = 0; j < prepared_count; j++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
prepared[i].iterations++;
|
||||
report->iterations++;
|
||||
wave_launches++;
|
||||
launched_this_batch++;
|
||||
int launched = 0;
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
}
|
||||
if (launched_this_batch <= 0) {
|
||||
break;
|
||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=FAILED runtime\n",
|
||||
prepared[i].desc.name);
|
||||
for (int j = 0; j < prepared_count; j++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
prepared[i].iterations++;
|
||||
report->iterations++;
|
||||
launched++;
|
||||
}
|
||||
if (wave_launches <= 0) {
|
||||
if (launched <= 0) {
|
||||
break;
|
||||
}
|
||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||
double now = now_seconds();
|
||||
if (now >= next_sync || now >= deadline) {
|
||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
next_sync = now + 1.0;
|
||||
}
|
||||
}
|
||||
/* Final drain — ensure all queued work finishes before we read results. */
|
||||
cuda->cuCtxSynchronize();
|
||||
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
set color_normal=light-gray/black
|
||||
set color_highlight=white/dark-gray
|
||||
set color_highlight=yellow/black
|
||||
|
||||
if [ -e /boot/grub/splash.png ]; then
|
||||
set theme=/boot/grub/live-theme/theme.txt
|
||||
else
|
||||
set menu_color_normal=cyan/black
|
||||
set menu_color_highlight=white/dark-gray
|
||||
set menu_color_normal=yellow/black
|
||||
set menu_color_highlight=white/brown
|
||||
fi
|
||||
|
||||
@@ -10,20 +10,15 @@ import os
|
||||
|
||||
W, H = 1920, 1080
|
||||
|
||||
GLYPHS = {
|
||||
'E': ["11111", "10000", "11110", "10000", "10000", "10000", "11111"],
|
||||
'A': ["01110", "10001", "10001", "11111", "10001", "10001", "10001"],
|
||||
'S': ["01111", "10000", "10000", "01110", "00001", "00001", "11110"],
|
||||
'Y': ["10001", "10001", "01010", "00100", "00100", "00100", "00100"],
|
||||
'B': ["11110", "10001", "10001", "11110", "10001", "10001", "11110"],
|
||||
'-': ["00000", "00000", "11111", "00000", "00000", "00000", "00000"],
|
||||
}
|
||||
|
||||
TITLE = "EASY-BEE"
|
||||
SUBTITLE = "Hardware Audit LiveCD"
|
||||
CELL = 30
|
||||
GLYPH_GAP = 18
|
||||
ROW_GAP = 6
|
||||
ASCII_ART = [
|
||||
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
|
||||
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
|
||||
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
|
||||
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
|
||||
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
|
||||
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
|
||||
]
|
||||
SUBTITLE = " Hardware Audit LiveCD"
|
||||
|
||||
FG = (0xF6, 0xD0, 0x47)
|
||||
FG_DIM = (0xD4, 0xA9, 0x1C)
|
||||
@@ -31,6 +26,12 @@ SHADOW = (0x5E, 0x47, 0x05)
|
||||
SUB = (0x96, 0x7A, 0x17)
|
||||
BG = (0x05, 0x05, 0x05)
|
||||
|
||||
MONO_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
|
||||
]
|
||||
SUB_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
||||
@@ -39,43 +40,34 @@ SUB_FONT_CANDIDATES = [
|
||||
]
|
||||
|
||||
|
||||
def load_font(size):
|
||||
for path in SUB_FONT_CANDIDATES:
|
||||
def load_font(candidates, size):
|
||||
for path in candidates:
|
||||
if os.path.exists(path):
|
||||
return ImageFont.truetype(path, size)
|
||||
return ImageFont.load_default()
|
||||
|
||||
|
||||
def glyph_width(ch):
|
||||
return len(GLYPHS[ch][0])
|
||||
def mono_metrics(font):
|
||||
probe = Image.new('L', (W, H), 0)
|
||||
draw = ImageDraw.Draw(probe)
|
||||
char_w = int(round(draw.textlength("M", font=font)))
|
||||
bb = draw.textbbox((0, 0), "Mg", font=font)
|
||||
char_h = bb[3] - bb[1]
|
||||
return char_w, char_h
|
||||
|
||||
|
||||
def render_logo_mask():
|
||||
width_cells = 0
|
||||
for idx, ch in enumerate(TITLE):
|
||||
width_cells += glyph_width(ch)
|
||||
if idx != len(TITLE) - 1:
|
||||
width_cells += 1
|
||||
mask_w = width_cells * CELL + (len(TITLE) - 1) * GLYPH_GAP
|
||||
mask_h = 7 * CELL + 6 * ROW_GAP
|
||||
mask = Image.new('L', (mask_w, mask_h), 0)
|
||||
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
|
||||
width = max(len(line) for line in lines) * char_w
|
||||
height = len(lines) * char_h + line_gap * (len(lines) - 1)
|
||||
mask = Image.new('L', (width, height), 0)
|
||||
draw = ImageDraw.Draw(mask)
|
||||
|
||||
cx = 0
|
||||
for idx, ch in enumerate(TITLE):
|
||||
glyph = GLYPHS[ch]
|
||||
for row_idx, row in enumerate(glyph):
|
||||
for col_idx, cell in enumerate(row):
|
||||
if cell != '1':
|
||||
continue
|
||||
x0 = cx + col_idx * CELL
|
||||
y0 = row_idx * (CELL + ROW_GAP)
|
||||
x1 = x0 + CELL - 4
|
||||
y1 = y0 + CELL - 4
|
||||
draw.rounded_rectangle((x0, y0, x1, y1), radius=4, fill=255)
|
||||
cx += glyph_width(ch) * CELL
|
||||
if idx != len(TITLE) - 1:
|
||||
cx += CELL + GLYPH_GAP
|
||||
for row, line in enumerate(lines):
|
||||
y = row * (char_h + line_gap)
|
||||
for col, ch in enumerate(line):
|
||||
if ch == ' ':
|
||||
continue
|
||||
x = col * char_w
|
||||
draw.text((x, y), ch, font=font, fill=255)
|
||||
return mask
|
||||
|
||||
|
||||
@@ -90,20 +82,28 @@ glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
||||
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||
|
||||
logo_mask = render_logo_mask()
|
||||
TARGET_LOGO_W = 400
|
||||
max_chars = max(len(line) for line in ASCII_ART)
|
||||
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
|
||||
_probe_cw, _ = mono_metrics(_probe_font)
|
||||
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
|
||||
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
|
||||
char_w, char_h = mono_metrics(font_logo)
|
||||
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
|
||||
logo_w, logo_h = logo_mask.size
|
||||
logo_x = (W - logo_w) // 2
|
||||
logo_y = 290
|
||||
logo_y = 380
|
||||
|
||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
|
||||
img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
|
||||
img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
|
||||
sh_off = max(1, font_size_logo // 6)
|
||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
|
||||
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
|
||||
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
|
||||
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||
|
||||
font_sub = load_font(30)
|
||||
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
||||
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
||||
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
||||
sub_y = logo_y + logo_h + 54
|
||||
sub_y = logo_y + logo_h + 48
|
||||
draw = ImageDraw.Draw(img)
|
||||
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
||||
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
||||
|
||||
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
@@ -0,0 +1,110 @@
|
||||
#!/bin/sh
|
||||
set -eu
|
||||
|
||||
SECONDS=300
|
||||
STAGGER_SECONDS=180
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
normalize_list() {
|
||||
echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
|
||||
}
|
||||
|
||||
contains_csv() {
|
||||
needle="$1"
|
||||
haystack="${2:-}"
|
||||
echo ",${haystack}," | grep -q ",${needle},"
|
||||
}
|
||||
|
||||
resolve_dcgmproftester() {
|
||||
for candidate in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
|
||||
if command -v "${candidate}" >/dev/null 2>&1; then
|
||||
command -v "${candidate}"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
return 1
|
||||
}
|
||||
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
*) usage ;;
|
||||
esac
|
||||
done
|
||||
|
||||
PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
|
||||
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
|
||||
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
|
||||
|
||||
DEVICES=$(normalize_list "${DEVICES}")
|
||||
EXCLUDE=$(normalize_list "${EXCLUDE}")
|
||||
SELECTED="${DEVICES}"
|
||||
if [ -z "${SELECTED}" ]; then
|
||||
SELECTED="${ALL_DEVICES}"
|
||||
fi
|
||||
|
||||
FINAL=""
|
||||
for id in $(echo "${SELECTED}" | tr ',' ' '); do
|
||||
[ -n "${id}" ] || continue
|
||||
if contains_csv "${id}" "${EXCLUDE}"; then
|
||||
continue
|
||||
fi
|
||||
if [ -z "${FINAL}" ]; then
|
||||
FINAL="${id}"
|
||||
else
|
||||
FINAL="${FINAL},${id}"
|
||||
fi
|
||||
done
|
||||
|
||||
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
||||
|
||||
echo "loader=dcgmproftester-staggered"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||
|
||||
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||
gpu_pos=0
|
||||
WORKERS=""
|
||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
gpu_pos=$((gpu_pos + 1))
|
||||
log="${TMP_DIR}/gpu-${id}.log"
|
||||
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
|
||||
gpu_seconds=$(( SECONDS + extra_sec ))
|
||||
echo "starting gpu ${id} seconds=${gpu_seconds}"
|
||||
CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
|
||||
pid=$!
|
||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||
sleep "${STAGGER_SECONDS}"
|
||||
fi
|
||||
done
|
||||
|
||||
status=0
|
||||
for spec in ${WORKERS}; do
|
||||
pid=${spec%%:*}
|
||||
rest=${spec#*:}
|
||||
id=${rest%%:*}
|
||||
log=${rest#*:}
|
||||
if wait "${pid}"; then
|
||||
echo "gpu ${id} finished: OK"
|
||||
else
|
||||
rc=$?
|
||||
echo "gpu ${id} finished: FAILED rc=${rc}"
|
||||
status=1
|
||||
fi
|
||||
sed "s/^/[gpu ${id}] /" "${log}" || true
|
||||
done
|
||||
|
||||
exit "${status}"
|
||||
17
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
17
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
@@ -2,13 +2,14 @@
|
||||
set -eu
|
||||
|
||||
SECONDS=5
|
||||
STAGGER_SECONDS=0
|
||||
SIZE_MB=0
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
@@ -25,6 +26,7 @@ contains_csv() {
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
@@ -61,14 +63,18 @@ done
|
||||
|
||||
echo "loader=bee-gpu-burn"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||
|
||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||
|
||||
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||
gpu_pos=0
|
||||
WORKERS=""
|
||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
gpu_pos=$((gpu_pos + 1))
|
||||
log="${TMP_DIR}/gpu-${id}.log"
|
||||
gpu_size_mb="${SIZE_MB}"
|
||||
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
||||
@@ -79,11 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
gpu_size_mb=512
|
||||
fi
|
||||
fi
|
||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
||||
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
|
||||
gpu_seconds=$(( SECONDS + extra_sec ))
|
||||
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
|
||||
CUDA_VISIBLE_DEVICES="${id}" \
|
||||
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||
pid=$!
|
||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||
sleep "${STAGGER_SECONDS}"
|
||||
fi
|
||||
done
|
||||
|
||||
status=0
|
||||
|
||||
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
@@ -2,6 +2,7 @@
|
||||
set -eu
|
||||
|
||||
DURATION_SEC=300
|
||||
STAGGER_SECONDS=0
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
FORMAT=""
|
||||
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
||||
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
@@ -118,6 +119,7 @@ ensure_opencl_ready() {
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
||||
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||
@@ -170,6 +172,7 @@ done
|
||||
echo "loader=john"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
echo "john_devices=${JOHN_DEVICES}"
|
||||
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||
|
||||
cd "${JOHN_DIR}"
|
||||
|
||||
@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
|
||||
echo "format=${CHOSEN_FORMAT}"
|
||||
echo "target_seconds=${DURATION_SEC}"
|
||||
echo "slice_seconds=${TEST_SLICE_SECONDS}"
|
||||
DEADLINE=$(( $(date +%s) + DURATION_SEC ))
|
||||
TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||
_first=1
|
||||
pos=0
|
||||
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||
pos=$((pos + 1))
|
||||
[ "${_first}" = "1" ] || sleep 3
|
||||
_first=0
|
||||
run_john_loop "${opencl_id}" "${DEADLINE}" &
|
||||
extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
|
||||
deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
|
||||
run_john_loop "${opencl_id}" "${deadline}" &
|
||||
pid=$!
|
||||
PIDS="${PIDS} ${pid}"
|
||||
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
|
||||
sleep "${STAGGER_SECONDS}"
|
||||
fi
|
||||
done
|
||||
FAIL=0
|
||||
for pid in ${PIDS}; do
|
||||
|
||||
@@ -21,8 +21,13 @@ read_nvidia_modules_flavor() {
|
||||
|
||||
log "kernel: $(uname -r)"
|
||||
|
||||
# Skip if no NVIDIA GPU present (PCI vendor 10de)
|
||||
if ! lspci -nn 2>/dev/null | grep -qi '10de:'; then
|
||||
# Skip if no NVIDIA display/compute GPU is present.
|
||||
# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
|
||||
have_nvidia_gpu() {
|
||||
lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||
}
|
||||
|
||||
if ! have_nvidia_gpu; then
|
||||
log "no NVIDIA GPU detected — skipping module load"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
@@ -14,7 +14,7 @@ log() {
|
||||
}
|
||||
|
||||
have_nvidia_gpu() {
|
||||
lspci -nn 2>/dev/null | grep -qi '10de:'
|
||||
lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||
}
|
||||
|
||||
service_active() {
|
||||
|
||||
Reference in New Issue
Block a user