Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c850b39b01 | ||
|
|
6dee8f3509 | ||
|
|
20f834aa96 |
4
PLAN.md
4
PLAN.md
@@ -343,9 +343,9 @@ Planned code shape:
|
|||||||
- `bee tui` can rerun the audit manually
|
- `bee tui` can rerun the audit manually
|
||||||
- `bee tui` can export the latest audit JSON to removable media
|
- `bee tui` can export the latest audit JSON to removable media
|
||||||
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
||||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
|
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||||
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
||||||
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
|
- Memory SAT runtime defaults can be overridden via `BEE_MEMTESTER_*`
|
||||||
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
||||||
|
|
||||||
### 2.6 — Vendor utilities and optional assets
|
### 2.6 — Vendor utilities and optional assets
|
||||||
|
|||||||
@@ -356,6 +356,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
||||||
fs.SetOutput(stderr)
|
fs.SetOutput(stderr)
|
||||||
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
||||||
|
diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
|
||||||
if err := fs.Parse(args[1:]); err != nil {
|
if err := fs.Parse(args[1:]); err != nil {
|
||||||
if err == flag.ErrHelp {
|
if err == flag.ErrHelp {
|
||||||
return 0
|
return 0
|
||||||
@@ -370,7 +371,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
target := args[0]
|
target := args[0]
|
||||||
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
||||||
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
||||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
|
||||||
return 2
|
return 2
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -382,7 +383,12 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
|||||||
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||||
switch target {
|
switch target {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
level := *diagLevel
|
||||||
|
if level > 0 {
|
||||||
|
_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
|
||||||
|
} else {
|
||||||
|
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||||
|
}
|
||||||
case "memory":
|
case "memory":
|
||||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
||||||
case "storage":
|
case "storage":
|
||||||
|
|||||||
@@ -107,6 +107,7 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
|||||||
type satRunner interface {
|
type satRunner interface {
|
||||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||||
|
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
@@ -508,6 +509,17 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
|
|||||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -120,14 +120,15 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type fakeSAT struct {
|
type fakeSAT struct {
|
||||||
runNvidiaFn func(string) (string, error)
|
runNvidiaFn func(string) (string, error)
|
||||||
runMemoryFn func(string) (string, error)
|
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||||
runStorageFn func(string) (string, error)
|
runMemoryFn func(string) (string, error)
|
||||||
runCPUFn func(string, int) (string, error)
|
runStorageFn func(string) (string, error)
|
||||||
detectVendorFn func() string
|
runCPUFn func(string, int) (string, error)
|
||||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
detectVendorFn func() string
|
||||||
runAMDPackFn func(string) (string, error)
|
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
runAMDPackFn func(string) (string, error)
|
||||||
|
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||||
@@ -138,6 +139,13 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
|
|||||||
return f.runNvidiaFn(baseDir)
|
return f.runNvidiaFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||||
|
if f.runNvidiaStressFn != nil {
|
||||||
|
return f.runNvidiaStressFn(baseDir, opts)
|
||||||
|
}
|
||||||
|
return f.runNvidiaFn(baseDir)
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||||
if f.listNvidiaGPUsFn != nil {
|
if f.listNvidiaGPUsFn != nil {
|
||||||
return f.listNvidiaGPUsFn()
|
return f.listNvidiaGPUsFn()
|
||||||
|
|||||||
194
audit/internal/platform/nvidia_stress.go
Normal file
194
audit/internal/platform/nvidia_stress.go
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
normalizeNvidiaStressOptions(&opts)
|
||||||
|
|
||||||
|
job, err := buildNvidiaStressJob(opts)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-stress", []satJob{
|
||||||
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
|
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||||
|
job,
|
||||||
|
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||||
|
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||||||
|
if err != nil {
|
||||||
|
return satJob{}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
loader := strings.TrimSpace(strings.ToLower(opts.Loader))
|
||||||
|
switch loader {
|
||||||
|
case "", NvidiaStressLoaderBuiltin:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-gpu-burn",
|
||||||
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
|
}
|
||||||
|
if len(selected) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
|
}
|
||||||
|
return satJob{
|
||||||
|
name: "03-bee-gpu-burn.log",
|
||||||
|
cmd: cmd,
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
}, nil
|
||||||
|
case NvidiaStressLoaderJohn:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-john-gpu-stress",
|
||||||
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
|
}
|
||||||
|
if len(selected) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
|
}
|
||||||
|
return satJob{
|
||||||
|
name: "03-john-gpu-stress.log",
|
||||||
|
cmd: cmd,
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
}, nil
|
||||||
|
case NvidiaStressLoaderNCCL:
|
||||||
|
cmd := []string{
|
||||||
|
"bee-nccl-gpu-stress",
|
||||||
|
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||||
|
}
|
||||||
|
if len(selected) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||||
|
}
|
||||||
|
return satJob{
|
||||||
|
name: "03-bee-nccl-gpu-stress.log",
|
||||||
|
cmd: cmd,
|
||||||
|
collectGPU: true,
|
||||||
|
gpuIndices: selected,
|
||||||
|
}, nil
|
||||||
|
default:
|
||||||
|
return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
||||||
|
if opts.DurationSec <= 0 {
|
||||||
|
opts.DurationSec = 300
|
||||||
|
}
|
||||||
|
if opts.SizeMB <= 0 {
|
||||||
|
opts.SizeMB = 64
|
||||||
|
}
|
||||||
|
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||||
|
case "", NvidiaStressLoaderBuiltin:
|
||||||
|
opts.Loader = NvidiaStressLoaderBuiltin
|
||||||
|
case NvidiaStressLoaderJohn:
|
||||||
|
opts.Loader = NvidiaStressLoaderJohn
|
||||||
|
case NvidiaStressLoaderNCCL:
|
||||||
|
opts.Loader = NvidiaStressLoaderNCCL
|
||||||
|
default:
|
||||||
|
opts.Loader = NvidiaStressLoaderBuiltin
|
||||||
|
}
|
||||||
|
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||||||
|
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||||||
|
}
|
||||||
|
|
||||||
|
func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
|
||||||
|
all, err := listNvidiaGPUIndices()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if len(all) == 0 {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||||
|
}
|
||||||
|
|
||||||
|
selected := all
|
||||||
|
if len(include) > 0 {
|
||||||
|
want := make(map[int]struct{}, len(include))
|
||||||
|
for _, idx := range include {
|
||||||
|
want[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
selected = selected[:0]
|
||||||
|
for _, idx := range all {
|
||||||
|
if _, ok := want[idx]; ok {
|
||||||
|
selected = append(selected, idx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(exclude) > 0 {
|
||||||
|
skip := make(map[int]struct{}, len(exclude))
|
||||||
|
for _, idx := range exclude {
|
||||||
|
skip[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
filtered := selected[:0]
|
||||||
|
for _, idx := range selected {
|
||||||
|
if _, ok := skip[idx]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
filtered = append(filtered, idx)
|
||||||
|
}
|
||||||
|
selected = filtered
|
||||||
|
}
|
||||||
|
if len(selected) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
|
||||||
|
}
|
||||||
|
out := append([]int(nil), selected...)
|
||||||
|
sort.Ints(out)
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func listNvidiaGPUIndices() ([]int, error) {
|
||||||
|
out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||||
|
}
|
||||||
|
var indices []int
|
||||||
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||||
|
line = strings.TrimSpace(line)
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, err := strconv.Atoi(line)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
indices = append(indices, idx)
|
||||||
|
}
|
||||||
|
return dedupeSortedIndices(indices), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// dedupeSortedIndices returns the distinct non-negative entries of values in
// ascending order. Negative entries are discarded. The input may be unsorted
// and is not modified. An empty or nil input yields nil.
func dedupeSortedIndices(values []int) []int {
	if len(values) == 0 {
		return nil
	}

	// Copy the usable entries so sorting does not disturb the caller's slice.
	out := make([]int, 0, len(values))
	for _, v := range values {
		if v >= 0 {
			out = append(out, v)
		}
	}
	sort.Ints(out)

	// After sorting, duplicates are adjacent: keep an entry only when it
	// differs from the last one kept.
	kept := 0
	for i, v := range out {
		if i > 0 && v == out[kept-1] {
			continue
		}
		out[kept] = v
		kept++
	}
	return out[:kept]
}
|
||||||
|
|
||||||
|
// joinIndexList renders values as decimal numbers separated by commas, in the
// given order (for example [0 2] becomes "0,2"). An empty or nil slice yields
// the empty string.
func joinIndexList(values []int) string {
	var b strings.Builder
	for i, v := range values {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(strconv.Itoa(v))
	}
	return b.String()
}
|
||||||
@@ -423,7 +423,10 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||||
path, err := satLookPath("bee-gpu-stress")
|
path, err := satLookPath("bee-gpu-burn")
|
||||||
|
if err != nil {
|
||||||
|
path, err = satLookPath("bee-gpu-stress")
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -136,7 +136,10 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
|||||||
tools = append(tools, s.CheckTools([]string{
|
tools = append(tools, s.CheckTools([]string{
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
"nvidia-bug-report.sh",
|
"nvidia-bug-report.sh",
|
||||||
"bee-gpu-stress",
|
"bee-gpu-burn",
|
||||||
|
"bee-john-gpu-stress",
|
||||||
|
"bee-nccl-gpu-stress",
|
||||||
|
"all_reduce_perf",
|
||||||
})...)
|
})...)
|
||||||
case "amd":
|
case "amd":
|
||||||
tool := ToolStatus{Name: "rocm-smi"}
|
tool := ToolStatus{Name: "rocm-smi"}
|
||||||
@@ -176,8 +179,8 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
|
|||||||
health.DriverReady = true
|
health.DriverReady = true
|
||||||
}
|
}
|
||||||
|
|
||||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
|
||||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||||
if err == nil {
|
if err == nil {
|
||||||
health.CUDAReady = true
|
health.CUDAReady = true
|
||||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||||
|
|||||||
@@ -425,14 +425,12 @@ type satStats struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func nvidiaSATJobs() []satJob {
|
func nvidiaSATJobs() []satJob {
|
||||||
seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
|
|
||||||
sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
|
|
||||||
return []satJob{
|
return []satJob{
|
||||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||||
{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
|
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -130,26 +130,21 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
|||||||
stats.OK++
|
stats.OK++
|
||||||
}
|
}
|
||||||
|
|
||||||
// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
|
// loadPhase runs bee-gpu-burn for durSec; sampler stamps phaseName on each row.
|
||||||
loadPhase := func(phaseName, stepName string, durSec int) {
|
loadPhase := func(phaseName, stepName string, durSec int) {
|
||||||
if ctx.Err() != nil {
|
if ctx.Err() != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
setPhase(phaseName)
|
setPhase(phaseName)
|
||||||
var env []string
|
|
||||||
if len(opts.GPUIndices) > 0 {
|
|
||||||
ids := make([]string, len(opts.GPUIndices))
|
|
||||||
for i, idx := range opts.GPUIndices {
|
|
||||||
ids[i] = strconv.Itoa(idx)
|
|
||||||
}
|
|
||||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
|
||||||
}
|
|
||||||
cmd := []string{
|
cmd := []string{
|
||||||
"bee-gpu-stress",
|
"bee-gpu-burn",
|
||||||
"--seconds", strconv.Itoa(durSec),
|
"--seconds", strconv.Itoa(durSec),
|
||||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||||
}
|
}
|
||||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env, nil)
|
if len(opts.GPUIndices) > 0 {
|
||||||
|
cmd = append(cmd, "--devices", joinIndexList(dedupeSortedIndices(opts.GPUIndices)))
|
||||||
|
}
|
||||||
|
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, nil, nil)
|
||||||
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||||
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||||
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||||
@@ -323,8 +318,9 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
|||||||
|
|
||||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||||
// Handles two formats:
|
// Handles two formats:
|
||||||
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
//
|
||||||
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
||||||
|
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
||||||
func parseFanSpeeds(raw string) []FanReading {
|
func parseFanSpeeds(raw string) []FanReading {
|
||||||
var fans []FanReading
|
var fans []FanReading
|
||||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||||
|
|||||||
@@ -31,8 +31,8 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
|||||||
if len(jobs) != 5 {
|
if len(jobs) != 5 {
|
||||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||||
}
|
}
|
||||||
if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
|
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
||||||
t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
|
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||||
}
|
}
|
||||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
if got := jobs[3].cmd[1]; got != "--output-file" {
|
||||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||||
@@ -80,13 +80,10 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
|
||||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
|
||||||
|
|
||||||
jobs := nvidiaSATJobs()
|
jobs := nvidiaSATJobs()
|
||||||
got := jobs[4].cmd
|
got := jobs[4].cmd
|
||||||
want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
|
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||||
if len(got) != len(want) {
|
if len(got) != len(want) {
|
||||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||||
}
|
}
|
||||||
@@ -97,6 +94,74 @@ func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||||
|
DurationSec: 600,
|
||||||
|
Loader: NvidiaStressLoaderJohn,
|
||||||
|
ExcludeGPUIndices: []int{1},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||||
|
}
|
||||||
|
wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"}
|
||||||
|
if len(job.cmd) != len(wantCmd) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||||
|
}
|
||||||
|
for i := range wantCmd {
|
||||||
|
if job.cmd[i] != wantCmd[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||||
|
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
oldExecCommand := satExecCommand
|
||||||
|
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||||
|
if name == "nvidia-smi" {
|
||||||
|
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||||
|
}
|
||||||
|
return exec.Command(name, args...)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||||
|
|
||||||
|
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||||
|
DurationSec: 120,
|
||||||
|
Loader: NvidiaStressLoaderNCCL,
|
||||||
|
GPUIndices: []int{2, 0},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||||
|
}
|
||||||
|
wantCmd := []string{"bee-nccl-gpu-stress", "--seconds", "120", "--devices", "0,2"}
|
||||||
|
if len(job.cmd) != len(wantCmd) {
|
||||||
|
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||||
|
}
|
||||||
|
for i := range wantCmd {
|
||||||
|
if job.cmd[i] != wantCmd[i] {
|
||||||
|
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||||
|
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestEnvIntFallback(t *testing.T) {
|
func TestEnvIntFallback(t *testing.T) {
|
||||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||||
@@ -122,8 +187,8 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||||
{name: "cuda not ready", job: "bee-gpu-stress", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
|
|||||||
@@ -51,6 +51,20 @@ type ToolStatus struct {
|
|||||||
OK bool
|
OK bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Loader names accepted in NvidiaStressOptions.Loader. Each selects the
// external program used to load the GPUs during the NVIDIA stress pack.
const (
	// NvidiaStressLoaderBuiltin selects the bee-gpu-burn program.
	NvidiaStressLoaderBuiltin = "builtin"
	// NvidiaStressLoaderJohn selects the bee-john-gpu-stress program.
	NvidiaStressLoaderJohn = "john"
	// NvidiaStressLoaderNCCL selects the bee-nccl-gpu-stress program.
	NvidiaStressLoaderNCCL = "nccl"
)
|
||||||
|
|
||||||
|
// NvidiaStressOptions configures one run of the NVIDIA stress pack.
type NvidiaStressOptions struct {
	// DurationSec is passed to the loader as --seconds. Values <= 0 are
	// replaced with 300 when the options are normalized.
	DurationSec int
	// SizeMB is passed as --size-mb to the builtin loader only. Values <= 0
	// are replaced with 64 when the options are normalized.
	SizeMB int
	// Loader names the load generator: one of the NvidiaStressLoader*
	// constants. Empty selects the builtin loader.
	Loader string
	// GPUIndices, when non-empty, restricts the run to these GPU indices.
	GPUIndices []int
	// ExcludeGPUIndices lists GPU indices to leave out of the run; it is
	// applied after GPUIndices.
	ExcludeGPUIndices []int
}
|
||||||
|
|
||||||
func New() *System {
|
func New() *System {
|
||||||
return &System{}
|
return &System{}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,14 +9,18 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
"bee/audit/internal/platform"
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
||||||
|
|
||||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
var jobCounter atomic.Uint64
|
var jobCounter atomic.Uint64
|
||||||
@@ -91,11 +95,25 @@ func runCmdJob(j *jobState, cmd *exec.Cmd) {
|
|||||||
j.finish(err.Error())
|
j.finish(err.Error())
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// Lower the CPU scheduling priority of stress/audit subprocesses to nice+10
|
||||||
|
// so the X server and kernel interrupt handling remain responsive under load
|
||||||
|
// (prevents KVM/IPMI graphical console from freezing during GPU stress tests).
|
||||||
|
if cmd.Process != nil {
|
||||||
|
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, 10)
|
||||||
|
}
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
scanner := bufio.NewScanner(pr)
|
scanner := bufio.NewScanner(pr)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
j.append(scanner.Text())
|
// Split on \r to handle progress-bar style output (e.g. \r overwrites)
|
||||||
|
// and strip ANSI escape codes so logs are readable in the browser.
|
||||||
|
parts := strings.Split(scanner.Text(), "\r")
|
||||||
|
for _, part := range parts {
|
||||||
|
line := ansiEscapeRE.ReplaceAllString(part, "")
|
||||||
|
if line != "" {
|
||||||
|
j.append(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
@@ -153,17 +171,24 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var body struct {
|
var body struct {
|
||||||
Duration int `json:"duration"`
|
Duration int `json:"duration"`
|
||||||
DiagLevel int `json:"diag_level"`
|
DiagLevel int `json:"diag_level"`
|
||||||
GPUIndices []int `json:"gpu_indices"`
|
GPUIndices []int `json:"gpu_indices"`
|
||||||
Profile string `json:"profile"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||||
DisplayName string `json:"display_name"`
|
Loader string `json:"loader"`
|
||||||
|
Profile string `json:"profile"`
|
||||||
|
DisplayName string `json:"display_name"`
|
||||||
}
|
}
|
||||||
if r.ContentLength > 0 {
|
if r.ContentLength > 0 {
|
||||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||||
}
|
}
|
||||||
|
|
||||||
name := taskNames[target]
|
name := taskNames[target]
|
||||||
|
if body.Profile != "" {
|
||||||
|
if n, ok := burnNames[target]; ok {
|
||||||
|
name = n
|
||||||
|
}
|
||||||
|
}
|
||||||
if name == "" {
|
if name == "" {
|
||||||
name = target
|
name = target
|
||||||
}
|
}
|
||||||
@@ -174,11 +199,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
Status: TaskPending,
|
Status: TaskPending,
|
||||||
CreatedAt: time.Now(),
|
CreatedAt: time.Now(),
|
||||||
params: taskParams{
|
params: taskParams{
|
||||||
Duration: body.Duration,
|
Duration: body.Duration,
|
||||||
DiagLevel: body.DiagLevel,
|
DiagLevel: body.DiagLevel,
|
||||||
GPUIndices: body.GPUIndices,
|
GPUIndices: body.GPUIndices,
|
||||||
BurnProfile: body.Profile,
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
DisplayName: body.DisplayName,
|
Loader: body.Loader,
|
||||||
|
BurnProfile: body.Profile,
|
||||||
|
DisplayName: body.DisplayName,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
@@ -405,6 +432,58 @@ func (h *handler) handleAPIExportBundle(w http.ResponseWriter, r *http.Request)
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIExportUSBTargets(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
targets, err := h.opts.App.ListRemovableTargets()
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if targets == nil {
|
||||||
|
targets = []platform.RemovableTarget{}
|
||||||
|
}
|
||||||
|
writeJSON(w, targets)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIExportUSBAudit(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var target platform.RemovableTarget
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || target.Device == "" {
|
||||||
|
writeError(w, http.StatusBadRequest, "device is required")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
result, err := h.opts.App.ExportLatestAuditResult(target)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Request) {
|
||||||
|
if h.opts.App == nil {
|
||||||
|
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var target platform.RemovableTarget
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || target.Device == "" {
|
||||||
|
writeError(w, http.StatusBadRequest, "device is required")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
result, err := h.opts.App.ExportSupportBundleResult(target)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||||
|
}
|
||||||
|
|
||||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||||
@@ -790,3 +869,85 @@ func (h *handler) rollbackPendingNetworkChange() error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Display / Screen Resolution ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
// displayMode is one resolution listed under an xrandr output.
type displayMode struct {
	Output  string `json:"output"`
	Mode    string `json:"mode"`
	Current bool   `json:"current"`
}

// displayInfo groups the modes parsed for a single connected output.
type displayInfo struct {
	Output  string        `json:"output"`
	Modes   []displayMode `json:"modes"`
	Current string        `json:"current"`
}

var (
	// xrandrOutputRE matches the header line of a connected output and
	// captures its name; "disconnected" headers do not match.
	xrandrOutputRE = regexp.MustCompile(`^(\S+)\s+connected`)
	// xrandrModeRE matches a mode line (three leading whitespace characters
	// followed by WIDTHxHEIGHT) and captures the resolution.
	xrandrModeRE = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
	// xrandrCurrentRE finds the "*" marker that flags the active mode.
	xrandrCurrentRE = regexp.MustCompile(`\*`)
)

// parseXrandrOutput extracts the connected outputs and their modes from the
// text printed by xrandr. It returns one displayInfo per connected output in
// order of appearance, or nil when no connected output is found.
//
// Any non-indented line that is not a connected-output header (the "Screen"
// line or a disconnected output's header) closes the current section, so
// modes listed beneath a disconnected output are not attributed to the
// connected output that precedes it.
func parseXrandrOutput(out string) []displayInfo {
	var infos []displayInfo
	var cur *displayInfo

	// flush appends the section being built, if any, to the result.
	flush := func() {
		if cur != nil {
			infos = append(infos, *cur)
			cur = nil
		}
	}

	for _, line := range strings.Split(out, "\n") {
		if m := xrandrOutputRE.FindStringSubmatch(line); m != nil {
			flush()
			cur = &displayInfo{Output: m[1]}
			continue
		}
		if line != "" && line[0] != ' ' && line[0] != '\t' {
			// A new header that is not a connected output ends the section.
			flush()
			continue
		}
		if cur == nil {
			continue
		}
		m := xrandrModeRE.FindStringSubmatch(line)
		if m == nil {
			continue
		}
		isCurrent := xrandrCurrentRE.MatchString(line)
		cur.Modes = append(cur.Modes, displayMode{Output: cur.Output, Mode: m[1], Current: isCurrent})
		if isCurrent {
			cur.Current = m[1]
		}
	}
	flush()
	return infos
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
out, err := exec.Command("xrandr").Output()
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, parseXrandrOutput(string(out)))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
|
||||||
|
var req struct {
|
||||||
|
Output string `json:"output"`
|
||||||
|
Mode string `json:"mode"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
|
||||||
|
writeError(w, http.StatusBadRequest, "output and mode are required")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Validate mode looks like WxH to prevent injection
|
||||||
|
if !regexp.MustCompile(`^\d+x\d+$`).MatchString(req.Mode) {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid mode format")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Validate output name (no special chars)
|
||||||
|
if !regexp.MustCompile(`^[A-Za-z0-9_\-]+$`).MatchString(req.Output) {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid output name")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if out, err := exec.Command("xrandr", "--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
|
||||||
|
}
|
||||||
|
|||||||
@@ -205,12 +205,83 @@ document.querySelectorAll('.terminal').forEach(function(t){
|
|||||||
|
|
||||||
func renderDashboard(opts HandlerOptions) string {
|
func renderDashboard(opts HandlerOptions) string {
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
|
b.WriteString(renderAuditStatusBanner(opts))
|
||||||
b.WriteString(renderHardwareSummaryCard(opts))
|
b.WriteString(renderHardwareSummaryCard(opts))
|
||||||
b.WriteString(renderHealthCard(opts))
|
b.WriteString(renderHealthCard(opts))
|
||||||
b.WriteString(renderMetrics())
|
b.WriteString(renderMetrics())
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// renderAuditStatusBanner shows a live progress banner when an audit task is
|
||||||
|
// running and auto-reloads the page when it completes.
|
||||||
|
func renderAuditStatusBanner(opts HandlerOptions) string {
|
||||||
|
// If audit data already exists, no banner needed — data is fresh.
|
||||||
|
// We still inject the polling script so a newly-triggered audit also reloads.
|
||||||
|
hasData := false
|
||||||
|
if _, err := loadSnapshot(opts.AuditPath); err == nil {
|
||||||
|
hasData = true
|
||||||
|
}
|
||||||
|
_ = hasData
|
||||||
|
|
||||||
|
return `<div id="audit-banner" style="display:none" class="alert alert-warn" style="margin-bottom:16px">
|
||||||
|
<span id="audit-banner-text">▶ Hardware audit is running — page will refresh automatically when complete.</span>
|
||||||
|
<a href="/tasks" style="margin-left:12px;font-size:12px">View in Tasks</a>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
(function(){
|
||||||
|
var _auditPoll = null;
|
||||||
|
var _auditSeenRunning = false;
|
||||||
|
|
||||||
|
function pollAuditTask() {
|
||||||
|
fetch('/api/tasks').then(function(r){ return r.json(); }).then(function(tasks){
|
||||||
|
if (!tasks) return;
|
||||||
|
var audit = null;
|
||||||
|
for (var i = 0; i < tasks.length; i++) {
|
||||||
|
if (tasks[i].target === 'audit') { audit = tasks[i]; break; }
|
||||||
|
}
|
||||||
|
var banner = document.getElementById('audit-banner');
|
||||||
|
var txt = document.getElementById('audit-banner-text');
|
||||||
|
if (!audit) {
|
||||||
|
if (banner) banner.style.display = 'none';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (audit.status === 'running' || audit.status === 'pending') {
|
||||||
|
_auditSeenRunning = true;
|
||||||
|
if (banner) {
|
||||||
|
banner.style.display = '';
|
||||||
|
var label = audit.status === 'pending' ? 'pending\u2026' : 'running\u2026';
|
||||||
|
if (txt) txt.textContent = '\u25b6 Hardware audit ' + label + ' \u2014 page will refresh when complete.';
|
||||||
|
}
|
||||||
|
} else if (audit.status === 'done' && _auditSeenRunning) {
|
||||||
|
// Audit just finished — reload to show fresh hardware data.
|
||||||
|
clearInterval(_auditPoll);
|
||||||
|
if (banner) {
|
||||||
|
if (txt) txt.textContent = '\u2713 Audit complete \u2014 reloading\u2026';
|
||||||
|
banner.style.background = 'var(--ok-bg,#fcfff5)';
|
||||||
|
banner.style.color = 'var(--ok-fg,#2c662d)';
|
||||||
|
}
|
||||||
|
setTimeout(function(){ window.location.reload(); }, 800);
|
||||||
|
} else if (audit.status === 'failed') {
|
||||||
|
_auditSeenRunning = false;
|
||||||
|
if (banner) {
|
||||||
|
banner.style.display = '';
|
||||||
|
banner.style.background = 'var(--crit-bg,#fff6f6)';
|
||||||
|
banner.style.color = 'var(--crit-fg,#9f3a38)';
|
||||||
|
if (txt) txt.textContent = '\u2717 Audit failed: ' + (audit.error||'unknown error');
|
||||||
|
clearInterval(_auditPoll);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (banner) banner.style.display = 'none';
|
||||||
|
}
|
||||||
|
}).catch(function(){});
|
||||||
|
}
|
||||||
|
|
||||||
|
_auditPoll = setInterval(pollAuditTask, 3000);
|
||||||
|
pollAuditTask();
|
||||||
|
})();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
func renderAudit() string {
|
func renderAudit() string {
|
||||||
return `<div class="card"><div class="card-head">Audit Viewer <button class="btn btn-sm btn-secondary" style="margin-left:auto" onclick="openAuditModal()">Actions</button></div><div class="card-body" style="padding:0"><iframe class="viewer-frame" src="/viewer" title="Audit viewer"></iframe></div></div>`
|
return `<div class="card"><div class="card-head">Audit Viewer <button class="btn btn-sm btn-secondary" style="margin-left:auto" onclick="openAuditModal()">Actions</button></div><div class="card-body" style="padding:0"><iframe class="viewer-frame" src="/viewer" title="Audit viewer"></iframe></div></div>`
|
||||||
}
|
}
|
||||||
@@ -593,12 +664,15 @@ func renderBurn() string {
|
|||||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at maximum load. Repeated or prolonged use may reduce hardware lifespan (storage endurance, GPU wear). Use only when necessary.</div>
|
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at maximum load. Repeated or prolonged use may reduce hardware lifespan (storage endurance, GPU wear). Use only when necessary.</div>
|
||||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
<div class="card"><div class="card-head">Burn Profile</div><div class="card-body">
|
<div class="card"><div class="card-head">Burn Profile</div><div class="card-body">
|
||||||
<div class="form-row" style="max-width:320px"><label>Preset</label><select id="burn-profile"><option value="smoke">Smoke: 5 minutes</option><option value="acceptance">Acceptance: 1 hour</option><option value="overnight">Overnight: 8 hours</option></select></div>
|
<div class="form-row" style="max-width:320px"><label>Preset</label><select id="burn-profile"><option value="smoke" selected>Smoke: quick check (~5 min CPU / DCGM level 1)</option><option value="acceptance">Acceptance: 1 hour (DCGM level 3)</option><option value="overnight">Overnight: 8 hours (DCGM level 4)</option></select></div>
|
||||||
<p style="color:var(--muted);font-size:12px">Applied to all tests on this page. NVIDIA uses mapped DCGM levels: smoke=quick, acceptance=targeted stress, overnight=extended stress.</p>
|
<p style="color:var(--muted);font-size:12px">Applied to all tests on this page. NVIDIA SAT on the Validate page still uses DCGM. NVIDIA GPU Stress on this page uses the selected stress loader for the preset duration.</p>
|
||||||
</div></div>
|
</div></div>
|
||||||
<div class="grid3">
|
<div class="grid3">
|
||||||
<div class="card"><div class="card-head">NVIDIA GPU Stress</div><div class="card-body">
|
<div class="card"><div class="card-head">NVIDIA GPU Stress</div><div class="card-body">
|
||||||
<button id="sat-btn-nvidia" class="btn btn-primary" onclick="runBurnIn('nvidia')">▶ Start NVIDIA Stress</button>
|
<div class="form-row"><label>Load Tool</label><select id="nvidia-stress-loader"><option value="builtin" selected>bee-gpu-burn</option><option value="nccl">NCCL all_reduce_perf</option><option value="john">John the Ripper jumbo (OpenCL)</option></select></div>
|
||||||
|
<div class="form-row"><label>Exclude GPU indices</label><input type="text" id="nvidia-stress-exclude" placeholder="e.g. 1,3"></div>
|
||||||
|
<p style="color:var(--muted);font-size:12px;margin-bottom:8px"><code>bee-gpu-burn</code> runs on all detected NVIDIA GPUs by default. <code>NCCL all_reduce_perf</code> is useful for multi-GPU / interconnect load. Use exclusions only when one or more cards must be skipped.</p>
|
||||||
|
<button id="sat-btn-nvidia-stress" class="btn btn-primary" onclick="runBurnIn('nvidia-stress')">▶ Start NVIDIA Stress</button>
|
||||||
</div></div>
|
</div></div>
|
||||||
<div class="card"><div class="card-head">CPU Stress</div><div class="card-body">
|
<div class="card"><div class="card-head">CPU Stress</div><div class="card-body">
|
||||||
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
||||||
@@ -626,11 +700,24 @@ func renderBurn() string {
|
|||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
let biES = null;
|
let biES = null;
|
||||||
|
function parseGPUIndexList(raw) {
|
||||||
|
return (raw || '')
|
||||||
|
.split(',')
|
||||||
|
.map(v => v.trim())
|
||||||
|
.filter(v => v !== '')
|
||||||
|
.map(v => Number(v))
|
||||||
|
.filter(v => Number.isInteger(v) && v >= 0);
|
||||||
|
}
|
||||||
function runBurnIn(target) {
|
function runBurnIn(target) {
|
||||||
if (biES) { biES.close(); biES = null; }
|
if (biES) { biES.close(); biES = null; }
|
||||||
const body = { profile: document.getElementById('burn-profile').value || 'smoke' };
|
const body = { profile: document.getElementById('burn-profile').value || 'smoke' };
|
||||||
|
if (target === 'nvidia-stress') {
|
||||||
|
body.loader = document.getElementById('nvidia-stress-loader').value || 'builtin';
|
||||||
|
body.exclude_gpu_indices = parseGPUIndexList(document.getElementById('nvidia-stress-exclude').value);
|
||||||
|
}
|
||||||
document.getElementById('bi-output').style.display='block';
|
document.getElementById('bi-output').style.display='block';
|
||||||
document.getElementById('bi-title').textContent = '— ' + target + ' [' + body.profile + ']';
|
const loaderLabel = body.loader ? ' / ' + body.loader : '';
|
||||||
|
document.getElementById('bi-title').textContent = '— ' + target + loaderLabel + ' [' + body.profile + ']';
|
||||||
const term = document.getElementById('bi-terminal');
|
const term = document.getElementById('bi-terminal');
|
||||||
term.textContent = 'Enqueuing ' + target + ' stress...\n';
|
term.textContent = 'Enqueuing ' + target + ' stress...\n';
|
||||||
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
||||||
@@ -645,7 +732,7 @@ function runBurnIn(target) {
|
|||||||
</script>
|
</script>
|
||||||
<script>
|
<script>
|
||||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
if (!gp.nvidia) disableSATCard('nvidia-stress', 'No NVIDIA GPU detected');
|
||||||
if (!gp.amd) disableSATCard('amd-stress', 'No AMD GPU detected');
|
if (!gp.amd) disableSATCard('amd-stress', 'No AMD GPU detected');
|
||||||
});
|
});
|
||||||
function disableSATCard(id, reason) {
|
function disableSATCard(id, reason) {
|
||||||
@@ -845,12 +932,79 @@ func renderExport(exportDir string) string {
|
|||||||
return `<div class="grid2">
|
return `<div class="grid2">
|
||||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
<a class="btn btn-primary" href="/export/support.tar.gz">⬇ Download Support Bundle</a>
|
<a class="btn btn-primary" href="/export/support.tar.gz">↓ Download Support Bundle</a>
|
||||||
</div></div>
|
</div></div>
|
||||||
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||||
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||||
</div></div>
|
</div></div>
|
||||||
</div>`
|
</div>
|
||||||
|
|
||||||
|
<div class="card" style="margin-top:16px">
|
||||||
|
<div class="card-head">Export to USB
|
||||||
|
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||||
|
</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
|
||||||
|
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
||||||
|
<div id="usb-targets" style="margin-top:12px"></div>
|
||||||
|
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
(function(){
  // Targets from the most recent scan. Row buttons refer to them by index so
  // no JSON has to be embedded in an HTML attribute.
  var usbTargets = [];

  // esc makes a value safe to place in HTML text or a quoted attribute.
  function esc(v) {
    return String(v == null ? '' : v)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }

  // usbRefresh rescans for removable devices and redraws the target table.
  function usbRefresh() {
    document.getElementById('usb-status').textContent = 'Scanning...';
    document.getElementById('usb-targets').innerHTML = '';
    document.getElementById('usb-msg').textContent = '';
    fetch('/api/export/usb').then(r=>r.json()).then(targets => {
      const st = document.getElementById('usb-status');
      const ct = document.getElementById('usb-targets');
      if (targets && !Array.isArray(targets)) {
        // Not a list: show the reported error instead of failing in map().
        st.textContent = 'Error: ' + (targets.error || 'unexpected response');
        return;
      }
      usbTargets = targets || [];
      if (usbTargets.length === 0) {
        st.textContent = 'No removable USB devices found.';
        return;
      }
      st.textContent = usbTargets.length + ' device(s) found:';
      ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
        usbTargets.map((t, i) => {
          // Device label and model come from the plugged-in medium: escape them.
          return '<tr>' +
            '<td style="font-family:monospace">'+esc(t.device)+'</td>' +
            '<td>'+esc(t.fs_type)+'</td>' +
            '<td>'+esc(t.size)+'</td>' +
            '<td>'+esc(t.label)+'</td>' +
            '<td style="font-size:12px;color:var(--muted)">'+esc(t.model)+'</td>' +
            '<td style="white-space:nowrap">' +
              '<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+i+')">Audit JSON</button> ' +
              '<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+i+')">Support Bundle</button>' +
            '</td></tr>';
        }).join('') + '</table>';
    }).catch(e => {
      document.getElementById('usb-status').textContent = 'Error: ' + e;
    });
  }

  // usbExport posts the chosen target to /api/export/usb/<type>. target may
  // be a target object or an index into the most recent scan.
  window.usbExport = function(type, target) {
    if (typeof target === 'number') target = usbTargets[target];
    const msg = document.getElementById('usb-msg');
    if (!target) {
      msg.style.color = 'var(--err,red)';
      msg.textContent = 'Error: device list changed, refresh and try again.';
      return;
    }
    msg.style.color = 'var(--muted)';
    msg.textContent = 'Exporting to ' + (target.device||'') + '...';
    fetch('/api/export/usb/'+type, {
      method: 'POST',
      headers: {'Content-Type':'application/json'},
      body: JSON.stringify(target)
    }).then(r=>r.json()).then(d => {
      if (d.error) { msg.style.color='var(--err,red)'; msg.textContent = 'Error: '+d.error; return; }
      msg.style.color = 'var(--ok,green)';
      msg.textContent = d.message || 'Done.';
    }).catch(e => {
      msg.style.color = 'var(--err,red)';
      msg.textContent = 'Error: '+e;
    });
  };
  window.usbRefresh = usbRefresh;
  usbRefresh();
})();
|
||||||
|
</script>`
|
||||||
}
|
}
|
||||||
|
|
||||||
func listExportFiles(exportDir string) ([]string, error) {
|
func listExportFiles(exportDir string) ([]string, error) {
|
||||||
@@ -876,6 +1030,56 @@ func listExportFiles(exportDir string) ([]string, error) {
|
|||||||
return entries, nil
|
return entries, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Display Resolution ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// renderDisplayInline returns the markup and script for the display
// resolution control. The script loads outputs from /api/display/resolutions,
// draws one mode selector per output, and posts the chosen mode to
// /api/display/set.
//
// Output and mode strings are HTML-escaped before insertion, and the Apply
// button carries the output name in a data attribute instead of an inline
// string literal. A response that is not a list is reported as an error.
func renderDisplayInline() string {
	return `<div id="display-status" style="color:var(--muted);font-size:13px;margin-bottom:12px">Loading displays...</div>
<div id="display-controls"></div>
<script>
(function(){
  // esc makes a value safe to place in HTML text or a quoted attribute.
  function esc(v) {
    return String(v == null ? '' : v)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&#39;');
  }
  function loadDisplays() {
    fetch('/api/display/resolutions').then(r=>r.json()).then(displays => {
      const status = document.getElementById('display-status');
      const ctrl = document.getElementById('display-controls');
      if (displays && !Array.isArray(displays)) {
        status.textContent = displays.error ? 'Error: ' + displays.error : 'xrandr not available on this system.';
        ctrl.innerHTML = '';
        return;
      }
      if (!displays || displays.length === 0) {
        status.textContent = 'No connected displays found or xrandr not available.';
        ctrl.innerHTML = '';
        return;
      }
      status.textContent = '';
      ctrl.innerHTML = displays.map(d => {
        const name = esc(d.output);
        const opts = (d.modes||[]).map(m => {
          const mode = esc(m.mode);
          return '<option value="'+mode+'"'+(m.current?' selected':'')+'>'+mode+(m.current?' (current)':'')+'</option>';
        }).join('');
        return '<div style="margin-bottom:12px">'
          +'<span style="font-weight:600;margin-right:8px">'+esc(d.output)+'</span>'
          +'<span style="color:var(--muted);font-size:12px;margin-right:12px">Current: '+esc(d.current)+'</span>'
          +'<select id="res-sel-'+name+'" style="margin-right:8px">'+opts+'</select>'
          +'<button class="btn btn-sm btn-primary" data-output="'+name+'" onclick="applyResolution(this.dataset.output)">Apply</button>'
          +'</div>';
      }).join('');
    }).catch(()=>{
      document.getElementById('display-status').textContent = 'xrandr not available on this system.';
    });
  }
  window.applyResolution = function(output) {
    const sel = document.getElementById('res-sel-'+output);
    if (!sel) return;
    const mode = sel.value;
    const btn = sel.nextElementSibling;
    btn.disabled = true;
    btn.textContent = 'Applying...';
    fetch('/api/display/set', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify({output:output,mode:mode})})
      .then(r=>r.json()).then(d=>{
        if (d.error) { alert('Error: '+d.error); }
        loadDisplays();
      }).catch(e=>{ alert('Error: '+e); })
      .finally(()=>{ btn.disabled=false; btn.textContent='Apply'; });
  };
  loadDisplays();
})();
</script>`
}
|
||||||
|
|
||||||
// ── Tools ─────────────────────────────────────────────────────────────────────
|
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
func renderTools() string {
|
func renderTools() string {
|
||||||
@@ -927,6 +1131,9 @@ function installToRAM() {
|
|||||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||||
renderServicesInline() + `</div></div>
|
renderServicesInline() + `</div></div>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">Display Resolution</div><div class="card-body">` +
|
||||||
|
renderDisplayInline() + `</div></div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
function checkTools() {
|
function checkTools() {
|
||||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||||
|
|||||||
@@ -206,6 +206,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
|
|
||||||
// SAT
|
// SAT
|
||||||
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
|
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
|
||||||
|
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
|
||||||
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
||||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||||
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
||||||
@@ -241,10 +242,17 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// Export
|
// Export
|
||||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||||
mux.HandleFunc("POST /api/export/bundle", h.handleAPIExportBundle)
|
mux.HandleFunc("POST /api/export/bundle", h.handleAPIExportBundle)
|
||||||
|
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
||||||
|
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
|
||||||
|
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
|
||||||
|
|
||||||
// Tools
|
// Tools
|
||||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||||
|
|
||||||
|
// Display
|
||||||
|
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
||||||
|
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
||||||
|
|
||||||
// GPU presence
|
// GPU presence
|
||||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||||
|
|
||||||
|
|||||||
@@ -24,22 +24,31 @@ const (
|
|||||||
TaskCancelled = "cancelled"
|
TaskCancelled = "cancelled"
|
||||||
)
|
)
|
||||||
|
|
||||||
// taskNames maps target → human-readable name.
|
// taskNames maps target → human-readable name for validate (SAT) runs.
|
||||||
var taskNames = map[string]string{
|
var taskNames = map[string]string{
|
||||||
"nvidia": "NVIDIA SAT",
|
"nvidia": "NVIDIA SAT",
|
||||||
"memory": "Memory SAT",
|
"nvidia-stress": "NVIDIA GPU Stress",
|
||||||
"storage": "Storage SAT",
|
"memory": "Memory SAT",
|
||||||
"cpu": "CPU SAT",
|
"storage": "Storage SAT",
|
||||||
"amd": "AMD GPU SAT",
|
"cpu": "CPU SAT",
|
||||||
"amd-mem": "AMD GPU MEM Integrity",
|
"amd": "AMD GPU SAT",
|
||||||
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
"amd-mem": "AMD GPU MEM Integrity",
|
||||||
"amd-stress": "AMD GPU Burn-in",
|
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||||
"memory-stress": "Memory Burn-in",
|
"amd-stress": "AMD GPU Burn-in",
|
||||||
"sat-stress": "SAT Stress (stressapptest)",
|
"memory-stress": "Memory Burn-in",
|
||||||
|
"sat-stress": "SAT Stress (stressapptest)",
|
||||||
"platform-stress": "Platform Thermal Cycling",
|
"platform-stress": "Platform Thermal Cycling",
|
||||||
"audit": "Audit",
|
"audit": "Audit",
|
||||||
"install": "Install to Disk",
|
"install": "Install to Disk",
|
||||||
"install-to-ram": "Install to RAM",
|
"install-to-ram": "Install to RAM",
|
||||||
|
}
|
||||||
|
|
||||||
|
// burnNames maps target → human-readable name when a burn profile is set.
|
||||||
|
var burnNames = map[string]string{
|
||||||
|
"nvidia": "NVIDIA Burn-in",
|
||||||
|
"memory": "Memory Burn-in",
|
||||||
|
"cpu": "CPU Burn-in",
|
||||||
|
"amd": "AMD GPU Burn-in",
|
||||||
}
|
}
|
||||||
|
|
||||||
// Task represents one unit of work in the queue.
|
// Task represents one unit of work in the queue.
|
||||||
@@ -62,12 +71,14 @@ type Task struct {
|
|||||||
|
|
||||||
// taskParams holds optional parameters parsed from the run request.
|
// taskParams holds optional parameters parsed from the run request.
|
||||||
type taskParams struct {
|
type taskParams struct {
|
||||||
Duration int `json:"duration,omitempty"`
|
Duration int `json:"duration,omitempty"`
|
||||||
DiagLevel int `json:"diag_level,omitempty"`
|
DiagLevel int `json:"diag_level,omitempty"`
|
||||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||||
BurnProfile string `json:"burn_profile,omitempty"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||||
DisplayName string `json:"display_name,omitempty"`
|
Loader string `json:"loader,omitempty"`
|
||||||
Device string `json:"device,omitempty"` // for install
|
BurnProfile string `json:"burn_profile,omitempty"`
|
||||||
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
|
Device string `json:"device,omitempty"` // for install
|
||||||
}
|
}
|
||||||
|
|
||||||
type persistedTask struct {
|
type persistedTask struct {
|
||||||
@@ -162,6 +173,9 @@ var (
|
|||||||
runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
|
return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
runNvidiaStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaStressPackCtx(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||||
}
|
}
|
||||||
@@ -403,6 +417,17 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
} else {
|
} else {
|
||||||
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||||
}
|
}
|
||||||
|
case "nvidia-stress":
|
||||||
|
dur := t.params.Duration
|
||||||
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
|
}
|
||||||
|
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||||
|
DurationSec: dur,
|
||||||
|
Loader: t.params.Loader,
|
||||||
|
GPUIndices: t.params.GPUIndices,
|
||||||
|
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||||
|
}, j.append)
|
||||||
case "memory":
|
case "memory":
|
||||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
case "storage":
|
case "storage":
|
||||||
|
|||||||
2
bible
2
bible
Submodule bible updated: 456c1f022c...688b87e98d
@@ -81,9 +81,9 @@ build-in-container.sh [--authorized-keys /path/to/keys]
|
|||||||
7. `build-cublas.sh`:
|
7. `build-cublas.sh`:
|
||||||
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
|
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
|
||||||
b. verify packages against repo `Packages.gz`
|
b. verify packages against repo `Packages.gz`
|
||||||
c. extract headers for `bee-gpu-stress` build
|
c. extract headers for `bee-gpu-burn` worker build
|
||||||
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
|
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
|
||||||
8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
|
8. build `bee-gpu-burn` worker against extracted cuBLASLt/cudart headers
|
||||||
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||||
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||||
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
|
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
|
||||||
@@ -104,7 +104,7 @@ Build host notes:
|
|||||||
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
||||||
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
||||||
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
||||||
- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
- `bee-gpu-burn` worker must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
||||||
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
|
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
|
||||||
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
||||||
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
||||||
@@ -153,18 +153,17 @@ Current validation state:
|
|||||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||||
|
|
||||||
Acceptance flows:
|
Acceptance flows:
|
||||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
|
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-burn`
|
||||||
|
- NVIDIA GPU burn-in can use either `bee-gpu-burn` or `bee-john-gpu-stress` (John the Ripper jumbo via OpenCL)
|
||||||
- `bee sat memory` → `memtester` archive
|
- `bee sat memory` → `memtester` archive
|
||||||
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
||||||
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
||||||
- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
- `bee-gpu-burn` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
||||||
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
|
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
|
||||||
- Ada / Hopper: add `fp8`
|
- Ada / Hopper: add `fp8`
|
||||||
- Blackwell+: add `fp4`
|
- Blackwell+: add `fp4`
|
||||||
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
|
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
|
||||||
- Runtime overrides:
|
- Runtime overrides:
|
||||||
- `BEE_GPU_STRESS_SECONDS`
|
|
||||||
- `BEE_GPU_STRESS_SIZE_MB`
|
|
||||||
- `BEE_MEMTESTER_SIZE_MB`
|
- `BEE_MEMTESTER_SIZE_MB`
|
||||||
- `BEE_MEMTESTER_PASSES`
|
- `BEE_MEMTESTER_PASSES`
|
||||||
|
|
||||||
@@ -179,6 +178,6 @@ Web UI: Acceptance Tests page → Run Test button
|
|||||||
```
|
```
|
||||||
|
|
||||||
**Critical invariants:**
|
**Critical invariants:**
|
||||||
- `bee-gpu-stress` uses `exec.CommandContext` — killed on job context cancel.
|
- `bee-gpu-burn` / `bee-john-gpu-stress` use `exec.CommandContext` — killed on job context cancel.
|
||||||
- Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
|
- Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
|
||||||
- SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
|
- SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
|
||||||
|
|||||||
@@ -21,8 +21,8 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
||||||
- Machine-readable health summary derived from collector verdicts
|
- Machine-readable health summary derived from collector verdicts
|
||||||
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
||||||
- NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
|
- NVIDIA SAT includes diagnostic collection plus a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||||
- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
- `bee-gpu-burn` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
||||||
- Automatic boot audit with operator-facing local console and SSH access
|
- Automatic boot audit with operator-facing local console and SSH access
|
||||||
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
||||||
- SSH access (OpenSSH) always available for inspection and debugging
|
- SSH access (OpenSSH) always available for inspection and debugging
|
||||||
@@ -70,7 +70,7 @@ Fills gaps where Redfish/logpile is blind:
|
|||||||
| SSH | OpenSSH server |
|
| SSH | OpenSSH server |
|
||||||
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
||||||
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
||||||
| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
| GPU stress backend | `bee-gpu-burn` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
||||||
| Builder | Debian 12 host/VM or Debian 12 container image |
|
| Builder | Debian 12 host/VM or Debian 12 container image |
|
||||||
|
|
||||||
## Operator UX
|
## Operator UX
|
||||||
|
|||||||
@@ -18,6 +18,8 @@ Use the official proprietary NVIDIA `.run` installer for both kernel modules and
|
|||||||
- Kernel modules and nvidia-smi come from a single verified source.
|
- Kernel modules and nvidia-smi come from a single verified source.
|
||||||
- NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
|
- NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
|
||||||
- Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
|
- Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
|
||||||
|
- DCGM must track the CUDA user-mode driver major version exposed by `nvidia-smi`.
|
||||||
|
- For NVIDIA driver branch `590` with CUDA `13.x`, use DCGM 4 package family `datacenter-gpu-manager-4-cuda13`; legacy `datacenter-gpu-manager` 3.x does not provide a working path for this stack.
|
||||||
- Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
|
- Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
|
||||||
- Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
|
- Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
|
||||||
- ISO size increases by ~50MB for .ko files + nvidia-smi.
|
- ISO size increases by ~50MB for .ko files + nvidia-smi.
|
||||||
|
|||||||
Submodule internal/chart updated: ac8120c8ab...05db6994d4
@@ -48,6 +48,7 @@ sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
|
|||||||
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
||||||
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
||||||
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
|
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
|
||||||
|
- The NVIDIA variant installs DCGM 4 packages matched to the CUDA user-mode driver major version. For driver branch `590` / CUDA `13.x`, the package family is `datacenter-gpu-manager-4-cuda13` rather than legacy `datacenter-gpu-manager`.
|
||||||
- Override the container platform only if you know why:
|
- Override the container platform only if you know why:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
|
|||||||
@@ -23,6 +23,16 @@ RUN apt-get update -qq && apt-get install -y \
|
|||||||
gcc \
|
gcc \
|
||||||
make \
|
make \
|
||||||
perl \
|
perl \
|
||||||
|
pkg-config \
|
||||||
|
yasm \
|
||||||
|
libssl-dev \
|
||||||
|
zlib1g-dev \
|
||||||
|
libbz2-dev \
|
||||||
|
libgmp-dev \
|
||||||
|
libpcap-dev \
|
||||||
|
libsqlite3-dev \
|
||||||
|
libcurl4-openssl-dev \
|
||||||
|
ocl-icd-opencl-dev \
|
||||||
linux-headers-amd64 \
|
linux-headers-amd64 \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,8 @@ NCCL_TESTS_VERSION=2.13.10
|
|||||||
NVCC_VERSION=12.8
|
NVCC_VERSION=12.8
|
||||||
CUBLAS_VERSION=13.0.2.14-1
|
CUBLAS_VERSION=13.0.2.14-1
|
||||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||||
DCGM_VERSION=3.3.9
|
DCGM_VERSION=4.5.2-1
|
||||||
|
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||||
ROCM_VERSION=6.3.4
|
ROCM_VERSION=6.3.4
|
||||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||||
ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
|
ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
|
||||||
|
|||||||
@@ -29,8 +29,13 @@ typedef void *CUfunction;
|
|||||||
typedef void *CUstream;
|
typedef void *CUstream;
|
||||||
|
|
||||||
#define CU_SUCCESS 0
|
#define CU_SUCCESS 0
|
||||||
|
#define CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT 16
|
||||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
||||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
||||||
|
#define MAX_STRESS_STREAMS 16
|
||||||
|
#define MAX_CUBLAS_PROFILES 5
|
||||||
|
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||||
|
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||||
|
|
||||||
static const char *ptx_source =
|
static const char *ptx_source =
|
||||||
".version 6.0\n"
|
".version 6.0\n"
|
||||||
@@ -97,6 +102,9 @@ typedef CUresult (*cuLaunchKernel_fn)(CUfunction,
|
|||||||
CUstream,
|
CUstream,
|
||||||
void **,
|
void **,
|
||||||
void **);
|
void **);
|
||||||
|
typedef CUresult (*cuMemGetInfo_fn)(size_t *, size_t *);
|
||||||
|
typedef CUresult (*cuStreamCreate_fn)(CUstream *, unsigned int);
|
||||||
|
typedef CUresult (*cuStreamDestroy_fn)(CUstream);
|
||||||
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
|
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
|
||||||
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
|
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
|
||||||
|
|
||||||
@@ -118,6 +126,9 @@ struct cuda_api {
|
|||||||
cuModuleLoadDataEx_fn cuModuleLoadDataEx;
|
cuModuleLoadDataEx_fn cuModuleLoadDataEx;
|
||||||
cuModuleGetFunction_fn cuModuleGetFunction;
|
cuModuleGetFunction_fn cuModuleGetFunction;
|
||||||
cuLaunchKernel_fn cuLaunchKernel;
|
cuLaunchKernel_fn cuLaunchKernel;
|
||||||
|
cuMemGetInfo_fn cuMemGetInfo;
|
||||||
|
cuStreamCreate_fn cuStreamCreate;
|
||||||
|
cuStreamDestroy_fn cuStreamDestroy;
|
||||||
cuGetErrorName_fn cuGetErrorName;
|
cuGetErrorName_fn cuGetErrorName;
|
||||||
cuGetErrorString_fn cuGetErrorString;
|
cuGetErrorString_fn cuGetErrorString;
|
||||||
};
|
};
|
||||||
@@ -128,9 +139,10 @@ struct stress_report {
|
|||||||
int cc_major;
|
int cc_major;
|
||||||
int cc_minor;
|
int cc_minor;
|
||||||
int buffer_mb;
|
int buffer_mb;
|
||||||
|
int stream_count;
|
||||||
unsigned long iterations;
|
unsigned long iterations;
|
||||||
uint64_t checksum;
|
uint64_t checksum;
|
||||||
char details[1024];
|
char details[16384];
|
||||||
};
|
};
|
||||||
|
|
||||||
static int load_symbol(void *lib, const char *name, void **out) {
|
static int load_symbol(void *lib, const char *name, void **out) {
|
||||||
@@ -144,7 +156,7 @@ static int load_cuda(struct cuda_api *api) {
|
|||||||
if (!api->lib) {
|
if (!api->lib) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return
|
if (!(
|
||||||
load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
|
load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
|
||||||
load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
|
load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
|
||||||
load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
|
load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
|
||||||
@@ -160,7 +172,17 @@ static int load_cuda(struct cuda_api *api) {
|
|||||||
load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
|
load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
|
||||||
load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
|
load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
|
||||||
load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
|
load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
|
||||||
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel);
|
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel))) {
|
||||||
|
dlclose(api->lib);
|
||||||
|
memset(api, 0, sizeof(*api));
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
load_symbol(api->lib, "cuMemGetInfo_v2", (void **)&api->cuMemGetInfo);
|
||||||
|
load_symbol(api->lib, "cuStreamCreate", (void **)&api->cuStreamCreate);
|
||||||
|
if (!load_symbol(api->lib, "cuStreamDestroy_v2", (void **)&api->cuStreamDestroy)) {
|
||||||
|
load_symbol(api->lib, "cuStreamDestroy", (void **)&api->cuStreamDestroy);
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
|
static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
|
||||||
@@ -193,14 +215,12 @@ static double now_seconds(void) {
|
|||||||
return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
|
return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if HAVE_CUBLASLT_HEADERS
|
|
||||||
static size_t round_down_size(size_t value, size_t multiple) {
|
static size_t round_down_size(size_t value, size_t multiple) {
|
||||||
if (multiple == 0 || value < multiple) {
|
if (multiple == 0 || value < multiple) {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
return value - (value % multiple);
|
return value - (value % multiple);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *major, int *minor) {
|
static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *major, int *minor) {
|
||||||
int cc_major = 0;
|
int cc_major = 0;
|
||||||
@@ -220,6 +240,75 @@ static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *maj
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int query_multiprocessor_count(struct cuda_api *api, CUdevice dev, int *count) {
|
||||||
|
int mp_count = 0;
|
||||||
|
if (!check_rc(api,
|
||||||
|
"cuDeviceGetAttribute(multiprocessors)",
|
||||||
|
api->cuDeviceGetAttribute(&mp_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev))) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
*count = mp_count;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t clamp_budget_to_free_memory(struct cuda_api *api, size_t requested_bytes) {
|
||||||
|
size_t free_bytes = 0;
|
||||||
|
size_t total_bytes = 0;
|
||||||
|
size_t max_bytes = requested_bytes;
|
||||||
|
|
||||||
|
if (!api->cuMemGetInfo) {
|
||||||
|
return requested_bytes;
|
||||||
|
}
|
||||||
|
if (api->cuMemGetInfo(&free_bytes, &total_bytes) != CU_SUCCESS || free_bytes == 0) {
|
||||||
|
return requested_bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
max_bytes = (free_bytes * 9u) / 10u;
|
||||||
|
if (max_bytes < (size_t)4u * 1024u * 1024u) {
|
||||||
|
max_bytes = (size_t)4u * 1024u * 1024u;
|
||||||
|
}
|
||||||
|
if (requested_bytes > max_bytes) {
|
||||||
|
return max_bytes;
|
||||||
|
}
|
||||||
|
return requested_bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int choose_stream_count(int mp_count, int planned_profiles, size_t total_budget, int have_streams) {
|
||||||
|
int stream_count = 1;
|
||||||
|
if (!have_streams || mp_count <= 0 || planned_profiles <= 0) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
stream_count = mp_count / 8;
|
||||||
|
if (stream_count < 2) {
|
||||||
|
stream_count = 2;
|
||||||
|
}
|
||||||
|
if (stream_count > MAX_STRESS_STREAMS) {
|
||||||
|
stream_count = MAX_STRESS_STREAMS;
|
||||||
|
}
|
||||||
|
|
||||||
|
while (stream_count > 1) {
|
||||||
|
size_t per_stream_budget = total_budget / ((size_t)planned_profiles * (size_t)stream_count);
|
||||||
|
if (per_stream_budget >= MIN_STREAM_BUDGET_BYTES) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
stream_count--;
|
||||||
|
}
|
||||||
|
return stream_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
|
||||||
|
if (!api->cuStreamDestroy) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < count; i++) {
|
||||||
|
if (streams[i]) {
|
||||||
|
api->cuStreamDestroy(streams[i]);
|
||||||
|
streams[i] = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#if HAVE_CUBLASLT_HEADERS
|
#if HAVE_CUBLASLT_HEADERS
|
||||||
static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
|
static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
|
||||||
size_t len = strlen(buf);
|
size_t len = strlen(buf);
|
||||||
@@ -242,12 +331,19 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
int size_mb,
|
int size_mb,
|
||||||
struct stress_report *report) {
|
struct stress_report *report) {
|
||||||
CUcontext ctx = NULL;
|
CUcontext ctx = NULL;
|
||||||
CUdeviceptr device_mem = 0;
|
|
||||||
CUmodule module = NULL;
|
CUmodule module = NULL;
|
||||||
CUfunction kernel = NULL;
|
CUfunction kernel = NULL;
|
||||||
uint32_t sample[256];
|
uint32_t sample[256];
|
||||||
uint32_t words = 0;
|
CUdeviceptr device_mem[MAX_STRESS_STREAMS] = {0};
|
||||||
|
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||||
|
uint32_t words[MAX_STRESS_STREAMS] = {0};
|
||||||
|
uint32_t rounds[MAX_STRESS_STREAMS] = {0};
|
||||||
|
void *params[MAX_STRESS_STREAMS][3];
|
||||||
|
size_t bytes_per_stream[MAX_STRESS_STREAMS] = {0};
|
||||||
unsigned long iterations = 0;
|
unsigned long iterations = 0;
|
||||||
|
int mp_count = 0;
|
||||||
|
int stream_count = 1;
|
||||||
|
int launches_per_wave = 0;
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||||
@@ -260,64 +356,102 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t bytes = (size_t)size_mb * 1024u * 1024u;
|
size_t requested_bytes = (size_t)size_mb * 1024u * 1024u;
|
||||||
if (bytes < 4u * 1024u * 1024u) {
|
if (requested_bytes < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
bytes = 4u * 1024u * 1024u;
|
requested_bytes = MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
if (bytes > (size_t)1024u * 1024u * 1024u) {
|
size_t total_bytes = clamp_budget_to_free_memory(api, requested_bytes);
|
||||||
bytes = (size_t)1024u * 1024u * 1024u;
|
if (total_bytes < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
|
total_bytes = MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
words = (uint32_t)(bytes / sizeof(uint32_t));
|
report->buffer_mb = (int)(total_bytes / (1024u * 1024u));
|
||||||
|
|
||||||
if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem, bytes))) {
|
if (query_multiprocessor_count(api, dev, &mp_count) &&
|
||||||
api->cuCtxDestroy(ctx);
|
api->cuStreamCreate &&
|
||||||
return 0;
|
api->cuStreamDestroy) {
|
||||||
|
stream_count = choose_stream_count(mp_count, 1, total_bytes, 1);
|
||||||
}
|
}
|
||||||
if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem, 0, bytes))) {
|
if (stream_count > 1) {
|
||||||
api->cuMemFree(device_mem);
|
int created = 0;
|
||||||
api->cuCtxDestroy(ctx);
|
for (; created < stream_count; created++) {
|
||||||
return 0;
|
if (!check_rc(api, "cuStreamCreate", api->cuStreamCreate(&streams[created], 0))) {
|
||||||
|
destroy_streams(api, streams, created);
|
||||||
|
stream_count = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
report->stream_count = stream_count;
|
||||||
|
|
||||||
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
|
size_t slice = total_bytes / (size_t)stream_count;
|
||||||
|
if (lane == stream_count - 1) {
|
||||||
|
slice = total_bytes - ((size_t)lane * (total_bytes / (size_t)stream_count));
|
||||||
|
}
|
||||||
|
slice = round_down_size(slice, sizeof(uint32_t));
|
||||||
|
if (slice < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
|
slice = MIN_PROFILE_BUDGET_BYTES;
|
||||||
|
}
|
||||||
|
bytes_per_stream[lane] = slice;
|
||||||
|
words[lane] = (uint32_t)(slice / sizeof(uint32_t));
|
||||||
|
|
||||||
|
if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem[lane], slice))) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem[lane], 0, slice))) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
rounds[lane] = 2048;
|
||||||
|
params[lane][0] = &device_mem[lane];
|
||||||
|
params[lane][1] = &words[lane];
|
||||||
|
params[lane][2] = &rounds[lane];
|
||||||
|
}
|
||||||
|
|
||||||
if (!check_rc(api,
|
if (!check_rc(api,
|
||||||
"cuModuleLoadDataEx",
|
"cuModuleLoadDataEx",
|
||||||
api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
|
api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
|
||||||
api->cuMemFree(device_mem);
|
goto fail;
|
||||||
api->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
if (!check_rc(api, "cuModuleGetFunction", api->cuModuleGetFunction(&kernel, module, "burn"))) {
|
if (!check_rc(api, "cuModuleGetFunction", api->cuModuleGetFunction(&kernel, module, "burn"))) {
|
||||||
api->cuMemFree(device_mem);
|
goto fail;
|
||||||
api->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int threads = 256;
|
unsigned int threads = 256;
|
||||||
unsigned int blocks = (unsigned int)((words + threads - 1) / threads);
|
|
||||||
uint32_t rounds = 1024;
|
|
||||||
void *params[] = {&device_mem, &words, &rounds};
|
|
||||||
|
|
||||||
double start = now_seconds();
|
double start = now_seconds();
|
||||||
double deadline = start + (double)seconds;
|
double deadline = start + (double)seconds;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
if (!check_rc(api,
|
launches_per_wave = 0;
|
||||||
"cuLaunchKernel",
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
api->cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) {
|
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||||
api->cuMemFree(device_mem);
|
if (!check_rc(api,
|
||||||
api->cuCtxDestroy(ctx);
|
"cuLaunchKernel",
|
||||||
return 0;
|
api->cuLaunchKernel(kernel,
|
||||||
|
blocks,
|
||||||
|
1,
|
||||||
|
1,
|
||||||
|
threads,
|
||||||
|
1,
|
||||||
|
1,
|
||||||
|
0,
|
||||||
|
streams[lane],
|
||||||
|
params[lane],
|
||||||
|
NULL))) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
launches_per_wave++;
|
||||||
}
|
}
|
||||||
iterations++;
|
if (launches_per_wave <= 0) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||||
|
goto fail;
|
||||||
|
}
|
||||||
|
iterations += (unsigned long)launches_per_wave;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||||
api->cuMemFree(device_mem);
|
goto fail;
|
||||||
api->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem, sizeof(sample)))) {
|
|
||||||
api->cuMemFree(device_mem);
|
|
||||||
api->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < sizeof(sample) / sizeof(sample[0]); i++) {
|
for (size_t i = 0; i < sizeof(sample) / sizeof(sample[0]); i++) {
|
||||||
@@ -326,12 +460,33 @@ static int run_ptx_fallback(struct cuda_api *api,
|
|||||||
report->iterations = iterations;
|
report->iterations = iterations;
|
||||||
snprintf(report->details,
|
snprintf(report->details,
|
||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"profile_int32_fallback=OK iterations=%lu\n",
|
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d per_stream_mb=%zu iterations=%lu\n",
|
||||||
|
size_mb,
|
||||||
|
report->buffer_mb,
|
||||||
|
report->stream_count,
|
||||||
|
bytes_per_stream[0] / (1024u * 1024u),
|
||||||
iterations);
|
iterations);
|
||||||
|
|
||||||
api->cuMemFree(device_mem);
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
|
if (device_mem[lane]) {
|
||||||
|
api->cuMemFree(device_mem[lane]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
destroy_streams(api, streams, stream_count);
|
||||||
api->cuCtxDestroy(ctx);
|
api->cuCtxDestroy(ctx);
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
|
fail:
|
||||||
|
for (int lane = 0; lane < MAX_STRESS_STREAMS; lane++) {
|
||||||
|
if (device_mem[lane]) {
|
||||||
|
api->cuMemFree(device_mem[lane]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
destroy_streams(api, streams, MAX_STRESS_STREAMS);
|
||||||
|
if (ctx) {
|
||||||
|
api->cuCtxDestroy(ctx);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#if HAVE_CUBLASLT_HEADERS
|
#if HAVE_CUBLASLT_HEADERS
|
||||||
@@ -418,6 +573,7 @@ struct profile_desc {
|
|||||||
|
|
||||||
struct prepared_profile {
|
struct prepared_profile {
|
||||||
struct profile_desc desc;
|
struct profile_desc desc;
|
||||||
|
CUstream stream;
|
||||||
cublasLtMatmulDesc_t op_desc;
|
cublasLtMatmulDesc_t op_desc;
|
||||||
cublasLtMatrixLayout_t a_layout;
|
cublasLtMatrixLayout_t a_layout;
|
||||||
cublasLtMatrixLayout_t b_layout;
|
cublasLtMatrixLayout_t b_layout;
|
||||||
@@ -617,8 +773,8 @@ static uint64_t choose_square_dim(size_t budget_bytes, size_t bytes_per_cell, in
|
|||||||
if (dim < (uint64_t)multiple) {
|
if (dim < (uint64_t)multiple) {
|
||||||
dim = (uint64_t)multiple;
|
dim = (uint64_t)multiple;
|
||||||
}
|
}
|
||||||
if (dim > 8192u) {
|
if (dim > 65536u) {
|
||||||
dim = 8192u;
|
dim = 65536u;
|
||||||
}
|
}
|
||||||
return dim;
|
return dim;
|
||||||
}
|
}
|
||||||
@@ -704,10 +860,12 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
|||||||
cublasLtHandle_t handle,
|
cublasLtHandle_t handle,
|
||||||
struct cuda_api *cuda,
|
struct cuda_api *cuda,
|
||||||
const struct profile_desc *desc,
|
const struct profile_desc *desc,
|
||||||
|
CUstream stream,
|
||||||
size_t profile_budget_bytes,
|
size_t profile_budget_bytes,
|
||||||
struct prepared_profile *out) {
|
struct prepared_profile *out) {
|
||||||
memset(out, 0, sizeof(*out));
|
memset(out, 0, sizeof(*out));
|
||||||
out->desc = *desc;
|
out->desc = *desc;
|
||||||
|
out->stream = stream;
|
||||||
|
|
||||||
size_t bytes_per_cell = 0;
|
size_t bytes_per_cell = 0;
|
||||||
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
||||||
@@ -935,7 +1093,7 @@ static int run_cublas_profile(cublasLtHandle_t handle,
|
|||||||
&profile->heuristic.algo,
|
&profile->heuristic.algo,
|
||||||
(void *)(uintptr_t)profile->workspace_dev,
|
(void *)(uintptr_t)profile->workspace_dev,
|
||||||
profile->workspace_size,
|
profile->workspace_size,
|
||||||
(cudaStream_t)0));
|
profile->stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
static int run_cublaslt_stress(struct cuda_api *cuda,
|
static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||||
@@ -947,13 +1105,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int size_mb,
|
int size_mb,
|
||||||
struct stress_report *report) {
|
struct stress_report *report) {
|
||||||
struct cublaslt_api cublas;
|
struct cublaslt_api cublas;
|
||||||
struct prepared_profile prepared[sizeof(k_profiles) / sizeof(k_profiles[0])];
|
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
|
||||||
cublasLtHandle_t handle = NULL;
|
cublasLtHandle_t handle = NULL;
|
||||||
CUcontext ctx = NULL;
|
CUcontext ctx = NULL;
|
||||||
|
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||||
uint16_t sample[256];
|
uint16_t sample[256];
|
||||||
int cc = cc_major * 10 + cc_minor;
|
int cc = cc_major * 10 + cc_minor;
|
||||||
int planned = 0;
|
int planned = 0;
|
||||||
int active = 0;
|
int active = 0;
|
||||||
|
int mp_count = 0;
|
||||||
|
int stream_count = 1;
|
||||||
|
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
||||||
|
int prepared_count = 0;
|
||||||
|
int wave_launches = 0;
|
||||||
|
size_t requested_budget = 0;
|
||||||
|
size_t total_budget = 0;
|
||||||
|
size_t per_profile_budget = 0;
|
||||||
|
|
||||||
memset(report, 0, sizeof(*report));
|
memset(report, 0, sizeof(*report));
|
||||||
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
||||||
@@ -986,16 +1153,45 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t total_budget = (size_t)size_mb * 1024u * 1024u;
|
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||||
if (total_budget < (size_t)planned * 4u * 1024u * 1024u) {
|
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
total_budget = (size_t)planned * 4u * 1024u * 1024u;
|
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
size_t per_profile_budget = total_budget / (size_t)planned;
|
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||||
if (per_profile_budget < 4u * 1024u * 1024u) {
|
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||||
per_profile_budget = 4u * 1024u * 1024u;
|
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||||
}
|
}
|
||||||
|
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||||
|
cuda->cuStreamCreate &&
|
||||||
|
cuda->cuStreamDestroy) {
|
||||||
|
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
|
||||||
|
}
|
||||||
|
if (stream_count > 1) {
|
||||||
|
int created = 0;
|
||||||
|
for (; created < stream_count; created++) {
|
||||||
|
if (!check_rc(cuda, "cuStreamCreate", cuda->cuStreamCreate(&streams[created], 0))) {
|
||||||
|
destroy_streams(cuda, streams, created);
|
||||||
|
stream_count = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
report->stream_count = stream_count;
|
||||||
|
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
|
||||||
|
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||||
|
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||||
|
}
|
||||||
|
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||||
|
append_detail(report->details,
|
||||||
|
sizeof(report->details),
|
||||||
|
"requested_mb=%d actual_mb=%d streams=%d mp_count=%d per_worker_mb=%zu\n",
|
||||||
|
size_mb,
|
||||||
|
report->buffer_mb,
|
||||||
|
report->stream_count,
|
||||||
|
mp_count,
|
||||||
|
per_profile_budget / (1024u * 1024u));
|
||||||
|
|
||||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
for (int i = 0; i < profile_count; i++) {
|
||||||
const struct profile_desc *desc = &k_profiles[i];
|
const struct profile_desc *desc = &k_profiles[i];
|
||||||
if (!(desc->enabled && cc >= desc->min_cc)) {
|
if (!(desc->enabled && cc >= desc->min_cc)) {
|
||||||
append_detail(report->details,
|
append_detail(report->details,
|
||||||
@@ -1005,30 +1201,45 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
desc->min_cc);
|
desc->min_cc);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (prepare_profile(&cublas, handle, cuda, desc, per_profile_budget, &prepared[i])) {
|
for (int lane = 0; lane < stream_count; lane++) {
|
||||||
active++;
|
CUstream stream = streams[lane];
|
||||||
append_detail(report->details,
|
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
||||||
sizeof(report->details),
|
break;
|
||||||
"%s=READY dim=%llux%llux%llu block=%s\n",
|
}
|
||||||
desc->name,
|
if (prepare_profile(&cublas, handle, cuda, desc, stream, per_profile_budget, &prepared[prepared_count])) {
|
||||||
(unsigned long long)prepared[i].m,
|
active++;
|
||||||
(unsigned long long)prepared[i].n,
|
append_detail(report->details,
|
||||||
(unsigned long long)prepared[i].k,
|
sizeof(report->details),
|
||||||
desc->block_label);
|
"%s[%d]=READY dim=%llux%llux%llu block=%s stream=%d\n",
|
||||||
} else {
|
desc->name,
|
||||||
append_detail(report->details, sizeof(report->details), "%s=SKIPPED unsupported\n", desc->name);
|
lane,
|
||||||
|
(unsigned long long)prepared[prepared_count].m,
|
||||||
|
(unsigned long long)prepared[prepared_count].n,
|
||||||
|
(unsigned long long)prepared[prepared_count].k,
|
||||||
|
desc->block_label,
|
||||||
|
lane);
|
||||||
|
prepared_count++;
|
||||||
|
} else {
|
||||||
|
append_detail(report->details,
|
||||||
|
sizeof(report->details),
|
||||||
|
"%s[%d]=SKIPPED unsupported\n",
|
||||||
|
desc->name,
|
||||||
|
lane);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (active <= 0) {
|
if (active <= 0) {
|
||||||
cublas.cublasLtDestroy(handle);
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
cuda->cuCtxDestroy(ctx);
|
cuda->cuCtxDestroy(ctx);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
double deadline = now_seconds() + (double)seconds;
|
double deadline = now_seconds() + (double)seconds;
|
||||||
while (now_seconds() < deadline) {
|
while (now_seconds() < deadline) {
|
||||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
wave_launches = 0;
|
||||||
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (!prepared[i].ready) {
|
if (!prepared[i].ready) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1037,31 +1248,33 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
sizeof(report->details),
|
sizeof(report->details),
|
||||||
"%s=FAILED runtime\n",
|
"%s=FAILED runtime\n",
|
||||||
prepared[i].desc.name);
|
prepared[i].desc.name);
|
||||||
for (size_t j = 0; j < sizeof(prepared) / sizeof(prepared[0]); j++) {
|
for (int j = 0; j < prepared_count; j++) {
|
||||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||||
}
|
}
|
||||||
cublas.cublasLtDestroy(handle);
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
cuda->cuCtxDestroy(ctx);
|
cuda->cuCtxDestroy(ctx);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
prepared[i].iterations++;
|
prepared[i].iterations++;
|
||||||
report->iterations++;
|
report->iterations++;
|
||||||
if (now_seconds() >= deadline) {
|
wave_launches++;
|
||||||
break;
|
}
|
||||||
|
if (wave_launches <= 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||||
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
|
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||||
}
|
}
|
||||||
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
|
cuda->cuCtxDestroy(ctx);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
|
||||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
|
||||||
}
|
|
||||||
cublas.cublasLtDestroy(handle);
|
|
||||||
cuda->cuCtxDestroy(ctx);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
|
||||||
if (!prepared[i].ready) {
|
if (!prepared[i].ready) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -1072,7 +1285,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
prepared[i].iterations);
|
prepared[i].iterations);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
if (prepared[i].ready) {
|
if (prepared[i].ready) {
|
||||||
if (check_rc(cuda, "cuMemcpyDtoH", cuda->cuMemcpyDtoH(sample, prepared[i].d_dev, sizeof(sample)))) {
|
if (check_rc(cuda, "cuMemcpyDtoH", cuda->cuMemcpyDtoH(sample, prepared[i].d_dev, sizeof(sample)))) {
|
||||||
for (size_t j = 0; j < sizeof(sample) / sizeof(sample[0]); j++) {
|
for (size_t j = 0; j < sizeof(sample) / sizeof(sample[0]); j++) {
|
||||||
@@ -1083,10 +1296,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
for (int i = 0; i < prepared_count; i++) {
|
||||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||||
}
|
}
|
||||||
cublas.cublasLtDestroy(handle);
|
cublas.cublasLtDestroy(handle);
|
||||||
|
destroy_streams(cuda, streams, stream_count);
|
||||||
cuda->cuCtxDestroy(ctx);
|
cuda->cuCtxDestroy(ctx);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@@ -1095,13 +1309,16 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
|||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
int seconds = 5;
|
int seconds = 5;
|
||||||
int size_mb = 64;
|
int size_mb = 64;
|
||||||
|
int device_index = 0;
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||||
seconds = atoi(argv[++i]);
|
seconds = atoi(argv[++i]);
|
||||||
} else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
|
} else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
|
||||||
size_mb = atoi(argv[++i]);
|
size_mb = atoi(argv[++i]);
|
||||||
|
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
||||||
|
device_index = atoi(argv[++i]);
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
|
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
|
||||||
return 2;
|
return 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1111,6 +1328,9 @@ int main(int argc, char **argv) {
|
|||||||
if (size_mb <= 0) {
|
if (size_mb <= 0) {
|
||||||
size_mb = 64;
|
size_mb = 64;
|
||||||
}
|
}
|
||||||
|
if (device_index < 0) {
|
||||||
|
device_index = 0;
|
||||||
|
}
|
||||||
|
|
||||||
struct cuda_api cuda;
|
struct cuda_api cuda;
|
||||||
if (!load_cuda(&cuda)) {
|
if (!load_cuda(&cuda)) {
|
||||||
@@ -1133,8 +1353,13 @@ int main(int argc, char **argv) {
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (device_index >= count) {
|
||||||
|
fprintf(stderr, "device index %d out of range (found %d CUDA device(s))\n", device_index, count);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
CUdevice dev = 0;
|
CUdevice dev = 0;
|
||||||
if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, 0))) {
|
if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, device_index))) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1162,10 +1387,12 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
printf("device=%s\n", report.device);
|
printf("device=%s\n", report.device);
|
||||||
|
printf("device_index=%d\n", device_index);
|
||||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
||||||
printf("backend=%s\n", report.backend);
|
printf("backend=%s\n", report.backend);
|
||||||
printf("duration_s=%d\n", seconds);
|
printf("duration_s=%d\n", seconds);
|
||||||
printf("buffer_mb=%d\n", report.buffer_mb);
|
printf("buffer_mb=%d\n", report.buffer_mb);
|
||||||
|
printf("streams=%d\n", report.stream_count);
|
||||||
printf("iterations=%lu\n", report.iterations);
|
printf("iterations=%lu\n", report.iterations);
|
||||||
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
||||||
if (report.details[0] != '\0') {
|
if (report.details[0] != '\0') {
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
|
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-burn worker.
|
||||||
#
|
#
|
||||||
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
||||||
# verifies them against Packages.gz, and extracts the small subset we need:
|
# verifies them against Packages.gz, and extracts the small subset we need:
|
||||||
# - headers for compiling bee-gpu-stress against cuBLASLt
|
# - headers for compiling bee-gpu-burn worker against cuBLASLt
|
||||||
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|||||||
55
iso/builder/build-john.sh
Normal file
55
iso/builder/build-john.sh
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# build-john.sh — build John the Ripper jumbo with OpenCL support for the LiveCD.
|
||||||
|
#
|
||||||
|
# Downloads a pinned source snapshot from the official openwall/john repository,
|
||||||
|
# builds it inside the builder container, and caches the resulting run/ tree.
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
JOHN_COMMIT="$1"
|
||||||
|
DIST_DIR="$2"
|
||||||
|
|
||||||
|
[ -n "$JOHN_COMMIT" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }
|
||||||
|
[ -n "$DIST_DIR" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }
|
||||||
|
|
||||||
|
echo "=== John the Ripper jumbo ${JOHN_COMMIT} ==="
|
||||||
|
|
||||||
|
CACHE_DIR="${DIST_DIR}/john-${JOHN_COMMIT}"
|
||||||
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/john-downloads"
|
||||||
|
SRC_TAR="${DOWNLOAD_CACHE_DIR}/john-${JOHN_COMMIT}.tar.gz"
|
||||||
|
SRC_URL="https://github.com/openwall/john/archive/${JOHN_COMMIT}.tar.gz"
|
||||||
|
|
||||||
|
if [ -x "${CACHE_DIR}/run/john" ] && [ -f "${CACHE_DIR}/run/john.conf" ]; then
|
||||||
|
echo "=== john cached, skipping build ==="
|
||||||
|
echo "run dir: ${CACHE_DIR}/run"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
mkdir -p "${DOWNLOAD_CACHE_DIR}"
|
||||||
|
if [ ! -f "${SRC_TAR}" ]; then
|
||||||
|
echo "=== downloading john source snapshot ==="
|
||||||
|
wget --show-progress -O "${SRC_TAR}" "${SRC_URL}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
BUILD_TMP=$(mktemp -d)
|
||||||
|
trap 'rm -rf "${BUILD_TMP}"' EXIT INT TERM
|
||||||
|
|
||||||
|
cd "${BUILD_TMP}"
|
||||||
|
tar xf "${SRC_TAR}"
|
||||||
|
SRC_DIR=$(find . -maxdepth 1 -type d -name 'john-*' | head -1)
|
||||||
|
[ -n "${SRC_DIR}" ] || { echo "ERROR: john source directory not found"; exit 1; }
|
||||||
|
|
||||||
|
cd "${SRC_DIR}/src"
|
||||||
|
echo "=== configuring john ==="
|
||||||
|
./configure
|
||||||
|
echo "=== building john ==="
|
||||||
|
make clean >/dev/null 2>&1 || true
|
||||||
|
make -j"$(nproc)"
|
||||||
|
|
||||||
|
mkdir -p "${CACHE_DIR}"
|
||||||
|
cp -a "../run" "${CACHE_DIR}/run"
|
||||||
|
chmod +x "${CACHE_DIR}/run/john"
|
||||||
|
|
||||||
|
echo "=== john build complete ==="
|
||||||
|
echo "run dir: ${CACHE_DIR}/run"
|
||||||
@@ -9,6 +9,7 @@
|
|||||||
#
|
#
|
||||||
# Output layout:
|
# Output layout:
|
||||||
# $CACHE_DIR/bin/all_reduce_perf
|
# $CACHE_DIR/bin/all_reduce_perf
|
||||||
|
# $CACHE_DIR/lib/libcudart.so* copied from the nvcc toolchain used to build nccl-tests
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
@@ -30,7 +31,7 @@ CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
|||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"
|
||||||
|
|
||||||
if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
|
if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ] && [ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||||
echo "=== nccl-tests cached, skipping build ==="
|
echo "=== nccl-tests cached, skipping build ==="
|
||||||
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
exit 0
|
exit 0
|
||||||
@@ -52,6 +53,23 @@ echo "nvcc: $NVCC"
|
|||||||
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
|
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
|
||||||
echo "CUDA_HOME: $CUDA_HOME"
|
echo "CUDA_HOME: $CUDA_HOME"
|
||||||
|
|
||||||
|
find_cudart_dir() {
|
||||||
|
for dir in \
|
||||||
|
"${CUDA_HOME}/targets/x86_64-linux/lib" \
|
||||||
|
"${CUDA_HOME}/targets/x86_64-linux/lib/stubs" \
|
||||||
|
"${CUDA_HOME}/lib64" \
|
||||||
|
"${CUDA_HOME}/lib"; do
|
||||||
|
if [ -d "$dir" ] && find "$dir" -maxdepth 1 -name 'libcudart.so*' -type f | grep -q .; then
|
||||||
|
printf '%s\n' "$dir"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
CUDART_DIR="$(find_cudart_dir)" || { echo "ERROR: libcudart.so* not found under ${CUDA_HOME}"; exit 1; }
|
||||||
|
echo "cudart dir: $CUDART_DIR"
|
||||||
|
|
||||||
# Download libnccl-dev for nccl.h
|
# Download libnccl-dev for nccl.h
|
||||||
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian${DEBIAN_VERSION}/x86_64"
|
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian${DEBIAN_VERSION}/x86_64"
|
||||||
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
|
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
|
||||||
@@ -136,6 +154,11 @@ mkdir -p "${CACHE_DIR}/bin"
|
|||||||
cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
|
cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
chmod +x "${CACHE_DIR}/bin/all_reduce_perf"
|
chmod +x "${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
|
|
||||||
|
mkdir -p "${CACHE_DIR}/lib"
|
||||||
|
find "${CUDART_DIR}" -maxdepth 1 -name 'libcudart.so*' -type f -exec cp -a {} "${CACHE_DIR}/lib/" \;
|
||||||
|
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' -type f | wc -l)" -gt 0 ] || { echo "ERROR: libcudart runtime copy failed"; exit 1; }
|
||||||
|
|
||||||
echo "=== nccl-tests build complete ==="
|
echo "=== nccl-tests build complete ==="
|
||||||
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
|
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
|
||||||
|
ls -lh "${CACHE_DIR}/lib/"libcudart.so* 2>/dev/null || true
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
# Output layout:
|
# Output layout:
|
||||||
# $CACHE_DIR/modules/ — nvidia*.ko files
|
# $CACHE_DIR/modules/ — nvidia*.ko files
|
||||||
# $CACHE_DIR/bin/ — nvidia-smi, nvidia-debugdump
|
# $CACHE_DIR/bin/ — nvidia-smi, nvidia-debugdump
|
||||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)
|
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so*, OpenCL-related libs
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
@@ -133,7 +133,14 @@ fi
|
|||||||
# Copy ALL userspace library files.
|
# Copy ALL userspace library files.
|
||||||
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
||||||
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
||||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
for lib in \
|
||||||
|
libnvidia-ml \
|
||||||
|
libcuda \
|
||||||
|
libnvidia-ptxjitcompiler \
|
||||||
|
libnvidia-opencl \
|
||||||
|
libnvidia-compiler \
|
||||||
|
libnvidia-nvvm \
|
||||||
|
libnvidia-fatbinaryloader; do
|
||||||
count=0
|
count=0
|
||||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
||||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
||||||
@@ -150,7 +157,14 @@ ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
|||||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||||
|
|
||||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
||||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
for lib in \
|
||||||
|
libnvidia-ml \
|
||||||
|
libcuda \
|
||||||
|
libnvidia-ptxjitcompiler \
|
||||||
|
libnvidia-opencl \
|
||||||
|
libnvidia-compiler \
|
||||||
|
libnvidia-nvvm \
|
||||||
|
libnvidia-fatbinaryloader; do
|
||||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
||||||
[ -n "$versioned" ] || continue
|
[ -n "$versioned" ] || continue
|
||||||
base=$(basename "$versioned")
|
base=$(basename "$versioned")
|
||||||
|
|||||||
@@ -183,7 +183,7 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# --- NVIDIA-only build steps ---
|
# --- NVIDIA-only build steps ---
|
||||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||||
@@ -196,20 +196,20 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
GPU_STRESS_NEED_BUILD=1
|
GPU_STRESS_NEED_BUILD=1
|
||||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
|
||||||
GPU_STRESS_NEED_BUILD=0
|
GPU_STRESS_NEED_BUILD=0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||||
echo "=== building bee-gpu-stress ==="
|
echo "=== building bee-gpu-burn worker ==="
|
||||||
gcc -O2 -s -Wall -Wextra \
|
gcc -O2 -s -Wall -Wextra \
|
||||||
-I"${CUBLAS_CACHE}/include" \
|
-I"${CUBLAS_CACHE}/include" \
|
||||||
-o "$GPU_STRESS_BIN" \
|
-o "$GPU_BURN_WORKER_BIN" \
|
||||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||||
-ldl -lm
|
-ldl -lm
|
||||||
echo "binary: $GPU_STRESS_BIN"
|
echo "binary: $GPU_BURN_WORKER_BIN"
|
||||||
else
|
else
|
||||||
echo "=== bee-gpu-stress up to date, skipping build ==="
|
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -246,6 +246,10 @@ rm -f \
|
|||||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
||||||
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
|
||||||
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
||||||
|
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
||||||
|
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
|
||||||
@@ -293,9 +297,14 @@ mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
|||||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
|
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee" "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
cp "${GPU_BURN_WORKER_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" 2>/dev/null || true
|
||||||
|
ln -sfn bee-gpu-burn "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||||
@@ -334,6 +343,8 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||||
|
mkdir -p "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors"
|
||||||
|
printf 'libnvidia-opencl.so.1\n' > "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors/nvidia.icd"
|
||||||
|
|
||||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||||
@@ -353,7 +364,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||||
|
|
||||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by the bee-gpu-burn worker tensor-core GEMM path
|
||||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||||
|
|
||||||
@@ -371,7 +382,18 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||||
echo "=== all_reduce_perf injected ==="
|
echo "=== all_reduce_perf injected ==="
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "=== building john jumbo ${JOHN_JUMBO_COMMIT} ==="
|
||||||
|
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
||||||
|
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
|
||||||
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||||
|
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
|
||||||
|
ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/john"
|
||||||
|
echo "=== john injected ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- embed build metadata ---
|
# --- embed build metadata ---
|
||||||
@@ -385,7 +407,8 @@ NCCL_VERSION=${NCCL_VERSION}
|
|||||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"
|
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||||
|
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||||
|
|||||||
@@ -60,6 +60,9 @@ chmod +x /usr/local/bin/bee 2>/dev/null || true
|
|||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-john-gpu-stress 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-nccl-gpu-stress 2>/dev/null || true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Reload udev rules
|
# Reload udev rules
|
||||||
|
|||||||
@@ -4,16 +4,23 @@
|
|||||||
# not inside the squashfs).
|
# not inside the squashfs).
|
||||||
#
|
#
|
||||||
# Primary: copy from chroot/boot/ (populated by package postinst).
|
# Primary: copy from chroot/boot/ (populated by package postinst).
|
||||||
# Fallback: extract directly from the cached .deb if postinst didn't place
|
# Naming fallbacks:
|
||||||
# the files (happens in chroot environments without grub triggers).
|
# Debian Bookworm: /boot/memtest86+ — EFI PE64 (no extension)
|
||||||
|
# /boot/memtest86+.bin — legacy binary
|
||||||
|
# Upstream/Ubuntu: /boot/memtest86+x64.efi, /boot/memtest86+x64.bin, etc.
|
||||||
|
# Last resort: extract directly from the cached .deb if postinst didn't place
|
||||||
|
# the files (happens in chroot environments without grub triggers).
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi"
|
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi"
|
||||||
|
|
||||||
|
# Ensure destination directory exists (absence caused silent copy failures).
|
||||||
|
mkdir -p binary/boot
|
||||||
|
|
||||||
echo "memtest: scanning chroot/boot/ for memtest files:"
|
echo "memtest: scanning chroot/boot/ for memtest files:"
|
||||||
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files in chroot/boot/"
|
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files in chroot/boot/"
|
||||||
|
|
||||||
# Primary path: copy from chroot/boot/
|
# Primary path: copy upstream-named files from chroot/boot/
|
||||||
for f in ${MEMTEST_FILES}; do
|
for f in ${MEMTEST_FILES}; do
|
||||||
src="chroot/boot/${f}"
|
src="chroot/boot/${f}"
|
||||||
if [ -f "${src}" ]; then
|
if [ -f "${src}" ]; then
|
||||||
@@ -22,14 +29,23 @@ for f in ${MEMTEST_FILES}; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# Fallback: if EFI binary still missing, extract from cached .deb
|
# Debian Bookworm naming fallback: /boot/memtest86+ (no extension) is the EFI binary.
|
||||||
|
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "chroot/boot/memtest86+" ]; then
|
||||||
|
cp "chroot/boot/memtest86+" "binary/boot/memtest86+x64.efi"
|
||||||
|
echo "memtest: copied /boot/memtest86+ as memtest86+x64.efi (Debian naming)"
|
||||||
|
fi
|
||||||
|
if [ ! -f "binary/boot/memtest86+x64.bin" ] && [ -f "chroot/boot/memtest86+.bin" ]; then
|
||||||
|
cp "chroot/boot/memtest86+.bin" "binary/boot/memtest86+x64.bin"
|
||||||
|
echo "memtest: copied /boot/memtest86+.bin as memtest86+x64.bin (Debian naming)"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Last resort: if EFI binary still missing, extract from cached .deb
|
||||||
if [ ! -f "binary/boot/memtest86+x64.efi" ]; then
|
if [ ! -f "binary/boot/memtest86+x64.efi" ]; then
|
||||||
echo "memtest: EFI binary missing — attempting extraction from .deb cache"
|
echo "memtest: EFI binary missing — attempting extraction from .deb cache"
|
||||||
deb=$(find chroot/var/cache/apt/archives/ chroot/var/lib/apt/lists/ \
|
deb=$(find chroot/var/cache/apt/archives/ chroot/var/lib/apt/lists/ \
|
||||||
-name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null \
|
-name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null \
|
||||||
| head -1)
|
| head -1)
|
||||||
if [ -z "$deb" ]; then
|
if [ -z "$deb" ]; then
|
||||||
# Also check lb package cache
|
|
||||||
deb=$(find cache/ -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null | head -1)
|
deb=$(find cache/ -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null | head -1)
|
||||||
fi
|
fi
|
||||||
if [ -n "$deb" ]; then
|
if [ -n "$deb" ]; then
|
||||||
@@ -45,6 +61,11 @@ if [ ! -f "binary/boot/memtest86+x64.efi" ]; then
|
|||||||
echo "memtest: extracted ${f} from .deb"
|
echo "memtest: extracted ${f} from .deb"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
# Debian naming fallback inside .deb as well
|
||||||
|
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "${EXTRACT_DIR}/boot/memtest86+" ]; then
|
||||||
|
cp "${EXTRACT_DIR}/boot/memtest86+" "binary/boot/memtest86+x64.efi"
|
||||||
|
echo "memtest: extracted /boot/memtest86+ as memtest86+x64.efi from .deb"
|
||||||
|
fi
|
||||||
rm -rf "${EXTRACT_DIR}"
|
rm -rf "${EXTRACT_DIR}"
|
||||||
else
|
else
|
||||||
echo "memtest: WARNING: no memtest86+ .deb found in cache — memtest will not be available"
|
echo "memtest: WARNING: no memtest86+ .deb found in cache — memtest will not be available"
|
||||||
|
|||||||
@@ -1,2 +1,8 @@
|
|||||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
|
||||||
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
|
||||||
|
# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
|
||||||
|
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||||
|
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||||
|
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||||
|
ocl-icd-libopencl1
|
||||||
|
clinfo
|
||||||
|
|||||||
@@ -21,8 +21,14 @@ openssh-server
|
|||||||
# Disk installer
|
# Disk installer
|
||||||
squashfs-tools
|
squashfs-tools
|
||||||
parted
|
parted
|
||||||
|
# grub-pc / grub-efi-amd64 provide grub-install + grub2-common (required for chroot install).
|
||||||
|
# The -bin variants only carry binary modules and do NOT include grub-install itself.
|
||||||
|
grub-pc
|
||||||
grub-pc-bin
|
grub-pc-bin
|
||||||
|
grub-efi-amd64
|
||||||
grub-efi-amd64-bin
|
grub-efi-amd64-bin
|
||||||
|
grub-efi-amd64-signed
|
||||||
|
shim-signed
|
||||||
|
|
||||||
# Filesystem support for USB export targets
|
# Filesystem support for USB export targets
|
||||||
exfatprogs
|
exfatprogs
|
||||||
@@ -39,6 +45,7 @@ vim-tiny
|
|||||||
mc
|
mc
|
||||||
htop
|
htop
|
||||||
nvtop
|
nvtop
|
||||||
|
btop
|
||||||
sudo
|
sudo
|
||||||
zstd
|
zstd
|
||||||
mstflint
|
mstflint
|
||||||
|
|||||||
@@ -1,14 +1,25 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: run hardware audit
|
Description=Bee: schedule startup hardware audit via task queue
|
||||||
After=bee-network.service bee-nvidia.service bee-preflight.service
|
# Start AFTER bee-web, not before — bee-web must not wait for audit.
|
||||||
Before=bee-web.service
|
After=bee-web.service
|
||||||
|
Wants=bee-web.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0'
|
RemainAfterExit=yes
|
||||||
|
# Wait up to 90s for bee-web to respond on /healthz, then sleep 60s for
|
||||||
|
# the system to settle (GPU drivers, sensors), then enqueue the audit as
|
||||||
|
# a background task so it appears in the task list and logs.
|
||||||
|
ExecStart=/bin/sh -c '\
|
||||||
|
i=0; \
|
||||||
|
while [ $i -lt 90 ]; do \
|
||||||
|
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi; \
|
||||||
|
sleep 1; i=$((i+1)); \
|
||||||
|
done; \
|
||||||
|
sleep 60; \
|
||||||
|
curl -sf -X POST http://localhost/api/audit/run >/dev/null'
|
||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
RemainAfterExit=yes
|
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|||||||
@@ -1,7 +1,5 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit web viewer
|
Description=Bee: hardware audit web viewer
|
||||||
After=bee-network.service
|
|
||||||
Wants=bee-audit.service
|
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
@@ -11,6 +9,9 @@ RestartSec=2
|
|||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
LimitMEMLOCK=infinity
|
LimitMEMLOCK=infinity
|
||||||
|
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
|
||||||
|
# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
|
||||||
|
Nice=0
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
|
|||||||
@@ -4,3 +4,6 @@
|
|||||||
RestartSec=10
|
RestartSec=10
|
||||||
StartLimitIntervalSec=60
|
StartLimitIntervalSec=60
|
||||||
StartLimitBurst=3
|
StartLimitBurst=3
|
||||||
|
# Raise scheduling priority of the X server so the graphical console (KVM/IPMI)
|
||||||
|
# stays responsive during GPU/CPU stress tests running at nice+10.
|
||||||
|
Nice=-5
|
||||||
|
|||||||
93
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file
93
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
#!/bin/sh
# bee-gpu-burn — run the GPU burn worker on the selected NVIDIA GPUs in parallel.
#
# Usage: bee-gpu-burn [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]
#
#   --seconds, -t   duration passed to every worker (default: 5)
#   --size-mb, -m   buffer size passed to every worker (default: 64)
#   --devices       comma-separated GPU indices to use (default: all GPUs
#                   reported by nvidia-smi)
#   --exclude       comma-separated GPU indices to drop from the selection
#
# One worker is started per selected GPU. Each worker's output is captured in
# a temporary log and replayed with a "[gpu N] " prefix once it finishes.
# Exit status: 0 if every worker succeeded, 1 if any failed or no GPU could be
# selected, 2 on usage errors, 130/143 when interrupted by INT/TERM.
set -eu

# Deliberately not called SECONDS: bash and ksh treat SECONDS as a
# self-incrementing timer, which would skew the value if /bin/sh is one of them.
DURATION_SEC=5
SIZE_MB=64
DEVICES=""
EXCLUDE=""
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
TMP_DIR=""
# Space-separated PIDs of workers that were started but not yet reaped.
ACTIVE_PIDS=""

# Print the usage line to stderr and exit with status 2.
usage() {
    echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
}

# Turn a comma-separated list into a sorted, de-duplicated comma-separated
# list with all whitespace and empty entries removed.
normalize_list() {
    printf '%s\n' "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
}

# Succeed when $1 is an element of the comma-separated list $2.
# The comparison is literal (no regex), so an entry such as "." cannot match
# other elements by accident.
contains_csv() {
    case ",${2:-}," in
        *",$1,"*) return 0 ;;
        *) return 1 ;;
    esac
}

# Kill every worker that is still running and remove the temporary log dir.
# Installed as the EXIT trap so it also runs after INT/TERM.
cleanup() {
    for active in ${ACTIVE_PIDS}; do
        kill "${active}" 2>/dev/null || true
    done
    if [ -n "${TMP_DIR}" ]; then
        rm -rf "${TMP_DIR}"
    fi
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
        --size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
done

[ -x "${WORKER}" ] || { echo "bee-gpu-burn worker not found: ${WORKER}" >&2; exit 1; }

# The pipeline's status is that of paste, so a missing or failing nvidia-smi
# simply yields an empty list, which is reported just below.
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
fi

# Apply the exclusion list to the selection.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

echo "loader=bee-gpu-burn"
echo "selected_gpus=${FINAL}"

TMP_DIR=$(mktemp -d)
# A signal trap alone would let the script carry on after the handler returns
# and would leave the background workers running; turn INT/TERM into an exit
# so the EXIT trap tears everything down.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Launch one worker per GPU. Only "pid:id" is recorded; the log path is
# derived from the id so a temp dir containing spaces cannot break the list.
LAUNCHED=""
for id in $(echo "${FINAL}" | tr ',' ' '); do
    echo "starting gpu ${id}"
    "${WORKER}" --device "${id}" --seconds "${DURATION_SEC}" --size-mb "${SIZE_MB}" >"${TMP_DIR}/gpu-${id}.log" 2>&1 &
    pid=$!
    ACTIVE_PIDS="${ACTIVE_PIDS}${ACTIVE_PIDS:+ }${pid}"
    LAUNCHED="${LAUNCHED} ${pid}:${id}"
done

# Reap workers in launch order and replay their logs.
status=0
for spec in ${LAUNCHED}; do
    pid=${spec%%:*}
    id=${spec#*:}
    if wait "${pid}"; then
        echo "gpu ${id} finished: OK"
    else
        rc=$?
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi
    # The worker has been reaped: drop it from the kill list so cleanup never
    # signals a PID that may have been reused by an unrelated process.
    ACTIVE_PIDS=${ACTIVE_PIDS#"${pid}"}
    ACTIVE_PIDS=${ACTIVE_PIDS# }
    sed "s/^/[gpu ${id}] /" "${TMP_DIR}/gpu-${id}.log" || true
done

exit "${status}"
|
||||||
@@ -158,20 +158,56 @@ mount --bind /sys "${MOUNT_ROOT}/sys"
|
|||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
log "--- Step 7/7: Installing GRUB bootloader ---"
|
log "--- Step 7/7: Installing GRUB bootloader ---"
|
||||||
|
|
||||||
|
# Helper: run a chroot command, log all output, return its exit code.
|
||||||
|
# Needed because "cmd | while" pipelines hide the exit code of cmd.
|
||||||
|
chroot_log() {
    # Run "$@" inside the chroot at $MOUNT_ROOT, send every line of its
    # combined stdout/stderr through log, and return the command's own exit
    # status (a plain "cmd | while" pipeline would return the loop's status).
    local rc=0
    local out
    out=$(chroot "$MOUNT_ROOT" "$@" 2>&1) || rc=$?
    # Skip the loop for silent commands so no blank entry is logged, and use
    # printf rather than echo so output starting with "-n" or containing
    # backslashes is logged verbatim regardless of the shell's echo flavour.
    if [ -n "$out" ]; then
        printf '%s\n' "$out" | while IFS= read -r line; do log " $line"; done
    fi
    return "$rc"
}
|
||||||
|
|
||||||
if [ "$UEFI" = "1" ]; then
|
if [ "$UEFI" = "1" ]; then
|
||||||
chroot "$MOUNT_ROOT" grub-install \
|
# Primary attempt: write EFI NVRAM entry (requires writable efivars)
|
||||||
--target=x86_64-efi \
|
if ! chroot_log grub-install \
|
||||||
--efi-directory=/boot/efi \
|
--target=x86_64-efi \
|
||||||
--bootloader-id=bee \
|
--efi-directory=/boot/efi \
|
||||||
--recheck 2>&1 | while read -r line; do log " $line"; done || true
|
--bootloader-id=bee \
|
||||||
|
--recheck; then
|
||||||
|
log " WARNING: grub-install (with NVRAM) failed — retrying with --no-nvram"
|
||||||
|
# --no-nvram: write grubx64.efi but skip EFI variable update.
|
||||||
|
# Needed on headless servers where efivars is read-only or unavailable.
|
||||||
|
chroot_log grub-install \
|
||||||
|
--target=x86_64-efi \
|
||||||
|
--efi-directory=/boot/efi \
|
||||||
|
--bootloader-id=bee \
|
||||||
|
--no-nvram \
|
||||||
|
--recheck || log " WARNING: grub-install --no-nvram also failed — check logs"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Always install the UEFI fallback path EFI/BOOT/BOOTX64.EFI.
|
||||||
|
# Many UEFI implementations (especially server BMCs and some firmware)
|
||||||
|
# ignore the NVRAM boot entry and only look for this path.
|
||||||
|
GRUB_EFI="${MOUNT_ROOT}/boot/efi/EFI/bee/grubx64.efi"
|
||||||
|
FALLBACK_DIR="${MOUNT_ROOT}/boot/efi/EFI/BOOT"
|
||||||
|
if [ -f "$GRUB_EFI" ]; then
|
||||||
|
mkdir -p "$FALLBACK_DIR"
|
||||||
|
cp "$GRUB_EFI" "${FALLBACK_DIR}/BOOTX64.EFI"
|
||||||
|
log " Fallback EFI binary installed: EFI/BOOT/BOOTX64.EFI"
|
||||||
|
else
|
||||||
|
log " WARNING: grubx64.efi not found at $GRUB_EFI — UEFI fallback path not set"
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
chroot "$MOUNT_ROOT" grub-install \
|
chroot_log grub-install \
|
||||||
--target=i386-pc \
|
--target=i386-pc \
|
||||||
--recheck \
|
--recheck \
|
||||||
"$DEVICE" 2>&1 | while read -r line; do log " $line"; done || true
|
"$DEVICE" || log " WARNING: grub-install (BIOS) failed — check logs"
|
||||||
fi
|
fi
|
||||||
chroot "$MOUNT_ROOT" update-grub 2>&1 | while read -r line; do log " $line"; done || true
|
|
||||||
log " GRUB installed."
|
chroot_log update-grub || log " WARNING: update-grub failed — check logs"
|
||||||
|
log " GRUB step complete."
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
# Cleanup
|
# Cleanup
|
||||||
|
|||||||
100
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file
100
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
#!/bin/sh
# bee-john-gpu-stress — load the selected NVIDIA GPUs with John the Ripper's
# OpenCL benchmark.
#
# Usage: bee-john-gpu-stress [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]
#
#   --seconds, -t   value passed to john --test (default: 300)
#   --devices       comma-separated GPU indices to use (default: all GPUs
#                   reported by nvidia-smi)
#   --exclude       comma-separated GPU indices to drop from the selection
#   --format        john format to benchmark; when omitted the first OpenCL
#                   format from a built-in candidate list that passes a
#                   one-second self-test is used
#
# Exit status: john's own status on success paths (the script execs john),
# 1 when john or a GPU/format cannot be found, 2 on usage errors.
set -eu

# Deliberately not called SECONDS: bash and ksh treat SECONDS as a
# self-incrementing timer, which would skew the value if /bin/sh is one of them.
DURATION_SEC=300
DEVICES=""
EXCLUDE=""
FORMAT=""
JOHN_DIR="/usr/local/lib/bee/john/run"
JOHN_BIN="${JOHN_DIR}/john"

# Print the usage line to stderr and exit with status 2.
usage() {
    echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
    exit 2
}

# Turn a comma-separated list into a sorted, de-duplicated comma-separated
# list with all whitespace and empty entries removed.
normalize_list() {
    printf '%s\n' "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
}

# Succeed when $1 is an element of the comma-separated list $2.
# The comparison is literal (no regex).
contains_csv() {
    case ",${2:-}," in
        *",$1,"*) return 0 ;;
        *) return 1 ;;
    esac
}

# Succeed when $1 is a plain non-negative decimal integer without leading
# zeros, i.e. something that is safe to feed into shell arithmetic.
is_uint() {
    case "${1:-}" in
        0) return 0 ;;
        ''|0*|*[!0-9]*) return 1 ;;
        *) return 0 ;;
    esac
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        --format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
        *) usage ;;
    esac
done

[ -x "${JOHN_BIN}" ] || { echo "john binary not found: ${JOHN_BIN}" >&2; exit 1; }

# The pipeline's status is that of paste, so a missing or failing nvidia-smi
# simply yields an empty list, which is reported just below.
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
fi

# Apply the exclusion list to the selection.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    # The index is used in $(( )) below; reject anything that is not a plain
    # number instead of letting the shell evaluate it as an expression.
    is_uint "${id}" || { echo "invalid GPU index: ${id}" >&2; exit 2; }
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

# Map nvidia-smi indices to john device numbers by adding one.
# NOTE(review): this assumes john's OpenCL devices are numbered from 1 in the
# same order as nvidia-smi lists the GPUs — confirm on multi-vendor hosts.
JOHN_DEVICES=""
for id in $(echo "${FINAL}" | tr ',' ' '); do
    opencl_id=$((id + 1))
    if [ -z "${JOHN_DEVICES}" ]; then
        JOHN_DEVICES="${opencl_id}"
    else
        JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
    fi
done

echo "loader=john"
echo "selected_gpus=${FINAL}"
echo "john_devices=${JOHN_DEVICES}"

# john is invoked as ./john from its run directory.
cd "${JOHN_DIR}"

# Print the format to benchmark: the user's --format if given, otherwise the
# first candidate that passes a one-second self-test on the selected devices.
# Returns 1 when no candidate works.
choose_format() {
    if [ -n "${FORMAT}" ]; then
        echo "${FORMAT}"
        return 0
    fi
    for candidate in sha512crypt-opencl pbkdf2-hmac-sha512-opencl 7z-opencl sha256crypt-opencl md5crypt-opencl; do
        if ./john --test=1 --format="${candidate}" --devices="${JOHN_DEVICES}" >/dev/null 2>&1; then
            echo "${candidate}"
            return 0
        fi
    done
    return 1
}

CHOSEN_FORMAT=$(choose_format) || {
    echo "no suitable john OpenCL format found" >&2
    # Dump the device list to help diagnose why nothing was usable.
    ./john --list=opencl-devices >&2 || true
    exit 1
}

echo "format=${CHOSEN_FORMAT}"
exec ./john --test="${DURATION_SEC}" --format="${CHOSEN_FORMAT}" --devices="${JOHN_DEVICES}"
|
||||||
91
iso/overlay/usr/local/bin/bee-nccl-gpu-stress
Normal file
91
iso/overlay/usr/local/bin/bee-nccl-gpu-stress
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
#!/bin/sh
# bee-nccl-gpu-stress — load the selected NVIDIA GPUs by running
# all_reduce_perf (nccl-tests) repeatedly until a deadline passes.
#
# Usage: bee-nccl-gpu-stress [--seconds N] [--devices 0,1] [--exclude 2,3]
#
#   --seconds, -t   minimum run time in whole seconds (default: 300); a round
#                   that has started is allowed to finish, so the total run
#                   time can exceed N
#   --devices       comma-separated GPU indices to use (default: all GPUs
#                   reported by nvidia-smi)
#   --exclude       comma-separated GPU indices to drop from the selection
#
# Exit status: 0 when every round succeeded, the status of all_reduce_perf if
# a round fails (set -e), 1 when the binary or GPUs are missing, 2 on usage
# errors.
set -eu

# Deliberately not called SECONDS: bash and ksh treat SECONDS as a
# self-incrementing timer, which would skew the deadline if /bin/sh is one of
# them.
DURATION_SEC=300
DEVICES=""
EXCLUDE=""
MIN_BYTES="512M"
MAX_BYTES="4G"
FACTOR="2"
ITERS="20"
ALL_REDUCE_BIN="/usr/local/bin/all_reduce_perf"

# Print the usage line to stderr and exit with status 2.
usage() {
    echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
}

# Turn a comma-separated list into a sorted, de-duplicated comma-separated
# list with all whitespace and empty entries removed.
normalize_list() {
    printf '%s\n' "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
}

# Succeed when $1 is an element of the comma-separated list $2.
# The comparison is literal (no regex).
contains_csv() {
    case ",${2:-}," in
        *",$1,"*) return 0 ;;
        *) return 1 ;;
    esac
}

# Succeed when $1 is a plain non-negative decimal integer without leading
# zeros, i.e. something that is safe to feed into shell arithmetic.
is_uint() {
    case "${1:-}" in
        0) return 0 ;;
        ''|0*|*[!0-9]*) return 1 ;;
        *) return 0 ;;
    esac
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
done

# The duration feeds the deadline arithmetic below; reject anything that is
# not a plain number instead of letting the shell evaluate it as an expression.
is_uint "${DURATION_SEC}" || usage

[ -x "${ALL_REDUCE_BIN}" ] || { echo "all_reduce_perf not found: ${ALL_REDUCE_BIN}" >&2; exit 1; }

# The pipeline's status is that of paste, so a missing or failing nvidia-smi
# simply yields an empty list, which is reported just below.
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
fi

# Apply the exclusion list to the selection.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | awk '{print $1}')
[ "${GPU_COUNT}" -gt 0 ] || { echo "selected GPU count is zero" >&2; exit 1; }

echo "loader=nccl"
echo "selected_gpus=${FINAL}"
echo "gpu_count=${GPU_COUNT}"
echo "range=${MIN_BYTES}..${MAX_BYTES}"
echo "iters=${ITERS}"

deadline=$(( $(date +%s) + DURATION_SEC ))
round=0

# Run full all_reduce_perf sweeps back to back; the deadline is only checked
# between rounds.
while :; do
    now=$(date +%s)
    if [ "${now}" -ge "${deadline}" ]; then
        break
    fi
    round=$((round + 1))
    remaining=$((deadline - now))
    echo "round=${round} remaining_sec=${remaining}"
    # CUDA_VISIBLE_DEVICES restricts the run to the selected GPUs, so -g takes
    # the number of selected GPUs rather than their indices.
    CUDA_VISIBLE_DEVICES="${FINAL}" \
        "${ALL_REDUCE_BIN}" \
        -b "${MIN_BYTES}" \
        -e "${MAX_BYTES}" \
        -f "${FACTOR}" \
        -g "${GPU_COUNT}" \
        --iters "${ITERS}"
done
|
||||||
@@ -114,4 +114,19 @@ fi
|
|||||||
ldconfig 2>/dev/null || true
|
ldconfig 2>/dev/null || true
|
||||||
log "ldconfig refreshed"
|
log "ldconfig refreshed"
|
||||||
|
|
||||||
|
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||||
|
# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
|
||||||
|
# "group is empty" even when GPUs and modules are present.
|
||||||
|
# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
|
||||||
|
if command -v nv-hostengine >/dev/null 2>&1; then
|
||||||
|
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||||
|
log "nv-hostengine already running — skipping"
|
||||||
|
else
|
||||||
|
nv-hostengine
|
||||||
|
log "nv-hostengine started"
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
log "WARN: nv-hostengine not found — dcgmi diagnostics will not work"
|
||||||
|
fi
|
||||||
|
|
||||||
log "done"
|
log "done"
|
||||||
|
|||||||
@@ -8,13 +8,16 @@ xset -dpms
|
|||||||
xset s noblank
|
xset s noblank
|
||||||
|
|
||||||
tint2 &
|
tint2 &
|
||||||
# Wait for bee-web to bind (Go starts fast, usually <2s)
|
|
||||||
|
# Wait up to 120s for bee-web to bind. The web server starts immediately now
|
||||||
|
# (audit is deferred), so this should succeed in a few seconds on most hardware.
|
||||||
i=0
|
i=0
|
||||||
while [ $i -lt 30 ]; do
|
while [ $i -lt 120 ]; do
|
||||||
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi
|
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi
|
||||||
sleep 1
|
sleep 1
|
||||||
i=$((i+1))
|
i=$((i+1))
|
||||||
done
|
done
|
||||||
|
|
||||||
chromium \
|
chromium \
|
||||||
--disable-infobars \
|
--disable-infobars \
|
||||||
--disable-translate \
|
--disable-translate \
|
||||||
|
|||||||
Reference in New Issue
Block a user