Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d2eadedff2 | ||
|
|
a98c4d7461 | ||
|
|
2354ae367d | ||
|
|
0d0e1f55a7 | ||
|
|
35f4c53887 | ||
|
|
981315e6fd |
@@ -243,7 +243,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
|||||||
return "", fmt.Errorf("write result.json: %w", err)
|
return "", fmt.Errorf("write result.json: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
report := renderBenchmarkReport(result)
|
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
|
||||||
if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
|
if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
|
||||||
return "", fmt.Errorf("write report.txt: %w", err)
|
return "", fmt.Errorf("write report.txt: %w", err)
|
||||||
}
|
}
|
||||||
@@ -679,7 +679,10 @@ func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gp
|
|||||||
"-g", strconv.Itoa(len(gpuIndices)),
|
"-g", strconv.Itoa(len(gpuIndices)),
|
||||||
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
|
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
|
||||||
}
|
}
|
||||||
env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
|
env := []string{
|
||||||
|
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||||
|
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||||
|
}
|
||||||
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
|
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
|
||||||
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
|
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
|
||||||
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
|
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
|
||||||
|
|||||||
@@ -2,11 +2,25 @@ package platform
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||||
|
return renderBenchmarkReportWithCharts(result, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
type benchmarkReportChart struct {
|
||||||
|
Title string
|
||||||
|
Content string
|
||||||
|
}
|
||||||
|
|
||||||
|
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
||||||
|
|
||||||
|
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
|
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
|
||||||
fmt.Fprintf(&b, "===========================\n\n")
|
fmt.Fprintf(&b, "===========================\n\n")
|
||||||
@@ -93,6 +107,20 @@ func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
|||||||
b.WriteString("\n")
|
b.WriteString("\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(charts) > 0 {
|
||||||
|
fmt.Fprintf(&b, "Terminal Charts\n")
|
||||||
|
fmt.Fprintf(&b, "---------------\n")
|
||||||
|
for _, chart := range charts {
|
||||||
|
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
||||||
|
if content == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "%s\n", chart.Title)
|
||||||
|
fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title)))
|
||||||
|
fmt.Fprintf(&b, "%s\n\n", content)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fmt.Fprintf(&b, "Methodology\n")
|
fmt.Fprintf(&b, "Methodology\n")
|
||||||
fmt.Fprintf(&b, "-----------\n")
|
fmt.Fprintf(&b, "-----------\n")
|
||||||
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
||||||
@@ -117,6 +145,36 @@ func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
|||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
||||||
|
phases := []struct {
|
||||||
|
name string
|
||||||
|
label string
|
||||||
|
}{
|
||||||
|
{name: "baseline", label: "Baseline"},
|
||||||
|
{name: "steady", label: "Steady State"},
|
||||||
|
{name: "cooldown", label: "Cooldown"},
|
||||||
|
}
|
||||||
|
var charts []benchmarkReportChart
|
||||||
|
for _, idx := range gpuIndices {
|
||||||
|
for _, phase := range phases {
|
||||||
|
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-%s-metrics-term.txt", idx, phase.name))
|
||||||
|
raw, err := os.ReadFile(path)
|
||||||
|
if err != nil || len(raw) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
charts = append(charts, benchmarkReportChart{
|
||||||
|
Title: fmt.Sprintf("GPU %d %s", idx, phase.label),
|
||||||
|
Content: string(raw),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return charts
|
||||||
|
}
|
||||||
|
|
||||||
|
func stripANSIEscapeSequences(raw string) string {
|
||||||
|
return ansiEscapePattern.ReplaceAllString(raw, "")
|
||||||
|
}
|
||||||
|
|
||||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||||
|
|||||||
@@ -145,3 +145,35 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
|
||||||
|
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||||
|
OverallStatus: "OK",
|
||||||
|
SelectedGPUIndices: []int{0},
|
||||||
|
Normalization: BenchmarkNormalization{
|
||||||
|
Status: "full",
|
||||||
|
},
|
||||||
|
}, []benchmarkReportChart{
|
||||||
|
{
|
||||||
|
Title: "GPU 0 Steady State",
|
||||||
|
Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, needle := range []string{
|
||||||
|
"Terminal Charts",
|
||||||
|
"GPU 0 Steady State",
|
||||||
|
"GPU 0 chart",
|
||||||
|
"42┤───",
|
||||||
|
} {
|
||||||
|
if !strings.Contains(report, needle) {
|
||||||
|
t.Fatalf("report missing %q\n%s", needle, report)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if strings.Contains(report, "\x1b[31m") {
|
||||||
|
t.Fatalf("report should not contain ANSI escapes\n%s", report)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -630,7 +630,10 @@ func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
|||||||
if len(gpuIndices) == 0 {
|
if len(gpuIndices) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
|
return []string{
|
||||||
|
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||||
|
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||||
@@ -671,6 +674,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
|||||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||||
return "", writeErr
|
return "", writeErr
|
||||||
}
|
}
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return "", ctx.Err()
|
||||||
|
}
|
||||||
status, rc := classifySATResult(job.name, out, err)
|
status, rc := classifySATResult(job.name, out, err)
|
||||||
stats.Add(status)
|
stats.Add(status)
|
||||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||||
|
|||||||
@@ -1,12 +1,14 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
"errors"
|
"errors"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestStorageSATCommands(t *testing.T) {
|
func TestStorageSATCommands(t *testing.T) {
|
||||||
@@ -253,11 +255,14 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
|||||||
|
|
||||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||||
if len(env) != 1 {
|
if len(env) != 2 {
|
||||||
t.Fatalf("env len=%d want 1 (%v)", len(env), env)
|
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
|
||||||
}
|
}
|
||||||
if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
|
||||||
t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
|
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
|
||||||
|
}
|
||||||
|
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||||
|
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -350,6 +355,38 @@ func TestClassifySATResult(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
t.Cleanup(cancel)
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
time.Sleep(100 * time.Millisecond)
|
||||||
|
cancel()
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
|
||||||
|
{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
|
||||||
|
}, nil)
|
||||||
|
<-done
|
||||||
|
|
||||||
|
if !errors.Is(err, context.Canceled) {
|
||||||
|
t.Fatalf("err=%v want context.Canceled", err)
|
||||||
|
}
|
||||||
|
if archive != "" {
|
||||||
|
t.Fatalf("archive=%q want empty", archive)
|
||||||
|
}
|
||||||
|
matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
|
||||||
|
if globErr != nil {
|
||||||
|
t.Fatalf("Glob error: %v", globErr)
|
||||||
|
}
|
||||||
|
if len(matches) != 0 {
|
||||||
|
t.Fatalf("archives=%v want none", matches)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import (
|
|||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
"syscall"
|
"syscall"
|
||||||
@@ -21,13 +22,238 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
||||||
|
var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
if a == nil {
|
||||||
|
return nil, fmt.Errorf("app not configured")
|
||||||
|
}
|
||||||
|
return a.ListNvidiaGPUs()
|
||||||
|
}
|
||||||
|
|
||||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
var jobCounter atomic.Uint64
|
var jobCounter atomic.Uint64
|
||||||
|
|
||||||
func newJobID(prefix string) string {
|
func newJobID(_ string) string {
|
||||||
return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
|
start := int((jobCounter.Add(1) - 1) % 1000)
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
for offset := 0; offset < 1000; offset++ {
|
||||||
|
n := (start + offset) % 1000
|
||||||
|
id := fmt.Sprintf("TASK-%03d", n)
|
||||||
|
if !taskIDInUseLocked(id) {
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("TASK-%03d", start)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskIDInUseLocked(id string) bool {
|
||||||
|
for _, t := range globalQueue.tasks {
|
||||||
|
if t != nil && t.ID == id {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
type taskRunResponse struct {
|
||||||
|
TaskID string `json:"task_id,omitempty"`
|
||||||
|
JobID string `json:"job_id,omitempty"`
|
||||||
|
TaskIDs []string `json:"task_ids,omitempty"`
|
||||||
|
JobIDs []string `json:"job_ids,omitempty"`
|
||||||
|
TaskCount int `json:"task_count,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type nvidiaTaskSelection struct {
|
||||||
|
GPUIndices []int
|
||||||
|
Label string
|
||||||
|
}
|
||||||
|
|
||||||
|
func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
|
||||||
|
if len(tasks) == 0 {
|
||||||
|
writeJSON(w, taskRunResponse{})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
ids := make([]string, 0, len(tasks))
|
||||||
|
for _, t := range tasks {
|
||||||
|
if t == nil || strings.TrimSpace(t.ID) == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ids = append(ids, t.ID)
|
||||||
|
}
|
||||||
|
resp := taskRunResponse{TaskCount: len(ids)}
|
||||||
|
if len(ids) > 0 {
|
||||||
|
resp.TaskID = ids[0]
|
||||||
|
resp.JobID = ids[0]
|
||||||
|
resp.TaskIDs = ids
|
||||||
|
resp.JobIDs = ids
|
||||||
|
}
|
||||||
|
writeJSON(w, resp)
|
||||||
|
}
|
||||||
|
|
||||||
|
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
||||||
|
switch strings.TrimSpace(target) {
|
||||||
|
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
|
||||||
|
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
|
||||||
|
"nvidia-bandwidth", "nvidia-stress":
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
||||||
|
if len(gpus) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
||||||
|
}
|
||||||
|
indexed := make(map[int]platform.NvidiaGPU, len(gpus))
|
||||||
|
allIndices := make([]int, 0, len(gpus))
|
||||||
|
for _, gpu := range gpus {
|
||||||
|
indexed[gpu.Index] = gpu
|
||||||
|
allIndices = append(allIndices, gpu.Index)
|
||||||
|
}
|
||||||
|
sort.Ints(allIndices)
|
||||||
|
|
||||||
|
selected := allIndices
|
||||||
|
if len(include) > 0 {
|
||||||
|
selected = make([]int, 0, len(include))
|
||||||
|
seen := make(map[int]struct{}, len(include))
|
||||||
|
for _, idx := range include {
|
||||||
|
if _, ok := indexed[idx]; !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, dup := seen[idx]; dup {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[idx] = struct{}{}
|
||||||
|
selected = append(selected, idx)
|
||||||
|
}
|
||||||
|
sort.Ints(selected)
|
||||||
|
}
|
||||||
|
if len(exclude) > 0 {
|
||||||
|
skip := make(map[int]struct{}, len(exclude))
|
||||||
|
for _, idx := range exclude {
|
||||||
|
skip[idx] = struct{}{}
|
||||||
|
}
|
||||||
|
filtered := selected[:0]
|
||||||
|
for _, idx := range selected {
|
||||||
|
if _, ok := skip[idx]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
filtered = append(filtered, idx)
|
||||||
|
}
|
||||||
|
selected = filtered
|
||||||
|
}
|
||||||
|
if len(selected) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA GPUs selected")
|
||||||
|
}
|
||||||
|
|
||||||
|
modelGroups := make(map[string][]platform.NvidiaGPU)
|
||||||
|
modelOrder := make([]string, 0)
|
||||||
|
for _, idx := range selected {
|
||||||
|
gpu := indexed[idx]
|
||||||
|
model := strings.TrimSpace(gpu.Name)
|
||||||
|
if model == "" {
|
||||||
|
model = fmt.Sprintf("GPU %d", gpu.Index)
|
||||||
|
}
|
||||||
|
if _, ok := modelGroups[model]; !ok {
|
||||||
|
modelOrder = append(modelOrder, model)
|
||||||
|
}
|
||||||
|
modelGroups[model] = append(modelGroups[model], gpu)
|
||||||
|
}
|
||||||
|
sort.Slice(modelOrder, func(i, j int) bool {
|
||||||
|
left := modelGroups[modelOrder[i]]
|
||||||
|
right := modelGroups[modelOrder[j]]
|
||||||
|
if len(left) == 0 || len(right) == 0 {
|
||||||
|
return modelOrder[i] < modelOrder[j]
|
||||||
|
}
|
||||||
|
return left[0].Index < right[0].Index
|
||||||
|
})
|
||||||
|
|
||||||
|
var groups []nvidiaTaskSelection
|
||||||
|
var singles []nvidiaTaskSelection
|
||||||
|
for _, model := range modelOrder {
|
||||||
|
group := modelGroups[model]
|
||||||
|
sort.Slice(group, func(i, j int) bool { return group[i].Index < group[j].Index })
|
||||||
|
indices := make([]int, 0, len(group))
|
||||||
|
for _, gpu := range group {
|
||||||
|
indices = append(indices, gpu.Index)
|
||||||
|
}
|
||||||
|
if len(indices) >= 2 {
|
||||||
|
groups = append(groups, nvidiaTaskSelection{
|
||||||
|
GPUIndices: indices,
|
||||||
|
Label: fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)),
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
gpu := group[0]
|
||||||
|
singles = append(singles, nvidiaTaskSelection{
|
||||||
|
GPUIndices: []int{gpu.Index},
|
||||||
|
Label: fmt.Sprintf("GPU %d — %s", gpu.Index, model),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return append(groups, singles...), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func joinTaskIndices(indices []int) string {
|
||||||
|
parts := make([]string, 0, len(indices))
|
||||||
|
for _, idx := range indices {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d", idx))
|
||||||
|
}
|
||||||
|
return strings.Join(parts, ",")
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatSplitTaskName(baseName, selectionLabel string) string {
|
||||||
|
baseName = strings.TrimSpace(baseName)
|
||||||
|
selectionLabel = strings.TrimSpace(selectionLabel)
|
||||||
|
if baseName == "" {
|
||||||
|
return selectionLabel
|
||||||
|
}
|
||||||
|
if selectionLabel == "" {
|
||||||
|
return baseName
|
||||||
|
}
|
||||||
|
return baseName + " (" + selectionLabel + ")"
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
|
||||||
|
if !shouldSplitHomogeneousNvidiaTarget(target) {
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID(idPrefix),
|
||||||
|
Name: baseName,
|
||||||
|
Target: target,
|
||||||
|
Priority: priority,
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: createdAt,
|
||||||
|
params: params,
|
||||||
|
}
|
||||||
|
return []*Task{t}, nil
|
||||||
|
}
|
||||||
|
gpus, err := apiListNvidiaGPUs(appRef)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
tasks := make([]*Task, 0, len(selections))
|
||||||
|
for _, selection := range selections {
|
||||||
|
taskParamsCopy := params
|
||||||
|
taskParamsCopy.GPUIndices = append([]int(nil), selection.GPUIndices...)
|
||||||
|
taskParamsCopy.ExcludeGPUIndices = nil
|
||||||
|
displayName := formatSplitTaskName(baseName, selection.Label)
|
||||||
|
taskParamsCopy.DisplayName = displayName
|
||||||
|
tasks = append(tasks, &Task{
|
||||||
|
ID: newJobID(idPrefix),
|
||||||
|
Name: displayName,
|
||||||
|
Target: target,
|
||||||
|
Priority: priority,
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: createdAt,
|
||||||
|
params: taskParamsCopy,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return tasks, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
||||||
@@ -207,28 +433,28 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
name := taskDisplayName(target, body.Profile, body.Loader)
|
name := taskDisplayName(target, body.Profile, body.Loader)
|
||||||
t := &Task{
|
|
||||||
ID: newJobID("sat-" + target),
|
|
||||||
Name: name,
|
|
||||||
Target: target,
|
|
||||||
Status: TaskPending,
|
|
||||||
CreatedAt: time.Now(),
|
|
||||||
params: taskParams{
|
|
||||||
Duration: body.Duration,
|
|
||||||
DiagLevel: body.DiagLevel,
|
|
||||||
GPUIndices: body.GPUIndices,
|
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
|
||||||
Loader: body.Loader,
|
|
||||||
BurnProfile: body.Profile,
|
|
||||||
DisplayName: body.DisplayName,
|
|
||||||
PlatformComponents: body.PlatformComponents,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
t.Name = body.DisplayName
|
name = body.DisplayName
|
||||||
}
|
}
|
||||||
globalQueue.enqueue(t)
|
params := taskParams{
|
||||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
Duration: body.Duration,
|
||||||
|
DiagLevel: body.DiagLevel,
|
||||||
|
GPUIndices: body.GPUIndices,
|
||||||
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
|
Loader: body.Loader,
|
||||||
|
BurnProfile: body.Profile,
|
||||||
|
DisplayName: body.DisplayName,
|
||||||
|
PlatformComponents: body.PlatformComponents,
|
||||||
|
}
|
||||||
|
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, t := range tasks {
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
}
|
||||||
|
writeTaskRunResponse(w, tasks)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -257,27 +483,26 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
|||||||
if body.RunNCCL != nil {
|
if body.RunNCCL != nil {
|
||||||
runNCCL = *body.RunNCCL
|
runNCCL = *body.RunNCCL
|
||||||
}
|
}
|
||||||
t := &Task{
|
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||||
ID: newJobID("benchmark-nvidia"),
|
|
||||||
Name: taskDisplayName("nvidia-benchmark", "", ""),
|
|
||||||
Target: "nvidia-benchmark",
|
|
||||||
Priority: 15,
|
|
||||||
Status: TaskPending,
|
|
||||||
CreatedAt: time.Now(),
|
|
||||||
params: taskParams{
|
|
||||||
GPUIndices: body.GPUIndices,
|
|
||||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
|
||||||
SizeMB: body.SizeMB,
|
|
||||||
BenchmarkProfile: body.Profile,
|
|
||||||
RunNCCL: runNCCL,
|
|
||||||
DisplayName: body.DisplayName,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
if strings.TrimSpace(body.DisplayName) != "" {
|
if strings.TrimSpace(body.DisplayName) != "" {
|
||||||
t.Name = body.DisplayName
|
name = body.DisplayName
|
||||||
}
|
}
|
||||||
globalQueue.enqueue(t)
|
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
|
||||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
GPUIndices: body.GPUIndices,
|
||||||
|
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||||
|
SizeMB: body.SizeMB,
|
||||||
|
BenchmarkProfile: body.Profile,
|
||||||
|
RunNCCL: runNCCL,
|
||||||
|
DisplayName: body.DisplayName,
|
||||||
|
}, name, h.opts.App, "benchmark-nvidia")
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, t := range tasks {
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
}
|
||||||
|
writeTaskRunResponse(w, tasks)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package webui
|
package webui
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
@@ -74,6 +75,14 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
|||||||
globalQueue.tasks = originalTasks
|
globalQueue.tasks = originalTasks
|
||||||
globalQueue.mu.Unlock()
|
globalQueue.mu.Unlock()
|
||||||
})
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 3, Name: "NVIDIA H100 PCIe"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||||
@@ -101,6 +110,97 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
var resp taskRunResponse
|
||||||
|
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||||
|
t.Fatalf("decode response: %v", err)
|
||||||
|
}
|
||||||
|
if len(resp.TaskIDs) != 2 {
|
||||||
|
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 2 {
|
||||||
|
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||||
|
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
originalTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = originalTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
})
|
||||||
|
prevList := apiListNvidiaGPUs
|
||||||
|
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||||
|
return []platform.NvidiaGPU{
|
||||||
|
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||||
|
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||||
|
|
||||||
|
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||||
|
req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
|
||||||
|
h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
|
||||||
|
|
||||||
|
if rec.Code != 200 {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
defer globalQueue.mu.Unlock()
|
||||||
|
if len(globalQueue.tasks) != 2 {
|
||||||
|
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||||
|
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||||
|
}
|
||||||
|
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||||
|
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
h := &handler{}
|
h := &handler{}
|
||||||
h.pushFanRings([]platform.FanReading{
|
h.pushFanRings([]platform.FanReading{
|
||||||
|
|||||||
@@ -1656,6 +1656,12 @@ func renderBenchmark(opts HandlerOptions) string {
|
|||||||
<script>
|
<script>
|
||||||
let benchmarkES = null;
|
let benchmarkES = null;
|
||||||
|
|
||||||
|
function benchmarkTaskIDs(payload) {
|
||||||
|
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||||
|
if (payload && payload.task_id) return [payload.task_id];
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
function benchmarkSelectedGPUIndices() {
|
function benchmarkSelectedGPUIndices() {
|
||||||
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
return Array.from(document.querySelectorAll('.benchmark-gpu-checkbox'))
|
||||||
.filter(function(el) { return el.checked && !el.disabled; })
|
.filter(function(el) { return el.checked && !el.disabled; })
|
||||||
@@ -1755,17 +1761,37 @@ function runNvidiaBenchmark() {
|
|||||||
return payload;
|
return payload;
|
||||||
});
|
});
|
||||||
}).then(function(d) {
|
}).then(function(d) {
|
||||||
status.textContent = 'Task ' + d.task_id + ' queued.';
|
const taskIds = benchmarkTaskIDs(d);
|
||||||
term.textContent += 'Task ' + d.task_id + ' queued. Streaming log...\n';
|
if (!taskIds.length) throw new Error('No benchmark task was queued.');
|
||||||
benchmarkES = new EventSource('/api/tasks/' + d.task_id + '/stream');
|
status.textContent = taskIds.length === 1 ? ('Task ' + taskIds[0] + ' queued.') : ('Queued ' + taskIds.length + ' tasks.');
|
||||||
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
const streamNext = function(idx, failures) {
|
||||||
benchmarkES.addEventListener('done', function(e) {
|
if (idx >= taskIds.length) {
|
||||||
benchmarkES.close();
|
status.textContent = failures ? 'Completed with failures.' : 'Completed.';
|
||||||
benchmarkES = null;
|
return;
|
||||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
}
|
||||||
term.scrollTop = term.scrollHeight;
|
const taskId = taskIds[idx];
|
||||||
status.textContent = e.data ? 'Failed.' : 'Completed.';
|
term.textContent += '\n[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming log...\n';
|
||||||
});
|
benchmarkES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
benchmarkES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
benchmarkES.addEventListener('done', function(e) {
|
||||||
|
benchmarkES.close();
|
||||||
|
benchmarkES = null;
|
||||||
|
if (e.data) failures += 1;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
streamNext(idx + 1, failures);
|
||||||
|
});
|
||||||
|
benchmarkES.onerror = function() {
|
||||||
|
if (benchmarkES) {
|
||||||
|
benchmarkES.close();
|
||||||
|
benchmarkES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
streamNext(idx + 1, failures + 1);
|
||||||
|
};
|
||||||
|
};
|
||||||
|
streamNext(0, 0);
|
||||||
}).catch(function(err) {
|
}).catch(function(err) {
|
||||||
status.textContent = 'Error.';
|
status.textContent = 'Error.';
|
||||||
term.textContent += 'ERROR: ' + err.message + '\n';
|
term.textContent += 'ERROR: ' + err.message + '\n';
|
||||||
@@ -1779,13 +1805,24 @@ benchmarkLoadGPUs();
|
|||||||
|
|
||||||
func renderBenchmarkResultsCard(exportDir string) string {
|
func renderBenchmarkResultsCard(exportDir string) string {
|
||||||
columns, runs := loadBenchmarkHistory(exportDir)
|
columns, runs := loadBenchmarkHistory(exportDir)
|
||||||
if len(runs) == 0 {
|
return renderBenchmarkResultsCardFromRuns(
|
||||||
return `<div class="card"><div class="card-head">Benchmark Results</div><div class="card-body"><p style="color:var(--muted);font-size:13px">No saved benchmark runs yet.</p></div></div>`
|
"Benchmark Results",
|
||||||
}
|
"Composite score by saved benchmark run and GPU.",
|
||||||
|
"No saved benchmark runs yet.",
|
||||||
|
columns,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderBenchmarkResultsCardFromRuns(title, description, emptyMessage string, columns []benchmarkHistoryColumn, runs []benchmarkHistoryRun) string {
|
||||||
|
if len(runs) == 0 {
|
||||||
|
return `<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body"><p style="color:var(--muted);font-size:13px">` + html.EscapeString(emptyMessage) + `</p></div></div>`
|
||||||
|
}
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
b.WriteString(`<div class="card"><div class="card-head">Benchmark Results</div><div class="card-body">`)
|
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(title) + `</div><div class="card-body">`)
|
||||||
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">Composite score by saved benchmark run and GPU.</p>`)
|
if strings.TrimSpace(description) != "" {
|
||||||
|
b.WriteString(`<p style="color:var(--muted);font-size:13px;margin-bottom:12px">` + html.EscapeString(description) + `</p>`)
|
||||||
|
}
|
||||||
b.WriteString(`<div style="overflow-x:auto">`)
|
b.WriteString(`<div style="overflow-x:auto">`)
|
||||||
b.WriteString(`<table><thead><tr><th>Test</th><th>Time</th>`)
|
b.WriteString(`<table><thead><tr><th>Test</th><th>Time</th>`)
|
||||||
for _, col := range columns {
|
for _, col := range columns {
|
||||||
@@ -1820,7 +1857,10 @@ func loadBenchmarkHistory(exportDir string) ([]benchmarkHistoryColumn, []benchma
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
sort.Strings(paths)
|
sort.Strings(paths)
|
||||||
|
return loadBenchmarkHistoryFromPaths(paths)
|
||||||
|
}
|
||||||
|
|
||||||
|
func loadBenchmarkHistoryFromPaths(paths []string) ([]benchmarkHistoryColumn, []benchmarkHistoryRun) {
|
||||||
columnByKey := make(map[string]benchmarkHistoryColumn)
|
columnByKey := make(map[string]benchmarkHistoryColumn)
|
||||||
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
runs := make([]benchmarkHistoryRun, 0, len(paths))
|
||||||
for _, path := range paths {
|
for _, path := range paths {
|
||||||
@@ -2005,6 +2045,12 @@ func renderBurn() string {
|
|||||||
<script>
|
<script>
|
||||||
let biES = null;
|
let biES = null;
|
||||||
|
|
||||||
|
function burnTaskIDs(payload) {
|
||||||
|
if (payload && Array.isArray(payload.task_ids) && payload.task_ids.length) return payload.task_ids;
|
||||||
|
if (payload && payload.task_id) return [payload.task_id];
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
function burnProfile() {
|
function burnProfile() {
|
||||||
const selected = document.querySelector('input[name="burn-profile"]:checked');
|
const selected = document.querySelector('input[name="burn-profile"]:checked');
|
||||||
return selected ? selected.value : 'smoke';
|
return selected ? selected.value : 'smoke';
|
||||||
@@ -2106,6 +2152,9 @@ function streamTask(taskId, label) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
function streamBurnTask(taskId, label, resetTerminal) {
|
function streamBurnTask(taskId, label, resetTerminal) {
|
||||||
|
return streamBurnTaskSet([taskId], label, resetTerminal);
|
||||||
|
}
|
||||||
|
function streamBurnTaskSet(taskIds, label, resetTerminal) {
|
||||||
if (biES) { biES.close(); biES = null; }
|
if (biES) { biES.close(); biES = null; }
|
||||||
document.getElementById('bi-output').style.display = 'block';
|
document.getElementById('bi-output').style.display = 'block';
|
||||||
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
document.getElementById('bi-title').textContent = '— ' + label + ' [' + burnProfile() + ']';
|
||||||
@@ -2113,27 +2162,40 @@ function streamBurnTask(taskId, label, resetTerminal) {
|
|||||||
if (resetTerminal) {
|
if (resetTerminal) {
|
||||||
term.textContent = '';
|
term.textContent = '';
|
||||||
}
|
}
|
||||||
term.textContent += 'Task ' + taskId + ' queued. Streaming...\n';
|
if (!Array.isArray(taskIds) || !taskIds.length) {
|
||||||
return new Promise(function(resolve) {
|
term.textContent += 'ERROR: no tasks queued.\n';
|
||||||
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
return Promise.resolve({ok:false, error:'no tasks queued'});
|
||||||
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
}
|
||||||
biES.addEventListener('done', function(e) {
|
const streamNext = function(idx, failures) {
|
||||||
biES.close();
|
if (idx >= taskIds.length) {
|
||||||
biES = null;
|
return Promise.resolve({ok: failures === 0, error: failures ? (failures + ' task(s) failed') : ''});
|
||||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
}
|
||||||
term.scrollTop = term.scrollHeight;
|
const taskId = taskIds[idx];
|
||||||
resolve({ok: !e.data, error: e.data || ''});
|
term.textContent += '[' + (idx + 1) + '/' + taskIds.length + '] Task ' + taskId + ' queued. Streaming...\n';
|
||||||
});
|
return new Promise(function(resolve) {
|
||||||
biES.onerror = function() {
|
biES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
if (biES) {
|
biES.onmessage = function(e) { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
biES.addEventListener('done', function(e) {
|
||||||
biES.close();
|
biES.close();
|
||||||
biES = null;
|
biES = null;
|
||||||
}
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
term.textContent += '\nERROR: stream disconnected.\n';
|
term.scrollTop = term.scrollHeight;
|
||||||
term.scrollTop = term.scrollHeight;
|
resolve(failures + (e.data ? 1 : 0));
|
||||||
resolve({ok: false, error: 'stream disconnected'});
|
});
|
||||||
};
|
biES.onerror = function() {
|
||||||
});
|
if (biES) {
|
||||||
|
biES.close();
|
||||||
|
biES = null;
|
||||||
|
}
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve(failures + 1);
|
||||||
|
};
|
||||||
|
}).then(function(nextFailures) {
|
||||||
|
return streamNext(idx + 1, nextFailures);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
return streamNext(0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
function runBurnTaskSet(tasks, statusElId) {
|
function runBurnTaskSet(tasks, statusElId) {
|
||||||
@@ -2161,7 +2223,7 @@ function runBurnTaskSet(tasks, statusElId) {
|
|||||||
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
|
if (status) status.textContent = 'Running ' + (idx + 1) + '/' + enabled.length + '...';
|
||||||
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
return enqueueBurnTask(t.target, t.label, t.extra, !!t.nvidia)
|
||||||
.then(function(d) {
|
.then(function(d) {
|
||||||
return streamBurnTask(d.task_id, t.label, false);
|
return streamBurnTaskSet(burnTaskIDs(d), t.label, false);
|
||||||
})
|
})
|
||||||
.then(function() {
|
.then(function() {
|
||||||
return runNext(idx + 1);
|
return runNext(idx + 1);
|
||||||
|
|||||||
@@ -230,6 +230,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
|||||||
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
||||||
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
||||||
b.WriteString(`</div></div></div>`)
|
b.WriteString(`</div></div></div>`)
|
||||||
|
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||||
|
b.WriteString(benchmarkCard)
|
||||||
|
}
|
||||||
|
|
||||||
if len(report.Charts) > 0 {
|
if len(report.Charts) > 0 {
|
||||||
for _, chart := range report.Charts {
|
for _, chart := range report.Charts {
|
||||||
@@ -247,6 +250,57 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
|||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||||
|
if strings.TrimSpace(target) != "nvidia-benchmark" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
resultPath := taskBenchmarkResultPath(logText)
|
||||||
|
if strings.TrimSpace(resultPath) == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
|
||||||
|
if len(runs) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return renderBenchmarkResultsCardFromRuns(
|
||||||
|
"Benchmark Results",
|
||||||
|
"Composite score for this benchmark task.",
|
||||||
|
"No benchmark results were saved for this task.",
|
||||||
|
columns,
|
||||||
|
runs,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskBenchmarkResultPath(logText string) string {
|
||||||
|
archivePath := taskArchivePathFromLog(logText)
|
||||||
|
if archivePath == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
if runDir == archivePath {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return filepath.Join(runDir, "result.json")
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskArchivePathFromLog(logText string) string {
|
||||||
|
lines := strings.Split(logText, "\n")
|
||||||
|
for i := len(lines) - 1; i >= 0; i-- {
|
||||||
|
line := strings.TrimSpace(lines[i])
|
||||||
|
if line == "" || !strings.HasPrefix(line, "Archive:") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
path := strings.TrimSpace(strings.TrimPrefix(line, "Archive:"))
|
||||||
|
if strings.HasPrefix(path, "Archive written to ") {
|
||||||
|
path = strings.TrimSpace(strings.TrimPrefix(path, "Archive written to "))
|
||||||
|
}
|
||||||
|
if strings.HasSuffix(path, ".tar.gz") {
|
||||||
|
return path
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
func renderTaskStatusBadge(status string) string {
|
func renderTaskStatusBadge(status string) string {
|
||||||
className := map[string]string{
|
className := map[string]string{
|
||||||
TaskRunning: "badge-ok",
|
TaskRunning: "badge-ok",
|
||||||
|
|||||||
@@ -1149,7 +1149,32 @@ func taskArtifactsDir(root string, t *Task, status string) string {
|
|||||||
if strings.TrimSpace(root) == "" || t == nil {
|
if strings.TrimSpace(root) == "" || t == nil {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", t.ID, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
|
prefix := taskFolderNumberPrefix(t.ID)
|
||||||
|
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", prefix, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
|
||||||
|
}
|
||||||
|
|
||||||
|
func taskFolderNumberPrefix(taskID string) string {
|
||||||
|
taskID = strings.TrimSpace(taskID)
|
||||||
|
if strings.HasPrefix(taskID, "TASK-") && len(taskID) >= len("TASK-000") {
|
||||||
|
num := strings.TrimSpace(strings.TrimPrefix(taskID, "TASK-"))
|
||||||
|
if len(num) == 3 {
|
||||||
|
allDigits := true
|
||||||
|
for _, r := range num {
|
||||||
|
if r < '0' || r > '9' {
|
||||||
|
allDigits = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if allDigits {
|
||||||
|
return num
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fallback := sanitizeTaskFolderPart(taskID)
|
||||||
|
if fallback == "" {
|
||||||
|
return "000"
|
||||||
|
}
|
||||||
|
return fallback
|
||||||
}
|
}
|
||||||
|
|
||||||
func ensureTaskReportPaths(t *Task) {
|
func ensureTaskReportPaths(t *Task) {
|
||||||
|
|||||||
@@ -163,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
origTasks := globalQueue.tasks
|
||||||
|
globalQueue.tasks = nil
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
origCounter := jobCounter.Load()
|
||||||
|
jobCounter.Store(0)
|
||||||
|
t.Cleanup(func() {
|
||||||
|
globalQueue.mu.Lock()
|
||||||
|
globalQueue.tasks = origTasks
|
||||||
|
globalQueue.mu.Unlock()
|
||||||
|
jobCounter.Store(origCounter)
|
||||||
|
})
|
||||||
|
|
||||||
|
if got := newJobID("ignored"); got != "TASK-000" {
|
||||||
|
t.Fatalf("id=%q want TASK-000", got)
|
||||||
|
}
|
||||||
|
if got := newJobID("ignored"); got != "TASK-001" {
|
||||||
|
t.Fatalf("id=%q want TASK-001", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
|
||||||
|
root := t.TempDir()
|
||||||
|
task := &Task{
|
||||||
|
ID: "TASK-007",
|
||||||
|
Name: "NVIDIA Benchmark",
|
||||||
|
}
|
||||||
|
got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
|
||||||
|
if !strings.HasPrefix(got, "007_") {
|
||||||
|
t.Fatalf("artifacts dir=%q want prefix 007_", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
logPath := filepath.Join(dir, "task.log")
|
logPath := filepath.Join(dir, "task.log")
|
||||||
@@ -325,6 +359,78 @@ func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
metricsPath := filepath.Join(dir, "metrics.db")
|
||||||
|
prevMetricsPath := taskReportMetricsDBPath
|
||||||
|
taskReportMetricsDBPath = metricsPath
|
||||||
|
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||||
|
|
||||||
|
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||||
|
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
result := platform.NvidiaBenchmarkResult{
|
||||||
|
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||||
|
BenchmarkProfile: "standard",
|
||||||
|
OverallStatus: "OK",
|
||||||
|
GPUs: []platform.BenchmarkGPUResult{
|
||||||
|
{
|
||||||
|
Index: 0,
|
||||||
|
Name: "NVIDIA H100 PCIe",
|
||||||
|
Scores: platform.BenchmarkScorecard{
|
||||||
|
CompositeScore: 1176.25,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
raw, err := json.Marshal(result)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
|
||||||
|
if err := os.MkdirAll(artifactsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
task := &Task{
|
||||||
|
ID: "task-bench",
|
||||||
|
Name: "NVIDIA Benchmark",
|
||||||
|
Target: "nvidia-benchmark",
|
||||||
|
Status: TaskDone,
|
||||||
|
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||||
|
ArtifactsDir: artifactsDir,
|
||||||
|
}
|
||||||
|
ensureTaskReportPaths(task)
|
||||||
|
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
|
||||||
|
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := writeTaskReportArtifacts(task); err != nil {
|
||||||
|
t.Fatalf("writeTaskReportArtifacts: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
body, err := os.ReadFile(task.ReportHTMLPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ReadFile(report.html): %v", err)
|
||||||
|
}
|
||||||
|
html := string(body)
|
||||||
|
for _, needle := range []string{
|
||||||
|
`Benchmark Results`,
|
||||||
|
`Composite score for this benchmark task.`,
|
||||||
|
`NVIDIA H100 PCIe / GPU 0`,
|
||||||
|
`1176.25`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(html, needle) {
|
||||||
|
t.Fatalf("report missing %q: %s", needle, html)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
||||||
var lines []string
|
var lines []string
|
||||||
prev := taskSerialWriteLine
|
prev := taskSerialWriteLine
|
||||||
|
|||||||
@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
|
|||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "unknown arg: $1" >&2
|
echo "unknown arg: $1" >&2
|
||||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
case "$VARIANT" in
|
case "$VARIANT" in
|
||||||
nvidia|amd|nogpu|all) ;;
|
nvidia|nvidia-legacy|amd|nogpu|all) ;;
|
||||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
*) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||||
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
|||||||
"${CACHE_DIR:?}/lb-packages"
|
"${CACHE_DIR:?}/lb-packages"
|
||||||
echo "=== cleaning live-build work dirs ==="
|
echo "=== cleaning live-build work dirs ==="
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
|
||||||
echo "=== caches cleared, proceeding with build ==="
|
echo "=== caches cleared, proceeding with build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -180,6 +185,9 @@ case "$VARIANT" in
|
|||||||
nvidia)
|
nvidia)
|
||||||
run_variant nvidia
|
run_variant nvidia
|
||||||
;;
|
;;
|
||||||
|
nvidia-legacy)
|
||||||
|
run_variant nvidia-legacy
|
||||||
|
;;
|
||||||
amd)
|
amd)
|
||||||
run_variant amd
|
run_variant amd
|
||||||
;;
|
;;
|
||||||
@@ -188,6 +196,7 @@ case "$VARIANT" in
|
|||||||
;;
|
;;
|
||||||
all)
|
all)
|
||||||
run_variant nvidia
|
run_variant nvidia
|
||||||
|
run_variant nvidia-legacy
|
||||||
run_variant amd
|
run_variant amd
|
||||||
run_variant nogpu
|
run_variant nogpu
|
||||||
;;
|
;;
|
||||||
|
|||||||
@@ -1,8 +1,10 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
|
# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
|
||||||
#
|
#
|
||||||
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
||||||
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
|
# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
|
||||||
|
# - open -> kernel-open/ sources from the .run installer
|
||||||
|
# - proprietary -> traditional proprietary kernel sources from the .run installer
|
||||||
#
|
#
|
||||||
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
||||||
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
||||||
@@ -17,10 +19,19 @@ set -e
|
|||||||
NVIDIA_VERSION="$1"
|
NVIDIA_VERSION="$1"
|
||||||
DIST_DIR="$2"
|
DIST_DIR="$2"
|
||||||
DEBIAN_KERNEL_ABI="$3"
|
DEBIAN_KERNEL_ABI="$3"
|
||||||
|
NVIDIA_FLAVOR="${4:-open}"
|
||||||
|
|
||||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||||
|
|
||||||
|
case "$NVIDIA_FLAVOR" in
|
||||||
|
open|proprietary) ;;
|
||||||
|
*)
|
||||||
|
echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
# On Debian, kernel headers are split into two packages:
|
# On Debian, kernel headers are split into two packages:
|
||||||
@@ -31,22 +42,13 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
|||||||
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
||||||
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
||||||
|
|
||||||
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
|
echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="
|
||||||
|
|
||||||
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
|
||||||
echo "=== installing linux-headers-${KVER} ==="
|
|
||||||
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
|
||||||
"linux-headers-${KVER}" \
|
|
||||||
gcc make perl
|
|
||||||
fi
|
|
||||||
echo "kernel headers (arch): $KDIR_ARCH"
|
|
||||||
echo "kernel headers (common): $KDIR_COMMON"
|
|
||||||
|
|
||||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||||
CACHE_LAYOUT_VERSION="2"
|
CACHE_LAYOUT_VERSION="3"
|
||||||
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||||
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||||
@@ -57,6 +59,15 @@ if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
|||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
||||||
|
echo "=== installing linux-headers-${KVER} ==="
|
||||||
|
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||||
|
"linux-headers-${KVER}" \
|
||||||
|
gcc make perl
|
||||||
|
fi
|
||||||
|
echo "kernel headers (arch): $KDIR_ARCH"
|
||||||
|
echo "kernel headers (common): $KDIR_COMMON"
|
||||||
|
|
||||||
# Download official NVIDIA .run installer with sha256 verification
|
# Download official NVIDIA .run installer with sha256 verification
|
||||||
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
||||||
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
||||||
@@ -90,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
|
|||||||
rm -rf "$EXTRACT_DIR"
|
rm -rf "$EXTRACT_DIR"
|
||||||
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
||||||
|
|
||||||
# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
|
# Find kernel source directory for the selected flavor.
|
||||||
KERNEL_SRC=""
|
KERNEL_SRC=""
|
||||||
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
if [ "$NVIDIA_FLAVOR" = "open" ]; then
|
||||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
|
||||||
done
|
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
done
|
||||||
|
else
|
||||||
|
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
||||||
|
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||||
echo "kernel source: $KERNEL_SRC"
|
echo "kernel source: $KERNEL_SRC"
|
||||||
|
|
||||||
# Build kernel modules
|
# Build kernel modules
|
||||||
|
|||||||
@@ -15,26 +15,46 @@ DIST_DIR="${REPO_ROOT}/dist"
|
|||||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
|
BUILD_VARIANT="nvidia"
|
||||||
BEE_GPU_VENDOR="nvidia"
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||||
|
|
||||||
# parse args
|
# parse args
|
||||||
while [ $# -gt 0 ]; do
|
while [ $# -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||||
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
--variant) BUILD_VARIANT="$2"; shift 2 ;;
|
||||||
*) echo "unknown arg: $1"; exit 1 ;;
|
*) echo "unknown arg: $1"; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
case "$BEE_GPU_VENDOR" in
|
case "$BUILD_VARIANT" in
|
||||||
nvidia|amd|nogpu) ;;
|
nvidia)
|
||||||
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||||
|
;;
|
||||||
|
nvidia-legacy)
|
||||||
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR="proprietary"
|
||||||
|
;;
|
||||||
|
amd)
|
||||||
|
BEE_GPU_VENDOR="amd"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||||
|
;;
|
||||||
|
nogpu)
|
||||||
|
BEE_GPU_VENDOR="nogpu"
|
||||||
|
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "unknown variant: $BUILD_VARIANT (expected nvidia, nvidia-legacy, amd, or nogpu)" >&2
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
|
||||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
||||||
|
|
||||||
export BEE_GPU_VENDOR
|
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
export PATH="$PATH:/usr/local/go/bin"
|
export PATH="$PATH:/usr/local/go/bin"
|
||||||
@@ -627,7 +647,7 @@ recover_iso_memtest() {
|
|||||||
|
|
||||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||||
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||||
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
||||||
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
||||||
mkdir -p "${OUT_DIR}"
|
mkdir -p "${OUT_DIR}"
|
||||||
@@ -801,7 +821,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
|||||||
apt-get install -y "linux-headers-${KVER}"
|
apt-get install -y "linux-headers-${KVER}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
|
||||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -871,7 +891,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
|
||||||
# Sync builder config into variant work dir, preserving lb cache.
|
# Sync builder config into variant work dir, preserving lb cache.
|
||||||
@@ -981,10 +1001,10 @@ done
|
|||||||
# --- NVIDIA kernel modules and userspace libs ---
|
# --- NVIDIA kernel modules and userspace libs ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||||
|
|
||||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||||
@@ -1055,13 +1075,14 @@ GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo u
|
|||||||
|
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||||
|
NVIDIA_KERNEL_MODULES_FLAVOR=${BEE_NVIDIA_MODULE_FLAVOR}
|
||||||
NCCL_VERSION=${NCCL_VERSION}
|
NCCL_VERSION=${NCCL_VERSION}
|
||||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||||
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
GPU_BUILD_INFO="nvidia-${BEE_NVIDIA_MODULE_FLAVOR}:${NVIDIA_DRIVER_VERSION}"
|
||||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||||
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||||
@@ -1073,6 +1094,7 @@ fi
|
|||||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||||
|
BEE_BUILD_VARIANT=${BUILD_VARIANT}
|
||||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||||
BUILD_DATE=${BUILD_DATE}
|
BUILD_DATE=${BUILD_DATE}
|
||||||
GIT_COMMIT=${GIT_COMMIT}
|
GIT_COMMIT=${GIT_COMMIT}
|
||||||
@@ -1083,6 +1105,11 @@ EOF
|
|||||||
|
|
||||||
# Write GPU vendor marker for hooks
|
# Write GPU vendor marker for hooks
|
||||||
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||||
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
echo "${BEE_NVIDIA_MODULE_FLAVOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||||
|
else
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||||
|
fi
|
||||||
|
|
||||||
# Patch motd with build info
|
# Patch motd with build info
|
||||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||||
@@ -1153,10 +1180,10 @@ fi
|
|||||||
|
|
||||||
# --- build ISO using live-build ---
|
# --- build ISO using live-build ---
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||||
|
|
||||||
# Export for auto/config
|
# Export for auto/config
|
||||||
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||||
export BEE_GPU_VENDOR_UPPER
|
export BEE_GPU_VENDOR_UPPER
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
@@ -1191,7 +1218,7 @@ if [ -f "$ISO_RAW" ]; then
|
|||||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
echo "=== done (${BUILD_VARIANT}) ==="
|
||||||
echo "ISO: $ISO_OUT"
|
echo "ISO: $ISO_OUT"
|
||||||
if command -v stat >/dev/null 2>&1; then
|
if command -v stat >/dev/null 2>&1; then
|
||||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||||
|
|||||||
@@ -62,6 +62,8 @@ done
|
|||||||
echo "loader=bee-gpu-burn"
|
echo "loader=bee-gpu-burn"
|
||||||
echo "selected_gpus=${FINAL}"
|
echo "selected_gpus=${FINAL}"
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
|
||||||
TMP_DIR=$(mktemp -d)
|
TMP_DIR=$(mktemp -d)
|
||||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||||
|
|
||||||
@@ -78,7 +80,8 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
||||||
"${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
CUDA_VISIBLE_DEVICES="${id}" \
|
||||||
|
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||||
pid=$!
|
pid=$!
|
||||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||||
done
|
done
|
||||||
|
|||||||
@@ -152,14 +152,19 @@ done
|
|||||||
|
|
||||||
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
export CUDA_VISIBLE_DEVICES="${FINAL}"
|
||||||
|
|
||||||
JOHN_DEVICES=""
|
JOHN_DEVICES=""
|
||||||
|
local_id=1
|
||||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||||
opencl_id=$((id + 1))
|
opencl_id="${local_id}"
|
||||||
if [ -z "${JOHN_DEVICES}" ]; then
|
if [ -z "${JOHN_DEVICES}" ]; then
|
||||||
JOHN_DEVICES="${opencl_id}"
|
JOHN_DEVICES="${opencl_id}"
|
||||||
else
|
else
|
||||||
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
|
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
|
||||||
fi
|
fi
|
||||||
|
local_id=$((local_id + 1))
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "loader=john"
|
echo "loader=john"
|
||||||
|
|||||||
@@ -70,6 +70,8 @@ echo "gpu_count=${GPU_COUNT}"
|
|||||||
echo "range=${MIN_BYTES}..${MAX_BYTES}"
|
echo "range=${MIN_BYTES}..${MAX_BYTES}"
|
||||||
echo "iters=${ITERS}"
|
echo "iters=${ITERS}"
|
||||||
|
|
||||||
|
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||||
|
|
||||||
deadline=$(( $(date +%s) + SECONDS ))
|
deadline=$(( $(date +%s) + SECONDS ))
|
||||||
round=0
|
round=0
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user