Compare commits
29 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d2eadedff2 | ||
|
|
a98c4d7461 | ||
|
|
2354ae367d | ||
|
|
0d0e1f55a7 | ||
|
|
35f4c53887 | ||
|
|
981315e6fd | ||
|
|
fc5c100a29 | ||
| 6e94216f3b | |||
| 53455063b9 | |||
| 4602f97836 | |||
| c65d3ae3b1 | |||
| 7a21c370e4 | |||
| a493e3ab5b | |||
| 19b4803ec7 | |||
| 1bdfb1e9ca | |||
| c5d6b30177 | |||
| 5b9015451e | |||
| d1a6863ceb | |||
| f9aa05de8e | |||
| a9ccea8cca | |||
| fc5c985fb5 | |||
| 5eb3baddb4 | |||
| a6ac13b5d3 | |||
| 4003cb7676 | |||
| 2875313ba0 | |||
| f1621efee4 | |||
| 4461249cc3 | |||
| e609fbbc26 | |||
| cc2b49ea41 |
@@ -243,7 +243,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
return "", fmt.Errorf("write result.json: %w", err)
|
||||
}
|
||||
|
||||
report := renderBenchmarkReport(result)
|
||||
report := renderBenchmarkReportWithCharts(result, loadBenchmarkReportCharts(runDir, selected))
|
||||
if err := os.WriteFile(filepath.Join(runDir, "report.txt"), []byte(report), 0644); err != nil {
|
||||
return "", fmt.Errorf("write report.txt: %w", err)
|
||||
}
|
||||
@@ -679,7 +679,10 @@ func runBenchmarkInterconnect(ctx context.Context, verboseLog, runDir string, gp
|
||||
"-g", strconv.Itoa(len(gpuIndices)),
|
||||
"--iters", strconv.Itoa(maxInt(20, spec.NCCLSec/10)),
|
||||
}
|
||||
env := []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
|
||||
env := []string{
|
||||
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||
}
|
||||
logFunc(fmt.Sprintf("NCCL interconnect: gpus=%s", joinIndexList(gpuIndices)))
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, "nccl-all-reduce.log", cmd, env, logFunc)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "nccl-all-reduce.log"), out, 0644)
|
||||
|
||||
@@ -2,11 +2,25 @@ package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||
return renderBenchmarkReportWithCharts(result, nil)
|
||||
}
|
||||
|
||||
type benchmarkReportChart struct {
|
||||
Title string
|
||||
Content string
|
||||
}
|
||||
|
||||
var ansiEscapePattern = regexp.MustCompile(`\x1b\[[0-9;]*m`)
|
||||
|
||||
func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benchmarkReportChart) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "Bee NVIDIA Benchmark Report\n")
|
||||
fmt.Fprintf(&b, "===========================\n\n")
|
||||
@@ -93,6 +107,20 @@ func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
if len(charts) > 0 {
|
||||
fmt.Fprintf(&b, "Terminal Charts\n")
|
||||
fmt.Fprintf(&b, "---------------\n")
|
||||
for _, chart := range charts {
|
||||
content := strings.TrimSpace(stripANSIEscapeSequences(chart.Content))
|
||||
if content == "" {
|
||||
continue
|
||||
}
|
||||
fmt.Fprintf(&b, "%s\n", chart.Title)
|
||||
fmt.Fprintf(&b, "%s\n", strings.Repeat("~", len(chart.Title)))
|
||||
fmt.Fprintf(&b, "%s\n\n", content)
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "Methodology\n")
|
||||
fmt.Fprintf(&b, "-----------\n")
|
||||
fmt.Fprintf(&b, "- Profile %s uses standardized baseline, warmup, steady-state, interconnect, and cooldown phases.\n", result.BenchmarkProfile)
|
||||
@@ -117,6 +145,36 @@ func renderBenchmarkReport(result NvidiaBenchmarkResult) string {
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func loadBenchmarkReportCharts(runDir string, gpuIndices []int) []benchmarkReportChart {
|
||||
phases := []struct {
|
||||
name string
|
||||
label string
|
||||
}{
|
||||
{name: "baseline", label: "Baseline"},
|
||||
{name: "steady", label: "Steady State"},
|
||||
{name: "cooldown", label: "Cooldown"},
|
||||
}
|
||||
var charts []benchmarkReportChart
|
||||
for _, idx := range gpuIndices {
|
||||
for _, phase := range phases {
|
||||
path := filepath.Join(runDir, fmt.Sprintf("gpu-%d-%s-metrics-term.txt", idx, phase.name))
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil || len(raw) == 0 {
|
||||
continue
|
||||
}
|
||||
charts = append(charts, benchmarkReportChart{
|
||||
Title: fmt.Sprintf("GPU %d %s", idx, phase.label),
|
||||
Content: string(raw),
|
||||
})
|
||||
}
|
||||
}
|
||||
return charts
|
||||
}
|
||||
|
||||
func stripANSIEscapeSequences(raw string) string {
|
||||
return ansiEscapePattern.ReplaceAllString(raw, "")
|
||||
}
|
||||
|
||||
func renderBenchmarkSummary(result NvidiaBenchmarkResult) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "run_at_utc=%s\n", result.GeneratedAt.Format(time.RFC3339))
|
||||
|
||||
@@ -145,3 +145,35 @@ func TestRenderBenchmarkReportIncludesFindingsAndScores(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderBenchmarkReportIncludesTerminalChartsWithoutANSI(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
report := renderBenchmarkReportWithCharts(NvidiaBenchmarkResult{
|
||||
BenchmarkProfile: NvidiaBenchmarkProfileStandard,
|
||||
OverallStatus: "OK",
|
||||
SelectedGPUIndices: []int{0},
|
||||
Normalization: BenchmarkNormalization{
|
||||
Status: "full",
|
||||
},
|
||||
}, []benchmarkReportChart{
|
||||
{
|
||||
Title: "GPU 0 Steady State",
|
||||
Content: "\x1b[31mGPU 0 chart\x1b[0m\n 42┤───",
|
||||
},
|
||||
})
|
||||
|
||||
for _, needle := range []string{
|
||||
"Terminal Charts",
|
||||
"GPU 0 Steady State",
|
||||
"GPU 0 chart",
|
||||
"42┤───",
|
||||
} {
|
||||
if !strings.Contains(report, needle) {
|
||||
t.Fatalf("report missing %q\n%s", needle, report)
|
||||
}
|
||||
}
|
||||
if strings.Contains(report, "\x1b[31m") {
|
||||
t.Fatalf("report should not contain ANSI escapes\n%s", report)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,6 +15,10 @@ var workerPatterns = []string{
|
||||
"stress-ng",
|
||||
"stressapptest",
|
||||
"memtester",
|
||||
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||
"nvvs",
|
||||
"dcgmi",
|
||||
}
|
||||
|
||||
// KilledProcess describes a process that was sent SIGKILL.
|
||||
|
||||
@@ -16,12 +16,12 @@ func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts N
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
job,
|
||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func nvidiaStressArchivePrefix(loader string) string {
|
||||
|
||||
@@ -110,7 +110,7 @@ func (s *System) RunPlatformStress(
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor, cycle.LoadSec)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
@@ -392,6 +392,13 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||
@@ -402,28 +409,28 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
|
||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
||||
func buildGPUStressCmd(ctx context.Context, vendor string, durSec int) *exec.Cmd {
|
||||
switch strings.ToLower(vendor) {
|
||||
case "amd":
|
||||
return buildAMDGPUStressCmd(ctx)
|
||||
return buildAMDGPUStressCmd(ctx, durSec)
|
||||
case "nvidia":
|
||||
return buildNvidiaGPUStressCmd(ctx)
|
||||
return buildNvidiaGPUStressCmd(ctx, durSec)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
func buildAMDGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||
rvsArgs, err := resolveRVSCommand()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
rvsPath := rvsArgs[0]
|
||||
cfg := `actions:
|
||||
cfg := fmt.Sprintf(`actions:
|
||||
- name: gst_platform
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
duration: 86400000
|
||||
duration: %d`, durSec*1000) + `
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size_a: 8640
|
||||
@@ -433,13 +440,20 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context, durSec int) *exec.Cmd {
|
||||
path, err := satLookPath("bee-gpu-burn")
|
||||
if err != nil {
|
||||
path, err = satLookPath("bee-gpu-stress")
|
||||
@@ -447,7 +461,17 @@ func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
||||
// Pass exact duration so bee-gpu-burn exits on its own when the cycle ends.
|
||||
// Process group kill via Setpgid+Cancel is kept as a safety net for cases
|
||||
// where the context is cancelled early (user stop, parent timeout).
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", strconv.Itoa(durSec))
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
cmd.Cancel = func() error {
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
|
||||
@@ -173,6 +173,22 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
|
||||
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
if raw, err := os.ReadFile("/run/bee-nvidia-mode"); err == nil {
|
||||
health.NvidiaGSPMode = strings.TrimSpace(string(raw))
|
||||
if health.NvidiaGSPMode == "gsp-stuck" {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_gsp_stuck",
|
||||
Severity: "critical",
|
||||
Description: "NVIDIA GSP firmware init timed out and the kernel module is stuck. Reboot and select 'GSP=off' in the boot menu.",
|
||||
})
|
||||
} else if health.NvidiaGSPMode == "gsp-off" {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "nvidia_gsp_disabled",
|
||||
Severity: "warning",
|
||||
Description: "NVIDIA GSP firmware disabled (fallback). Power management runs via CPU path — power draw readings may differ from reference hardware.",
|
||||
})
|
||||
}
|
||||
}
|
||||
health.DriverReady = strings.Contains(lsmodText, "nvidia ")
|
||||
if !health.DriverReady {
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
|
||||
@@ -21,10 +21,11 @@ import (
|
||||
)
|
||||
|
||||
var (
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
satGlob = filepath.Glob
|
||||
satStat = os.Stat
|
||||
satExecCommand = exec.Command
|
||||
satLookPath = exec.LookPath
|
||||
satGlob = filepath.Glob
|
||||
satStat = os.Stat
|
||||
satFreeMemBytes = freeMemBytes
|
||||
|
||||
rocmSMIExecutableGlobs = []string{
|
||||
"/opt/rocm/bin/rocm-smi",
|
||||
@@ -262,6 +263,9 @@ func (s *System) ListNvidiaGPUs() ([]NvidiaGPU, error) {
|
||||
MemoryMB: memMB,
|
||||
})
|
||||
}
|
||||
sort.Slice(gpus, func(i, j int) bool {
|
||||
return gpus[i].Index < gpus[j].Index
|
||||
})
|
||||
return gpus, nil
|
||||
}
|
||||
|
||||
@@ -274,13 +278,13 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
if gpuCount < 1 {
|
||||
gpuCount = 1
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "nccl-tests", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-all-reduce-perf.log", cmd: []string{
|
||||
"all_reduce_perf", "-b", "512M", "-e", "4G", "-f", "2",
|
||||
"-g", strconv.Itoa(gpuCount), "--iters", "20",
|
||||
}},
|
||||
}, logFunc)
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
@@ -292,18 +296,18 @@ func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir strin
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||
{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||
satJob{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: nvidiaVisibleDevicesEnv(selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
@@ -311,16 +315,16 @@ func (s *System) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string,
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-power", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-targeted-power.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_power", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
@@ -328,16 +332,16 @@ func (s *System) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, dur
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-pulse", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-pulse-test.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("pulse_test", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
@@ -345,16 +349,16 @@ func (s *System) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpu
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-bandwidth", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-nvbandwidth.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("nvbandwidth", 0, selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
@@ -378,16 +382,23 @@ func (s *System) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDi
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{
|
||||
// Kill any lingering nvvs/dcgmi processes from a previous interrupted run
|
||||
// before starting — otherwise dcgmi diag fails with DCGM_ST_IN_USE (-34).
|
||||
if killed := KillTestWorkers(); len(killed) > 0 && logFunc != nil {
|
||||
for _, p := range killed {
|
||||
logFunc(fmt.Sprintf("pre-flight: killed stale worker pid=%d name=%s", p.PID, p.Name))
|
||||
}
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-targeted-stress", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{
|
||||
name: "02-dcgmi-targeted-stress.log",
|
||||
cmd: nvidiaDCGMNamedDiagCommand("targeted_stress", normalizeNvidiaBurnDuration(durationSec), selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
satJob{name: "03-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||
@@ -404,6 +415,25 @@ func resolveDCGMGPUIndices(gpuIndices []int) ([]int, error) {
|
||||
return all, nil
|
||||
}
|
||||
|
||||
func memoryStressSizeArg() string {
|
||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||
return fmt.Sprintf("%dM", mb)
|
||||
}
|
||||
availBytes := satFreeMemBytes()
|
||||
if availBytes <= 0 {
|
||||
return "80%"
|
||||
}
|
||||
availMB := availBytes / (1024 * 1024)
|
||||
targetMB := (availMB * 2) / 3
|
||||
if targetMB >= 256 {
|
||||
targetMB = (targetMB / 256) * 256
|
||||
}
|
||||
if targetMB <= 0 {
|
||||
return "80%"
|
||||
}
|
||||
return fmt.Sprintf("%dM", targetMB)
|
||||
}
|
||||
|
||||
func (s *System) RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
sizeMB := envInt("BEE_MEMTESTER_SIZE_MB", 128)
|
||||
passes := envInt("BEE_MEMTESTER_PASSES", 1)
|
||||
@@ -419,11 +449,9 @@ func (s *System) RunMemoryStressPack(ctx context.Context, baseDir string, durati
|
||||
if seconds <= 0 {
|
||||
seconds = envInt("BEE_VM_STRESS_SECONDS", 300)
|
||||
}
|
||||
// Use 80% of RAM by default; override with BEE_VM_STRESS_SIZE_MB.
|
||||
sizeArg := "80%"
|
||||
if mb := envInt("BEE_VM_STRESS_SIZE_MB", 0); mb > 0 {
|
||||
sizeArg = fmt.Sprintf("%dM", mb)
|
||||
}
|
||||
// Base the default on current MemAvailable and keep headroom for the OS and
|
||||
// concurrent stressors so mixed burn runs do not trip the OOM killer.
|
||||
sizeArg := memoryStressSizeArg()
|
||||
return runAcceptancePackCtx(ctx, baseDir, "memory-stress", []satJob{
|
||||
{name: "01-free-before.log", cmd: []string{"free", "-h"}},
|
||||
{name: "02-stress-ng-vm.log", cmd: []string{
|
||||
@@ -540,14 +568,24 @@ type satStats struct {
|
||||
Unsupported int
|
||||
}
|
||||
|
||||
func withNvidiaPersistenceMode(jobs ...satJob) []satJob {
|
||||
out := make([]satJob, 0, len(jobs)+1)
|
||||
out = append(out, satJob{
|
||||
name: "00-nvidia-smi-persistence-mode.log",
|
||||
cmd: []string{"nvidia-smi", "-pm", "1"},
|
||||
})
|
||||
out = append(out, jobs...)
|
||||
return out
|
||||
}
|
||||
|
||||
func nvidiaSATJobs() []satJob {
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
}
|
||||
return withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
satJob{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
satJob{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
)
|
||||
}
|
||||
|
||||
func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
@@ -562,12 +600,12 @@ func nvidiaDCGMJobs(diagLevel int, gpuIndices []int) []satJob {
|
||||
}
|
||||
diagArgs = append(diagArgs, "-i", strings.Join(ids, ","))
|
||||
}
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||
}
|
||||
return withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
satJob{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
satJob{name: "04-dcgmi-diag.log", cmd: diagArgs},
|
||||
)
|
||||
}
|
||||
|
||||
func nvidiaDCGMNamedDiagCommand(name string, durationSec int, gpuIndices []int) []string {
|
||||
@@ -592,7 +630,10 @@ func nvidiaVisibleDevicesEnv(gpuIndices []int) []string {
|
||||
if len(gpuIndices) == 0 {
|
||||
return nil
|
||||
}
|
||||
return []string{"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices)}
|
||||
return []string{
|
||||
"CUDA_DEVICE_ORDER=PCI_BUS_ID",
|
||||
"CUDA_VISIBLE_DEVICES=" + joinIndexList(gpuIndices),
|
||||
}
|
||||
}
|
||||
|
||||
func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []satJob, logFunc func(string)) (string, error) {
|
||||
@@ -633,6 +674,9 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, job.name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
if ctx.Err() != nil {
|
||||
return "", ctx.Err()
|
||||
}
|
||||
status, rc := classifySATResult(job.name, out, err)
|
||||
stats.Add(status)
|
||||
key := strings.TrimSuffix(strings.TrimPrefix(job.name, "0"), ".log")
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestStorageSATCommands(t *testing.T) {
|
||||
@@ -28,13 +30,19 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
|
||||
jobs := nvidiaSATJobs()
|
||||
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
if len(jobs) != 6 {
|
||||
t.Fatalf("jobs=%d want 6", len(jobs))
|
||||
}
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
||||
if got := jobs[0].cmd[0]; got != "nvidia-smi" {
|
||||
t.Fatalf("preflight command=%q want nvidia-smi", got)
|
||||
}
|
||||
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||
}
|
||||
if got := jobs[5].cmd[0]; got != "bee-gpu-burn" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||
}
|
||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
||||
if got := jobs[4].cmd[1]; got != "--output-file" {
|
||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||
}
|
||||
}
|
||||
@@ -82,7 +90,7 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||
|
||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
jobs := nvidiaSATJobs()
|
||||
got := jobs[4].cmd
|
||||
got := jobs[5].cmd
|
||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||
@@ -94,6 +102,19 @@ func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaDCGMJobsEnablePersistenceModeBeforeDiag(t *testing.T) {
|
||||
jobs := nvidiaDCGMJobs(3, []int{2, 0})
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
}
|
||||
if got := strings.Join(jobs[0].cmd, " "); got != "nvidia-smi -pm 1" {
|
||||
t.Fatalf("preflight=%q want %q", got, "nvidia-smi -pm 1")
|
||||
}
|
||||
if got := strings.Join(jobs[4].cmd, " "); got != "dcgmi diag -r 3 -i 2,0" {
|
||||
t.Fatalf("diag=%q want %q", got, "dcgmi diag -r 3 -i 2,0")
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -234,11 +255,14 @@ func TestNvidiaDCGMNamedDiagCommandUsesDurationAndSelection(t *testing.T) {
|
||||
|
||||
func TestNvidiaVisibleDevicesEnvUsesSelectedGPUs(t *testing.T) {
|
||||
env := nvidiaVisibleDevicesEnv([]int{0, 2, 4})
|
||||
if len(env) != 1 {
|
||||
t.Fatalf("env len=%d want 1 (%v)", len(env), env)
|
||||
if len(env) != 2 {
|
||||
t.Fatalf("env len=%d want 2 (%v)", len(env), env)
|
||||
}
|
||||
if env[0] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||
t.Fatalf("env[0]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[0])
|
||||
if env[0] != "CUDA_DEVICE_ORDER=PCI_BUS_ID" {
|
||||
t.Fatalf("env[0]=%q want CUDA_DEVICE_ORDER=PCI_BUS_ID", env[0])
|
||||
}
|
||||
if env[1] != "CUDA_VISIBLE_DEVICES=0,2,4" {
|
||||
t.Fatalf("env[1]=%q want CUDA_VISIBLE_DEVICES=0,2,4", env[1])
|
||||
}
|
||||
}
|
||||
|
||||
@@ -276,6 +300,37 @@ func TestEnvIntFallback(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgUsesAvailableMemory(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
|
||||
if got := memoryStressSizeArg(); got != "65536M" {
|
||||
t.Fatalf("sizeArg=%q want 65536M", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgRespectsOverride(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 96 * 1024 * 1024 * 1024 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
t.Setenv("BEE_VM_STRESS_SIZE_MB", "4096")
|
||||
|
||||
if got := memoryStressSizeArg(); got != "4096M" {
|
||||
t.Fatalf("sizeArg=%q want 4096M", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMemoryStressSizeArgFallsBackWhenFreeMemoryUnknown(t *testing.T) {
|
||||
oldFreeMemBytes := satFreeMemBytes
|
||||
satFreeMemBytes = func() int64 { return 0 }
|
||||
t.Cleanup(func() { satFreeMemBytes = oldFreeMemBytes })
|
||||
|
||||
if got := memoryStressSizeArg(); got != "80%" {
|
||||
t.Fatalf("sizeArg=%q want 80%%", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClassifySATResult(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
@@ -300,6 +355,38 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunAcceptancePackCtxReturnsContextErrorWithoutArchive(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
t.Cleanup(cancel)
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
cancel()
|
||||
close(done)
|
||||
}()
|
||||
|
||||
archive, err := runAcceptancePackCtx(ctx, dir, "cancelled-pack", []satJob{
|
||||
{name: "01-sleep.log", cmd: []string{"sh", "-c", "sleep 5"}},
|
||||
}, nil)
|
||||
<-done
|
||||
|
||||
if !errors.Is(err, context.Canceled) {
|
||||
t.Fatalf("err=%v want context.Canceled", err)
|
||||
}
|
||||
if archive != "" {
|
||||
t.Fatalf("archive=%q want empty", archive)
|
||||
}
|
||||
matches, globErr := filepath.Glob(filepath.Join(dir, "cancelled-pack-*.tar.gz"))
|
||||
if globErr != nil {
|
||||
t.Fatalf("Glob error: %v", globErr)
|
||||
}
|
||||
if len(matches) != 0 {
|
||||
t.Fatalf("archives=%v want none", matches)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseStorageDevicesSkipsUSBDisks(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -61,7 +61,9 @@ func (s *System) ServiceState(name string) string {
|
||||
}
|
||||
|
||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||
raw, err := exec.Command("systemctl", string(action), name).CombinedOutput()
|
||||
// bee-web runs as the bee user; sudo is required to control system services.
|
||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||
raw, err := exec.Command("sudo", "systemctl", string(action), name).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ type RuntimeHealth struct {
|
||||
ExportDir string `json:"export_dir,omitempty"`
|
||||
DriverReady bool `json:"driver_ready,omitempty"`
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
@@ -21,13 +22,238 @@ import (
|
||||
)
|
||||
|
||||
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
||||
var apiListNvidiaGPUs = func(a *app.App) ([]platform.NvidiaGPU, error) {
|
||||
if a == nil {
|
||||
return nil, fmt.Errorf("app not configured")
|
||||
}
|
||||
return a.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||
|
||||
var jobCounter atomic.Uint64
|
||||
|
||||
func newJobID(prefix string) string {
|
||||
return fmt.Sprintf("%s-%d", prefix, jobCounter.Add(1))
|
||||
func newJobID(_ string) string {
|
||||
start := int((jobCounter.Add(1) - 1) % 1000)
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
for offset := 0; offset < 1000; offset++ {
|
||||
n := (start + offset) % 1000
|
||||
id := fmt.Sprintf("TASK-%03d", n)
|
||||
if !taskIDInUseLocked(id) {
|
||||
return id
|
||||
}
|
||||
}
|
||||
return fmt.Sprintf("TASK-%03d", start)
|
||||
}
|
||||
|
||||
func taskIDInUseLocked(id string) bool {
|
||||
for _, t := range globalQueue.tasks {
|
||||
if t != nil && t.ID == id {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
type taskRunResponse struct {
|
||||
TaskID string `json:"task_id,omitempty"`
|
||||
JobID string `json:"job_id,omitempty"`
|
||||
TaskIDs []string `json:"task_ids,omitempty"`
|
||||
JobIDs []string `json:"job_ids,omitempty"`
|
||||
TaskCount int `json:"task_count,omitempty"`
|
||||
}
|
||||
|
||||
type nvidiaTaskSelection struct {
|
||||
GPUIndices []int
|
||||
Label string
|
||||
}
|
||||
|
||||
func writeTaskRunResponse(w http.ResponseWriter, tasks []*Task) {
|
||||
if len(tasks) == 0 {
|
||||
writeJSON(w, taskRunResponse{})
|
||||
return
|
||||
}
|
||||
ids := make([]string, 0, len(tasks))
|
||||
for _, t := range tasks {
|
||||
if t == nil || strings.TrimSpace(t.ID) == "" {
|
||||
continue
|
||||
}
|
||||
ids = append(ids, t.ID)
|
||||
}
|
||||
resp := taskRunResponse{TaskCount: len(ids)}
|
||||
if len(ids) > 0 {
|
||||
resp.TaskID = ids[0]
|
||||
resp.JobID = ids[0]
|
||||
resp.TaskIDs = ids
|
||||
resp.JobIDs = ids
|
||||
}
|
||||
writeJSON(w, resp)
|
||||
}
|
||||
|
||||
func shouldSplitHomogeneousNvidiaTarget(target string) bool {
|
||||
switch strings.TrimSpace(target) {
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-benchmark", "nvidia-compute",
|
||||
"nvidia-targeted-power", "nvidia-pulse", "nvidia-interconnect",
|
||||
"nvidia-bandwidth", "nvidia-stress":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func expandHomogeneousNvidiaSelections(gpus []platform.NvidiaGPU, include, exclude []int) ([]nvidiaTaskSelection, error) {
|
||||
if len(gpus) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs detected")
|
||||
}
|
||||
indexed := make(map[int]platform.NvidiaGPU, len(gpus))
|
||||
allIndices := make([]int, 0, len(gpus))
|
||||
for _, gpu := range gpus {
|
||||
indexed[gpu.Index] = gpu
|
||||
allIndices = append(allIndices, gpu.Index)
|
||||
}
|
||||
sort.Ints(allIndices)
|
||||
|
||||
selected := allIndices
|
||||
if len(include) > 0 {
|
||||
selected = make([]int, 0, len(include))
|
||||
seen := make(map[int]struct{}, len(include))
|
||||
for _, idx := range include {
|
||||
if _, ok := indexed[idx]; !ok {
|
||||
continue
|
||||
}
|
||||
if _, dup := seen[idx]; dup {
|
||||
continue
|
||||
}
|
||||
seen[idx] = struct{}{}
|
||||
selected = append(selected, idx)
|
||||
}
|
||||
sort.Ints(selected)
|
||||
}
|
||||
if len(exclude) > 0 {
|
||||
skip := make(map[int]struct{}, len(exclude))
|
||||
for _, idx := range exclude {
|
||||
skip[idx] = struct{}{}
|
||||
}
|
||||
filtered := selected[:0]
|
||||
for _, idx := range selected {
|
||||
if _, ok := skip[idx]; ok {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, idx)
|
||||
}
|
||||
selected = filtered
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs selected")
|
||||
}
|
||||
|
||||
modelGroups := make(map[string][]platform.NvidiaGPU)
|
||||
modelOrder := make([]string, 0)
|
||||
for _, idx := range selected {
|
||||
gpu := indexed[idx]
|
||||
model := strings.TrimSpace(gpu.Name)
|
||||
if model == "" {
|
||||
model = fmt.Sprintf("GPU %d", gpu.Index)
|
||||
}
|
||||
if _, ok := modelGroups[model]; !ok {
|
||||
modelOrder = append(modelOrder, model)
|
||||
}
|
||||
modelGroups[model] = append(modelGroups[model], gpu)
|
||||
}
|
||||
sort.Slice(modelOrder, func(i, j int) bool {
|
||||
left := modelGroups[modelOrder[i]]
|
||||
right := modelGroups[modelOrder[j]]
|
||||
if len(left) == 0 || len(right) == 0 {
|
||||
return modelOrder[i] < modelOrder[j]
|
||||
}
|
||||
return left[0].Index < right[0].Index
|
||||
})
|
||||
|
||||
var groups []nvidiaTaskSelection
|
||||
var singles []nvidiaTaskSelection
|
||||
for _, model := range modelOrder {
|
||||
group := modelGroups[model]
|
||||
sort.Slice(group, func(i, j int) bool { return group[i].Index < group[j].Index })
|
||||
indices := make([]int, 0, len(group))
|
||||
for _, gpu := range group {
|
||||
indices = append(indices, gpu.Index)
|
||||
}
|
||||
if len(indices) >= 2 {
|
||||
groups = append(groups, nvidiaTaskSelection{
|
||||
GPUIndices: indices,
|
||||
Label: fmt.Sprintf("%s; GPUs %s", model, joinTaskIndices(indices)),
|
||||
})
|
||||
continue
|
||||
}
|
||||
gpu := group[0]
|
||||
singles = append(singles, nvidiaTaskSelection{
|
||||
GPUIndices: []int{gpu.Index},
|
||||
Label: fmt.Sprintf("GPU %d — %s", gpu.Index, model),
|
||||
})
|
||||
}
|
||||
return append(groups, singles...), nil
|
||||
}
|
||||
|
||||
func joinTaskIndices(indices []int) string {
|
||||
parts := make([]string, 0, len(indices))
|
||||
for _, idx := range indices {
|
||||
parts = append(parts, fmt.Sprintf("%d", idx))
|
||||
}
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
|
||||
func formatSplitTaskName(baseName, selectionLabel string) string {
|
||||
baseName = strings.TrimSpace(baseName)
|
||||
selectionLabel = strings.TrimSpace(selectionLabel)
|
||||
if baseName == "" {
|
||||
return selectionLabel
|
||||
}
|
||||
if selectionLabel == "" {
|
||||
return baseName
|
||||
}
|
||||
return baseName + " (" + selectionLabel + ")"
|
||||
}
|
||||
|
||||
func buildNvidiaTaskSet(target string, priority int, createdAt time.Time, params taskParams, baseName string, appRef *app.App, idPrefix string) ([]*Task, error) {
|
||||
if !shouldSplitHomogeneousNvidiaTarget(target) {
|
||||
t := &Task{
|
||||
ID: newJobID(idPrefix),
|
||||
Name: baseName,
|
||||
Target: target,
|
||||
Priority: priority,
|
||||
Status: TaskPending,
|
||||
CreatedAt: createdAt,
|
||||
params: params,
|
||||
}
|
||||
return []*Task{t}, nil
|
||||
}
|
||||
gpus, err := apiListNvidiaGPUs(appRef)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
selections, err := expandHomogeneousNvidiaSelections(gpus, params.GPUIndices, params.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tasks := make([]*Task, 0, len(selections))
|
||||
for _, selection := range selections {
|
||||
taskParamsCopy := params
|
||||
taskParamsCopy.GPUIndices = append([]int(nil), selection.GPUIndices...)
|
||||
taskParamsCopy.ExcludeGPUIndices = nil
|
||||
displayName := formatSplitTaskName(baseName, selection.Label)
|
||||
taskParamsCopy.DisplayName = displayName
|
||||
tasks = append(tasks, &Task{
|
||||
ID: newJobID(idPrefix),
|
||||
Name: displayName,
|
||||
Target: target,
|
||||
Priority: priority,
|
||||
Status: TaskPending,
|
||||
CreatedAt: createdAt,
|
||||
params: taskParamsCopy,
|
||||
})
|
||||
}
|
||||
return tasks, nil
|
||||
}
|
||||
|
||||
// ── SSE helpers ───────────────────────────────────────────────────────────────
|
||||
@@ -207,28 +433,28 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
}
|
||||
|
||||
name := taskDisplayName(target, body.Profile, body.Loader)
|
||||
t := &Task{
|
||||
ID: newJobID("sat-" + target),
|
||||
Name: name,
|
||||
Target: target,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
},
|
||||
}
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
t.Name = body.DisplayName
|
||||
name = body.DisplayName
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
}
|
||||
tasks, err := buildNvidiaTaskSet(target, 0, time.Now(), params, name, h.opts.App, "sat-"+target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
for _, t := range tasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, tasks)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -257,27 +483,26 @@ func (h *handler) handleAPIBenchmarkNvidiaRun(w http.ResponseWriter, r *http.Req
|
||||
if body.RunNCCL != nil {
|
||||
runNCCL = *body.RunNCCL
|
||||
}
|
||||
t := &Task{
|
||||
ID: newJobID("benchmark-nvidia"),
|
||||
Name: taskDisplayName("nvidia-benchmark", "", ""),
|
||||
Target: "nvidia-benchmark",
|
||||
Priority: 15,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
DisplayName: body.DisplayName,
|
||||
},
|
||||
}
|
||||
name := taskDisplayName("nvidia-benchmark", "", "")
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
t.Name = body.DisplayName
|
||||
name = body.DisplayName
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
tasks, err := buildNvidiaTaskSet("nvidia-benchmark", 15, time.Now(), taskParams{
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
SizeMB: body.SizeMB,
|
||||
BenchmarkProfile: body.Profile,
|
||||
RunNCCL: runNCCL,
|
||||
DisplayName: body.DisplayName,
|
||||
}, name, h.opts.App, "benchmark-nvidia")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusBadRequest, err.Error())
|
||||
return
|
||||
}
|
||||
for _, t := range tasks {
|
||||
globalQueue.enqueue(t)
|
||||
}
|
||||
writeTaskRunResponse(w, tasks)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISATStream(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -383,11 +608,13 @@ func (h *handler) handleAPIServicesAction(w http.ResponseWriter, r *http.Request
|
||||
return
|
||||
}
|
||||
result, err := h.opts.App.ServiceActionResult(req.Name, action)
|
||||
status := "ok"
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
status = "error"
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "output": result.Body})
|
||||
// Always return 200 with output so the frontend can display the actual
|
||||
// systemctl error message instead of a generic "exit status 1".
|
||||
writeJSON(w, map[string]string{"status": status, "output": result.Body})
|
||||
}
|
||||
|
||||
// ── Network ───────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
@@ -74,6 +75,14 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 3, Name: "NVIDIA H100 PCIe"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[1,3],"run_nccl":false}`))
|
||||
@@ -101,6 +110,97 @@ func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunSplitsMixedGPUModels(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/benchmark/nvidia/run", strings.NewReader(`{"profile":"standard","gpu_indices":[0,1,2],"run_nccl":false}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPIBenchmarkNvidiaRun(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
var resp taskRunResponse
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &resp); err != nil {
|
||||
t.Fatalf("decode response: %v", err)
|
||||
}
|
||||
if len(resp.TaskIDs) != 2 {
|
||||
t.Fatalf("task_ids=%v want 2 items", resp.TaskIDs)
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunSplitsMixedNvidiaTaskSet(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
prevList := apiListNvidiaGPUs
|
||||
apiListNvidiaGPUs = func(_ *app.App) ([]platform.NvidiaGPU, error) {
|
||||
return []platform.NvidiaGPU{
|
||||
{Index: 0, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 1, Name: "NVIDIA H100 PCIe"},
|
||||
{Index: 2, Name: "NVIDIA H200 NVL"},
|
||||
}, nil
|
||||
}
|
||||
t.Cleanup(func() { apiListNvidiaGPUs = prevList })
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/sat/nvidia-targeted-power/run", strings.NewReader(`{"profile":"acceptance","gpu_indices":[0,1,2]}`))
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPISATRun("nvidia-targeted-power").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.GPUIndices; len(got) != 2 || got[0] != 0 || got[1] != 1 {
|
||||
t.Fatalf("task[0] gpu indices=%v want [0 1]", got)
|
||||
}
|
||||
if got := globalQueue.tasks[1].params.GPUIndices; len(got) != 1 || got[0] != 2 {
|
||||
t.Fatalf("task[1] gpu indices=%v want [2]", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||
h := &handler{}
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
@@ -52,6 +53,12 @@ var metricChartPalette = []string{
|
||||
"#ffbe5c",
|
||||
}
|
||||
|
||||
var gpuLabelCache struct {
|
||||
mu sync.Mutex
|
||||
loadedAt time.Time
|
||||
byIndex map[int]string
|
||||
}
|
||||
|
||||
func renderMetricChartSVG(title string, labels []string, times []time.Time, datasets [][]float64, names []string, yMin, yMax *float64, canvasHeight int, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
@@ -76,15 +83,7 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
|
||||
}
|
||||
}
|
||||
|
||||
mn, avg, mx := globalStats(datasets)
|
||||
if mx > 0 {
|
||||
title = fmt.Sprintf("%s ↓%s ~%s ↑%s",
|
||||
title,
|
||||
chartLegendNumber(mn),
|
||||
chartLegendNumber(avg),
|
||||
chartLegendNumber(mx),
|
||||
)
|
||||
}
|
||||
statsLabel := chartStatsLabel(datasets)
|
||||
|
||||
legendItems := []metricChartSeries{}
|
||||
for i, name := range names {
|
||||
@@ -106,7 +105,7 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, layout.Width, layout.Height)
|
||||
writeChartFrame(&b, title, layout.Width, layout.Height)
|
||||
writeChartFrame(&b, title, statsLabel, layout.Width, layout.Height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scale)
|
||||
@@ -126,21 +125,19 @@ func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, tim
|
||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
coreClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||
memClock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||
if temp == nil && power == nil && coreClock == nil && memClock == nil {
|
||||
if temp == nil && power == nil && coreClock == nil {
|
||||
return nil, false, nil
|
||||
}
|
||||
labels := sampleTimeLabels(samples)
|
||||
times := sampleTimes(samples)
|
||||
svg, err := drawGPUOverviewChartSVG(
|
||||
fmt.Sprintf("GPU %d Overview", idx),
|
||||
gpuDisplayLabel(idx)+" Overview",
|
||||
labels,
|
||||
times,
|
||||
[]metricChartSeries{
|
||||
{Name: "Temp C", Values: coalesceDataset(temp, len(labels)), Color: "#f05a5a", AxisTitle: "Temp C"},
|
||||
{Name: "Power W", Values: coalesceDataset(power, len(labels)), Color: "#ffb357", AxisTitle: "Power W"},
|
||||
{Name: "Core Clock MHz", Values: coalesceDataset(coreClock, len(labels)), Color: "#73bf69", AxisTitle: "Core MHz"},
|
||||
{Name: "Memory Clock MHz", Values: coalesceDataset(memClock, len(labels)), Color: "#5794f2", AxisTitle: "Memory MHz"},
|
||||
},
|
||||
timeline,
|
||||
)
|
||||
@@ -151,8 +148,8 @@ func renderGPUOverviewChartSVG(idx int, samples []platform.LiveMetricSample, tim
|
||||
}
|
||||
|
||||
func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, series []metricChartSeries, timeline []chartTimelineSegment) ([]byte, error) {
|
||||
if len(series) != 4 {
|
||||
return nil, fmt.Errorf("gpu overview requires 4 series, got %d", len(series))
|
||||
if len(series) != 3 {
|
||||
return nil, fmt.Errorf("gpu overview requires 3 series, got %d", len(series))
|
||||
}
|
||||
const (
|
||||
width = 1400
|
||||
@@ -166,7 +163,6 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
|
||||
leftOuterAxis = 72
|
||||
leftInnerAxis = 132
|
||||
rightInnerAxis = 1268
|
||||
rightOuterAxis = 1328
|
||||
)
|
||||
layout := chartLayout{
|
||||
Width: width,
|
||||
@@ -176,7 +172,7 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
|
||||
PlotTop: plotTop,
|
||||
PlotBottom: plotBottom,
|
||||
}
|
||||
axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis, rightOuterAxis}
|
||||
axisX := []int{leftOuterAxis, leftInnerAxis, rightInnerAxis}
|
||||
pointCount := len(labels)
|
||||
if len(times) > pointCount {
|
||||
pointCount = len(times)
|
||||
@@ -214,7 +210,7 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
|
||||
|
||||
var b strings.Builder
|
||||
writeSVGOpen(&b, width, height)
|
||||
writeChartFrame(&b, title, width, height)
|
||||
writeChartFrame(&b, title, "", width, height)
|
||||
writeTimelineIdleSpans(&b, layout, start, end, timeline)
|
||||
writeVerticalGrid(&b, layout, times, pointCount, 8)
|
||||
writeHorizontalGrid(&b, layout, scales[0])
|
||||
@@ -457,10 +453,14 @@ func writeSVGClose(b *strings.Builder) {
|
||||
b.WriteString("</svg>\n")
|
||||
}
|
||||
|
||||
func writeChartFrame(b *strings.Builder, title string, width, height int) {
|
||||
func writeChartFrame(b *strings.Builder, title, subtitle string, width, height int) {
|
||||
fmt.Fprintf(b, `<rect width="%d" height="%d" rx="10" ry="10" fill="#ffffff" stroke="#d7e0ea"/>`+"\n", width, height)
|
||||
fmt.Fprintf(b, `<text x="%d" y="30" text-anchor="middle" font-family="sans-serif" font-size="16" font-weight="700" fill="#1f2937">%s</text>`+"\n",
|
||||
width/2, sanitizeChartText(title))
|
||||
if strings.TrimSpace(subtitle) != "" {
|
||||
fmt.Fprintf(b, `<text x="%d" y="50" text-anchor="middle" font-family="sans-serif" font-size="12" font-weight="600" fill="#64748b">%s</text>`+"\n",
|
||||
width/2, sanitizeChartText(subtitle))
|
||||
}
|
||||
}
|
||||
|
||||
func writePlotBorder(b *strings.Builder, layout chartLayout) {
|
||||
@@ -545,7 +545,21 @@ func writeSeriesPolyline(b *strings.Builder, layout chartLayout, times []time.Ti
|
||||
x := chartXForTime(chartPointTime(times, 0), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(values[0], scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="3.5" fill="%s"/>`+"\n", x, y, color)
|
||||
return
|
||||
}
|
||||
peakIdx := 0
|
||||
peakValue := values[0]
|
||||
for idx, value := range values[1:] {
|
||||
if value >= peakValue {
|
||||
peakIdx = idx + 1
|
||||
peakValue = value
|
||||
}
|
||||
}
|
||||
x := chartXForTime(chartPointTime(times, peakIdx), start, end, layout.PlotLeft, layout.PlotRight)
|
||||
y := chartYForValue(peakValue, scale, layout.PlotTop, layout.PlotBottom)
|
||||
fmt.Fprintf(b, `<circle cx="%.1f" cy="%.1f" r="4.2" fill="%s" stroke="#ffffff" stroke-width="1.6"/>`+"\n", x, y, color)
|
||||
fmt.Fprintf(b, `<path d="M %.1f %.1f L %.1f %.1f L %.1f %.1f Z" fill="%s" opacity="0.9"/>`+"\n",
|
||||
x, y-10, x-5, y-18, x+5, y-18, color)
|
||||
}
|
||||
|
||||
func writeLegend(b *strings.Builder, layout chartLayout, series []metricChartSeries) {
|
||||
@@ -711,3 +725,49 @@ func valueClamp(value float64, scale chartScale) float64 {
|
||||
}
|
||||
return value
|
||||
}
|
||||
|
||||
func chartStatsLabel(datasets [][]float64) string {
|
||||
mn, avg, mx := globalStats(datasets)
|
||||
if mx <= 0 && avg <= 0 && mn <= 0 {
|
||||
return ""
|
||||
}
|
||||
return fmt.Sprintf("min %s avg %s max %s",
|
||||
chartLegendNumber(mn),
|
||||
chartLegendNumber(avg),
|
||||
chartLegendNumber(mx),
|
||||
)
|
||||
}
|
||||
|
||||
func gpuDisplayLabel(idx int) string {
|
||||
if name := gpuModelNameByIndex(idx); name != "" {
|
||||
return fmt.Sprintf("GPU %d — %s", idx, name)
|
||||
}
|
||||
return fmt.Sprintf("GPU %d", idx)
|
||||
}
|
||||
|
||||
func gpuModelNameByIndex(idx int) string {
|
||||
now := time.Now()
|
||||
gpuLabelCache.mu.Lock()
|
||||
if now.Sub(gpuLabelCache.loadedAt) > 30*time.Second || gpuLabelCache.byIndex == nil {
|
||||
gpuLabelCache.loadedAt = now
|
||||
gpuLabelCache.byIndex = loadGPUModelNames()
|
||||
}
|
||||
name := strings.TrimSpace(gpuLabelCache.byIndex[idx])
|
||||
gpuLabelCache.mu.Unlock()
|
||||
return name
|
||||
}
|
||||
|
||||
func loadGPUModelNames() map[int]string {
|
||||
out := map[int]string{}
|
||||
gpus, err := platform.New().ListNvidiaGPUs()
|
||||
if err != nil {
|
||||
return out
|
||||
}
|
||||
for _, gpu := range gpus {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name != "" {
|
||||
out[gpu.Index] = name
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
@@ -9,13 +9,14 @@ import (
|
||||
|
||||
// jobState holds the output lines and completion status of an async job.
|
||||
type jobState struct {
|
||||
lines []string
|
||||
done bool
|
||||
err string
|
||||
mu sync.Mutex
|
||||
subs []chan string
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
lines []string
|
||||
done bool
|
||||
err string
|
||||
mu sync.Mutex
|
||||
subs []chan string
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
serialPrefix string
|
||||
}
|
||||
|
||||
// abort cancels the job if it has a cancel function and is not yet done.
|
||||
@@ -36,6 +37,9 @@ func (j *jobState) append(line string) {
|
||||
if j.logPath != "" {
|
||||
appendJobLog(j.logPath, line)
|
||||
}
|
||||
if j.serialPrefix != "" {
|
||||
taskSerialWriteLine(j.serialPrefix + line)
|
||||
}
|
||||
for _, ch := range j.subs {
|
||||
select {
|
||||
case ch <- line:
|
||||
@@ -107,8 +111,11 @@ func (m *jobManager) get(id string) (*jobState, bool) {
|
||||
return j, ok
|
||||
}
|
||||
|
||||
func newTaskJobState(logPath string) *jobState {
|
||||
func newTaskJobState(logPath string, serialPrefix ...string) *jobState {
|
||||
j := &jobState{logPath: logPath}
|
||||
if len(serialPrefix) > 0 {
|
||||
j.serialPrefix = serialPrefix[0]
|
||||
}
|
||||
if logPath == "" {
|
||||
return j
|
||||
}
|
||||
|
||||
@@ -22,6 +22,13 @@ type MetricsDB struct {
|
||||
db *sql.DB
|
||||
}
|
||||
|
||||
func (m *MetricsDB) Close() error {
|
||||
if m == nil || m.db == nil {
|
||||
return nil
|
||||
}
|
||||
return m.db.Close()
|
||||
}
|
||||
|
||||
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
@@ -164,6 +171,23 @@ func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||
}
|
||||
|
||||
// LoadBetween returns samples in chronological order within the given time window.
|
||||
func (m *MetricsDB) LoadBetween(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||
if m == nil {
|
||||
return nil, nil
|
||||
}
|
||||
if start.IsZero() || end.IsZero() {
|
||||
return nil, nil
|
||||
}
|
||||
if end.Before(start) {
|
||||
start, end = end, start
|
||||
}
|
||||
return m.loadSamples(
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics WHERE ts>=? AND ts<=? ORDER BY ts`,
|
||||
start.Unix(), end.Unix(),
|
||||
)
|
||||
}
|
||||
|
||||
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
||||
rows, err := m.db.Query(query, args...)
|
||||
@@ -364,9 +388,6 @@ func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
return cw.Error()
|
||||
}
|
||||
|
||||
// Close closes the database.
|
||||
func (m *MetricsDB) Close() { _ = m.db.Close() }
|
||||
|
||||
func nullFloat(v float64) sql.NullFloat64 {
|
||||
return sql.NullFloat64{Float64: v, Valid: true}
|
||||
}
|
||||
|
||||
@@ -143,3 +143,32 @@ CREATE TABLE temp_metrics (
|
||||
t.Fatalf("MemClockMHz=%v want 2600", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsDBLoadBetweenFiltersWindow(t *testing.T) {
|
||||
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
base := time.Unix(1_700_000_000, 0).UTC()
|
||||
for i := 0; i < 5; i++ {
|
||||
if err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base.Add(time.Duration(i) * time.Minute),
|
||||
CPULoadPct: float64(i),
|
||||
}); err != nil {
|
||||
t.Fatalf("Write(%d): %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
got, err := db.LoadBetween(base.Add(1*time.Minute), base.Add(3*time.Minute))
|
||||
if err != nil {
|
||||
t.Fatalf("LoadBetween: %v", err)
|
||||
}
|
||||
if len(got) != 3 {
|
||||
t.Fatalf("LoadBetween len=%d want 3", len(got))
|
||||
}
|
||||
if !got[0].Timestamp.Equal(base.Add(1*time.Minute)) || !got[2].Timestamp.Equal(base.Add(3*time.Minute)) {
|
||||
t.Fatalf("window=%v..%v", got[0].Timestamp, got[2].Timestamp)
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
41
audit/internal/webui/serial_console.go
Normal file
41
audit/internal/webui/serial_console.go
Normal file
@@ -0,0 +1,41 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var taskSerialWriteLine = writeTaskSerialLine
|
||||
|
||||
func writeTaskSerialLine(line string) {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
return
|
||||
}
|
||||
payload := fmt.Sprintf("%s %s\n", time.Now().UTC().Format("2006-01-02 15:04:05Z"), line)
|
||||
for _, path := range []string{"/dev/ttyS0", "/dev/ttyS1", "/dev/console"} {
|
||||
f, err := os.OpenFile(path, os.O_WRONLY|os.O_APPEND, 0)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
_, _ = f.WriteString(payload)
|
||||
_ = f.Close()
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
func taskSerialPrefix(t *Task) string {
|
||||
if t == nil {
|
||||
return "[task] "
|
||||
}
|
||||
return fmt.Sprintf("[task %s %s] ", t.ID, t.Name)
|
||||
}
|
||||
|
||||
func taskSerialEvent(t *Task, event string) {
|
||||
if t == nil {
|
||||
return
|
||||
}
|
||||
taskSerialWriteLine(fmt.Sprintf("%s%s", taskSerialPrefix(t), strings.TrimSpace(event)))
|
||||
}
|
||||
@@ -221,6 +221,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// ── Infrastructure ──────────────────────────────────────────────────────
|
||||
mux.HandleFunc("GET /healthz", h.handleHealthz)
|
||||
mux.HandleFunc("GET /api/ready", h.handleReady)
|
||||
mux.HandleFunc("GET /loading", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
_, _ = w.Write([]byte(loadingPageHTML))
|
||||
})
|
||||
|
||||
// ── Existing read-only endpoints (preserved for compatibility) ──────────
|
||||
mux.HandleFunc("GET /audit.json", h.handleAuditJSON)
|
||||
@@ -265,6 +270,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
||||
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
||||
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
||||
mux.HandleFunc("GET /api/tasks/{id}/charts", h.handleAPITaskChartsIndex)
|
||||
mux.HandleFunc("GET /api/tasks/{id}/chart/", h.handleAPITaskChartSVG)
|
||||
mux.HandleFunc("GET /tasks/{id}", h.handleTaskPage)
|
||||
|
||||
// Services
|
||||
mux.HandleFunc("GET /api/services", h.handleAPIServicesList)
|
||||
@@ -703,7 +711,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
}
|
||||
switch sub {
|
||||
case "load":
|
||||
title = fmt.Sprintf("GPU %d Load", idx)
|
||||
title = gpuDisplayLabel(idx) + " Load"
|
||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||
if util == nil && mem == nil {
|
||||
@@ -714,7 +722,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
case "temp":
|
||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
||||
title = gpuDisplayLabel(idx) + " Temperature"
|
||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
if temp == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
@@ -724,7 +732,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(temp)
|
||||
case "clock":
|
||||
title = fmt.Sprintf("GPU %d Core Clock", idx)
|
||||
title = gpuDisplayLabel(idx) + " Core Clock"
|
||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.ClockMHz })
|
||||
if clock == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
@@ -733,7 +741,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
names = []string{"Core Clock MHz"}
|
||||
yMin, yMax = autoBounds120(clock)
|
||||
case "memclock":
|
||||
title = fmt.Sprintf("GPU %d Memory Clock", idx)
|
||||
title = gpuDisplayLabel(idx) + " Memory Clock"
|
||||
clock := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemClockMHz })
|
||||
if clock == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
@@ -742,7 +750,7 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
names = []string{"Memory Clock MHz"}
|
||||
yMin, yMax = autoBounds120(clock)
|
||||
default:
|
||||
title = fmt.Sprintf("GPU %d Power", idx)
|
||||
title = gpuDisplayLabel(idx) + " Power"
|
||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
if power == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
@@ -871,7 +879,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
|
||||
continue
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
names = append(names, gpuDisplayLabel(idx))
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
@@ -1206,37 +1214,106 @@ const loadingPageHTML = `<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>EASY-BEE</title>
|
||||
<title>EASY-BEE — Starting</title>
|
||||
<style>
|
||||
*{margin:0;padding:0;box-sizing:border-box}
|
||||
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
||||
.logo{font-size:13px;line-height:1.4;color:#f6c90e;margin-bottom:48px;white-space:pre}
|
||||
.spinner{width:48px;height:48px;border:4px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 24px}
|
||||
.wrap{text-align:center;width:420px}
|
||||
.logo{font-size:11px;line-height:1.4;color:#f6c90e;margin-bottom:6px;white-space:pre;text-align:left}
|
||||
.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px;padding-left:2px}
|
||||
.spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
|
||||
.spinner.hidden{display:none}
|
||||
@keyframes spin{to{transform:rotate(360deg)}}
|
||||
.status{font-size:14px;color:#a0aec0;letter-spacing:.05em}
|
||||
.status{font-size:13px;color:#a0aec0;margin-bottom:20px;min-height:18px}
|
||||
table{width:100%;border-collapse:collapse;font-size:12px;margin-bottom:20px;display:none}
|
||||
td{padding:3px 6px;text-align:left}
|
||||
td:first-child{color:#718096;width:55%}
|
||||
.ok{color:#68d391}
|
||||
.run{color:#f6c90e}
|
||||
.fail{color:#fc8181}
|
||||
.dim{color:#4a5568}
|
||||
.btn{background:#1a202c;color:#a0aec0;border:1px solid #2d3748;padding:7px 18px;font-size:12px;cursor:pointer;font-family:inherit;display:none}
|
||||
.btn:hover{border-color:#718096;color:#e2e8f0}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div style="text-align:center">
|
||||
<div class="wrap">
|
||||
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
||||
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
||||
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
||||
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
||||
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
||||
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
||||
<div class="spinner"></div>
|
||||
<div class="status" id="s">Starting up...</div>
|
||||
<div class="subtitle">Hardware Audit LiveCD</div>
|
||||
<div class="spinner" id="spin"></div>
|
||||
<div class="status" id="st">Connecting to bee-web...</div>
|
||||
<table id="tbl"></table>
|
||||
<button class="btn" id="btn" onclick="go()">Open app now</button>
|
||||
</div>
|
||||
<script>
|
||||
function probe(){
|
||||
fetch('/api/ready',{cache:'no-store'})
|
||||
.then(function(r){
|
||||
if(r.ok){window.location.replace('/');}
|
||||
else{setTimeout(probe,1000);}
|
||||
(function(){
|
||||
var gone = false;
|
||||
function go(){ if(!gone){gone=true;window.location.replace('/');} }
|
||||
|
||||
function icon(s){
|
||||
if(s==='active') return '<span class="ok">● active</span>';
|
||||
if(s==='failed') return '<span class="fail">✕ failed</span>';
|
||||
if(s==='activating'||s==='reloading') return '<span class="run">○ starting</span>';
|
||||
if(s==='inactive') return '<span class="dim">○ inactive</span>';
|
||||
return '<span class="dim">'+s+'</span>';
|
||||
}
|
||||
|
||||
function allSettled(svcs){
|
||||
for(var i=0;i<svcs.length;i++){
|
||||
var s=svcs[i].state;
|
||||
if(s!=='active'&&s!=='failed'&&s!=='inactive') return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
var pollTimer=null;
|
||||
|
||||
function pollServices(){
|
||||
fetch('/api/services',{cache:'no-store'})
|
||||
.then(function(r){return r.json();})
|
||||
.then(function(svcs){
|
||||
if(!svcs||!svcs.length) return;
|
||||
var tbl=document.getElementById('tbl');
|
||||
tbl.style.display='';
|
||||
var html='';
|
||||
for(var i=0;i<svcs.length;i++)
|
||||
html+='<tr><td>'+svcs[i].name+'</td><td>'+icon(svcs[i].state)+'</td></tr>';
|
||||
tbl.innerHTML=html;
|
||||
if(allSettled(svcs)){
|
||||
clearInterval(pollTimer);
|
||||
document.getElementById('spin').className='spinner hidden';
|
||||
document.getElementById('st').textContent='Ready \u2014 opening...';
|
||||
setTimeout(go,800);
|
||||
}
|
||||
})
|
||||
.catch(function(){setTimeout(probe,1000);});
|
||||
.catch(function(){});
|
||||
}
|
||||
|
||||
function probe(){
|
||||
fetch('/healthz',{cache:'no-store'})
|
||||
.then(function(r){
|
||||
if(r.ok){
|
||||
document.getElementById('st').textContent='bee-web running \u2014 checking services...';
|
||||
document.getElementById('btn').style.display='';
|
||||
pollServices();
|
||||
pollTimer=setInterval(pollServices,1500);
|
||||
} else {
|
||||
document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
|
||||
setTimeout(probe,500);
|
||||
}
|
||||
})
|
||||
.catch(function(){
|
||||
document.getElementById('st').textContent='Waiting for bee-web to start...';
|
||||
setTimeout(probe,500);
|
||||
});
|
||||
}
|
||||
probe();
|
||||
})();
|
||||
</script>
|
||||
</body>
|
||||
</html>`
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
@@ -184,15 +185,15 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
||||
{
|
||||
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, ClockMHz: 1400, MemClockMHz: 2600},
|
||||
{GPUIndex: 3, ClockMHz: 1500, MemClockMHz: 2800},
|
||||
{GPUIndex: 0, ClockMHz: 1400},
|
||||
{GPUIndex: 3, ClockMHz: 1500},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, ClockMHz: 1410, MemClockMHz: 2610},
|
||||
{GPUIndex: 3, ClockMHz: 1510, MemClockMHz: 2810},
|
||||
{GPUIndex: 0, ClockMHz: 1410},
|
||||
{GPUIndex: 3, ClockMHz: 1510},
|
||||
},
|
||||
},
|
||||
}
|
||||
@@ -210,20 +211,6 @@ func TestChartDataFromSamplesIncludesGPUClockCharts(t *testing.T) {
|
||||
if got := datasets[1][1]; got != 1510 {
|
||||
t.Fatalf("GPU 3 core clock=%v want 1510", got)
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, ok = chartDataFromSamples("gpu-all-memclock", samples)
|
||||
if !ok {
|
||||
t.Fatal("gpu-all-memclock returned ok=false")
|
||||
}
|
||||
if title != "GPU Memory Clock" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
if len(names) != 2 || names[0] != "GPU 0" || names[1] != "GPU 3" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
if got := datasets[0][0]; got != 2600 {
|
||||
t.Fatalf("GPU 0 memory clock=%v want 2600", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||
@@ -256,10 +243,10 @@ func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
||||
if !strings.Contains(body, `/api/metrics/chart/gpu-all-clock.svg`) {
|
||||
t.Fatalf("metrics page should include GPU core clock chart: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
|
||||
t.Fatalf("metrics page should include GPU memory clock chart: %s", body)
|
||||
if strings.Contains(body, `/api/metrics/chart/gpu-all-memclock.svg`) {
|
||||
t.Fatalf("metrics page should not include GPU memory clock chart: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `renderGPUOverviewCards(indices)`) {
|
||||
if !strings.Contains(body, `renderGPUOverviewCards(indices, names)`) {
|
||||
t.Fatalf("metrics page should build per-GPU chart cards dynamically: %s", body)
|
||||
}
|
||||
}
|
||||
@@ -585,7 +572,7 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
||||
func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||
@@ -593,8 +580,8 @@ func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `id="task-log-overlay"`) {
|
||||
t.Fatalf("tasks page missing log modal overlay: %s", body)
|
||||
if !strings.Contains(body, `Open a task to view its saved logs and charts.`) {
|
||||
t.Fatalf("tasks page missing task report hint: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `_taskPageSize = 50`) {
|
||||
t.Fatalf("tasks page missing pagination size config: %s", body)
|
||||
@@ -615,8 +602,8 @@ func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
||||
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
||||
if !strings.Contains(body, `restartGPUDrivers()`) {
|
||||
t.Fatalf("tools page missing restartGPUDrivers action: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
@@ -650,6 +637,66 @@ func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
runDir := filepath.Join(exportDir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
result := platform.NvidiaBenchmarkResult{
|
||||
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||
BenchmarkProfile: "standard",
|
||||
OverallStatus: "OK",
|
||||
GPUs: []platform.BenchmarkGPUResult{
|
||||
{
|
||||
Index: 0,
|
||||
Name: "NVIDIA H100 PCIe",
|
||||
Scores: platform.BenchmarkScorecard{
|
||||
CompositeScore: 1176.25,
|
||||
},
|
||||
},
|
||||
{
|
||||
Index: 1,
|
||||
Name: "NVIDIA H100 PCIe",
|
||||
Scores: platform.BenchmarkScorecard{
|
||||
CompositeScore: 1168.50,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
raw, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(runDir, "result.json"), raw, 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
wantTime := result.GeneratedAt.Local().Format("2006-01-02 15:04:05")
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Composite score by saved benchmark run and GPU.`,
|
||||
`NVIDIA H100 PCIe / GPU 0`,
|
||||
`NVIDIA H100 PCIe / GPU 1`,
|
||||
`#1`,
|
||||
wantTime,
|
||||
`1176.25`,
|
||||
`1168.50`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("benchmark page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
@@ -663,6 +710,8 @@ func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
`nvidia-targeted-stress`,
|
||||
`controlled NVIDIA DCGM load`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
`NVIDIA GPU Selection`,
|
||||
`id="sat-gpu-list"`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
@@ -691,37 +740,154 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTasksPageRendersScrollableLogModal(t *testing.T) {
|
||||
func TestTaskDetailPageRendersSavedReport(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
reportDir := filepath.Join(exportDir, "tasks", "task-1_cpu_sat_done")
|
||||
if err := os.MkdirAll(reportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(path, []byte(`{"collected_at":"2026-03-15T00:00:00Z"}`), 0644); err != nil {
|
||||
reportPath := filepath.Join(reportDir, "report.html")
|
||||
if err := os.WriteFile(reportPath, []byte(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">saved report</div></div>`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{
|
||||
Title: "Bee Hardware Audit",
|
||||
AuditPath: path,
|
||||
ExportDir: exportDir,
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "task-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now(),
|
||||
ArtifactsDir: reportDir,
|
||||
ReportHTMLPath: reportPath,
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit", ExportDir: exportDir})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-1", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `height:calc(100vh - 32px)`) {
|
||||
t.Fatalf("tasks page missing bounded log modal height: %s", body)
|
||||
if !strings.Contains(body, `saved report`) {
|
||||
t.Fatalf("task detail page missing saved report: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `flex:1;min-height:0;overflow:hidden`) {
|
||||
t.Fatalf("tasks page missing log modal overflow guard: %s", body)
|
||||
if !strings.Contains(body, `Back to Tasks`) {
|
||||
t.Fatalf("task detail page missing back link: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `height:100%;min-height:0;overflow:auto`) {
|
||||
t.Fatalf("tasks page missing scrollable log wrapper: %s", body)
|
||||
}
|
||||
|
||||
func TestTaskDetailPageRendersCancelForRunningTask(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "task-live-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks/task-live-1", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Cancel</button>`) {
|
||||
t.Fatalf("task detail page missing cancel button: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `function cancelTaskDetail(id)`) {
|
||||
t.Fatalf("task detail page missing cancel handler: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `/api/tasks/' + id + '/cancel`) {
|
||||
t.Fatalf("task detail page missing cancel endpoint: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="task-live-charts"`) {
|
||||
t.Fatalf("task detail page missing live charts container: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `/api/tasks/' + taskId + '/charts`) {
|
||||
t.Fatalf("task detail page missing live charts index endpoint: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskChartSVGUsesTaskTimeWindow(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
metricsPath := filepath.Join(dir, "metrics.db")
|
||||
prevMetricsPath := taskReportMetricsDBPath
|
||||
taskReportMetricsDBPath = metricsPath
|
||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||
|
||||
db, err := openMetricsDB(metricsPath)
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
base := time.Now().UTC()
|
||||
samples := []platform.LiveMetricSample{
|
||||
{Timestamp: base.Add(-3 * time.Minute), PowerW: 100},
|
||||
{Timestamp: base.Add(-2 * time.Minute), PowerW: 200},
|
||||
{Timestamp: base.Add(-1 * time.Minute), PowerW: 300},
|
||||
}
|
||||
for _, sample := range samples {
|
||||
if err := db.Write(sample); err != nil {
|
||||
t.Fatalf("Write: %v", err)
|
||||
}
|
||||
}
|
||||
_ = db.Close()
|
||||
|
||||
started := base.Add(-2*time.Minute - 5*time.Second)
|
||||
done := base.Add(-1*time.Minute + 5*time.Second)
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "task-chart-1",
|
||||
Name: "Power Window",
|
||||
Target: "cpu",
|
||||
Status: TaskDone,
|
||||
CreatedAt: started.Add(-10 * time.Second),
|
||||
StartedAt: &started,
|
||||
DoneAt: &done,
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
handler := NewHandler(HandlerOptions{Title: "Bee Hardware Audit"})
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/task-chart-1/chart/server-power.svg", nil)
|
||||
req.SetPathValue("id", "task-chart-1")
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, req)
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, "System Power") {
|
||||
t.Fatalf("task chart missing expected title: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "min 200") {
|
||||
t.Fatalf("task chart stats should start from in-window sample: %s", body)
|
||||
}
|
||||
if strings.Contains(body, "min 100") {
|
||||
t.Fatalf("task chart should not include pre-task sample in stats: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
267
audit/internal/webui/task_page.go
Normal file
267
audit/internal/webui/task_page.go
Normal file
@@ -0,0 +1,267 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (h *handler) handleTaskPage(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
task, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
snapshot := *task
|
||||
body := renderTaskDetailPage(h.opts, snapshot)
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
_, _ = w.Write([]byte(body))
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITaskChartsIndex(w http.ResponseWriter, r *http.Request) {
|
||||
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
type taskChartIndexEntry struct {
|
||||
Title string `json:"title"`
|
||||
File string `json:"file"`
|
||||
}
|
||||
entries := make([]taskChartIndexEntry, 0)
|
||||
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||
title, _, ok := renderTaskChartSVG(spec.Path, samples, taskTimelineForTask(task))
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
entries = append(entries, taskChartIndexEntry{Title: title, File: spec.File})
|
||||
}
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
_ = json.NewEncoder(w).Encode(entries)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITaskChartSVG(w http.ResponseWriter, r *http.Request) {
|
||||
task, samples, _, _, ok := h.taskSamplesForRequest(r)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
file := strings.TrimPrefix(r.URL.Path, "/api/tasks/"+task.ID+"/chart/")
|
||||
path, ok := taskChartPathFromFile(file)
|
||||
if !ok {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
title, buf, hasData := renderTaskChartSVG(path, samples, taskTimelineForTask(task))
|
||||
if !hasData || len(buf) == 0 || strings.TrimSpace(title) == "" {
|
||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "image/svg+xml")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(buf)
|
||||
}
|
||||
|
||||
func renderTaskDetailPage(opts HandlerOptions, task Task) string {
|
||||
title := task.Name
|
||||
if strings.TrimSpace(title) == "" {
|
||||
title = task.ID
|
||||
}
|
||||
var body strings.Builder
|
||||
body.WriteString(`<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">`)
|
||||
body.WriteString(`<a class="btn btn-secondary btn-sm" href="/tasks">Back to Tasks</a>`)
|
||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||
body.WriteString(`<button class="btn btn-danger btn-sm" onclick="cancelTaskDetail('` + html.EscapeString(task.ID) + `')">Cancel</button>`)
|
||||
}
|
||||
body.WriteString(`<span style="font-size:12px;color:var(--muted)">Artifacts are saved in the task folder under <code>./tasks</code>.</span>`)
|
||||
body.WriteString(`</div>`)
|
||||
|
||||
if report := loadTaskReportFragment(task); report != "" {
|
||||
body.WriteString(report)
|
||||
} else {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Task Summary</div><div class="card-body">`)
|
||||
body.WriteString(`<div style="font-size:18px;font-weight:700">` + html.EscapeString(title) + `</div>`)
|
||||
body.WriteString(`<div style="margin-top:8px">` + renderTaskStatusBadge(task.Status) + `</div>`)
|
||||
if strings.TrimSpace(task.ErrMsg) != "" {
|
||||
body.WriteString(`<div style="margin-top:8px;color:var(--crit-fg)">` + html.EscapeString(task.ErrMsg) + `</div>`)
|
||||
}
|
||||
body.WriteString(`</div></div>`)
|
||||
}
|
||||
|
||||
if task.Status == TaskRunning {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Live Charts</div><div class="card-body">`)
|
||||
body.WriteString(`<div id="task-live-charts" style="display:flex;flex-direction:column;gap:16px;color:var(--muted);font-size:13px">Loading charts...</div>`)
|
||||
body.WriteString(`</div></div>`)
|
||||
}
|
||||
|
||||
if task.Status == TaskRunning || task.Status == TaskPending {
|
||||
body.WriteString(`<div class="card"><div class="card-head">Live Logs</div><div class="card-body">`)
|
||||
body.WriteString(`<div id="task-live-log" class="terminal" style="max-height:none;white-space:pre-wrap">Connecting...</div>`)
|
||||
body.WriteString(`</div></div>`)
|
||||
body.WriteString(`<script>
|
||||
function cancelTaskDetail(id) {
|
||||
fetch('/api/tasks/' + id + '/cancel', {method:'POST'}).then(function(){
|
||||
var term = document.getElementById('task-live-log');
|
||||
if (term) {
|
||||
term.textContent += '\nCancel requested.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
}
|
||||
});
|
||||
}
|
||||
function renderTaskLiveCharts(taskId, charts) {
|
||||
const host = document.getElementById('task-live-charts');
|
||||
if (!host) return;
|
||||
if (!Array.isArray(charts) || charts.length === 0) {
|
||||
host.innerHTML = 'Waiting for metric samples...';
|
||||
return;
|
||||
}
|
||||
const seen = {};
|
||||
charts.forEach(function(chart) {
|
||||
seen[chart.file] = true;
|
||||
let img = host.querySelector('img[data-chart-file="' + chart.file + '"]');
|
||||
if (img) {
|
||||
const card = img.closest('.card');
|
||||
if (card) {
|
||||
const title = card.querySelector('.card-head');
|
||||
if (title) title.textContent = chart.title;
|
||||
}
|
||||
return;
|
||||
}
|
||||
const card = document.createElement('div');
|
||||
card.className = 'card';
|
||||
card.style.margin = '0';
|
||||
card.innerHTML = '<div class="card-head"></div><div class="card-body" style="padding:12px"></div>';
|
||||
card.querySelector('.card-head').textContent = chart.title;
|
||||
const body = card.querySelector('.card-body');
|
||||
img = document.createElement('img');
|
||||
img.setAttribute('data-task-chart', '1');
|
||||
img.setAttribute('data-chart-file', chart.file);
|
||||
img.setAttribute('data-base-src', '/api/tasks/' + taskId + '/chart/' + chart.file);
|
||||
img.src = '/api/tasks/' + taskId + '/chart/' + chart.file + '?t=' + Date.now();
|
||||
img.style.width = '100%';
|
||||
img.style.display = 'block';
|
||||
img.style.borderRadius = '6px';
|
||||
img.alt = chart.title;
|
||||
body.appendChild(img);
|
||||
host.appendChild(card);
|
||||
});
|
||||
Array.from(host.querySelectorAll('img[data-task-chart="1"]')).forEach(function(img) {
|
||||
const file = img.getAttribute('data-chart-file') || '';
|
||||
if (seen[file]) return;
|
||||
const card = img.closest('.card');
|
||||
if (card) card.remove();
|
||||
});
|
||||
}
|
||||
function loadTaskLiveCharts(taskId) {
|
||||
fetch('/api/tasks/' + taskId + '/charts').then(function(r){ return r.json(); }).then(function(charts){
|
||||
renderTaskLiveCharts(taskId, charts);
|
||||
}).catch(function(){
|
||||
const host = document.getElementById('task-live-charts');
|
||||
if (host) host.innerHTML = 'Task charts are unavailable.';
|
||||
});
|
||||
}
|
||||
function refreshTaskLiveCharts() {
|
||||
document.querySelectorAll('img[data-task-chart="1"]').forEach(function(img){
|
||||
const base = img.dataset.baseSrc;
|
||||
if (!base) return;
|
||||
img.src = base + '?t=' + Date.now();
|
||||
});
|
||||
}
|
||||
var _taskDetailES = new EventSource('/api/tasks/` + html.EscapeString(task.ID) + `/stream');
|
||||
var _taskDetailTerm = document.getElementById('task-live-log');
|
||||
var _taskChartTimer = null;
|
||||
var _taskChartsFrozen = false;
|
||||
_taskDetailES.onopen = function(){ _taskDetailTerm.textContent = ''; };
|
||||
_taskDetailES.onmessage = function(e){ _taskDetailTerm.textContent += e.data + "\n"; _taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight; };
|
||||
_taskDetailES.addEventListener('done', function(e){
|
||||
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||
_taskDetailES.close();
|
||||
_taskDetailES = null;
|
||||
_taskChartsFrozen = true;
|
||||
_taskDetailTerm.textContent += (e.data ? '\nTask finished with error.\n' : '\nTask finished.\n');
|
||||
_taskDetailTerm.scrollTop = _taskDetailTerm.scrollHeight;
|
||||
refreshTaskLiveCharts();
|
||||
});
|
||||
_taskDetailES.onerror = function(){
|
||||
if (_taskChartTimer) clearInterval(_taskChartTimer);
|
||||
if (_taskDetailES) {
|
||||
_taskDetailES.close();
|
||||
_taskDetailES = null;
|
||||
}
|
||||
};
|
||||
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||
_taskChartTimer = setInterval(function(){
|
||||
if (_taskChartsFrozen) return;
|
||||
loadTaskLiveCharts('` + html.EscapeString(task.ID) + `');
|
||||
refreshTaskLiveCharts();
|
||||
}, 2000);
|
||||
</script>`)
|
||||
}
|
||||
|
||||
return layoutHead(opts.Title+" — "+title) +
|
||||
layoutNav("tasks", opts.BuildLabel) +
|
||||
`<div class="main"><div class="topbar"><h1>` + html.EscapeString(title) + `</h1></div><div class="content">` +
|
||||
body.String() +
|
||||
`</div></div></body></html>`
|
||||
}
|
||||
|
||||
func loadTaskReportFragment(task Task) string {
|
||||
if strings.TrimSpace(task.ReportHTMLPath) == "" {
|
||||
return ""
|
||||
}
|
||||
data, err := os.ReadFile(task.ReportHTMLPath)
|
||||
if err != nil || len(data) == 0 {
|
||||
return ""
|
||||
}
|
||||
return string(data)
|
||||
}
|
||||
|
||||
func taskArtifactDownloadLink(task Task, absPath string) string {
|
||||
if strings.TrimSpace(absPath) == "" {
|
||||
return ""
|
||||
}
|
||||
return fmt.Sprintf(`/export/file?path=%s`, absPath)
|
||||
}
|
||||
|
||||
func (h *handler) taskSamplesForRequest(r *http.Request) (Task, []platform.LiveMetricSample, time.Time, time.Time, bool) {
|
||||
id := r.PathValue("id")
|
||||
taskPtr, ok := globalQueue.findByID(id)
|
||||
if !ok {
|
||||
return Task{}, nil, time.Time{}, time.Time{}, false
|
||||
}
|
||||
task := *taskPtr
|
||||
start, end := taskTimeWindow(&task)
|
||||
samples, err := loadTaskMetricSamples(start, end)
|
||||
if err != nil {
|
||||
return task, nil, start, end, true
|
||||
}
|
||||
return task, samples, start, end, true
|
||||
}
|
||||
|
||||
func taskTimelineForTask(task Task) []chartTimelineSegment {
|
||||
start, end := taskTimeWindow(&task)
|
||||
return []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||
}
|
||||
|
||||
func taskChartPathFromFile(file string) (string, bool) {
|
||||
file = strings.TrimSpace(file)
|
||||
for _, spec := range taskDashboardChartSpecs {
|
||||
if spec.File == file {
|
||||
return spec.Path, true
|
||||
}
|
||||
}
|
||||
if strings.HasPrefix(file, "gpu-") && strings.HasSuffix(file, "-overview.svg") {
|
||||
id := strings.TrimSuffix(strings.TrimPrefix(file, "gpu-"), "-overview.svg")
|
||||
return "gpu/" + id + "-overview", true
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
343
audit/internal/webui/task_report.go
Normal file
343
audit/internal/webui/task_report.go
Normal file
@@ -0,0 +1,343 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"html"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
var taskReportMetricsDBPath = metricsDBPath
|
||||
|
||||
type taskReport struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
DurationSec int `json:"duration_sec,omitempty"`
|
||||
Error string `json:"error,omitempty"`
|
||||
LogFile string `json:"log_file,omitempty"`
|
||||
Charts []taskReportChart `json:"charts,omitempty"`
|
||||
GeneratedAt time.Time `json:"generated_at"`
|
||||
}
|
||||
|
||||
type taskReportChart struct {
|
||||
Title string `json:"title"`
|
||||
File string `json:"file"`
|
||||
}
|
||||
|
||||
type taskChartSpec struct {
|
||||
Path string
|
||||
File string
|
||||
}
|
||||
|
||||
var taskDashboardChartSpecs = []taskChartSpec{
|
||||
{Path: "server-load", File: "server-load.svg"},
|
||||
{Path: "server-temp-cpu", File: "server-temp-cpu.svg"},
|
||||
{Path: "server-temp-ambient", File: "server-temp-ambient.svg"},
|
||||
{Path: "server-power", File: "server-power.svg"},
|
||||
{Path: "server-fans", File: "server-fans.svg"},
|
||||
{Path: "gpu-all-load", File: "gpu-all-load.svg"},
|
||||
{Path: "gpu-all-memload", File: "gpu-all-memload.svg"},
|
||||
{Path: "gpu-all-clock", File: "gpu-all-clock.svg"},
|
||||
{Path: "gpu-all-power", File: "gpu-all-power.svg"},
|
||||
{Path: "gpu-all-temp", File: "gpu-all-temp.svg"},
|
||||
}
|
||||
|
||||
func taskChartSpecsForSamples(samples []platform.LiveMetricSample) []taskChartSpec {
|
||||
specs := make([]taskChartSpec, 0, len(taskDashboardChartSpecs)+len(taskGPUIndices(samples)))
|
||||
specs = append(specs, taskDashboardChartSpecs...)
|
||||
for _, idx := range taskGPUIndices(samples) {
|
||||
specs = append(specs, taskChartSpec{
|
||||
Path: fmt.Sprintf("gpu/%d-overview", idx),
|
||||
File: fmt.Sprintf("gpu-%d-overview.svg", idx),
|
||||
})
|
||||
}
|
||||
return specs
|
||||
}
|
||||
|
||||
func writeTaskReportArtifacts(t *Task) error {
|
||||
if t == nil {
|
||||
return nil
|
||||
}
|
||||
ensureTaskReportPaths(t)
|
||||
if strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||
return nil
|
||||
}
|
||||
if err := os.MkdirAll(t.ArtifactsDir, 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
start, end := taskTimeWindow(t)
|
||||
samples, _ := loadTaskMetricSamples(start, end)
|
||||
charts, inlineCharts := writeTaskCharts(t.ArtifactsDir, start, end, samples)
|
||||
|
||||
logText := ""
|
||||
if data, err := os.ReadFile(t.LogPath); err == nil {
|
||||
logText = string(data)
|
||||
}
|
||||
|
||||
report := taskReport{
|
||||
ID: t.ID,
|
||||
Name: t.Name,
|
||||
Target: t.Target,
|
||||
Status: t.Status,
|
||||
CreatedAt: t.CreatedAt,
|
||||
StartedAt: t.StartedAt,
|
||||
DoneAt: t.DoneAt,
|
||||
DurationSec: taskElapsedSec(t, reportDoneTime(t)),
|
||||
Error: t.ErrMsg,
|
||||
LogFile: filepath.Base(t.LogPath),
|
||||
Charts: charts,
|
||||
GeneratedAt: time.Now().UTC(),
|
||||
}
|
||||
if err := writeJSONFile(t.ReportJSONPath, report); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(t.ReportHTMLPath, []byte(renderTaskReportFragment(report, inlineCharts, logText)), 0644)
|
||||
}
|
||||
|
||||
func reportDoneTime(t *Task) time.Time {
|
||||
if t != nil && t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||
return *t.DoneAt
|
||||
}
|
||||
return time.Now()
|
||||
}
|
||||
|
||||
func taskTimeWindow(t *Task) (time.Time, time.Time) {
|
||||
if t == nil {
|
||||
now := time.Now().UTC()
|
||||
return now, now
|
||||
}
|
||||
start := t.CreatedAt.UTC()
|
||||
if t.StartedAt != nil && !t.StartedAt.IsZero() {
|
||||
start = t.StartedAt.UTC()
|
||||
}
|
||||
end := time.Now().UTC()
|
||||
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||
end = t.DoneAt.UTC()
|
||||
}
|
||||
if end.Before(start) {
|
||||
end = start
|
||||
}
|
||||
return start, end
|
||||
}
|
||||
|
||||
func loadTaskMetricSamples(start, end time.Time) ([]platform.LiveMetricSample, error) {
|
||||
db, err := openMetricsDB(taskReportMetricsDBPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer db.Close()
|
||||
return db.LoadBetween(start, end)
|
||||
}
|
||||
|
||||
func writeTaskCharts(dir string, start, end time.Time, samples []platform.LiveMetricSample) ([]taskReportChart, map[string]string) {
|
||||
if len(samples) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
timeline := []chartTimelineSegment{{Start: start, End: end, Active: true}}
|
||||
var charts []taskReportChart
|
||||
inline := make(map[string]string)
|
||||
for _, spec := range taskChartSpecsForSamples(samples) {
|
||||
title, svg, ok := renderTaskChartSVG(spec.Path, samples, timeline)
|
||||
if !ok || len(svg) == 0 {
|
||||
continue
|
||||
}
|
||||
path := filepath.Join(dir, spec.File)
|
||||
if err := os.WriteFile(path, svg, 0644); err != nil {
|
||||
continue
|
||||
}
|
||||
charts = append(charts, taskReportChart{Title: title, File: spec.File})
|
||||
inline[spec.File] = string(svg)
|
||||
}
|
||||
return charts, inline
|
||||
}
|
||||
|
||||
func renderTaskChartSVG(path string, samples []platform.LiveMetricSample, timeline []chartTimelineSegment) (string, []byte, bool) {
|
||||
if idx, sub, ok := parseGPUChartPath(path); ok && sub == "overview" {
|
||||
buf, hasData, err := renderGPUOverviewChartSVG(idx, samples, timeline)
|
||||
if err != nil || !hasData {
|
||||
return "", nil, false
|
||||
}
|
||||
return gpuDisplayLabel(idx) + " Overview", buf, true
|
||||
}
|
||||
datasets, names, labels, title, yMin, yMax, ok := chartDataFromSamples(path, samples)
|
||||
if !ok {
|
||||
return "", nil, false
|
||||
}
|
||||
buf, err := renderMetricChartSVG(
|
||||
title,
|
||||
labels,
|
||||
sampleTimes(samples),
|
||||
datasets,
|
||||
names,
|
||||
yMin,
|
||||
yMax,
|
||||
chartCanvasHeightForPath(path, len(names)),
|
||||
timeline,
|
||||
)
|
||||
if err != nil {
|
||||
return "", nil, false
|
||||
}
|
||||
return title, buf, true
|
||||
}
|
||||
|
||||
func taskGPUIndices(samples []platform.LiveMetricSample) []int {
|
||||
seen := map[int]bool{}
|
||||
var out []int
|
||||
for _, s := range samples {
|
||||
for _, g := range s.GPUs {
|
||||
if seen[g.GPUIndex] {
|
||||
continue
|
||||
}
|
||||
seen[g.GPUIndex] = true
|
||||
out = append(out, g.GPUIndex)
|
||||
}
|
||||
}
|
||||
sort.Ints(out)
|
||||
return out
|
||||
}
|
||||
|
||||
func writeJSONFile(path string, v any) error {
|
||||
data, err := json.MarshalIndent(v, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(path, data, 0644)
|
||||
}
|
||||
|
||||
func renderTaskReportFragment(report taskReport, charts map[string]string, logText string) string {
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Task Report</div><div class="card-body">`)
|
||||
b.WriteString(`<div class="grid2">`)
|
||||
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Task</div><div style="font-size:16px;font-weight:700">` + html.EscapeString(report.Name) + `</div>`)
|
||||
b.WriteString(`<div style="font-size:13px;color:var(--muted)">` + html.EscapeString(report.Target) + `</div></div>`)
|
||||
b.WriteString(`<div><div style="font-size:12px;color:var(--muted);margin-bottom:6px">Status</div><div>` + renderTaskStatusBadge(report.Status) + `</div>`)
|
||||
if strings.TrimSpace(report.Error) != "" {
|
||||
b.WriteString(`<div style="margin-top:8px;font-size:13px;color:var(--crit-fg)">` + html.EscapeString(report.Error) + `</div>`)
|
||||
}
|
||||
b.WriteString(`</div></div>`)
|
||||
b.WriteString(`<div style="margin-top:14px;font-size:13px;color:var(--muted)">`)
|
||||
b.WriteString(`Started: ` + formatTaskTime(report.StartedAt, report.CreatedAt) + ` | Finished: ` + formatTaskTime(report.DoneAt, time.Time{}) + ` | Duration: ` + formatTaskDuration(report.DurationSec))
|
||||
b.WriteString(`</div></div></div>`)
|
||||
if benchmarkCard := renderTaskBenchmarkResultsCard(report.Target, logText); benchmarkCard != "" {
|
||||
b.WriteString(benchmarkCard)
|
||||
}
|
||||
|
||||
if len(report.Charts) > 0 {
|
||||
for _, chart := range report.Charts {
|
||||
b.WriteString(`<div class="card"><div class="card-head">` + html.EscapeString(chart.Title) + `</div><div class="card-body" style="padding:12px">`)
|
||||
b.WriteString(charts[chart.File])
|
||||
b.WriteString(`</div></div>`)
|
||||
}
|
||||
} else {
|
||||
b.WriteString(`<div class="alert alert-info">No metric samples were captured during this task window.</div>`)
|
||||
}
|
||||
|
||||
b.WriteString(`<div class="card"><div class="card-head">Logs</div><div class="card-body">`)
|
||||
b.WriteString(`<div class="terminal" style="max-height:none;white-space:pre-wrap">` + html.EscapeString(strings.TrimSpace(logText)) + `</div>`)
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func renderTaskBenchmarkResultsCard(target, logText string) string {
|
||||
if strings.TrimSpace(target) != "nvidia-benchmark" {
|
||||
return ""
|
||||
}
|
||||
resultPath := taskBenchmarkResultPath(logText)
|
||||
if strings.TrimSpace(resultPath) == "" {
|
||||
return ""
|
||||
}
|
||||
columns, runs := loadBenchmarkHistoryFromPaths([]string{resultPath})
|
||||
if len(runs) == 0 {
|
||||
return ""
|
||||
}
|
||||
return renderBenchmarkResultsCardFromRuns(
|
||||
"Benchmark Results",
|
||||
"Composite score for this benchmark task.",
|
||||
"No benchmark results were saved for this task.",
|
||||
columns,
|
||||
runs,
|
||||
)
|
||||
}
|
||||
|
||||
func taskBenchmarkResultPath(logText string) string {
|
||||
archivePath := taskArchivePathFromLog(logText)
|
||||
if archivePath == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
if runDir == archivePath {
|
||||
return ""
|
||||
}
|
||||
return filepath.Join(runDir, "result.json")
|
||||
}
|
||||
|
||||
func taskArchivePathFromLog(logText string) string {
|
||||
lines := strings.Split(logText, "\n")
|
||||
for i := len(lines) - 1; i >= 0; i-- {
|
||||
line := strings.TrimSpace(lines[i])
|
||||
if line == "" || !strings.HasPrefix(line, "Archive:") {
|
||||
continue
|
||||
}
|
||||
path := strings.TrimSpace(strings.TrimPrefix(line, "Archive:"))
|
||||
if strings.HasPrefix(path, "Archive written to ") {
|
||||
path = strings.TrimSpace(strings.TrimPrefix(path, "Archive written to "))
|
||||
}
|
||||
if strings.HasSuffix(path, ".tar.gz") {
|
||||
return path
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func renderTaskStatusBadge(status string) string {
|
||||
className := map[string]string{
|
||||
TaskRunning: "badge-ok",
|
||||
TaskPending: "badge-unknown",
|
||||
TaskDone: "badge-ok",
|
||||
TaskFailed: "badge-err",
|
||||
TaskCancelled: "badge-unknown",
|
||||
}[status]
|
||||
if className == "" {
|
||||
className = "badge-unknown"
|
||||
}
|
||||
label := strings.TrimSpace(status)
|
||||
if label == "" {
|
||||
label = "unknown"
|
||||
}
|
||||
return `<span class="badge ` + className + `">` + html.EscapeString(label) + `</span>`
|
||||
}
|
||||
|
||||
func formatTaskTime(ts *time.Time, fallback time.Time) string {
|
||||
if ts != nil && !ts.IsZero() {
|
||||
return ts.Local().Format("2006-01-02 15:04:05")
|
||||
}
|
||||
if !fallback.IsZero() {
|
||||
return fallback.Local().Format("2006-01-02 15:04:05")
|
||||
}
|
||||
return "n/a"
|
||||
}
|
||||
|
||||
func formatTaskDuration(sec int) string {
|
||||
if sec <= 0 {
|
||||
return "n/a"
|
||||
}
|
||||
if sec < 60 {
|
||||
return fmt.Sprintf("%ds", sec)
|
||||
}
|
||||
if sec < 3600 {
|
||||
return fmt.Sprintf("%dm %02ds", sec/60, sec%60)
|
||||
}
|
||||
return fmt.Sprintf("%dh %02dm %02ds", sec/3600, (sec%3600)/60, sec%60)
|
||||
}
|
||||
@@ -92,17 +92,20 @@ func taskDisplayName(target, profile, loader string) string {
|
||||
|
||||
// Task represents one unit of work in the queue.
|
||||
type Task struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ElapsedSec int `json:"elapsed_sec,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ElapsedSec int `json:"elapsed_sec,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
ArtifactsDir string `json:"artifacts_dir,omitempty"`
|
||||
ReportJSONPath string `json:"report_json_path,omitempty"`
|
||||
ReportHTMLPath string `json:"report_html_path,omitempty"`
|
||||
|
||||
// runtime fields (not serialised)
|
||||
job *jobState
|
||||
@@ -126,17 +129,20 @@ type taskParams struct {
|
||||
}
|
||||
|
||||
type persistedTask struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
Params taskParams `json:"params,omitempty"`
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
ArtifactsDir string `json:"artifacts_dir,omitempty"`
|
||||
ReportJSONPath string `json:"report_json_path,omitempty"`
|
||||
ReportHTMLPath string `json:"report_html_path,omitempty"`
|
||||
Params taskParams `json:"params,omitempty"`
|
||||
}
|
||||
|
||||
type burnPreset struct {
|
||||
@@ -252,6 +258,7 @@ func (q *taskQueue) enqueue(t *Task) {
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
taskSerialEvent(t, "queued")
|
||||
select {
|
||||
case q.trigger <- struct{}{}:
|
||||
default:
|
||||
@@ -416,44 +423,30 @@ func (q *taskQueue) worker() {
|
||||
setCPUGovernor("performance")
|
||||
defer setCPUGovernor("powersave")
|
||||
|
||||
// Drain all pending tasks and start them in parallel.
|
||||
q.mu.Lock()
|
||||
var batch []*Task
|
||||
for {
|
||||
q.mu.Lock()
|
||||
t := q.nextPending()
|
||||
if t == nil {
|
||||
break
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
return
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskRunning
|
||||
t.StartedAt = &now
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
j := newTaskJobState(t.LogPath)
|
||||
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
|
||||
t.job = j
|
||||
batch = append(batch, t)
|
||||
}
|
||||
if len(batch) > 0 {
|
||||
q.persistLocked()
|
||||
}
|
||||
q.mu.Unlock()
|
||||
q.mu.Unlock()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for _, t := range batch {
|
||||
t := t
|
||||
j := t.job
|
||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||
j.cancel = taskCancel
|
||||
wg.Add(1)
|
||||
goRecoverOnce("task "+t.Target, func() {
|
||||
defer wg.Done()
|
||||
defer taskCancel()
|
||||
q.executeTask(t, j, taskCtx)
|
||||
})
|
||||
}
|
||||
wg.Wait()
|
||||
q.executeTask(t, j, taskCtx)
|
||||
taskCancel()
|
||||
|
||||
if len(batch) > 0 {
|
||||
q.mu.Lock()
|
||||
q.prune()
|
||||
q.persistLocked()
|
||||
@@ -496,8 +489,6 @@ func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
|
||||
|
||||
func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
if t.Status == TaskRunning {
|
||||
@@ -509,7 +500,18 @@ func (q *taskQueue) finalizeTaskRun(t *Task, j *jobState) {
|
||||
t.ErrMsg = ""
|
||||
}
|
||||
}
|
||||
q.finalizeTaskArtifactPathsLocked(t)
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
|
||||
if err := writeTaskReportArtifacts(t); err != nil {
|
||||
appendJobLog(t.LogPath, "WARN: task report generation failed: "+err.Error())
|
||||
}
|
||||
if t.ErrMsg != "" {
|
||||
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
||||
return
|
||||
}
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
}
|
||||
|
||||
// setCPUGovernor writes the given governor to all CPU scaling_governor sysfs files.
|
||||
@@ -848,6 +850,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.persistLocked()
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
@@ -857,6 +860,7 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.persistLocked()
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
default:
|
||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||
@@ -897,6 +901,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
n++
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
@@ -904,6 +909,7 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
n++
|
||||
}
|
||||
}
|
||||
@@ -922,6 +928,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
cancelled++
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
@@ -929,6 +936,7 @@ func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Reque
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
cancelled++
|
||||
}
|
||||
}
|
||||
@@ -992,10 +1000,10 @@ func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
|
||||
func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
|
||||
if t.LogPath != "" || q.logsDir == "" || t.ID == "" {
|
||||
if q.logsDir == "" || t.ID == "" {
|
||||
return
|
||||
}
|
||||
t.LogPath = filepath.Join(q.logsDir, t.ID+".log")
|
||||
q.ensureTaskArtifactPathsLocked(t)
|
||||
}
|
||||
|
||||
func (q *taskQueue) loadLocked() {
|
||||
@@ -1012,17 +1020,20 @@ func (q *taskQueue) loadLocked() {
|
||||
}
|
||||
for _, pt := range persisted {
|
||||
t := &Task{
|
||||
ID: pt.ID,
|
||||
Name: pt.Name,
|
||||
Target: pt.Target,
|
||||
Priority: pt.Priority,
|
||||
Status: pt.Status,
|
||||
CreatedAt: pt.CreatedAt,
|
||||
StartedAt: pt.StartedAt,
|
||||
DoneAt: pt.DoneAt,
|
||||
ErrMsg: pt.ErrMsg,
|
||||
LogPath: pt.LogPath,
|
||||
params: pt.Params,
|
||||
ID: pt.ID,
|
||||
Name: pt.Name,
|
||||
Target: pt.Target,
|
||||
Priority: pt.Priority,
|
||||
Status: pt.Status,
|
||||
CreatedAt: pt.CreatedAt,
|
||||
StartedAt: pt.StartedAt,
|
||||
DoneAt: pt.DoneAt,
|
||||
ErrMsg: pt.ErrMsg,
|
||||
LogPath: pt.LogPath,
|
||||
ArtifactsDir: pt.ArtifactsDir,
|
||||
ReportJSONPath: pt.ReportJSONPath,
|
||||
ReportHTMLPath: pt.ReportHTMLPath,
|
||||
params: pt.Params,
|
||||
}
|
||||
q.assignTaskLogPathLocked(t)
|
||||
if t.Status == TaskRunning {
|
||||
@@ -1053,17 +1064,20 @@ func (q *taskQueue) persistLocked() {
|
||||
state := make([]persistedTask, 0, len(q.tasks))
|
||||
for _, t := range q.tasks {
|
||||
state = append(state, persistedTask{
|
||||
ID: t.ID,
|
||||
Name: t.Name,
|
||||
Target: t.Target,
|
||||
Priority: t.Priority,
|
||||
Status: t.Status,
|
||||
CreatedAt: t.CreatedAt,
|
||||
StartedAt: t.StartedAt,
|
||||
DoneAt: t.DoneAt,
|
||||
ErrMsg: t.ErrMsg,
|
||||
LogPath: t.LogPath,
|
||||
Params: t.params,
|
||||
ID: t.ID,
|
||||
Name: t.Name,
|
||||
Target: t.Target,
|
||||
Priority: t.Priority,
|
||||
Status: t.Status,
|
||||
CreatedAt: t.CreatedAt,
|
||||
StartedAt: t.StartedAt,
|
||||
DoneAt: t.DoneAt,
|
||||
ErrMsg: t.ErrMsg,
|
||||
LogPath: t.LogPath,
|
||||
ArtifactsDir: t.ArtifactsDir,
|
||||
ReportJSONPath: t.ReportJSONPath,
|
||||
ReportHTMLPath: t.ReportHTMLPath,
|
||||
Params: t.params,
|
||||
})
|
||||
}
|
||||
data, err := json.MarshalIndent(state, "", " ")
|
||||
@@ -1094,3 +1108,113 @@ func taskElapsedSec(t *Task, now time.Time) int {
|
||||
}
|
||||
return int(end.Sub(start).Round(time.Second) / time.Second)
|
||||
}
|
||||
|
||||
func taskFolderStatus(status string) string {
|
||||
status = strings.TrimSpace(strings.ToLower(status))
|
||||
switch status {
|
||||
case TaskRunning, TaskDone, TaskFailed, TaskCancelled:
|
||||
return status
|
||||
default:
|
||||
return TaskPending
|
||||
}
|
||||
}
|
||||
|
||||
func sanitizeTaskFolderPart(s string) string {
|
||||
s = strings.TrimSpace(strings.ToLower(s))
|
||||
if s == "" {
|
||||
return "task"
|
||||
}
|
||||
var b strings.Builder
|
||||
lastDash := false
|
||||
for _, r := range s {
|
||||
isAlnum := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9')
|
||||
if isAlnum {
|
||||
b.WriteRune(r)
|
||||
lastDash = false
|
||||
continue
|
||||
}
|
||||
if !lastDash {
|
||||
b.WriteByte('-')
|
||||
lastDash = true
|
||||
}
|
||||
}
|
||||
out := strings.Trim(b.String(), "-")
|
||||
if out == "" {
|
||||
return "task"
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func taskArtifactsDir(root string, t *Task, status string) string {
|
||||
if strings.TrimSpace(root) == "" || t == nil {
|
||||
return ""
|
||||
}
|
||||
prefix := taskFolderNumberPrefix(t.ID)
|
||||
return filepath.Join(root, fmt.Sprintf("%s_%s_%s", prefix, sanitizeTaskFolderPart(t.Name), taskFolderStatus(status)))
|
||||
}
|
||||
|
||||
func taskFolderNumberPrefix(taskID string) string {
|
||||
taskID = strings.TrimSpace(taskID)
|
||||
if strings.HasPrefix(taskID, "TASK-") && len(taskID) >= len("TASK-000") {
|
||||
num := strings.TrimSpace(strings.TrimPrefix(taskID, "TASK-"))
|
||||
if len(num) == 3 {
|
||||
allDigits := true
|
||||
for _, r := range num {
|
||||
if r < '0' || r > '9' {
|
||||
allDigits = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if allDigits {
|
||||
return num
|
||||
}
|
||||
}
|
||||
}
|
||||
fallback := sanitizeTaskFolderPart(taskID)
|
||||
if fallback == "" {
|
||||
return "000"
|
||||
}
|
||||
return fallback
|
||||
}
|
||||
|
||||
func ensureTaskReportPaths(t *Task) {
|
||||
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||
return
|
||||
}
|
||||
if t.LogPath == "" || filepath.Base(t.LogPath) == "task.log" {
|
||||
t.LogPath = filepath.Join(t.ArtifactsDir, "task.log")
|
||||
}
|
||||
t.ReportJSONPath = filepath.Join(t.ArtifactsDir, "report.json")
|
||||
t.ReportHTMLPath = filepath.Join(t.ArtifactsDir, "report.html")
|
||||
}
|
||||
|
||||
func (q *taskQueue) ensureTaskArtifactPathsLocked(t *Task) {
|
||||
if t == nil || strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
|
||||
return
|
||||
}
|
||||
if strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||
t.ArtifactsDir = taskArtifactsDir(q.logsDir, t, t.Status)
|
||||
}
|
||||
if t.ArtifactsDir != "" {
|
||||
_ = os.MkdirAll(t.ArtifactsDir, 0755)
|
||||
}
|
||||
ensureTaskReportPaths(t)
|
||||
}
|
||||
|
||||
func (q *taskQueue) finalizeTaskArtifactPathsLocked(t *Task) {
|
||||
if t == nil || strings.TrimSpace(q.logsDir) == "" || strings.TrimSpace(t.ID) == "" {
|
||||
return
|
||||
}
|
||||
q.ensureTaskArtifactPathsLocked(t)
|
||||
dstDir := taskArtifactsDir(q.logsDir, t, t.Status)
|
||||
if dstDir == "" {
|
||||
return
|
||||
}
|
||||
if t.ArtifactsDir != "" && t.ArtifactsDir != dstDir {
|
||||
if _, err := os.Stat(dstDir); err != nil {
|
||||
_ = os.Rename(t.ArtifactsDir, dstDir)
|
||||
}
|
||||
t.ArtifactsDir = dstDir
|
||||
}
|
||||
ensureTaskReportPaths(t)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
@@ -12,6 +13,7 @@ import (
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
@@ -161,6 +163,40 @@ func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNewJobIDUsesTASKPrefixAndZeroPadding(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
origCounter := jobCounter.Load()
|
||||
jobCounter.Store(0)
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
jobCounter.Store(origCounter)
|
||||
})
|
||||
|
||||
if got := newJobID("ignored"); got != "TASK-000" {
|
||||
t.Fatalf("id=%q want TASK-000", got)
|
||||
}
|
||||
if got := newJobID("ignored"); got != "TASK-001" {
|
||||
t.Fatalf("id=%q want TASK-001", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskArtifactsDirStartsWithTaskNumber(t *testing.T) {
|
||||
root := t.TempDir()
|
||||
task := &Task{
|
||||
ID: "TASK-007",
|
||||
Name: "NVIDIA Benchmark",
|
||||
}
|
||||
got := filepath.Base(taskArtifactsDir(root, task, TaskDone))
|
||||
if !strings.HasPrefix(got, "007_") {
|
||||
t.Fatalf("artifacts dir=%q want prefix 007_", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
logPath := filepath.Join(dir, "task.log")
|
||||
@@ -248,6 +284,196 @@ func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
|
||||
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
||||
}
|
||||
|
||||
func TestFinalizeTaskRunCreatesReportFolderAndArtifacts(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
metricsPath := filepath.Join(dir, "metrics.db")
|
||||
prevMetricsPath := taskReportMetricsDBPath
|
||||
taskReportMetricsDBPath = metricsPath
|
||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||
|
||||
db, err := openMetricsDB(metricsPath)
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
base := time.Now().UTC().Add(-45 * time.Second)
|
||||
if err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base,
|
||||
CPULoadPct: 42,
|
||||
MemLoadPct: 35,
|
||||
PowerW: 510,
|
||||
}); err != nil {
|
||||
t.Fatalf("Write: %v", err)
|
||||
}
|
||||
_ = db.Close()
|
||||
|
||||
q := &taskQueue{
|
||||
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
started := time.Now().UTC().Add(-90 * time.Second)
|
||||
task := &Task{
|
||||
ID: "task-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: started.Add(-10 * time.Second),
|
||||
StartedAt: &started,
|
||||
}
|
||||
q.assignTaskLogPathLocked(task)
|
||||
appendJobLog(task.LogPath, "line-1")
|
||||
|
||||
job := newTaskJobState(task.LogPath)
|
||||
job.finish("")
|
||||
q.finalizeTaskRun(task, job)
|
||||
|
||||
if task.Status != TaskDone {
|
||||
t.Fatalf("status=%q want %q", task.Status, TaskDone)
|
||||
}
|
||||
if !strings.Contains(filepath.Base(task.ArtifactsDir), "_done") {
|
||||
t.Fatalf("artifacts dir=%q", task.ArtifactsDir)
|
||||
}
|
||||
if _, err := os.Stat(task.ReportJSONPath); err != nil {
|
||||
t.Fatalf("report json: %v", err)
|
||||
}
|
||||
if _, err := os.Stat(task.ReportHTMLPath); err != nil {
|
||||
t.Fatalf("report html: %v", err)
|
||||
}
|
||||
var report taskReport
|
||||
data, err := os.ReadFile(task.ReportJSONPath)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile(report.json): %v", err)
|
||||
}
|
||||
if err := json.Unmarshal(data, &report); err != nil {
|
||||
t.Fatalf("Unmarshal(report.json): %v", err)
|
||||
}
|
||||
if report.ID != task.ID || report.Status != TaskDone {
|
||||
t.Fatalf("report=%+v", report)
|
||||
}
|
||||
if len(report.Charts) == 0 {
|
||||
t.Fatalf("expected charts in report, got none")
|
||||
}
|
||||
}
|
||||
|
||||
func TestWriteTaskReportArtifactsIncludesBenchmarkResultsForTask(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
metricsPath := filepath.Join(dir, "metrics.db")
|
||||
prevMetricsPath := taskReportMetricsDBPath
|
||||
taskReportMetricsDBPath = metricsPath
|
||||
t.Cleanup(func() { taskReportMetricsDBPath = prevMetricsPath })
|
||||
|
||||
benchmarkDir := filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000")
|
||||
if err := os.MkdirAll(benchmarkDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
result := platform.NvidiaBenchmarkResult{
|
||||
GeneratedAt: time.Date(2026, time.April, 6, 12, 0, 0, 0, time.UTC),
|
||||
BenchmarkProfile: "standard",
|
||||
OverallStatus: "OK",
|
||||
GPUs: []platform.BenchmarkGPUResult{
|
||||
{
|
||||
Index: 0,
|
||||
Name: "NVIDIA H100 PCIe",
|
||||
Scores: platform.BenchmarkScorecard{
|
||||
CompositeScore: 1176.25,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
raw, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(benchmarkDir, "result.json"), raw, 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
artifactsDir := filepath.Join(dir, "tasks", "task-bench_done")
|
||||
if err := os.MkdirAll(artifactsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
task := &Task{
|
||||
ID: "task-bench",
|
||||
Name: "NVIDIA Benchmark",
|
||||
Target: "nvidia-benchmark",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now().UTC().Add(-time.Minute),
|
||||
ArtifactsDir: artifactsDir,
|
||||
}
|
||||
ensureTaskReportPaths(task)
|
||||
logText := "line-1\nArchive: " + filepath.Join(dir, "bee-benchmark", "gpu-benchmark-20260406-120000.tar.gz") + "\n"
|
||||
if err := os.WriteFile(task.LogPath, []byte(logText), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := writeTaskReportArtifacts(task); err != nil {
|
||||
t.Fatalf("writeTaskReportArtifacts: %v", err)
|
||||
}
|
||||
|
||||
body, err := os.ReadFile(task.ReportHTMLPath)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadFile(report.html): %v", err)
|
||||
}
|
||||
html := string(body)
|
||||
for _, needle := range []string{
|
||||
`Benchmark Results`,
|
||||
`Composite score for this benchmark task.`,
|
||||
`NVIDIA H100 PCIe / GPU 0`,
|
||||
`1176.25`,
|
||||
} {
|
||||
if !strings.Contains(html, needle) {
|
||||
t.Fatalf("report missing %q: %s", needle, html)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskLifecycleMirrorsToSerialConsole(t *testing.T) {
|
||||
var lines []string
|
||||
prev := taskSerialWriteLine
|
||||
taskSerialWriteLine = func(line string) { lines = append(lines, line) }
|
||||
t.Cleanup(func() { taskSerialWriteLine = prev })
|
||||
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
statePath: filepath.Join(dir, "tasks-state.json"),
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
task := &Task{
|
||||
ID: "task-serial-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now().UTC(),
|
||||
}
|
||||
|
||||
q.enqueue(task)
|
||||
started := time.Now().UTC()
|
||||
task.Status = TaskRunning
|
||||
task.StartedAt = &started
|
||||
job := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||
job.append("Starting CPU SAT...")
|
||||
job.append("CPU stress duration: 60s")
|
||||
job.finish("")
|
||||
q.finalizeTaskRun(task, job)
|
||||
|
||||
joined := strings.Join(lines, "\n")
|
||||
for _, needle := range []string{
|
||||
"queued",
|
||||
"Starting CPU SAT...",
|
||||
"CPU stress duration: 60s",
|
||||
"finished with status=done",
|
||||
} {
|
||||
if !strings.Contains(joined, needle) {
|
||||
t.Fatalf("serial mirror missing %q in %q", needle, joined)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveBurnPreset(t *testing.T) {
|
||||
tests := []struct {
|
||||
profile string
|
||||
|
||||
@@ -32,7 +32,7 @@ lb config noauto \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
"${@}"
|
||||
|
||||
@@ -41,15 +41,15 @@ while [ $# -gt 0 ]; do
|
||||
;;
|
||||
*)
|
||||
echo "unknown arg: $1" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|nvidia-legacy|amd|nogpu|all]" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$VARIANT" in
|
||||
nvidia|amd|nogpu|all) ;;
|
||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||
nvidia|nvidia-legacy|amd|nogpu|all) ;;
|
||||
*) echo "unknown variant: $VARIANT (expected nvidia, nvidia-legacy, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||
esac
|
||||
|
||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
@@ -61,8 +61,13 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
"${CACHE_DIR:?}/lb-packages"
|
||||
echo "=== cleaning live-build work dirs ==="
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
|
||||
echo "=== caches cleared, proceeding with build ==="
|
||||
fi
|
||||
|
||||
@@ -180,6 +185,9 @@ case "$VARIANT" in
|
||||
nvidia)
|
||||
run_variant nvidia
|
||||
;;
|
||||
nvidia-legacy)
|
||||
run_variant nvidia-legacy
|
||||
;;
|
||||
amd)
|
||||
run_variant amd
|
||||
;;
|
||||
@@ -188,6 +196,7 @@ case "$VARIANT" in
|
||||
;;
|
||||
all)
|
||||
run_variant nvidia
|
||||
run_variant nvidia-legacy
|
||||
run_variant amd
|
||||
run_variant nogpu
|
||||
;;
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
#!/bin/sh
|
||||
# build-nvidia-module.sh — compile NVIDIA proprietary driver modules for Debian 12
|
||||
# build-nvidia-module.sh — compile NVIDIA kernel modules for Debian 12
|
||||
#
|
||||
# Downloads the official NVIDIA .run installer, extracts kernel modules and
|
||||
# userspace tools (nvidia-smi, libnvidia-ml). Everything is proprietary NVIDIA.
|
||||
# userspace tools (nvidia-smi, libnvidia-ml). Supports both:
|
||||
# - open -> kernel-open/ sources from the .run installer
|
||||
# - proprietary -> traditional proprietary kernel sources from the .run installer
|
||||
#
|
||||
# Output is cached in DIST_DIR/nvidia-<version>-<kver>/ so subsequent builds
|
||||
# are instant unless NVIDIA_DRIVER_VERSION or kernel version changes.
|
||||
@@ -17,10 +19,19 @@ set -e
|
||||
NVIDIA_VERSION="$1"
|
||||
DIST_DIR="$2"
|
||||
DEBIAN_KERNEL_ABI="$3"
|
||||
NVIDIA_FLAVOR="${4:-open}"
|
||||
|
||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi>"; exit 1; }
|
||||
[ -n "$NVIDIA_VERSION" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||
[ -n "$DEBIAN_KERNEL_ABI" ] || { echo "usage: $0 <nvidia-version> <dist-dir> <debian-kernel-abi> [open|proprietary]"; exit 1; }
|
||||
|
||||
case "$NVIDIA_FLAVOR" in
|
||||
open|proprietary) ;;
|
||||
*)
|
||||
echo "unsupported NVIDIA flavor: $NVIDIA_FLAVOR (expected open or proprietary)" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
# On Debian, kernel headers are split into two packages:
|
||||
@@ -31,22 +42,13 @@ KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
KDIR_ARCH="/usr/src/linux-headers-${KVER}"
|
||||
KDIR_COMMON="/usr/src/linux-headers-${DEBIAN_KERNEL_ABI}-common"
|
||||
|
||||
echo "=== NVIDIA ${NVIDIA_VERSION} (proprietary) for kernel ${KVER} ==="
|
||||
echo "=== NVIDIA ${NVIDIA_VERSION} (${NVIDIA_FLAVOR}) for kernel ${KVER} ==="
|
||||
|
||||
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
||||
echo "=== installing linux-headers-${KVER} ==="
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||
"linux-headers-${KVER}" \
|
||||
gcc make perl
|
||||
fi
|
||||
echo "kernel headers (arch): $KDIR_ARCH"
|
||||
echo "kernel headers (common): $KDIR_COMMON"
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
||||
CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_FLAVOR}-${NVIDIA_VERSION}-${KVER}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||
CACHE_LAYOUT_VERSION="2"
|
||||
CACHE_LAYOUT_VERSION="3"
|
||||
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||
@@ -57,6 +59,15 @@ if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||
exit 0
|
||||
fi
|
||||
|
||||
if [ ! -d "$KDIR_ARCH" ] || [ ! -d "$KDIR_COMMON" ]; then
|
||||
echo "=== installing linux-headers-${KVER} ==="
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y \
|
||||
"linux-headers-${KVER}" \
|
||||
gcc make perl
|
||||
fi
|
||||
echo "kernel headers (arch): $KDIR_ARCH"
|
||||
echo "kernel headers (common): $KDIR_COMMON"
|
||||
|
||||
# Download official NVIDIA .run installer with sha256 verification
|
||||
BASE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${NVIDIA_VERSION}"
|
||||
mkdir -p "$DOWNLOAD_CACHE_DIR" "$EXTRACT_CACHE_DIR"
|
||||
@@ -90,12 +101,18 @@ EXTRACT_DIR="${EXTRACT_CACHE_DIR}/nvidia-extract-${NVIDIA_VERSION}"
|
||||
rm -rf "$EXTRACT_DIR"
|
||||
"$RUN_FILE" --extract-only --target "$EXTRACT_DIR"
|
||||
|
||||
# Find kernel source directory (proprietary: kernel/, open: kernel-open/)
|
||||
# Find kernel source directory for the selected flavor.
|
||||
KERNEL_SRC=""
|
||||
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||
done
|
||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||
if [ "$NVIDIA_FLAVOR" = "open" ]; then
|
||||
for d in "$EXTRACT_DIR/kernel-open" "$EXTRACT_DIR/kernel-open/"*; do
|
||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||
done
|
||||
else
|
||||
for d in "$EXTRACT_DIR/kernel" "$EXTRACT_DIR/kernel-modules-sources" "$EXTRACT_DIR/kernel-source"; do
|
||||
[ -f "$d/Makefile" ] && KERNEL_SRC="$d" && break
|
||||
done
|
||||
fi
|
||||
[ -n "$KERNEL_SRC" ] || { echo "ERROR: kernel source dir not found for flavor ${NVIDIA_FLAVOR} in:"; ls "$EXTRACT_DIR/"; exit 1; }
|
||||
echo "kernel source: $KERNEL_SRC"
|
||||
|
||||
# Build kernel modules
|
||||
|
||||
@@ -15,26 +15,46 @@ DIST_DIR="${REPO_ROOT}/dist"
|
||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
AUTH_KEYS=""
|
||||
BUILD_VARIANT="nvidia"
|
||||
BEE_GPU_VENDOR="nvidia"
|
||||
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||
|
||||
# parse args
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
||||
--variant) BUILD_VARIANT="$2"; shift 2 ;;
|
||||
*) echo "unknown arg: $1"; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$BEE_GPU_VENDOR" in
|
||||
nvidia|amd|nogpu) ;;
|
||||
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
|
||||
case "$BUILD_VARIANT" in
|
||||
nvidia)
|
||||
BEE_GPU_VENDOR="nvidia"
|
||||
BEE_NVIDIA_MODULE_FLAVOR="open"
|
||||
;;
|
||||
nvidia-legacy)
|
||||
BEE_GPU_VENDOR="nvidia"
|
||||
BEE_NVIDIA_MODULE_FLAVOR="proprietary"
|
||||
;;
|
||||
amd)
|
||||
BEE_GPU_VENDOR="amd"
|
||||
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||
;;
|
||||
nogpu)
|
||||
BEE_GPU_VENDOR="nogpu"
|
||||
BEE_NVIDIA_MODULE_FLAVOR=""
|
||||
;;
|
||||
*)
|
||||
echo "unknown variant: $BUILD_VARIANT (expected nvidia, nvidia-legacy, amd, or nogpu)" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
||||
|
||||
export BEE_GPU_VENDOR
|
||||
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
export PATH="$PATH:/usr/local/go/bin"
|
||||
@@ -627,7 +647,7 @@ recover_iso_memtest() {
|
||||
|
||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
||||
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
||||
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
||||
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
||||
mkdir -p "${OUT_DIR}"
|
||||
@@ -801,7 +821,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
||||
apt-get install -y "linux-headers-${KVER}"
|
||||
fi
|
||||
|
||||
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
||||
echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
|
||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||
echo ""
|
||||
@@ -871,7 +891,7 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
||||
echo "=== preparing staged overlay (${BUILD_VARIANT}) ==="
|
||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||
|
||||
# Sync builder config into variant work dir, preserving lb cache.
|
||||
@@ -981,10 +1001,10 @@ done
|
||||
# --- NVIDIA kernel modules and userspace libs ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
||||
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
|
||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||
@@ -1055,13 +1075,14 @@ GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo u
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||
NVIDIA_KERNEL_MODULES_FLAVOR=${BEE_NVIDIA_MODULE_FLAVOR}
|
||||
NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||
GPU_BUILD_INFO="nvidia-${BEE_NVIDIA_MODULE_FLAVOR}:${NVIDIA_DRIVER_VERSION}"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||
@@ -1073,6 +1094,7 @@ fi
|
||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||
BEE_BUILD_VARIANT=${BUILD_VARIANT}
|
||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||
BUILD_DATE=${BUILD_DATE}
|
||||
GIT_COMMIT=${GIT_COMMIT}
|
||||
@@ -1083,6 +1105,11 @@ EOF
|
||||
|
||||
# Write GPU vendor marker for hooks
|
||||
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo "${BEE_NVIDIA_MODULE_FLAVOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||
else
|
||||
rm -f "${OVERLAY_STAGE_DIR}/etc/bee-nvidia-modules-flavor"
|
||||
fi
|
||||
|
||||
# Patch motd with build info
|
||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||
@@ -1153,10 +1180,10 @@ fi
|
||||
|
||||
# --- build ISO using live-build ---
|
||||
echo ""
|
||||
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
||||
echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||
|
||||
# Export for auto/config
|
||||
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
||||
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||
export BEE_GPU_VENDOR_UPPER
|
||||
|
||||
cd "${LB_DIR}"
|
||||
@@ -1191,7 +1218,7 @@ if [ -f "$ISO_RAW" ]; then
|
||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
||||
echo "=== done (${BUILD_VARIANT}) ==="
|
||||
echo "ISO: $ISO_OUT"
|
||||
if command -v stat >/dev/null 2>&1; then
|
||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||
|
||||
@@ -7,6 +7,7 @@ echo " █████╗ ███████║███████╗ ╚
|
||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
||||
echo " Hardware Audit LiveCD"
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
@@ -14,29 +15,21 @@ menuentry "EASY-BEE" {
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (graphics/KMS)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (load to RAM)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (fail-safe)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
menuentry "EASY-BEE — fail-safe" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
}
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
|
||||
@@ -31,6 +31,7 @@ systemctl enable bee-audit.service
|
||||
systemctl enable bee-web.service
|
||||
systemctl enable bee-sshsetup.service
|
||||
systemctl enable bee-selfheal.timer
|
||||
systemctl enable bee-boot-status.service
|
||||
systemctl enable ssh.service
|
||||
systemctl enable lightdm.service 2>/dev/null || true
|
||||
systemctl enable qemu-guest-agent.service 2>/dev/null || true
|
||||
@@ -59,7 +60,8 @@ chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||
|
||||
76
iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
Executable file
76
iso/builder/config/hooks/normal/9001-wallpaper.hook.chroot
Executable file
@@ -0,0 +1,76 @@
|
||||
#!/bin/sh
|
||||
# 9001-wallpaper.hook.chroot — generate /usr/share/bee/wallpaper.png inside chroot
|
||||
set -e
|
||||
echo "=== generating bee wallpaper ==="
|
||||
mkdir -p /usr/share/bee
|
||||
|
||||
python3 - <<'PYEOF'
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
import os
|
||||
|
||||
W, H = 1920, 1080
|
||||
|
||||
LOGO = """\
|
||||
\u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2557 \u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557
|
||||
\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u255a\u2588\u2588\u2557 \u2588\u2588\u2554\u255d \u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255d
|
||||
\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u255a\u2588\u2588\u2588\u2588\u2554\u255d \u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2557
|
||||
\u2588\u2588\u2554\u2550\u2550\u255d \u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2551\u255a\u2550\u2550\u2550\u2550\u2588\u2588\u2551 \u255a\u2588\u2588\u2554\u255d \u255a\u2550\u2550\u2550\u2550\u255d\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2554\u2550\u2550\u255d \u2588\u2588\u2554\u2550\u2550\u255d
|
||||
\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2551 \u2588\u2588\u2551 \u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255d\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557
|
||||
\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d\u255a\u2550\u255d \u255a\u2550\u255d\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d \u255a\u2550\u255d \u255a\u2550\u2550\u2550\u2550\u2550\u255d \u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d\u255a\u2550\u2550\u2550\u2550\u2550\u2550\u255d
|
||||
Hardware Audit LiveCD"""
|
||||
|
||||
# Find a monospace font that supports box-drawing characters
|
||||
FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeMono.ttf',
|
||||
'/usr/share/fonts/truetype/noto/NotoMono-Regular.ttf',
|
||||
]
|
||||
|
||||
font_path = None
|
||||
for p in FONT_CANDIDATES:
|
||||
if os.path.exists(p):
|
||||
font_path = p
|
||||
break
|
||||
|
||||
SIZE = 22
|
||||
if font_path:
|
||||
font_logo = ImageFont.truetype(font_path, SIZE)
|
||||
font_sub = ImageFont.truetype(font_path, SIZE)
|
||||
else:
|
||||
font_logo = ImageFont.load_default()
|
||||
font_sub = font_logo
|
||||
|
||||
img = Image.new('RGB', (W, H), (0, 0, 0))
|
||||
draw = ImageDraw.Draw(img)
|
||||
|
||||
# Measure logo block line by line to avoid font ascender offset
|
||||
lines = LOGO.split('\n')
|
||||
logo_lines = lines[:6]
|
||||
sub_line = lines[6] if len(lines) > 6 else ''
|
||||
|
||||
line_h = SIZE + 2
|
||||
block_h = len(logo_lines) * line_h + 8 + (SIZE if sub_line else 0)
|
||||
|
||||
# Width: measure the widest logo line
|
||||
max_w = 0
|
||||
for line in logo_lines:
|
||||
bb = draw.textbbox((0, 0), line, font=font_logo)
|
||||
max_w = max(max_w, bb[2] - bb[0])
|
||||
|
||||
x = (W - max_w) // 2
|
||||
y = (H - block_h) // 2
|
||||
|
||||
cy = y
|
||||
for line in logo_lines:
|
||||
draw.text((x, cy), line, font=font_logo, fill=(0xf6, 0xc9, 0x0e))
|
||||
cy += line_h
|
||||
cy += 8
|
||||
if sub_line:
|
||||
draw.text((x, cy), sub_line, font=font_sub, fill=(0x80, 0x68, 0x18))
|
||||
|
||||
img.save('/usr/share/bee/wallpaper.png', optimize=True)
|
||||
print('wallpaper written: /usr/share/bee/wallpaper.png')
|
||||
PYEOF
|
||||
|
||||
echo "=== wallpaper done ==="
|
||||
@@ -60,9 +60,15 @@ qrencode
|
||||
# Local desktop (openbox + chromium kiosk)
|
||||
openbox
|
||||
tint2
|
||||
feh
|
||||
python3-pil
|
||||
xorg
|
||||
xterm
|
||||
chromium
|
||||
mousepad
|
||||
pcmanfm
|
||||
ristretto
|
||||
mupdf
|
||||
xserver-xorg-video-fbdev
|
||||
xserver-xorg-video-vesa
|
||||
lightdm
|
||||
|
||||
18
iso/overlay/etc/systemd/system/bee-boot-status.service
Normal file
18
iso/overlay/etc/systemd/system/bee-boot-status.service
Normal file
@@ -0,0 +1,18 @@
|
||||
[Unit]
|
||||
Description=Bee: boot status display
|
||||
After=systemd-user-sessions.service
|
||||
Before=getty@tty1.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
RemainAfterExit=no
|
||||
ExecStart=/usr/local/bin/bee-boot-status
|
||||
TTYPath=/dev/tty1
|
||||
StandardInput=tty
|
||||
StandardOutput=tty
|
||||
StandardError=tty
|
||||
TTYReset=yes
|
||||
TTYVHangup=yes
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@@ -0,0 +1,2 @@
|
||||
[Unit]
|
||||
After=bee-boot-status.service
|
||||
@@ -1,6 +1,4 @@
|
||||
[Unit]
|
||||
Wants=bee-preflight.service
|
||||
After=bee-preflight.service
|
||||
|
||||
[Service]
|
||||
ExecStartPre=/usr/local/bin/bee-display-mode
|
||||
|
||||
89
iso/overlay/usr/local/bin/bee-boot-status
Normal file
89
iso/overlay/usr/local/bin/bee-boot-status
Normal file
@@ -0,0 +1,89 @@
|
||||
#!/bin/sh
|
||||
# bee-boot-status — boot progress display on tty1.
|
||||
# Shows live service status until all bee services are done or failed,
|
||||
# then exits so getty can show the login prompt.
|
||||
|
||||
CRITICAL="bee-preflight bee-nvidia bee-audit"
|
||||
ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
|
||||
|
||||
svc_state() { systemctl is-active "$1.service" 2>/dev/null || echo "inactive"; }
|
||||
|
||||
svc_icon() {
|
||||
case "$(svc_state "$1")" in
|
||||
active) printf '\033[32m[ OK ]\033[0m' ;;
|
||||
failed) printf '\033[31m[ FAIL ]\033[0m' ;;
|
||||
activating) printf '\033[33m[ .. ]\033[0m' ;;
|
||||
deactivating) printf '\033[33m[ stop ]\033[0m' ;;
|
||||
inactive) printf '\033[90m[ ]\033[0m' ;;
|
||||
*) printf '\033[90m[ ? ]\033[0m' ;;
|
||||
esac
|
||||
}
|
||||
|
||||
svc_detail() {
|
||||
local svc="$1" state
|
||||
state="$(svc_state "$svc")"
|
||||
case "$state" in
|
||||
failed)
|
||||
local res
|
||||
res="$(systemctl show -p Result "$svc.service" 2>/dev/null | cut -d= -f2)"
|
||||
[ -n "$res" ] && [ "$res" != "success" ] && printf ' \033[31m(%s)\033[0m' "$res"
|
||||
;;
|
||||
activating)
|
||||
local line
|
||||
line="$(journalctl -u "$svc.service" -n 1 --no-pager --output=cat 2>/dev/null | cut -c1-55)"
|
||||
[ -n "$line" ] && printf ' \033[90m%s\033[0m' "$line"
|
||||
;;
|
||||
esac
|
||||
}
|
||||
|
||||
all_critical_done() {
|
||||
for svc in $CRITICAL; do
|
||||
case "$(svc_state "$svc")" in
|
||||
active|failed|inactive) ;;
|
||||
*) return 1 ;;
|
||||
esac
|
||||
done
|
||||
return 0
|
||||
}
|
||||
|
||||
while true; do
|
||||
# move to top-left and clear screen
|
||||
printf '\033[H\033[2J'
|
||||
|
||||
printf '\n'
|
||||
printf ' \033[33m███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗\033[0m\n'
|
||||
printf ' \033[33m██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝\033[0m\n'
|
||||
printf ' \033[33m█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗\033[0m\n'
|
||||
printf ' \033[33m██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝\033[0m\n'
|
||||
printf ' \033[33m███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗\033[0m\n'
|
||||
printf ' \033[33m╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝\033[0m\n'
|
||||
printf ' Hardware Audit LiveCD\n'
|
||||
printf '\n'
|
||||
|
||||
for svc in $ALL; do
|
||||
printf ' %s %-20s%s\n' "$(svc_icon "$svc")" "$svc" "$(svc_detail "$svc")"
|
||||
done
|
||||
printf '\n'
|
||||
|
||||
# Network
|
||||
ips="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{printf " %-16s %s\n", $NF, $2}')"
|
||||
if [ -n "$ips" ]; then
|
||||
printf ' \033[1mNetwork:\033[0m\n'
|
||||
printf '%s\n' "$ips"
|
||||
printf '\n'
|
||||
fi
|
||||
|
||||
if all_critical_done; then
|
||||
printf ' \033[1;32mSystem ready.\033[0m Audit is running in the background.\n'
|
||||
first_ip="$(ip -4 addr show scope global 2>/dev/null | awk '/inet /{print $2}' | cut -d/ -f1 | head -1)"
|
||||
if [ -n "$first_ip" ]; then
|
||||
printf ' Web UI: \033[1mhttp://%s/\033[0m\n' "$first_ip"
|
||||
fi
|
||||
printf '\n'
|
||||
sleep 3
|
||||
break
|
||||
fi
|
||||
|
||||
printf ' \033[90mStarting up...\033[0m\n'
|
||||
sleep 3
|
||||
done
|
||||
@@ -62,6 +62,8 @@ done
|
||||
echo "loader=bee-gpu-burn"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
|
||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||
|
||||
@@ -78,7 +80,8 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
fi
|
||||
fi
|
||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
||||
"${WORKER}" --device "${id}" --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||
CUDA_VISIBLE_DEVICES="${id}" \
|
||||
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||
pid=$!
|
||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||
done
|
||||
|
||||
@@ -152,14 +152,19 @@ done
|
||||
|
||||
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
|
||||
|
||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||
export CUDA_VISIBLE_DEVICES="${FINAL}"
|
||||
|
||||
JOHN_DEVICES=""
|
||||
local_id=1
|
||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
opencl_id=$((id + 1))
|
||||
opencl_id="${local_id}"
|
||||
if [ -z "${JOHN_DEVICES}" ]; then
|
||||
JOHN_DEVICES="${opencl_id}"
|
||||
else
|
||||
JOHN_DEVICES="${JOHN_DEVICES},${opencl_id}"
|
||||
fi
|
||||
local_id=$((local_id + 1))
|
||||
done
|
||||
|
||||
echo "loader=john"
|
||||
|
||||
@@ -70,6 +70,8 @@ echo "gpu_count=${GPU_COUNT}"
|
||||
echo "range=${MIN_BYTES}..${MAX_BYTES}"
|
||||
echo "iters=${ITERS}"
|
||||
|
||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||
|
||||
deadline=$(( $(date +%s) + SECONDS ))
|
||||
round=0
|
||||
|
||||
|
||||
@@ -50,11 +50,93 @@ load_module() {
|
||||
log "WARN: not found: $ko"
|
||||
return 1
|
||||
fi
|
||||
if insmod "$ko" "$@"; then
|
||||
if timeout 90 insmod "$ko" "$@"; then
|
||||
log "loaded: $mod $*"
|
||||
return 0
|
||||
fi
|
||||
log "WARN: failed to load: $mod"
|
||||
log "WARN: failed to load: $mod (exit $?)"
|
||||
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||
return 1
|
||||
}
|
||||
|
||||
nvidia_is_functional() {
|
||||
grep -q ' nvidiactl$' /proc/devices 2>/dev/null
|
||||
}
|
||||
|
||||
load_module_with_gsp_fallback() {
|
||||
ko="$NVIDIA_KO_DIR/nvidia.ko"
|
||||
if [ ! -f "$ko" ]; then
|
||||
log "ERROR: not found: $ko"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# Run insmod in background — on some converted SXM→PCIe cards GSP enters an
|
||||
# infinite crash/reload loop and insmod never returns. We check for successful
|
||||
# initialization by polling /proc/devices for nvidiactl instead of waiting for
|
||||
# insmod to exit.
|
||||
log "loading nvidia (GSP enabled, timeout 90s)"
|
||||
insmod "$ko" &
|
||||
_insmod_pid=$!
|
||||
|
||||
_waited=0
|
||||
while [ $_waited -lt 90 ]; do
|
||||
if nvidia_is_functional; then
|
||||
log "loaded: nvidia (GSP enabled, ${_waited}s)"
|
||||
echo "gsp-on" > /run/bee-nvidia-mode
|
||||
return 0
|
||||
fi
|
||||
# Check if insmod exited with an error before timeout
|
||||
if ! kill -0 "$_insmod_pid" 2>/dev/null; then
|
||||
wait "$_insmod_pid"
|
||||
_rc=$?
|
||||
if [ $_rc -ne 0 ]; then
|
||||
log "nvidia load failed (exit $_rc)"
|
||||
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||
return 1
|
||||
fi
|
||||
# insmod exited 0 but nvidiactl not yet in /proc/devices — give it a moment
|
||||
sleep 2
|
||||
if nvidia_is_functional; then
|
||||
log "loaded: nvidia (GSP enabled, ${_waited}s)"
|
||||
return 0
|
||||
fi
|
||||
log "insmod exited 0 but nvidiactl missing — treating as failure"
|
||||
return 1
|
||||
fi
|
||||
sleep 1
|
||||
_waited=$((_waited + 1))
|
||||
done
|
||||
|
||||
# GSP init timed out — kill the hanging insmod and attempt gsp-off fallback
|
||||
log "nvidia GSP init timed out after 90s"
|
||||
kill "$_insmod_pid" 2>/dev/null || true
|
||||
wait "$_insmod_pid" 2>/dev/null || true
|
||||
|
||||
# Attempt to unload the partially-initialized module
|
||||
if ! rmmod nvidia 2>/dev/null; then
|
||||
# Module is stuck in the kernel — cannot reload with different params.
|
||||
# User must reboot and select bee.nvidia.mode=gsp-off at boot menu.
|
||||
log "ERROR: rmmod nvidia failed (EBUSY) — module stuck in kernel"
|
||||
log "ERROR: reboot and select 'EASY-BEE (advanced) -> GSP=off' in boot menu"
|
||||
echo "gsp-stuck" > /run/bee-nvidia-mode
|
||||
return 1
|
||||
fi
|
||||
|
||||
sleep 2
|
||||
log "retrying with NVreg_EnableGpuFirmware=0"
|
||||
log "WARNING: GSP disabled — power management will run via CPU path, not GPU firmware"
|
||||
|
||||
if insmod "$ko" NVreg_EnableGpuFirmware=0; then
|
||||
if nvidia_is_functional; then
|
||||
log "loaded: nvidia (GSP disabled)"
|
||||
echo "gsp-off" > /run/bee-nvidia-mode
|
||||
return 0
|
||||
fi
|
||||
log "insmod gsp-off exited 0 but nvidiactl missing"
|
||||
return 1
|
||||
fi
|
||||
|
||||
log "nvidia load failed (GSP=off)"
|
||||
dmesg | tail -n 10 | sed 's/^/ dmesg: /' || true
|
||||
return 1
|
||||
}
|
||||
@@ -70,7 +152,7 @@ load_host_module() {
|
||||
|
||||
case "$nvidia_mode" in
|
||||
normal|full)
|
||||
if ! load_module nvidia; then
|
||||
if ! load_module_with_gsp_fallback; then
|
||||
exit 1
|
||||
fi
|
||||
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
||||
@@ -127,6 +209,18 @@ fi
|
||||
ldconfig 2>/dev/null || true
|
||||
log "ldconfig refreshed"
|
||||
|
||||
# Keep persistence mode enabled across the session so dcgmi / stress tools do
|
||||
# not fail with deployment warnings on otherwise healthy GPUs.
|
||||
if command -v nvidia-smi >/dev/null 2>&1; then
|
||||
if nvidia-smi -pm 1 >/dev/null 2>&1; then
|
||||
log "enabled NVIDIA persistence mode"
|
||||
else
|
||||
log "WARN: failed to enable NVIDIA persistence mode"
|
||||
fi
|
||||
else
|
||||
log "WARN: nvidia-smi not found — cannot enable persistence mode"
|
||||
fi
|
||||
|
||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||
# nv-hostengine must run after the NVIDIA modules and device nodes are ready.
|
||||
# If it started too early (for example via systemd before bee-nvidia-load), it can
|
||||
|
||||
@@ -7,16 +7,24 @@ xset s off
|
||||
xset -dpms
|
||||
xset s noblank
|
||||
|
||||
# Set desktop background.
|
||||
if [ -f /usr/share/bee/wallpaper.png ]; then
|
||||
feh --bg-fill /usr/share/bee/wallpaper.png
|
||||
else
|
||||
xsetroot -solid '#f6c90e'
|
||||
fi
|
||||
|
||||
tint2 &
|
||||
|
||||
# Wait up to 120s for bee-web to bind. The web server starts immediately now
|
||||
# (audit is deferred), so this should succeed in a few seconds on most hardware.
|
||||
i=0
|
||||
while [ $i -lt 120 ]; do
|
||||
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi
|
||||
# Wait up to 60s for bee-web before opening Chromium.
|
||||
# Without this Chromium gets connection-refused and shows a blank page.
|
||||
_i=0
|
||||
while [ $_i -lt 60 ]; do
|
||||
curl -sf http://localhost/healthz >/dev/null 2>&1 && break
|
||||
sleep 1
|
||||
i=$((i+1))
|
||||
_i=$((_i+1))
|
||||
done
|
||||
unset _i
|
||||
|
||||
chromium \
|
||||
--disable-infobars \
|
||||
@@ -24,7 +32,8 @@ chromium \
|
||||
--no-first-run \
|
||||
--disable-session-crashed-bubble \
|
||||
--disable-features=TranslateUI \
|
||||
--user-data-dir=/tmp/bee-chrome \
|
||||
--start-maximized \
|
||||
http://localhost/ &
|
||||
http://localhost/loading &
|
||||
|
||||
exec openbox
|
||||
|
||||
Reference in New Issue
Block a user