Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
20f834aa96 | ||
| 105d92df8b | |||
| f96b149875 | |||
| 5ee120158e | |||
| 09fe0e2e9e | |||
| ace1a9dba6 | |||
| 905c581ece | |||
| 7c2a0135d2 |
@@ -120,6 +120,7 @@ type satRunner interface {
|
||||
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
}
|
||||
|
||||
@@ -627,6 +628,13 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||
body := "Results: " + path
|
||||
|
||||
@@ -203,6 +203,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
476
audit/internal/platform/platform_stress.go
Normal file
476
audit/internal/platform/platform_stress.go
Normal file
@@ -0,0 +1,476 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PlatformStressCycle defines one load+idle cycle.
|
||||
type PlatformStressCycle struct {
|
||||
LoadSec int // seconds of simultaneous CPU+GPU stress
|
||||
IdleSec int // seconds of idle monitoring after load cut
|
||||
}
|
||||
|
||||
// PlatformStressOptions controls the thermal cycling test.
|
||||
type PlatformStressOptions struct {
|
||||
Cycles []PlatformStressCycle
|
||||
}
|
||||
|
||||
// platformStressRow is one second of telemetry.
|
||||
type platformStressRow struct {
|
||||
ElapsedSec float64
|
||||
Cycle int
|
||||
Phase string // "load" | "idle"
|
||||
CPULoadPct float64
|
||||
MaxCPUTempC float64
|
||||
MaxGPUTempC float64
|
||||
SysPowerW float64
|
||||
FanMinRPM float64
|
||||
FanMaxRPM float64
|
||||
GPUThrottled bool
|
||||
}
|
||||
|
||||
// RunPlatformStress runs repeated load+idle thermal cycling.
|
||||
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
|
||||
// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
|
||||
func (s *System) RunPlatformStress(
|
||||
ctx context.Context,
|
||||
baseDir string,
|
||||
opts PlatformStressOptions,
|
||||
logFunc func(string),
|
||||
) (string, error) {
|
||||
if logFunc == nil {
|
||||
logFunc = func(string) {}
|
||||
}
|
||||
if len(opts.Cycles) == 0 {
|
||||
return "", fmt.Errorf("no cycles defined")
|
||||
}
|
||||
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||
}
|
||||
|
||||
stamp := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||
}
|
||||
|
||||
vendor := s.DetectGPUVendor()
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
|
||||
|
||||
var rows []platformStressRow
|
||||
start := time.Now()
|
||||
|
||||
var analyses []cycleAnalysis
|
||||
|
||||
for i, cycle := range opts.Cycles {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
cycleNum := i + 1
|
||||
logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
|
||||
|
||||
// ── LOAD PHASE ───────────────────────────────────────────────────────
|
||||
loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// CPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
|
||||
// GPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
|
||||
// Monitoring goroutine for load phase
|
||||
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||
for _, r := range loadRows {
|
||||
logFunc(formatPlatformRow(r))
|
||||
}
|
||||
rows = append(rows, loadRows...)
|
||||
loadCancel()
|
||||
wg.Wait()
|
||||
|
||||
if len(loadRows) > 0 {
|
||||
logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
|
||||
}
|
||||
|
||||
// ── IDLE PHASE ───────────────────────────────────────────────────────
|
||||
idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
|
||||
idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
|
||||
for _, r := range idleRows {
|
||||
logFunc(formatPlatformRow(r))
|
||||
}
|
||||
rows = append(rows, idleRows...)
|
||||
idleCancel()
|
||||
|
||||
// Per-cycle analysis
|
||||
an := analyzePlatformCycle(loadRows, idleRows)
|
||||
analyses = append(analyses, an)
|
||||
logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
|
||||
cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
|
||||
}
|
||||
|
||||
// Write CSV
|
||||
csvData := writePlatformCSV(rows)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644)
|
||||
|
||||
// Write summary
|
||||
summary := writePlatformSummary(opts, analyses)
|
||||
logFunc("--- Summary ---")
|
||||
for _, line := range strings.Split(summary, "\n") {
|
||||
if line != "" {
|
||||
logFunc(line)
|
||||
}
|
||||
}
|
||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||
|
||||
// Pack tar.gz
|
||||
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
||||
if err := packPlatformDir(runDir, archivePath); err != nil {
|
||||
return "", fmt.Errorf("pack archive: %w", err)
|
||||
}
|
||||
_ = os.RemoveAll(runDir)
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
// collectPhase samples live metrics every second until ctx is done.
|
||||
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
|
||||
var rows []platformStressRow
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return rows
|
||||
case <-ticker.C:
|
||||
sample := SampleLiveMetrics()
|
||||
rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
|
||||
r := platformStressRow{
|
||||
ElapsedSec: time.Since(testStart).Seconds(),
|
||||
Cycle: cycle,
|
||||
Phase: phase,
|
||||
CPULoadPct: s.CPULoadPct,
|
||||
SysPowerW: s.PowerW,
|
||||
}
|
||||
for _, t := range s.Temps {
|
||||
switch t.Group {
|
||||
case "cpu":
|
||||
if t.Celsius > r.MaxCPUTempC {
|
||||
r.MaxCPUTempC = t.Celsius
|
||||
}
|
||||
case "gpu":
|
||||
if t.Celsius > r.MaxGPUTempC {
|
||||
r.MaxGPUTempC = t.Celsius
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, g := range s.GPUs {
|
||||
if g.TempC > r.MaxGPUTempC {
|
||||
r.MaxGPUTempC = g.TempC
|
||||
}
|
||||
}
|
||||
if len(s.Fans) > 0 {
|
||||
r.FanMinRPM = s.Fans[0].RPM
|
||||
r.FanMaxRPM = s.Fans[0].RPM
|
||||
for _, f := range s.Fans[1:] {
|
||||
if f.RPM < r.FanMinRPM {
|
||||
r.FanMinRPM = f.RPM
|
||||
}
|
||||
if f.RPM > r.FanMaxRPM {
|
||||
r.FanMaxRPM = f.RPM
|
||||
}
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
func formatPlatformRow(r platformStressRow) string {
|
||||
throttle := ""
|
||||
if r.GPUThrottled {
|
||||
throttle = " THROTTLE"
|
||||
}
|
||||
fans := ""
|
||||
if r.FanMinRPM > 0 {
|
||||
fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
|
||||
}
|
||||
return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
|
||||
r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
|
||||
}
|
||||
|
||||
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
|
||||
var an cycleAnalysis
|
||||
for _, r := range loadRows {
|
||||
if r.MaxCPUTempC > an.maxCPUTemp {
|
||||
an.maxCPUTemp = r.MaxCPUTempC
|
||||
}
|
||||
if r.MaxGPUTempC > an.maxGPUTemp {
|
||||
an.maxGPUTemp = r.MaxGPUTempC
|
||||
}
|
||||
if r.SysPowerW > an.maxPower {
|
||||
an.maxPower = r.SysPowerW
|
||||
}
|
||||
if r.GPUThrottled {
|
||||
an.throttled = true
|
||||
}
|
||||
}
|
||||
// Fan RPM at cut = avg of last 5 load rows
|
||||
if n := len(loadRows); n > 0 {
|
||||
window := loadRows
|
||||
if n > 5 {
|
||||
window = loadRows[n-5:]
|
||||
}
|
||||
var sum float64
|
||||
var cnt int
|
||||
for _, r := range window {
|
||||
if r.FanMinRPM > 0 {
|
||||
sum += (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||
cnt++
|
||||
}
|
||||
}
|
||||
if cnt > 0 {
|
||||
an.fanAtCutAvg = sum / float64(cnt)
|
||||
}
|
||||
}
|
||||
// Fan RPM min in first 15s of idle
|
||||
an.fanMin15s = an.fanAtCutAvg
|
||||
var cutElapsed float64
|
||||
if len(loadRows) > 0 {
|
||||
cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
|
||||
}
|
||||
for _, r := range idleRows {
|
||||
if r.ElapsedSec > cutElapsed+15 {
|
||||
break
|
||||
}
|
||||
avg := (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||
if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
|
||||
an.fanMin15s = avg
|
||||
}
|
||||
}
|
||||
if an.fanAtCutAvg > 0 {
|
||||
an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
|
||||
}
|
||||
return an
|
||||
}
|
||||
|
||||
type cycleAnalysis struct {
|
||||
maxCPUTemp float64
|
||||
maxGPUTemp float64
|
||||
maxPower float64
|
||||
throttled bool
|
||||
fanAtCutAvg float64
|
||||
fanMin15s float64
|
||||
fanDropPct float64
|
||||
}
|
||||
|
||||
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
|
||||
fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))
|
||||
|
||||
totalThrottle := 0
|
||||
totalFanWarn := 0
|
||||
for i, an := range analyses {
|
||||
cycle := opts.Cycles[i]
|
||||
fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
|
||||
fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
|
||||
fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
|
||||
fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
|
||||
if an.throttled {
|
||||
fmt.Fprintf(&b, " Throttle: DETECTED\n")
|
||||
totalThrottle++
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Throttle: none\n")
|
||||
}
|
||||
if an.fanAtCutAvg > 0 {
|
||||
fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
|
||||
fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
|
||||
if an.fanDropPct > 20 {
|
||||
fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
|
||||
totalFanWarn++
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Fan response: OK\n")
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
|
||||
if totalThrottle > 0 {
|
||||
fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
|
||||
} else if totalFanWarn > 0 {
|
||||
fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
|
||||
} else {
|
||||
fmt.Fprintf(&b, "Overall: PASS\n")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func writePlatformCSV(rows []platformStressRow) []byte {
|
||||
var buf bytes.Buffer
|
||||
w := csv.NewWriter(&buf)
|
||||
_ = w.Write([]string{
|
||||
"elapsed_sec", "cycle", "phase",
|
||||
"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
|
||||
"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
|
||||
})
|
||||
for _, r := range rows {
|
||||
throttled := "0"
|
||||
if r.GPUThrottled {
|
||||
throttled = "1"
|
||||
}
|
||||
_ = w.Write([]string{
|
||||
strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
|
||||
strconv.Itoa(r.Cycle),
|
||||
r.Phase,
|
||||
strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
|
||||
strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
|
||||
throttled,
|
||||
})
|
||||
}
|
||||
w.Flush()
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled.
|
||||
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
path, err := satLookPath("stressapptest")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||
}
|
||||
// Use a very long duration; the context timeout will kill it at the right time.
|
||||
cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
if err := cmd.Start(); err != nil {
|
||||
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||
}
|
||||
return cmd, nil
|
||||
}
|
||||
|
||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
||||
switch strings.ToLower(vendor) {
|
||||
case "amd":
|
||||
return buildAMDGPUStressCmd(ctx)
|
||||
case "nvidia":
|
||||
return buildNvidiaGPUStressCmd(ctx)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
rvsArgs, err := resolveRVSCommand()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
rvsPath := rvsArgs[0]
|
||||
cfg := `actions:
|
||||
- name: gst_platform
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
duration: 86400000
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
`
|
||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = cmd.Start()
|
||||
return cmd
|
||||
}
|
||||
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
path, err := satLookPath("bee-gpu-stress")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = cmd.Start()
|
||||
return cmd
|
||||
}
|
||||
|
||||
func packPlatformDir(dir, dest string) error {
|
||||
f, err := os.Create(dest)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
gz := gzip.NewWriter(f)
|
||||
defer gz.Close()
|
||||
tw := tar.NewWriter(gz)
|
||||
defer tw.Close()
|
||||
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
base := filepath.Base(dir)
|
||||
for _, e := range entries {
|
||||
if e.IsDir() {
|
||||
continue
|
||||
}
|
||||
fpath := filepath.Join(dir, e.Name())
|
||||
data, err := os.ReadFile(fpath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
hdr := &tar.Header{
|
||||
Name: filepath.Join(base, e.Name()),
|
||||
Size: int64(len(data)),
|
||||
Mode: 0644,
|
||||
ModTime: time.Now(),
|
||||
}
|
||||
if err := tw.WriteHeader(hdr); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := tw.Write(data); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -9,14 +9,18 @@ import (
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
var ansiEscapeRE = regexp.MustCompile(`\x1b\[[0-9;]*[a-zA-Z]|\x1b[()][A-Z0-9]|\x1b[DABC]`)
|
||||
|
||||
// ── Job ID counter ────────────────────────────────────────────────────────────
|
||||
|
||||
var jobCounter atomic.Uint64
|
||||
@@ -91,11 +95,25 @@ func runCmdJob(j *jobState, cmd *exec.Cmd) {
|
||||
j.finish(err.Error())
|
||||
return
|
||||
}
|
||||
// Lower the CPU scheduling priority of stress/audit subprocesses to nice+10
|
||||
// so the X server and kernel interrupt handling remain responsive under load
|
||||
// (prevents KVM/IPMI graphical console from freezing during GPU stress tests).
|
||||
if cmd.Process != nil {
|
||||
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, 10)
|
||||
}
|
||||
|
||||
go func() {
|
||||
scanner := bufio.NewScanner(pr)
|
||||
for scanner.Scan() {
|
||||
j.append(scanner.Text())
|
||||
// Split on \r to handle progress-bar style output (e.g. \r overwrites)
|
||||
// and strip ANSI escape codes so logs are readable in the browser.
|
||||
parts := strings.Split(scanner.Text(), "\r")
|
||||
for _, part := range parts {
|
||||
line := ansiEscapeRE.ReplaceAllString(part, "")
|
||||
if line != "" {
|
||||
j.append(line)
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -405,6 +423,58 @@ func (h *handler) handleAPIExportBundle(w http.ResponseWriter, r *http.Request)
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIExportUSBTargets(w http.ResponseWriter, _ *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
targets, err := h.opts.App.ListRemovableTargets()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if targets == nil {
|
||||
targets = []platform.RemovableTarget{}
|
||||
}
|
||||
writeJSON(w, targets)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIExportUSBAudit(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var target platform.RemovableTarget
|
||||
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || target.Device == "" {
|
||||
writeError(w, http.StatusBadRequest, "device is required")
|
||||
return
|
||||
}
|
||||
result, err := h.opts.App.ExportLatestAuditResult(target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var target platform.RemovableTarget
|
||||
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || target.Device == "" {
|
||||
writeError(w, http.StatusBadRequest, "device is required")
|
||||
return
|
||||
}
|
||||
result, err := h.opts.App.ExportSupportBundleResult(target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||
}
|
||||
|
||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -790,3 +860,85 @@ func (h *handler) rollbackPendingNetworkChange() error {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Display / Screen Resolution ───────────────────────────────────────────────
|
||||
|
||||
type displayMode struct {
|
||||
Output string `json:"output"`
|
||||
Mode string `json:"mode"`
|
||||
Current bool `json:"current"`
|
||||
}
|
||||
|
||||
type displayInfo struct {
|
||||
Output string `json:"output"`
|
||||
Modes []displayMode `json:"modes"`
|
||||
Current string `json:"current"`
|
||||
}
|
||||
|
||||
var xrandrOutputRE = regexp.MustCompile(`^(\S+)\s+connected`)
|
||||
var xrandrModeRE = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
|
||||
var xrandrCurrentRE = regexp.MustCompile(`\*`)
|
||||
|
||||
func parseXrandrOutput(out string) []displayInfo {
|
||||
var infos []displayInfo
|
||||
var cur *displayInfo
|
||||
for _, line := range strings.Split(out, "\n") {
|
||||
if m := xrandrOutputRE.FindStringSubmatch(line); m != nil {
|
||||
if cur != nil {
|
||||
infos = append(infos, *cur)
|
||||
}
|
||||
cur = &displayInfo{Output: m[1]}
|
||||
continue
|
||||
}
|
||||
if cur == nil {
|
||||
continue
|
||||
}
|
||||
if m := xrandrModeRE.FindStringSubmatch(line); m != nil {
|
||||
isCurrent := xrandrCurrentRE.MatchString(line)
|
||||
mode := displayMode{Output: cur.Output, Mode: m[1], Current: isCurrent}
|
||||
cur.Modes = append(cur.Modes, mode)
|
||||
if isCurrent {
|
||||
cur.Current = m[1]
|
||||
}
|
||||
}
|
||||
}
|
||||
if cur != nil {
|
||||
infos = append(infos, *cur)
|
||||
}
|
||||
return infos
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
|
||||
out, err := exec.Command("xrandr").Output()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, parseXrandrOutput(string(out)))
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Output string `json:"output"`
|
||||
Mode string `json:"mode"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
|
||||
writeError(w, http.StatusBadRequest, "output and mode are required")
|
||||
return
|
||||
}
|
||||
// Validate mode looks like WxH to prevent injection
|
||||
if !regexp.MustCompile(`^\d+x\d+$`).MatchString(req.Mode) {
|
||||
writeError(w, http.StatusBadRequest, "invalid mode format")
|
||||
return
|
||||
}
|
||||
// Validate output name (no special chars)
|
||||
if !regexp.MustCompile(`^[A-Za-z0-9_\-]+$`).MatchString(req.Output) {
|
||||
writeError(w, http.StatusBadRequest, "invalid output name")
|
||||
return
|
||||
}
|
||||
if out, err := exec.Command("xrandr", "--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
|
||||
}
|
||||
|
||||
@@ -205,12 +205,83 @@ document.querySelectorAll('.terminal').forEach(function(t){
|
||||
|
||||
func renderDashboard(opts HandlerOptions) string {
|
||||
var b strings.Builder
|
||||
b.WriteString(renderAuditStatusBanner(opts))
|
||||
b.WriteString(renderHardwareSummaryCard(opts))
|
||||
b.WriteString(renderHealthCard(opts))
|
||||
b.WriteString(renderMetrics())
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// renderAuditStatusBanner shows a live progress banner when an audit task is
|
||||
// running and auto-reloads the page when it completes.
|
||||
func renderAuditStatusBanner(opts HandlerOptions) string {
|
||||
// If audit data already exists, no banner needed — data is fresh.
|
||||
// We still inject the polling script so a newly-triggered audit also reloads.
|
||||
hasData := false
|
||||
if _, err := loadSnapshot(opts.AuditPath); err == nil {
|
||||
hasData = true
|
||||
}
|
||||
_ = hasData
|
||||
|
||||
return `<div id="audit-banner" style="display:none" class="alert alert-warn" style="margin-bottom:16px">
|
||||
<span id="audit-banner-text">▶ Hardware audit is running — page will refresh automatically when complete.</span>
|
||||
<a href="/tasks" style="margin-left:12px;font-size:12px">View in Tasks</a>
|
||||
</div>
|
||||
<script>
|
||||
(function(){
|
||||
var _auditPoll = null;
|
||||
var _auditSeenRunning = false;
|
||||
|
||||
function pollAuditTask() {
|
||||
fetch('/api/tasks').then(function(r){ return r.json(); }).then(function(tasks){
|
||||
if (!tasks) return;
|
||||
var audit = null;
|
||||
for (var i = 0; i < tasks.length; i++) {
|
||||
if (tasks[i].target === 'audit') { audit = tasks[i]; break; }
|
||||
}
|
||||
var banner = document.getElementById('audit-banner');
|
||||
var txt = document.getElementById('audit-banner-text');
|
||||
if (!audit) {
|
||||
if (banner) banner.style.display = 'none';
|
||||
return;
|
||||
}
|
||||
if (audit.status === 'running' || audit.status === 'pending') {
|
||||
_auditSeenRunning = true;
|
||||
if (banner) {
|
||||
banner.style.display = '';
|
||||
var label = audit.status === 'pending' ? 'pending\u2026' : 'running\u2026';
|
||||
if (txt) txt.textContent = '\u25b6 Hardware audit ' + label + ' \u2014 page will refresh when complete.';
|
||||
}
|
||||
} else if (audit.status === 'done' && _auditSeenRunning) {
|
||||
// Audit just finished — reload to show fresh hardware data.
|
||||
clearInterval(_auditPoll);
|
||||
if (banner) {
|
||||
if (txt) txt.textContent = '\u2713 Audit complete \u2014 reloading\u2026';
|
||||
banner.style.background = 'var(--ok-bg,#fcfff5)';
|
||||
banner.style.color = 'var(--ok-fg,#2c662d)';
|
||||
}
|
||||
setTimeout(function(){ window.location.reload(); }, 800);
|
||||
} else if (audit.status === 'failed') {
|
||||
_auditSeenRunning = false;
|
||||
if (banner) {
|
||||
banner.style.display = '';
|
||||
banner.style.background = 'var(--crit-bg,#fff6f6)';
|
||||
banner.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
if (txt) txt.textContent = '\u2717 Audit failed: ' + (audit.error||'unknown error');
|
||||
clearInterval(_auditPoll);
|
||||
}
|
||||
} else {
|
||||
if (banner) banner.style.display = 'none';
|
||||
}
|
||||
}).catch(function(){});
|
||||
}
|
||||
|
||||
_auditPoll = setInterval(pollAuditTask, 3000);
|
||||
pollAuditTask();
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderAudit() string {
|
||||
return `<div class="card"><div class="card-head">Audit Viewer <button class="btn btn-sm btn-secondary" style="margin-left:auto" onclick="openAuditModal()">Actions</button></div><div class="card-body" style="padding:0"><iframe class="viewer-frame" src="/viewer" title="Audit viewer"></iframe></div></div>`
|
||||
}
|
||||
@@ -615,6 +686,10 @@ func renderBurn() string {
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
|
||||
<button class="btn btn-primary" onclick="runBurnIn('sat-stress')">▶ Start SAT Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Platform Thermal Cycling</div><div class="card-body">
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs CPU + GPU stress simultaneously across multiple load/idle cycles with varying durations. Detects cooling systems that fail to recover under repeated load cycles. Smoke: 2 cycles ~5 min. Acceptance: 4 cycles ~25 min.</p>
|
||||
<button class="btn btn-primary" onclick="runBurnIn('platform-stress')">▶ Start Thermal Cycling</button>
|
||||
</div></div>
|
||||
</div>
|
||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||
@@ -841,12 +916,79 @@ func renderExport(exportDir string) string {
|
||||
return `<div class="grid2">
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
<a class="btn btn-primary" href="/export/support.tar.gz">⬇ Download Support Bundle</a>
|
||||
<a class="btn btn-primary" href="/export/support.tar.gz">↓ Download Support Bundle</a>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||
</div></div>
|
||||
</div>`
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-top:16px">
|
||||
<div class="card-head">Export to USB
|
||||
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
|
||||
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
||||
<div id="usb-targets" style="margin-top:12px"></div>
|
||||
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
(function(){
|
||||
function usbRefresh() {
|
||||
document.getElementById('usb-status').textContent = 'Scanning...';
|
||||
document.getElementById('usb-targets').innerHTML = '';
|
||||
document.getElementById('usb-msg').textContent = '';
|
||||
fetch('/api/export/usb').then(r=>r.json()).then(targets => {
|
||||
const st = document.getElementById('usb-status');
|
||||
const ct = document.getElementById('usb-targets');
|
||||
if (!targets || targets.length === 0) {
|
||||
st.textContent = 'No removable USB devices found.';
|
||||
return;
|
||||
}
|
||||
st.textContent = targets.length + ' device(s) found:';
|
||||
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
|
||||
targets.map(t => {
|
||||
const dev = t.device || '';
|
||||
const label = t.label || '';
|
||||
const model = t.model || '';
|
||||
return '<tr>' +
|
||||
'<td style="font-family:monospace">'+dev+'</td>' +
|
||||
'<td>'+t.fs_type+'</td>' +
|
||||
'<td>'+t.size+'</td>' +
|
||||
'<td>'+label+'</td>' +
|
||||
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
||||
'<td style="white-space:nowrap">' +
|
||||
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+JSON.stringify(t)+')">Audit JSON</button> ' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+JSON.stringify(t)+')">Support Bundle</button>' +
|
||||
'</td></tr>';
|
||||
}).join('') + '</table>';
|
||||
}).catch(e => {
|
||||
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
||||
});
|
||||
}
|
||||
window.usbExport = function(type, target) {
|
||||
const msg = document.getElementById('usb-msg');
|
||||
msg.style.color = 'var(--muted)';
|
||||
msg.textContent = 'Exporting to ' + (target.device||'') + '...';
|
||||
fetch('/api/export/usb/'+type, {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(target)
|
||||
}).then(r=>r.json()).then(d => {
|
||||
if (d.error) { msg.style.color='var(--err,red)'; msg.textContent = 'Error: '+d.error; return; }
|
||||
msg.style.color = 'var(--ok,green)';
|
||||
msg.textContent = d.message || 'Done.';
|
||||
}).catch(e => {
|
||||
msg.style.color = 'var(--err,red)';
|
||||
msg.textContent = 'Error: '+e;
|
||||
});
|
||||
};
|
||||
window.usbRefresh = usbRefresh;
|
||||
usbRefresh();
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func listExportFiles(exportDir string) ([]string, error) {
|
||||
@@ -872,6 +1014,56 @@ func listExportFiles(exportDir string) ([]string, error) {
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// ── Display Resolution ────────────────────────────────────────────────────────
|
||||
|
||||
func renderDisplayInline() string {
|
||||
return `<div id="display-status" style="color:var(--muted);font-size:13px;margin-bottom:12px">Loading displays...</div>
|
||||
<div id="display-controls"></div>
|
||||
<script>
|
||||
(function(){
|
||||
function loadDisplays() {
|
||||
fetch('/api/display/resolutions').then(r=>r.json()).then(displays => {
|
||||
const status = document.getElementById('display-status');
|
||||
const ctrl = document.getElementById('display-controls');
|
||||
if (!displays || displays.length === 0) {
|
||||
status.textContent = 'No connected displays found or xrandr not available.';
|
||||
return;
|
||||
}
|
||||
status.textContent = '';
|
||||
ctrl.innerHTML = displays.map(d => {
|
||||
const opts = (d.modes||[]).map(m =>
|
||||
'<option value="'+m.mode+'"'+(m.current?' selected':'')+'>'+m.mode+(m.current?' (current)':'')+'</option>'
|
||||
).join('');
|
||||
return '<div style="margin-bottom:12px">'
|
||||
+'<span style="font-weight:600;margin-right:8px">'+d.output+'</span>'
|
||||
+'<span style="color:var(--muted);font-size:12px;margin-right:12px">Current: '+d.current+'</span>'
|
||||
+'<select id="res-sel-'+d.output+'" style="margin-right:8px">'+opts+'</select>'
|
||||
+'<button class="btn btn-sm btn-primary" onclick="applyResolution(\''+d.output+'\')">Apply</button>'
|
||||
+'</div>';
|
||||
}).join('');
|
||||
}).catch(()=>{
|
||||
document.getElementById('display-status').textContent = 'xrandr not available on this system.';
|
||||
});
|
||||
}
|
||||
window.applyResolution = function(output) {
|
||||
const sel = document.getElementById('res-sel-'+output);
|
||||
if (!sel) return;
|
||||
const mode = sel.value;
|
||||
const btn = sel.nextElementSibling;
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Applying...';
|
||||
fetch('/api/display/set', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify({output:output,mode:mode})})
|
||||
.then(r=>r.json()).then(d=>{
|
||||
if (d.error) { alert('Error: '+d.error); }
|
||||
loadDisplays();
|
||||
}).catch(e=>{ alert('Error: '+e); })
|
||||
.finally(()=>{ btn.disabled=false; btn.textContent='Apply'; });
|
||||
};
|
||||
loadDisplays();
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
|
||||
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderTools() string {
|
||||
@@ -923,6 +1115,9 @@ function installToRAM() {
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Display Resolution</div><div class="card-body">` +
|
||||
renderDisplayInline() + `</div></div>
|
||||
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
|
||||
@@ -215,6 +215,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
|
||||
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
|
||||
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
|
||||
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||
|
||||
@@ -240,10 +241,17 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Export
|
||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||
mux.HandleFunc("POST /api/export/bundle", h.handleAPIExportBundle)
|
||||
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
||||
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
|
||||
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
|
||||
|
||||
// Tools
|
||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||
|
||||
// Display
|
||||
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
||||
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
||||
|
||||
// GPU presence
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// Task statuses.
|
||||
@@ -34,7 +35,8 @@ var taskNames = map[string]string{
|
||||
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||
"amd-stress": "AMD GPU Burn-in",
|
||||
"memory-stress": "Memory Burn-in",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
"platform-stress": "Platform Thermal Cycling",
|
||||
"audit": "Audit",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
@@ -98,6 +100,34 @@ func resolveBurnPreset(profile string) burnPreset {
|
||||
}
|
||||
}
|
||||
|
||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||
switch profile {
|
||||
case "overnight":
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
{LoadSec: 600, IdleSec: 30},
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
{LoadSec: 600, IdleSec: 30},
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
}}
|
||||
case "acceptance":
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 300, IdleSec: 60},
|
||||
{LoadSec: 300, IdleSec: 30},
|
||||
{LoadSec: 300, IdleSec: 60},
|
||||
{LoadSec: 300, IdleSec: 30},
|
||||
}}
|
||||
default: // smoke
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 90, IdleSec: 60},
|
||||
{LoadSec: 90, IdleSec: 30},
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||
type taskQueue struct {
|
||||
mu sync.Mutex
|
||||
@@ -410,6 +440,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "platform-stress":
|
||||
opts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
|
||||
case "audit":
|
||||
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
||||
if e != nil {
|
||||
|
||||
@@ -30,8 +30,8 @@ lb config noauto \
|
||||
--linux-flavours "amd64" \
|
||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||
--memtest none \
|
||||
--iso-volume "EASY-BEE" \
|
||||
--iso-application "EASY-BEE" \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
|
||||
@@ -12,6 +12,7 @@ CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||
AUTH_KEYS=""
|
||||
REBUILD_IMAGE=0
|
||||
CLEAN_CACHE=0
|
||||
VARIANT="all"
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
|
||||
@@ -34,14 +35,23 @@ while [ $# -gt 0 ]; do
|
||||
REBUILD_IMAGE=1
|
||||
shift
|
||||
;;
|
||||
--variant)
|
||||
VARIANT="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
echo "unknown arg: $1" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys]" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$VARIANT" in
|
||||
nvidia|amd|nogpu|all) ;;
|
||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||
esac
|
||||
|
||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
echo "=== cleaning build cache: ${CACHE_DIR} ==="
|
||||
rm -rf "${CACHE_DIR:?}/go-build" \
|
||||
@@ -49,8 +59,10 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
"${CACHE_DIR:?}/tmp" \
|
||||
"${CACHE_DIR:?}/bee" \
|
||||
"${CACHE_DIR:?}/lb-packages"
|
||||
echo "=== cleaning live-build work dir: ${REPO_ROOT}/dist/live-build-work ==="
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work"
|
||||
echo "=== cleaning live-build work dirs ==="
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||
echo "=== caches cleared, proceeding with build ==="
|
||||
fi
|
||||
|
||||
@@ -108,34 +120,75 @@ else
|
||||
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
||||
fi
|
||||
|
||||
set -- \
|
||||
run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
set -- run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||
# Build base docker run args (without --authorized-keys)
|
||||
build_run_args() {
|
||||
_variant="$1"
|
||||
_auth_arg=""
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
_auth_arg="--authorized-keys /tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
fi
|
||||
echo "run --rm --privileged \
|
||||
--platform ${BUILDER_PLATFORM} \
|
||||
-v ${REPO_ROOT}:/work \
|
||||
-v ${CACHE_DIR}:/cache \
|
||||
${AUTH_KEYS:+-v ${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro} \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
fi
|
||||
${IMAGE_REF} \
|
||||
sh /work/iso/builder/build.sh --variant ${_variant} ${_auth_arg}"
|
||||
}
|
||||
|
||||
"$CONTAINER_TOOL" "$@"
|
||||
run_variant() {
|
||||
_v="$1"
|
||||
echo "=== building variant: ${_v} ==="
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
"$CONTAINER_TOOL" run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||
--authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
else
|
||||
"$CONTAINER_TOOL" run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||
fi
|
||||
}
|
||||
|
||||
case "$VARIANT" in
|
||||
nvidia)
|
||||
run_variant nvidia
|
||||
;;
|
||||
amd)
|
||||
run_variant amd
|
||||
;;
|
||||
nogpu)
|
||||
run_variant nogpu
|
||||
;;
|
||||
all)
|
||||
run_variant nvidia
|
||||
run_variant amd
|
||||
run_variant nogpu
|
||||
;;
|
||||
esac
|
||||
|
||||
@@ -13,19 +13,29 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
||||
OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
|
||||
DIST_DIR="${REPO_ROOT}/dist"
|
||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
AUTH_KEYS=""
|
||||
BEE_GPU_VENDOR="nvidia"
|
||||
|
||||
# parse args
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
||||
*) echo "unknown arg: $1"; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$BEE_GPU_VENDOR" in
|
||||
nvidia|amd|nogpu) ;;
|
||||
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
|
||||
esac
|
||||
|
||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
||||
|
||||
export BEE_GPU_VENDOR
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
export PATH="$PATH:/usr/local/go/bin"
|
||||
|
||||
@@ -132,7 +142,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
||||
apt-get install -y "linux-headers-${KVER}"
|
||||
fi
|
||||
|
||||
echo "=== bee ISO build ==="
|
||||
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||
echo ""
|
||||
@@ -141,8 +151,8 @@ echo "=== syncing git submodules ==="
|
||||
git -C "${REPO_ROOT}" submodule update --init --recursive
|
||||
|
||||
# --- compile bee binary (static, Linux amd64) ---
|
||||
# Shared between variants — built once, reused on second pass.
|
||||
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||
NEED_BUILD=1
|
||||
if [ -f "$BEE_BIN" ]; then
|
||||
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
||||
@@ -172,37 +182,41 @@ else
|
||||
echo "=== bee binary up to date, skipping build ==="
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
"${CUBLAS_VERSION}" \
|
||||
"${CUDA_USERSPACE_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}"
|
||||
# --- NVIDIA-only build steps ---
|
||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
"${CUBLAS_VERSION}" \
|
||||
"${CUDA_USERSPACE_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}"
|
||||
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
fi
|
||||
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-stress ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_STRESS_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl -lm
|
||||
echo "binary: $GPU_STRESS_BIN"
|
||||
else
|
||||
echo "=== bee-gpu-stress up to date, skipping build ==="
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-stress ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_STRESS_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl -lm
|
||||
echo "binary: $GPU_STRESS_BIN"
|
||||
else
|
||||
echo "=== bee-gpu-stress up to date, skipping build ==="
|
||||
fi
|
||||
|
||||
echo "=== preparing staged overlay ==="
|
||||
# Sync builder config into work dir, preserving lb cache (chroot + packages).
|
||||
# We do NOT rm -rf BUILD_WORK_DIR so lb can reuse its chroot on repeat builds.
|
||||
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||
|
||||
# Sync builder config into variant work dir, preserving lb cache.
|
||||
rsync -a --delete \
|
||||
--exclude='cache/' \
|
||||
--exclude='chroot/' \
|
||||
@@ -212,7 +226,10 @@ rsync -a --delete \
|
||||
--exclude='*.contents' \
|
||||
--exclude='*.files' \
|
||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
||||
# Also persist package cache to CACHE_ROOT so it survives a manual wipe of BUILD_WORK_DIR.
|
||||
|
||||
# Share deb package cache across variants.
|
||||
# Restore: populate work dir cache from shared cache before build.
|
||||
# Persist: sync back after build (done after lb build below).
|
||||
LB_PKG_CACHE="${CACHE_ROOT}/lb-packages"
|
||||
mkdir -p "${LB_PKG_CACHE}"
|
||||
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
||||
@@ -221,6 +238,7 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
|
||||
mkdir -p "${BUILD_WORK_DIR}/cache/packages.chroot"
|
||||
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
||||
fi
|
||||
|
||||
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||
rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||
@@ -231,6 +249,12 @@ rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
|
||||
# Remove NVIDIA-specific overlay files for non-nvidia variants
|
||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
|
||||
fi
|
||||
|
||||
# --- inject authorized_keys for SSH access ---
|
||||
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/root/.ssh"
|
||||
@@ -268,8 +292,11 @@ fi
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then
|
||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
fi
|
||||
|
||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||
@@ -286,100 +313,156 @@ for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
||||
fi
|
||||
done
|
||||
|
||||
# --- build NVIDIA kernel modules ---
|
||||
echo ""
|
||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||
# --- NVIDIA kernel modules and userspace libs ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
|
||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_DIR}/usr/local/lib/nvidia"
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||
mkdir -p "${OVERLAY_KMOD_DIR}"
|
||||
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||
mkdir -p "${OVERLAY_KMOD_DIR}"
|
||||
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
||||
|
||||
# Inject nvidia-smi and libnvidia-ml
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
# Inject nvidia-smi and libnvidia-ml
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
|
||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
|
||||
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
|
||||
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
|
||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
|
||||
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
|
||||
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
|
||||
fi
|
||||
|
||||
# --- build / download NCCL ---
|
||||
echo ""
|
||||
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||
|
||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# --- build nccl-tests ---
|
||||
echo ""
|
||||
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||
"${NCCL_TESTS_VERSION}" \
|
||||
"${NCCL_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}" \
|
||||
"${NVCC_VERSION}" \
|
||||
"${DEBIAN_VERSION}"
|
||||
|
||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
echo "=== all_reduce_perf injected ==="
|
||||
fi
|
||||
|
||||
# --- build / download NCCL ---
|
||||
echo ""
|
||||
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||
|
||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# --- build nccl-tests ---
|
||||
echo ""
|
||||
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||
"${NCCL_TESTS_VERSION}" \
|
||||
"${NCCL_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}" \
|
||||
"${NVCC_VERSION}" \
|
||||
"${DEBIAN_VERSION}"
|
||||
|
||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
echo "=== all_reduce_perf injected ==="
|
||||
|
||||
# --- embed build metadata ---
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
|
||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||
BUILD_DATE=${BUILD_DATE}
|
||||
GIT_COMMIT=${GIT_COMMIT}
|
||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||
NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"
|
||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||
else
|
||||
GPU_VERSION_LINE=""
|
||||
GPU_BUILD_INFO="nogpu"
|
||||
fi
|
||||
|
||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||
BUILD_DATE=${BUILD_DATE}
|
||||
GIT_COMMIT=${GIT_COMMIT}
|
||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||
${GPU_VERSION_LINE}
|
||||
EOF
|
||||
|
||||
# Write GPU vendor marker for hooks
|
||||
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||
|
||||
# Patch motd with build info
|
||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||
if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then
|
||||
sed "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "${OVERLAY_STAGE_DIR}/etc/motd" \
|
||||
> "${OVERLAY_STAGE_DIR}/etc/motd.patched"
|
||||
mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd"
|
||||
fi
|
||||
|
||||
# --- substitute version placeholders in package list ---
|
||||
sed -i \
|
||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
|
||||
# --- copy variant-specific package list, remove all other variant lists ---
|
||||
# live-build picks up ALL .list.chroot files — delete other variants to avoid conflicts.
|
||||
cp "${BUILD_WORK_DIR}/config/package-lists/bee-${BEE_GPU_VENDOR}.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
rm -f "${BUILD_WORK_DIR}/config/package-lists/bee-nvidia.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-amd.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-nogpu.list.chroot"
|
||||
|
||||
# --- remove archives for the other vendor(s) ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/rocm.key.chroot"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
rm -f "${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot"
|
||||
else
|
||||
# nogpu: remove both
|
||||
rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/rocm.key.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot"
|
||||
fi
|
||||
|
||||
# --- substitute version placeholders in package list and archive ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
sed -i \
|
||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
sed -i \
|
||||
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
|
||||
-e "s/%%ROCM_BANDWIDTH_TEST_VERSION%%/${ROCM_BANDWIDTH_TEST_VERSION}/g" \
|
||||
-e "s/%%ROCM_VALIDATION_SUITE_VERSION%%/${ROCM_VALIDATION_SUITE_VERSION}/g" \
|
||||
-e "s/%%ROCBLAS_VERSION%%/${ROCBLAS_VERSION}/g" \
|
||||
-e "s/%%ROCRAND_VERSION%%/${ROCRAND_VERSION}/g" \
|
||||
-e "s/%%HIP_RUNTIME_AMD_VERSION%%/${HIP_RUNTIME_AMD_VERSION}/g" \
|
||||
-e "s/%%HIPBLASLT_VERSION%%/${HIPBLASLT_VERSION}/g" \
|
||||
-e "s/%%COMGR_VERSION%%/${COMGR_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
if [ -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" ]; then
|
||||
sed -i \
|
||||
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- sync overlay into live-build includes.chroot ---
|
||||
LB_DIR="${BUILD_WORK_DIR}"
|
||||
@@ -395,20 +478,31 @@ fi
|
||||
|
||||
# --- build ISO using live-build ---
|
||||
echo ""
|
||||
echo "=== building ISO (live-build) ==="
|
||||
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
||||
|
||||
# Export for auto/config
|
||||
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
||||
export BEE_GPU_VENDOR_UPPER
|
||||
|
||||
cd "${LB_DIR}"
|
||||
lb clean 2>&1 | tail -3
|
||||
lb config 2>&1 | tail -5
|
||||
lb build 2>&1
|
||||
|
||||
# --- persist deb package cache back to shared location ---
|
||||
# This allows the second variant to reuse all downloaded packages.
|
||||
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
||||
rsync -a "${BUILD_WORK_DIR}/cache/packages.chroot/" "${LB_PKG_CACHE}/"
|
||||
echo "=== package cache synced to ${LB_PKG_CACHE} ==="
|
||||
fi
|
||||
|
||||
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
||||
ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
||||
if [ -f "$ISO_RAW" ]; then
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
echo "=== done ==="
|
||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
||||
echo "ISO: $ISO_OUT"
|
||||
if command -v stat >/dev/null 2>&1; then
|
||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||
|
||||
@@ -10,12 +10,12 @@ echo " ╚══════╝╚═╝ ╚═╝╚══════╝
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (load to RAM)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,9 @@ set -e
|
||||
|
||||
echo "=== bee chroot setup ==="
|
||||
|
||||
GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)
|
||||
echo "=== GPU vendor: ${GPU_VENDOR} ==="
|
||||
|
||||
ensure_bee_console_user() {
|
||||
if id bee >/dev/null 2>&1; then
|
||||
usermod -d /home/bee -s /bin/bash bee 2>/dev/null || true
|
||||
@@ -21,10 +24,8 @@ ensure_bee_console_user() {
|
||||
|
||||
ensure_bee_console_user
|
||||
|
||||
# Enable bee services
|
||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||
# Enable common bee services
|
||||
systemctl enable bee-network.service
|
||||
systemctl enable bee-nvidia.service
|
||||
systemctl enable bee-preflight.service
|
||||
systemctl enable bee-audit.service
|
||||
systemctl enable bee-web.service
|
||||
@@ -36,25 +37,34 @@ systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
|
||||
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
|
||||
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
||||
|
||||
# Enable GPU-vendor specific services
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||
systemctl enable bee-nvidia.service
|
||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||
for tool in rocm-smi rocm-bandwidth-test rvs; do
|
||||
if [ ! -e /usr/local/bin/${tool} ]; then
|
||||
bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
|
||||
[ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
|
||||
fi
|
||||
done
|
||||
fi
|
||||
# nogpu: no GPU services needed
|
||||
|
||||
# Ensure scripts are executable
|
||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Reload udev rules
|
||||
udevadm control --reload-rules 2>/dev/null || true
|
||||
|
||||
# rocm symlinks (packages install to /opt/rocm-*/bin/)
|
||||
for tool in rocm-smi rocm-bandwidth-test rvs; do
|
||||
if [ ! -e /usr/local/bin/${tool} ]; then
|
||||
bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
|
||||
[ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
|
||||
fi
|
||||
done
|
||||
|
||||
# Create export directory
|
||||
mkdir -p /appdata/bee/export
|
||||
|
||||
@@ -62,4 +72,4 @@ if [ -f /etc/sudoers.d/bee ]; then
|
||||
chmod 0440 /etc/sudoers.d/bee
|
||||
fi
|
||||
|
||||
echo "=== bee chroot setup complete ==="
|
||||
echo "=== bee chroot setup complete (${GPU_VENDOR}) ==="
|
||||
|
||||
@@ -2,12 +2,75 @@
|
||||
# Copy memtest86+ binaries from chroot /boot into the ISO boot directory
|
||||
# so GRUB can chainload them directly (they must be on the ISO filesystem,
|
||||
# not inside the squashfs).
|
||||
#
|
||||
# Primary: copy from chroot/boot/ (populated by package postinst).
|
||||
# Naming fallbacks:
|
||||
# Debian Bookworm: /boot/memtest86+ — EFI PE64 (no extension)
|
||||
# /boot/memtest86+.bin — legacy binary
|
||||
# Upstream/Ubuntu: /boot/memtest86+x64.efi, /boot/memtest86+x64.bin, etc.
|
||||
# Last resort: extract directly from the cached .deb if postinst didn't place
|
||||
# the files (happens in chroot environments without grub triggers).
|
||||
set -e
|
||||
|
||||
for f in memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi; do
|
||||
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi"
|
||||
|
||||
# Ensure destination directory exists (absence caused silent copy failures).
|
||||
mkdir -p binary/boot
|
||||
|
||||
echo "memtest: scanning chroot/boot/ for memtest files:"
|
||||
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files in chroot/boot/"
|
||||
|
||||
# Primary path: copy upstream-named files from chroot/boot/
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
src="chroot/boot/${f}"
|
||||
if [ -f "${src}" ]; then
|
||||
cp "${src}" "binary/boot/${f}"
|
||||
echo "memtest: copied ${f} to binary/boot/"
|
||||
echo "memtest: copied ${f} from chroot/boot/"
|
||||
fi
|
||||
done
|
||||
|
||||
# Debian Bookworm naming fallback: /boot/memtest86+ (no extension) is the EFI binary.
|
||||
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "chroot/boot/memtest86+" ]; then
|
||||
cp "chroot/boot/memtest86+" "binary/boot/memtest86+x64.efi"
|
||||
echo "memtest: copied /boot/memtest86+ as memtest86+x64.efi (Debian naming)"
|
||||
fi
|
||||
if [ ! -f "binary/boot/memtest86+x64.bin" ] && [ -f "chroot/boot/memtest86+.bin" ]; then
|
||||
cp "chroot/boot/memtest86+.bin" "binary/boot/memtest86+x64.bin"
|
||||
echo "memtest: copied /boot/memtest86+.bin as memtest86+x64.bin (Debian naming)"
|
||||
fi
|
||||
|
||||
# Last resort: if EFI binary still missing, extract from cached .deb
|
||||
if [ ! -f "binary/boot/memtest86+x64.efi" ]; then
|
||||
echo "memtest: EFI binary missing — attempting extraction from .deb cache"
|
||||
deb=$(find chroot/var/cache/apt/archives/ chroot/var/lib/apt/lists/ \
|
||||
-name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null \
|
||||
| head -1)
|
||||
if [ -z "$deb" ]; then
|
||||
deb=$(find cache/ -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null | head -1)
|
||||
fi
|
||||
if [ -n "$deb" ]; then
|
||||
echo "memtest: extracting from ${deb}"
|
||||
EXTRACT_DIR="$(mktemp -d)"
|
||||
dpkg-deb -x "${deb}" "${EXTRACT_DIR}"
|
||||
echo "memtest: files found in .deb:"
|
||||
find "${EXTRACT_DIR}/boot" -type f 2>/dev/null || echo " (none in /boot)"
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
src="${EXTRACT_DIR}/boot/${f}"
|
||||
if [ -f "${src}" ]; then
|
||||
cp "${src}" "binary/boot/${f}"
|
||||
echo "memtest: extracted ${f} from .deb"
|
||||
fi
|
||||
done
|
||||
# Debian naming fallback inside .deb as well
|
||||
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "${EXTRACT_DIR}/boot/memtest86+" ]; then
|
||||
cp "${EXTRACT_DIR}/boot/memtest86+" "binary/boot/memtest86+x64.efi"
|
||||
echo "memtest: extracted /boot/memtest86+ as memtest86+x64.efi from .deb"
|
||||
fi
|
||||
rm -rf "${EXTRACT_DIR}"
|
||||
else
|
||||
echo "memtest: WARNING: no memtest86+ .deb found in cache — memtest will not be available"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "memtest: binary/boot/ contents:"
|
||||
ls binary/boot/memtest* 2>/dev/null || echo " (none)"
|
||||
|
||||
9
iso/builder/config/package-lists/bee-amd.list.chroot
Normal file
9
iso/builder/config/package-lists/bee-amd.list.chroot
Normal file
@@ -0,0 +1,9 @@
|
||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||
rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
|
||||
rocblas=%%ROCBLAS_VERSION%%
|
||||
rocrand=%%ROCRAND_VERSION%%
|
||||
hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
|
||||
hipblaslt=%%HIPBLASLT_VERSION%%
|
||||
comgr=%%COMGR_VERSION%%
|
||||
1
iso/builder/config/package-lists/bee-nogpu.list.chroot
Normal file
1
iso/builder/config/package-lists/bee-nogpu.list.chroot
Normal file
@@ -0,0 +1 @@
|
||||
# No GPU variant — no NVIDIA, no AMD/ROCm packages
|
||||
2
iso/builder/config/package-lists/bee-nvidia.list.chroot
Normal file
2
iso/builder/config/package-lists/bee-nvidia.list.chroot
Normal file
@@ -0,0 +1,2 @@
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
||||
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
||||
@@ -21,8 +21,14 @@ openssh-server
|
||||
# Disk installer
|
||||
squashfs-tools
|
||||
parted
|
||||
# grub-pc / grub-efi-amd64 provide grub-install + grub2-common (required for chroot install).
|
||||
# The -bin variants only carry binary modules and do NOT include grub-install itself.
|
||||
grub-pc
|
||||
grub-pc-bin
|
||||
grub-efi-amd64
|
||||
grub-efi-amd64-bin
|
||||
grub-efi-amd64-signed
|
||||
shim-signed
|
||||
|
||||
# Filesystem support for USB export targets
|
||||
exfatprogs
|
||||
@@ -39,6 +45,7 @@ vim-tiny
|
||||
mc
|
||||
htop
|
||||
nvtop
|
||||
btop
|
||||
sudo
|
||||
zstd
|
||||
mstflint
|
||||
@@ -72,18 +79,5 @@ firmware-bnx2x
|
||||
firmware-cavium
|
||||
firmware-qlogic
|
||||
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
||||
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
||||
|
||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||
rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
|
||||
rocblas=%%ROCBLAS_VERSION%%
|
||||
rocrand=%%ROCRAND_VERSION%%
|
||||
hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
|
||||
hipblaslt=%%HIPBLASLT_VERSION%%
|
||||
comgr=%%COMGR_VERSION%%
|
||||
|
||||
# glibc compat helpers (for any external binaries that need it)
|
||||
libc6
|
||||
|
||||
@@ -39,7 +39,7 @@ info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
||||
# --- PATH & binaries ---
|
||||
echo "-- PATH & binaries --"
|
||||
for tool in dmidecode smartctl nvme ipmitool lspci bee; do
|
||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||
if p=$(PATH="/usr/local/bin:/usr/sbin:/sbin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||
ok "$tool found: $p"
|
||||
else
|
||||
fail "$tool: NOT FOUND"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
|
||||
export PATH="$PATH:/usr/local/bin:/usr/sbin:/sbin:/opt/rocm/bin:/opt/rocm/sbin"
|
||||
|
||||
# Print web UI URLs on the local console at login.
|
||||
if [ -z "${SSH_CONNECTION:-}" ] \
|
||||
|
||||
@@ -1,14 +1,25 @@
|
||||
[Unit]
|
||||
Description=Bee: run hardware audit
|
||||
After=bee-network.service bee-nvidia.service bee-preflight.service
|
||||
Before=bee-web.service
|
||||
Description=Bee: schedule startup hardware audit via task queue
|
||||
# Start AFTER bee-web, not before — bee-web must not wait for audit.
|
||||
After=bee-web.service
|
||||
Wants=bee-web.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /bin/sh -c '/usr/local/bin/bee audit --runtime livecd --output file:/appdata/bee/export/bee-audit.json; rc=$?; if [ "$rc" -ne 0 ]; then echo "[bee-audit] WARN: audit exited with rc=$rc"; fi; exit 0'
|
||||
RemainAfterExit=yes
|
||||
# Wait up to 90s for bee-web to respond on /healthz, then sleep 60s for
|
||||
# the system to settle (GPU drivers, sensors), then enqueue the audit as
|
||||
# a background task so it appears in the task list and logs.
|
||||
ExecStart=/bin/sh -c '\
|
||||
i=0; \
|
||||
while [ $i -lt 90 ]; do \
|
||||
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi; \
|
||||
sleep 1; i=$((i+1)); \
|
||||
done; \
|
||||
sleep 60; \
|
||||
curl -sf -X POST http://localhost/api/audit/run >/dev/null'
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
[Unit]
|
||||
Description=Bee: hardware audit web viewer
|
||||
After=bee-network.service
|
||||
Wants=bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
@@ -11,6 +9,9 @@ RestartSec=2
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
LimitMEMLOCK=infinity
|
||||
# Keep the web server responsive during GPU/CPU stress (children inherit nice+10
|
||||
# via Setpriority in runCmdJob, but the bee-web parent stays at 0).
|
||||
Nice=0
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
|
||||
@@ -4,3 +4,6 @@
|
||||
RestartSec=10
|
||||
StartLimitIntervalSec=60
|
||||
StartLimitBurst=3
|
||||
# Raise scheduling priority of the X server so the graphical console (KVM/IPMI)
|
||||
# stays responsive during GPU/CPU stress tests running at nice+10.
|
||||
Nice=-5
|
||||
|
||||
@@ -158,20 +158,56 @@ mount --bind /sys "${MOUNT_ROOT}/sys"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
log "--- Step 7/7: Installing GRUB bootloader ---"
|
||||
|
||||
# Helper: run a chroot command, log all output, return its exit code.
|
||||
# Needed because "cmd | while" pipelines hide the exit code of cmd.
|
||||
chroot_log() {
|
||||
local rc=0
|
||||
local out
|
||||
out=$(chroot "$MOUNT_ROOT" "$@" 2>&1) || rc=$?
|
||||
echo "$out" | while IFS= read -r line; do log " $line"; done
|
||||
return $rc
|
||||
}
|
||||
|
||||
if [ "$UEFI" = "1" ]; then
|
||||
chroot "$MOUNT_ROOT" grub-install \
|
||||
--target=x86_64-efi \
|
||||
--efi-directory=/boot/efi \
|
||||
--bootloader-id=bee \
|
||||
--recheck 2>&1 | while read -r line; do log " $line"; done || true
|
||||
# Primary attempt: write EFI NVRAM entry (requires writable efivars)
|
||||
if ! chroot_log grub-install \
|
||||
--target=x86_64-efi \
|
||||
--efi-directory=/boot/efi \
|
||||
--bootloader-id=bee \
|
||||
--recheck; then
|
||||
log " WARNING: grub-install (with NVRAM) failed — retrying with --no-nvram"
|
||||
# --no-nvram: write grubx64.efi but skip EFI variable update.
|
||||
# Needed on headless servers where efivars is read-only or unavailable.
|
||||
chroot_log grub-install \
|
||||
--target=x86_64-efi \
|
||||
--efi-directory=/boot/efi \
|
||||
--bootloader-id=bee \
|
||||
--no-nvram \
|
||||
--recheck || log " WARNING: grub-install --no-nvram also failed — check logs"
|
||||
fi
|
||||
|
||||
# Always install the UEFI fallback path EFI/BOOT/BOOTX64.EFI.
|
||||
# Many UEFI implementations (especially server BMCs and some firmware)
|
||||
# ignore the NVRAM boot entry and only look for this path.
|
||||
GRUB_EFI="${MOUNT_ROOT}/boot/efi/EFI/bee/grubx64.efi"
|
||||
FALLBACK_DIR="${MOUNT_ROOT}/boot/efi/EFI/BOOT"
|
||||
if [ -f "$GRUB_EFI" ]; then
|
||||
mkdir -p "$FALLBACK_DIR"
|
||||
cp "$GRUB_EFI" "${FALLBACK_DIR}/BOOTX64.EFI"
|
||||
log " Fallback EFI binary installed: EFI/BOOT/BOOTX64.EFI"
|
||||
else
|
||||
log " WARNING: grubx64.efi not found at $GRUB_EFI — UEFI fallback path not set"
|
||||
fi
|
||||
else
|
||||
chroot "$MOUNT_ROOT" grub-install \
|
||||
chroot_log grub-install \
|
||||
--target=i386-pc \
|
||||
--recheck \
|
||||
"$DEVICE" 2>&1 | while read -r line; do log " $line"; done || true
|
||||
"$DEVICE" || log " WARNING: grub-install (BIOS) failed — check logs"
|
||||
fi
|
||||
chroot "$MOUNT_ROOT" update-grub 2>&1 | while read -r line; do log " $line"; done || true
|
||||
log " GRUB installed."
|
||||
|
||||
chroot_log update-grub || log " WARNING: update-grub failed — check logs"
|
||||
log " GRUB step complete."
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Cleanup
|
||||
|
||||
@@ -67,7 +67,7 @@ case "$nvidia_mode" in
|
||||
load_module nvidia-modeset || true
|
||||
load_module nvidia-uvm || true
|
||||
;;
|
||||
gsp-off|safe|*)
|
||||
gsp-off|safe)
|
||||
# NVIDIA documents that GSP firmware is enabled by default on newer GPUs and can
|
||||
# be disabled via NVreg_EnableGpuFirmware=0. Safe mode keeps the live ISO on the
|
||||
# conservative path for platforms where full boot-time GSP init is unstable.
|
||||
@@ -76,6 +76,15 @@ case "$nvidia_mode" in
|
||||
fi
|
||||
log "GSP-off mode: skipping nvidia-modeset and nvidia-uvm during boot"
|
||||
;;
|
||||
nomsi|*)
|
||||
# nomsi: disable MSI-X/MSI interrupts — use when RmInitAdapter fails with
|
||||
# "Failed to enable MSI-X" on one or more GPUs (IOMMU group interrupt limits).
|
||||
# NVreg_EnableMSI=0 forces legacy INTx interrupts for all GPUs.
|
||||
if ! load_module nvidia NVreg_EnableGpuFirmware=0 NVreg_EnableMSI=0; then
|
||||
exit 1
|
||||
fi
|
||||
log "nomsi mode: MSI-X disabled (NVreg_EnableMSI=0), skipping nvidia-modeset and nvidia-uvm"
|
||||
;;
|
||||
esac
|
||||
|
||||
# Create /dev/nvidia* device nodes (udev rules absent since we use .run installer)
|
||||
|
||||
@@ -8,13 +8,16 @@ xset -dpms
|
||||
xset s noblank
|
||||
|
||||
tint2 &
|
||||
# Wait for bee-web to bind (Go starts fast, usually <2s)
|
||||
|
||||
# Wait up to 120s for bee-web to bind. The web server starts immediately now
|
||||
# (audit is deferred), so this should succeed in a few seconds on most hardware.
|
||||
i=0
|
||||
while [ $i -lt 30 ]; do
|
||||
while [ $i -lt 120 ]; do
|
||||
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi
|
||||
sleep 1
|
||||
i=$((i+1))
|
||||
done
|
||||
|
||||
chromium \
|
||||
--disable-infobars \
|
||||
--disable-translate \
|
||||
|
||||
Reference in New Issue
Block a user