Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ace1a9dba6 | |||
| 905c581ece | |||
| 7c2a0135d2 | |||
| 407c1cd1c4 | |||
| e15bcc91c5 | |||
| 98f0cf0d52 |
@@ -114,10 +114,13 @@ type satRunner interface {
|
||||
DetectGPUVendor() string
|
||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
}
|
||||
|
||||
@@ -577,6 +580,20 @@ func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
@@ -611,6 +628,13 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||
body := "Results: " + path
|
||||
|
||||
@@ -181,6 +181,14 @@ func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
@@ -195,6 +203,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
476
audit/internal/platform/platform_stress.go
Normal file
476
audit/internal/platform/platform_stress.go
Normal file
@@ -0,0 +1,476 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"archive/tar"
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// PlatformStressCycle defines one load+idle cycle.
|
||||
type PlatformStressCycle struct {
|
||||
LoadSec int // seconds of simultaneous CPU+GPU stress
|
||||
IdleSec int // seconds of idle monitoring after load cut
|
||||
}
|
||||
|
||||
// PlatformStressOptions controls the thermal cycling test.
|
||||
type PlatformStressOptions struct {
|
||||
Cycles []PlatformStressCycle
|
||||
}
|
||||
|
||||
// platformStressRow is one second of telemetry.
|
||||
type platformStressRow struct {
|
||||
ElapsedSec float64
|
||||
Cycle int
|
||||
Phase string // "load" | "idle"
|
||||
CPULoadPct float64
|
||||
MaxCPUTempC float64
|
||||
MaxGPUTempC float64
|
||||
SysPowerW float64
|
||||
FanMinRPM float64
|
||||
FanMaxRPM float64
|
||||
GPUThrottled bool
|
||||
}
|
||||
|
||||
// RunPlatformStress runs repeated load+idle thermal cycling.
|
||||
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
|
||||
// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
|
||||
func (s *System) RunPlatformStress(
|
||||
ctx context.Context,
|
||||
baseDir string,
|
||||
opts PlatformStressOptions,
|
||||
logFunc func(string),
|
||||
) (string, error) {
|
||||
if logFunc == nil {
|
||||
logFunc = func(string) {}
|
||||
}
|
||||
if len(opts.Cycles) == 0 {
|
||||
return "", fmt.Errorf("no cycles defined")
|
||||
}
|
||||
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||
}
|
||||
|
||||
stamp := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||
}
|
||||
|
||||
vendor := s.DetectGPUVendor()
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
|
||||
|
||||
var rows []platformStressRow
|
||||
start := time.Now()
|
||||
|
||||
var analyses []cycleAnalysis
|
||||
|
||||
for i, cycle := range opts.Cycles {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
cycleNum := i + 1
|
||||
logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
|
||||
|
||||
// ── LOAD PHASE ───────────────────────────────────────────────────────
|
||||
loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// CPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
|
||||
// GPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
|
||||
// Monitoring goroutine for load phase
|
||||
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||
for _, r := range loadRows {
|
||||
logFunc(formatPlatformRow(r))
|
||||
}
|
||||
rows = append(rows, loadRows...)
|
||||
loadCancel()
|
||||
wg.Wait()
|
||||
|
||||
if len(loadRows) > 0 {
|
||||
logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
|
||||
}
|
||||
|
||||
// ── IDLE PHASE ───────────────────────────────────────────────────────
|
||||
idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
|
||||
idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
|
||||
for _, r := range idleRows {
|
||||
logFunc(formatPlatformRow(r))
|
||||
}
|
||||
rows = append(rows, idleRows...)
|
||||
idleCancel()
|
||||
|
||||
// Per-cycle analysis
|
||||
an := analyzePlatformCycle(loadRows, idleRows)
|
||||
analyses = append(analyses, an)
|
||||
logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
|
||||
cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
|
||||
}
|
||||
|
||||
// Write CSV
|
||||
csvData := writePlatformCSV(rows)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644)
|
||||
|
||||
// Write summary
|
||||
summary := writePlatformSummary(opts, analyses)
|
||||
logFunc("--- Summary ---")
|
||||
for _, line := range strings.Split(summary, "\n") {
|
||||
if line != "" {
|
||||
logFunc(line)
|
||||
}
|
||||
}
|
||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||
|
||||
// Pack tar.gz
|
||||
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
||||
if err := packPlatformDir(runDir, archivePath); err != nil {
|
||||
return "", fmt.Errorf("pack archive: %w", err)
|
||||
}
|
||||
_ = os.RemoveAll(runDir)
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
// collectPhase samples live metrics every second until ctx is done.
|
||||
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
|
||||
var rows []platformStressRow
|
||||
ticker := time.NewTicker(time.Second)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return rows
|
||||
case <-ticker.C:
|
||||
sample := SampleLiveMetrics()
|
||||
rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
|
||||
r := platformStressRow{
|
||||
ElapsedSec: time.Since(testStart).Seconds(),
|
||||
Cycle: cycle,
|
||||
Phase: phase,
|
||||
CPULoadPct: s.CPULoadPct,
|
||||
SysPowerW: s.PowerW,
|
||||
}
|
||||
for _, t := range s.Temps {
|
||||
switch t.Group {
|
||||
case "cpu":
|
||||
if t.Celsius > r.MaxCPUTempC {
|
||||
r.MaxCPUTempC = t.Celsius
|
||||
}
|
||||
case "gpu":
|
||||
if t.Celsius > r.MaxGPUTempC {
|
||||
r.MaxGPUTempC = t.Celsius
|
||||
}
|
||||
}
|
||||
}
|
||||
for _, g := range s.GPUs {
|
||||
if g.TempC > r.MaxGPUTempC {
|
||||
r.MaxGPUTempC = g.TempC
|
||||
}
|
||||
}
|
||||
if len(s.Fans) > 0 {
|
||||
r.FanMinRPM = s.Fans[0].RPM
|
||||
r.FanMaxRPM = s.Fans[0].RPM
|
||||
for _, f := range s.Fans[1:] {
|
||||
if f.RPM < r.FanMinRPM {
|
||||
r.FanMinRPM = f.RPM
|
||||
}
|
||||
if f.RPM > r.FanMaxRPM {
|
||||
r.FanMaxRPM = f.RPM
|
||||
}
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
func formatPlatformRow(r platformStressRow) string {
|
||||
throttle := ""
|
||||
if r.GPUThrottled {
|
||||
throttle = " THROTTLE"
|
||||
}
|
||||
fans := ""
|
||||
if r.FanMinRPM > 0 {
|
||||
fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
|
||||
}
|
||||
return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
|
||||
r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
|
||||
}
|
||||
|
||||
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
|
||||
var an cycleAnalysis
|
||||
for _, r := range loadRows {
|
||||
if r.MaxCPUTempC > an.maxCPUTemp {
|
||||
an.maxCPUTemp = r.MaxCPUTempC
|
||||
}
|
||||
if r.MaxGPUTempC > an.maxGPUTemp {
|
||||
an.maxGPUTemp = r.MaxGPUTempC
|
||||
}
|
||||
if r.SysPowerW > an.maxPower {
|
||||
an.maxPower = r.SysPowerW
|
||||
}
|
||||
if r.GPUThrottled {
|
||||
an.throttled = true
|
||||
}
|
||||
}
|
||||
// Fan RPM at cut = avg of last 5 load rows
|
||||
if n := len(loadRows); n > 0 {
|
||||
window := loadRows
|
||||
if n > 5 {
|
||||
window = loadRows[n-5:]
|
||||
}
|
||||
var sum float64
|
||||
var cnt int
|
||||
for _, r := range window {
|
||||
if r.FanMinRPM > 0 {
|
||||
sum += (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||
cnt++
|
||||
}
|
||||
}
|
||||
if cnt > 0 {
|
||||
an.fanAtCutAvg = sum / float64(cnt)
|
||||
}
|
||||
}
|
||||
// Fan RPM min in first 15s of idle
|
||||
an.fanMin15s = an.fanAtCutAvg
|
||||
var cutElapsed float64
|
||||
if len(loadRows) > 0 {
|
||||
cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
|
||||
}
|
||||
for _, r := range idleRows {
|
||||
if r.ElapsedSec > cutElapsed+15 {
|
||||
break
|
||||
}
|
||||
avg := (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||
if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
|
||||
an.fanMin15s = avg
|
||||
}
|
||||
}
|
||||
if an.fanAtCutAvg > 0 {
|
||||
an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
|
||||
}
|
||||
return an
|
||||
}
|
||||
|
||||
type cycleAnalysis struct {
|
||||
maxCPUTemp float64
|
||||
maxGPUTemp float64
|
||||
maxPower float64
|
||||
throttled bool
|
||||
fanAtCutAvg float64
|
||||
fanMin15s float64
|
||||
fanDropPct float64
|
||||
}
|
||||
|
||||
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
|
||||
fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))
|
||||
|
||||
totalThrottle := 0
|
||||
totalFanWarn := 0
|
||||
for i, an := range analyses {
|
||||
cycle := opts.Cycles[i]
|
||||
fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
|
||||
fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
|
||||
fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
|
||||
fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
|
||||
if an.throttled {
|
||||
fmt.Fprintf(&b, " Throttle: DETECTED\n")
|
||||
totalThrottle++
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Throttle: none\n")
|
||||
}
|
||||
if an.fanAtCutAvg > 0 {
|
||||
fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
|
||||
fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
|
||||
if an.fanDropPct > 20 {
|
||||
fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
|
||||
totalFanWarn++
|
||||
} else {
|
||||
fmt.Fprintf(&b, " Fan response: OK\n")
|
||||
}
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
|
||||
if totalThrottle > 0 {
|
||||
fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
|
||||
} else if totalFanWarn > 0 {
|
||||
fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
|
||||
} else {
|
||||
fmt.Fprintf(&b, "Overall: PASS\n")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func writePlatformCSV(rows []platformStressRow) []byte {
|
||||
var buf bytes.Buffer
|
||||
w := csv.NewWriter(&buf)
|
||||
_ = w.Write([]string{
|
||||
"elapsed_sec", "cycle", "phase",
|
||||
"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
|
||||
"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
|
||||
})
|
||||
for _, r := range rows {
|
||||
throttled := "0"
|
||||
if r.GPUThrottled {
|
||||
throttled = "1"
|
||||
}
|
||||
_ = w.Write([]string{
|
||||
strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
|
||||
strconv.Itoa(r.Cycle),
|
||||
r.Phase,
|
||||
strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
|
||||
strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
|
||||
strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
|
||||
throttled,
|
||||
})
|
||||
}
|
||||
w.Flush()
|
||||
return buf.Bytes()
|
||||
}
|
||||
|
||||
// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled.
|
||||
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
path, err := satLookPath("stressapptest")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||
}
|
||||
// Use a very long duration; the context timeout will kill it at the right time.
|
||||
cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
if err := cmd.Start(); err != nil {
|
||||
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||
}
|
||||
return cmd, nil
|
||||
}
|
||||
|
||||
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
||||
switch strings.ToLower(vendor) {
|
||||
case "amd":
|
||||
return buildAMDGPUStressCmd(ctx)
|
||||
case "nvidia":
|
||||
return buildNvidiaGPUStressCmd(ctx)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
rvsArgs, err := resolveRVSCommand()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
rvsPath := rvsArgs[0]
|
||||
cfg := `actions:
|
||||
- name: gst_platform
|
||||
device: all
|
||||
module: gst
|
||||
parallel: true
|
||||
duration: 86400000
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
`
|
||||
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = cmd.Start()
|
||||
return cmd
|
||||
}
|
||||
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
path, err := satLookPath("bee-gpu-stress")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = cmd.Start()
|
||||
return cmd
|
||||
}
|
||||
|
||||
func packPlatformDir(dir, dest string) error {
|
||||
f, err := os.Create(dest)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
gz := gzip.NewWriter(f)
|
||||
defer gz.Close()
|
||||
tw := tar.NewWriter(gz)
|
||||
defer tw.Close()
|
||||
|
||||
entries, err := os.ReadDir(dir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
base := filepath.Base(dir)
|
||||
for _, e := range entries {
|
||||
if e.IsDir() {
|
||||
continue
|
||||
}
|
||||
fpath := filepath.Join(dir, e.Name())
|
||||
data, err := os.ReadFile(fpath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
hdr := &tar.Header{
|
||||
Name: filepath.Join(base, e.Name()),
|
||||
Size: int64(len(data)),
|
||||
Mode: 0644,
|
||||
ModTime: time.Now(),
|
||||
}
|
||||
if err := tw.WriteHeader(hdr); err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := tw.Write(data); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -136,6 +136,54 @@ func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFu
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
|
||||
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
cfgFile := "/tmp/bee-amd-mem.conf"
|
||||
cfg := `actions:
|
||||
- name: mem_integrity
|
||||
device: all
|
||||
module: mem
|
||||
parallel: true
|
||||
duration: 60000
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size: 8640
|
||||
`
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
|
||||
func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
cfgFile := "/tmp/bee-amd-babel.conf"
|
||||
cfg := `actions:
|
||||
- name: babel_mem_bw
|
||||
device: all
|
||||
module: babel
|
||||
parallel: true
|
||||
copy_matrix: true
|
||||
target_stress: 90
|
||||
matrix_size: 134217728
|
||||
`
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||
{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
||||
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
||||
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
@@ -146,8 +194,16 @@ func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationS
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Write RVS GST config to a temp file
|
||||
rvsCfg := fmt.Sprintf(`actions:
|
||||
// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
|
||||
rvsCfg := amdStressRVSConfig(seconds)
|
||||
cfgFile := "/tmp/bee-amd-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
|
||||
}
|
||||
|
||||
func amdStressRVSConfig(seconds int) string {
|
||||
return fmt.Sprintf(`actions:
|
||||
- name: gst_stress
|
||||
device: all
|
||||
module: gst
|
||||
@@ -159,15 +215,15 @@ func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationS
|
||||
matrix_size_b: 8640
|
||||
matrix_size_c: 8640
|
||||
`, seconds*1000)
|
||||
cfgFile := "/tmp/bee-amd-gst.conf"
|
||||
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
||||
}
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", []satJob{
|
||||
func amdStressJobs(seconds int, cfgFile string) []satJob {
|
||||
return []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
||||
}, logFunc)
|
||||
}
|
||||
}
|
||||
|
||||
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@@ -38,6 +39,47 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cfg := amdStressRVSConfig(123)
|
||||
if !strings.Contains(cfg, "module: gst") {
|
||||
t.Fatalf("config missing gst module:\n%s", cfg)
|
||||
}
|
||||
if strings.Contains(cfg, "module: mem") {
|
||||
t.Fatalf("config should not include mem module:\n%s", cfg)
|
||||
}
|
||||
if !strings.Contains(cfg, "copy_matrix: false") {
|
||||
t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
|
||||
}
|
||||
if strings.Count(cfg, "duration: 123000") != 1 {
|
||||
t.Fatalf("config should apply duration once:\n%s", cfg)
|
||||
}
|
||||
for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
|
||||
if !strings.Contains(cfg, field) {
|
||||
t.Fatalf("config missing %s:\n%s", field, cfg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
|
||||
if len(jobs) != 4 {
|
||||
t.Fatalf("jobs=%d want 4", len(jobs))
|
||||
}
|
||||
if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
|
||||
t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
|
||||
}
|
||||
if got := jobs[2].cmd[0]; got != "rvs" {
|
||||
t.Fatalf("jobs[2]=%q want rvs", got)
|
||||
}
|
||||
if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
|
||||
t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
||||
|
||||
@@ -599,10 +599,9 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
|
||||
case <-r.Context().Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
sample := platform.SampleLiveMetrics()
|
||||
h.feedRings(sample)
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
sample, ok := h.latestMetric()
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
b, err := json.Marshal(sample)
|
||||
if err != nil {
|
||||
|
||||
@@ -3,7 +3,6 @@ package webui
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"time"
|
||||
@@ -13,7 +12,6 @@ import (
|
||||
)
|
||||
|
||||
const metricsDBPath = "/appdata/bee/metrics.db"
|
||||
const metricsKeepDuration = 24 * time.Hour
|
||||
|
||||
// MetricsDB persists live metric samples to SQLite.
|
||||
type MetricsDB struct {
|
||||
@@ -116,11 +114,18 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
}
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
// It reconstructs LiveMetricSample from the normalized tables.
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
rows, err := m.db.Query(
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n,
|
||||
)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||
}
|
||||
|
||||
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
||||
rows, err := m.db.Query(query, args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -257,14 +262,6 @@ func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return samples, nil
|
||||
}
|
||||
|
||||
// Prune deletes samples older than keepDuration.
|
||||
func (m *MetricsDB) Prune(keepDuration time.Duration) {
|
||||
cutoff := time.Now().Add(-keepDuration).Unix()
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
_, _ = m.db.Exec(fmt.Sprintf("DELETE FROM %s WHERE ts < ?", table), cutoff)
|
||||
}
|
||||
}
|
||||
|
||||
// ExportCSV writes all sys+gpu data as CSV to w.
|
||||
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
rows, err := m.db.Query(`
|
||||
|
||||
@@ -494,7 +494,11 @@ func renderValidate() string {
|
||||
renderSATCard("memory", "Memory", "") +
|
||||
renderSATCard("storage", "Storage", "") +
|
||||
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
|
||||
renderSATCard("amd", "AMD GPU", "") +
|
||||
renderSATCard("amd", "AMD GPU", `<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button id="sat-btn-amd-mem" class="btn" type="button" onclick="runSAT('amd-mem')">MEM Integrity</button>
|
||||
<button id="sat-btn-amd-bandwidth" class="btn" type="button" onclick="runSAT('amd-bandwidth')">MEM Bandwidth</button>
|
||||
</div>
|
||||
<p style="color:var(--muted);font-size:12px;margin:0">Additional AMD memory diagnostics: RVS MEM for integrity and BABEL + rocm-bandwidth-test for memory/interconnect bandwidth.</p>`) +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
@@ -505,7 +509,7 @@ let satES = null;
|
||||
function runSAT(target) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
const body = {};
|
||||
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
|
||||
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||
@@ -524,7 +528,7 @@ function runSAT(target) {
|
||||
}
|
||||
function runAllSAT() {
|
||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||
const targets = ['nvidia','memory','storage','cpu','amd'];
|
||||
const targets = ['nvidia','memory','storage','cpu','amd','amd-mem','amd-bandwidth'];
|
||||
const total = targets.length * cycles;
|
||||
let enqueued = 0;
|
||||
const status = document.getElementById('sat-all-status');
|
||||
@@ -536,7 +540,7 @@ function runAllSAT() {
|
||||
const btn = document.getElementById('sat-btn-' + target);
|
||||
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
|
||||
const body = {};
|
||||
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
|
||||
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||
@@ -554,6 +558,8 @@ function runAllSAT() {
|
||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd-mem', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd-bandwidth', 'No AMD GPU detected');
|
||||
});
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
@@ -598,7 +604,7 @@ func renderBurn() string {
|
||||
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Requires ROCm tools (rocm-bandwidth-test). Missing tools reported as UNSUPPORTED.</p>
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs ROCm compute stress together with VRAM copy/load activity via RVS GST and records a separate <code>rocm-bandwidth-test</code> snapshot. Missing tools reported as UNSUPPORTED.</p>
|
||||
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">▶ Start AMD Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
|
||||
@@ -609,6 +615,10 @@ func renderBurn() string {
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
|
||||
<button class="btn btn-primary" onclick="runBurnIn('sat-stress')">▶ Start SAT Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Platform Thermal Cycling</div><div class="card-body">
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs CPU + GPU stress simultaneously across multiple load/idle cycles with varying durations. Detects cooling systems that fail to recover under repeated load cycles. Smoke: 2 cycles ~5 min. Acceptance: 4 cycles ~25 min.</p>
|
||||
<button class="btn btn-primary" onclick="runBurnIn('platform-stress')">▶ Start Thermal Cycling</button>
|
||||
</div></div>
|
||||
</div>
|
||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||
|
||||
@@ -72,29 +72,36 @@ func (r *metricsRing) snapshot() ([]float64, []string) {
|
||||
defer r.mu.Unlock()
|
||||
v := make([]float64, len(r.vals))
|
||||
copy(v, r.vals)
|
||||
now := time.Now()
|
||||
labels := make([]string, len(r.times))
|
||||
if len(r.times) == 0 {
|
||||
return v, labels
|
||||
}
|
||||
sameDay := timestampsSameLocalDay(r.times)
|
||||
for i, t := range r.times {
|
||||
labels[i] = relAgeLabel(now.Sub(t))
|
||||
labels[i] = formatTimelineLabel(t.Local(), sameDay)
|
||||
}
|
||||
return v, labels
|
||||
}
|
||||
|
||||
func relAgeLabel(age time.Duration) string {
|
||||
if age <= 0 {
|
||||
return "0"
|
||||
func timestampsSameLocalDay(times []time.Time) bool {
|
||||
if len(times) == 0 {
|
||||
return true
|
||||
}
|
||||
if age < time.Hour {
|
||||
m := int(age.Minutes())
|
||||
if m == 0 {
|
||||
return "-1m"
|
||||
first := times[0].Local()
|
||||
for _, t := range times[1:] {
|
||||
local := t.Local()
|
||||
if local.Year() != first.Year() || local.YearDay() != first.YearDay() {
|
||||
return false
|
||||
}
|
||||
return fmt.Sprintf("-%dm", m)
|
||||
}
|
||||
if age < 24*time.Hour {
|
||||
return fmt.Sprintf("-%dh", int(age.Hours()))
|
||||
return true
|
||||
}
|
||||
|
||||
func formatTimelineLabel(ts time.Time, sameDay bool) string {
|
||||
if sameDay {
|
||||
return ts.Format("15:04")
|
||||
}
|
||||
return fmt.Sprintf("-%dd", int(age.Hours()/24))
|
||||
return ts.Format("01-02 15:04")
|
||||
}
|
||||
|
||||
// gpuRings holds per-GPU ring buffers.
|
||||
@@ -132,6 +139,8 @@ type handler struct {
|
||||
// per-GPU rings (index = GPU index)
|
||||
gpuRings []*gpuRings
|
||||
ringsMu sync.Mutex
|
||||
latestMu sync.RWMutex
|
||||
latest *platform.LiveMetricSample
|
||||
// metrics persistence (nil if DB unavailable)
|
||||
metricsDB *MetricsDB
|
||||
// install job (at most one at a time)
|
||||
@@ -164,13 +173,16 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Open metrics DB and pre-fill ring buffers from history.
|
||||
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
||||
h.metricsDB = db
|
||||
db.Prune(metricsKeepDuration)
|
||||
if samples, err := db.LoadRecent(120); err == nil {
|
||||
for _, s := range samples {
|
||||
h.feedRings(s)
|
||||
}
|
||||
if len(samples) > 0 {
|
||||
h.setLatestMetric(samples[len(samples)-1])
|
||||
}
|
||||
}
|
||||
}
|
||||
h.startMetricsCollector()
|
||||
|
||||
globalQueue.startWorker(&opts)
|
||||
mux := http.NewServeMux()
|
||||
@@ -198,9 +210,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
||||
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
|
||||
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
|
||||
mux.HandleFunc("POST /api/sat/amd-bandwidth/run", h.handleAPISATRun("amd-bandwidth"))
|
||||
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
|
||||
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
|
||||
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
|
||||
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||
|
||||
@@ -260,6 +275,37 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
return mux
|
||||
}
|
||||
|
||||
func (h *handler) startMetricsCollector() {
|
||||
go func() {
|
||||
ticker := time.NewTicker(1 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
sample := platform.SampleLiveMetrics()
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
||||
h.latestMu.Lock()
|
||||
defer h.latestMu.Unlock()
|
||||
cp := sample
|
||||
h.latest = &cp
|
||||
}
|
||||
|
||||
func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
|
||||
h.latestMu.RLock()
|
||||
defer h.latestMu.RUnlock()
|
||||
if h.latest == nil {
|
||||
return platform.LiveMetricSample{}, false
|
||||
}
|
||||
return *h.latest, true
|
||||
}
|
||||
|
||||
// ListenAndServe starts the HTTP server.
|
||||
func ListenAndServe(addr string, opts HandlerOptions) error {
|
||||
return http.ListenAndServe(addr, NewHandler(opts))
|
||||
@@ -387,6 +433,20 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
||||
path = strings.TrimSuffix(path, ".svg")
|
||||
|
||||
if h.metricsDB != nil {
|
||||
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
|
||||
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "image/svg+xml")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(buf)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var labels []string
|
||||
@@ -601,6 +661,259 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
_, _ = w.Write(buf)
|
||||
}
|
||||
|
||||
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||
samples, err := h.metricsDB.LoadAll()
|
||||
if err != nil || len(samples) == 0 {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
return chartDataFromSamples(path, samples)
|
||||
}
|
||||
|
||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var title string
|
||||
var yMin, yMax *float64
|
||||
labels := sampleTimeLabels(samples)
|
||||
|
||||
switch {
|
||||
case path == "server-load":
|
||||
title = "CPU / Memory Load"
|
||||
cpu := make([]float64, len(samples))
|
||||
mem := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
cpu[i] = s.CPULoadPct
|
||||
mem[i] = s.MemLoadPct
|
||||
}
|
||||
datasets = [][]float64{cpu, mem}
|
||||
names = []string{"CPU Load %", "Mem Load %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "server-temp", path == "server-temp-cpu":
|
||||
title = "CPU Temperature"
|
||||
datasets, names = namedTempDatasets(samples, "cpu")
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-temp-gpu":
|
||||
title = "GPU Temperature"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-temp-ambient":
|
||||
title = "Ambient / Other Sensors"
|
||||
datasets, names = namedTempDatasets(samples, "ambient")
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-power":
|
||||
title = "System Power"
|
||||
power := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
power[i] = s.PowerW
|
||||
}
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
yMin, yMax = autoBounds120(power)
|
||||
|
||||
case path == "server-fans":
|
||||
title = "Fan RPM"
|
||||
datasets, names = namedFanDatasets(samples)
|
||||
yMin, yMax = autoBounds120(datasets...)
|
||||
|
||||
case path == "gpu-all-load":
|
||||
title = "GPU Compute Load"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "gpu-all-memload":
|
||||
title = "GPU Memory Load"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "gpu-all-power":
|
||||
title = "GPU Power"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
yMin, yMax = autoBounds120(datasets...)
|
||||
|
||||
case path == "gpu-all-temp":
|
||||
title = "GPU Temperature"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case strings.HasPrefix(path, "gpu/"):
|
||||
rest := strings.TrimPrefix(path, "gpu/")
|
||||
sub := ""
|
||||
if i := strings.LastIndex(rest, "-"); i > 0 {
|
||||
sub = rest[i+1:]
|
||||
rest = rest[:i]
|
||||
}
|
||||
idx := 0
|
||||
fmt.Sscanf(rest, "%d", &idx)
|
||||
switch sub {
|
||||
case "load":
|
||||
title = fmt.Sprintf("GPU %d Load", idx)
|
||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||
if util == nil && mem == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||
names = []string{"Load %", "Mem %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
case "temp":
|
||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
if temp == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
datasets = [][]float64{temp}
|
||||
names = []string{"Temp °C"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(temp)
|
||||
default:
|
||||
title = fmt.Sprintf("GPU %d Power", idx)
|
||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
if power == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
yMin, yMax = autoBounds120(power)
|
||||
}
|
||||
|
||||
default:
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
|
||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
||||
}
|
||||
|
||||
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
|
||||
labels := make([]string, len(samples))
|
||||
if len(samples) == 0 {
|
||||
return labels
|
||||
}
|
||||
times := make([]time.Time, len(samples))
|
||||
for i, s := range samples {
|
||||
times[i] = s.Timestamp
|
||||
}
|
||||
sameDay := timestampsSameLocalDay(times)
|
||||
for i, s := range samples {
|
||||
labels[i] = formatTimelineLabel(s.Timestamp.Local(), sameDay)
|
||||
}
|
||||
return labels
|
||||
}
|
||||
|
||||
func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]float64, []string) {
|
||||
seen := map[string]bool{}
|
||||
var names []string
|
||||
for _, s := range samples {
|
||||
for _, t := range s.Temps {
|
||||
if t.Group == group && !seen[t.Name] {
|
||||
seen[t.Name] = true
|
||||
names = append(names, t.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets := make([][]float64, 0, len(names))
|
||||
for _, name := range names {
|
||||
ds := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
for _, t := range s.Temps {
|
||||
if t.Group == group && t.Name == name {
|
||||
ds[i] = t.Celsius
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
|
||||
func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []string) {
|
||||
seen := map[string]bool{}
|
||||
var names []string
|
||||
for _, s := range samples {
|
||||
for _, f := range s.Fans {
|
||||
if !seen[f.Name] {
|
||||
seen[f.Name] = true
|
||||
names = append(names, f.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets := make([][]float64, 0, len(names))
|
||||
for _, name := range names {
|
||||
ds := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
for _, f := range s.Fans {
|
||||
if f.Name == name {
|
||||
ds[i] = f.RPM
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
|
||||
func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetricRow) float64) ([][]float64, []string) {
|
||||
seen := map[int]bool{}
|
||||
var indices []int
|
||||
for _, s := range samples {
|
||||
for _, g := range s.GPUs {
|
||||
if !seen[g.GPUIndex] {
|
||||
seen[g.GPUIndex] = true
|
||||
indices = append(indices, g.GPUIndex)
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets := make([][]float64, 0, len(indices))
|
||||
names := make([]string, 0, len(indices))
|
||||
for _, idx := range indices {
|
||||
ds := gpuDatasetByIndex(samples, idx, pick)
|
||||
if ds == nil {
|
||||
continue
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
|
||||
func gpuDatasetByIndex(samples []platform.LiveMetricSample, idx int, pick func(platform.GPUMetricRow) float64) []float64 {
|
||||
found := false
|
||||
ds := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
for _, g := range s.GPUs {
|
||||
if g.GPUIndex == idx {
|
||||
ds[i] = pick(g)
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
return nil
|
||||
}
|
||||
return ds
|
||||
}
|
||||
|
||||
func coalesceDataset(ds []float64, n int) []float64 {
|
||||
if ds != nil {
|
||||
return ds
|
||||
}
|
||||
return make([]float64, n)
|
||||
}
|
||||
|
||||
// floatPtr returns a pointer to a float64 value.
|
||||
func floatPtr(v float64) *float64 { return &v }
|
||||
|
||||
@@ -621,6 +934,47 @@ func autoMax120(datasets ...[]float64) *float64 {
|
||||
return &v
|
||||
}
|
||||
|
||||
func autoBounds120(datasets ...[]float64) (*float64, *float64) {
|
||||
min := 0.0
|
||||
max := 0.0
|
||||
first := true
|
||||
for _, ds := range datasets {
|
||||
for _, v := range ds {
|
||||
if first {
|
||||
min, max = v, v
|
||||
first = false
|
||||
continue
|
||||
}
|
||||
if v < min {
|
||||
min = v
|
||||
}
|
||||
if v > max {
|
||||
max = v
|
||||
}
|
||||
}
|
||||
}
|
||||
if first {
|
||||
return nil, nil
|
||||
}
|
||||
if max <= 0 {
|
||||
return floatPtr(0), nil
|
||||
}
|
||||
span := max - min
|
||||
if span <= 0 {
|
||||
span = max * 0.1
|
||||
if span <= 0 {
|
||||
span = 1
|
||||
}
|
||||
}
|
||||
pad := span * 0.2
|
||||
low := min - pad
|
||||
if low < 0 {
|
||||
low = 0
|
||||
}
|
||||
high := max + pad
|
||||
return floatPtr(low), floatPtr(high)
|
||||
}
|
||||
|
||||
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
|
||||
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
|
||||
n := len(labels)
|
||||
|
||||
@@ -7,6 +7,9 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestChartLegendNumber(t *testing.T) {
|
||||
@@ -31,6 +34,61 @@ func TestChartLegendNumber(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
Timestamp: time.Now().Add(-3 * time.Minute),
|
||||
CPULoadPct: 10,
|
||||
MemLoadPct: 20,
|
||||
PowerW: 300,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||
CPULoadPct: 30,
|
||||
MemLoadPct: 40,
|
||||
PowerW: 320,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||
CPULoadPct: 50,
|
||||
MemLoadPct: 60,
|
||||
PowerW: 340,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
if title != "GPU Power" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
if len(names) != 1 || names[0] != "GPU 0" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
if len(labels) != len(samples) {
|
||||
t.Fatalf("labels len=%d want %d", len(labels), len(samples))
|
||||
}
|
||||
if len(datasets) != 1 || len(datasets[0]) != len(samples) {
|
||||
t.Fatalf("datasets shape=%v", datasets)
|
||||
}
|
||||
if got := datasets[0][0]; got != 120 {
|
||||
t.Fatalf("datasets[0][0]=%v want 120", got)
|
||||
}
|
||||
if got := datasets[0][2]; got != 130 {
|
||||
t.Fatalf("datasets[0][2]=%v want 130", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootRendersDashboard(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// Task statuses.
|
||||
@@ -30,9 +31,12 @@ var taskNames = map[string]string{
|
||||
"storage": "Storage SAT",
|
||||
"cpu": "CPU SAT",
|
||||
"amd": "AMD GPU SAT",
|
||||
"amd-mem": "AMD GPU MEM Integrity",
|
||||
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||
"amd-stress": "AMD GPU Burn-in",
|
||||
"memory-stress": "Memory Burn-in",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
"platform-stress": "Platform Thermal Cycling",
|
||||
"audit": "Audit",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
@@ -96,6 +100,34 @@ func resolveBurnPreset(profile string) burnPreset {
|
||||
}
|
||||
}
|
||||
|
||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||
switch profile {
|
||||
case "overnight":
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
{LoadSec: 600, IdleSec: 30},
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
{LoadSec: 600, IdleSec: 30},
|
||||
{LoadSec: 600, IdleSec: 120},
|
||||
{LoadSec: 600, IdleSec: 60},
|
||||
}}
|
||||
case "acceptance":
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 300, IdleSec: 60},
|
||||
{LoadSec: 300, IdleSec: 30},
|
||||
{LoadSec: 300, IdleSec: 60},
|
||||
{LoadSec: 300, IdleSec: 30},
|
||||
}}
|
||||
default: // smoke
|
||||
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||
{LoadSec: 90, IdleSec: 60},
|
||||
{LoadSec: 90, IdleSec: 30},
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||
type taskQueue struct {
|
||||
mu sync.Mutex
|
||||
@@ -124,6 +156,12 @@ var (
|
||||
runAMDAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runAMDMemIntegrityPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDMemIntegrityPackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
@@ -380,6 +418,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||
case "amd":
|
||||
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "amd-mem":
|
||||
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||
case "amd-bandwidth":
|
||||
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||
case "amd-stress":
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
@@ -398,6 +440,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "platform-stress":
|
||||
opts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
|
||||
case "audit":
|
||||
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
||||
if e != nil {
|
||||
|
||||
@@ -30,8 +30,8 @@ lb config noauto \
|
||||
--linux-flavours "amd64" \
|
||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||
--memtest none \
|
||||
--iso-volume "EASY-BEE" \
|
||||
--iso-application "EASY-BEE" \
|
||||
--iso-volume "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
|
||||
@@ -12,6 +12,7 @@ CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||
AUTH_KEYS=""
|
||||
REBUILD_IMAGE=0
|
||||
CLEAN_CACHE=0
|
||||
VARIANT="all"
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
|
||||
@@ -34,14 +35,23 @@ while [ $# -gt 0 ]; do
|
||||
REBUILD_IMAGE=1
|
||||
shift
|
||||
;;
|
||||
--variant)
|
||||
VARIANT="$2"
|
||||
shift 2
|
||||
;;
|
||||
*)
|
||||
echo "unknown arg: $1" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys]" >&2
|
||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$VARIANT" in
|
||||
nvidia|amd|all) ;;
|
||||
*) echo "unknown variant: $VARIANT (expected nvidia, amd, or all)" >&2; exit 1 ;;
|
||||
esac
|
||||
|
||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
echo "=== cleaning build cache: ${CACHE_DIR} ==="
|
||||
rm -rf "${CACHE_DIR:?}/go-build" \
|
||||
@@ -49,8 +59,9 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
"${CACHE_DIR:?}/tmp" \
|
||||
"${CACHE_DIR:?}/bee" \
|
||||
"${CACHE_DIR:?}/lb-packages"
|
||||
echo "=== cleaning live-build work dir: ${REPO_ROOT}/dist/live-build-work ==="
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work"
|
||||
echo "=== cleaning live-build work dirs ==="
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||
echo "=== caches cleared, proceeding with build ==="
|
||||
fi
|
||||
|
||||
@@ -108,34 +119,71 @@ else
|
||||
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
||||
fi
|
||||
|
||||
set -- \
|
||||
run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh
|
||||
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
set -- run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||
# Build base docker run args (without --authorized-keys)
|
||||
build_run_args() {
|
||||
_variant="$1"
|
||||
_auth_arg=""
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
_auth_arg="--authorized-keys /tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
fi
|
||||
echo "run --rm --privileged \
|
||||
--platform ${BUILDER_PLATFORM} \
|
||||
-v ${REPO_ROOT}:/work \
|
||||
-v ${CACHE_DIR}:/cache \
|
||||
${AUTH_KEYS:+-v ${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro} \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
fi
|
||||
${IMAGE_REF} \
|
||||
sh /work/iso/builder/build.sh --variant ${_variant} ${_auth_arg}"
|
||||
}
|
||||
|
||||
"$CONTAINER_TOOL" "$@"
|
||||
run_variant() {
|
||||
_v="$1"
|
||||
echo "=== building variant: ${_v} ==="
|
||||
if [ -n "$AUTH_KEYS" ]; then
|
||||
"$CONTAINER_TOOL" run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}" \
|
||||
--authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||
else
|
||||
"$CONTAINER_TOOL" run --rm --privileged \
|
||||
--platform "${BUILDER_PLATFORM}" \
|
||||
-v "${REPO_ROOT}:/work" \
|
||||
-v "${CACHE_DIR}:/cache" \
|
||||
-e BEE_CONTAINER_BUILD=1 \
|
||||
-e GOCACHE=/cache/go-build \
|
||||
-e GOMODCACHE=/cache/go-mod \
|
||||
-e TMPDIR=/cache/tmp \
|
||||
-e BEE_CACHE_DIR=/cache/bee \
|
||||
-w /work \
|
||||
"${IMAGE_REF}" \
|
||||
sh /work/iso/builder/build.sh --variant "${_v}"
|
||||
fi
|
||||
}
|
||||
|
||||
case "$VARIANT" in
|
||||
nvidia)
|
||||
run_variant nvidia
|
||||
;;
|
||||
amd)
|
||||
run_variant amd
|
||||
;;
|
||||
all)
|
||||
run_variant nvidia
|
||||
run_variant amd
|
||||
;;
|
||||
esac
|
||||
|
||||
@@ -13,19 +13,29 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
||||
OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
|
||||
DIST_DIR="${REPO_ROOT}/dist"
|
||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
AUTH_KEYS=""
|
||||
BEE_GPU_VENDOR="nvidia"
|
||||
|
||||
# parse args
|
||||
while [ $# -gt 0 ]; do
|
||||
case "$1" in
|
||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
||||
*) echo "unknown arg: $1"; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
case "$BEE_GPU_VENDOR" in
|
||||
nvidia|amd) ;;
|
||||
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia or amd)" >&2; exit 1 ;;
|
||||
esac
|
||||
|
||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
||||
|
||||
export BEE_GPU_VENDOR
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
export PATH="$PATH:/usr/local/go/bin"
|
||||
|
||||
@@ -132,7 +142,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
||||
apt-get install -y "linux-headers-${KVER}"
|
||||
fi
|
||||
|
||||
echo "=== bee ISO build ==="
|
||||
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||
echo ""
|
||||
@@ -141,8 +151,8 @@ echo "=== syncing git submodules ==="
|
||||
git -C "${REPO_ROOT}" submodule update --init --recursive
|
||||
|
||||
# --- compile bee binary (static, Linux amd64) ---
|
||||
# Shared between variants — built once, reused on second pass.
|
||||
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||
NEED_BUILD=1
|
||||
if [ -f "$BEE_BIN" ]; then
|
||||
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
||||
@@ -172,37 +182,41 @@ else
|
||||
echo "=== bee binary up to date, skipping build ==="
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
"${CUBLAS_VERSION}" \
|
||||
"${CUDA_USERSPACE_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}"
|
||||
# --- NVIDIA-only build steps ---
|
||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
"${CUBLAS_VERSION}" \
|
||||
"${CUDA_USERSPACE_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}"
|
||||
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
fi
|
||||
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-stress ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_STRESS_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl -lm
|
||||
echo "binary: $GPU_STRESS_BIN"
|
||||
else
|
||||
echo "=== bee-gpu-stress up to date, skipping build ==="
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-stress ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_STRESS_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl -lm
|
||||
echo "binary: $GPU_STRESS_BIN"
|
||||
else
|
||||
echo "=== bee-gpu-stress up to date, skipping build ==="
|
||||
fi
|
||||
|
||||
echo "=== preparing staged overlay ==="
|
||||
# Sync builder config into work dir, preserving lb cache (chroot + packages).
|
||||
# We do NOT rm -rf BUILD_WORK_DIR so lb can reuse its chroot on repeat builds.
|
||||
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||
|
||||
# Sync builder config into variant work dir, preserving lb cache.
|
||||
rsync -a --delete \
|
||||
--exclude='cache/' \
|
||||
--exclude='chroot/' \
|
||||
@@ -212,7 +226,10 @@ rsync -a --delete \
|
||||
--exclude='*.contents' \
|
||||
--exclude='*.files' \
|
||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
||||
# Also persist package cache to CACHE_ROOT so it survives a manual wipe of BUILD_WORK_DIR.
|
||||
|
||||
# Share deb package cache across variants.
|
||||
# Restore: populate work dir cache from shared cache before build.
|
||||
# Persist: sync back after build (done after lb build below).
|
||||
LB_PKG_CACHE="${CACHE_ROOT}/lb-packages"
|
||||
mkdir -p "${LB_PKG_CACHE}"
|
||||
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
||||
@@ -221,6 +238,7 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
|
||||
mkdir -p "${BUILD_WORK_DIR}/cache/packages.chroot"
|
||||
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
||||
fi
|
||||
|
||||
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||
rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||
@@ -231,6 +249,12 @@ rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
|
||||
# Remove NVIDIA-specific overlay files for AMD variant
|
||||
if [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
|
||||
fi
|
||||
|
||||
# --- inject authorized_keys for SSH access ---
|
||||
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/root/.ssh"
|
||||
@@ -268,8 +292,11 @@ fi
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then
|
||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
fi
|
||||
|
||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||
@@ -286,100 +313,143 @@ for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
||||
fi
|
||||
done
|
||||
|
||||
# --- build NVIDIA kernel modules ---
|
||||
echo ""
|
||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||
# --- NVIDIA kernel modules and userspace libs ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
|
||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_DIR}/usr/local/lib/nvidia"
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||
mkdir -p "${OVERLAY_KMOD_DIR}"
|
||||
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||
mkdir -p "${OVERLAY_KMOD_DIR}"
|
||||
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
||||
|
||||
# Inject nvidia-smi and libnvidia-ml
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
# Inject nvidia-smi and libnvidia-ml
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
|
||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
|
||||
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
|
||||
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
|
||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
|
||||
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
|
||||
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
|
||||
fi
|
||||
|
||||
# --- build / download NCCL ---
|
||||
echo ""
|
||||
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||
|
||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# --- build nccl-tests ---
|
||||
echo ""
|
||||
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||
"${NCCL_TESTS_VERSION}" \
|
||||
"${NCCL_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}" \
|
||||
"${NVCC_VERSION}" \
|
||||
"${DEBIAN_VERSION}"
|
||||
|
||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
echo "=== all_reduce_perf injected ==="
|
||||
fi
|
||||
|
||||
# --- build / download NCCL ---
|
||||
echo ""
|
||||
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||
|
||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# --- build nccl-tests ---
|
||||
echo ""
|
||||
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||
"${NCCL_TESTS_VERSION}" \
|
||||
"${NCCL_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}" \
|
||||
"${NVCC_VERSION}" \
|
||||
"${DEBIAN_VERSION}"
|
||||
|
||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
echo "=== all_reduce_perf injected ==="
|
||||
|
||||
# --- embed build metadata ---
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
|
||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||
BUILD_DATE=${BUILD_DATE}
|
||||
GIT_COMMIT=${GIT_COMMIT}
|
||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||
NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"
|
||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||
else
|
||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||
fi
|
||||
|
||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||
BUILD_DATE=${BUILD_DATE}
|
||||
GIT_COMMIT=${GIT_COMMIT}
|
||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||
${GPU_VERSION_LINE}
|
||||
EOF
|
||||
|
||||
# Write GPU vendor marker for hooks
|
||||
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||
|
||||
# Patch motd with build info
|
||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||
if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then
|
||||
sed "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "${OVERLAY_STAGE_DIR}/etc/motd" \
|
||||
> "${OVERLAY_STAGE_DIR}/etc/motd.patched"
|
||||
mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd"
|
||||
fi
|
||||
|
||||
# --- substitute version placeholders in package list ---
|
||||
sed -i \
|
||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
|
||||
# --- copy variant-specific package list into work dir ---
|
||||
cp "${BUILD_WORK_DIR}/config/package-lists/bee-${BEE_GPU_VENDOR}.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
|
||||
# --- remove archives for the other vendor ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/rocm.key.chroot"
|
||||
else
|
||||
rm -f "${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \
|
||||
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot"
|
||||
fi
|
||||
|
||||
# --- substitute version placeholders in package list and archive ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
sed -i \
|
||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
else
|
||||
sed -i \
|
||||
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
|
||||
-e "s/%%ROCM_BANDWIDTH_TEST_VERSION%%/${ROCM_BANDWIDTH_TEST_VERSION}/g" \
|
||||
-e "s/%%ROCM_VALIDATION_SUITE_VERSION%%/${ROCM_VALIDATION_SUITE_VERSION}/g" \
|
||||
-e "s/%%ROCBLAS_VERSION%%/${ROCBLAS_VERSION}/g" \
|
||||
-e "s/%%ROCRAND_VERSION%%/${ROCRAND_VERSION}/g" \
|
||||
-e "s/%%HIP_RUNTIME_AMD_VERSION%%/${HIP_RUNTIME_AMD_VERSION}/g" \
|
||||
-e "s/%%HIPBLASLT_VERSION%%/${HIPBLASLT_VERSION}/g" \
|
||||
-e "s/%%COMGR_VERSION%%/${COMGR_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||
if [ -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" ]; then
|
||||
sed -i \
|
||||
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
|
||||
fi
|
||||
fi
|
||||
|
||||
# --- sync overlay into live-build includes.chroot ---
|
||||
LB_DIR="${BUILD_WORK_DIR}"
|
||||
@@ -395,20 +465,31 @@ fi
|
||||
|
||||
# --- build ISO using live-build ---
|
||||
echo ""
|
||||
echo "=== building ISO (live-build) ==="
|
||||
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
||||
|
||||
# Export for auto/config
|
||||
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
||||
export BEE_GPU_VENDOR_UPPER
|
||||
|
||||
cd "${LB_DIR}"
|
||||
lb clean 2>&1 | tail -3
|
||||
lb config 2>&1 | tail -5
|
||||
lb build 2>&1
|
||||
|
||||
# --- persist deb package cache back to shared location ---
|
||||
# This allows the second variant to reuse all downloaded packages.
|
||||
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
||||
rsync -a "${BUILD_WORK_DIR}/cache/packages.chroot/" "${LB_PKG_CACHE}/"
|
||||
echo "=== package cache synced to ${LB_PKG_CACHE} ==="
|
||||
fi
|
||||
|
||||
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
||||
ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
||||
if [ -f "$ISO_RAW" ]; then
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
echo "=== done ==="
|
||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
||||
echo "ISO: $ISO_OUT"
|
||||
if command -v stat >/dev/null 2>&1; then
|
||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||
|
||||
@@ -10,12 +10,12 @@ echo " ╚══════╝╚═╝ ╚═╝╚══════╝
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (load to RAM)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
|
||||
@@ -5,6 +5,9 @@ set -e
|
||||
|
||||
echo "=== bee chroot setup ==="
|
||||
|
||||
GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)
|
||||
echo "=== GPU vendor: ${GPU_VENDOR} ==="
|
||||
|
||||
ensure_bee_console_user() {
|
||||
if id bee >/dev/null 2>&1; then
|
||||
usermod -d /home/bee -s /bin/bash bee 2>/dev/null || true
|
||||
@@ -21,10 +24,8 @@ ensure_bee_console_user() {
|
||||
|
||||
ensure_bee_console_user
|
||||
|
||||
# Enable bee services
|
||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||
# Enable common bee services
|
||||
systemctl enable bee-network.service
|
||||
systemctl enable bee-nvidia.service
|
||||
systemctl enable bee-preflight.service
|
||||
systemctl enable bee-audit.service
|
||||
systemctl enable bee-web.service
|
||||
@@ -36,25 +37,33 @@ systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
|
||||
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
|
||||
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
||||
|
||||
# Enable GPU-vendor specific services
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||
systemctl enable bee-nvidia.service
|
||||
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||
for tool in rocm-smi rocm-bandwidth-test rvs; do
|
||||
if [ ! -e /usr/local/bin/${tool} ]; then
|
||||
bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
|
||||
[ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# Ensure scripts are executable
|
||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Reload udev rules
|
||||
udevadm control --reload-rules 2>/dev/null || true
|
||||
|
||||
# rocm symlinks (packages install to /opt/rocm-*/bin/)
|
||||
for tool in rocm-smi rocm-bandwidth-test rvs; do
|
||||
if [ ! -e /usr/local/bin/${tool} ]; then
|
||||
bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
|
||||
[ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
|
||||
fi
|
||||
done
|
||||
|
||||
# Create export directory
|
||||
mkdir -p /appdata/bee/export
|
||||
|
||||
@@ -62,4 +71,4 @@ if [ -f /etc/sudoers.d/bee ]; then
|
||||
chmod 0440 /etc/sudoers.d/bee
|
||||
fi
|
||||
|
||||
echo "=== bee chroot setup complete ==="
|
||||
echo "=== bee chroot setup complete (${GPU_VENDOR}) ==="
|
||||
|
||||
@@ -4,6 +4,9 @@
|
||||
# not inside the squashfs).
|
||||
set -e
|
||||
|
||||
echo "memtest: scanning chroot/boot/ for memtest files:"
|
||||
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files found in chroot/boot/"
|
||||
|
||||
for f in memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi; do
|
||||
src="chroot/boot/${f}"
|
||||
if [ -f "${src}" ]; then
|
||||
|
||||
9
iso/builder/config/package-lists/bee-amd.list.chroot
Normal file
9
iso/builder/config/package-lists/bee-amd.list.chroot
Normal file
@@ -0,0 +1,9 @@
|
||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||
rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
|
||||
rocblas=%%ROCBLAS_VERSION%%
|
||||
rocrand=%%ROCRAND_VERSION%%
|
||||
hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
|
||||
hipblaslt=%%HIPBLASLT_VERSION%%
|
||||
comgr=%%COMGR_VERSION%%
|
||||
2
iso/builder/config/package-lists/bee-nvidia.list.chroot
Normal file
2
iso/builder/config/package-lists/bee-nvidia.list.chroot
Normal file
@@ -0,0 +1,2 @@
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
||||
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
||||
@@ -72,18 +72,5 @@ firmware-bnx2x
|
||||
firmware-cavium
|
||||
firmware-qlogic
|
||||
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
||||
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
||||
|
||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||
rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
|
||||
rocblas=%%ROCBLAS_VERSION%%
|
||||
rocrand=%%ROCRAND_VERSION%%
|
||||
hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
|
||||
hipblaslt=%%HIPBLASLT_VERSION%%
|
||||
comgr=%%COMGR_VERSION%%
|
||||
|
||||
# glibc compat helpers (for any external binaries that need it)
|
||||
libc6
|
||||
|
||||
@@ -39,7 +39,7 @@ info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
||||
# --- PATH & binaries ---
|
||||
echo "-- PATH & binaries --"
|
||||
for tool in dmidecode smartctl nvme ipmitool lspci bee; do
|
||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||
if p=$(PATH="/usr/local/bin:/usr/sbin:/sbin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||
ok "$tool found: $p"
|
||||
else
|
||||
fail "$tool: NOT FOUND"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
|
||||
export PATH="$PATH:/usr/local/bin:/usr/sbin:/sbin:/opt/rocm/bin:/opt/rocm/sbin"
|
||||
|
||||
# Print web UI URLs on the local console at login.
|
||||
if [ -z "${SSH_CONNECTION:-}" ] \
|
||||
|
||||
Reference in New Issue
Block a user