Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 09fe0e2e9e | |||
| ace1a9dba6 | |||
| 905c581ece | |||
| 7c2a0135d2 | |||
| 407c1cd1c4 | |||
| e15bcc91c5 | |||
| 98f0cf0d52 |
@@ -114,10 +114,13 @@ type satRunner interface {
|
|||||||
DetectGPUVendor() string
|
DetectGPUVendor() string
|
||||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||||
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
|
RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||||
|
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -577,6 +580,20 @@ func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
|||||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
}
|
}
|
||||||
@@ -611,6 +628,13 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
|
|||||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||||
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
|
||||||
body := "Results: " + path
|
body := "Results: " + path
|
||||||
|
|||||||
@@ -181,6 +181,14 @@ func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(
|
|||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
@@ -195,6 +203,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr
|
|||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||||
return "", nil
|
return "", nil
|
||||||
}
|
}
|
||||||
|
|||||||
476
audit/internal/platform/platform_stress.go
Normal file
476
audit/internal/platform/platform_stress.go
Normal file
@@ -0,0 +1,476 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"archive/tar"
|
||||||
|
"bytes"
|
||||||
|
"compress/gzip"
|
||||||
|
"context"
|
||||||
|
"encoding/csv"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// PlatformStressCycle describes a single load-then-idle cycle of the thermal
// cycling test. Both durations are whole seconds.
type PlatformStressCycle struct {
	// LoadSec is how long CPU and GPU stress run simultaneously.
	LoadSec int
	// IdleSec is how long monitoring continues after the load is cut.
	IdleSec int
}
|
||||||
|
|
||||||
|
// PlatformStressOptions controls the thermal cycling test.
|
||||||
|
type PlatformStressOptions struct {
|
||||||
|
Cycles []PlatformStressCycle
|
||||||
|
}
|
||||||
|
|
||||||
|
// platformStressRow is a single telemetry sample taken during a cycle
// (collectPhase records one per ticker tick, i.e. roughly once a second).
type platformStressRow struct {
	ElapsedSec float64 // seconds since the whole test started
	Cycle      int     // 1-based cycle number
	Phase      string  // "load" | "idle"

	CPULoadPct  float64
	MaxCPUTempC float64 // hottest "cpu"-group temperature in the sample
	MaxGPUTempC float64 // hottest GPU temperature in the sample
	SysPowerW   float64

	// Slowest and fastest fan in the sample; both stay 0 when the sample
	// carried no fan readings.
	FanMinRPM, FanMaxRPM float64

	GPUThrottled bool
}
|
||||||
|
|
||||||
|
// RunPlatformStress runs repeated load+idle thermal cycling.
|
||||||
|
// Each cycle starts CPU (stressapptest) and GPU stress simultaneously,
|
||||||
|
// runs for LoadSec, then cuts load abruptly and monitors for IdleSec.
|
||||||
|
func (s *System) RunPlatformStress(
|
||||||
|
ctx context.Context,
|
||||||
|
baseDir string,
|
||||||
|
opts PlatformStressOptions,
|
||||||
|
logFunc func(string),
|
||||||
|
) (string, error) {
|
||||||
|
if logFunc == nil {
|
||||||
|
logFunc = func(string) {}
|
||||||
|
}
|
||||||
|
if len(opts.Cycles) == 0 {
|
||||||
|
return "", fmt.Errorf("no cycles defined")
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(baseDir, 0755); err != nil {
|
||||||
|
return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
stamp := time.Now().UTC().Format("20060102-150405")
|
||||||
|
runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
vendor := s.DetectGPUVendor()
|
||||||
|
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
|
||||||
|
|
||||||
|
var rows []platformStressRow
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
|
var analyses []cycleAnalysis
|
||||||
|
|
||||||
|
for i, cycle := range opts.Cycles {
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
cycleNum := i + 1
|
||||||
|
logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))
|
||||||
|
|
||||||
|
// ── LOAD PHASE ───────────────────────────────────────────────────────
|
||||||
|
loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
|
||||||
|
// CPU stress
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||||
|
if err != nil {
|
||||||
|
logFunc("CPU stress: " + err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||||
|
}()
|
||||||
|
|
||||||
|
// GPU stress
|
||||||
|
wg.Add(1)
|
||||||
|
go func() {
|
||||||
|
defer wg.Done()
|
||||||
|
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||||
|
if gpuCmd == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
_ = gpuCmd.Wait()
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Monitoring goroutine for load phase
|
||||||
|
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||||
|
for _, r := range loadRows {
|
||||||
|
logFunc(formatPlatformRow(r))
|
||||||
|
}
|
||||||
|
rows = append(rows, loadRows...)
|
||||||
|
loadCancel()
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
if len(loadRows) > 0 {
|
||||||
|
logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── IDLE PHASE ───────────────────────────────────────────────────────
|
||||||
|
idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
|
||||||
|
idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
|
||||||
|
for _, r := range idleRows {
|
||||||
|
logFunc(formatPlatformRow(r))
|
||||||
|
}
|
||||||
|
rows = append(rows, idleRows...)
|
||||||
|
idleCancel()
|
||||||
|
|
||||||
|
// Per-cycle analysis
|
||||||
|
an := analyzePlatformCycle(loadRows, idleRows)
|
||||||
|
analyses = append(analyses, an)
|
||||||
|
logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
|
||||||
|
cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write CSV
|
||||||
|
csvData := writePlatformCSV(rows)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "metrics.csv"), csvData, 0644)
|
||||||
|
|
||||||
|
// Write summary
|
||||||
|
summary := writePlatformSummary(opts, analyses)
|
||||||
|
logFunc("--- Summary ---")
|
||||||
|
for _, line := range strings.Split(summary, "\n") {
|
||||||
|
if line != "" {
|
||||||
|
logFunc(line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||||
|
|
||||||
|
// Pack tar.gz
|
||||||
|
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
||||||
|
if err := packPlatformDir(runDir, archivePath); err != nil {
|
||||||
|
return "", fmt.Errorf("pack archive: %w", err)
|
||||||
|
}
|
||||||
|
_ = os.RemoveAll(runDir)
|
||||||
|
return archivePath, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// collectPhase samples live metrics every second until ctx is done.
|
||||||
|
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
|
||||||
|
var rows []platformStressRow
|
||||||
|
ticker := time.NewTicker(time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return rows
|
||||||
|
case <-ticker.C:
|
||||||
|
sample := SampleLiveMetrics()
|
||||||
|
rows = append(rows, sampleToPlatformRow(sample, cycle, phase, testStart))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
|
||||||
|
r := platformStressRow{
|
||||||
|
ElapsedSec: time.Since(testStart).Seconds(),
|
||||||
|
Cycle: cycle,
|
||||||
|
Phase: phase,
|
||||||
|
CPULoadPct: s.CPULoadPct,
|
||||||
|
SysPowerW: s.PowerW,
|
||||||
|
}
|
||||||
|
for _, t := range s.Temps {
|
||||||
|
switch t.Group {
|
||||||
|
case "cpu":
|
||||||
|
if t.Celsius > r.MaxCPUTempC {
|
||||||
|
r.MaxCPUTempC = t.Celsius
|
||||||
|
}
|
||||||
|
case "gpu":
|
||||||
|
if t.Celsius > r.MaxGPUTempC {
|
||||||
|
r.MaxGPUTempC = t.Celsius
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if g.TempC > r.MaxGPUTempC {
|
||||||
|
r.MaxGPUTempC = g.TempC
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(s.Fans) > 0 {
|
||||||
|
r.FanMinRPM = s.Fans[0].RPM
|
||||||
|
r.FanMaxRPM = s.Fans[0].RPM
|
||||||
|
for _, f := range s.Fans[1:] {
|
||||||
|
if f.RPM < r.FanMinRPM {
|
||||||
|
r.FanMinRPM = f.RPM
|
||||||
|
}
|
||||||
|
if f.RPM > r.FanMaxRPM {
|
||||||
|
r.FanMaxRPM = f.RPM
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatPlatformRow(r platformStressRow) string {
|
||||||
|
throttle := ""
|
||||||
|
if r.GPUThrottled {
|
||||||
|
throttle = " THROTTLE"
|
||||||
|
}
|
||||||
|
fans := ""
|
||||||
|
if r.FanMinRPM > 0 {
|
||||||
|
fans = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
|
||||||
|
r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW, fans, throttle)
|
||||||
|
}
|
||||||
|
|
||||||
|
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
|
||||||
|
var an cycleAnalysis
|
||||||
|
for _, r := range loadRows {
|
||||||
|
if r.MaxCPUTempC > an.maxCPUTemp {
|
||||||
|
an.maxCPUTemp = r.MaxCPUTempC
|
||||||
|
}
|
||||||
|
if r.MaxGPUTempC > an.maxGPUTemp {
|
||||||
|
an.maxGPUTemp = r.MaxGPUTempC
|
||||||
|
}
|
||||||
|
if r.SysPowerW > an.maxPower {
|
||||||
|
an.maxPower = r.SysPowerW
|
||||||
|
}
|
||||||
|
if r.GPUThrottled {
|
||||||
|
an.throttled = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fan RPM at cut = avg of last 5 load rows
|
||||||
|
if n := len(loadRows); n > 0 {
|
||||||
|
window := loadRows
|
||||||
|
if n > 5 {
|
||||||
|
window = loadRows[n-5:]
|
||||||
|
}
|
||||||
|
var sum float64
|
||||||
|
var cnt int
|
||||||
|
for _, r := range window {
|
||||||
|
if r.FanMinRPM > 0 {
|
||||||
|
sum += (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||||
|
cnt++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cnt > 0 {
|
||||||
|
an.fanAtCutAvg = sum / float64(cnt)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fan RPM min in first 15s of idle
|
||||||
|
an.fanMin15s = an.fanAtCutAvg
|
||||||
|
var cutElapsed float64
|
||||||
|
if len(loadRows) > 0 {
|
||||||
|
cutElapsed = loadRows[len(loadRows)-1].ElapsedSec
|
||||||
|
}
|
||||||
|
for _, r := range idleRows {
|
||||||
|
if r.ElapsedSec > cutElapsed+15 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
avg := (r.FanMinRPM + r.FanMaxRPM) / 2
|
||||||
|
if avg > 0 && (an.fanMin15s == 0 || avg < an.fanMin15s) {
|
||||||
|
an.fanMin15s = avg
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if an.fanAtCutAvg > 0 {
|
||||||
|
an.fanDropPct = (an.fanAtCutAvg - an.fanMin15s) / an.fanAtCutAvg * 100
|
||||||
|
}
|
||||||
|
return an
|
||||||
|
}
|
||||||
|
|
||||||
|
// cycleAnalysis holds the per-cycle figures produced by analyzePlatformCycle.
type cycleAnalysis struct {
	// Peaks observed across the load rows.
	maxCPUTemp, maxGPUTemp, maxPower float64

	// throttled is set when any load row was flagged as GPU-throttled.
	throttled bool

	fanAtCutAvg float64 // average fan RPM over the last load rows
	fanMin15s   float64 // lowest fan RPM in the first 15s of idle
	fanDropPct  float64 // drop from fanAtCutAvg to fanMin15s, in percent
}
|
||||||
|
|
||||||
|
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
|
||||||
|
fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))
|
||||||
|
|
||||||
|
totalThrottle := 0
|
||||||
|
totalFanWarn := 0
|
||||||
|
for i, an := range analyses {
|
||||||
|
cycle := opts.Cycles[i]
|
||||||
|
fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
|
||||||
|
fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
|
||||||
|
fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
|
||||||
|
fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
|
||||||
|
if an.throttled {
|
||||||
|
fmt.Fprintf(&b, " Throttle: DETECTED\n")
|
||||||
|
totalThrottle++
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " Throttle: none\n")
|
||||||
|
}
|
||||||
|
if an.fanAtCutAvg > 0 {
|
||||||
|
fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
|
||||||
|
fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
|
||||||
|
if an.fanDropPct > 20 {
|
||||||
|
fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
|
||||||
|
totalFanWarn++
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, " Fan response: OK\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
|
||||||
|
if totalThrottle > 0 {
|
||||||
|
fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
|
||||||
|
} else if totalFanWarn > 0 {
|
||||||
|
fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(&b, "Overall: PASS\n")
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func writePlatformCSV(rows []platformStressRow) []byte {
|
||||||
|
var buf bytes.Buffer
|
||||||
|
w := csv.NewWriter(&buf)
|
||||||
|
_ = w.Write([]string{
|
||||||
|
"elapsed_sec", "cycle", "phase",
|
||||||
|
"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
|
||||||
|
"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
|
||||||
|
})
|
||||||
|
for _, r := range rows {
|
||||||
|
throttled := "0"
|
||||||
|
if r.GPUThrottled {
|
||||||
|
throttled = "1"
|
||||||
|
}
|
||||||
|
_ = w.Write([]string{
|
||||||
|
strconv.FormatFloat(r.ElapsedSec, 'f', 1, 64),
|
||||||
|
strconv.Itoa(r.Cycle),
|
||||||
|
r.Phase,
|
||||||
|
strconv.FormatFloat(r.CPULoadPct, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.MaxCPUTempC, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.MaxGPUTempC, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.SysPowerW, 'f', 1, 64),
|
||||||
|
strconv.FormatFloat(r.FanMinRPM, 'f', 0, 64),
|
||||||
|
strconv.FormatFloat(r.FanMaxRPM, 'f', 0, 64),
|
||||||
|
throttled,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
w.Flush()
|
||||||
|
return buf.Bytes()
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildCPUStressCmd creates a stressapptest command that runs until ctx is cancelled.
|
||||||
|
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||||
|
path, err := satLookPath("stressapptest")
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||||
|
}
|
||||||
|
// Use a very long duration; the context timeout will kill it at the right time.
|
||||||
|
cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
if err := cmd.Start(); err != nil {
|
||||||
|
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||||
|
}
|
||||||
|
return cmd, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildGPUStressCmd creates a GPU stress command appropriate for the detected vendor.
|
||||||
|
// Returns nil if no GPU stress tool is available (CPU-only cycling still useful).
|
||||||
|
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
|
||||||
|
switch strings.ToLower(vendor) {
|
||||||
|
case "amd":
|
||||||
|
return buildAMDGPUStressCmd(ctx)
|
||||||
|
case "nvidia":
|
||||||
|
return buildNvidiaGPUStressCmd(ctx)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||||
|
rvsArgs, err := resolveRVSCommand()
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
rvsPath := rvsArgs[0]
|
||||||
|
cfg := `actions:
|
||||||
|
- name: gst_platform
|
||||||
|
device: all
|
||||||
|
module: gst
|
||||||
|
parallel: true
|
||||||
|
duration: 86400000
|
||||||
|
copy_matrix: false
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size_a: 8640
|
||||||
|
matrix_size_b: 8640
|
||||||
|
matrix_size_c: 8640
|
||||||
|
`
|
||||||
|
cfgFile := "/tmp/bee-platform-gst.conf"
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
_ = cmd.Start()
|
||||||
|
return cmd
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||||
|
path, err := satLookPath("bee-gpu-stress")
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
||||||
|
cmd.Stdout = nil
|
||||||
|
cmd.Stderr = nil
|
||||||
|
_ = cmd.Start()
|
||||||
|
return cmd
|
||||||
|
}
|
||||||
|
|
||||||
|
// packPlatformDir writes the regular files directly inside dir into a gzipped
// tar archive at dest. Entries are stored as "<base of dir>/<file name>";
// subdirectories are skipped, and so are files that cannot be read.
//
// The source directory is listed before dest is created, and dest is removed
// again if anything fails, so an error never leaves a partial archive behind.
// Errors from closing the tar, gzip and file writers are returned, because a
// failed close means the archive is truncated.
func packPlatformDir(dir, dest string) (err error) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return err
	}

	f, err := os.Create(dest)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := f.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			_ = os.Remove(dest) // best-effort cleanup of the partial archive
		}
	}()

	gz := gzip.NewWriter(f)
	tw := tar.NewWriter(gz)

	base := filepath.Base(dir)
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		data, readErr := os.ReadFile(filepath.Join(dir, e.Name()))
		if readErr != nil {
			// Best effort: archive whatever is readable.
			continue
		}
		hdr := &tar.Header{
			// Tar entry names use forward slashes on every platform.
			Name:    filepath.ToSlash(filepath.Join(base, e.Name())),
			Size:    int64(len(data)),
			Mode:    0644,
			ModTime: time.Now(),
		}
		if err := tw.WriteHeader(hdr); err != nil {
			return err
		}
		if _, err := tw.Write(data); err != nil {
			return err
		}
	}

	// Close innermost first: tar writes its trailer into gzip, gzip flushes
	// its footer into the file.
	if err := tw.Close(); err != nil {
		return err
	}
	return gz.Close()
}
|
||||||
@@ -136,6 +136,54 @@ func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFu
|
|||||||
}, logFunc)
|
}, logFunc)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
|
||||||
|
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
cfgFile := "/tmp/bee-amd-mem.conf"
|
||||||
|
cfg := `actions:
|
||||||
|
- name: mem_integrity
|
||||||
|
device: all
|
||||||
|
module: mem
|
||||||
|
parallel: true
|
||||||
|
duration: 60000
|
||||||
|
copy_matrix: false
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size: 8640
|
||||||
|
`
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
|
||||||
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
|
{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
|
{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
|
||||||
|
func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
cfgFile := "/tmp/bee-amd-babel.conf"
|
||||||
|
cfg := `actions:
|
||||||
|
- name: babel_mem_bw
|
||||||
|
device: all
|
||||||
|
module: babel
|
||||||
|
parallel: true
|
||||||
|
copy_matrix: true
|
||||||
|
target_stress: 90
|
||||||
|
matrix_size: 134217728
|
||||||
|
`
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
|
||||||
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
|
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||||
|
{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
|
{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||||
|
}, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
||||||
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
||||||
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
@@ -146,8 +194,16 @@ func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationS
|
|||||||
if err := ensureAMDRuntimeReady(); err != nil {
|
if err := ensureAMDRuntimeReady(); err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
// Write RVS GST config to a temp file
|
// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
|
||||||
rvsCfg := fmt.Sprintf(`actions:
|
rvsCfg := amdStressRVSConfig(seconds)
|
||||||
|
cfgFile := "/tmp/bee-amd-gst.conf"
|
||||||
|
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
||||||
|
|
||||||
|
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func amdStressRVSConfig(seconds int) string {
|
||||||
|
return fmt.Sprintf(`actions:
|
||||||
- name: gst_stress
|
- name: gst_stress
|
||||||
device: all
|
device: all
|
||||||
module: gst
|
module: gst
|
||||||
@@ -159,15 +215,15 @@ func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationS
|
|||||||
matrix_size_b: 8640
|
matrix_size_b: 8640
|
||||||
matrix_size_c: 8640
|
matrix_size_c: 8640
|
||||||
`, seconds*1000)
|
`, seconds*1000)
|
||||||
cfgFile := "/tmp/bee-amd-gst.conf"
|
}
|
||||||
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
|
|
||||||
|
|
||||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", []satJob{
|
func amdStressJobs(seconds int, cfgFile string) []satJob {
|
||||||
|
return []satJob{
|
||||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||||
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
|
||||||
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
|
||||||
}, logFunc)
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -38,6 +39,47 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
cfg := amdStressRVSConfig(123)
|
||||||
|
if !strings.Contains(cfg, "module: gst") {
|
||||||
|
t.Fatalf("config missing gst module:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if strings.Contains(cfg, "module: mem") {
|
||||||
|
t.Fatalf("config should not include mem module:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if !strings.Contains(cfg, "copy_matrix: false") {
|
||||||
|
t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
|
||||||
|
}
|
||||||
|
if strings.Count(cfg, "duration: 123000") != 1 {
|
||||||
|
t.Fatalf("config should apply duration once:\n%s", cfg)
|
||||||
|
}
|
||||||
|
for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
|
||||||
|
if !strings.Contains(cfg, field) {
|
||||||
|
t.Fatalf("config missing %s:\n%s", field, cfg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
jobs := amdStressJobs(300, "/tmp/test-amd-gst.conf")
|
||||||
|
if len(jobs) != 4 {
|
||||||
|
t.Fatalf("jobs=%d want 4", len(jobs))
|
||||||
|
}
|
||||||
|
if got := jobs[1].cmd[0]; got != "rocm-bandwidth-test" {
|
||||||
|
t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", got)
|
||||||
|
}
|
||||||
|
if got := jobs[2].cmd[0]; got != "rvs" {
|
||||||
|
t.Fatalf("jobs[2]=%q want rvs", got)
|
||||||
|
}
|
||||||
|
if got := jobs[2].cmd[2]; got != "/tmp/test-amd-gst.conf" {
|
||||||
|
t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
||||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
||||||
|
|||||||
@@ -599,10 +599,9 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
|
|||||||
case <-r.Context().Done():
|
case <-r.Context().Done():
|
||||||
return
|
return
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
sample := platform.SampleLiveMetrics()
|
sample, ok := h.latestMetric()
|
||||||
h.feedRings(sample)
|
if !ok {
|
||||||
if h.metricsDB != nil {
|
continue
|
||||||
_ = h.metricsDB.Write(sample)
|
|
||||||
}
|
}
|
||||||
b, err := json.Marshal(sample)
|
b, err := json.Marshal(sample)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ package webui
|
|||||||
import (
|
import (
|
||||||
"database/sql"
|
"database/sql"
|
||||||
"encoding/csv"
|
"encoding/csv"
|
||||||
"fmt"
|
|
||||||
"io"
|
"io"
|
||||||
"strconv"
|
"strconv"
|
||||||
"time"
|
"time"
|
||||||
@@ -13,7 +12,6 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const metricsDBPath = "/appdata/bee/metrics.db"
|
const metricsDBPath = "/appdata/bee/metrics.db"
|
||||||
const metricsKeepDuration = 24 * time.Hour
|
|
||||||
|
|
||||||
// MetricsDB persists live metric samples to SQLite.
|
// MetricsDB persists live metric samples to SQLite.
|
||||||
type MetricsDB struct {
|
type MetricsDB struct {
|
||||||
@@ -116,11 +114,18 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
// It reconstructs LiveMetricSample from the normalized tables.
|
|
||||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
rows, err := m.db.Query(
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
|
||||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n,
|
}
|
||||||
)
|
|
||||||
|
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||||
|
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||||
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||||
|
}
|
||||||
|
|
||||||
|
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||||
|
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
||||||
|
rows, err := m.db.Query(query, args...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -257,14 +262,6 @@ func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
|||||||
return samples, nil
|
return samples, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Prune deletes samples older than keepDuration.
|
|
||||||
func (m *MetricsDB) Prune(keepDuration time.Duration) {
|
|
||||||
cutoff := time.Now().Add(-keepDuration).Unix()
|
|
||||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
|
||||||
_, _ = m.db.Exec(fmt.Sprintf("DELETE FROM %s WHERE ts < ?", table), cutoff)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ExportCSV writes all sys+gpu data as CSV to w.
|
// ExportCSV writes all sys+gpu data as CSV to w.
|
||||||
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||||
rows, err := m.db.Query(`
|
rows, err := m.db.Query(`
|
||||||
|
|||||||
@@ -494,7 +494,11 @@ func renderValidate() string {
|
|||||||
renderSATCard("memory", "Memory", "") +
|
renderSATCard("memory", "Memory", "") +
|
||||||
renderSATCard("storage", "Storage", "") +
|
renderSATCard("storage", "Storage", "") +
|
||||||
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
|
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
|
||||||
renderSATCard("amd", "AMD GPU", "") +
|
renderSATCard("amd", "AMD GPU", `<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button id="sat-btn-amd-mem" class="btn" type="button" onclick="runSAT('amd-mem')">MEM Integrity</button>
|
||||||
|
<button id="sat-btn-amd-bandwidth" class="btn" type="button" onclick="runSAT('amd-bandwidth')">MEM Bandwidth</button>
|
||||||
|
</div>
|
||||||
|
<p style="color:var(--muted);font-size:12px;margin:0">Additional AMD memory diagnostics: RVS MEM for integrity and BABEL + rocm-bandwidth-test for memory/interconnect bandwidth.</p>`) +
|
||||||
`</div>
|
`</div>
|
||||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||||
@@ -505,7 +509,7 @@ let satES = null;
|
|||||||
function runSAT(target) {
|
function runSAT(target) {
|
||||||
if (satES) { satES.close(); satES = null; }
|
if (satES) { satES.close(); satES = null; }
|
||||||
const body = {};
|
const body = {};
|
||||||
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
|
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
body.display_name = labels[target] || ('Validate ' + target);
|
body.display_name = labels[target] || ('Validate ' + target);
|
||||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||||
@@ -524,7 +528,7 @@ function runSAT(target) {
|
|||||||
}
|
}
|
||||||
function runAllSAT() {
|
function runAllSAT() {
|
||||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||||
const targets = ['nvidia','memory','storage','cpu','amd'];
|
const targets = ['nvidia','memory','storage','cpu','amd','amd-mem','amd-bandwidth'];
|
||||||
const total = targets.length * cycles;
|
const total = targets.length * cycles;
|
||||||
let enqueued = 0;
|
let enqueued = 0;
|
||||||
const status = document.getElementById('sat-all-status');
|
const status = document.getElementById('sat-all-status');
|
||||||
@@ -536,7 +540,7 @@ function runAllSAT() {
|
|||||||
const btn = document.getElementById('sat-btn-' + target);
|
const btn = document.getElementById('sat-btn-' + target);
|
||||||
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
|
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
|
||||||
const body = {};
|
const body = {};
|
||||||
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
|
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
body.display_name = labels[target] || ('Validate ' + target);
|
body.display_name = labels[target] || ('Validate ' + target);
|
||||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||||
@@ -554,6 +558,8 @@ function runAllSAT() {
|
|||||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||||
|
if (!gp.amd) disableSATCard('amd-mem', 'No AMD GPU detected');
|
||||||
|
if (!gp.amd) disableSATCard('amd-bandwidth', 'No AMD GPU detected');
|
||||||
});
|
});
|
||||||
function disableSATCard(id, reason) {
|
function disableSATCard(id, reason) {
|
||||||
const btn = document.getElementById('sat-btn-' + id);
|
const btn = document.getElementById('sat-btn-' + id);
|
||||||
@@ -598,7 +604,7 @@ func renderBurn() string {
|
|||||||
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
||||||
</div></div>
|
</div></div>
|
||||||
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
|
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
|
||||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Requires ROCm tools (rocm-bandwidth-test). Missing tools reported as UNSUPPORTED.</p>
|
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs ROCm compute stress together with VRAM copy/load activity via RVS GST and records a separate <code>rocm-bandwidth-test</code> snapshot. Missing tools reported as UNSUPPORTED.</p>
|
||||||
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">▶ Start AMD Stress</button>
|
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">▶ Start AMD Stress</button>
|
||||||
</div></div>
|
</div></div>
|
||||||
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
|
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
|
||||||
@@ -609,6 +615,10 @@ func renderBurn() string {
|
|||||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
|
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
|
||||||
<button class="btn btn-primary" onclick="runBurnIn('sat-stress')">▶ Start SAT Stress</button>
|
<button class="btn btn-primary" onclick="runBurnIn('sat-stress')">▶ Start SAT Stress</button>
|
||||||
</div></div>
|
</div></div>
|
||||||
|
<div class="card"><div class="card-head">Platform Thermal Cycling</div><div class="card-body">
|
||||||
|
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs CPU + GPU stress simultaneously across multiple load/idle cycles with varying durations. Detects cooling systems that fail to recover under repeated load cycles. Smoke: 2 cycles ~5 min. Acceptance: 4 cycles ~25 min.</p>
|
||||||
|
<button class="btn btn-primary" onclick="runBurnIn('platform-stress')">▶ Start Thermal Cycling</button>
|
||||||
|
</div></div>
|
||||||
</div>
|
</div>
|
||||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||||
|
|||||||
@@ -72,29 +72,36 @@ func (r *metricsRing) snapshot() ([]float64, []string) {
|
|||||||
defer r.mu.Unlock()
|
defer r.mu.Unlock()
|
||||||
v := make([]float64, len(r.vals))
|
v := make([]float64, len(r.vals))
|
||||||
copy(v, r.vals)
|
copy(v, r.vals)
|
||||||
now := time.Now()
|
|
||||||
labels := make([]string, len(r.times))
|
labels := make([]string, len(r.times))
|
||||||
|
if len(r.times) == 0 {
|
||||||
|
return v, labels
|
||||||
|
}
|
||||||
|
sameDay := timestampsSameLocalDay(r.times)
|
||||||
for i, t := range r.times {
|
for i, t := range r.times {
|
||||||
labels[i] = relAgeLabel(now.Sub(t))
|
labels[i] = formatTimelineLabel(t.Local(), sameDay)
|
||||||
}
|
}
|
||||||
return v, labels
|
return v, labels
|
||||||
}
|
}
|
||||||
|
|
||||||
func relAgeLabel(age time.Duration) string {
|
func timestampsSameLocalDay(times []time.Time) bool {
|
||||||
if age <= 0 {
|
if len(times) == 0 {
|
||||||
return "0"
|
return true
|
||||||
}
|
}
|
||||||
if age < time.Hour {
|
first := times[0].Local()
|
||||||
m := int(age.Minutes())
|
for _, t := range times[1:] {
|
||||||
if m == 0 {
|
local := t.Local()
|
||||||
return "-1m"
|
if local.Year() != first.Year() || local.YearDay() != first.YearDay() {
|
||||||
|
return false
|
||||||
}
|
}
|
||||||
return fmt.Sprintf("-%dm", m)
|
|
||||||
}
|
}
|
||||||
if age < 24*time.Hour {
|
return true
|
||||||
return fmt.Sprintf("-%dh", int(age.Hours()))
|
}
|
||||||
|
|
||||||
|
func formatTimelineLabel(ts time.Time, sameDay bool) string {
|
||||||
|
if sameDay {
|
||||||
|
return ts.Format("15:04")
|
||||||
}
|
}
|
||||||
return fmt.Sprintf("-%dd", int(age.Hours()/24))
|
return ts.Format("01-02 15:04")
|
||||||
}
|
}
|
||||||
|
|
||||||
// gpuRings holds per-GPU ring buffers.
|
// gpuRings holds per-GPU ring buffers.
|
||||||
@@ -132,6 +139,8 @@ type handler struct {
|
|||||||
// per-GPU rings (index = GPU index)
|
// per-GPU rings (index = GPU index)
|
||||||
gpuRings []*gpuRings
|
gpuRings []*gpuRings
|
||||||
ringsMu sync.Mutex
|
ringsMu sync.Mutex
|
||||||
|
latestMu sync.RWMutex
|
||||||
|
latest *platform.LiveMetricSample
|
||||||
// metrics persistence (nil if DB unavailable)
|
// metrics persistence (nil if DB unavailable)
|
||||||
metricsDB *MetricsDB
|
metricsDB *MetricsDB
|
||||||
// install job (at most one at a time)
|
// install job (at most one at a time)
|
||||||
@@ -164,13 +173,16 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// Open metrics DB and pre-fill ring buffers from history.
|
// Open metrics DB and pre-fill ring buffers from history.
|
||||||
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
||||||
h.metricsDB = db
|
h.metricsDB = db
|
||||||
db.Prune(metricsKeepDuration)
|
|
||||||
if samples, err := db.LoadRecent(120); err == nil {
|
if samples, err := db.LoadRecent(120); err == nil {
|
||||||
for _, s := range samples {
|
for _, s := range samples {
|
||||||
h.feedRings(s)
|
h.feedRings(s)
|
||||||
}
|
}
|
||||||
|
if len(samples) > 0 {
|
||||||
|
h.setLatestMetric(samples[len(samples)-1])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
h.startMetricsCollector()
|
||||||
|
|
||||||
globalQueue.startWorker(&opts)
|
globalQueue.startWorker(&opts)
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
@@ -198,9 +210,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||||
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
||||||
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
|
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
|
||||||
|
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
|
||||||
|
mux.HandleFunc("POST /api/sat/amd-bandwidth/run", h.handleAPISATRun("amd-bandwidth"))
|
||||||
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
|
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
|
||||||
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
|
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
|
||||||
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
|
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
|
||||||
|
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
|
||||||
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
|
||||||
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
|
||||||
|
|
||||||
@@ -260,6 +275,37 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
return mux
|
return mux
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) startMetricsCollector() {
|
||||||
|
go func() {
|
||||||
|
ticker := time.NewTicker(1 * time.Second)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for range ticker.C {
|
||||||
|
sample := platform.SampleLiveMetrics()
|
||||||
|
h.feedRings(sample)
|
||||||
|
h.setLatestMetric(sample)
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
_ = h.metricsDB.Write(sample)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
||||||
|
h.latestMu.Lock()
|
||||||
|
defer h.latestMu.Unlock()
|
||||||
|
cp := sample
|
||||||
|
h.latest = &cp
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
|
||||||
|
h.latestMu.RLock()
|
||||||
|
defer h.latestMu.RUnlock()
|
||||||
|
if h.latest == nil {
|
||||||
|
return platform.LiveMetricSample{}, false
|
||||||
|
}
|
||||||
|
return *h.latest, true
|
||||||
|
}
|
||||||
|
|
||||||
// ListenAndServe starts the HTTP server.
|
// ListenAndServe starts the HTTP server.
|
||||||
func ListenAndServe(addr string, opts HandlerOptions) error {
|
func ListenAndServe(addr string, opts HandlerOptions) error {
|
||||||
return http.ListenAndServe(addr, NewHandler(opts))
|
return http.ListenAndServe(addr, NewHandler(opts))
|
||||||
@@ -387,6 +433,20 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
||||||
path = strings.TrimSuffix(path, ".svg")
|
path = strings.TrimSuffix(path, ".svg")
|
||||||
|
|
||||||
|
if h.metricsDB != nil {
|
||||||
|
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
|
||||||
|
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
||||||
|
if err != nil {
|
||||||
|
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "image/svg+xml")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
_, _ = w.Write(buf)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var datasets [][]float64
|
var datasets [][]float64
|
||||||
var names []string
|
var names []string
|
||||||
var labels []string
|
var labels []string
|
||||||
@@ -601,6 +661,259 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
_, _ = w.Write(buf)
|
_, _ = w.Write(buf)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||||
|
samples, err := h.metricsDB.LoadAll()
|
||||||
|
if err != nil || len(samples) == 0 {
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
return chartDataFromSamples(path, samples)
|
||||||
|
}
|
||||||
|
|
||||||
|
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||||
|
var datasets [][]float64
|
||||||
|
var names []string
|
||||||
|
var title string
|
||||||
|
var yMin, yMax *float64
|
||||||
|
labels := sampleTimeLabels(samples)
|
||||||
|
|
||||||
|
switch {
|
||||||
|
case path == "server-load":
|
||||||
|
title = "CPU / Memory Load"
|
||||||
|
cpu := make([]float64, len(samples))
|
||||||
|
mem := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
cpu[i] = s.CPULoadPct
|
||||||
|
mem[i] = s.MemLoadPct
|
||||||
|
}
|
||||||
|
datasets = [][]float64{cpu, mem}
|
||||||
|
names = []string{"CPU Load %", "Mem Load %"}
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = floatPtr(100)
|
||||||
|
|
||||||
|
case path == "server-temp", path == "server-temp-cpu":
|
||||||
|
title = "CPU Temperature"
|
||||||
|
datasets, names = namedTempDatasets(samples, "cpu")
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "server-temp-gpu":
|
||||||
|
title = "GPU Temperature"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "server-temp-ambient":
|
||||||
|
title = "Ambient / Other Sensors"
|
||||||
|
datasets, names = namedTempDatasets(samples, "ambient")
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case path == "server-power":
|
||||||
|
title = "System Power"
|
||||||
|
power := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
power[i] = s.PowerW
|
||||||
|
}
|
||||||
|
datasets = [][]float64{power}
|
||||||
|
names = []string{"Power W"}
|
||||||
|
yMin, yMax = autoBounds120(power)
|
||||||
|
|
||||||
|
case path == "server-fans":
|
||||||
|
title = "Fan RPM"
|
||||||
|
datasets, names = namedFanDatasets(samples)
|
||||||
|
yMin, yMax = autoBounds120(datasets...)
|
||||||
|
|
||||||
|
case path == "gpu-all-load":
|
||||||
|
title = "GPU Compute Load"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = floatPtr(100)
|
||||||
|
|
||||||
|
case path == "gpu-all-memload":
|
||||||
|
title = "GPU Memory Load"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = floatPtr(100)
|
||||||
|
|
||||||
|
case path == "gpu-all-power":
|
||||||
|
title = "GPU Power"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
|
yMin, yMax = autoBounds120(datasets...)
|
||||||
|
|
||||||
|
case path == "gpu-all-temp":
|
||||||
|
title = "GPU Temperature"
|
||||||
|
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(datasets...)
|
||||||
|
|
||||||
|
case strings.HasPrefix(path, "gpu/"):
|
||||||
|
rest := strings.TrimPrefix(path, "gpu/")
|
||||||
|
sub := ""
|
||||||
|
if i := strings.LastIndex(rest, "-"); i > 0 {
|
||||||
|
sub = rest[i+1:]
|
||||||
|
rest = rest[:i]
|
||||||
|
}
|
||||||
|
idx := 0
|
||||||
|
fmt.Sscanf(rest, "%d", &idx)
|
||||||
|
switch sub {
|
||||||
|
case "load":
|
||||||
|
title = fmt.Sprintf("GPU %d Load", idx)
|
||||||
|
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||||
|
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||||
|
if util == nil && mem == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||||
|
names = []string{"Load %", "Mem %"}
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = floatPtr(100)
|
||||||
|
case "temp":
|
||||||
|
title = fmt.Sprintf("GPU %d Temperature", idx)
|
||||||
|
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||||
|
if temp == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{temp}
|
||||||
|
names = []string{"Temp °C"}
|
||||||
|
yMin = floatPtr(0)
|
||||||
|
yMax = autoMax120(temp)
|
||||||
|
default:
|
||||||
|
title = fmt.Sprintf("GPU %d Power", idx)
|
||||||
|
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||||
|
if power == nil {
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
datasets = [][]float64{power}
|
||||||
|
names = []string{"Power W"}
|
||||||
|
yMin, yMax = autoBounds120(power)
|
||||||
|
}
|
||||||
|
|
||||||
|
default:
|
||||||
|
return nil, nil, nil, "", nil, nil, false
|
||||||
|
}
|
||||||
|
|
||||||
|
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
|
||||||
|
labels := make([]string, len(samples))
|
||||||
|
if len(samples) == 0 {
|
||||||
|
return labels
|
||||||
|
}
|
||||||
|
times := make([]time.Time, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
times[i] = s.Timestamp
|
||||||
|
}
|
||||||
|
sameDay := timestampsSameLocalDay(times)
|
||||||
|
for i, s := range samples {
|
||||||
|
labels[i] = formatTimelineLabel(s.Timestamp.Local(), sameDay)
|
||||||
|
}
|
||||||
|
return labels
|
||||||
|
}
|
||||||
|
|
||||||
|
func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]float64, []string) {
|
||||||
|
seen := map[string]bool{}
|
||||||
|
var names []string
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, t := range s.Temps {
|
||||||
|
if t.Group == group && !seen[t.Name] {
|
||||||
|
seen[t.Name] = true
|
||||||
|
names = append(names, t.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
datasets := make([][]float64, 0, len(names))
|
||||||
|
for _, name := range names {
|
||||||
|
ds := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
for _, t := range s.Temps {
|
||||||
|
if t.Group == group && t.Name == name {
|
||||||
|
ds[i] = t.Celsius
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
datasets = append(datasets, ds)
|
||||||
|
}
|
||||||
|
return datasets, names
|
||||||
|
}
|
||||||
|
|
||||||
|
func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []string) {
|
||||||
|
seen := map[string]bool{}
|
||||||
|
var names []string
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, f := range s.Fans {
|
||||||
|
if !seen[f.Name] {
|
||||||
|
seen[f.Name] = true
|
||||||
|
names = append(names, f.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
datasets := make([][]float64, 0, len(names))
|
||||||
|
for _, name := range names {
|
||||||
|
ds := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
for _, f := range s.Fans {
|
||||||
|
if f.Name == name {
|
||||||
|
ds[i] = f.RPM
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
datasets = append(datasets, ds)
|
||||||
|
}
|
||||||
|
return datasets, names
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetricRow) float64) ([][]float64, []string) {
|
||||||
|
seen := map[int]bool{}
|
||||||
|
var indices []int
|
||||||
|
for _, s := range samples {
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if !seen[g.GPUIndex] {
|
||||||
|
seen[g.GPUIndex] = true
|
||||||
|
indices = append(indices, g.GPUIndex)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
datasets := make([][]float64, 0, len(indices))
|
||||||
|
names := make([]string, 0, len(indices))
|
||||||
|
for _, idx := range indices {
|
||||||
|
ds := gpuDatasetByIndex(samples, idx, pick)
|
||||||
|
if ds == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
datasets = append(datasets, ds)
|
||||||
|
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||||
|
}
|
||||||
|
return datasets, names
|
||||||
|
}
|
||||||
|
|
||||||
|
func gpuDatasetByIndex(samples []platform.LiveMetricSample, idx int, pick func(platform.GPUMetricRow) float64) []float64 {
|
||||||
|
found := false
|
||||||
|
ds := make([]float64, len(samples))
|
||||||
|
for i, s := range samples {
|
||||||
|
for _, g := range s.GPUs {
|
||||||
|
if g.GPUIndex == idx {
|
||||||
|
ds[i] = pick(g)
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return ds
|
||||||
|
}
|
||||||
|
|
||||||
|
// coalesceDataset returns ds unchanged unless it is nil, in which case it
// returns a new all-zero series of length n.
func coalesceDataset(ds []float64, n int) []float64 {
	if ds == nil {
		return make([]float64, n)
	}
	return ds
}
|
||||||
|
|
||||||
// floatPtr returns a pointer to a newly allocated float64 holding v.
func floatPtr(v float64) *float64 {
	p := new(float64)
	*p = v
	return p
}
|
||||||
|
|
||||||
@@ -621,6 +934,47 @@ func autoMax120(datasets ...[]float64) *float64 {
|
|||||||
return &v
|
return &v
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// autoBounds120 derives Y-axis bounds from the finite values in datasets.
//
// Returns:
//   - (nil, nil) when there is no finite value at all;
//   - (0, nil) when the largest value is <= 0, leaving the upper bound unset;
//   - otherwise (low, high), the data range widened by 20% of its span on each
//     side, with low clamped to 0. A flat series uses 10% of its value as the
//     span (or 1 if that product is not positive).
//
// NaN and ±Inf values are skipped so that one bad reading cannot turn both
// bounds into NaN/Inf. Locals are named lo/hi to avoid shadowing the min/max
// builtins.
func autoBounds120(datasets ...[]float64) (*float64, *float64) {
	var lo, hi float64
	seen := false
	for _, ds := range datasets {
		for _, v := range ds {
			// v-v is 0 only for finite v; for NaN and ±Inf it is NaN.
			if v-v != 0 {
				continue
			}
			if !seen {
				lo, hi, seen = v, v, true
				continue
			}
			if v < lo {
				lo = v
			}
			if v > hi {
				hi = v
			}
		}
	}
	if !seen {
		return nil, nil
	}
	if hi <= 0 {
		zero := 0.0
		return &zero, nil
	}

	span := hi - lo
	if span <= 0 {
		// Flat series: invent a span so the line is not drawn on the border.
		span = hi * 0.1
		if span <= 0 {
			span = 1
		}
	}

	pad := span * 0.2
	low := lo - pad
	if low < 0 {
		low = 0
	}
	high := hi + pad
	return &low, &high
}
|
||||||
|
|
||||||
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
|
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
|
||||||
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
|
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
|
||||||
n := len(labels)
|
n := len(labels)
|
||||||
|
|||||||
@@ -7,6 +7,9 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestChartLegendNumber(t *testing.T) {
|
func TestChartLegendNumber(t *testing.T) {
|
||||||
@@ -31,6 +34,61 @@ func TestChartLegendNumber(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-3 * time.Minute),
|
||||||
|
CPULoadPct: 10,
|
||||||
|
MemLoadPct: 20,
|
||||||
|
PowerW: 300,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
CPULoadPct: 30,
|
||||||
|
MemLoadPct: 40,
|
||||||
|
PowerW: 320,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
CPULoadPct: 50,
|
||||||
|
MemLoadPct: 60,
|
||||||
|
PowerW: 340,
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
if len(names) != 1 || names[0] != "GPU 0" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if len(labels) != len(samples) {
|
||||||
|
t.Fatalf("labels len=%d want %d", len(labels), len(samples))
|
||||||
|
}
|
||||||
|
if len(datasets) != 1 || len(datasets[0]) != len(samples) {
|
||||||
|
t.Fatalf("datasets shape=%v", datasets)
|
||||||
|
}
|
||||||
|
if got := datasets[0][0]; got != 120 {
|
||||||
|
t.Fatalf("datasets[0][0]=%v want 120", got)
|
||||||
|
}
|
||||||
|
if got := datasets[0][2]; got != 130 {
|
||||||
|
t.Fatalf("datasets[0][2]=%v want 130", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRootRendersDashboard(t *testing.T) {
|
func TestRootRendersDashboard(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Task statuses.
|
// Task statuses.
|
||||||
@@ -30,9 +31,12 @@ var taskNames = map[string]string{
|
|||||||
"storage": "Storage SAT",
|
"storage": "Storage SAT",
|
||||||
"cpu": "CPU SAT",
|
"cpu": "CPU SAT",
|
||||||
"amd": "AMD GPU SAT",
|
"amd": "AMD GPU SAT",
|
||||||
|
"amd-mem": "AMD GPU MEM Integrity",
|
||||||
|
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||||
"amd-stress": "AMD GPU Burn-in",
|
"amd-stress": "AMD GPU Burn-in",
|
||||||
"memory-stress": "Memory Burn-in",
|
"memory-stress": "Memory Burn-in",
|
||||||
"sat-stress": "SAT Stress (stressapptest)",
|
"sat-stress": "SAT Stress (stressapptest)",
|
||||||
|
"platform-stress": "Platform Thermal Cycling",
|
||||||
"audit": "Audit",
|
"audit": "Audit",
|
||||||
"install": "Install to Disk",
|
"install": "Install to Disk",
|
||||||
"install-to-ram": "Install to RAM",
|
"install-to-ram": "Install to RAM",
|
||||||
@@ -96,6 +100,34 @@ func resolveBurnPreset(profile string) burnPreset {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||||
|
switch profile {
|
||||||
|
case "overnight":
|
||||||
|
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||||
|
{LoadSec: 600, IdleSec: 120},
|
||||||
|
{LoadSec: 600, IdleSec: 60},
|
||||||
|
{LoadSec: 600, IdleSec: 30},
|
||||||
|
{LoadSec: 600, IdleSec: 120},
|
||||||
|
{LoadSec: 600, IdleSec: 60},
|
||||||
|
{LoadSec: 600, IdleSec: 30},
|
||||||
|
{LoadSec: 600, IdleSec: 120},
|
||||||
|
{LoadSec: 600, IdleSec: 60},
|
||||||
|
}}
|
||||||
|
case "acceptance":
|
||||||
|
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||||
|
{LoadSec: 300, IdleSec: 60},
|
||||||
|
{LoadSec: 300, IdleSec: 30},
|
||||||
|
{LoadSec: 300, IdleSec: 60},
|
||||||
|
{LoadSec: 300, IdleSec: 30},
|
||||||
|
}}
|
||||||
|
default: // smoke
|
||||||
|
return platform.PlatformStressOptions{Cycles: []platform.PlatformStressCycle{
|
||||||
|
{LoadSec: 90, IdleSec: 60},
|
||||||
|
{LoadSec: 90, IdleSec: 30},
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||||
type taskQueue struct {
|
type taskQueue struct {
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
@@ -124,6 +156,12 @@ var (
|
|||||||
runAMDAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
runAMDAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
|
return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||||
}
|
}
|
||||||
|
runAMDMemIntegrityPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDMemIntegrityPackCtx(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||||
}
|
}
|
||||||
@@ -380,6 +418,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||||
case "amd":
|
case "amd":
|
||||||
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-mem":
|
||||||
|
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||||
|
case "amd-bandwidth":
|
||||||
|
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||||
case "amd-stress":
|
case "amd-stress":
|
||||||
dur := t.params.Duration
|
dur := t.params.Duration
|
||||||
if t.params.BurnProfile != "" && dur <= 0 {
|
if t.params.BurnProfile != "" && dur <= 0 {
|
||||||
@@ -398,6 +440,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||||
}
|
}
|
||||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||||
|
case "platform-stress":
|
||||||
|
opts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||||
|
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
|
||||||
case "audit":
|
case "audit":
|
||||||
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
||||||
if e != nil {
|
if e != nil {
|
||||||
|
|||||||
@@ -30,8 +30,8 @@ lb config noauto \
|
|||||||
--linux-flavours "amd64" \
|
--linux-flavours "amd64" \
|
||||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||||
--memtest none \
|
--memtest none \
|
||||||
--iso-volume "EASY-BEE" \
|
--iso-volume "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
|||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
REBUILD_IMAGE=0
|
REBUILD_IMAGE=0
|
||||||
CLEAN_CACHE=0
|
CLEAN_CACHE=0
|
||||||
|
VARIANT="all"
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
|
|
||||||
@@ -34,14 +35,23 @@ while [ $# -gt 0 ]; do
|
|||||||
REBUILD_IMAGE=1
|
REBUILD_IMAGE=1
|
||||||
shift
|
shift
|
||||||
;;
|
;;
|
||||||
|
--variant)
|
||||||
|
VARIANT="$2"
|
||||||
|
shift 2
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
echo "unknown arg: $1" >&2
|
echo "unknown arg: $1" >&2
|
||||||
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys]" >&2
|
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
|
case "$VARIANT" in
|
||||||
|
nvidia|amd|nogpu|all) ;;
|
||||||
|
*) echo "unknown variant: $VARIANT (expected nvidia, amd, nogpu, or all)" >&2; exit 1 ;;
|
||||||
|
esac
|
||||||
|
|
||||||
if [ "$CLEAN_CACHE" = "1" ]; then
|
if [ "$CLEAN_CACHE" = "1" ]; then
|
||||||
echo "=== cleaning build cache: ${CACHE_DIR} ==="
|
echo "=== cleaning build cache: ${CACHE_DIR} ==="
|
||||||
rm -rf "${CACHE_DIR:?}/go-build" \
|
rm -rf "${CACHE_DIR:?}/go-build" \
|
||||||
@@ -49,8 +59,10 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
|||||||
"${CACHE_DIR:?}/tmp" \
|
"${CACHE_DIR:?}/tmp" \
|
||||||
"${CACHE_DIR:?}/bee" \
|
"${CACHE_DIR:?}/bee" \
|
||||||
"${CACHE_DIR:?}/lb-packages"
|
"${CACHE_DIR:?}/lb-packages"
|
||||||
echo "=== cleaning live-build work dir: ${REPO_ROOT}/dist/live-build-work ==="
|
echo "=== cleaning live-build work dirs ==="
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work"
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||||
|
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||||
echo "=== caches cleared, proceeding with build ==="
|
echo "=== caches cleared, proceeding with build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -108,34 +120,75 @@ else
|
|||||||
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
set -- \
|
# Build base docker run args (without --authorized-keys)
|
||||||
run --rm --privileged \
|
build_run_args() {
|
||||||
--platform "${BUILDER_PLATFORM}" \
|
_variant="$1"
|
||||||
-v "${REPO_ROOT}:/work" \
|
_auth_arg=""
|
||||||
-v "${CACHE_DIR}:/cache" \
|
if [ -n "$AUTH_KEYS" ]; then
|
||||||
-e BEE_CONTAINER_BUILD=1 \
|
_auth_arg="--authorized-keys /tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
||||||
-e GOCACHE=/cache/go-build \
|
fi
|
||||||
-e GOMODCACHE=/cache/go-mod \
|
echo "run --rm --privileged \
|
||||||
-e TMPDIR=/cache/tmp \
|
--platform ${BUILDER_PLATFORM} \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-v ${REPO_ROOT}:/work \
|
||||||
-w /work \
|
-v ${CACHE_DIR}:/cache \
|
||||||
"${IMAGE_REF}" \
|
${AUTH_KEYS:+-v ${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro} \
|
||||||
sh /work/iso/builder/build.sh
|
|
||||||
|
|
||||||
if [ -n "$AUTH_KEYS" ]; then
|
|
||||||
set -- run --rm --privileged \
|
|
||||||
--platform "${BUILDER_PLATFORM}" \
|
|
||||||
-v "${REPO_ROOT}:/work" \
|
|
||||||
-v "${CACHE_DIR}:/cache" \
|
|
||||||
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
|
|
||||||
-e BEE_CONTAINER_BUILD=1 \
|
-e BEE_CONTAINER_BUILD=1 \
|
||||||
-e GOCACHE=/cache/go-build \
|
-e GOCACHE=/cache/go-build \
|
||||||
-e GOMODCACHE=/cache/go-mod \
|
-e GOMODCACHE=/cache/go-mod \
|
||||||
-e TMPDIR=/cache/tmp \
|
-e TMPDIR=/cache/tmp \
|
||||||
-e BEE_CACHE_DIR=/cache/bee \
|
-e BEE_CACHE_DIR=/cache/bee \
|
||||||
-w /work \
|
-w /work \
|
||||||
"${IMAGE_REF}" \
|
${IMAGE_REF} \
|
||||||
sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
|
sh /work/iso/builder/build.sh --variant ${_variant} ${_auth_arg}"
|
||||||
fi
|
}
|
||||||
|
|
||||||
"$CONTAINER_TOOL" "$@"
|
# run_variant VARIANT
#
# Runs iso/builder/build.sh for one variant inside the builder container.
# When AUTH_KEYS is non-empty, AUTH_KEYS_DIR is additionally mounted read-only
# at /tmp/bee-authkeys and the key file is handed to build.sh through
# --authorized-keys. The function's status is that of the container command.
run_variant() {
    _v="$1"
    echo "=== building variant: ${_v} ==="

    # Assemble the container command line in the positional parameters so each
    # argument stays a separate, correctly quoted word. $1 was saved in _v above.
    set -- run --rm --privileged \
        --platform "${BUILDER_PLATFORM}" \
        -v "${REPO_ROOT}:/work" \
        -v "${CACHE_DIR}:/cache"

    if [ -n "$AUTH_KEYS" ]; then
        set -- "$@" -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro"
    fi

    set -- "$@" \
        -e BEE_CONTAINER_BUILD=1 \
        -e GOCACHE=/cache/go-build \
        -e GOMODCACHE=/cache/go-mod \
        -e TMPDIR=/cache/tmp \
        -e BEE_CACHE_DIR=/cache/bee \
        -w /work \
        "${IMAGE_REF}" \
        sh /work/iso/builder/build.sh --variant "${_v}"

    if [ -n "$AUTH_KEYS" ]; then
        set -- "$@" --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
    fi

    "$CONTAINER_TOOL" "$@"
}
|
||||||
|
|
||||||
|
case "$VARIANT" in
|
||||||
|
nvidia)
|
||||||
|
run_variant nvidia
|
||||||
|
;;
|
||||||
|
amd)
|
||||||
|
run_variant amd
|
||||||
|
;;
|
||||||
|
nogpu)
|
||||||
|
run_variant nogpu
|
||||||
|
;;
|
||||||
|
all)
|
||||||
|
run_variant nvidia
|
||||||
|
run_variant amd
|
||||||
|
run_variant nogpu
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|||||||
@@ -13,19 +13,29 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
|||||||
OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
|
OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
|
||||||
DIST_DIR="${REPO_ROOT}/dist"
|
DIST_DIR="${REPO_ROOT}/dist"
|
||||||
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
|
||||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work"
|
|
||||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage"
|
|
||||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
|
BEE_GPU_VENDOR="nvidia"
|
||||||
|
|
||||||
# parse args
|
# parse args
|
||||||
while [ $# -gt 0 ]; do
|
while [ $# -gt 0 ]; do
|
||||||
case "$1" in
|
case "$1" in
|
||||||
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
|
||||||
|
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
|
||||||
*) echo "unknown arg: $1"; exit 1 ;;
|
*) echo "unknown arg: $1"; exit 1 ;;
|
||||||
esac
|
esac
|
||||||
done
|
done
|
||||||
|
|
||||||
|
case "$BEE_GPU_VENDOR" in
|
||||||
|
nvidia|amd|nogpu) ;;
|
||||||
|
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia, amd, or nogpu)" >&2; exit 1 ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
|
||||||
|
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
|
||||||
|
|
||||||
|
export BEE_GPU_VENDOR
|
||||||
|
|
||||||
. "${BUILDER_DIR}/VERSIONS"
|
. "${BUILDER_DIR}/VERSIONS"
|
||||||
export PATH="$PATH:/usr/local/go/bin"
|
export PATH="$PATH:/usr/local/go/bin"
|
||||||
|
|
||||||
@@ -132,7 +142,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
|
|||||||
apt-get install -y "linux-headers-${KVER}"
|
apt-get install -y "linux-headers-${KVER}"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== bee ISO build ==="
|
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
|
||||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -141,8 +151,8 @@ echo "=== syncing git submodules ==="
|
|||||||
git -C "${REPO_ROOT}" submodule update --init --recursive
|
git -C "${REPO_ROOT}" submodule update --init --recursive
|
||||||
|
|
||||||
# --- compile bee binary (static, Linux amd64) ---
|
# --- compile bee binary (static, Linux amd64) ---
|
||||||
|
# Shared between variants — built once, reused on second pass.
|
||||||
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
||||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
|
||||||
NEED_BUILD=1
|
NEED_BUILD=1
|
||||||
if [ -f "$BEE_BIN" ]; then
|
if [ -f "$BEE_BIN" ]; then
|
||||||
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
||||||
@@ -172,37 +182,41 @@ else
|
|||||||
echo "=== bee binary up to date, skipping build ==="
|
echo "=== bee binary up to date, skipping build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo ""
|
# --- NVIDIA-only build steps ---
|
||||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
"${CUBLAS_VERSION}" \
|
echo ""
|
||||||
"${CUDA_USERSPACE_VERSION}" \
|
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||||
"${NCCL_CUDA_VERSION}" \
|
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||||
"${DIST_DIR}"
|
"${CUBLAS_VERSION}" \
|
||||||
|
"${CUDA_USERSPACE_VERSION}" \
|
||||||
|
"${NCCL_CUDA_VERSION}" \
|
||||||
|
"${DIST_DIR}"
|
||||||
|
|
||||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
GPU_STRESS_NEED_BUILD=1
|
GPU_STRESS_NEED_BUILD=1
|
||||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||||
GPU_STRESS_NEED_BUILD=0
|
GPU_STRESS_NEED_BUILD=0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||||
|
echo "=== building bee-gpu-stress ==="
|
||||||
|
gcc -O2 -s -Wall -Wextra \
|
||||||
|
-I"${CUBLAS_CACHE}/include" \
|
||||||
|
-o "$GPU_STRESS_BIN" \
|
||||||
|
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||||
|
-ldl -lm
|
||||||
|
echo "binary: $GPU_STRESS_BIN"
|
||||||
|
else
|
||||||
|
echo "=== bee-gpu-stress up to date, skipping build ==="
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
|
||||||
echo "=== building bee-gpu-stress ==="
|
|
||||||
gcc -O2 -s -Wall -Wextra \
|
|
||||||
-I"${CUBLAS_CACHE}/include" \
|
|
||||||
-o "$GPU_STRESS_BIN" \
|
|
||||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
|
||||||
-ldl -lm
|
|
||||||
echo "binary: $GPU_STRESS_BIN"
|
|
||||||
else
|
|
||||||
echo "=== bee-gpu-stress up to date, skipping build ==="
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "=== preparing staged overlay ==="
|
|
||||||
# Sync builder config into work dir, preserving lb cache (chroot + packages).
|
|
||||||
# We do NOT rm -rf BUILD_WORK_DIR so lb can reuse its chroot on repeat builds.
|
|
||||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||||
|
|
||||||
|
# Sync builder config into variant work dir, preserving lb cache.
|
||||||
rsync -a --delete \
|
rsync -a --delete \
|
||||||
--exclude='cache/' \
|
--exclude='cache/' \
|
||||||
--exclude='chroot/' \
|
--exclude='chroot/' \
|
||||||
@@ -212,7 +226,10 @@ rsync -a --delete \
|
|||||||
--exclude='*.contents' \
|
--exclude='*.contents' \
|
||||||
--exclude='*.files' \
|
--exclude='*.files' \
|
||||||
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
|
||||||
# Also persist package cache to CACHE_ROOT so it survives a manual wipe of BUILD_WORK_DIR.
|
|
||||||
|
# Share deb package cache across variants.
|
||||||
|
# Restore: populate work dir cache from shared cache before build.
|
||||||
|
# Persist: sync back after build (done after lb build below).
|
||||||
LB_PKG_CACHE="${CACHE_ROOT}/lb-packages"
|
LB_PKG_CACHE="${CACHE_ROOT}/lb-packages"
|
||||||
mkdir -p "${LB_PKG_CACHE}"
|
mkdir -p "${LB_PKG_CACHE}"
|
||||||
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
||||||
@@ -221,6 +238,7 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
|
|||||||
mkdir -p "${BUILD_WORK_DIR}/cache/packages.chroot"
|
mkdir -p "${BUILD_WORK_DIR}/cache/packages.chroot"
|
||||||
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
|
||||||
rm -f \
|
rm -f \
|
||||||
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
|
||||||
@@ -231,6 +249,12 @@ rm -f \
|
|||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
|
||||||
|
# Remove NVIDIA-specific overlay files for non-nvidia variants
|
||||||
|
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
|
||||||
|
rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
|
||||||
|
fi
|
||||||
|
|
||||||
# --- inject authorized_keys for SSH access ---
|
# --- inject authorized_keys for SSH access ---
|
||||||
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
|
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/root/.ssh"
|
mkdir -p "${OVERLAY_STAGE_DIR}/root/.ssh"
|
||||||
@@ -268,8 +292,11 @@ fi
|
|||||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then
|
||||||
|
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||||
|
fi
|
||||||
|
|
||||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||||
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||||
@@ -286,100 +313,152 @@ for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
# --- build NVIDIA kernel modules ---
|
# --- NVIDIA kernel modules and userspace libs ---
|
||||||
echo ""
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
echo ""
|
||||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
||||||
|
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||||
|
|
||||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_DIR}/usr/local/lib/nvidia"
|
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
mkdir -p "${OVERLAY_KMOD_DIR}"
|
||||||
mkdir -p "${OVERLAY_KMOD_DIR}"
|
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
||||||
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
|
|
||||||
|
|
||||||
# Inject nvidia-smi and libnvidia-ml
|
# Inject nvidia-smi and libnvidia-ml
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
|
||||||
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
|
||||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||||
|
|
||||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
|
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
|
||||||
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
|
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
|
||||||
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
|
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- build / download NCCL ---
|
||||||
|
echo ""
|
||||||
|
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||||
|
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||||
|
|
||||||
|
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
|
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
||||||
|
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||||
|
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||||
|
|
||||||
|
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||||
|
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||||
|
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||||
|
|
||||||
|
# --- build nccl-tests ---
|
||||||
|
echo ""
|
||||||
|
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||||
|
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||||
|
"${NCCL_TESTS_VERSION}" \
|
||||||
|
"${NCCL_VERSION}" \
|
||||||
|
"${NCCL_CUDA_VERSION}" \
|
||||||
|
"${DIST_DIR}" \
|
||||||
|
"${NVCC_VERSION}" \
|
||||||
|
"${DEBIAN_VERSION}"
|
||||||
|
|
||||||
|
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||||
|
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
|
echo "=== all_reduce_perf injected ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- build / download NCCL ---
|
|
||||||
echo ""
|
|
||||||
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
|
||||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
|
||||||
|
|
||||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
|
||||||
|
|
||||||
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
|
||||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
|
||||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
|
||||||
|
|
||||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
|
||||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
|
||||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
|
||||||
|
|
||||||
# --- build nccl-tests ---
|
|
||||||
echo ""
|
|
||||||
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
|
||||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
|
||||||
"${NCCL_TESTS_VERSION}" \
|
|
||||||
"${NCCL_VERSION}" \
|
|
||||||
"${NCCL_CUDA_VERSION}" \
|
|
||||||
"${DIST_DIR}" \
|
|
||||||
"${NVCC_VERSION}" \
|
|
||||||
"${DEBIAN_VERSION}"
|
|
||||||
|
|
||||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
|
||||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
|
||||||
echo "=== all_reduce_perf injected ==="
|
|
||||||
|
|
||||||
# --- embed build metadata ---
|
# --- embed build metadata ---
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||||
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
|
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
|
||||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
|
||||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
||||||
BUILD_DATE=${BUILD_DATE}
|
|
||||||
GIT_COMMIT=${GIT_COMMIT}
|
|
||||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
|
||||||
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
|
||||||
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
|
|
||||||
NCCL_VERSION=${NCCL_VERSION}
|
NCCL_VERSION=${NCCL_VERSION}
|
||||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"
|
||||||
|
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||||
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
|
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||||
|
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
|
||||||
|
else
|
||||||
|
GPU_VERSION_LINE=""
|
||||||
|
GPU_BUILD_INFO="nogpu"
|
||||||
|
fi
|
||||||
|
|
||||||
|
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||||
|
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
||||||
|
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
||||||
|
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||||
|
BUILD_DATE=${BUILD_DATE}
|
||||||
|
GIT_COMMIT=${GIT_COMMIT}
|
||||||
|
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||||
|
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||||
|
${GPU_VERSION_LINE}
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
# Write GPU vendor marker for hooks
|
||||||
|
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
|
||||||
|
|
||||||
# Patch motd with build info
|
# Patch motd with build info
|
||||||
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} nvidia:${NVIDIA_DRIVER_VERSION}"
|
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
|
||||||
if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then
|
if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then
|
||||||
sed "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "${OVERLAY_STAGE_DIR}/etc/motd" \
|
sed "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "${OVERLAY_STAGE_DIR}/etc/motd" \
|
||||||
> "${OVERLAY_STAGE_DIR}/etc/motd.patched"
|
> "${OVERLAY_STAGE_DIR}/etc/motd.patched"
|
||||||
mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd"
|
mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# --- substitute version placeholders in package list ---
|
# --- copy variant-specific package list into work dir ---
|
||||||
sed -i \
|
cp "${BUILD_WORK_DIR}/config/package-lists/bee-${BEE_GPU_VENDOR}.list.chroot" \
|
||||||
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
|
||||||
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
|
# --- remove archives for the other vendor(s) ---
|
||||||
"${BUILD_WORK_DIR}/config/package-lists/bee.list.chroot" \
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
|
rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/rocm.key.chroot"
|
||||||
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
|
rm -f "${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot"
|
||||||
|
else
|
||||||
|
# nogpu: remove both
|
||||||
|
rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/rocm.key.chroot" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --- substitute version placeholders in package list and archive ---
|
||||||
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
sed -i \
|
||||||
|
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
|
||||||
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
|
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||||
|
sed -i \
|
||||||
|
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||||
|
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
|
||||||
|
-e "s/%%ROCM_BANDWIDTH_TEST_VERSION%%/${ROCM_BANDWIDTH_TEST_VERSION}/g" \
|
||||||
|
-e "s/%%ROCM_VALIDATION_SUITE_VERSION%%/${ROCM_VALIDATION_SUITE_VERSION}/g" \
|
||||||
|
-e "s/%%ROCBLAS_VERSION%%/${ROCBLAS_VERSION}/g" \
|
||||||
|
-e "s/%%ROCRAND_VERSION%%/${ROCRAND_VERSION}/g" \
|
||||||
|
-e "s/%%HIP_RUNTIME_AMD_VERSION%%/${HIP_RUNTIME_AMD_VERSION}/g" \
|
||||||
|
-e "s/%%HIPBLASLT_VERSION%%/${HIPBLASLT_VERSION}/g" \
|
||||||
|
-e "s/%%COMGR_VERSION%%/${COMGR_VERSION}/g" \
|
||||||
|
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
|
||||||
|
if [ -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" ]; then
|
||||||
|
sed -i \
|
||||||
|
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
|
||||||
|
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# --- sync overlay into live-build includes.chroot ---
|
# --- sync overlay into live-build includes.chroot ---
|
||||||
LB_DIR="${BUILD_WORK_DIR}"
|
LB_DIR="${BUILD_WORK_DIR}"
|
||||||
@@ -395,20 +474,31 @@ fi
|
|||||||
|
|
||||||
# --- build ISO using live-build ---
|
# --- build ISO using live-build ---
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== building ISO (live-build) ==="
|
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
|
||||||
|
|
||||||
|
# Export for auto/config
|
||||||
|
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
||||||
|
export BEE_GPU_VENDOR_UPPER
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
lb clean 2>&1 | tail -3
|
lb clean 2>&1 | tail -3
|
||||||
lb config 2>&1 | tail -5
|
lb config 2>&1 | tail -5
|
||||||
lb build 2>&1
|
lb build 2>&1
|
||||||
|
|
||||||
|
# --- persist deb package cache back to shared location ---
|
||||||
|
# This allows the second variant to reuse all downloaded packages.
|
||||||
|
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
|
||||||
|
rsync -a "${BUILD_WORK_DIR}/cache/packages.chroot/" "${LB_PKG_CACHE}/"
|
||||||
|
echo "=== package cache synced to ${LB_PKG_CACHE} ==="
|
||||||
|
fi
|
||||||
|
|
||||||
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
||||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||||
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
||||||
if [ -f "$ISO_RAW" ]; then
|
if [ -f "$ISO_RAW" ]; then
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
echo "=== done ==="
|
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
||||||
echo "ISO: $ISO_OUT"
|
echo "ISO: $ISO_OUT"
|
||||||
if command -v stat >/dev/null 2>&1; then
|
if command -v stat >/dev/null 2>&1; then
|
||||||
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"
|
||||||
|
|||||||
@@ -10,12 +10,12 @@ echo " ╚══════╝╚═╝ ╚═╝╚══════╝
|
|||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE (load to RAM)" {
|
menuentry "EASY-BEE (load to RAM)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,9 @@ set -e
|
|||||||
|
|
||||||
echo "=== bee chroot setup ==="
|
echo "=== bee chroot setup ==="
|
||||||
|
|
||||||
|
GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)
|
||||||
|
echo "=== GPU vendor: ${GPU_VENDOR} ==="
|
||||||
|
|
||||||
ensure_bee_console_user() {
|
ensure_bee_console_user() {
|
||||||
if id bee >/dev/null 2>&1; then
|
if id bee >/dev/null 2>&1; then
|
||||||
usermod -d /home/bee -s /bin/bash bee 2>/dev/null || true
|
usermod -d /home/bee -s /bin/bash bee 2>/dev/null || true
|
||||||
@@ -21,10 +24,8 @@ ensure_bee_console_user() {
|
|||||||
|
|
||||||
ensure_bee_console_user
|
ensure_bee_console_user
|
||||||
|
|
||||||
# Enable bee services
|
# Enable common bee services
|
||||||
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
|
||||||
systemctl enable bee-network.service
|
systemctl enable bee-network.service
|
||||||
systemctl enable bee-nvidia.service
|
|
||||||
systemctl enable bee-preflight.service
|
systemctl enable bee-preflight.service
|
||||||
systemctl enable bee-audit.service
|
systemctl enable bee-audit.service
|
||||||
systemctl enable bee-web.service
|
systemctl enable bee-web.service
|
||||||
@@ -36,25 +37,34 @@ systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
|
|||||||
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
|
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
|
||||||
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
|
||||||
|
|
||||||
|
# Enable GPU-vendor specific services
|
||||||
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
systemctl enable nvidia-dcgm.service 2>/dev/null || true
|
||||||
|
systemctl enable bee-nvidia.service
|
||||||
|
elif [ "$GPU_VENDOR" = "amd" ]; then
|
||||||
|
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
|
||||||
|
for tool in rocm-smi rocm-bandwidth-test rvs; do
|
||||||
|
if [ ! -e /usr/local/bin/${tool} ]; then
|
||||||
|
bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
|
||||||
|
[ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
# nogpu: no GPU services needed
|
||||||
|
|
||||||
# Ensure scripts are executable
|
# Ensure scripts are executable
|
||||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
|
||||||
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||||
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
|
fi
|
||||||
|
|
||||||
# Reload udev rules
|
# Reload udev rules
|
||||||
udevadm control --reload-rules 2>/dev/null || true
|
udevadm control --reload-rules 2>/dev/null || true
|
||||||
|
|
||||||
# rocm symlinks (packages install to /opt/rocm-*/bin/)
|
|
||||||
for tool in rocm-smi rocm-bandwidth-test rvs; do
|
|
||||||
if [ ! -e /usr/local/bin/${tool} ]; then
|
|
||||||
bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
|
|
||||||
[ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
# Create export directory
|
# Create export directory
|
||||||
mkdir -p /appdata/bee/export
|
mkdir -p /appdata/bee/export
|
||||||
|
|
||||||
@@ -62,4 +72,4 @@ if [ -f /etc/sudoers.d/bee ]; then
|
|||||||
chmod 0440 /etc/sudoers.d/bee
|
chmod 0440 /etc/sudoers.d/bee
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "=== bee chroot setup complete ==="
|
echo "=== bee chroot setup complete (${GPU_VENDOR}) ==="
|
||||||
|
|||||||
@@ -4,6 +4,9 @@
|
|||||||
# not inside the squashfs).
|
# not inside the squashfs).
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
echo "memtest: scanning chroot/boot/ for memtest files:"
|
||||||
|
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files found in chroot/boot/"
|
||||||
|
|
||||||
for f in memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi; do
|
for f in memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi; do
|
||||||
src="chroot/boot/${f}"
|
src="chroot/boot/${f}"
|
||||||
if [ -f "${src}" ]; then
|
if [ -f "${src}" ]; then
|
||||||
|
|||||||
9
iso/builder/config/package-lists/bee-amd.list.chroot
Normal file
9
iso/builder/config/package-lists/bee-amd.list.chroot
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||||
|
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||||
|
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||||
|
rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
|
||||||
|
rocblas=%%ROCBLAS_VERSION%%
|
||||||
|
rocrand=%%ROCRAND_VERSION%%
|
||||||
|
hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
|
||||||
|
hipblaslt=%%HIPBLASLT_VERSION%%
|
||||||
|
comgr=%%COMGR_VERSION%%
|
||||||
1
iso/builder/config/package-lists/bee-nogpu.list.chroot
Normal file
1
iso/builder/config/package-lists/bee-nogpu.list.chroot
Normal file
@@ -0,0 +1 @@
|
|||||||
|
# No GPU variant — no NVIDIA, no AMD/ROCm packages
|
||||||
2
iso/builder/config/package-lists/bee-nvidia.list.chroot
Normal file
2
iso/builder/config/package-lists/bee-nvidia.list.chroot
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
||||||
|
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
||||||
@@ -72,18 +72,5 @@ firmware-bnx2x
|
|||||||
firmware-cavium
|
firmware-cavium
|
||||||
firmware-qlogic
|
firmware-qlogic
|
||||||
|
|
||||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
|
||||||
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
|
||||||
|
|
||||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
|
||||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
|
||||||
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
|
||||||
rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
|
|
||||||
rocblas=%%ROCBLAS_VERSION%%
|
|
||||||
rocrand=%%ROCRAND_VERSION%%
|
|
||||||
hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
|
|
||||||
hipblaslt=%%HIPBLASLT_VERSION%%
|
|
||||||
comgr=%%COMGR_VERSION%%
|
|
||||||
|
|
||||||
# glibc compat helpers (for any external binaries that need it)
|
# glibc compat helpers (for any external binaries that need it)
|
||||||
libc6
|
libc6
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
|
|||||||
# --- PATH & binaries ---
|
# --- PATH & binaries ---
|
||||||
echo "-- PATH & binaries --"
|
echo "-- PATH & binaries --"
|
||||||
for tool in dmidecode smartctl nvme ipmitool lspci bee; do
|
for tool in dmidecode smartctl nvme ipmitool lspci bee; do
|
||||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
if p=$(PATH="/usr/local/bin:/usr/sbin:/sbin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||||
ok "$tool found: $p"
|
ok "$tool found: $p"
|
||||||
else
|
else
|
||||||
fail "$tool: NOT FOUND"
|
fail "$tool: NOT FOUND"
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
|
export PATH="$PATH:/usr/local/bin:/usr/sbin:/sbin:/opt/rocm/bin:/opt/rocm/sbin"
|
||||||
|
|
||||||
# Print web UI URLs on the local console at login.
|
# Print web UI URLs on the local console at login.
|
||||||
if [ -z "${SSH_CONNECTION:-}" ] \
|
if [ -z "${SSH_CONNECTION:-}" ] \
|
||||||
|
|||||||
Reference in New Issue
Block a user