Compare commits

...

6 Commits
v3.2 ... v3.4

Author SHA1 Message Date
ace1a9dba6 feat(iso): split into nvidia and amd variants, fix KVM graphics and PATH
- build.sh: add --variant nvidia|amd; separate work dirs per variant
  (live-build-work-nvidia / live-build-work-amd); GPU-specific steps
  (modules, NCCL, cuBLAS, nccl-tests) run only for nvidia; deb package
  cache synced back to shared location after each lb build so second
  variant reuses downloaded packages; ISO output named
  easy-bee-{variant}-v{ver}-amd64.iso
- build-in-container.sh: add --variant nvidia|amd|all (default: all);
  runs build.sh twice in one container for 'all'; --clean-build wipes
  both variant work dirs
- package-lists: remove GPU packages from bee.list.chroot; add
  bee-nvidia.list.chroot (DCGM) and bee-amd.list.chroot (ROCm)
- 9000-bee-setup hook: read /etc/bee-gpu-vendor; enable bee-nvidia.service
  and DCGM only for nvidia; set up ROCm symlinks only for amd
- auto/config: --iso-volume uses BEE_GPU_VENDOR_UPPER env var
- grub.cfg: add nomodeset to EASY-BEE and EASY-BEE (load to RAM) entries
  — fixes X/lightdm on BMC KVM (ASPEED AST chip requires nomodeset for
  fbdev to work; NVIDIA H100 compute does not need KMS)
- bee.sh / smoketest.sh: add /usr/sbin to PATH so dmidecode, smartctl,
  nvme are found
- 9100-memtest hook: add diagnostic listing of chroot/boot/memtest* files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-30 22:24:37 +03:00
905c581ece fix(iso): substitute all ROCm package version placeholders in build.sh
ROCM_BANDWIDTH_TEST_VERSION, ROCM_VALIDATION_SUITE_VERSION, ROCBLAS,
ROCRAND, HIP_RUNTIME_AMD, HIPBLASLT, COMGR were defined in VERSIONS and
in bee.list.chroot but the sed substitution block only covered 3 of them.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-29 22:00:05 +03:00
7c2a0135d2 feat(audit): add platform thermal cycling stress test
Runs CPU (stressapptest) + GPU stress simultaneously across multiple
load/idle cycles with varying idle durations (120s/60s/30s) to detect
cooling systems that fail to recover under repeated load.

Presets: smoke (~5 min), acceptance (~25 min), overnight (~100 min).
Outputs metrics.csv + summary.txt with per-cycle throttle and fan
spindown analysis, packed as tar.gz.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-29 21:57:33 +03:00
407c1cd1c4 fix(charts): unify timeline labels across graphs 2026-03-29 21:24:06 +03:00
e15bcc91c5 feat(metrics): persist history in sqlite and add AMD memory validate tests 2026-03-29 12:28:06 +03:00
98f0cf0d52 fix(amd-stress): include VRAM load in GST burn 2026-03-29 12:03:50 +03:00
22 changed files with 1422 additions and 210 deletions

View File

@@ -114,10 +114,13 @@ type satRunner interface {
DetectGPUVendor() string
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
}
@@ -577,6 +580,20 @@ func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
}
// RunAMDMemIntegrityPackCtx forwards to the SAT runner's AMD memory-integrity
// pack. An empty or whitespace-only baseDir is replaced by DefaultSATBaseDir;
// the runner's result is returned unchanged.
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
	dir := baseDir
	if len(strings.TrimSpace(dir)) == 0 {
		dir = DefaultSATBaseDir
	}
	return a.sat.RunAMDMemIntegrityPack(ctx, dir, logFunc)
}
// RunAMDMemBandwidthPackCtx forwards to the SAT runner's AMD memory-bandwidth
// pack. An empty or whitespace-only baseDir is replaced by DefaultSATBaseDir;
// the runner's result is returned unchanged.
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
	dir := baseDir
	if len(strings.TrimSpace(dir)) == 0 {
		dir = DefaultSATBaseDir
	}
	return a.sat.RunAMDMemBandwidthPack(ctx, dir, logFunc)
}
// RunMemoryStressPack is the context-free form of RunMemoryStressPackCtx: it
// passes its arguments through unchanged and supplies context.Background().
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
}
@@ -611,6 +628,13 @@ func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platfor
return a.sat.RunFanStressTest(ctx, baseDir, opts)
}
// RunPlatformStress forwards to the SAT runner's platform thermal-cycling
// test. An empty or whitespace-only baseDir is replaced by DefaultSATBaseDir;
// the runner's result is returned unchanged.
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
	dir := baseDir
	if len(strings.TrimSpace(dir)) == 0 {
		dir = DefaultSATBaseDir
	}
	return a.sat.RunPlatformStress(ctx, dir, opts, logFunc)
}
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
path, err := a.sat.RunNCCLTests(ctx, DefaultSATBaseDir, nil)
body := "Results: " + path

View File

@@ -181,6 +181,14 @@ func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(
return "", nil
}
// RunAMDMemIntegrityPack is a test stub: it ignores every argument and
// returns an empty string with a nil error.
func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
return "", nil
}
// RunAMDMemBandwidthPack is a test stub: it ignores every argument and
// returns an empty string with a nil error.
func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
return "", nil
}
// RunAMDStressPack is a test stub: it ignores every argument and returns an
// empty string with a nil error.
func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
return "", nil
}
@@ -195,6 +203,10 @@ func (f fakeSAT) RunFanStressTest(_ context.Context, _ string, _ platform.FanStr
return "", nil
}
// RunPlatformStress is a test stub: it ignores every argument and returns an
// empty string with a nil error.
func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.PlatformStressOptions, _ func(string)) (string, error) {
return "", nil
}
// RunNCCLTests is a test stub: it ignores every argument and returns an empty
// string with a nil error.
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
return "", nil
}

View File

@@ -0,0 +1,476 @@
package platform
import (
"archive/tar"
"bytes"
"compress/gzip"
"context"
"encoding/csv"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
)
// PlatformStressCycle defines one load+idle cycle of the thermal cycling
// test. Both fields are durations in whole seconds; RunPlatformStress turns
// each into a context timeout for the corresponding phase.
type PlatformStressCycle struct {
LoadSec int // seconds of simultaneous CPU+GPU stress
IdleSec int // seconds of idle monitoring after load cut
}
// PlatformStressOptions controls the thermal cycling test.
// Cycles are executed in slice order; RunPlatformStress rejects an empty list.
type PlatformStressOptions struct {
Cycles []PlatformStressCycle
}
// platformStressRow is one telemetry sample (collectPhase takes one per
// second) flattened for logging and CSV export.
type platformStressRow struct {
// ElapsedSec is the time since the start of the whole test, not of the phase.
ElapsedSec float64
// Cycle is the 1-based cycle number the sample belongs to.
Cycle int
Phase string // "load" | "idle"
CPULoadPct float64
// MaxCPUTempC / MaxGPUTempC hold the highest reading among the sample's
// CPU / GPU temperature sources; they stay 0 when no source reported.
MaxCPUTempC float64
MaxGPUTempC float64
SysPowerW float64
// FanMinRPM / FanMaxRPM are the slowest and fastest fan in the sample;
// both stay 0 when the sample carries no fan data.
FanMinRPM float64
FanMaxRPM float64
// NOTE(review): no assignment to GPUThrottled is visible in this file, so
// throttle reporting may never trigger — confirm where it is meant to be set.
GPUThrottled bool
}
// RunPlatformStress runs repeated load+idle thermal cycling.
//
// For each entry in opts.Cycles it starts CPU stress (stressapptest) and a
// vendor-specific GPU stress tool at the same time, samples telemetry for
// LoadSec seconds, cuts the load by cancelling the load context, and then
// keeps sampling for IdleSec seconds. Sampled rows are passed to logFunc after
// each phase ends, followed by one analysis line per cycle; a nil logFunc is
// allowed.
//
// Results are written to metrics.csv and summary.txt inside a timestamped run
// directory under baseDir. That directory is packed into a .tar.gz next to it
// and then removed. The returned string is the archive path.
//
// When ctx is cancelled no further cycle is started; whatever was collected so
// far is still summarised and archived, and the returned error is nil.
//
// An error is returned when no cycles are defined, when a directory cannot be
// created, when either result file cannot be written, or when packing fails.
func (s *System) RunPlatformStress(
	ctx context.Context,
	baseDir string,
	opts PlatformStressOptions,
	logFunc func(string),
) (string, error) {
	if logFunc == nil {
		logFunc = func(string) {}
	}
	if len(opts.Cycles) == 0 {
		return "", fmt.Errorf("no cycles defined")
	}
	if err := os.MkdirAll(baseDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir %s: %w", baseDir, err)
	}

	stamp := time.Now().UTC().Format("20060102-150405")
	runDir := filepath.Join(baseDir, "platform-stress-"+stamp)
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", fmt.Errorf("mkdir run dir: %w", err)
	}

	vendor := s.DetectGPUVendor()
	logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))

	var rows []platformStressRow
	var analyses []cycleAnalysis
	start := time.Now()

	for i, cycle := range opts.Cycles {
		if ctx.Err() != nil {
			break
		}
		cycleNum := i + 1
		logFunc(fmt.Sprintf("--- Cycle %d/%d: load=%ds, idle=%ds ---", cycleNum, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec))

		// ── LOAD PHASE ───────────────────────────────────────────────────────
		loadCtx, loadCancel := context.WithTimeout(ctx, time.Duration(cycle.LoadSec)*time.Second)
		var wg sync.WaitGroup

		// CPU stress
		wg.Add(1)
		go func() {
			defer wg.Done()
			cpuCmd, err := buildCPUStressCmd(loadCtx)
			if err != nil {
				logFunc("CPU stress: " + err.Error())
				return
			}
			_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
		}()

		// GPU stress
		wg.Add(1)
		go func() {
			defer wg.Done()
			gpuCmd := buildGPUStressCmd(loadCtx, vendor)
			if gpuCmd == nil {
				return
			}
			_ = gpuCmd.Wait() // the kill at load cut surfaces as an error here; expected
		}()

		// Sample on this goroutine until the load context expires.
		loadRows := collectPhase(loadCtx, cycleNum, "load", start)
		for _, r := range loadRows {
			logFunc(formatPlatformRow(r))
		}
		rows = append(rows, loadRows...)
		loadCancel()
		wg.Wait() // both stress processes must be gone before idle sampling starts
		if len(loadRows) > 0 {
			logFunc(fmt.Sprintf("Cycle %d load ended (%.0fs)", cycleNum, loadRows[len(loadRows)-1].ElapsedSec))
		}

		// ── IDLE PHASE ───────────────────────────────────────────────────────
		idleCtx, idleCancel := context.WithTimeout(ctx, time.Duration(cycle.IdleSec)*time.Second)
		idleRows := collectPhase(idleCtx, cycleNum, "idle", start)
		for _, r := range idleRows {
			logFunc(formatPlatformRow(r))
		}
		rows = append(rows, idleRows...)
		idleCancel()

		// Per-cycle analysis
		an := analyzePlatformCycle(loadRows, idleRows)
		analyses = append(analyses, an)
		logFunc(fmt.Sprintf("Cycle %d: maxCPU=%.1f°C maxGPU=%.1f°C power=%.0fW throttled=%v fanDrop=%.0f%%",
			cycleNum, an.maxCPUTemp, an.maxGPUTemp, an.maxPower, an.throttled, an.fanDropPct))
	}

	// Log the summary first so it reaches the caller even if persisting fails.
	summary := writePlatformSummary(opts, analyses)
	logFunc("--- Summary ---")
	for _, line := range strings.Split(summary, "\n") {
		if line != "" {
			logFunc(line)
		}
	}

	// Persist results. A failed write would otherwise produce an archive that
	// silently lacks data, so it is reported instead of ignored.
	csvPath := filepath.Join(runDir, "metrics.csv")
	if err := os.WriteFile(csvPath, writePlatformCSV(rows), 0644); err != nil {
		return "", fmt.Errorf("write %s: %w", csvPath, err)
	}
	summaryPath := filepath.Join(runDir, "summary.txt")
	if err := os.WriteFile(summaryPath, []byte(summary), 0644); err != nil {
		return "", fmt.Errorf("write %s: %w", summaryPath, err)
	}

	// Pack tar.gz
	archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
	if err := packPlatformDir(runDir, archivePath); err != nil {
		return "", fmt.Errorf("pack archive: %w", err)
	}
	_ = os.RemoveAll(runDir) // best-effort cleanup; the archive is the deliverable
	return archivePath, nil
}
// collectPhase records one telemetry row per tick of a one-second ticker and
// returns the accumulated rows once ctx is done. The first row is taken about
// one second after the call, so a context that is already done yields no rows.
// cycle and phase are stamped onto every row; testStart is the reference for
// each row's elapsed time.
func collectPhase(ctx context.Context, cycle int, phase string, testStart time.Time) []platformStressRow {
	tick := time.NewTicker(time.Second)
	defer tick.Stop()

	var collected []platformStressRow
	done := ctx.Done()
	for {
		select {
		case <-tick.C:
			collected = append(collected, sampleToPlatformRow(SampleLiveMetrics(), cycle, phase, testStart))
		case <-done:
			return collected
		}
	}
}
// sampleToPlatformRow flattens one live metric sample into a telemetry row.
//
// ElapsedSec is measured from testStart at the moment of conversion. CPU and
// GPU temperatures are the maxima over the sample's temperature entries in
// the "cpu" and "gpu" groups; per-GPU temperatures can raise the GPU maximum
// further. Fan min/max span all fans in the sample and stay zero when there
// are none.
//
// NOTE(review): GPUThrottled is left at its zero value here — confirm whether
// the sample carries throttle state that should be copied over.
func sampleToPlatformRow(s LiveMetricSample, cycle int, phase string, testStart time.Time) platformStressRow {
	var row platformStressRow
	row.ElapsedSec = time.Since(testStart).Seconds()
	row.Cycle = cycle
	row.Phase = phase
	row.CPULoadPct = s.CPULoadPct
	row.SysPowerW = s.PowerW

	for _, temp := range s.Temps {
		if temp.Group == "cpu" {
			if temp.Celsius > row.MaxCPUTempC {
				row.MaxCPUTempC = temp.Celsius
			}
		} else if temp.Group == "gpu" {
			if temp.Celsius > row.MaxGPUTempC {
				row.MaxGPUTempC = temp.Celsius
			}
		}
	}

	for _, gpu := range s.GPUs {
		if gpu.TempC > row.MaxGPUTempC {
			row.MaxGPUTempC = gpu.TempC
		}
	}

	// The first fan seeds both bounds; later fans only widen them.
	for i, fan := range s.Fans {
		if i == 0 || fan.RPM < row.FanMinRPM {
			row.FanMinRPM = fan.RPM
		}
		if i == 0 || fan.RPM > row.FanMaxRPM {
			row.FanMaxRPM = fan.RPM
		}
	}
	return row
}
// formatPlatformRow renders a telemetry row as a single log line. The fan
// range is appended only when FanMinRPM is positive, and " THROTTLE" only when
// the row is flagged as throttled.
func formatPlatformRow(r platformStressRow) string {
	var fanPart, throttlePart string
	if r.FanMinRPM > 0 {
		fanPart = fmt.Sprintf(" fans=%.0f-%.0fRPM", r.FanMinRPM, r.FanMaxRPM)
	}
	if r.GPUThrottled {
		throttlePart = " THROTTLE"
	}
	return fmt.Sprintf(
		"[%5.0fs] cycle=%d phase=%-4s cpu=%.0f%% cpuT=%.1f°C gpuT=%.1f°C pwr=%.0fW%s%s",
		r.ElapsedSec, r.Cycle, r.Phase, r.CPULoadPct, r.MaxCPUTempC, r.MaxGPUTempC, r.SysPowerW,
		fanPart, throttlePart,
	)
}
// analyzePlatformCycle condenses one cycle's rows into a cycleAnalysis.
//
// From the load rows it takes the peak CPU temperature, GPU temperature and
// system power, and whether any row was flagged as throttled. The fan speed
// "at load cut" is the mean of (min+max)/2 over the last five load rows that
// report a positive FanMinRPM. The idle rows whose elapsed time lies within
// 15 seconds of the last load row are then scanned for the lowest positive
// (min+max)/2 value, and the drop relative to the at-cut speed is expressed
// as a percentage. Without fan data at load cut, the fan fields stay zero.
func analyzePlatformCycle(loadRows, idleRows []platformStressRow) cycleAnalysis {
	result := cycleAnalysis{}

	// Peaks over the load phase.
	for i := range loadRows {
		row := &loadRows[i]
		if row.MaxCPUTempC > result.maxCPUTemp {
			result.maxCPUTemp = row.MaxCPUTempC
		}
		if row.MaxGPUTempC > result.maxGPUTemp {
			result.maxGPUTemp = row.MaxGPUTempC
		}
		if row.SysPowerW > result.maxPower {
			result.maxPower = row.SysPowerW
		}
		if row.GPUThrottled {
			result.throttled = true
		}
	}

	// Fan speed at load cut: mean over the trailing (up to five) load rows.
	tail := loadRows
	if len(tail) > 5 {
		tail = tail[len(tail)-5:]
	}
	fanSum, fanSamples := 0.0, 0
	for i := range tail {
		if !(tail[i].FanMinRPM > 0) {
			continue
		}
		fanSum += (tail[i].FanMinRPM + tail[i].FanMaxRPM) / 2
		fanSamples++
	}
	if fanSamples > 0 {
		result.fanAtCutAvg = fanSum / float64(fanSamples)
	}

	// Lowest fan speed inside the first 15s after the last load row.
	result.fanMin15s = result.fanAtCutAvg
	var cutElapsed float64
	if n := len(loadRows); n > 0 {
		cutElapsed = loadRows[n-1].ElapsedSec
	}
	for i := range idleRows {
		if idleRows[i].ElapsedSec > cutElapsed+15 {
			break
		}
		mid := (idleRows[i].FanMinRPM + idleRows[i].FanMaxRPM) / 2
		switch {
		case !(mid > 0):
			// no usable fan reading in this row
		case result.fanMin15s == 0, mid < result.fanMin15s:
			result.fanMin15s = mid
		}
	}

	if result.fanAtCutAvg > 0 {
		result.fanDropPct = (result.fanAtCutAvg - result.fanMin15s) / result.fanAtCutAvg * 100
	}
	return result
}
// cycleAnalysis is the per-cycle result produced by analyzePlatformCycle and
// consumed by writePlatformSummary.
type cycleAnalysis struct {
maxCPUTemp float64 // peak CPU temperature over the load rows
maxGPUTemp float64 // peak GPU temperature over the load rows
maxPower float64 // peak system power over the load rows
throttled bool // true if any load row was flagged as throttled
fanAtCutAvg float64 // mean fan speed over the last load rows; 0 when no fan data
fanMin15s float64 // lowest fan speed seen in the first 15s of idle
fanDropPct float64 // drop from fanAtCutAvg to fanMin15s, in percent
}
// writePlatformSummary renders the human-readable summary.txt content.
//
// One section is written per entry in analyses, labelled with the matching
// cycle definition from opts.Cycles (a zero-value cycle is used if analyses is
// longer than opts.Cycles, so the function never indexes out of range).
//
// The final "Overall" verdict is, in order of precedence:
//   - FAIL when any cycle was throttled;
//   - INCOMPLETE when fewer cycles were analysed than requested (for example
//     after a cancelled run), so a partial or empty run is not reported as PASS;
//   - WARN when any cycle showed a fan drop above 20% in the first 15s of idle;
//   - PASS otherwise.
func writePlatformSummary(opts PlatformStressOptions, analyses []cycleAnalysis) string {
	var b strings.Builder
	fmt.Fprintf(&b, "Platform Thermal Cycling — %d cycle(s)\n", len(opts.Cycles))
	fmt.Fprintf(&b, "%s\n\n", strings.Repeat("=", 48))

	totalThrottle := 0
	totalFanWarn := 0
	for i, an := range analyses {
		var cycle PlatformStressCycle
		if i < len(opts.Cycles) {
			cycle = opts.Cycles[i]
		}
		fmt.Fprintf(&b, "Cycle %d/%d (load=%ds, idle=%ds)\n", i+1, len(opts.Cycles), cycle.LoadSec, cycle.IdleSec)
		fmt.Fprintf(&b, " Max CPU temp: %.1f°C\n", an.maxCPUTemp)
		fmt.Fprintf(&b, " Max GPU temp: %.1f°C\n", an.maxGPUTemp)
		fmt.Fprintf(&b, " Max sys power: %.0f W\n", an.maxPower)
		if an.throttled {
			fmt.Fprintf(&b, " Throttle: DETECTED\n")
			totalThrottle++
		} else {
			fmt.Fprintf(&b, " Throttle: none\n")
		}
		// Fan lines are only meaningful when fan data existed at load cut.
		if an.fanAtCutAvg > 0 {
			fmt.Fprintf(&b, " Fan at load cut: %.0f RPM avg\n", an.fanAtCutAvg)
			fmt.Fprintf(&b, " Fan min (first 15s idle): %.0f RPM (drop %.0f%%)\n", an.fanMin15s, an.fanDropPct)
			if an.fanDropPct > 20 {
				fmt.Fprintf(&b, " Fan response: WARN — fast spindown (>20%% drop in 15s)\n")
				totalFanWarn++
			} else {
				fmt.Fprintf(&b, " Fan response: OK\n")
			}
		}
		b.WriteString("\n")
	}

	fmt.Fprintf(&b, "%s\n", strings.Repeat("=", 48))
	switch {
	case totalThrottle > 0:
		fmt.Fprintf(&b, "Overall: FAIL — throttle detected in %d/%d cycles\n", totalThrottle, len(analyses))
	case len(analyses) < len(opts.Cycles):
		fmt.Fprintf(&b, "Overall: INCOMPLETE — %d/%d cycles analysed\n", len(analyses), len(opts.Cycles))
	case totalFanWarn > 0:
		fmt.Fprintf(&b, "Overall: WARN — fast fan spindown in %d/%d cycles (cooling recovery risk)\n", totalFanWarn, len(analyses))
	default:
		fmt.Fprintf(&b, "Overall: PASS\n")
	}
	return b.String()
}
// writePlatformCSV encodes the telemetry rows as CSV: a header line followed
// by one record per row. Floats use fixed notation with one decimal, except
// the fan speeds which use none; the throttle flag is written as "0" or "1".
// Write errors are ignored because the destination is an in-memory buffer.
func writePlatformCSV(rows []platformStressRow) []byte {
	num := func(v float64, decimals int) string {
		return strconv.FormatFloat(v, 'f', decimals, 64)
	}
	header := []string{
		"elapsed_sec", "cycle", "phase",
		"cpu_load_pct", "max_cpu_temp_c", "max_gpu_temp_c",
		"sys_power_w", "fan_min_rpm", "fan_max_rpm", "gpu_throttled",
	}

	out := new(bytes.Buffer)
	cw := csv.NewWriter(out)
	_ = cw.Write(header)
	for i := range rows {
		row := &rows[i]
		flag := "0"
		if row.GPUThrottled {
			flag = "1"
		}
		record := []string{
			num(row.ElapsedSec, 1),
			strconv.Itoa(row.Cycle),
			row.Phase,
			num(row.CPULoadPct, 1),
			num(row.MaxCPUTempC, 1),
			num(row.MaxGPUTempC, 1),
			num(row.SysPowerW, 1),
			num(row.FanMinRPM, 0),
			num(row.FanMaxRPM, 0),
			flag,
		}
		_ = cw.Write(record)
	}
	cw.Flush()
	return out.Bytes()
}
// buildCPUStressCmd locates stressapptest, starts it, and returns the running
// command. The requested run time (-s 86400) is deliberately far longer than
// any load phase: the process is expected to be killed when ctx is done, and
// the caller is expected to Wait on the returned command.
// An error is returned if the binary cannot be found or fails to start.
func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
	bin, lookErr := satLookPath("stressapptest")
	if lookErr != nil {
		return nil, fmt.Errorf("stressapptest not found: %w", lookErr)
	}

	args := []string{"-s", "86400", "-W", "--cc_test"}
	cmd := exec.CommandContext(ctx, bin, args...)
	// Output is not captured.
	cmd.Stdout, cmd.Stderr = nil, nil

	if startErr := cmd.Start(); startErr != nil {
		return nil, fmt.Errorf("stressapptest start: %w", startErr)
	}
	return cmd, nil
}
// buildGPUStressCmd dispatches to the GPU stress launcher for the detected
// vendor; the match on vendor is case-insensitive ("amd" or "nvidia").
// It returns nil for any other vendor, or when the vendor-specific launcher
// has nothing to run — CPU-only cycling is still useful in that case.
func buildGPUStressCmd(ctx context.Context, vendor string) *exec.Cmd {
	v := strings.ToLower(vendor)
	if v == "amd" {
		return buildAMDGPUStressCmd(ctx)
	}
	if v == "nvidia" {
		return buildNvidiaGPUStressCmd(ctx)
	}
	return nil
}
// buildAMDGPUStressCmd writes an RVS GST configuration to a fixed path under
// /tmp and starts rvs against it. The configured duration is far longer than
// any load phase; the process is expected to be killed when ctx is done.
//
// It returns the running command, or nil when GPU stress cannot be started:
// the rvs command cannot be resolved, the config file cannot be written, or
// the process fails to start. A non-nil result is therefore always a started
// process that the caller must Wait on.
//
// NOTE(review): only the first element of the resolved rvs command is used;
// confirm resolveRVSCommand never returns extra arguments that must be kept.
func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
	rvsArgs, err := resolveRVSCommand()
	if err != nil || len(rvsArgs) == 0 {
		return nil
	}
	rvsPath := rvsArgs[0]

	// The action's fields are indented under the list item so that YAML parses
	// them as part of the action rather than as separate top-level keys.
	const cfg = `actions:
- name: gst_platform
  device: all
  module: gst
  parallel: true
  duration: 86400000
  copy_matrix: false
  target_stress: 90
  matrix_size_a: 8640
  matrix_size_b: 8640
  matrix_size_c: 8640
`
	const cfgFile = "/tmp/bee-platform-gst.conf"
	if err := os.WriteFile(cfgFile, []byte(cfg), 0644); err != nil {
		// Without a fresh config rvs would run against a stale or missing file.
		return nil
	}

	cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
	cmd.Stdout = nil
	cmd.Stderr = nil
	if err := cmd.Start(); err != nil {
		return nil
	}
	return cmd
}
// buildNvidiaGPUStressCmd starts bee-gpu-stress with a run time far longer
// than any load phase; the process is expected to be killed when ctx is done.
//
// It returns the running command, or nil when the tool cannot be found or
// fails to start. A non-nil result is therefore always a started process that
// the caller must Wait on.
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
	path, err := satLookPath("bee-gpu-stress")
	if err != nil {
		return nil
	}
	cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
	cmd.Stdout = nil
	cmd.Stderr = nil
	if err := cmd.Start(); err != nil {
		return nil
	}
	return cmd
}
// packPlatformDir writes the regular files directly inside dir into a gzip
// compressed tar archive at dest. Entries are stored as "<base of dir>/<file>"
// with forward slashes; subdirectories are skipped, and a file that cannot be
// read is skipped rather than aborting the archive.
//
// The tar, gzip and file writers are closed innermost-first and their Close
// errors are returned, because the final archive bytes are only flushed on
// Close. On any failure the partial archive at dest is removed. dir is read
// before dest is created, so an unreadable dir leaves no file behind.
func packPlatformDir(dir, dest string) (err error) {
	entries, err := os.ReadDir(dir)
	if err != nil {
		return err
	}

	f, err := os.Create(dest)
	if err != nil {
		return err
	}
	gz := gzip.NewWriter(f)
	tw := tar.NewWriter(gz)
	defer func() {
		if cerr := tw.Close(); err == nil {
			err = cerr
		}
		if cerr := gz.Close(); err == nil {
			err = cerr
		}
		if cerr := f.Close(); err == nil {
			err = cerr
		}
		if err != nil {
			_ = os.Remove(dest) // best-effort: do not leave a truncated archive
		}
	}()

	base := filepath.Base(dir)
	for _, e := range entries {
		if e.IsDir() {
			continue
		}
		data, readErr := os.ReadFile(filepath.Join(dir, e.Name()))
		if readErr != nil {
			continue // best-effort: skip unreadable files
		}
		hdr := &tar.Header{
			// Tar entry names use forward slashes regardless of the host OS.
			Name:    filepath.ToSlash(filepath.Join(base, e.Name())),
			Size:    int64(len(data)),
			Mode:    0644,
			ModTime: time.Now(),
		}
		if err = tw.WriteHeader(hdr); err != nil {
			return err
		}
		if _, err = tw.Write(data); err != nil {
			return err
		}
	}
	return nil
}

View File

@@ -136,6 +136,54 @@ func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFu
}, logFunc)
}
// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style
// memory integrity test.
//
// After checking that the AMD runtime is ready it writes the RVS config to a
// fixed path under /tmp and runs three jobs through runAcceptancePackCtx:
// rocm-smi, rvs with that config, and a closing rocm-smi snapshot of
// temperature, power and memory use. The result of runAcceptancePackCtx is
// returned unchanged.
//
// An error is returned when the runtime is not ready or when the config file
// cannot be written; in the latter case no job is started, because rvs would
// otherwise run against a stale or missing config.
//
// NOTE(review): target_stress, matrix_size and copy_matrix are keys this
// package also uses for the gst module; confirm the mem module honours them.
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
	if err := ensureAMDRuntimeReady(); err != nil {
		return "", err
	}

	const cfgFile = "/tmp/bee-amd-mem.conf"
	// The action's fields are indented under the list item so that YAML parses
	// them as part of the action rather than as separate top-level keys.
	const cfg = `actions:
- name: mem_integrity
  device: all
  module: mem
  parallel: true
  duration: 60000
  copy_matrix: false
  target_stress: 90
  matrix_size: 8640
`
	if err := os.WriteFile(cfgFile, []byte(cfg), 0644); err != nil {
		return "", fmt.Errorf("write %s: %w", cfgFile, err)
	}

	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
		{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
		{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
	}, logFunc)
}
// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented
// tools.
//
// After checking that the AMD runtime is ready it writes an RVS BABEL config
// to a fixed path under /tmp and runs four jobs through runAcceptancePackCtx:
// rocm-smi, rocm-bandwidth-test, rvs with that config, and a closing rocm-smi
// snapshot of temperature, power and memory use. The result of
// runAcceptancePackCtx is returned unchanged.
//
// An error is returned when the runtime is not ready or when the config file
// cannot be written; in the latter case no job is started, because rvs would
// otherwise run against a stale or missing config.
//
// NOTE(review): target_stress, matrix_size and copy_matrix are keys this
// package also uses for the gst module; confirm the babel module honours them.
func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
	if err := ensureAMDRuntimeReady(); err != nil {
		return "", err
	}

	const cfgFile = "/tmp/bee-amd-babel.conf"
	// The action's fields are indented under the list item so that YAML parses
	// them as part of the action rather than as separate top-level keys.
	const cfg = `actions:
- name: babel_mem_bw
  device: all
  module: babel
  parallel: true
  copy_matrix: true
  target_stress: 90
  matrix_size: 134217728
`
	if err := os.WriteFile(cfgFile, []byte(cfg), 0644); err != nil {
		return "", fmt.Errorf("write %s: %w", cfgFile, err)
	}

	return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
		{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
		{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
		{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
		{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
	}, logFunc)
}
// RunAMDStressPack runs an AMD GPU burn-in pack.
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
@@ -146,8 +194,16 @@ func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationS
if err := ensureAMDRuntimeReady(); err != nil {
return "", err
}
// Write RVS GST config to a temp file
rvsCfg := fmt.Sprintf(`actions:
// Enable copy_matrix so the same GST run drives VRAM traffic in addition to compute.
rvsCfg := amdStressRVSConfig(seconds)
cfgFile := "/tmp/bee-amd-gst.conf"
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", amdStressJobs(seconds, cfgFile), logFunc)
}
func amdStressRVSConfig(seconds int) string {
return fmt.Sprintf(`actions:
- name: gst_stress
device: all
module: gst
@@ -159,15 +215,15 @@ func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationS
matrix_size_b: 8640
matrix_size_c: 8640
`, seconds*1000)
cfgFile := "/tmp/bee-amd-gst.conf"
_ = os.WriteFile(cfgFile, []byte(rvsCfg), 0644)
}
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-stress", []satJob{
func amdStressJobs(seconds int, cfgFile string) []satJob {
return []satJob{
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
{name: fmt.Sprintf("03-rvs-gst-%ds.log", seconds), cmd: []string{"rvs", "-c", cfgFile}},
{name: fmt.Sprintf("04-rocm-smi-after.log"), cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--csv"}},
}, logFunc)
}
}
// ListNvidiaGPUs returns GPUs visible to nvidia-smi.

View File

@@ -5,6 +5,7 @@ import (
"os"
"os/exec"
"path/filepath"
"strings"
"testing"
)
@@ -38,6 +39,47 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
}
}
// TestAMDStressConfigUsesSingleGSTAction checks the generated RVS stress
// config: it must use the gst module and not the mem module, set
// copy_matrix to false, apply the duration exactly once (in milliseconds),
// and carry all three matrix sizes.
//
// NOTE(review): a comment in RunAMDStressPack says copy_matrix is enabled to
// drive VRAM traffic, while this test requires "copy_matrix: false" — confirm
// which of the two is intended.
func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
	t.Parallel()

	cfg := amdStressRVSConfig(123)

	// Fatalf stops at the first failing check, so a first-match switch keeps
	// the same order and outcome as sequential checks.
	switch {
	case !strings.Contains(cfg, "module: gst"):
		t.Fatalf("config missing gst module:\n%s", cfg)
	case strings.Contains(cfg, "module: mem"):
		t.Fatalf("config should not include mem module:\n%s", cfg)
	case !strings.Contains(cfg, "copy_matrix: false"):
		t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
	case strings.Count(cfg, "duration: 123000") != 1:
		t.Fatalf("config should apply duration once:\n%s", cfg)
	}

	required := [...]string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"}
	for i := range required {
		if want := required[i]; !strings.Contains(cfg, want) {
			t.Fatalf("config missing %s:\n%s", want, cfg)
		}
	}
}
// TestAMDStressJobsIncludeBandwidthAndGST checks the AMD stress job list: four
// jobs in total, rocm-bandwidth-test as the second, and rvs as the third with
// the supplied config path as its third argument.
func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
	t.Parallel()

	const cfgPath = "/tmp/test-amd-gst.conf"
	jobs := amdStressJobs(300, cfgPath)
	if n := len(jobs); n != 4 {
		t.Fatalf("jobs=%d want 4", n)
	}

	bandwidth, gst := jobs[1], jobs[2]
	if tool := bandwidth.cmd[0]; tool != "rocm-bandwidth-test" {
		t.Fatalf("jobs[1]=%q want rocm-bandwidth-test", tool)
	}
	if tool := gst.cmd[0]; tool != "rvs" {
		t.Fatalf("jobs[2]=%q want rvs", tool)
	}
	if arg := gst.cmd[2]; arg != cfgPath {
		t.Fatalf("jobs[2] cfg=%q want /tmp/test-amd-gst.conf", arg)
	}
}
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")

View File

@@ -599,10 +599,9 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
case <-r.Context().Done():
return
case <-ticker.C:
sample := platform.SampleLiveMetrics()
h.feedRings(sample)
if h.metricsDB != nil {
_ = h.metricsDB.Write(sample)
sample, ok := h.latestMetric()
if !ok {
continue
}
b, err := json.Marshal(sample)
if err != nil {

View File

@@ -3,7 +3,6 @@ package webui
import (
"database/sql"
"encoding/csv"
"fmt"
"io"
"strconv"
"time"
@@ -13,7 +12,6 @@ import (
)
const metricsDBPath = "/appdata/bee/metrics.db"
const metricsKeepDuration = 24 * time.Hour
// MetricsDB persists live metric samples to SQLite.
type MetricsDB struct {
@@ -116,11 +114,18 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
}
// LoadRecent returns up to n samples in chronological order (oldest first).
// It reconstructs LiveMetricSample from the normalized tables.
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
rows, err := m.db.Query(
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n,
)
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
}
// LoadAll returns all persisted samples, queried in ascending timestamp order.
//
// The query has no placeholders, so no bind arguments are passed. Passing a
// literal nil to the variadic parameter would not mean "no arguments": it
// would be sent as a single NULL bind value for a statement that expects none.
//
// NOTE(review): LoadRecent feeds a DESC-ordered query through the same helper;
// confirm loadSamples does not reorder rows in a way that breaks ASC input.
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
	return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`)
}
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
rows, err := m.db.Query(query, args...)
if err != nil {
return nil, err
}
@@ -257,14 +262,6 @@ func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
return samples, nil
}
// Prune removes rows whose ts is older than now minus keepDuration from each
// of the four metric tables. Deletion is best-effort: errors from individual
// DELETE statements are ignored.
func (m *MetricsDB) Prune(keepDuration time.Duration) {
	tables := [...]string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"}
	oldest := time.Now().Add(-keepDuration).Unix()
	for i := range tables {
		stmt := fmt.Sprintf("DELETE FROM %s WHERE ts < ?", tables[i])
		_, _ = m.db.Exec(stmt, oldest)
	}
}
// ExportCSV writes all sys+gpu data as CSV to w.
func (m *MetricsDB) ExportCSV(w io.Writer) error {
rows, err := m.db.Query(`

View File

@@ -494,7 +494,11 @@ func renderValidate() string {
renderSATCard("memory", "Memory", "") +
renderSATCard("storage", "Storage", "") +
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
renderSATCard("amd", "AMD GPU", "") +
renderSATCard("amd", "AMD GPU", `<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
<button id="sat-btn-amd-mem" class="btn" type="button" onclick="runSAT('amd-mem')">MEM Integrity</button>
<button id="sat-btn-amd-bandwidth" class="btn" type="button" onclick="runSAT('amd-bandwidth')">MEM Bandwidth</button>
</div>
<p style="color:var(--muted);font-size:12px;margin:0">Additional AMD memory diagnostics: RVS MEM for integrity and BABEL + rocm-bandwidth-test for memory/interconnect bandwidth.</p>`) +
`</div>
<div id="sat-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Test Output <span id="sat-title"></span></div>
@@ -505,7 +509,7 @@ let satES = null;
function runSAT(target) {
if (satES) { satES.close(); satES = null; }
const body = {};
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
body.display_name = labels[target] || ('Validate ' + target);
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
@@ -524,7 +528,7 @@ function runSAT(target) {
}
function runAllSAT() {
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
const targets = ['nvidia','memory','storage','cpu','amd'];
const targets = ['nvidia','memory','storage','cpu','amd','amd-mem','amd-bandwidth'];
const total = targets.length * cycles;
let enqueued = 0;
const status = document.getElementById('sat-all-status');
@@ -536,7 +540,7 @@ function runAllSAT() {
const btn = document.getElementById('sat-btn-' + target);
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
const body = {};
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
body.display_name = labels[target] || ('Validate ' + target);
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
@@ -554,6 +558,8 @@ function runAllSAT() {
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
if (!gp.amd) disableSATCard('amd-mem', 'No AMD GPU detected');
if (!gp.amd) disableSATCard('amd-bandwidth', 'No AMD GPU detected');
});
function disableSATCard(id, reason) {
const btn = document.getElementById('sat-btn-' + id);
@@ -598,7 +604,7 @@ func renderBurn() string {
<button class="btn btn-primary" onclick="runBurnIn('cpu')">&#9654; Start CPU Stress</button>
</div></div>
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Requires ROCm tools (rocm-bandwidth-test). Missing tools reported as UNSUPPORTED.</p>
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs ROCm compute stress together with VRAM copy/load activity via RVS GST and records a separate <code>rocm-bandwidth-test</code> snapshot. Missing tools reported as UNSUPPORTED.</p>
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">&#9654; Start AMD Stress</button>
</div></div>
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
@@ -609,6 +615,10 @@ func renderBurn() string {
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
<button class="btn btn-primary" onclick="runBurnIn('sat-stress')">&#9654; Start SAT Stress</button>
</div></div>
<div class="card"><div class="card-head">Platform Thermal Cycling</div><div class="card-body">
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs CPU + GPU stress simultaneously across multiple load/idle cycles with varying durations. Detects cooling systems that fail to recover under repeated load cycles. Smoke: 2 cycles ~5 min. Acceptance: 4 cycles ~25 min.</p>
<button class="btn btn-primary" onclick="runBurnIn('platform-stress')">&#9654; Start Thermal Cycling</button>
</div></div>
</div>
<div id="bi-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Output <span id="bi-title"></span></div>

View File

@@ -72,29 +72,36 @@ func (r *metricsRing) snapshot() ([]float64, []string) {
defer r.mu.Unlock()
v := make([]float64, len(r.vals))
copy(v, r.vals)
now := time.Now()
labels := make([]string, len(r.times))
if len(r.times) == 0 {
return v, labels
}
sameDay := timestampsSameLocalDay(r.times)
for i, t := range r.times {
labels[i] = relAgeLabel(now.Sub(t))
labels[i] = formatTimelineLabel(t.Local(), sameDay)
}
return v, labels
}
func relAgeLabel(age time.Duration) string {
if age <= 0 {
return "0"
func timestampsSameLocalDay(times []time.Time) bool {
if len(times) == 0 {
return true
}
if age < time.Hour {
m := int(age.Minutes())
if m == 0 {
return "-1m"
first := times[0].Local()
for _, t := range times[1:] {
local := t.Local()
if local.Year() != first.Year() || local.YearDay() != first.YearDay() {
return false
}
return fmt.Sprintf("-%dm", m)
}
if age < 24*time.Hour {
return fmt.Sprintf("-%dh", int(age.Hours()))
return true
}
func formatTimelineLabel(ts time.Time, sameDay bool) string {
if sameDay {
return ts.Format("15:04")
}
return fmt.Sprintf("-%dd", int(age.Hours()/24))
return ts.Format("01-02 15:04")
}
// gpuRings holds per-GPU ring buffers.
@@ -132,6 +139,8 @@ type handler struct {
// per-GPU rings (index = GPU index)
gpuRings []*gpuRings
ringsMu sync.Mutex
latestMu sync.RWMutex
latest *platform.LiveMetricSample
// metrics persistence (nil if DB unavailable)
metricsDB *MetricsDB
// install job (at most one at a time)
@@ -164,13 +173,16 @@ func NewHandler(opts HandlerOptions) http.Handler {
// Open metrics DB and pre-fill ring buffers from history.
if db, err := openMetricsDB(metricsDBPath); err == nil {
h.metricsDB = db
db.Prune(metricsKeepDuration)
if samples, err := db.LoadRecent(120); err == nil {
for _, s := range samples {
h.feedRings(s)
}
if len(samples) > 0 {
h.setLatestMetric(samples[len(samples)-1])
}
}
}
h.startMetricsCollector()
globalQueue.startWorker(&opts)
mux := http.NewServeMux()
@@ -198,9 +210,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
mux.HandleFunc("POST /api/sat/amd-bandwidth/run", h.handleAPISATRun("amd-bandwidth"))
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
mux.HandleFunc("POST /api/sat/platform-stress/run", h.handleAPISATRun("platform-stress"))
mux.HandleFunc("GET /api/sat/stream", h.handleAPISATStream)
mux.HandleFunc("POST /api/sat/abort", h.handleAPISATAbort)
@@ -260,6 +275,37 @@ func NewHandler(opts HandlerOptions) http.Handler {
return mux
}
// startMetricsCollector launches a background goroutine that, once per
// second, takes a live metrics sample, feeds it to the ring buffers, records
// it as the latest sample and, when a metrics DB is attached, writes it there.
//
// The goroutine has no stop signal: it loops on the ticker channel for the
// lifetime of the process.
func (h *handler) startMetricsCollector() {
	collect := func() {
		tick := time.NewTicker(time.Second)
		defer tick.Stop()

		for range tick.C {
			current := platform.SampleLiveMetrics()
			h.feedRings(current)
			h.setLatestMetric(current)
			if h.metricsDB == nil {
				continue
			}
			// Best effort: the result of the DB write is deliberately discarded.
			_ = h.metricsDB.Write(current)
		}
	}
	go collect()
}
// setLatestMetric stores a private copy of sample as the most recent metric
// reading. The write is performed under latestMu.
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
	stored := sample

	h.latestMu.Lock()
	h.latest = &stored
	h.latestMu.Unlock()
}
// latestMetric returns a copy of the most recently stored sample. The boolean
// is false (and the sample is the zero value) when nothing has been stored yet.
func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
	h.latestMu.RLock()
	defer h.latestMu.RUnlock()

	if current := h.latest; current != nil {
		return *current, true
	}
	return platform.LiveMetricSample{}, false
}
// ListenAndServe starts the HTTP server.
func ListenAndServe(addr string, opts HandlerOptions) error {
return http.ListenAndServe(addr, NewHandler(opts))
@@ -387,6 +433,20 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
path = strings.TrimSuffix(path, ".svg")
if h.metricsDB != nil {
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "image/svg+xml")
w.Header().Set("Cache-Control", "no-store")
_, _ = w.Write(buf)
return
}
}
var datasets [][]float64
var names []string
var labels []string
@@ -601,6 +661,259 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
_, _ = w.Write(buf)
}
// chartDataFromDB loads every sample from the metrics DB and converts it into
// chart inputs for the chart identified by path (see chartDataFromSamples for
// the meaning of the return values).
//
// The final result is false when loading fails or yields no samples. The
// method dereferences h.metricsDB without a nil check.
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
	history, err := h.metricsDB.LoadAll()
	if err != nil {
		return nil, nil, nil, "", nil, nil, false
	}
	if len(history) == 0 {
		return nil, nil, nil, "", nil, nil, false
	}
	return chartDataFromSamples(path, history)
}
// chartDataFromSamples builds the inputs for one chart from a slice of samples.
//
// path selects the chart ("server-load", "server-temp", "gpu-all-power",
// "gpu/<index>-<kind>", ...). The return values are, in order: the data series,
// the series names, the x-axis labels (one per sample), the chart title, the
// optional lower and upper Y bounds, and an ok flag. ok is false for an
// unrecognised path, for a per-GPU chart whose GPU index never appears in the
// samples, and whenever no data series was produced.
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
	var datasets [][]float64
	var names []string
	var title string
	var yMin, yMax *float64
	labels := sampleTimeLabels(samples)

	switch {
	case path == "server-load":
		title = "CPU / Memory Load"
		cpu := make([]float64, len(samples))
		mem := make([]float64, len(samples))
		for i, s := range samples {
			cpu[i] = s.CPULoadPct
			mem[i] = s.MemLoadPct
		}
		datasets = [][]float64{cpu, mem}
		names = []string{"CPU Load %", "Mem Load %"}
		// Percentages: fixed 0..100 axis.
		yMin = floatPtr(0)
		yMax = floatPtr(100)

	case path == "server-temp", path == "server-temp-cpu":
		title = "CPU Temperature"
		datasets, names = namedTempDatasets(samples, "cpu")
		yMin = floatPtr(0)
		yMax = autoMax120(datasets...)

	case path == "server-temp-gpu":
		title = "GPU Temperature"
		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
		yMin = floatPtr(0)
		yMax = autoMax120(datasets...)

	case path == "server-temp-ambient":
		title = "Ambient / Other Sensors"
		datasets, names = namedTempDatasets(samples, "ambient")
		yMin = floatPtr(0)
		yMax = autoMax120(datasets...)

	case path == "server-power":
		title = "System Power"
		power := make([]float64, len(samples))
		for i, s := range samples {
			power[i] = s.PowerW
		}
		datasets = [][]float64{power}
		names = []string{"Power W"}
		yMin, yMax = autoBounds120(power)

	case path == "server-fans":
		title = "Fan RPM"
		datasets, names = namedFanDatasets(samples)
		yMin, yMax = autoBounds120(datasets...)

	case path == "gpu-all-load":
		title = "GPU Compute Load"
		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
		yMin = floatPtr(0)
		yMax = floatPtr(100)

	case path == "gpu-all-memload":
		title = "GPU Memory Load"
		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
		yMin = floatPtr(0)
		yMax = floatPtr(100)

	case path == "gpu-all-power":
		title = "GPU Power"
		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.PowerW })
		yMin, yMax = autoBounds120(datasets...)

	case path == "gpu-all-temp":
		title = "GPU Temperature"
		datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
		yMin = floatPtr(0)
		yMax = autoMax120(datasets...)

	case strings.HasPrefix(path, "gpu/"):
		// Per-GPU chart: the remainder is split at its last "-" into an index
		// part and a kind suffix. Without a "-" the suffix stays empty.
		rest := strings.TrimPrefix(path, "gpu/")
		sub := ""
		if i := strings.LastIndex(rest, "-"); i > 0 {
			sub = rest[i+1:]
			rest = rest[:i]
		}
		idx := 0
		// NOTE(review): the Sscanf result is ignored, so a non-numeric index
		// part leaves idx at 0 and the request is served as GPU 0 — confirm
		// this is intended.
		fmt.Sscanf(rest, "%d", &idx)
		switch sub {
		case "load":
			title = fmt.Sprintf("GPU %d Load", idx)
			util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
			mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
			if util == nil && mem == nil {
				return nil, nil, nil, "", nil, nil, false
			}
			// A missing series is replaced by zeros so both lines are drawn.
			datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
			names = []string{"Load %", "Mem %"}
			yMin = floatPtr(0)
			yMax = floatPtr(100)
		case "temp":
			title = fmt.Sprintf("GPU %d Temperature", idx)
			temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
			if temp == nil {
				return nil, nil, nil, "", nil, nil, false
			}
			datasets = [][]float64{temp}
			names = []string{"Temp °C"}
			yMin = floatPtr(0)
			yMax = autoMax120(temp)
		default:
			// Any other (or empty) suffix is treated as the power chart.
			title = fmt.Sprintf("GPU %d Power", idx)
			power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
			if power == nil {
				return nil, nil, nil, "", nil, nil, false
			}
			datasets = [][]float64{power}
			names = []string{"Power W"}
			yMin, yMax = autoBounds120(power)
		}

	default:
		// Unknown chart path.
		return nil, nil, nil, "", nil, nil, false
	}

	// A recognised chart with no series (e.g. no matching sensors) reports ok=false.
	return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
}
// sampleTimeLabels returns one x-axis label per sample, derived from the
// sample timestamps. timestampsSameLocalDay decides once for the whole series
// which label form formatTimelineLabel should produce. An empty input yields
// an empty, non-nil slice.
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
	count := len(samples)
	out := make([]string, count)
	if count == 0 {
		return out
	}

	stamps := make([]time.Time, 0, count)
	for i := range samples {
		stamps = append(stamps, samples[i].Timestamp)
	}

	oneDay := timestampsSameLocalDay(stamps)
	for i, ts := range stamps {
		out[i] = formatTimelineLabel(ts.Local(), oneDay)
	}
	return out
}
// namedTempDatasets builds one temperature series per sensor name found in
// the given sensor group. Names are returned in the order they first appear
// across the samples; series[k] belongs to names[k] and has one value per
// sample. A sample lacking a sensor contributes 0 for it; if a sample lists
// the same sensor more than once, the first entry wins.
func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]float64, []string) {
	// Pass 1: discover the sensor names of this group, first-seen order.
	var order []string
	known := map[string]bool{}
	for i := range samples {
		for _, sensor := range samples[i].Temps {
			if sensor.Group != group || known[sensor.Name] {
				continue
			}
			known[sensor.Name] = true
			order = append(order, sensor.Name)
		}
	}

	// Pass 2: fill one row per discovered name.
	series := make([][]float64, 0, len(order))
	for _, want := range order {
		row := make([]float64, len(samples))
		for i := range samples {
			for _, sensor := range samples[i].Temps {
				if sensor.Group != group || sensor.Name != want {
					continue
				}
				row[i] = sensor.Celsius
				break
			}
		}
		series = append(series, row)
	}
	return series, order
}
// namedFanDatasets builds one RPM series per fan name. Names are returned in
// the order they first appear across the samples; series[k] belongs to
// names[k] and has one value per sample. A sample lacking a fan contributes 0
// for it; if a sample lists the same fan more than once, the first entry wins.
func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []string) {
	// Pass 1: discover fan names in first-seen order.
	var order []string
	known := map[string]bool{}
	for i := range samples {
		for _, fan := range samples[i].Fans {
			if known[fan.Name] {
				continue
			}
			known[fan.Name] = true
			order = append(order, fan.Name)
		}
	}

	// Pass 2: fill one row per discovered name.
	series := make([][]float64, 0, len(order))
	for _, want := range order {
		row := make([]float64, len(samples))
		for i := range samples {
			for _, fan := range samples[i].Fans {
				if fan.Name != want {
					continue
				}
				row[i] = fan.RPM
				break
			}
		}
		series = append(series, row)
	}
	return series, order
}
// gpuDatasets builds one series per GPU index found in the samples, using
// pick to extract the charted value from each GPU row. GPUs are ordered by
// first appearance and named "GPU <index>". An index for which
// gpuDatasetByIndex returns nil is left out of both results.
func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetricRow) float64) ([][]float64, []string) {
	// Discover GPU indices in first-seen order.
	var order []int
	known := map[int]bool{}
	for i := range samples {
		for _, gpu := range samples[i].GPUs {
			if known[gpu.GPUIndex] {
				continue
			}
			known[gpu.GPUIndex] = true
			order = append(order, gpu.GPUIndex)
		}
	}

	series := make([][]float64, 0, len(order))
	labels := make([]string, 0, len(order))
	for _, gpuIdx := range order {
		if row := gpuDatasetByIndex(samples, gpuIdx, pick); row != nil {
			series = append(series, row)
			labels = append(labels, fmt.Sprintf("GPU %d", gpuIdx))
		}
	}
	return series, labels
}
// gpuDatasetByIndex returns one value per sample for the GPU with index idx,
// extracted with pick. Samples that do not contain that GPU contribute 0; if
// a sample lists the GPU more than once, the first row wins. The result is
// nil when the GPU does not appear in any sample.
func gpuDatasetByIndex(samples []platform.LiveMetricSample, idx int, pick func(platform.GPUMetricRow) float64) []float64 {
	row := make([]float64, len(samples))
	hits := 0
	for i := range samples {
		for _, gpu := range samples[i].GPUs {
			if gpu.GPUIndex != idx {
				continue
			}
			row[i] = pick(gpu)
			hits++
			break
		}
	}
	if hits == 0 {
		return nil
	}
	return row
}
// coalesceDataset returns ds unchanged when it is non-nil; a nil ds is
// replaced by a freshly allocated slice of n zeros.
func coalesceDataset(ds []float64, n int) []float64 {
	if ds == nil {
		return make([]float64, n)
	}
	return ds
}
// floatPtr returns the address of a newly allocated float64 holding v.
func floatPtr(v float64) *float64 {
	p := new(float64)
	*p = v
	return p
}
@@ -621,6 +934,47 @@ func autoMax120(datasets ...[]float64) *float64 {
return &v
}
// autoBounds120 derives Y-axis bounds from the values of all given series.
//
// It returns (nil, nil) when there are no values at all, and (0, nil) when
// the largest value is not positive. Otherwise the observed range is widened
// by 20% of its span on each side, with the lower bound clamped at 0. For a
// flat series the span is taken as 10% of the value (falling back to 1).
func autoBounds120(datasets ...[]float64) (*float64, *float64) {
	var lo, hi float64
	seen := false
	for _, series := range datasets {
		for _, v := range series {
			if !seen {
				lo, hi, seen = v, v, true
				continue
			}
			if v < lo {
				lo = v
			}
			if v > hi {
				hi = v
			}
		}
	}

	if !seen {
		return nil, nil
	}
	if hi <= 0 {
		return floatPtr(0), nil
	}

	spread := hi - lo
	if spread <= 0 {
		// Flat series: invent a span so the line is not drawn on the axis edge.
		spread = hi * 0.1
		if spread <= 0 {
			spread = 1
		}
	}

	margin := spread * 0.2
	lower := lo - margin
	if lower < 0 {
		lower = 0
	}
	upper := hi + margin
	return floatPtr(lower), floatPtr(upper)
}
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
n := len(labels)

View File

@@ -7,6 +7,9 @@ import (
"path/filepath"
"strings"
"testing"
"time"
"bee/audit/internal/platform"
)
func TestChartLegendNumber(t *testing.T) {
@@ -31,6 +34,61 @@ func TestChartLegendNumber(t *testing.T) {
}
}
// TestChartDataFromSamplesUsesFullHistory checks that chartDataFromSamples
// turns every supplied sample into a chart point: for the "gpu-all-power"
// chart built from three samples of a single GPU it expects one series named
// "GPU 0", one label per sample, and the first and last power values intact.
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
	// Three samples, one minute apart, all reporting GPU 0.
	samples := []platform.LiveMetricSample{
		{
			Timestamp:  time.Now().Add(-3 * time.Minute),
			CPULoadPct: 10,
			MemLoadPct: 20,
			PowerW:     300,
			GPUs: []platform.GPUMetricRow{
				{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
			},
		},
		{
			Timestamp:  time.Now().Add(-2 * time.Minute),
			CPULoadPct: 30,
			MemLoadPct: 40,
			PowerW:     320,
			GPUs: []platform.GPUMetricRow{
				{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
			},
		},
		{
			Timestamp:  time.Now().Add(-1 * time.Minute),
			CPULoadPct: 50,
			MemLoadPct: 60,
			PowerW:     340,
			GPUs: []platform.GPUMetricRow{
				{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
			},
		},
	}

	datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
	if !ok {
		t.Fatal("chartDataFromSamples returned ok=false")
	}
	if title != "GPU Power" {
		t.Fatalf("title=%q", title)
	}
	if len(names) != 1 || names[0] != "GPU 0" {
		t.Fatalf("names=%v", names)
	}
	// One label and one data point per input sample.
	if len(labels) != len(samples) {
		t.Fatalf("labels len=%d want %d", len(labels), len(samples))
	}
	if len(datasets) != 1 || len(datasets[0]) != len(samples) {
		t.Fatalf("datasets shape=%v", datasets)
	}
	// The series must carry the per-GPU PowerW, not the system PowerW.
	if got := datasets[0][0]; got != 120 {
		t.Fatalf("datasets[0][0]=%v want 120", got)
	}
	if got := datasets[0][2]; got != 130 {
		t.Fatalf("datasets[0][2]=%v want 130", got)
	}
}
func TestRootRendersDashboard(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "audit.json")

View File

@@ -12,6 +12,7 @@ import (
"time"
"bee/audit/internal/app"
"bee/audit/internal/platform"
)
// Task statuses.
@@ -30,9 +31,12 @@ var taskNames = map[string]string{
"storage": "Storage SAT",
"cpu": "CPU SAT",
"amd": "AMD GPU SAT",
"amd-mem": "AMD GPU MEM Integrity",
"amd-bandwidth": "AMD GPU MEM Bandwidth",
"amd-stress": "AMD GPU Burn-in",
"memory-stress": "Memory Burn-in",
"sat-stress": "SAT Stress (stressapptest)",
"sat-stress": "SAT Stress (stressapptest)",
"platform-stress": "Platform Thermal Cycling",
"audit": "Audit",
"install": "Install to Disk",
"install-to-ram": "Install to RAM",
@@ -96,6 +100,34 @@ func resolveBurnPreset(profile string) burnPreset {
}
}
// resolvePlatformStressPreset maps a burn profile name to the load/idle cycle
// plan for the platform thermal cycling test:
//
//   - "overnight":  8 cycles of 600s load, idle rotating 120s/60s/30s
//   - "acceptance": 4 cycles of 300s load, idle alternating 60s/30s
//   - anything else (smoke): 2 cycles of 90s load, idle 60s then 30s
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
	var plan []platform.PlatformStressCycle

	switch profile {
	case "overnight":
		plan = []platform.PlatformStressCycle{
			{LoadSec: 600, IdleSec: 120},
			{LoadSec: 600, IdleSec: 60},
			{LoadSec: 600, IdleSec: 30},
			{LoadSec: 600, IdleSec: 120},
			{LoadSec: 600, IdleSec: 60},
			{LoadSec: 600, IdleSec: 30},
			{LoadSec: 600, IdleSec: 120},
			{LoadSec: 600, IdleSec: 60},
		}
	case "acceptance":
		plan = []platform.PlatformStressCycle{
			{LoadSec: 300, IdleSec: 60},
			{LoadSec: 300, IdleSec: 30},
			{LoadSec: 300, IdleSec: 60},
			{LoadSec: 300, IdleSec: 30},
		}
	default: // smoke
		plan = []platform.PlatformStressCycle{
			{LoadSec: 90, IdleSec: 60},
			{LoadSec: 90, IdleSec: 30},
		}
	}

	return platform.PlatformStressOptions{Cycles: plan}
}
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
type taskQueue struct {
mu sync.Mutex
@@ -124,6 +156,12 @@ var (
runAMDAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
}
runAMDMemIntegrityPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
return a.RunAMDMemIntegrityPackCtx(ctx, baseDir, logFunc)
}
runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
}
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
}
@@ -380,6 +418,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
case "amd":
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
case "amd-mem":
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
case "amd-bandwidth":
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
case "amd-stress":
dur := t.params.Duration
if t.params.BurnProfile != "" && dur <= 0 {
@@ -398,6 +440,9 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
}
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
case "platform-stress":
opts := resolvePlatformStressPreset(t.params.BurnProfile)
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
case "audit":
result, e := a.RunAuditNow(q.opts.RuntimeMode)
if e != nil {

View File

@@ -30,8 +30,8 @@ lb config noauto \
--linux-flavours "amd64" \
--linux-packages "${LB_LINUX_PACKAGES}" \
--memtest none \
--iso-volume "EASY-BEE" \
--iso-application "EASY-BEE" \
--iso-volume "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
--apt-recommends false \
--chroot-squashfs-compression-type zstd \

View File

@@ -12,6 +12,7 @@ CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
AUTH_KEYS=""
REBUILD_IMAGE=0
CLEAN_CACHE=0
VARIANT="all"
. "${BUILDER_DIR}/VERSIONS"
@@ -34,14 +35,23 @@ while [ $# -gt 0 ]; do
REBUILD_IMAGE=1
shift
;;
--variant)
VARIANT="$2"
shift 2
;;
*)
echo "unknown arg: $1" >&2
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys]" >&2
echo "usage: $0 [--cache-dir /path] [--rebuild-image] [--clean-build] [--authorized-keys /path/to/authorized_keys] [--variant nvidia|amd|all]" >&2
exit 1
;;
esac
done
case "$VARIANT" in
nvidia|amd|all) ;;
*) echo "unknown variant: $VARIANT (expected nvidia, amd, or all)" >&2; exit 1 ;;
esac
if [ "$CLEAN_CACHE" = "1" ]; then
echo "=== cleaning build cache: ${CACHE_DIR} ==="
rm -rf "${CACHE_DIR:?}/go-build" \
@@ -49,8 +59,9 @@ if [ "$CLEAN_CACHE" = "1" ]; then
"${CACHE_DIR:?}/tmp" \
"${CACHE_DIR:?}/bee" \
"${CACHE_DIR:?}/lb-packages"
echo "=== cleaning live-build work dir: ${REPO_ROOT}/dist/live-build-work ==="
rm -rf "${REPO_ROOT}/dist/live-build-work"
echo "=== cleaning live-build work dirs ==="
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
echo "=== caches cleared, proceeding with build ==="
fi
@@ -108,34 +119,71 @@ else
echo "=== using existing builder image ${IMAGE_REF} (${BUILDER_PLATFORM}) ==="
fi
set -- \
run --rm --privileged \
--platform "${BUILDER_PLATFORM}" \
-v "${REPO_ROOT}:/work" \
-v "${CACHE_DIR}:/cache" \
-e BEE_CONTAINER_BUILD=1 \
-e GOCACHE=/cache/go-build \
-e GOMODCACHE=/cache/go-mod \
-e TMPDIR=/cache/tmp \
-e BEE_CACHE_DIR=/cache/bee \
-w /work \
"${IMAGE_REF}" \
sh /work/iso/builder/build.sh
if [ -n "$AUTH_KEYS" ]; then
set -- run --rm --privileged \
--platform "${BUILDER_PLATFORM}" \
-v "${REPO_ROOT}:/work" \
-v "${CACHE_DIR}:/cache" \
-v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
# Build the docker run argument string for one variant (appends --authorized-keys when AUTH_KEYS is set)
build_run_args() {
_variant="$1"
_auth_arg=""
if [ -n "$AUTH_KEYS" ]; then
_auth_arg="--authorized-keys /tmp/bee-authkeys/${AUTH_KEYS_BASE}"
fi
echo "run --rm --privileged \
--platform ${BUILDER_PLATFORM} \
-v ${REPO_ROOT}:/work \
-v ${CACHE_DIR}:/cache \
${AUTH_KEYS:+-v ${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro} \
-e BEE_CONTAINER_BUILD=1 \
-e GOCACHE=/cache/go-build \
-e GOMODCACHE=/cache/go-mod \
-e TMPDIR=/cache/tmp \
-e BEE_CACHE_DIR=/cache/bee \
-w /work \
"${IMAGE_REF}" \
sh /work/iso/builder/build.sh --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
fi
${IMAGE_REF} \
sh /work/iso/builder/build.sh --variant ${_variant} ${_auth_arg}"
}
"$CONTAINER_TOOL" "$@"
# run_variant VARIANT
#
# Runs iso/builder/build.sh for a single GPU variant inside the builder
# container. The repository is mounted at /work and the cache directory at
# /cache; the Go build/module caches, TMPDIR and BEE_CACHE_DIR all point into
# /cache. When AUTH_KEYS is non-empty, AUTH_KEYS_DIR is additionally mounted
# read-only at /tmp/bee-authkeys and the key file is passed to build.sh via
# --authorized-keys. The two branches are otherwise identical.
# NOTE(review): AUTH_KEYS_DIR / AUTH_KEYS_BASE are set outside this excerpt —
# presumably the dirname/basename of the --authorized-keys path; confirm.
run_variant() {
    _v="$1"
    echo "=== building variant: ${_v} ==="
    if [ -n "$AUTH_KEYS" ]; then
        # With SSH keys: extra read-only mount plus --authorized-keys for build.sh.
        "$CONTAINER_TOOL" run --rm --privileged \
            --platform "${BUILDER_PLATFORM}" \
            -v "${REPO_ROOT}:/work" \
            -v "${CACHE_DIR}:/cache" \
            -v "${AUTH_KEYS_DIR}:/tmp/bee-authkeys:ro" \
            -e BEE_CONTAINER_BUILD=1 \
            -e GOCACHE=/cache/go-build \
            -e GOMODCACHE=/cache/go-mod \
            -e TMPDIR=/cache/tmp \
            -e BEE_CACHE_DIR=/cache/bee \
            -w /work \
            "${IMAGE_REF}" \
            sh /work/iso/builder/build.sh --variant "${_v}" \
            --authorized-keys "/tmp/bee-authkeys/${AUTH_KEYS_BASE}"
    else
        # Without SSH keys: same invocation minus the key mount and flag.
        "$CONTAINER_TOOL" run --rm --privileged \
            --platform "${BUILDER_PLATFORM}" \
            -v "${REPO_ROOT}:/work" \
            -v "${CACHE_DIR}:/cache" \
            -e BEE_CONTAINER_BUILD=1 \
            -e GOCACHE=/cache/go-build \
            -e GOMODCACHE=/cache/go-mod \
            -e TMPDIR=/cache/tmp \
            -e BEE_CACHE_DIR=/cache/bee \
            -w /work \
            "${IMAGE_REF}" \
            sh /work/iso/builder/build.sh --variant "${_v}"
    fi
}
# Build the requested variant(s). "all" builds nvidia first, then amd, as two
# consecutive run_variant invocations.
case "$VARIANT" in
    nvidia)
        run_variant nvidia
        ;;
    amd)
        run_variant amd
        ;;
    all)
        run_variant nvidia
        run_variant amd
        ;;
esac

View File

@@ -13,19 +13,29 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder"
OVERLAY_DIR="${REPO_ROOT}/iso/overlay"
DIST_DIR="${REPO_ROOT}/dist"
VENDOR_DIR="${REPO_ROOT}/iso/vendor"
BUILD_WORK_DIR="${DIST_DIR}/live-build-work"
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
AUTH_KEYS=""
BEE_GPU_VENDOR="nvidia"
# parse args
while [ $# -gt 0 ]; do
case "$1" in
--authorized-keys) AUTH_KEYS="$2"; shift 2 ;;
--variant) BEE_GPU_VENDOR="$2"; shift 2 ;;
*) echo "unknown arg: $1"; exit 1 ;;
esac
done
case "$BEE_GPU_VENDOR" in
nvidia|amd) ;;
*) echo "unknown variant: $BEE_GPU_VENDOR (expected nvidia or amd)" >&2; exit 1 ;;
esac
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BEE_GPU_VENDOR}"
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BEE_GPU_VENDOR}"
export BEE_GPU_VENDOR
. "${BUILDER_DIR}/VERSIONS"
export PATH="$PATH:/usr/local/go/bin"
@@ -132,7 +142,7 @@ if [ ! -d "/usr/src/linux-headers-${KVER}" ]; then
apt-get install -y "linux-headers-${KVER}"
fi
echo "=== bee ISO build ==="
echo "=== bee ISO build (variant: ${BEE_GPU_VENDOR}) ==="
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
echo ""
@@ -141,8 +151,8 @@ echo "=== syncing git submodules ==="
git -C "${REPO_ROOT}" submodule update --init --recursive
# --- compile bee binary (static, Linux amd64) ---
# Shared between variants — built once, reused on second pass.
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
NEED_BUILD=1
if [ -f "$BEE_BIN" ]; then
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
@@ -172,37 +182,41 @@ else
echo "=== bee binary up to date, skipping build ==="
fi
echo ""
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
sh "${BUILDER_DIR}/build-cublas.sh" \
"${CUBLAS_VERSION}" \
"${CUDA_USERSPACE_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}"
# --- NVIDIA-only build steps ---
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
echo ""
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
sh "${BUILDER_DIR}/build-cublas.sh" \
"${CUBLAS_VERSION}" \
"${CUDA_USERSPACE_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}"
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
GPU_STRESS_NEED_BUILD=1
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
GPU_STRESS_NEED_BUILD=0
GPU_STRESS_NEED_BUILD=1
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
GPU_STRESS_NEED_BUILD=0
fi
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
echo "=== building bee-gpu-stress ==="
gcc -O2 -s -Wall -Wextra \
-I"${CUBLAS_CACHE}/include" \
-o "$GPU_STRESS_BIN" \
"${BUILDER_DIR}/bee-gpu-stress.c" \
-ldl -lm
echo "binary: $GPU_STRESS_BIN"
else
echo "=== bee-gpu-stress up to date, skipping build ==="
fi
fi
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
echo "=== building bee-gpu-stress ==="
gcc -O2 -s -Wall -Wextra \
-I"${CUBLAS_CACHE}/include" \
-o "$GPU_STRESS_BIN" \
"${BUILDER_DIR}/bee-gpu-stress.c" \
-ldl -lm
echo "binary: $GPU_STRESS_BIN"
else
echo "=== bee-gpu-stress up to date, skipping build ==="
fi
echo "=== preparing staged overlay ==="
# Sync builder config into work dir, preserving lb cache (chroot + packages).
# We do NOT rm -rf BUILD_WORK_DIR so lb can reuse its chroot on repeat builds.
echo "=== preparing staged overlay (${BEE_GPU_VENDOR}) ==="
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
# Sync builder config into variant work dir, preserving lb cache.
rsync -a --delete \
--exclude='cache/' \
--exclude='chroot/' \
@@ -212,7 +226,10 @@ rsync -a --delete \
--exclude='*.contents' \
--exclude='*.files' \
"${BUILDER_DIR}/" "${BUILD_WORK_DIR}/"
# Also persist package cache to CACHE_ROOT so it survives a manual wipe of BUILD_WORK_DIR.
# Share deb package cache across variants.
# Restore: populate work dir cache from shared cache before build.
# Persist: sync back after build (done after lb build below).
LB_PKG_CACHE="${CACHE_ROOT}/lb-packages"
mkdir -p "${LB_PKG_CACHE}"
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
@@ -221,6 +238,7 @@ elif [ -d "${LB_PKG_CACHE}" ] && [ "$(ls -A "${LB_PKG_CACHE}" 2>/dev/null)" ]; t
mkdir -p "${BUILD_WORK_DIR}/cache/packages.chroot"
rsync -a "${LB_PKG_CACHE}/" "${BUILD_WORK_DIR}/cache/packages.chroot/"
fi
rsync -a "${OVERLAY_DIR}/" "${OVERLAY_STAGE_DIR}/"
rm -f \
"${OVERLAY_STAGE_DIR}/etc/bee-ssh-password-fallback" \
@@ -231,6 +249,12 @@ rm -f \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
# Remove NVIDIA-specific overlay files for AMD variant
if [ "$BEE_GPU_VENDOR" = "amd" ]; then
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
fi
# --- inject authorized_keys for SSH access ---
AUTHORIZED_KEYS_FILE="${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys"
mkdir -p "${OVERLAY_STAGE_DIR}/root/.ssh"
@@ -268,8 +292,11 @@ fi
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
fi
# --- inject smoketest into overlay so it runs directly on the live CD ---
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
@@ -286,100 +313,143 @@ for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
fi
done
# --- build NVIDIA kernel modules ---
echo ""
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
# --- NVIDIA kernel modules and userspace libs ---
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
echo ""
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
KVER="${DEBIAN_KERNEL_ABI}-amd64"
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
KVER="${DEBIAN_KERNEL_ABI}-amd64"
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
# Inject .ko files into overlay at /usr/local/lib/nvidia/
OVERLAY_KMOD_DIR="${OVERLAY_DIR}/usr/local/lib/nvidia"
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
mkdir -p "${OVERLAY_KMOD_DIR}"
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
# Inject .ko files into overlay at /usr/local/lib/nvidia/
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
mkdir -p "${OVERLAY_KMOD_DIR}"
cp "${NVIDIA_CACHE}/modules/"*.ko "${OVERLAY_KMOD_DIR}/"
# Inject nvidia-smi and libnvidia-ml
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
# Inject nvidia-smi and libnvidia-ml
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin" "${OVERLAY_STAGE_DIR}/usr/lib"
cp "${NVIDIA_CACHE}/bin/nvidia-smi" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-smi"
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
mkdir -p "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}"
cp "${NVIDIA_CACHE}/firmware/"* "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/"
echo "=== firmware: $(ls "${OVERLAY_STAGE_DIR}/lib/firmware/nvidia/${NVIDIA_DRIVER_VERSION}/" | wc -l) files injected ==="
fi
# --- build / download NCCL ---
echo ""
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# --- build nccl-tests ---
echo ""
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
"${NCCL_TESTS_VERSION}" \
"${NCCL_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}" \
"${NVCC_VERSION}" \
"${DEBIAN_VERSION}"
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
echo "=== all_reduce_perf injected ==="
fi
# --- build / download NCCL ---
echo ""
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
# --- build nccl-tests ---
echo ""
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
"${NCCL_TESTS_VERSION}" \
"${NCCL_VERSION}" \
"${NCCL_CUDA_VERSION}" \
"${DIST_DIR}" \
"${NVCC_VERSION}" \
"${DEBIAN_VERSION}"
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
echo "=== all_reduce_perf injected ==="
# --- embed build metadata ---
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
BUILD_DATE="$(date +%Y-%m-%d)"
GIT_COMMIT="$(git -C "${REPO_ROOT}" rev-parse --short HEAD 2>/dev/null || echo unknown)"
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
BUILD_DATE=${BUILD_DATE}
GIT_COMMIT=${GIT_COMMIT}
DEBIAN_VERSION=${DEBIAN_VERSION}
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
GPU_VERSION_LINE="NVIDIA_DRIVER_VERSION=${NVIDIA_DRIVER_VERSION}
NCCL_VERSION=${NCCL_VERSION}
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
CUBLAS_VERSION=${CUBLAS_VERSION}
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
else
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
GPU_BUILD_INFO="rocm:${ROCM_VERSION}"
fi
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
BUILD_DATE=${BUILD_DATE}
GIT_COMMIT=${GIT_COMMIT}
DEBIAN_VERSION=${DEBIAN_VERSION}
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
${GPU_VERSION_LINE}
EOF
# Write GPU vendor marker for hooks
echo "${BEE_GPU_VENDOR}" > "${OVERLAY_STAGE_DIR}/etc/bee-gpu-vendor"
# Patch motd with build info
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} nvidia:${NVIDIA_DRIVER_VERSION}"
BEE_BUILD_INFO="${BUILD_DATE} git:${GIT_COMMIT} debian:${DEBIAN_VERSION} ${GPU_BUILD_INFO}"
if [ -f "${OVERLAY_STAGE_DIR}/etc/motd" ]; then
sed "s/%%BUILD_INFO%%/${BEE_BUILD_INFO}/" "${OVERLAY_STAGE_DIR}/etc/motd" \
> "${OVERLAY_STAGE_DIR}/etc/motd.patched"
mv "${OVERLAY_STAGE_DIR}/etc/motd.patched" "${OVERLAY_STAGE_DIR}/etc/motd"
fi
# --- substitute version placeholders in package list ---
sed -i \
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
"${BUILD_WORK_DIR}/config/package-lists/bee.list.chroot" \
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
# --- copy variant-specific package list into work dir ---
cp "${BUILD_WORK_DIR}/config/package-lists/bee-${BEE_GPU_VENDOR}.list.chroot" \
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
# --- remove archives for the other vendor ---
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
rm -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" \
"${BUILD_WORK_DIR}/config/archives/rocm.key.chroot"
else
rm -f "${BUILD_WORK_DIR}/config/archives/nvidia-cuda.list.chroot" \
"${BUILD_WORK_DIR}/config/archives/nvidia-cuda.key.chroot"
fi
# --- substitute version placeholders in package list and archive ---
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
sed -i \
-e "s/%%DCGM_VERSION%%/${DCGM_VERSION}/g" \
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
else
sed -i \
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
-e "s/%%ROCM_SMI_VERSION%%/${ROCM_SMI_VERSION}/g" \
-e "s/%%ROCM_BANDWIDTH_TEST_VERSION%%/${ROCM_BANDWIDTH_TEST_VERSION}/g" \
-e "s/%%ROCM_VALIDATION_SUITE_VERSION%%/${ROCM_VALIDATION_SUITE_VERSION}/g" \
-e "s/%%ROCBLAS_VERSION%%/${ROCBLAS_VERSION}/g" \
-e "s/%%ROCRAND_VERSION%%/${ROCRAND_VERSION}/g" \
-e "s/%%HIP_RUNTIME_AMD_VERSION%%/${HIP_RUNTIME_AMD_VERSION}/g" \
-e "s/%%HIPBLASLT_VERSION%%/${HIPBLASLT_VERSION}/g" \
-e "s/%%COMGR_VERSION%%/${COMGR_VERSION}/g" \
"${BUILD_WORK_DIR}/config/package-lists/bee-gpu.list.chroot"
if [ -f "${BUILD_WORK_DIR}/config/archives/rocm.list.chroot" ]; then
sed -i \
-e "s/%%ROCM_VERSION%%/${ROCM_VERSION}/g" \
"${BUILD_WORK_DIR}/config/archives/rocm.list.chroot"
fi
fi
# --- sync overlay into live-build includes.chroot ---
LB_DIR="${BUILD_WORK_DIR}"
@@ -395,20 +465,31 @@ fi
# --- build ISO using live-build ---
echo ""
echo "=== building ISO (live-build) ==="
echo "=== building ISO (live-build, variant: ${BEE_GPU_VENDOR}) ==="
# Export for auto/config
# BEE_GPU_VENDOR_UPPER is the vendor name converted to upper case
# (e.g. nvidia -> NVIDIA) and exported so child processes can read it.
BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
export BEE_GPU_VENDOR_UPPER
cd "${LB_DIR}"
# Only the last 3 / 5 lines of the clean and config output are shown.
# NOTE(review): the exit status of each pipeline is that of tail unless
# pipefail is enabled elsewhere in this script; confirm that a failing
# lb clean / lb config is not being masked.
lb clean 2>&1 | tail -3
lb config 2>&1 | tail -5
# Full build output is kept, with stderr merged into stdout.
lb build 2>&1
# --- persist deb package cache back to shared location ---
# This allows the second variant to reuse all downloaded packages.
# Skipped when the work dir has no cache/packages.chroot directory.
if [ -d "${BUILD_WORK_DIR}/cache/packages.chroot" ]; then
# Trailing slashes: the contents of packages.chroot are copied into
# LB_PKG_CACHE rather than the directory itself.
rsync -a "${BUILD_WORK_DIR}/cache/packages.chroot/" "${LB_PKG_CACHE}/"
echo "=== package cache synced to ${LB_PKG_CACHE} ==="
fi
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
ISO_OUT="${DIST_DIR}/bee-debian${DEBIAN_VERSION}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
if [ -f "$ISO_RAW" ]; then
cp "$ISO_RAW" "$ISO_OUT"
echo ""
echo "=== done ==="
echo "=== done (${BEE_GPU_VENDOR}) ==="
echo "ISO: $ISO_OUT"
if command -v stat >/dev/null 2>&1; then
ISO_SIZE_BYTES="$(stat -c '%s' "$ISO_OUT" 2>/dev/null || stat -f '%z' "$ISO_OUT")"

View File

@@ -10,12 +10,12 @@ echo " ╚══════╝╚═╝ ╚═╝╚══════╝
echo ""
menuentry "EASY-BEE" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}
menuentry "EASY-BEE (load to RAM)" {
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
initrd @INITRD_LIVE@
}

View File

@@ -5,6 +5,9 @@ set -e
echo "=== bee chroot setup ==="
GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)
echo "=== GPU vendor: ${GPU_VENDOR} ==="
ensure_bee_console_user() {
if id bee >/dev/null 2>&1; then
usermod -d /home/bee -s /bin/bash bee 2>/dev/null || true
@@ -21,10 +24,8 @@ ensure_bee_console_user() {
ensure_bee_console_user
# Enable bee services
systemctl enable nvidia-dcgm.service 2>/dev/null || true
# Enable common bee services
systemctl enable bee-network.service
systemctl enable bee-nvidia.service
systemctl enable bee-preflight.service
systemctl enable bee-audit.service
systemctl enable bee-web.service
@@ -36,25 +37,33 @@ systemctl enable serial-getty@ttyS0.service 2>/dev/null || true
systemctl enable serial-getty@ttyS1.service 2>/dev/null || true
systemctl enable bee-journal-mirror@ttyS1.service 2>/dev/null || true
# Enable GPU-vendor specific services
# A GPU_VENDOR value other than "nvidia" or "amd" matches neither branch, so
# nothing vendor-specific is set up here.
if [ "$GPU_VENDOR" = "nvidia" ]; then
# Best effort: errors from enabling DCGM are silenced and ignored.
systemctl enable nvidia-dcgm.service 2>/dev/null || true
# No fallback on this one; a failure here is not ignored.
systemctl enable bee-nvidia.service
elif [ "$GPU_VENDOR" = "amd" ]; then
# ROCm symlinks (packages install to /opt/rocm-*/bin/)
for tool in rocm-smi rocm-bandwidth-test rvs; do
# Leave any existing /usr/local/bin entry untouched.
if [ ! -e /usr/local/bin/${tool} ]; then
# Look under /opt for a regular file at */bin/<tool>; when several match,
# the path that sorts last is used.
bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
# Create the symlink only when a match was found.
[ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
fi
done
fi
# Ensure scripts are executable
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
chmod +x /usr/local/bin/bee 2>/dev/null || true
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
if [ "$GPU_VENDOR" = "nvidia" ]; then
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
fi
# Reload udev rules
udevadm control --reload-rules 2>/dev/null || true
# rocm symlinks (packages install to /opt/rocm-*/bin/)
for tool in rocm-smi rocm-bandwidth-test rvs; do
if [ ! -e /usr/local/bin/${tool} ]; then
bin_path="$(find /opt -path "*/bin/${tool}" -type f 2>/dev/null | sort | tail -1)"
[ -n "${bin_path}" ] && ln -sf "${bin_path}" /usr/local/bin/${tool}
fi
done
# Create export directory
mkdir -p /appdata/bee/export
@@ -62,4 +71,4 @@ if [ -f /etc/sudoers.d/bee ]; then
chmod 0440 /etc/sudoers.d/bee
fi
echo "=== bee chroot setup complete ==="
echo "=== bee chroot setup complete (${GPU_VENDOR}) ==="

View File

@@ -4,6 +4,9 @@
# not inside the squashfs).
set -e
echo "memtest: scanning chroot/boot/ for memtest files:"
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files found in chroot/boot/"
for f in memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi; do
src="chroot/boot/${f}"
if [ -f "${src}" ]; then

View File

@@ -0,0 +1,9 @@
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
rocm-smi-lib=%%ROCM_SMI_VERSION%%
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
rocblas=%%ROCBLAS_VERSION%%
rocrand=%%ROCRAND_VERSION%%
hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
hipblaslt=%%HIPBLASLT_VERSION%%
comgr=%%COMGR_VERSION%%

View File

@@ -0,0 +1,2 @@
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
datacenter-gpu-manager=1:%%DCGM_VERSION%%

View File

@@ -72,18 +72,5 @@ firmware-bnx2x
firmware-cavium
firmware-qlogic
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
datacenter-gpu-manager=1:%%DCGM_VERSION%%
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
rocm-smi-lib=%%ROCM_SMI_VERSION%%
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
rocm-validation-suite=%%ROCM_VALIDATION_SUITE_VERSION%%
rocblas=%%ROCBLAS_VERSION%%
rocrand=%%ROCRAND_VERSION%%
hip-runtime-amd=%%HIP_RUNTIME_AMD_VERSION%%
hipblaslt=%%HIPBLASLT_VERSION%%
comgr=%%COMGR_VERSION%%
# glibc compat helpers (for any external binaries that need it)
libc6

View File

@@ -39,7 +39,7 @@ info "nvidia boot mode: ${NVIDIA_BOOT_MODE}"
# --- PATH & binaries ---
echo "-- PATH & binaries --"
for tool in dmidecode smartctl nvme ipmitool lspci bee; do
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
if p=$(PATH="/usr/local/bin:/usr/sbin:/sbin:$PATH" command -v "$tool" 2>/dev/null); then
ok "$tool found: $p"
else
fail "$tool: NOT FOUND"

View File

@@ -1,4 +1,4 @@
export PATH="$PATH:/usr/local/bin:/opt/rocm/bin:/opt/rocm/sbin"
export PATH="$PATH:/usr/local/bin:/usr/sbin:/sbin:/opt/rocm/bin:/opt/rocm/sbin"
# Print web UI URLs on the local console at login.
if [ -z "${SSH_CONNECTION:-}" ] \