Compare commits
47 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
444a7d16cc | ||
|
|
fd722692a4 | ||
|
|
99cece524c | ||
|
|
c27449c60e | ||
|
|
5ef879e307 | ||
|
|
e7df63bae1 | ||
|
|
17ff3811f8 | ||
|
|
fc7fe0b08e | ||
|
|
3cf75a541a | ||
|
|
1f750d3edd | ||
|
|
b2b0444131 | ||
| dbab43db90 | |||
| bcb7fe5fe9 | |||
| d21d9d191b | |||
| ef45246ea0 | |||
| 348db35119 | |||
| 1dd7f243f5 | |||
| 938e499ac2 | |||
| 964ab39656 | |||
| c2aecc6ce9 | |||
| 439b86ce59 | |||
| eb60100297 | |||
|
|
2baf3be640 | ||
|
|
d92f8f41d0 | ||
|
|
76a9100779 | ||
|
|
1b6d592bf3 | ||
|
|
c95bbff23b | ||
|
|
4e4debd4da | ||
|
|
5839f870b7 | ||
|
|
b447717a5a | ||
|
|
f6f4923ac9 | ||
|
|
c394845b34 | ||
|
|
3472afea32 | ||
|
|
942f11937f | ||
|
|
b5b34983f1 | ||
| 45221d1e9a | |||
| 3869788bac | |||
| 3dbc2184ef | |||
| 60cb8f889a | |||
| c9ee078622 | |||
| ea660500c9 | |||
| d43a9aeec7 | |||
|
|
f5622e351e | ||
|
|
a20806afc8 | ||
|
|
4f9b6b3bcd | ||
|
|
c850b39b01 | ||
|
|
6dee8f3509 |
4
PLAN.md
4
PLAN.md
@@ -343,9 +343,9 @@ Planned code shape:
|
||||
- `bee tui` can rerun the audit manually
|
||||
- `bee tui` can export the latest audit JSON to removable media
|
||||
- `bee tui` can show health summary and run NVIDIA/memory/storage acceptance tests
|
||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-stress`
|
||||
- NVIDIA SAT now includes a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||
- SAT summaries now expose `overall_status` plus per-job `OK/FAILED/UNSUPPORTED`
|
||||
- Memory/GPU SAT runtime defaults can be overridden via `BEE_MEMTESTER_*` and `BEE_GPU_STRESS_*`
|
||||
- Memory SAT runtime defaults can be overridden via `BEE_MEMTESTER_*`
|
||||
- removable export requires explicit target selection, mount, confirmation, copy, and cleanup
|
||||
|
||||
### 2.6 — Vendor utilities and optional assets
|
||||
|
||||
@@ -356,6 +356,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("sat", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
duration := fs.Int("duration", 0, "stress-ng duration in seconds (cpu only; default: 60)")
|
||||
diagLevel := fs.Int("diag-level", 0, "DCGM diagnostic level for nvidia (1=quick, 2=medium, 3=targeted stress, 4=extended stress; default: 1)")
|
||||
if err := fs.Parse(args[1:]); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
@@ -370,7 +371,7 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
target := args[0]
|
||||
if target != "nvidia" && target != "memory" && target != "storage" && target != "cpu" {
|
||||
fmt.Fprintf(stderr, "bee sat: unknown target %q\n", target)
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>] [--diag-level <1-4>]")
|
||||
return 2
|
||||
}
|
||||
|
||||
@@ -382,7 +383,12 @@ func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
logLine := func(s string) { fmt.Fprintln(os.Stderr, s) }
|
||||
switch target {
|
||||
case "nvidia":
|
||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||
level := *diagLevel
|
||||
if level > 0 {
|
||||
_, err = application.RunNvidiaAcceptancePackWithOptions(context.Background(), "", level, nil, logLine)
|
||||
} else {
|
||||
archive, err = application.RunNvidiaAcceptancePack("", logLine)
|
||||
}
|
||||
case "memory":
|
||||
archive, err = application.RunMemoryAcceptancePackCtx(context.Background(), "", logLine)
|
||||
case "storage":
|
||||
|
||||
@@ -40,6 +40,8 @@ type App struct {
|
||||
sat satRunner
|
||||
runtime runtimeChecker
|
||||
installer installer
|
||||
// StatusDB is the unified component health store (nil if unavailable).
|
||||
StatusDB *ComponentStatusDB
|
||||
}
|
||||
|
||||
type ActionResult struct {
|
||||
@@ -80,6 +82,7 @@ type installer interface {
|
||||
ListInstallDisks() ([]platform.InstallDisk, error)
|
||||
InstallToDisk(ctx context.Context, device string, logFile string) error
|
||||
IsLiveMediaInRAM() bool
|
||||
LiveBootSource() platform.LiveBootSource
|
||||
RunInstallToRAM(ctx context.Context, logFunc func(string)) error
|
||||
}
|
||||
|
||||
@@ -100,6 +103,10 @@ func (a *App) IsLiveMediaInRAM() bool {
|
||||
return a.installer.IsLiveMediaInRAM()
|
||||
}
|
||||
|
||||
// LiveBootSource forwards to the installer's LiveBootSource method and returns its result unchanged.
func (a *App) LiveBootSource() platform.LiveBootSource {
|
||||
return a.installer.LiveBootSource()
|
||||
}
|
||||
|
||||
func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
return a.installer.RunInstallToRAM(ctx, logFunc)
|
||||
}
|
||||
@@ -107,6 +114,7 @@ func (a *App) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
type satRunner interface {
|
||||
RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error)
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaStressPack(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
@@ -130,7 +138,7 @@ type runtimeChecker interface {
|
||||
}
|
||||
|
||||
func New(platform *platform.System) *App {
|
||||
return &App{
|
||||
a := &App{
|
||||
network: platform,
|
||||
services: platform,
|
||||
exports: platform,
|
||||
@@ -139,19 +147,32 @@ func New(platform *platform.System) *App {
|
||||
runtime: platform,
|
||||
installer: platform,
|
||||
}
|
||||
if db, err := OpenComponentStatusDB(DefaultExportDir + "/component-status.json"); err == nil {
|
||||
a.StatusDB = db
|
||||
}
|
||||
return a
|
||||
}
|
||||
|
||||
// ApplySATOverlay parses a raw audit JSON, overlays the latest SAT results,
|
||||
// and returns the updated JSON. Used by the web UI to serve always-fresh status.
|
||||
func ApplySATOverlay(auditJSON []byte) ([]byte, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
snap, err := readAuditSnapshot(auditJSON)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&snap.Hardware, DefaultSATBaseDir, nil)
|
||||
return json.MarshalIndent(snap, "", " ")
|
||||
}
|
||||
|
||||
func readAuditSnapshot(auditJSON []byte) (schema.HardwareIngestRequest, error) {
|
||||
var snap schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(auditJSON, &snap); err != nil {
|
||||
return schema.HardwareIngestRequest{}, err
|
||||
}
|
||||
collector.NormalizeSnapshot(&snap.Hardware, snap.CollectedAt)
|
||||
return snap, nil
|
||||
}
|
||||
|
||||
func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, error) {
|
||||
if runtimeMode == runtimeenv.ModeLiveCD {
|
||||
if err := a.runtime.CaptureTechnicalDump(DefaultTechDumpDir); err != nil {
|
||||
@@ -159,7 +180,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
@@ -275,6 +296,9 @@ func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||
data = normalized
|
||||
}
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -508,6 +532,17 @@ func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir st
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
// RunNvidiaStressPack calls RunNvidiaStressPackCtx with context.Background() and the same remaining arguments.
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
@@ -721,6 +756,7 @@ func (a *App) HealthSummaryResult() ActionResult {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ActionResult{Title: "Health summary", Body: "Audit JSON is unreadable."}
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
summary := collector.BuildHealthSummary(snapshot.Hardware)
|
||||
var body strings.Builder
|
||||
@@ -755,6 +791,7 @@ func (a *App) MainBanner() string {
|
||||
if err := json.Unmarshal(raw, &snapshot); err != nil {
|
||||
return ""
|
||||
}
|
||||
collector.NormalizeSnapshot(&snapshot.Hardware, snapshot.CollectedAt)
|
||||
|
||||
var lines []string
|
||||
if system := formatSystemLine(snapshot.Hardware.Board); system != "" {
|
||||
|
||||
@@ -120,14 +120,15 @@ func (f fakeTools) CheckTools(names []string) []platform.ToolStatus {
|
||||
}
|
||||
|
||||
type fakeSAT struct {
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
detectVendorFn func() string
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
runNvidiaFn func(string) (string, error)
|
||||
runNvidiaStressFn func(string, platform.NvidiaStressOptions) (string, error)
|
||||
runMemoryFn func(string) (string, error)
|
||||
runStorageFn func(string) (string, error)
|
||||
runCPUFn func(string, int) (string, error)
|
||||
detectVendorFn func() string
|
||||
listAMDGPUsFn func() ([]platform.AMDGPUInfo, error)
|
||||
runAMDPackFn func(string) (string, error)
|
||||
listNvidiaGPUsFn func() ([]platform.NvidiaGPU, error)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaAcceptancePack(baseDir string, _ func(string)) (string, error) {
|
||||
@@ -138,6 +139,13 @@ func (f fakeSAT) RunNvidiaAcceptancePackWithOptions(_ context.Context, baseDir s
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaStressPack(_ context.Context, baseDir string, opts platform.NvidiaStressOptions, _ func(string)) (string, error) {
|
||||
if f.runNvidiaStressFn != nil {
|
||||
return f.runNvidiaStressFn(baseDir, opts)
|
||||
}
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
if f.listNvidiaGPUsFn != nil {
|
||||
return f.listNvidiaGPUsFn()
|
||||
@@ -652,13 +660,50 @@ func TestHealthSummaryResultIncludesCompactSATSummary(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySATOverlayFiltersIgnoredLegacyDevices(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = filepath.Join(tmp, "sat")
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
|
||||
raw := `{
|
||||
"collected_at": "2026-03-15T10:00:00Z",
|
||||
"hardware": {
|
||||
"board": {"serial_number": "SRV123"},
|
||||
"storage": [
|
||||
{"model": "Virtual HDisk0", "serial_number": "AAAABBBBCCCC3"},
|
||||
{"model": "PASCARI", "serial_number": "DISK1", "status": "OK"}
|
||||
],
|
||||
"pcie_devices": [
|
||||
{"device_class": "Co-processor", "model": "402xx Series QAT", "status": "OK"},
|
||||
{"device_class": "VideoController", "model": "NVIDIA H100", "status": "OK"}
|
||||
]
|
||||
}
|
||||
}`
|
||||
|
||||
got, err := ApplySATOverlay([]byte(raw))
|
||||
if err != nil {
|
||||
t.Fatalf("ApplySATOverlay error: %v", err)
|
||||
}
|
||||
text := string(got)
|
||||
if contains(text, "Virtual HDisk0") {
|
||||
t.Fatalf("overlaid audit should drop virtual hdisk:\n%s", text)
|
||||
}
|
||||
if contains(text, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("overlaid audit should drop co-processors:\n%s", text)
|
||||
}
|
||||
if !contains(text, "PASCARI") || !contains(text, "NVIDIA H100") {
|
||||
t.Fatalf("overlaid audit should keep real devices:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
exportDir := filepath.Join(tmp, "export")
|
||||
if err := os.MkdirAll(filepath.Join(exportDir, "bee-sat", "memory-run"), 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"ok":true}`), 0644); err != nil {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.json"), []byte(`{"collected_at":"2026-03-15T10:00:00Z","hardware":{"board":{"serial_number":"SRV123"},"storage":[{"model":"Virtual HDisk0","serial_number":"AAAABBBBCCCC3"},{"model":"PASCARI","serial_number":"DISK1"}],"pcie_devices":[{"device_class":"Co-processor","model":"402xx Series QAT"},{"device_class":"VideoController","model":"NVIDIA H100"}]}}`), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-sat", "memory-run", "verbose.log"), []byte("sat verbose"), 0644); err != nil {
|
||||
@@ -690,6 +735,7 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
|
||||
tr := tar.NewReader(gzr)
|
||||
var names []string
|
||||
var auditJSON string
|
||||
for {
|
||||
hdr, err := tr.Next()
|
||||
if errors.Is(err, io.EOF) {
|
||||
@@ -699,6 +745,13 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
t.Fatalf("read tar entry: %v", err)
|
||||
}
|
||||
names = append(names, hdr.Name)
|
||||
if contains(hdr.Name, "/export/bee-audit.json") {
|
||||
body, err := io.ReadAll(tr)
|
||||
if err != nil {
|
||||
t.Fatalf("read audit entry: %v", err)
|
||||
}
|
||||
auditJSON = string(body)
|
||||
}
|
||||
}
|
||||
|
||||
var foundRaw bool
|
||||
@@ -713,6 +766,12 @@ func TestBuildSupportBundleIncludesExportDirContents(t *testing.T) {
|
||||
if !foundRaw {
|
||||
t.Fatalf("support bundle missing raw SAT log, names=%v", names)
|
||||
}
|
||||
if contains(auditJSON, "Virtual HDisk0") || contains(auditJSON, "\"device_class\": \"Co-processor\"") {
|
||||
t.Fatalf("support bundle should normalize ignored devices:\n%s", auditJSON)
|
||||
}
|
||||
if !contains(auditJSON, "PASCARI") || !contains(auditJSON, "NVIDIA H100") {
|
||||
t.Fatalf("support bundle should keep real devices:\n%s", auditJSON)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainBanner(t *testing.T) {
|
||||
@@ -726,6 +785,10 @@ func TestMainBanner(t *testing.T) {
|
||||
product := "PowerEdge R760"
|
||||
cpuModel := "Intel Xeon Gold 6430"
|
||||
memoryType := "DDR5"
|
||||
memorySerialA := "DIMM-A"
|
||||
memorySerialB := "DIMM-B"
|
||||
storageSerialA := "DISK-A"
|
||||
storageSerialB := "DISK-B"
|
||||
gpuClass := "VideoController"
|
||||
gpuModel := "NVIDIA H100"
|
||||
|
||||
@@ -741,12 +804,12 @@ func TestMainBanner(t *testing.T) {
|
||||
{Model: &cpuModel},
|
||||
},
|
||||
Memory: []schema.HardwareMemory{
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialA},
|
||||
{Present: &trueValue, SizeMB: intPtr(524288), Type: &memoryType, SerialNumber: &memorySerialB},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840)},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialA},
|
||||
{Present: &trueValue, SizeGB: intPtr(3840), SerialNumber: &storageSerialB},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &gpuClass, Model: &gpuModel},
|
||||
|
||||
266
audit/internal/app/component_status_db.go
Normal file
266
audit/internal/app/component_status_db.go
Normal file
@@ -0,0 +1,266 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ComponentStatusDB is a persistent, append-only store of hardware component health records.
|
||||
// Records are keyed by component identity strings (e.g. "pcie:0000:c8:00.0", "storage:nvme0n1").
|
||||
// Once a component is marked Warning or Critical, subsequent OK entries do not downgrade it —
|
||||
// the component stays at the highest observed severity until explicitly reset.
|
||||
type ComponentStatusDB struct {
|
||||
path string
|
||||
mu sync.Mutex
|
||||
records map[string]*ComponentStatusRecord
|
||||
}
|
||||
|
||||
// ComponentStatusRecord holds the current and historical health of one hardware component.
|
||||
type ComponentStatusRecord struct {
|
||||
ComponentKey string `json:"component_key"`
|
||||
Status string `json:"status"` // "OK", "Warning", "Critical", "Unknown"
|
||||
LastCheckedAt time.Time `json:"last_checked_at"`
|
||||
LastChangedAt time.Time `json:"last_changed_at"`
|
||||
ErrorSummary string `json:"error_summary,omitempty"`
|
||||
History []ComponentStatusEntry `json:"history"`
|
||||
}
|
||||
|
||||
// ComponentStatusEntry is a single observation in a component's history:
// when it was made, the status that was reported, which check reported it,
// and an optional free-text detail.
type ComponentStatusEntry struct {
	At     time.Time `json:"at"`
	Status string    `json:"status"`
	Source string    `json:"source"` // e.g. "sat:nvidia", "sat:memory", "watchdog:kmsg"
	Detail string    `json:"detail,omitempty"`
}
|
||||
|
||||
// OpenComponentStatusDB opens (or creates) the JSON status DB at path.
|
||||
func OpenComponentStatusDB(path string) (*ComponentStatusDB, error) {
|
||||
db := &ComponentStatusDB{
|
||||
path: path,
|
||||
records: make(map[string]*ComponentStatusRecord),
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, err
|
||||
}
|
||||
if len(data) > 0 {
|
||||
var records []ComponentStatusRecord
|
||||
if err := json.Unmarshal(data, &records); err == nil {
|
||||
for i := range records {
|
||||
db.records[records[i].ComponentKey] = &records[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
return db, nil
|
||||
}
|
||||
|
||||
// Record writes one observation for the given component key.
|
||||
// source is a short label like "sat:nvidia" or "watchdog:kmsg".
|
||||
// status is "OK", "Warning", "Critical", or "Unknown".
|
||||
// OK never downgrades an existing Warning or Critical status.
|
||||
func (db *ComponentStatusDB) Record(key, source, status, detail string) {
|
||||
if db == nil || strings.TrimSpace(key) == "" {
|
||||
return
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
|
||||
now := time.Now().UTC()
|
||||
rec, exists := db.records[key]
|
||||
if !exists {
|
||||
rec = &ComponentStatusRecord{ComponentKey: key}
|
||||
db.records[key] = rec
|
||||
}
|
||||
rec.LastCheckedAt = now
|
||||
|
||||
entry := ComponentStatusEntry{At: now, Status: status, Source: source, Detail: detail}
|
||||
rec.History = append(rec.History, entry)
|
||||
|
||||
// Status merge: OK never downgrades Warning/Critical.
|
||||
newSev := componentSeverity(status)
|
||||
curSev := componentSeverity(rec.Status)
|
||||
if newSev > curSev {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
rec.ErrorSummary = detail
|
||||
} else if rec.Status == "" {
|
||||
rec.Status = status
|
||||
rec.LastChangedAt = now
|
||||
}
|
||||
|
||||
_ = db.saveLocked()
|
||||
}
|
||||
|
||||
// Get returns the current record for a component key.
|
||||
func (db *ComponentStatusDB) Get(key string) (ComponentStatusRecord, bool) {
|
||||
if db == nil {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
r, ok := db.records[key]
|
||||
if !ok {
|
||||
return ComponentStatusRecord{}, false
|
||||
}
|
||||
return *r, true
|
||||
}
|
||||
|
||||
// All returns a snapshot of all records.
|
||||
func (db *ComponentStatusDB) All() []ComponentStatusRecord {
|
||||
if db == nil {
|
||||
return nil
|
||||
}
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
out := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
out = append(out, *r)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (db *ComponentStatusDB) saveLocked() error {
|
||||
records := make([]ComponentStatusRecord, 0, len(db.records))
|
||||
for _, r := range db.records {
|
||||
records = append(records, *r)
|
||||
}
|
||||
data, err := json.MarshalIndent(records, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(db.path, data, 0644)
|
||||
}
|
||||
|
||||
// componentSeverity ranks a status string so that a larger number means a
// more severe state: "Critical" is 3, "Warning" is 2, "OK" is 1. Surrounding
// whitespace is ignored; anything else (including "Unknown" and the empty
// string) ranks 0. Matching is case-sensitive.
func componentSeverity(status string) int {
	trimmed := strings.TrimSpace(status)
	if trimmed == "Critical" {
		return 3
	}
	if trimmed == "Warning" {
		return 2
	}
	if trimmed == "OK" {
		return 1
	}
	return 0
}
|
||||
|
||||
// ApplySATResultToDB reads a SAT summary.txt from the run directory next to archivePath
|
||||
// and writes component status records to db for the given SAT target.
|
||||
// archivePath may be either a bare .tar.gz path or "Archive written to /path/foo.tar.gz".
|
||||
func ApplySATResultToDB(db *ComponentStatusDB, target, archivePath string) {
|
||||
if db == nil || strings.TrimSpace(archivePath) == "" {
|
||||
return
|
||||
}
|
||||
archivePath = extractArchivePath(archivePath)
|
||||
if archivePath == "" {
|
||||
return
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
overall := strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
if overall == "" {
|
||||
return
|
||||
}
|
||||
|
||||
source := "sat:" + target
|
||||
dbStatus := satStatusToDBStatus(overall)
|
||||
|
||||
// Map SAT target to component keys.
|
||||
switch target {
|
||||
case "nvidia", "amd", "nvidia-stress", "amd-stress", "amd-mem", "amd-bandwidth":
|
||||
db.Record("pcie:gpu:"+target, source, dbStatus, target+" SAT: "+overall)
|
||||
case "memory", "memory-stress", "sat-stress":
|
||||
db.Record("memory:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "cpu", "platform-stress":
|
||||
db.Record("cpu:all", source, dbStatus, target+" SAT: "+overall)
|
||||
case "storage":
|
||||
// Try to record per-device if available in summary.
|
||||
recordedAny := false
|
||||
for key, val := range kv {
|
||||
if !strings.HasSuffix(key, "_status") || key == "overall_status" {
|
||||
continue
|
||||
}
|
||||
base := strings.TrimSuffix(key, "_status")
|
||||
idx := strings.Index(base, "_")
|
||||
if idx <= 0 {
|
||||
continue
|
||||
}
|
||||
devName := base[:idx]
|
||||
devStatus := satStatusToDBStatus(strings.ToUpper(strings.TrimSpace(val)))
|
||||
db.Record("storage:"+devName, source, devStatus, "storage SAT: "+val)
|
||||
recordedAny = true
|
||||
}
|
||||
if !recordedAny {
|
||||
db.Record("storage:all", source, dbStatus, "storage SAT: "+overall)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// satStatusToDBStatus translates a SAT summary status into the vocabulary
// stored in the component status DB: "OK" stays "OK", "FAILED" becomes
// "Warning", and every other value (including "PARTIAL" and "UNSUPPORTED")
// becomes "Unknown". The comparison is exact; callers upper-case and trim the
// value first.
func satStatusToDBStatus(overall string) string {
	if overall == "OK" {
		return "OK"
	}
	if overall == "FAILED" {
		return "Warning"
	}
	return "Unknown"
}
|
||||
|
||||
// ExtractArchivePath extracts a bare .tar.gz path from a string that may be
|
||||
// "Archive written to /path/foo.tar.gz" or already a bare path.
|
||||
func ExtractArchivePath(s string) string {
|
||||
return extractArchivePath(s)
|
||||
}
|
||||
|
||||
// ReadSATOverallStatus reads the overall_status value from the summary.txt
|
||||
// file located in the run directory alongside archivePath.
|
||||
// Returns "" if the file cannot be read.
|
||||
func ReadSATOverallStatus(archivePath string) string {
|
||||
if strings.TrimSpace(archivePath) == "" {
|
||||
return ""
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
data, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
kv := parseSATKV(string(data))
|
||||
return strings.ToUpper(strings.TrimSpace(kv["overall_status"]))
|
||||
}
|
||||
|
||||
// extractArchivePath reduces s to a bare archive path.
//
// s is trimmed first. If it does not end in ".tar.gz" it is returned as
// trimmed. Otherwise:
//   - the documented message form "Archive written to <path>" yields <path>;
//   - a string that already starts with "/" is treated as a bare absolute
//     path and returned whole;
//   - anything else yields its last whitespace-separated field.
//
// The first two cases keep paths that contain spaces intact (for example
// "/mnt/usb drive/run.tar.gz"); taking the last field unconditionally would
// cut such a path at its last space.
func extractArchivePath(s string) string {
	const messagePrefix = "Archive written to "

	s = strings.TrimSpace(s)
	if !strings.HasSuffix(s, ".tar.gz") {
		return s
	}
	if strings.HasPrefix(s, messagePrefix) {
		return strings.TrimSpace(strings.TrimPrefix(s, messagePrefix))
	}
	if strings.HasPrefix(s, "/") {
		return s
	}
	if fields := strings.Fields(s); len(fields) > 0 {
		return fields[len(fields)-1]
	}
	return s
}
|
||||
|
||||
// parseSATKV reads key=value lines from raw into a map. Each line is split at
// its first "="; key and value are trimmed of surrounding whitespace. Lines
// without "=" are skipped, and a later line with the same key overwrites an
// earlier one.
func parseSATKV(raw string) map[string]string {
	pairs := map[string]string{}
	for _, line := range strings.Split(raw, "\n") {
		line = strings.TrimSpace(line)
		eq := strings.Index(line, "=")
		if eq < 0 {
			continue
		}
		name := strings.TrimSpace(line[:eq])
		pairs[name] = strings.TrimSpace(line[eq+1:])
	}
	return pairs
}
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string, db *ComponentStatusDB) {
|
||||
if snap == nil || strings.TrimSpace(baseDir) == "" {
|
||||
return
|
||||
}
|
||||
@@ -28,6 +28,8 @@ func applyLatestSATStatuses(snap *schema.HardwareSnapshot, baseDir string) {
|
||||
if summary, ok := loadLatestSATSummary(baseDir, "storage-"); ok {
|
||||
applyStorageSAT(snap.Storage, summary)
|
||||
}
|
||||
// Apply unified component status DB — overlaid last so it can only upgrade severity.
|
||||
applyComponentStatusDB(snap, db)
|
||||
}
|
||||
|
||||
type satSummary struct {
|
||||
@@ -206,6 +208,86 @@ func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
func applyComponentStatusDB(snap *schema.HardwareSnapshot, db *ComponentStatusDB) {
|
||||
if snap == nil || db == nil {
|
||||
return
|
||||
}
|
||||
for _, rec := range db.All() {
|
||||
key := rec.ComponentKey
|
||||
status := dbStatusToSATStatus(rec.Status)
|
||||
if status == "" {
|
||||
continue
|
||||
}
|
||||
detail := rec.ErrorSummary
|
||||
ts := rec.LastChangedAt.UTC().Format("2006-01-02T15:04:05Z")
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(key, "pcie:"):
|
||||
bdf := strings.TrimPrefix(key, "pcie:")
|
||||
bdf = strings.TrimPrefix(bdf, "gpu:") // strip sub-type if present
|
||||
// bdf may be empty (e.g. "pcie:gpu:nvidia") — skip BDF matching
|
||||
if sanitizeBDFForLookup(bdf) == "" {
|
||||
break
|
||||
}
|
||||
normalized := sanitizeBDFForLookup(bdf)
|
||||
for i := range snap.PCIeDevices {
|
||||
if snap.PCIeDevices[i].BDF == nil {
|
||||
continue
|
||||
}
|
||||
if sanitizeBDFForLookup(*snap.PCIeDevices[i].BDF) == normalized {
|
||||
mergeComponentStatus(&snap.PCIeDevices[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "storage:"):
|
||||
devName := strings.TrimPrefix(key, "storage:")
|
||||
if devName == "all" {
|
||||
for i := range snap.Storage {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
} else {
|
||||
for i := range snap.Storage {
|
||||
linuxDev, _ := snap.Storage[i].Telemetry["linux_device"].(string)
|
||||
if filepath.Base(strings.TrimSpace(linuxDev)) == devName {
|
||||
mergeComponentStatus(&snap.Storage[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
case strings.HasPrefix(key, "memory:"):
|
||||
for i := range snap.Memory {
|
||||
mergeComponentStatus(&snap.Memory[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
case strings.HasPrefix(key, "cpu:"):
|
||||
for i := range snap.CPUs {
|
||||
mergeComponentStatus(&snap.CPUs[i].HardwareComponentStatus, ts, status, detail)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dbStatusToSATStatus validates a status string read from the component
// status DB before it is handed to mergeComponentStatus.
//
// Surrounding whitespace is ignored. If the trimmed value is one of "OK",
// "Warning", "Critical" or "Unknown", that trimmed value is returned;
// otherwise the result is "". Returning the trimmed form (rather than the
// input as given) ensures callers always receive one of the four exact
// strings.
func dbStatusToSATStatus(s string) string {
	switch status := strings.TrimSpace(s); status {
	case "OK", "Warning", "Critical", "Unknown":
		return status
	default:
		return ""
	}
}
|
||||
|
||||
// sanitizeBDFForLookup brings a PCIe BDF address into a comparable form: the
// input is trimmed and lower-cased, and an address with exactly one ":" gets
// the "0000:" domain prefix ("c8:00.0" becomes "0000:c8:00.0"). Other
// non-empty values are returned trimmed and lower-cased without further
// checks. The result is "" for an empty value, for the literal "gpu", and for
// anything containing a space or tab.
func sanitizeBDFForLookup(bdf string) string {
	addr := strings.ToLower(strings.TrimSpace(bdf))
	switch {
	case addr == "", addr == "gpu":
		return ""
	case strings.ContainsAny(addr, " \t"):
		return ""
	case strings.Count(addr, ":") == 1:
		return "0000:" + addr
	}
	return addr
}
|
||||
|
||||
func ptrString(v *string) string {
|
||||
if v == nil {
|
||||
return ""
|
||||
|
||||
@@ -23,7 +23,7 @@ func TestApplyLatestSATStatusesMarksStorageByDevice(t *testing.T) {
|
||||
usb := schema.HardwareStorage{Telemetry: map[string]any{"linux_device": "/dev/sda"}}
|
||||
snap := schema.HardwareSnapshot{Storage: []schema.HardwareStorage{nvme, usb}}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.Storage[0].Status == nil || *snap.Storage[0].Status != "OK" {
|
||||
t.Fatalf("nvme status=%v want OK", snap.Storage[0].Status)
|
||||
@@ -53,7 +53,7 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
}},
|
||||
}
|
||||
|
||||
applyLatestSATStatuses(&snap, baseDir)
|
||||
applyLatestSATStatuses(&snap, baseDir, nil)
|
||||
|
||||
if snap.PCIeDevices[0].Status == nil || *snap.PCIeDevices[0].Status != "Critical" {
|
||||
t.Fatalf("gpu status=%v want Critical", snap.PCIeDevices[0].Status)
|
||||
|
||||
@@ -27,15 +27,39 @@ var supportBundleCommands = []struct {
|
||||
cmd []string
|
||||
}{
|
||||
{name: "system/uname.txt", cmd: []string{"uname", "-a"}},
|
||||
{name: "system/cmdline.txt", cmd: []string{"cat", "/proc/cmdline"}},
|
||||
{name: "system/lsmod.txt", cmd: []string{"lsmod"}},
|
||||
{name: "system/lspci-nn.txt", cmd: []string{"lspci", "-nn"}},
|
||||
{name: "system/lspci-vvv.txt", cmd: []string{"lspci", "-vvv"}},
|
||||
{name: "system/ip-addr.txt", cmd: []string{"ip", "addr"}},
|
||||
{name: "system/ip-route.txt", cmd: []string{"ip", "route"}},
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg-tail.txt", cmd: []string{"sh", "-c", "dmesg | tail -n 200"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
dev=$(basename "$d")
|
||||
echo "=== $dev ==="
|
||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
done
|
||||
done
|
||||
`}},
|
||||
}
|
||||
|
||||
var supportBundleOptionalFiles = []struct {
|
||||
name string
|
||||
src string
|
||||
}{
|
||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
}
|
||||
|
||||
const supportBundleGlob = "bee-support-*.tar.gz"
|
||||
|
||||
func BuildSupportBundle(exportDir string) (string, error) {
|
||||
exportDir = strings.TrimSpace(exportDir)
|
||||
if exportDir == "" {
|
||||
@@ -75,6 +99,9 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
}
|
||||
for _, item := range supportBundleOptionalFiles {
|
||||
_ = copyOptionalFile(item.src, filepath.Join(stageRoot, item.name))
|
||||
}
|
||||
if err := writeManifest(filepath.Join(stageRoot, "manifest.txt"), exportDir, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -86,34 +113,64 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
func LatestSupportBundlePath() (string, error) {
|
||||
return latestSupportBundlePath(os.TempDir())
|
||||
}
|
||||
|
||||
func cleanupOldSupportBundles(dir string) error {
|
||||
matches, err := filepath.Glob(filepath.Join(dir, "bee-support-*.tar.gz"))
|
||||
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
type entry struct {
|
||||
path string
|
||||
mod time.Time
|
||||
entries := supportBundleEntries(matches)
|
||||
for path, mod := range entries {
|
||||
if time.Since(mod) > 24*time.Hour {
|
||||
_ = os.Remove(path)
|
||||
delete(entries, path)
|
||||
}
|
||||
}
|
||||
list := make([]entry, 0, len(matches))
|
||||
ordered := orderSupportBundles(entries)
|
||||
if len(ordered) > 3 {
|
||||
for _, old := range ordered[3:] {
|
||||
_ = os.Remove(old)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func latestSupportBundlePath(dir string) (string, error) {
|
||||
matches, err := filepath.Glob(filepath.Join(dir, supportBundleGlob))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
ordered := orderSupportBundles(supportBundleEntries(matches))
|
||||
if len(ordered) == 0 {
|
||||
return "", os.ErrNotExist
|
||||
}
|
||||
return ordered[0], nil
|
||||
}
|
||||
|
||||
func supportBundleEntries(matches []string) map[string]time.Time {
|
||||
entries := make(map[string]time.Time, len(matches))
|
||||
for _, match := range matches {
|
||||
info, err := os.Stat(match)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if time.Since(info.ModTime()) > 24*time.Hour {
|
||||
_ = os.Remove(match)
|
||||
continue
|
||||
}
|
||||
list = append(list, entry{path: match, mod: info.ModTime()})
|
||||
entries[match] = info.ModTime()
|
||||
}
|
||||
sort.Slice(list, func(i, j int) bool { return list[i].mod.After(list[j].mod) })
|
||||
if len(list) > 3 {
|
||||
for _, old := range list[3:] {
|
||||
_ = os.Remove(old.path)
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func orderSupportBundles(entries map[string]time.Time) []string {
|
||||
ordered := make([]string, 0, len(entries))
|
||||
for path := range entries {
|
||||
ordered = append(ordered, path)
|
||||
}
|
||||
return nil
|
||||
sort.Slice(ordered, func(i, j int) bool {
|
||||
return entries[ordered[i]].After(entries[ordered[j]])
|
||||
})
|
||||
return ordered
|
||||
}
|
||||
|
||||
func writeJournalDump(dst string) error {
|
||||
@@ -152,6 +209,24 @@ func writeCommandOutput(dst string, cmd []string) error {
|
||||
return os.WriteFile(dst, raw, 0644)
|
||||
}
|
||||
|
||||
// copyOptionalFile copies the file at src to dst, creating dst's parent
// directories as needed (mode 0755).
//
// It returns an error if src cannot be opened, the destination directory or
// file cannot be created, the copy fails, or the destination cannot be
// closed. The close error matters: buffered data may only be flushed on
// Close, so a failure there means dst is incomplete. dst is not created when
// src cannot be opened.
func copyOptionalFile(src, dst string) (err error) {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	// Read-only handle: a Close failure here cannot lose data.
	defer in.Close()

	if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
		return err
	}
	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	defer func() {
		// Surface the close error unless an earlier error already explains
		// the failure.
		if closeErr := out.Close(); err == nil {
			err = closeErr
		}
	}()

	_, err = io.Copy(out, in)
	return err
}
|
||||
|
||||
func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
if err := os.MkdirAll(filepath.Dir(dst), 0755); err != nil {
|
||||
return err
|
||||
@@ -215,7 +290,7 @@ func copyDirContents(srcDir, dstDir string) error {
|
||||
}
|
||||
|
||||
func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
return copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
if err := copyDirContentsFiltered(srcDir, dstDir, func(rel string, info os.FileInfo) bool {
|
||||
cleanRel := filepath.ToSlash(strings.TrimPrefix(filepath.Clean(rel), "./"))
|
||||
if cleanRel == "" {
|
||||
return true
|
||||
@@ -227,7 +302,25 @@ func copyExportDirForSupportBundle(srcDir, dstDir string) error {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
return normalizeSupportBundleAuditJSON(filepath.Join(dstDir, "bee-audit.json"))
|
||||
}
|
||||
|
||||
func normalizeSupportBundleAuditJSON(path string) error {
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
normalized, err := ApplySATOverlay(data)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return os.WriteFile(path, normalized, 0644)
|
||||
}
|
||||
|
||||
func copyDirContentsFiltered(srcDir, dstDir string, keep func(rel string, info os.FileInfo) bool) error {
|
||||
|
||||
@@ -1,10 +1,18 @@
|
||||
package collector
|
||||
|
||||
import "bee/audit/internal/schema"
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func NormalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
finalizeSnapshot(snap, collectedAt)
|
||||
}
|
||||
|
||||
func finalizeSnapshot(snap *schema.HardwareSnapshot, collectedAt string) {
|
||||
snap.Memory = filterMemory(snap.Memory)
|
||||
snap.Storage = filterStorage(snap.Storage)
|
||||
snap.PCIeDevices = filterPCIe(snap.PCIeDevices)
|
||||
snap.PowerSupplies = filterPSUs(snap.PowerSupplies)
|
||||
|
||||
setComponentStatusMetadata(snap, collectedAt)
|
||||
@@ -33,11 +41,25 @@ func filterStorage(disks []schema.HardwareStorage) []schema.HardwareStorage {
|
||||
if disk.SerialNumber == nil || *disk.SerialNumber == "" {
|
||||
continue
|
||||
}
|
||||
if disk.Model != nil && isVirtualHDiskModel(*disk.Model) {
|
||||
continue
|
||||
}
|
||||
out = append(out, disk)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPCIe(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
out := make([]schema.HardwarePCIeDevice, 0, len(devs))
|
||||
for _, dev := range devs {
|
||||
if dev.DeviceClass != nil && strings.Contains(strings.ToLower(strings.TrimSpace(*dev.DeviceClass)), "co-processor") {
|
||||
continue
|
||||
}
|
||||
out = append(out, dev)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func filterPSUs(psus []schema.HardwarePowerSupply) []schema.HardwarePowerSupply {
|
||||
out := make([]schema.HardwarePowerSupply, 0, len(psus))
|
||||
for _, psu := range psus {
|
||||
|
||||
@@ -10,6 +10,10 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
present := true
|
||||
status := statusOK
|
||||
serial := "SN-1"
|
||||
virtualModel := "Virtual HDisk1"
|
||||
realModel := "PASCARI"
|
||||
coProcessorClass := "Co-processor"
|
||||
gpuClass := "VideoController"
|
||||
|
||||
snap := schema.HardwareSnapshot{
|
||||
Memory: []schema.HardwareMemory{
|
||||
@@ -17,9 +21,15 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
{Present: &present, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
Storage: []schema.HardwareStorage{
|
||||
{Model: &virtualModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{Model: &realModel, SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{
|
||||
{DeviceClass: &coProcessorClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{DeviceClass: &gpuClass, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
},
|
||||
PowerSupplies: []schema.HardwarePowerSupply{
|
||||
{SerialNumber: &serial, HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
{HardwareComponentStatus: schema.HardwareComponentStatus{Status: &status}},
|
||||
@@ -31,9 +41,12 @@ func TestFinalizeSnapshotFiltersComponentsWithoutRequiredSerials(t *testing.T) {
|
||||
if len(snap.Memory) != 1 || snap.Memory[0].StatusCheckedAt == nil || *snap.Memory[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("memory finalize mismatch: %+v", snap.Memory)
|
||||
}
|
||||
if len(snap.Storage) != 1 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
if len(snap.Storage) != 2 || snap.Storage[0].StatusCheckedAt == nil || *snap.Storage[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("storage finalize mismatch: %+v", snap.Storage)
|
||||
}
|
||||
if len(snap.PCIeDevices) != 1 || snap.PCIeDevices[0].DeviceClass == nil || *snap.PCIeDevices[0].DeviceClass != gpuClass {
|
||||
t.Fatalf("pcie finalize mismatch: %+v", snap.PCIeDevices)
|
||||
}
|
||||
if len(snap.PowerSupplies) != 1 || snap.PowerSupplies[0].StatusCheckedAt == nil || *snap.PowerSupplies[0].StatusCheckedAt != collectedAt {
|
||||
t.Fatalf("psu finalize mismatch: %+v", snap.PowerSupplies)
|
||||
}
|
||||
|
||||
@@ -13,14 +13,18 @@ import (
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
BDF string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
PowerW *float64
|
||||
ECCUncorrected *int64
|
||||
ECCCorrected *int64
|
||||
HWSlowdown *bool
|
||||
PCIeLinkGenCurrent *int
|
||||
PCIeLinkGenMax *int
|
||||
PCIeLinkWidthCur *int
|
||||
PCIeLinkWidthMax *int
|
||||
}
|
||||
|
||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
if len(rec) == 0 {
|
||||
continue
|
||||
}
|
||||
if len(rec) < 9 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
||||
if len(rec) < 13 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
||||
}
|
||||
|
||||
bdf := normalizePCIeBDF(rec[1])
|
||||
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
}
|
||||
|
||||
info := nvidiaGPUInfo{
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
||||
}
|
||||
result[bdf] = info
|
||||
}
|
||||
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
|
||||
return &n
|
||||
}
|
||||
|
||||
// parseMaybeInt parses v as a base-10 int after trimming surrounding
// whitespace. It returns nil for an empty value, for the placeholders "n/a",
// "not supported" and "[not supported]" (matched case-insensitively), and
// for anything strconv.Atoi rejects.
func parseMaybeInt(v string) *int {
	trimmed := strings.TrimSpace(v)
	if trimmed == "" {
		return nil
	}
	for _, placeholder := range [...]string{"n/a", "not supported", "[not supported]"} {
		if strings.EqualFold(trimmed, placeholder) {
			return nil
		}
	}
	parsed, err := strconv.Atoi(trimmed)
	if err != nil {
		return nil
	}
	return &parsed
}
|
||||
|
||||
// pcieLinkGenLabel formats a PCIe generation number as "Gen<N>", for example
// 4 becomes "Gen4".
func pcieLinkGenLabel(gen int) string {
	label := fmt.Sprintf("Gen%d", gen)
	return label
}
|
||||
|
||||
func parseMaybeBool(v string) *bool {
|
||||
v = strings.TrimSpace(strings.ToLower(v))
|
||||
switch v {
|
||||
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
||||
if info.HWSlowdown != nil {
|
||||
dev.HWSlowdown = info.HWSlowdown
|
||||
}
|
||||
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||
// knows the negotiated speed regardless of the current power state.
|
||||
if info.PCIeLinkGenCurrent != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||
dev.LinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkGenMax != nil {
|
||||
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||
dev.MaxLinkSpeed = &s
|
||||
}
|
||||
if info.PCIeLinkWidthCur != nil {
|
||||
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||
}
|
||||
if info.PCIeLinkWidthMax != nil {
|
||||
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse failed: %v", err)
|
||||
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||
}
|
||||
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||
}
|
||||
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePCIeBDF(t *testing.T) {
|
||||
|
||||
@@ -59,6 +59,7 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
"host bridge",
|
||||
"isa bridge",
|
||||
"pci bridge",
|
||||
"co-processor",
|
||||
"performance counter",
|
||||
"performance counters",
|
||||
"ram memory",
|
||||
|
||||
@@ -19,6 +19,7 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
{name: "audio", class: "Audio device", want: false},
|
||||
{name: "host bridge", class: "Host bridge", want: false},
|
||||
{name: "pci bridge", class: "PCI bridge", want: false},
|
||||
{name: "co-processor", class: "Co-processor", want: false},
|
||||
{name: "smbus", class: "SMBus", want: false},
|
||||
{name: "perf", class: "Performance counters", want: false},
|
||||
{name: "non essential instrumentation", class: "Non-Essential Instrumentation", want: false},
|
||||
@@ -76,6 +77,20 @@ func TestParseLspci_filtersAMDChipsetNoise(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLspci_filtersCoProcessors(t *testing.T) {
|
||||
input := "" +
|
||||
"Slot:\t0000:01:00.0\nClass:\tCo-processor\nVendor:\tIntel Corporation\nDevice:\t402xx Series QAT\n\n" +
|
||||
"Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
devs := parseLspci(input)
|
||||
if len(devs) != 1 {
|
||||
t.Fatalf("expected 1 remaining device, got %d", len(devs))
|
||||
}
|
||||
if devs[0].Model == nil || *devs[0].Model != "H100" {
|
||||
t.Fatalf("unexpected remaining device: %+v", devs[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestPCIeJSONUsesSlotNotBDF(t *testing.T) {
|
||||
input := "Slot:\t0000:65:00.0\nClass:\tVGA compatible controller\nVendor:\tNVIDIA Corporation\nDevice:\tH100\n\n"
|
||||
|
||||
|
||||
@@ -77,11 +77,28 @@ func discoverStorageDevices() []lsblkDevice {
|
||||
if dev.Type != "disk" {
|
||||
continue
|
||||
}
|
||||
if isVirtualBMCDisk(dev) {
|
||||
slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
|
||||
continue
|
||||
}
|
||||
disks = append(disks, dev)
|
||||
}
|
||||
return disks
|
||||
}
|
||||
|
||||
// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
|
||||
// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
|
||||
// These have zero reported size, a generic fake serial, and a model name that
|
||||
// starts with "Virtual HDisk".
|
||||
func isVirtualBMCDisk(dev lsblkDevice) bool {
|
||||
return isVirtualHDiskModel(dev.Model)
|
||||
}
|
||||
|
||||
// isVirtualHDiskModel reports whether model begins with "virtual hdisk",
// ignoring case and surrounding whitespace.
func isVirtualHDiskModel(model string) bool {
	normalized := strings.TrimSpace(model)
	normalized = strings.ToLower(normalized)
	return strings.HasPrefix(normalized, "virtual hdisk")
}
|
||||
|
||||
func lsblkDevices() []lsblkDevice {
|
||||
out, err := exec.Command("lsblk", "-J", "-d",
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||
|
||||
139
audit/internal/platform/error_patterns.go
Normal file
139
audit/internal/platform/error_patterns.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package platform
|
||||
|
||||
import "regexp"
|
||||
|
||||
// ErrorPattern describes a kernel log pattern that indicates a hardware error.
|
||||
// Add new patterns by appending to HardwareErrorPatterns — no other code changes needed.
|
||||
type ErrorPattern struct {
|
||||
// Name is a short machine-readable label for logging and deduplication.
|
||||
Name string
|
||||
// Re is the compiled regular expression matched against a single kmsg line.
|
||||
Re *regexp.Regexp
|
||||
// Category groups related errors: "gpu", "pcie", "storage", "mce", "memory", "cpu".
|
||||
Category string
|
||||
// Severity is "warning" for recoverable/uncertain faults, "critical" for definitive failures.
|
||||
Severity string
|
||||
// BDFGroup is the capture group index (1-based) that contains a PCIe BDF address
|
||||
// (e.g. "0000:c8:00.0"). 0 means no BDF is captured by this pattern.
|
||||
BDFGroup int
|
||||
// DevGroup is the capture group index (1-based) that contains a device name
|
||||
// (e.g. "sda", "nvme0"). 0 means no device name is captured by this pattern.
|
||||
DevGroup int
|
||||
}
|
||||
|
||||
// HardwareErrorPatterns is the global list of kernel log patterns that indicate hardware faults.
|
||||
// To add a new pattern: append a new ErrorPattern struct to this slice.
|
||||
var HardwareErrorPatterns = []ErrorPattern{
|
||||
// ── GPU / NVIDIA ────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "nvidia-rminitadapter",
|
||||
Re: mustPat(`(?i)NVRM:.*GPU\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-msi-fail",
|
||||
Re: mustPat(`(?i)NVRM:.*Failed to enable MSI`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvidia-aer",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-xid",
|
||||
Re: mustPat(`(?i)NVRM:.*Xid.*\b([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "pcie-aer",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-uncorrectable",
|
||||
Re: mustPat(`(?i)([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Uu]ncorrectable`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-link-down",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*[Ll]ink.*[Dd]own`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
|
||||
// ── Storage ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "blk-io-error",
|
||||
Re: mustPat(`(?i)blk_update_request.*I/O error.*dev\s+(\w+)`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvme-timeout",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*timeout`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "scsi-failed",
|
||||
Re: mustPat(`(?i)sd\s+[\da-f:]+:.*FAILED`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "nvme-reset",
|
||||
Re: mustPat(`(?i)nvme\s+(\w+):.*reset`),
|
||||
Category: "storage",
|
||||
Severity: "warning",
|
||||
DevGroup: 1,
|
||||
},
|
||||
|
||||
// ── Machine Check Exceptions ────────────────────────────────────────────────
|
||||
{
|
||||
Name: "mce-hardware-error",
|
||||
Re: mustPat(`(?i)mce:.*[Hh]ardware [Ee]rror`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "mce-corrected",
|
||||
Re: mustPat(`(?i)mce:.*[Cc]orrected`),
|
||||
Category: "mce",
|
||||
Severity: "warning",
|
||||
},
|
||||
|
||||
// ── Memory ─────────────────────────────────────────────────────────────────
|
||||
{
|
||||
Name: "edac-ue",
|
||||
Re: mustPat(`(?i)EDAC.*[Uu]ncorrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
{
|
||||
Name: "edac-ce",
|
||||
Re: mustPat(`(?i)EDAC.*[Cc]orrectable`),
|
||||
Category: "memory",
|
||||
Severity: "warning",
|
||||
},
|
||||
}
|
||||
|
||||
// mustPat compiles s with regexp.MustCompile, which panics when s is not a
// valid regular expression.
func mustPat(s string) *regexp.Regexp {
	compiled := regexp.MustCompile(s)
	return compiled
}
|
||||
@@ -11,10 +11,10 @@ import (
|
||||
|
||||
// InstallDisk describes a candidate disk for installation.
|
||||
type InstallDisk struct {
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
Device string // e.g. /dev/sda
|
||||
Model string
|
||||
Size string // human-readable, e.g. "500G"
|
||||
SizeBytes int64 // raw byte count from lsblk
|
||||
MountedParts []string // partition mount points currently active
|
||||
}
|
||||
|
||||
@@ -117,6 +117,61 @@ func findLiveBootDevice() string {
|
||||
return "/dev/" + strings.TrimSpace(string(out2))
|
||||
}
|
||||
|
||||
// mountSource runs `findmnt -n -o SOURCE <target>` and returns its output
// with surrounding whitespace removed. It returns "" when the command fails.
func mountSource(target string) string {
	findmnt := exec.Command("findmnt", "-n", "-o", "SOURCE", target)
	raw, err := findmnt.Output()
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||
|
||||
// mountFSType runs `findmnt -n -o FSTYPE <target>` and returns its output
// with surrounding whitespace removed. It returns "" when the command fails.
func mountFSType(target string) string {
	findmnt := exec.Command("findmnt", "-n", "-o", "FSTYPE", target)
	raw, err := findmnt.Output()
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||
|
||||
// blockDeviceType runs `lsblk -dn -o TYPE <device>` and returns its output
// with surrounding whitespace removed. It returns "" without running lsblk
// when device is empty or whitespace-only, and "" when the command fails.
func blockDeviceType(device string) string {
	if len(strings.TrimSpace(device)) == 0 {
		return ""
	}
	lsblk := exec.Command("lsblk", "-dn", "-o", "TYPE", device)
	raw, err := lsblk.Output()
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||
|
||||
// blockDeviceTransport runs `lsblk -dn -o TRAN <device>` and returns its
// output with surrounding whitespace removed. It returns "" without running
// lsblk when device is empty or whitespace-only, and "" when the command
// fails.
func blockDeviceTransport(device string) string {
	if len(strings.TrimSpace(device)) == 0 {
		return ""
	}
	lsblk := exec.Command("lsblk", "-dn", "-o", "TRAN", device)
	raw, err := lsblk.Output()
	if err == nil {
		return strings.TrimSpace(string(raw))
	}
	return ""
}
|
||||
|
||||
// inferLiveBootKind classifies the live boot medium from its filesystem
// type, mount source, block device type and transport. All inputs are
// trimmed before comparison. The checks run in this order and the first
// match wins:
//
//   - fsType equals "tmpfs" (case-insensitive)   -> "ram"
//   - deviceType equals "rom" (case-insensitive) -> "cdrom"
//   - transport equals "usb" (case-insensitive)  -> "usb"
//   - source starts with "/dev/sr"               -> "cdrom"
//   - source starts with "/dev/"                 -> "disk"
//   - anything else                              -> "unknown"
func inferLiveBootKind(fsType, source, deviceType, transport string) string {
	fs := strings.TrimSpace(fsType)
	src := strings.TrimSpace(source)
	devType := strings.TrimSpace(deviceType)
	bus := strings.TrimSpace(transport)

	if strings.EqualFold(fs, "tmpfs") {
		return "ram"
	}
	if strings.EqualFold(devType, "rom") {
		return "cdrom"
	}
	if strings.EqualFold(bus, "usb") {
		return "usb"
	}
	if strings.HasPrefix(src, "/dev/sr") {
		return "cdrom"
	}
	if strings.HasPrefix(src, "/dev/") {
		return "disk"
	}
	return "unknown"
}
|
||||
|
||||
// MinInstallBytes returns the minimum recommended disk size for installation:
|
||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||
// Returns 0 if the squashfs is not available (non-live environment).
|
||||
|
||||
@@ -12,11 +12,40 @@ import (
|
||||
)
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
out, err := exec.Command("findmnt", "-n", "-o", "FSTYPE", "/run/live/medium").Output()
|
||||
if err != nil {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
if fsType == "" {
|
||||
return toramActive()
|
||||
}
|
||||
return strings.TrimSpace(string(out)) == "tmpfs"
|
||||
return strings.EqualFold(fsType, "tmpfs")
|
||||
}
|
||||
|
||||
func (s *System) LiveBootSource() LiveBootSource {
|
||||
fsType := mountFSType("/run/live/medium")
|
||||
source := mountSource("/run/live/medium")
|
||||
device := findLiveBootDevice()
|
||||
status := LiveBootSource{
|
||||
InRAM: strings.EqualFold(fsType, "tmpfs"),
|
||||
Source: source,
|
||||
Device: device,
|
||||
}
|
||||
if fsType == "" && source == "" && device == "" {
|
||||
if toramActive() {
|
||||
status.InRAM = true
|
||||
status.Kind = "ram"
|
||||
status.Source = "tmpfs"
|
||||
return status
|
||||
}
|
||||
status.Kind = "unknown"
|
||||
return status
|
||||
}
|
||||
status.Kind = inferLiveBootKind(fsType, source, blockDeviceType(device), blockDeviceTransport(device))
|
||||
if status.Kind == "" {
|
||||
status.Kind = "unknown"
|
||||
}
|
||||
if status.InRAM && strings.TrimSpace(status.Source) == "" {
|
||||
status.Source = "tmpfs"
|
||||
}
|
||||
return status
|
||||
}
|
||||
|
||||
func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) error {
|
||||
|
||||
28
audit/internal/platform/install_to_ram_test.go
Normal file
28
audit/internal/platform/install_to_ram_test.go
Normal file
@@ -0,0 +1,28 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestInferLiveBootKind(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
fsType string
|
||||
source string
|
||||
deviceType string
|
||||
transport string
|
||||
want string
|
||||
}{
|
||||
{name: "ram tmpfs", fsType: "tmpfs", source: "/dev/shm/bee-live", want: "ram"},
|
||||
{name: "usb disk", source: "/dev/sdb1", deviceType: "disk", transport: "usb", want: "usb"},
|
||||
{name: "cdrom rom", source: "/dev/sr0", deviceType: "rom", want: "cdrom"},
|
||||
{name: "disk sata", source: "/dev/nvme0n1p1", deviceType: "disk", transport: "nvme", want: "disk"},
|
||||
{name: "unknown", source: "overlay", want: "unknown"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
got := inferLiveBootKind(tc.fsType, tc.source, tc.deviceType, tc.transport)
|
||||
if got != tc.want {
|
||||
t.Fatalf("inferLiveBootKind(%q,%q,%q,%q)=%q want %q", tc.fsType, tc.source, tc.deviceType, tc.transport, got, tc.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
64
audit/internal/platform/kill_workers.go
Normal file
64
audit/internal/platform/kill_workers.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// workerPatterns are substrings matched against /proc/<pid>/cmdline to identify
|
||||
// bee test worker processes that should be killed by KillTestWorkers.
|
||||
var workerPatterns = []string{
|
||||
"bee-gpu-burn",
|
||||
"stress-ng",
|
||||
"stressapptest",
|
||||
"memtester",
|
||||
}
|
||||
|
||||
// KilledProcess describes a process that was sent SIGKILL.
|
||||
type KilledProcess struct {
|
||||
PID int `json:"pid"`
|
||||
Name string `json:"name"`
|
||||
}
|
||||
|
||||
// KillTestWorkers scans /proc for running test worker processes and sends
|
||||
// SIGKILL to each one found. It returns a list of killed processes.
|
||||
// Errors for individual processes (e.g. already exited) are silently ignored.
|
||||
func KillTestWorkers() []KilledProcess {
|
||||
entries, err := os.ReadDir("/proc")
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var killed []KilledProcess
|
||||
for _, e := range entries {
|
||||
if !e.IsDir() {
|
||||
continue
|
||||
}
|
||||
pid, err := strconv.Atoi(e.Name())
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
cmdline, err := os.ReadFile(fmt.Sprintf("/proc/%d/cmdline", pid))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// /proc/*/cmdline uses NUL bytes as argument separators.
|
||||
args := strings.SplitN(strings.ReplaceAll(string(cmdline), "\x00", " "), " ", 2)
|
||||
exe := strings.TrimSpace(args[0])
|
||||
base := exe
|
||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||
base = exe[idx+1:]
|
||||
}
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return killed
|
||||
}
|
||||
@@ -68,18 +68,20 @@ func SampleLiveMetrics() LiveMetricSample {
|
||||
|
||||
// sampleCPULoadPct reads two /proc/stat snapshots 200ms apart and returns
|
||||
// the overall CPU utilisation percentage.
|
||||
var cpuStatPrev [2]uint64 // [total, idle]
|
||||
|
||||
func sampleCPULoadPct() float64 {
|
||||
total, idle := readCPUStat()
|
||||
if total == 0 {
|
||||
total0, idle0 := readCPUStat()
|
||||
if total0 == 0 {
|
||||
return 0
|
||||
}
|
||||
prevTotal, prevIdle := cpuStatPrev[0], cpuStatPrev[1]
|
||||
cpuStatPrev = [2]uint64{total, idle}
|
||||
if prevTotal == 0 {
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
total1, idle1 := readCPUStat()
|
||||
if total1 == 0 {
|
||||
return 0
|
||||
}
|
||||
return cpuLoadPctBetween(total0, idle0, total1, idle1)
|
||||
}
|
||||
|
||||
func cpuLoadPctBetween(prevTotal, prevIdle, total, idle uint64) float64 {
|
||||
dt := float64(total - prevTotal)
|
||||
di := float64(idle - prevIdle)
|
||||
if dt <= 0 {
|
||||
|
||||
@@ -42,3 +42,53 @@ func TestCompactAmbientTempName(t *testing.T) {
|
||||
t.Fatalf("got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCPULoadPctBetween(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
prevTotal uint64
|
||||
prevIdle uint64
|
||||
total uint64
|
||||
idle uint64
|
||||
want float64
|
||||
}{
|
||||
{
|
||||
name: "busy half",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 90,
|
||||
want: 50,
|
||||
},
|
||||
{
|
||||
name: "fully busy",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 40,
|
||||
want: 100,
|
||||
},
|
||||
{
|
||||
name: "no progress",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 100,
|
||||
idle: 40,
|
||||
want: 0,
|
||||
},
|
||||
{
|
||||
name: "idle delta larger than total clamps to zero",
|
||||
prevTotal: 100,
|
||||
prevIdle: 40,
|
||||
total: 200,
|
||||
idle: 150,
|
||||
want: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range tests {
|
||||
if got := cpuLoadPctBetween(tc.prevTotal, tc.prevIdle, tc.total, tc.idle); got != tc.want {
|
||||
t.Fatalf("%s: cpuLoadPctBetween(...)=%v want %v", tc.name, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
203
audit/internal/platform/nvidia_stress.go
Normal file
203
audit/internal/platform/nvidia_stress.go
Normal file
@@ -0,0 +1,203 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func (s *System) RunNvidiaStressPack(ctx context.Context, baseDir string, opts NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
normalizeNvidiaStressOptions(&opts)
|
||||
|
||||
job, err := buildNvidiaStressJob(opts)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runAcceptancePackCtx(ctx, baseDir, nvidiaStressArchivePrefix(opts.Loader), []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-nvidia-smi-list.log", cmd: []string{"nvidia-smi", "-L"}},
|
||||
job,
|
||||
{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
func nvidiaStressArchivePrefix(loader string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||
case NvidiaStressLoaderJohn:
|
||||
return "gpu-nvidia-john"
|
||||
case NvidiaStressLoaderNCCL:
|
||||
return "gpu-nvidia-nccl"
|
||||
default:
|
||||
return "gpu-nvidia-burn"
|
||||
}
|
||||
}
|
||||
|
||||
func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
selected, err := resolveNvidiaGPUSelection(opts.GPUIndices, opts.ExcludeGPUIndices)
|
||||
if err != nil {
|
||||
return satJob{}, err
|
||||
}
|
||||
|
||||
loader := strings.TrimSpace(strings.ToLower(opts.Loader))
|
||||
switch loader {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
cmd := []string{
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-bee-gpu-burn.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
case NvidiaStressLoaderJohn:
|
||||
cmd := []string{
|
||||
"bee-john-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-john-gpu-stress.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
case NvidiaStressLoaderNCCL:
|
||||
cmd := []string{
|
||||
"bee-nccl-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
return satJob{
|
||||
name: "03-bee-nccl-gpu-stress.log",
|
||||
cmd: cmd,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
}, nil
|
||||
default:
|
||||
return satJob{}, fmt.Errorf("unknown NVIDIA stress loader %q", opts.Loader)
|
||||
}
|
||||
}
|
||||
|
||||
func normalizeNvidiaStressOptions(opts *NvidiaStressOptions) {
|
||||
if opts.DurationSec <= 0 {
|
||||
opts.DurationSec = 300
|
||||
}
|
||||
// SizeMB=0 means "auto" — bee-gpu-burn will query per-GPU memory at runtime.
|
||||
switch strings.TrimSpace(strings.ToLower(opts.Loader)) {
|
||||
case "", NvidiaStressLoaderBuiltin:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
case NvidiaStressLoaderJohn:
|
||||
opts.Loader = NvidiaStressLoaderJohn
|
||||
case NvidiaStressLoaderNCCL:
|
||||
opts.Loader = NvidiaStressLoaderNCCL
|
||||
default:
|
||||
opts.Loader = NvidiaStressLoaderBuiltin
|
||||
}
|
||||
opts.GPUIndices = dedupeSortedIndices(opts.GPUIndices)
|
||||
opts.ExcludeGPUIndices = dedupeSortedIndices(opts.ExcludeGPUIndices)
|
||||
}
|
||||
|
||||
func resolveNvidiaGPUSelection(include, exclude []int) ([]int, error) {
|
||||
all, err := listNvidiaGPUIndices()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(all) == 0 {
|
||||
return nil, fmt.Errorf("nvidia-smi found no NVIDIA GPUs")
|
||||
}
|
||||
|
||||
selected := all
|
||||
if len(include) > 0 {
|
||||
want := make(map[int]struct{}, len(include))
|
||||
for _, idx := range include {
|
||||
want[idx] = struct{}{}
|
||||
}
|
||||
selected = selected[:0]
|
||||
for _, idx := range all {
|
||||
if _, ok := want[idx]; ok {
|
||||
selected = append(selected, idx)
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(exclude) > 0 {
|
||||
skip := make(map[int]struct{}, len(exclude))
|
||||
for _, idx := range exclude {
|
||||
skip[idx] = struct{}{}
|
||||
}
|
||||
filtered := selected[:0]
|
||||
for _, idx := range selected {
|
||||
if _, ok := skip[idx]; ok {
|
||||
continue
|
||||
}
|
||||
filtered = append(filtered, idx)
|
||||
}
|
||||
selected = filtered
|
||||
}
|
||||
if len(selected) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA GPUs selected after applying filters")
|
||||
}
|
||||
out := append([]int(nil), selected...)
|
||||
sort.Ints(out)
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func listNvidiaGPUIndices() ([]int, error) {
|
||||
out, err := satExecCommand("nvidia-smi", "--query-gpu=index", "--format=csv,noheader,nounits").Output()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("nvidia-smi: %w", err)
|
||||
}
|
||||
var indices []int
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
idx, err := strconv.Atoi(line)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
indices = append(indices, idx)
|
||||
}
|
||||
return dedupeSortedIndices(indices), nil
|
||||
}
|
||||
|
||||
// dedupeSortedIndices returns the non-negative values of the input in
// ascending order with duplicates removed. An empty input yields nil; the
// input slice itself is not modified.
func dedupeSortedIndices(values []int) []int {
	if len(values) == 0 {
		return nil
	}

	// Copy the usable values first so sorting never touches the caller's slice.
	kept := make([]int, 0, len(values))
	for _, v := range values {
		if v >= 0 {
			kept = append(kept, v)
		}
	}
	sort.Ints(kept)

	// After sorting, duplicates are adjacent: compact them in place.
	uniq := kept[:0]
	for _, v := range kept {
		if n := len(uniq); n > 0 && uniq[n-1] == v {
			continue
		}
		uniq = append(uniq, v)
	}
	return uniq
}
|
||||
|
||||
// joinIndexList renders values as decimal numbers separated by commas, with
// no spaces. An empty or nil slice gives the empty string.
func joinIndexList(values []int) string {
	var b strings.Builder
	for i, v := range values {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(strconv.Itoa(v))
	}
	return b.String()
}
|
||||
@@ -10,9 +10,11 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -24,7 +26,8 @@ type PlatformStressCycle struct {
|
||||
|
||||
// PlatformStressOptions controls the thermal cycling test.
|
||||
type PlatformStressOptions struct {
|
||||
Cycles []PlatformStressCycle
|
||||
Cycles []PlatformStressCycle
|
||||
Components []string // if empty: run all; values: "cpu", "gpu"
|
||||
}
|
||||
|
||||
// platformStressRow is one second of telemetry.
|
||||
@@ -66,8 +69,11 @@ func (s *System) RunPlatformStress(
|
||||
return "", fmt.Errorf("mkdir run dir: %w", err)
|
||||
}
|
||||
|
||||
hasCPU := len(opts.Components) == 0 || containsComponent(opts.Components, "cpu")
|
||||
hasGPU := len(opts.Components) == 0 || containsComponent(opts.Components, "gpu")
|
||||
|
||||
vendor := s.DetectGPUVendor()
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s", len(opts.Cycles), vendor))
|
||||
logFunc(fmt.Sprintf("Platform Thermal Cycling — %d cycle(s), GPU vendor: %s, cpu=%v gpu=%v", len(opts.Cycles), vendor, hasCPU, hasGPU))
|
||||
|
||||
var rows []platformStressRow
|
||||
start := time.Now()
|
||||
@@ -86,27 +92,31 @@ func (s *System) RunPlatformStress(
|
||||
var wg sync.WaitGroup
|
||||
|
||||
// CPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
if hasCPU {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
cpuCmd, err := buildCPUStressCmd(loadCtx)
|
||||
if err != nil {
|
||||
logFunc("CPU stress: " + err.Error())
|
||||
return
|
||||
}
|
||||
_ = cpuCmd.Wait() // exits when loadCtx times out (SIGKILL)
|
||||
}()
|
||||
}
|
||||
|
||||
// GPU stress
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
if hasGPU {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
gpuCmd := buildGPUStressCmd(loadCtx, vendor)
|
||||
if gpuCmd == nil {
|
||||
return
|
||||
}
|
||||
_ = gpuCmd.Wait()
|
||||
}()
|
||||
}
|
||||
|
||||
// Monitoring goroutine for load phase
|
||||
loadRows := collectPhase(loadCtx, cycleNum, "load", start)
|
||||
@@ -374,10 +384,17 @@ func buildCPUStressCmd(ctx context.Context) (*exec.Cmd, error) {
|
||||
return nil, fmt.Errorf("stressapptest not found: %w", err)
|
||||
}
|
||||
// Use a very long duration; the context timeout will kill it at the right time.
|
||||
cmd := exec.CommandContext(ctx, path, "-s", "86400", "-W", "--cc_test")
|
||||
cmdArgs := []string{"-s", "86400", "-W", "--cc_test"}
|
||||
if threads := platformStressCPUThreads(); threads > 0 {
|
||||
cmdArgs = append(cmdArgs, "-m", strconv.Itoa(threads))
|
||||
}
|
||||
if mb := platformStressMemoryMB(); mb > 0 {
|
||||
cmdArgs = append(cmdArgs, "-M", strconv.Itoa(mb))
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, cmdArgs...)
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
if err := cmd.Start(); err != nil {
|
||||
if err := startLowPriorityCmd(cmd, 15); err != nil {
|
||||
return nil, fmt.Errorf("stressapptest start: %w", err)
|
||||
}
|
||||
return cmd, nil
|
||||
@@ -418,22 +435,74 @@ func buildAMDGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
cmd := exec.CommandContext(ctx, rvsPath, "-c", cfgFile)
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = cmd.Start()
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
func buildNvidiaGPUStressCmd(ctx context.Context) *exec.Cmd {
|
||||
path, err := satLookPath("bee-gpu-stress")
|
||||
path, err := satLookPath("bee-gpu-burn")
|
||||
if err != nil {
|
||||
path, err = satLookPath("bee-gpu-stress")
|
||||
}
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400", "--size-mb", "64")
|
||||
cmd := exec.CommandContext(ctx, path, "--seconds", "86400")
|
||||
cmd.Stdout = nil
|
||||
cmd.Stderr = nil
|
||||
_ = cmd.Start()
|
||||
_ = startLowPriorityCmd(cmd, 10)
|
||||
return cmd
|
||||
}
|
||||
|
||||
// startLowPriorityCmd starts cmd and then tries to set the scheduling
// priority of the new process to nice. A Start failure is returned to the
// caller; a failure to change the priority is ignored.
func startLowPriorityCmd(cmd *exec.Cmd, nice int) error {
	err := cmd.Start()
	if err != nil {
		return err
	}
	if proc := cmd.Process; proc != nil {
		// Best effort: the command is already running, so a Setpriority error
		// is deliberately dropped rather than reported.
		_ = syscall.Setpriority(syscall.PRIO_PROCESS, proc.Pid, nice)
	}
	return nil
}
|
||||
|
||||
func platformStressCPUThreads() int {
|
||||
if n := envInt("BEE_PLATFORM_STRESS_THREADS", 0); n > 0 {
|
||||
return n
|
||||
}
|
||||
cpus := runtime.NumCPU()
|
||||
switch {
|
||||
case cpus <= 2:
|
||||
return 1
|
||||
case cpus <= 8:
|
||||
return cpus - 1
|
||||
default:
|
||||
return cpus - 2
|
||||
}
|
||||
}
|
||||
|
||||
func platformStressMemoryMB() int {
|
||||
if mb := envInt("BEE_PLATFORM_STRESS_MB", 0); mb > 0 {
|
||||
return mb
|
||||
}
|
||||
free := freeMemBytes()
|
||||
if free <= 0 {
|
||||
return 0
|
||||
}
|
||||
mb := int((free * 60) / 100 / (1024 * 1024))
|
||||
if mb < 1024 {
|
||||
return 1024
|
||||
}
|
||||
return mb
|
||||
}
|
||||
|
||||
// containsComponent reports whether name occurs in components. The match is
// an exact, case-sensitive string comparison.
func containsComponent(components []string, name string) bool {
	for i := 0; i < len(components); i++ {
		if components[i] != name {
			continue
		}
		return true
	}
	return false
}
|
||||
|
||||
func packPlatformDir(dir, dest string) error {
|
||||
f, err := os.Create(dest)
|
||||
if err != nil {
|
||||
|
||||
34
audit/internal/platform/platform_stress_test.go
Normal file
34
audit/internal/platform/platform_stress_test.go
Normal file
@@ -0,0 +1,34 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"runtime"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestPlatformStressCPUThreadsOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "7")
|
||||
if got := platformStressCPUThreads(); got != 7 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want 7", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressCPUThreadsDefaultLeavesHeadroom(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_THREADS", "")
|
||||
got := platformStressCPUThreads()
|
||||
if got < 1 {
|
||||
t.Fatalf("platformStressCPUThreads=%d want >= 1", got)
|
||||
}
|
||||
if got > runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want <= NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
if runtime.NumCPU() > 2 && got >= runtime.NumCPU() {
|
||||
t.Fatalf("platformStressCPUThreads=%d want headroom below NumCPU=%d", got, runtime.NumCPU())
|
||||
}
|
||||
}
|
||||
|
||||
func TestPlatformStressMemoryMBOverride(t *testing.T) {
|
||||
t.Setenv("BEE_PLATFORM_STRESS_MB", "8192")
|
||||
if got := platformStressMemoryMB(); got != 8192 {
|
||||
t.Fatalf("platformStressMemoryMB=%d want 8192", got)
|
||||
}
|
||||
}
|
||||
@@ -136,7 +136,10 @@ func (s *System) runtimeToolStatuses(vendor string) []ToolStatus {
|
||||
tools = append(tools, s.CheckTools([]string{
|
||||
"nvidia-smi",
|
||||
"nvidia-bug-report.sh",
|
||||
"bee-gpu-stress",
|
||||
"bee-gpu-burn",
|
||||
"bee-john-gpu-stress",
|
||||
"bee-nccl-gpu-stress",
|
||||
"all_reduce_perf",
|
||||
})...)
|
||||
case "amd":
|
||||
tool := ToolStatus{Name: "rocm-smi"}
|
||||
@@ -176,8 +179,8 @@ func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHe
|
||||
health.DriverReady = true
|
||||
}
|
||||
|
||||
if lookErr := exec.Command("sh", "-c", "command -v bee-gpu-stress >/dev/null 2>&1").Run(); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-stress", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if _, lookErr := exec.LookPath("bee-gpu-burn"); lookErr == nil {
|
||||
out, err := exec.Command("bee-gpu-burn", "--seconds", "1", "--size-mb", "1").CombinedOutput()
|
||||
if err == nil {
|
||||
health.CUDAReady = true
|
||||
} else if strings.Contains(strings.ToLower(string(out)), "cuda_error_system_not_ready") {
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -425,14 +426,12 @@ type satStats struct {
|
||||
}
|
||||
|
||||
func nvidiaSATJobs() []satJob {
|
||||
seconds := envInt("BEE_GPU_STRESS_SECONDS", 5)
|
||||
sizeMB := envInt("BEE_GPU_STRESS_SIZE_MB", 64)
|
||||
return []satJob{
|
||||
{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "02-dmidecode-baseboard.log", cmd: []string{"dmidecode", "-t", "baseboard"}},
|
||||
{name: "03-dmidecode-system.log", cmd: []string{"dmidecode", "-t", "system"}},
|
||||
{name: "04-nvidia-bug-report.log", cmd: []string{"nvidia-bug-report.sh", "--output-file", "{{run_dir}}/nvidia-bug-report.log"}},
|
||||
{name: "05-bee-gpu-stress.log", cmd: []string{"bee-gpu-stress", "--seconds", fmt.Sprintf("%d", seconds), "--size-mb", fmt.Sprintf("%d", sizeMB)}},
|
||||
{name: "05-bee-gpu-burn.log", cmd: []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -533,6 +532,13 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
}
|
||||
|
||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||
c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||
c.Cancel = func() error {
|
||||
if c.Process != nil {
|
||||
_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if len(env) > 0 {
|
||||
c.Env = append(os.Environ(), env...)
|
||||
}
|
||||
@@ -686,7 +692,11 @@ func resolveSATCommand(cmd []string) ([]string, error) {
|
||||
case "rvs":
|
||||
return resolveRVSCommand(cmd[1:]...)
|
||||
}
|
||||
return cmd, nil
|
||||
path, err := satLookPath(cmd[0])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("%s not found in PATH: %w", cmd[0], err)
|
||||
}
|
||||
return append([]string{path}, cmd[1:]...), nil
|
||||
}
|
||||
|
||||
func resolveRVSCommand(args ...string) ([]string, error) {
|
||||
|
||||
@@ -51,6 +51,18 @@ type FanStressRow struct {
|
||||
SysPowerW float64 // DCMI system power reading
|
||||
}
|
||||
|
||||
// cachedPowerReading pairs a system power value with the time it was stored.
type cachedPowerReading struct {
	Value     float64
	UpdatedAt time.Time
}

// systemPowerHoldTTL is the maximum age at which a cached reading may still
// be returned in place of a missing one.
const systemPowerHoldTTL = 15 * time.Second

var (
	// systemPowerCacheMu guards systemPowerCache.
	systemPowerCacheMu sync.Mutex
	// systemPowerCache holds the most recent positive power reading.
	systemPowerCache cachedPowerReading
)
|
||||
|
||||
// RunFanStressTest runs a two-phase GPU stress test while monitoring fan speeds,
|
||||
// temperatures, and power draw every second. Exports metrics.csv and fan-sensors.csv.
|
||||
// Designed to reproduce case-04 fan-speed lag and detect GPU thermal throttling.
|
||||
@@ -130,26 +142,21 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
||||
stats.OK++
|
||||
}
|
||||
|
||||
// loadPhase runs bee-gpu-stress for durSec; sampler stamps phaseName on each row.
|
||||
// loadPhase runs bee-gpu-burn for durSec; sampler stamps phaseName on each row.
|
||||
loadPhase := func(phaseName, stepName string, durSec int) {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
setPhase(phaseName)
|
||||
var env []string
|
||||
if len(opts.GPUIndices) > 0 {
|
||||
ids := make([]string, len(opts.GPUIndices))
|
||||
for i, idx := range opts.GPUIndices {
|
||||
ids[i] = strconv.Itoa(idx)
|
||||
}
|
||||
env = []string{"CUDA_VISIBLE_DEVICES=" + strings.Join(ids, ",")}
|
||||
}
|
||||
cmd := []string{
|
||||
"bee-gpu-stress",
|
||||
"bee-gpu-burn",
|
||||
"--seconds", strconv.Itoa(durSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, env, nil)
|
||||
if len(opts.GPUIndices) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(dedupeSortedIndices(opts.GPUIndices)))
|
||||
}
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, stepName, cmd, nil, nil)
|
||||
_ = os.WriteFile(filepath.Join(runDir, stepName+".log"), out, 0644)
|
||||
if err != nil && err != context.Canceled && err.Error() != "signal: killed" {
|
||||
fmt.Fprintf(&summary, "%s_status=FAILED\n", stepName)
|
||||
@@ -323,8 +330,9 @@ func sampleFanSpeeds() ([]FanReading, error) {
|
||||
|
||||
// parseFanSpeeds parses "ipmitool sdr type Fan" output.
|
||||
// Handles two formats:
|
||||
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
||||
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
||||
//
|
||||
// Old: "FAN1 | 2400.000 | RPM | ok" (value in col[1], unit in col[2])
|
||||
// New: "FAN1 | 41h | ok | 29.1 | 4340 RPM" (value+unit combined in last col)
|
||||
func parseFanSpeeds(raw string) []FanReading {
|
||||
var fans []FanReading
|
||||
for _, line := range strings.Split(strings.TrimSpace(raw), "\n") {
|
||||
@@ -512,11 +520,17 @@ func sampleCPUTempViaSensors() float64 {
|
||||
|
||||
// sampleSystemPower reads system power draw via DCMI.
|
||||
func sampleSystemPower() float64 {
|
||||
now := time.Now()
|
||||
current := 0.0
|
||||
out, err := exec.Command("ipmitool", "dcmi", "power", "reading").Output()
|
||||
if err != nil {
|
||||
return 0
|
||||
if err == nil {
|
||||
current = parseDCMIPowerReading(string(out))
|
||||
}
|
||||
return parseDCMIPowerReading(string(out))
|
||||
systemPowerCacheMu.Lock()
|
||||
defer systemPowerCacheMu.Unlock()
|
||||
value, updated := effectiveSystemPowerReading(systemPowerCache, current, now)
|
||||
systemPowerCache = updated
|
||||
return value
|
||||
}
|
||||
|
||||
// parseDCMIPowerReading extracts the instantaneous power reading from ipmitool dcmi output.
|
||||
@@ -539,6 +553,17 @@ func parseDCMIPowerReading(raw string) float64 {
|
||||
return 0
|
||||
}
|
||||
|
||||
func effectiveSystemPowerReading(cache cachedPowerReading, current float64, now time.Time) (float64, cachedPowerReading) {
|
||||
if current > 0 {
|
||||
cache = cachedPowerReading{Value: current, UpdatedAt: now}
|
||||
return current, cache
|
||||
}
|
||||
if cache.Value > 0 && !cache.UpdatedAt.IsZero() && now.Sub(cache.UpdatedAt) <= systemPowerHoldTTL {
|
||||
return cache.Value, cache
|
||||
}
|
||||
return 0, cache
|
||||
}
|
||||
|
||||
// analyzeThrottling returns true if any GPU reported an active throttle reason
|
||||
// during either load phase.
|
||||
func analyzeThrottling(rows []FanStressRow) bool {
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestParseFanSpeeds(t *testing.T) {
|
||||
raw := "FAN1 | 2400.000 | RPM | ok\nFAN2 | 1800 RPM | ok | ok\nFAN3 | na | RPM | ns\n"
|
||||
@@ -25,3 +28,40 @@ func TestFirstFanInputValue(t *testing.T) {
|
||||
t.Fatalf("got=%v ok=%v", got, ok)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseDCMIPowerReading(t *testing.T) {
|
||||
raw := `
|
||||
Instantaneous power reading: 512 Watts
|
||||
Minimum during sampling period: 498 Watts
|
||||
`
|
||||
if got := parseDCMIPowerReading(raw); got != 512 {
|
||||
t.Fatalf("parseDCMIPowerReading()=%v want 512", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestEffectiveSystemPowerReading(t *testing.T) {
|
||||
now := time.Now()
|
||||
cache := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-5 * time.Second)}
|
||||
|
||||
got, updated := effectiveSystemPowerReading(cache, 0, now)
|
||||
if got != 480 {
|
||||
t.Fatalf("got=%v want cached 480", got)
|
||||
}
|
||||
if updated.Value != 480 {
|
||||
t.Fatalf("updated=%+v", updated)
|
||||
}
|
||||
|
||||
got, updated = effectiveSystemPowerReading(cache, 530, now)
|
||||
if got != 530 {
|
||||
t.Fatalf("got=%v want 530", got)
|
||||
}
|
||||
if updated.Value != 530 {
|
||||
t.Fatalf("updated=%+v", updated)
|
||||
}
|
||||
|
||||
expired := cachedPowerReading{Value: 480, UpdatedAt: now.Add(-systemPowerHoldTTL - time.Second)}
|
||||
got, _ = effectiveSystemPowerReading(expired, 0, now)
|
||||
if got != 0 {
|
||||
t.Fatalf("expired cache returned %v want 0", got)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -31,8 +31,8 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
if len(jobs) != 5 {
|
||||
t.Fatalf("jobs=%d want 5", len(jobs))
|
||||
}
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-stress" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-stress", got)
|
||||
if got := jobs[4].cmd[0]; got != "bee-gpu-burn" {
|
||||
t.Fatalf("gpu stress command=%q want bee-gpu-burn", got)
|
||||
}
|
||||
if got := jobs[3].cmd[1]; got != "--output-file" {
|
||||
t.Fatalf("bug report flag=%q want --output-file", got)
|
||||
@@ -80,13 +80,10 @@ func TestAMDStressJobsIncludeBandwidthAndGST(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
t.Setenv("BEE_GPU_STRESS_SECONDS", "9")
|
||||
t.Setenv("BEE_GPU_STRESS_SIZE_MB", "96")
|
||||
|
||||
func TestNvidiaSATJobsUseBuiltinBurnDefaults(t *testing.T) {
|
||||
jobs := nvidiaSATJobs()
|
||||
got := jobs[4].cmd
|
||||
want := []string{"bee-gpu-stress", "--seconds", "9", "--size-mb", "96"}
|
||||
want := []string{"bee-gpu-burn", "--seconds", "5", "--size-mb", "64"}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("cmd len=%d want %d", len(got), len(want))
|
||||
}
|
||||
@@ -97,6 +94,93 @@ func TestNvidiaSATJobsUseEnvOverrides(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesSelectedLoaderAndDevices(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||
DurationSec: 600,
|
||||
Loader: NvidiaStressLoaderJohn,
|
||||
ExcludeGPUIndices: []int{1},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||
}
|
||||
wantCmd := []string{"bee-john-gpu-stress", "--seconds", "600", "--devices", "0,2"}
|
||||
if len(job.cmd) != len(wantCmd) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||
}
|
||||
for i := range wantCmd {
|
||||
if job.cmd[i] != wantCmd[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||
}
|
||||
}
|
||||
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildNvidiaStressJobUsesNCCLLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldExecCommand := satExecCommand
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
if name == "nvidia-smi" {
|
||||
return exec.Command("sh", "-c", "printf '0\n1\n2\n'")
|
||||
}
|
||||
return exec.Command(name, args...)
|
||||
}
|
||||
t.Cleanup(func() { satExecCommand = oldExecCommand })
|
||||
|
||||
job, err := buildNvidiaStressJob(NvidiaStressOptions{
|
||||
DurationSec: 120,
|
||||
Loader: NvidiaStressLoaderNCCL,
|
||||
GPUIndices: []int{2, 0},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("buildNvidiaStressJob error: %v", err)
|
||||
}
|
||||
wantCmd := []string{"bee-nccl-gpu-stress", "--seconds", "120", "--devices", "0,2"}
|
||||
if len(job.cmd) != len(wantCmd) {
|
||||
t.Fatalf("cmd len=%d want %d (%v)", len(job.cmd), len(wantCmd), job.cmd)
|
||||
}
|
||||
for i := range wantCmd {
|
||||
if job.cmd[i] != wantCmd[i] {
|
||||
t.Fatalf("cmd[%d]=%q want %q", i, job.cmd[i], wantCmd[i])
|
||||
}
|
||||
}
|
||||
if got := joinIndexList(job.gpuIndices); got != "0,2" {
|
||||
t.Fatalf("gpuIndices=%q want 0,2", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNvidiaStressArchivePrefixByLoader(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
loader string
|
||||
want string
|
||||
}{
|
||||
{loader: NvidiaStressLoaderBuiltin, want: "gpu-nvidia-burn"},
|
||||
{loader: NvidiaStressLoaderJohn, want: "gpu-nvidia-john"},
|
||||
{loader: NvidiaStressLoaderNCCL, want: "gpu-nvidia-nccl"},
|
||||
{loader: "", want: "gpu-nvidia-burn"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
if got := nvidiaStressArchivePrefix(tt.loader); got != tt.want {
|
||||
t.Fatalf("loader=%q prefix=%q want %q", tt.loader, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestEnvIntFallback(t *testing.T) {
|
||||
os.Unsetenv("BEE_MEMTESTER_SIZE_MB")
|
||||
if got := envInt("BEE_MEMTESTER_SIZE_MB", 123); got != 123 {
|
||||
@@ -122,8 +206,8 @@ func TestClassifySATResult(t *testing.T) {
|
||||
}{
|
||||
{name: "ok", job: "memtester", out: "done", err: nil, status: "OK"},
|
||||
{name: "unsupported", job: "smartctl-self-test-short", out: "Self-test not supported", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-stress", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-stress", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
{name: "failed", job: "bee-gpu-burn", out: "cuda error", err: errors.New("rc 1"), status: "FAILED"},
|
||||
{name: "cuda not ready", job: "bee-gpu-burn", out: "cuInit failed: CUDA_ERROR_SYSTEM_NOT_READY", err: errors.New("rc 1"), status: "UNSUPPORTED"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
@@ -172,6 +256,44 @@ func TestResolveROCmSMICommandFromPATH(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandUsesLookPathForGenericTools(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
if file == "stress-ng" {
|
||||
return "/usr/bin/stress-ng", nil
|
||||
}
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
cmd, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err != nil {
|
||||
t.Fatalf("resolveSATCommand error: %v", err)
|
||||
}
|
||||
if len(cmd) != 3 {
|
||||
t.Fatalf("cmd len=%d want 3 (%v)", len(cmd), cmd)
|
||||
}
|
||||
if cmd[0] != "/usr/bin/stress-ng" {
|
||||
t.Fatalf("cmd[0]=%q want /usr/bin/stress-ng", cmd[0])
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveSATCommandFailsForMissingGenericTool(t *testing.T) {
|
||||
oldLookPath := satLookPath
|
||||
satLookPath = func(file string) (string, error) {
|
||||
return "", exec.ErrNotFound
|
||||
}
|
||||
t.Cleanup(func() { satLookPath = oldLookPath })
|
||||
|
||||
_, err := resolveSATCommand([]string{"stress-ng", "--cpu", "0"})
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "stress-ng not found in PATH") {
|
||||
t.Fatalf("error=%q", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestResolveROCmSMICommandFallsBackToROCmTree(t *testing.T) {
|
||||
tmp := t.TempDir()
|
||||
execPath := filepath.Join(tmp, "opt", "rocm", "bin", "rocm-smi")
|
||||
|
||||
@@ -2,6 +2,13 @@ package platform
|
||||
|
||||
type System struct{}
|
||||
|
||||
// LiveBootSource is serialized to JSON with the keys in_ram, kind, source and
// device; source and device are omitted when empty.
// NOTE(review): presumably describes where the live system was booted from —
// confirm field semantics against the code that populates it.
type LiveBootSource struct {
	InRAM  bool   `json:"in_ram"`
	Kind   string `json:"kind"`
	Source string `json:"source,omitempty"`
	Device string `json:"device,omitempty"`
}
|
||||
|
||||
type InterfaceInfo struct {
|
||||
Name string
|
||||
State string
|
||||
@@ -51,6 +58,20 @@ type ToolStatus struct {
|
||||
OK bool
|
||||
}
|
||||
|
||||
// Loader names accepted in NvidiaStressOptions.Loader.
const (
	NvidiaStressLoaderBuiltin = "builtin"
	NvidiaStressLoaderJohn    = "john"
	NvidiaStressLoaderNCCL    = "nccl"
)

// NvidiaStressOptions configures an NVIDIA GPU stress run.
type NvidiaStressOptions struct {
	// DurationSec is passed to the loader as --seconds.
	DurationSec int
	// SizeMB is passed to the builtin loader as --size-mb.
	SizeMB int
	// Loader is one of the NvidiaStressLoader* names.
	Loader string
	// GPUIndices, when non-empty, limits the run to these GPU indices.
	GPUIndices []int
	// ExcludeGPUIndices lists GPU indices to leave out of the run.
	ExcludeGPUIndices []int
}
|
||||
|
||||
func New() *System {
|
||||
return &System{}
|
||||
}
|
||||
|
||||
@@ -2,11 +2,12 @@ package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
@@ -62,6 +63,10 @@ func streamJob(w http.ResponseWriter, r *http.Request, j *jobState) {
|
||||
if !sseStart(w) {
|
||||
return
|
||||
}
|
||||
streamSubscribedJob(w, r, j)
|
||||
}
|
||||
|
||||
func streamSubscribedJob(w http.ResponseWriter, r *http.Request, j *jobState) {
|
||||
existing, ch := j.subscribe()
|
||||
for _, line := range existing {
|
||||
sseWrite(w, "", line)
|
||||
@@ -85,15 +90,16 @@ func streamJob(w http.ResponseWriter, r *http.Request, j *jobState) {
|
||||
}
|
||||
}
|
||||
|
||||
// runCmdJob runs an exec.Cmd as a background job, streaming stdout+stderr lines.
|
||||
func runCmdJob(j *jobState, cmd *exec.Cmd) {
|
||||
// streamCmdJob runs an exec.Cmd and streams stdout+stderr lines into j.
|
||||
func streamCmdJob(j *jobState, cmd *exec.Cmd) error {
|
||||
pr, pw := io.Pipe()
|
||||
cmd.Stdout = pw
|
||||
cmd.Stderr = pw
|
||||
|
||||
if err := cmd.Start(); err != nil {
|
||||
j.finish(err.Error())
|
||||
return
|
||||
_ = pw.Close()
|
||||
_ = pr.Close()
|
||||
return err
|
||||
}
|
||||
// Lower the CPU scheduling priority of stress/audit subprocesses to nice+10
|
||||
// so the X server and kernel interrupt handling remain responsive under load
|
||||
@@ -102,8 +108,10 @@ func runCmdJob(j *jobState, cmd *exec.Cmd) {
|
||||
_ = syscall.Setpriority(syscall.PRIO_PROCESS, cmd.Process.Pid, 10)
|
||||
}
|
||||
|
||||
scanDone := make(chan error, 1)
|
||||
go func() {
|
||||
scanner := bufio.NewScanner(pr)
|
||||
scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)
|
||||
for scanner.Scan() {
|
||||
// Split on \r to handle progress-bar style output (e.g. \r overwrites)
|
||||
// and strip ANSI escape codes so logs are readable in the browser.
|
||||
@@ -115,15 +123,21 @@ func runCmdJob(j *jobState, cmd *exec.Cmd) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil && !errors.Is(err, io.ErrClosedPipe) {
|
||||
scanDone <- err
|
||||
return
|
||||
}
|
||||
scanDone <- nil
|
||||
}()
|
||||
|
||||
err := cmd.Wait()
|
||||
_ = pw.Close()
|
||||
scanErr := <-scanDone
|
||||
_ = pr.Close()
|
||||
if err != nil {
|
||||
j.finish(err.Error())
|
||||
} else {
|
||||
j.finish("")
|
||||
return err
|
||||
}
|
||||
return scanErr
|
||||
}
|
||||
|
||||
// ── Audit ─────────────────────────────────────────────────────────────────────
|
||||
@@ -171,20 +185,23 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
DiagLevel int `json:"diag_level"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
Duration int `json:"duration"`
|
||||
DiagLevel int `json:"diag_level"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
Loader string `json:"loader"`
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
PlatformComponents []string `json:"platform_components"`
|
||||
}
|
||||
if r.ContentLength > 0 {
|
||||
_ = json.NewDecoder(r.Body).Decode(&body)
|
||||
if r.Body != nil {
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil && !errors.Is(err, io.EOF) {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
name := taskNames[target]
|
||||
if name == "" {
|
||||
name = target
|
||||
}
|
||||
name := taskDisplayName(target, body.Profile, body.Loader)
|
||||
t := &Task{
|
||||
ID: newJobID("sat-" + target),
|
||||
Name: name,
|
||||
@@ -192,11 +209,14 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
GPUIndices: body.GPUIndices,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
Duration: body.Duration,
|
||||
DiagLevel: body.DiagLevel,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
},
|
||||
}
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
@@ -330,8 +350,10 @@ func (h *handler) handleAPINetworkStatus(w http.ResponseWriter, r *http.Request)
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]any{
|
||||
"interfaces": ifaces,
|
||||
"default_route": h.opts.App.DefaultRoute(),
|
||||
"interfaces": ifaces,
|
||||
"default_route": h.opts.App.DefaultRoute(),
|
||||
"pending_change": h.hasPendingNetworkChange(),
|
||||
"rollback_in": h.pendingNetworkRollbackIn(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -410,19 +432,6 @@ func (h *handler) handleAPIExportList(w http.ResponseWriter, r *http.Request) {
|
||||
writeJSON(w, entries)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIExportBundle(w http.ResponseWriter, r *http.Request) {
|
||||
archive, err := app.BuildSupportBundle(h.opts.ExportDir)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{
|
||||
"status": "ok",
|
||||
"path": archive,
|
||||
"url": "/export/support.tar.gz",
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIExportUSBTargets(w http.ResponseWriter, _ *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
@@ -490,6 +499,26 @@ func (h *handler) handleAPIGPUPresence(w http.ResponseWriter, r *http.Request) {
|
||||
})
|
||||
}
|
||||
|
||||
// ── GPU tools ─────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIGPUTools(w http.ResponseWriter, _ *http.Request) {
|
||||
type toolEntry struct {
|
||||
ID string `json:"id"`
|
||||
Available bool `json:"available"`
|
||||
Vendor string `json:"vendor"` // "nvidia" | "amd"
|
||||
}
|
||||
_, nvidiaErr := os.Stat("/dev/nvidia0")
|
||||
_, amdErr := os.Stat("/dev/kfd")
|
||||
nvidiaUp := nvidiaErr == nil
|
||||
amdUp := amdErr == nil
|
||||
writeJSON(w, []toolEntry{
|
||||
{ID: "bee-gpu-burn", Available: nvidiaUp, Vendor: "nvidia"},
|
||||
{ID: "john", Available: nvidiaUp, Vendor: "nvidia"},
|
||||
{ID: "nccl", Available: nvidiaUp, Vendor: "nvidia"},
|
||||
{ID: "rvs", Available: amdUp, Vendor: "amd"},
|
||||
})
|
||||
}
|
||||
|
||||
// ── System ────────────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -497,9 +526,9 @@ func (h *handler) handleAPIRAMStatus(w http.ResponseWriter, r *http.Request) {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
inRAM := h.opts.App.IsLiveMediaInRAM()
|
||||
status := h.opts.App.LiveBootSource()
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_ = json.NewEncoder(w).Encode(map[string]bool{"in_ram": inRAM})
|
||||
_ = json.NewEncoder(w).Encode(status)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -507,10 +536,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
h.installMu.Lock()
|
||||
installRunning := h.installJob != nil && !h.installJob.isDone()
|
||||
h.installMu.Unlock()
|
||||
if installRunning {
|
||||
if globalQueue.hasActiveTarget("install") {
|
||||
writeError(w, http.StatusConflict, "install to disk is already running")
|
||||
return
|
||||
}
|
||||
@@ -625,39 +651,43 @@ func (h *handler) handleAPIInstallRun(w http.ResponseWriter, r *http.Request) {
|
||||
writeError(w, http.StatusConflict, "install to RAM task is already pending or running")
|
||||
return
|
||||
}
|
||||
|
||||
h.installMu.Lock()
|
||||
if h.installJob != nil && !h.installJob.isDone() {
|
||||
h.installMu.Unlock()
|
||||
writeError(w, http.StatusConflict, "install already running")
|
||||
if globalQueue.hasActiveTarget("install") {
|
||||
writeError(w, http.StatusConflict, "install task is already pending or running")
|
||||
return
|
||||
}
|
||||
j := &jobState{}
|
||||
h.installJob = j
|
||||
h.installMu.Unlock()
|
||||
|
||||
logFile := platform.InstallLogPath(req.Device)
|
||||
go runCmdJob(j, exec.CommandContext(context.Background(), "bee-install", req.Device, logFile))
|
||||
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIInstallStream(w http.ResponseWriter, r *http.Request) {
|
||||
h.installMu.Lock()
|
||||
j := h.installJob
|
||||
h.installMu.Unlock()
|
||||
if j == nil {
|
||||
if !sseStart(w) {
|
||||
return
|
||||
}
|
||||
sseWrite(w, "done", "")
|
||||
return
|
||||
t := &Task{
|
||||
ID: newJobID("install"),
|
||||
Name: "Install to Disk",
|
||||
Target: "install",
|
||||
Priority: 20,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Device: req.Device,
|
||||
},
|
||||
}
|
||||
streamJob(w, r, j)
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
}
|
||||
|
||||
// ── Metrics SSE ───────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIMetricsLatest(w http.ResponseWriter, r *http.Request) {
|
||||
sample, ok := h.latestMetric()
|
||||
if !ok {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write([]byte("{}"))
|
||||
return
|
||||
}
|
||||
b, err := json.Marshal(sample)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
_, _ = w.Write(b)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) {
|
||||
if !sseStart(w) {
|
||||
return
|
||||
@@ -699,13 +729,7 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
|
||||
h.ringMemLoad.push(sample.MemLoadPct)
|
||||
|
||||
h.ringsMu.Lock()
|
||||
for i, fan := range sample.Fans {
|
||||
for len(h.ringFans) <= i {
|
||||
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||
h.fanNames = append(h.fanNames, fan.Name)
|
||||
}
|
||||
h.ringFans[i].push(float64(fan.RPM))
|
||||
}
|
||||
h.pushFanRings(sample.Fans)
|
||||
for _, gpu := range sample.GPUs {
|
||||
idx := gpu.GPUIndex
|
||||
for len(h.gpuRings) <= idx {
|
||||
@@ -724,6 +748,51 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
|
||||
h.ringsMu.Unlock()
|
||||
}
|
||||
|
||||
func (h *handler) pushFanRings(fans []platform.FanReading) {
|
||||
if len(fans) == 0 && len(h.ringFans) == 0 {
|
||||
return
|
||||
}
|
||||
fanValues := make(map[string]float64, len(fans))
|
||||
for _, fan := range fans {
|
||||
if fan.Name == "" {
|
||||
continue
|
||||
}
|
||||
fanValues[fan.Name] = fan.RPM
|
||||
found := false
|
||||
for i, name := range h.fanNames {
|
||||
if name == fan.Name {
|
||||
found = true
|
||||
if i >= len(h.ringFans) {
|
||||
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
h.fanNames = append(h.fanNames, fan.Name)
|
||||
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||
}
|
||||
}
|
||||
for i, ring := range h.ringFans {
|
||||
if ring == nil {
|
||||
continue
|
||||
}
|
||||
name := ""
|
||||
if i < len(h.fanNames) {
|
||||
name = h.fanNames[i]
|
||||
}
|
||||
if rpm, ok := fanValues[name]; ok {
|
||||
ring.push(rpm)
|
||||
continue
|
||||
}
|
||||
if last, ok := ring.latest(); ok {
|
||||
ring.push(last)
|
||||
continue
|
||||
}
|
||||
ring.push(0)
|
||||
}
|
||||
}
|
||||
|
||||
func (h *handler) pushNamedMetricRing(dst *[]*namedMetricsRing, name string, value float64) {
|
||||
if name == "" {
|
||||
return
|
||||
@@ -802,7 +871,10 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
|
||||
return result, err
|
||||
}
|
||||
|
||||
pnc := &pendingNetChange{snapshot: snapshot}
|
||||
pnc := &pendingNetChange{
|
||||
snapshot: snapshot,
|
||||
deadline: time.Now().Add(netRollbackTimeout),
|
||||
}
|
||||
pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
|
||||
_ = h.opts.App.RestoreNetworkSnapshot(snapshot)
|
||||
h.pendingNetMu.Lock()
|
||||
@@ -819,6 +891,25 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (h *handler) hasPendingNetworkChange() bool {
|
||||
h.pendingNetMu.Lock()
|
||||
defer h.pendingNetMu.Unlock()
|
||||
return h.pendingNet != nil
|
||||
}
|
||||
|
||||
func (h *handler) pendingNetworkRollbackIn() int {
|
||||
h.pendingNetMu.Lock()
|
||||
defer h.pendingNetMu.Unlock()
|
||||
if h.pendingNet == nil {
|
||||
return 0
|
||||
}
|
||||
remaining := int(time.Until(h.pendingNet.deadline).Seconds())
|
||||
if remaining < 1 {
|
||||
return 1
|
||||
}
|
||||
return remaining
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
@@ -908,8 +999,31 @@ func parseXrandrOutput(out string) []displayInfo {
|
||||
return infos
|
||||
}
|
||||
|
||||
// xrandrCommand builds an exec.Cmd for "xrandr" with the given arguments. The
// command inherits the current process environment; if DISPLAY or XAUTHORITY
// is unset or empty there, the defaults "DISPLAY=:0" and
// "XAUTHORITY=/home/bee/.Xauthority" are appended.
func xrandrCommand(args ...string) *exec.Cmd {
	environ := append([]string{}, os.Environ()...)

	var displaySet, xauthSet bool
	for _, entry := range environ {
		name, value, _ := strings.Cut(entry, "=")
		if value == "" {
			// Unset-looking or empty variables do not count as configured.
			continue
		}
		switch name {
		case "DISPLAY":
			displaySet = true
		case "XAUTHORITY":
			xauthSet = true
		}
	}
	if !displaySet {
		environ = append(environ, "DISPLAY=:0")
	}
	if !xauthSet {
		environ = append(environ, "XAUTHORITY=/home/bee/.Xauthority")
	}

	cmd := exec.Command("xrandr", args...)
	cmd.Env = environ
	return cmd
}
|
||||
|
||||
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
|
||||
out, err := exec.Command("xrandr").Output()
|
||||
out, err := xrandrCommand().Output()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
|
||||
return
|
||||
@@ -936,7 +1050,7 @@ func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
|
||||
writeError(w, http.StatusBadRequest, "invalid output name")
|
||||
return
|
||||
}
|
||||
if out, err := exec.Command("xrandr", "--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
||||
if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
|
||||
return
|
||||
}
|
||||
|
||||
92
audit/internal/webui/api_test.go
Normal file
92
audit/internal/webui/api_test.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"net/http/httptest"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||
t.Setenv("DISPLAY", "")
|
||||
t.Setenv("XAUTHORITY", "")
|
||||
|
||||
cmd := xrandrCommand("--query")
|
||||
|
||||
var hasDisplay bool
|
||||
var hasXAuthority bool
|
||||
for _, kv := range cmd.Env {
|
||||
if kv == "DISPLAY=:0" {
|
||||
hasDisplay = true
|
||||
}
|
||||
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
||||
hasXAuthority = true
|
||||
}
|
||||
}
|
||||
if !hasDisplay {
|
||||
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
||||
}
|
||||
if !hasXAuthority {
|
||||
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
globalQueue.tasks = nil
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = originalTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
h := &handler{opts: HandlerOptions{App: &app.App{}}}
|
||||
req := httptest.NewRequest("POST", "/api/sat/cpu/run", strings.NewReader(`{"profile":"smoke"}`))
|
||||
req.ContentLength = -1
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h.handleAPISATRun("cpu").ServeHTTP(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
globalQueue.mu.Lock()
|
||||
defer globalQueue.mu.Unlock()
|
||||
if len(globalQueue.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(globalQueue.tasks))
|
||||
}
|
||||
if got := globalQueue.tasks[0].params.BurnProfile; got != "smoke" {
|
||||
t.Fatalf("burn profile=%q want smoke", got)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||
h := &handler{}
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_A", RPM: 4200},
|
||||
{Name: "FAN_B", RPM: 5100},
|
||||
})
|
||||
h.pushFanRings([]platform.FanReading{
|
||||
{Name: "FAN_B", RPM: 5200},
|
||||
})
|
||||
|
||||
if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
|
||||
t.Fatalf("fanNames=%v", h.fanNames)
|
||||
}
|
||||
aVals, _ := h.ringFans[0].snapshot()
|
||||
bVals, _ := h.ringFans[1].snapshot()
|
||||
if len(aVals) != 2 || len(bVals) != 2 {
|
||||
t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
|
||||
}
|
||||
if aVals[1] != 4200 {
|
||||
t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
|
||||
}
|
||||
if bVals[1] != 5200 {
|
||||
t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
|
||||
}
|
||||
}
|
||||
230
audit/internal/webui/kmsg_watcher.go
Normal file
230
audit/internal/webui/kmsg_watcher.go
Normal file
@@ -0,0 +1,230 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
// kmsgWatcher reads /dev/kmsg and accumulates hardware error events.
|
||||
// During an active SAT task window it records matching lines; on task finish
|
||||
// it writes Warning status records to the component status DB.
|
||||
type kmsgWatcher struct {
|
||||
mu sync.Mutex
|
||||
activeWindow *kmsgWindow
|
||||
statusDB *app.ComponentStatusDB
|
||||
}
|
||||
|
||||
type kmsgWindow struct {
|
||||
taskID string
|
||||
target string
|
||||
startedAt time.Time
|
||||
seen map[kmsgEventKey]bool
|
||||
events []kmsgEvent
|
||||
}
|
||||
|
||||
// kmsgEventKey is the deduplication key for events within a window.
type kmsgEventKey struct {
	// id is a BDF or device name; empty for events without a device id.
	id string
	// category is the event category taken from the matching pattern.
	category string
}
|
||||
|
||||
// kmsgEvent is one kernel log line that matched a hardware error pattern.
type kmsgEvent struct {
	// timestamp is the wall-clock time at which the line was parsed.
	timestamp time.Time
	// raw is the message text of the line.
	raw string
	// ids holds the BDF addresses or device names extracted from the line.
	ids []string
	// category is the category of the pattern that matched.
	category string
}
|
||||
|
||||
func newKmsgWatcher(statusDB *app.ComponentStatusDB) *kmsgWatcher {
|
||||
return &kmsgWatcher{statusDB: statusDB}
|
||||
}
|
||||
|
||||
// start launches the background kmsg reading goroutine.
|
||||
func (w *kmsgWatcher) start() {
|
||||
go w.run()
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) run() {
|
||||
f, err := os.Open("/dev/kmsg")
|
||||
if err != nil {
|
||||
slog.Warn("kmsg watcher unavailable", "err", err)
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// Best-effort seek to end so we only capture events from now forward.
|
||||
_, _ = f.Seek(0, io.SeekEnd)
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
scanner.Buffer(make([]byte, 64*1024), 64*1024)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
evt, ok := parseKmsgLine(line)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
w.mu.Lock()
|
||||
if w.activeWindow != nil {
|
||||
w.recordEvent(evt)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
slog.Warn("kmsg watcher stopped", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
// recordEvent appends evt to the active window, deduplicating by (id, category).
|
||||
// Must be called with w.mu held.
|
||||
func (w *kmsgWatcher) recordEvent(evt kmsgEvent) {
|
||||
if len(evt.ids) == 0 {
|
||||
// Events without a device ID (e.g. MCE) — deduplicate by category.
|
||||
key := kmsgEventKey{id: "", category: evt.category}
|
||||
if !w.activeWindow.seen[key] {
|
||||
w.activeWindow.seen[key] = true
|
||||
w.activeWindow.events = append(w.activeWindow.events, evt)
|
||||
}
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
key := kmsgEventKey{id: id, category: evt.category}
|
||||
if !w.activeWindow.seen[key] {
|
||||
w.activeWindow.seen[key] = true
|
||||
w.activeWindow.events = append(w.activeWindow.events, evt)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskStarted opens a new event window for the given SAT task.
|
||||
func (w *kmsgWatcher) NotifyTaskStarted(taskID, target string) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.activeWindow = &kmsgWindow{
|
||||
taskID: taskID,
|
||||
target: target,
|
||||
startedAt: time.Now(),
|
||||
seen: make(map[kmsgEventKey]bool),
|
||||
}
|
||||
}
|
||||
|
||||
// NotifyTaskFinished closes the event window and asynchronously writes status records.
|
||||
func (w *kmsgWatcher) NotifyTaskFinished(taskID string) {
|
||||
w.mu.Lock()
|
||||
window := w.activeWindow
|
||||
if window != nil && window.taskID == taskID {
|
||||
w.activeWindow = nil
|
||||
}
|
||||
w.mu.Unlock()
|
||||
|
||||
if window == nil || len(window.events) == 0 {
|
||||
return
|
||||
}
|
||||
go w.flushWindow(window)
|
||||
}
|
||||
|
||||
func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
source := "watchdog:kmsg"
|
||||
// Collect unique component keys from events.
|
||||
seen := map[string]string{} // componentKey → first raw line
|
||||
for _, evt := range window.events {
|
||||
if len(evt.ids) == 0 {
|
||||
// MCE or un-identified error.
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
continue
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
if _, exists := seen[key]; !exists {
|
||||
seen[key] = evt.raw
|
||||
}
|
||||
}
|
||||
}
|
||||
for key, detail := range seen {
|
||||
detail = "kernel error during " + window.target + " SAT: " + truncate(detail, 120)
|
||||
w.statusDB.Record(key, source, "Warning", detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
func parseKmsgLine(raw string) (kmsgEvent, bool) {
|
||||
msg := raw
|
||||
if idx := strings.Index(raw, ";"); idx >= 0 {
|
||||
msg = strings.TrimSpace(raw[idx+1:])
|
||||
}
|
||||
if msg == "" {
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
m := p.Re.FindStringSubmatch(msg)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
evt := kmsgEvent{
|
||||
timestamp: time.Now(),
|
||||
raw: msg,
|
||||
category: p.Category,
|
||||
}
|
||||
if p.BDFGroup > 0 && p.BDFGroup < len(m) {
|
||||
evt.ids = append(evt.ids, normalizeBDF(m[p.BDFGroup]))
|
||||
}
|
||||
if p.DevGroup > 0 && p.DevGroup < len(m) {
|
||||
evt.ids = append(evt.ids, m[p.DevGroup])
|
||||
}
|
||||
return evt, true
|
||||
}
|
||||
return kmsgEvent{}, false
|
||||
}
|
||||
|
||||
// normalizeBDF lower-cases and trims a PCIe BDF and, when it contains exactly
// one ':' (i.e. the domain is missing), prefixes the default domain so the
// result has the 4-part form "0000:c8:00.0". Any other input is returned
// trimmed and lower-cased but otherwise unchanged.
func normalizeBDF(bdf string) string {
	cleaned := strings.ToLower(strings.TrimSpace(bdf))
	switch strings.Count(cleaned, ":") {
	case 1:
		return "0000:" + cleaned
	default:
		return cleaned
	}
}
|
||||
|
||||
// truncate shortens s to at most max bytes and appends "..." when anything was
// cut off; strings that already fit are returned unchanged.
//
// The cut is moved back to the nearest rune boundary so a multi-byte UTF-8
// character is never split, which would leave invalid UTF-8 in the result. A
// negative max is treated like zero instead of panicking.
func truncate(s string, max int) string {
	if len(s) <= max {
		return s
	}
	// Ranging over a string yields the byte offset of each rune start; keep
	// the largest one that does not exceed max.
	cut := 0
	for i := range s {
		if i > max {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}
|
||||
|
||||
// isSATTarget reports whether target is one of the task targets that run a
// hardware acceptance test. The comparison is exact and case-sensitive.
func isSATTarget(target string) bool {
	satTargets := [...]string{
		"nvidia", "nvidia-stress",
		"memory", "memory-stress",
		"storage",
		"cpu",
		"sat-stress",
		"amd", "amd-mem", "amd-bandwidth", "amd-stress",
		"platform-stress",
	}
	for _, known := range satTargets {
		if target == known {
			return true
		}
	}
	return false
}
|
||||
@@ -4,6 +4,9 @@ import (
|
||||
"database/sql"
|
||||
"encoding/csv"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
@@ -20,6 +23,9 @@ type MetricsDB struct {
|
||||
|
||||
// openMetricsDB opens (or creates) the metrics database at the given path.
|
||||
func openMetricsDB(path string) (*MetricsDB, error) {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
db, err := sql.Open("sqlite", path+"?_journal=WAL&_busy_timeout=5000")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -115,7 +121,7 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||
@@ -132,7 +138,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
defer rows.Close()
|
||||
|
||||
type sysRow struct {
|
||||
ts int64
|
||||
ts int64
|
||||
cpu, mem, pwr float64
|
||||
}
|
||||
var sysRows []sysRow
|
||||
@@ -146,17 +152,15 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
if len(sysRows) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
// Reverse to chronological order
|
||||
for i, j := 0, len(sysRows)-1; i < j; i, j = i+1, j-1 {
|
||||
sysRows[i], sysRows[j] = sysRows[j], sysRows[i]
|
||||
}
|
||||
|
||||
// Collect min/max ts for range query
|
||||
minTS := sysRows[0].ts
|
||||
maxTS := sysRows[len(sysRows)-1].ts
|
||||
|
||||
// Load GPU rows in range
|
||||
type gpuKey struct{ ts int64; idx int }
|
||||
type gpuKey struct {
|
||||
ts int64
|
||||
idx int
|
||||
}
|
||||
gpuData := map[gpuKey]platform.GPUMetricRow{}
|
||||
gRows, err := m.db.Query(
|
||||
`SELECT ts,gpu_index,temp_c,usage_pct,mem_usage_pct,power_w FROM gpu_metrics WHERE ts>=? AND ts<=? ORDER BY ts,gpu_index`,
|
||||
@@ -174,7 +178,10 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
}
|
||||
|
||||
// Load fan rows in range
|
||||
type fanKey struct{ ts int64; name string }
|
||||
type fanKey struct {
|
||||
ts int64
|
||||
name string
|
||||
}
|
||||
fanData := map[fanKey]float64{}
|
||||
fRows, err := m.db.Query(
|
||||
`SELECT ts,name,rpm FROM fan_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||
@@ -192,7 +199,10 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
}
|
||||
|
||||
// Load temp rows in range
|
||||
type tempKey struct{ ts int64; name string }
|
||||
type tempKey struct {
|
||||
ts int64
|
||||
name string
|
||||
}
|
||||
tempData := map[tempKey]platform.TempReading{}
|
||||
tRows, err := m.db.Query(
|
||||
`SELECT ts,name,grp,celsius FROM temp_metrics WHERE ts>=? AND ts<=?`, minTS, maxTS,
|
||||
@@ -208,7 +218,9 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
}
|
||||
}
|
||||
|
||||
// Collect unique GPU indices and fan names from loaded data (preserve order)
|
||||
// Collect unique GPU indices and fan/temp names from loaded data.
|
||||
// Sort each list so that sample reconstruction is deterministic regardless
|
||||
// of Go's non-deterministic map iteration order.
|
||||
seenGPU := map[int]bool{}
|
||||
var gpuIndices []int
|
||||
for k := range gpuData {
|
||||
@@ -217,6 +229,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
gpuIndices = append(gpuIndices, k.idx)
|
||||
}
|
||||
}
|
||||
sort.Ints(gpuIndices)
|
||||
|
||||
seenFan := map[string]bool{}
|
||||
var fanNames []string
|
||||
for k := range fanData {
|
||||
@@ -225,6 +239,8 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
fanNames = append(fanNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(fanNames)
|
||||
|
||||
seenTemp := map[string]bool{}
|
||||
var tempNames []string
|
||||
for k := range tempData {
|
||||
@@ -233,6 +249,7 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
||||
tempNames = append(tempNames, k.name)
|
||||
}
|
||||
}
|
||||
sort.Strings(tempNames)
|
||||
|
||||
samples := make([]platform.LiveMetricSample, len(sysRows))
|
||||
for i, r := range sysRows {
|
||||
|
||||
69
audit/internal/webui/metricsdb_test.go
Normal file
69
audit/internal/webui/metricsdb_test.go
Normal file
@@ -0,0 +1,69 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("openMetricsDB: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
base := time.Unix(1_700_000_000, 0).UTC()
|
||||
for i := 0; i < 3; i++ {
|
||||
err := db.Write(platform.LiveMetricSample{
|
||||
Timestamp: base.Add(time.Duration(i) * time.Second),
|
||||
CPULoadPct: float64(10 + i),
|
||||
MemLoadPct: float64(20 + i),
|
||||
PowerW: float64(300 + i),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, PowerW: float64(100 + i)},
|
||||
{GPUIndex: 2, PowerW: float64(200 + i)},
|
||||
},
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("Write(%d): %v", i, err)
|
||||
}
|
||||
}
|
||||
|
||||
all, err := db.LoadAll()
|
||||
if err != nil {
|
||||
t.Fatalf("LoadAll: %v", err)
|
||||
}
|
||||
if len(all) != 3 {
|
||||
t.Fatalf("LoadAll len=%d want 3", len(all))
|
||||
}
|
||||
for i, sample := range all {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
|
||||
}
|
||||
if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
|
||||
t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
|
||||
}
|
||||
}
|
||||
|
||||
recent, err := db.LoadRecent(2)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadRecent: %v", err)
|
||||
}
|
||||
if len(recent) != 2 {
|
||||
t.Fatalf("LoadRecent len=%d want 2", len(recent))
|
||||
}
|
||||
if !recent[0].Timestamp.Before(recent[1].Timestamp) {
|
||||
t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
|
||||
}
|
||||
for i, sample := range recent {
|
||||
if len(sample.GPUs) != 2 {
|
||||
t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -289,7 +289,7 @@ func renderAudit() string {
|
||||
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
data, err := loadSnapshot(opts.AuditPath)
|
||||
if err != nil {
|
||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-unknown">No audit data</span></div></div>`
|
||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><button class="btn btn-primary" onclick="auditModalRun()">▶ Run Audit</button></div></div>`
|
||||
}
|
||||
// Parse just enough fields for the summary banner
|
||||
var snap struct {
|
||||
@@ -522,26 +522,37 @@ func renderMetrics() string {
|
||||
</div>
|
||||
|
||||
<script>
|
||||
const chartIds = [
|
||||
'chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
|
||||
'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'
|
||||
];
|
||||
|
||||
function refreshChartImage(el) {
|
||||
if (!el || el.dataset.loading === '1') return;
|
||||
const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
|
||||
const nextSrc = baseSrc + '?t=' + Date.now();
|
||||
const probe = new Image();
|
||||
el.dataset.baseSrc = baseSrc;
|
||||
el.dataset.loading = '1';
|
||||
probe.onload = function() {
|
||||
el.src = nextSrc;
|
||||
el.dataset.loading = '0';
|
||||
};
|
||||
probe.onerror = function() {
|
||||
el.dataset.loading = '0';
|
||||
};
|
||||
probe.src = nextSrc;
|
||||
}
|
||||
|
||||
function refreshCharts() {
|
||||
const t = '?t=' + Date.now();
|
||||
['chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
|
||||
'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'].forEach(id => {
|
||||
const el = document.getElementById(id);
|
||||
if (el) el.src = el.src.split('?')[0] + t;
|
||||
});
|
||||
chartIds.forEach(id => refreshChartImage(document.getElementById(id)));
|
||||
}
|
||||
setInterval(refreshCharts, 3000);
|
||||
|
||||
const es = new EventSource('/api/metrics/stream');
|
||||
es.addEventListener('metrics', e => {
|
||||
const d = JSON.parse(e.data);
|
||||
|
||||
// Show/hide Fan RPM card based on data availability
|
||||
fetch('/api/metrics/latest').then(r => r.json()).then(d => {
|
||||
const fanCard = document.getElementById('card-server-fans');
|
||||
if (fanCard) fanCard.style.display = (d.fans && d.fans.length > 0) ? '' : 'none';
|
||||
|
||||
});
|
||||
es.onerror = () => {};
|
||||
}).catch(() => {});
|
||||
</script>`
|
||||
}
|
||||
|
||||
@@ -663,80 +674,210 @@ func renderSATCard(id, label, extra string) string {
|
||||
func renderBurn() string {
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at maximum load. Repeated or prolonged use may reduce hardware lifespan (storage endurance, GPU wear). Use only when necessary.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
<div class="card"><div class="card-head">Burn Profile</div><div class="card-body">
|
||||
<div class="form-row" style="max-width:320px"><label>Preset</label><select id="burn-profile"><option value="smoke">Smoke: 5 minutes</option><option value="acceptance">Acceptance: 1 hour</option><option value="overnight">Overnight: 8 hours</option></select></div>
|
||||
<p style="color:var(--muted);font-size:12px">Applied to all tests on this page. NVIDIA uses mapped DCGM levels: smoke=quick, acceptance=targeted stress, overnight=extended stress.</p>
|
||||
</div></div>
|
||||
<div class="grid3">
|
||||
<div class="card"><div class="card-head">NVIDIA GPU Stress</div><div class="card-body">
|
||||
<button id="sat-btn-nvidia" class="btn btn-primary" onclick="runBurnIn('nvidia')">▶ Start NVIDIA Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">CPU Stress</div><div class="card-body">
|
||||
<button class="btn btn-primary" onclick="runBurnIn('cpu')">▶ Start CPU Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">AMD GPU Stress</div><div class="card-body">
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs ROCm compute stress together with VRAM copy/load activity via RVS GST and records a separate <code>rocm-bandwidth-test</code> snapshot. Missing tools reported as UNSUPPORTED.</p>
|
||||
<button id="sat-btn-amd-stress" class="btn btn-primary" onclick="runBurnIn('amd-stress')">▶ Start AMD Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Memory Stress</div><div class="card-body">
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">stress-ng --vm writes and verifies memory patterns across all of RAM. Env: <code>BEE_VM_STRESS_SECONDS</code> (default 300), <code>BEE_VM_STRESS_SIZE_MB</code> (default 80%).</p>
|
||||
<button class="btn btn-primary" onclick="runBurnIn('memory-stress')">▶ Start Memory Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">SAT Stress (stressapptest)</div><div class="card-body">
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Google stressapptest saturates CPU, memory and cache buses simultaneously. Env: <code>BEE_SAT_STRESS_SECONDS</code> (default 300), <code>BEE_SAT_STRESS_MB</code> (default auto).</p>
|
||||
<button class="btn btn-primary" onclick="runBurnIn('sat-stress')">▶ Start SAT Stress</button>
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Platform Thermal Cycling</div><div class="card-body">
|
||||
<p style="color:var(--muted);font-size:12px;margin-bottom:8px">Runs CPU + GPU stress simultaneously across multiple load/idle cycles with varying durations. Detects cooling systems that fail to recover under repeated load cycles. Smoke: 2 cycles ~5 min. Acceptance: 4 cycles ~25 min.</p>
|
||||
<button class="btn btn-primary" onclick="runBurnIn('platform-stress')">▶ Start Thermal Cycling</button>
|
||||
</div></div>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Burn Profile</div>
|
||||
<div class="card-body" style="display:flex;align-items:center;gap:16px;flex-wrap:wrap">
|
||||
<div class="form-row" style="margin:0;max-width:380px"><label>Preset</label><select id="burn-profile">
|
||||
<option value="smoke" selected>Smoke — quick check (~5 min)</option>
|
||||
<option value="acceptance">Acceptance — 1 hour</option>
|
||||
<option value="overnight">Overnight — 8 hours</option>
|
||||
</select></div>
|
||||
<button class="btn btn-primary" onclick="runAll()">▶ Run All</button>
|
||||
<span id="burn-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3" style="margin-bottom:16px">
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">GPU Stress</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Tests run on all GPUs in the system. Availability determined by driver status.</p>
|
||||
<div id="gpu-tools-list">
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-bee" value="bee-gpu-burn" disabled><span>bee-gpu-burn <span class="cb-note" id="note-bee"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-john" value="john" disabled><span>John the Ripper (OpenCL) <span class="cb-note" id="note-john"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-nccl" value="nccl" disabled><span>NCCL all_reduce_perf <span class="cb-note" id="note-nccl"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-gpu-rvs" value="rvs" disabled><span>RVS GST (AMD) <span class="cb-note" id="note-rvs"></span></span></label>
|
||||
</div>
|
||||
<button class="btn btn-primary" style="margin-top:10px" onclick="runGPUStress()">▶ Run GPU Stress</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Compute Stress</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Select which subsystems to stress. Each checked item runs as a separate task.</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-cpu" checked><span>CPU stress (stress-ng)</span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-mem-stress" checked><span>Memory stress (stress-ng --vm)</span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-sat-stress"><span>stressapptest (CPU + memory bus)</span></label>
|
||||
<button class="btn btn-primary" style="margin-top:10px" onclick="runComputeStress()">▶ Run Compute Stress</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Platform Thermal Cycling</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">Repeated load+idle cycles. Detects cooling recovery failures and GPU throttle. Smoke: 2×90s. Acceptance: 4×300s.</p>
|
||||
<p style="font-size:12px;font-weight:600;margin:0 0 6px">Load components:</p>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-pt-cpu" checked><span>CPU (stressapptest)</span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-pt-nvidia" disabled><span>NVIDIA GPU <span class="cb-note" id="note-pt-nvidia"></span></span></label>
|
||||
<label class="cb-row"><input type="checkbox" id="burn-pt-amd" disabled><span>AMD GPU <span class="cb-note" id="note-pt-amd"></span></span></label>
|
||||
<button class="btn btn-primary" style="margin-top:10px" onclick="runPlatformStress()">▶ Run Thermal Cycling</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
|
||||
<style>
|
||||
.cb-row { display:flex; align-items:center; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||
.cb-row input[type=checkbox] { width:16px; height:16px; flex-shrink:0; }
|
||||
.cb-row input[type=checkbox]:disabled { opacity:0.4; cursor:not-allowed; }
|
||||
.cb-row input[type=checkbox]:disabled ~ span { opacity:0.45; cursor:not-allowed; }
|
||||
.cb-note { font-size:11px; color:var(--muted); font-style:italic; }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
let biES = null;
|
||||
function runBurnIn(target) {
|
||||
|
||||
function profile() { return document.getElementById('burn-profile').value || 'smoke'; }
|
||||
|
||||
function enqueueTask(target, extra) {
|
||||
const body = Object.assign({ profile: profile() }, extra || {});
|
||||
return fetch('/api/sat/'+target+'/run', {
|
||||
method: 'POST', headers: {'Content-Type':'application/json'}, body: JSON.stringify(body)
|
||||
}).then(r => r.json());
|
||||
}
|
||||
|
||||
function streamTask(taskId, label) {
|
||||
if (biES) { biES.close(); biES = null; }
|
||||
const body = { profile: document.getElementById('burn-profile').value || 'smoke' };
|
||||
document.getElementById('bi-output').style.display='block';
|
||||
document.getElementById('bi-title').textContent = '— ' + target + ' [' + body.profile + ']';
|
||||
document.getElementById('bi-output').style.display = 'block';
|
||||
document.getElementById('bi-title').textContent = '— ' + label + ' [' + profile() + ']';
|
||||
const term = document.getElementById('bi-terminal');
|
||||
term.textContent = 'Enqueuing ' + target + ' stress...\n';
|
||||
fetch('/api/sat/'+target+'/run', {method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify(body)})
|
||||
.then(r => r.json())
|
||||
.then(d => {
|
||||
term.textContent += 'Task ' + d.task_id + ' queued.\n';
|
||||
biES = new EventSource('/api/tasks/'+d.task_id+'/stream');
|
||||
biES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
||||
biES.addEventListener('done', e => { biES.close(); biES=null; term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n'; });
|
||||
});
|
||||
term.textContent = 'Task ' + taskId + ' queued. Streaming...\n';
|
||||
biES = new EventSource('/api/tasks/'+taskId+'/stream');
|
||||
biES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop = term.scrollHeight; };
|
||||
biES.addEventListener('done', e => {
|
||||
biES.close(); biES = null;
|
||||
term.textContent += (e.data ? '\nERROR: '+e.data : '\nCompleted.')+'\n';
|
||||
});
|
||||
}
|
||||
</script>
|
||||
<script>
|
||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd-stress', 'No AMD GPU detected');
|
||||
});
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
if (!btn) return;
|
||||
btn.disabled = true;
|
||||
btn.title = reason;
|
||||
btn.style.opacity = '0.4';
|
||||
const card = btn.closest('.card');
|
||||
if (card) {
|
||||
let note = card.querySelector('.sat-unavail');
|
||||
if (!note) {
|
||||
note = document.createElement('p');
|
||||
note.className = 'sat-unavail';
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin-top:6px';
|
||||
btn.parentNode.insertBefore(note, btn.nextSibling);
|
||||
}
|
||||
note.textContent = reason;
|
||||
|
||||
function runGPUStress() {
|
||||
const ids = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
|
||||
const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
|
||||
const targetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
|
||||
let last = null;
|
||||
ids.filter(id => {
|
||||
const el = document.getElementById(id);
|
||||
return el && el.checked && !el.disabled;
|
||||
}).forEach(id => {
|
||||
const target = targetMap[id];
|
||||
const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
|
||||
enqueueTask(target, extra).then(d => { last = d; streamTask(d.task_id, target + ' / ' + loaderMap[id]); });
|
||||
});
|
||||
}
|
||||
|
||||
function runComputeStress() {
|
||||
const tasks = [
|
||||
{id:'burn-cpu', target:'cpu'},
|
||||
{id:'burn-mem-stress', target:'memory-stress'},
|
||||
{id:'burn-sat-stress', target:'sat-stress'},
|
||||
];
|
||||
let last = null;
|
||||
tasks.filter(t => {
|
||||
const el = document.getElementById(t.id);
|
||||
return el && el.checked;
|
||||
}).forEach(t => {
|
||||
enqueueTask(t.target).then(d => { last = d; streamTask(d.task_id, t.target); });
|
||||
});
|
||||
}
|
||||
|
||||
function runPlatformStress() {
|
||||
const comps = [];
|
||||
if (document.getElementById('burn-pt-cpu').checked) comps.push('cpu');
|
||||
const nv = document.getElementById('burn-pt-nvidia');
|
||||
if (nv && nv.checked && !nv.disabled) comps.push('gpu');
|
||||
const am = document.getElementById('burn-pt-amd');
|
||||
if (am && am.checked && !am.disabled) comps.push('gpu');
|
||||
const extra = comps.length > 0 ? {platform_components: comps} : {};
|
||||
enqueueTask('platform-stress', extra).then(d => streamTask(d.task_id, 'platform-stress'));
|
||||
}
|
||||
|
||||
function runAll() {
|
||||
const status = document.getElementById('burn-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
let count = 0;
|
||||
const done = () => { count++; status.textContent = count + ' tasks queued.'; };
|
||||
|
||||
// GPU tests
|
||||
const gpuIds = ['burn-gpu-bee','burn-gpu-john','burn-gpu-nccl','burn-gpu-rvs'];
|
||||
const loaderMap = {'burn-gpu-bee':'builtin','burn-gpu-john':'john','burn-gpu-nccl':'nccl','burn-gpu-rvs':'rvs'};
|
||||
const gpuTargetMap = {'burn-gpu-bee':'nvidia-stress','burn-gpu-john':'nvidia-stress','burn-gpu-nccl':'nvidia-stress','burn-gpu-rvs':'amd-stress'};
|
||||
gpuIds.filter(id => { const el = document.getElementById(id); return el && el.checked && !el.disabled; }).forEach(id => {
|
||||
const target = gpuTargetMap[id];
|
||||
const extra = target === 'nvidia-stress' ? {loader: loaderMap[id]} : {};
|
||||
enqueueTask(target, extra).then(d => { streamTask(d.task_id, target); done(); });
|
||||
});
|
||||
|
||||
// Compute tests
|
||||
[{id:'burn-cpu',target:'cpu'},{id:'burn-mem-stress',target:'memory-stress'},{id:'burn-sat-stress',target:'sat-stress'}]
|
||||
.filter(t => { const el = document.getElementById(t.id); return el && el.checked; })
|
||||
.forEach(t => enqueueTask(t.target).then(d => { streamTask(d.task_id, t.target); done(); }));
|
||||
|
||||
// Platform
|
||||
const comps = [];
|
||||
if (document.getElementById('burn-pt-cpu').checked) comps.push('cpu');
|
||||
const nv = document.getElementById('burn-pt-nvidia');
|
||||
if (nv && nv.checked && !nv.disabled) comps.push('gpu');
|
||||
const am = document.getElementById('burn-pt-amd');
|
||||
if (am && am.checked && !am.disabled) comps.push('gpu');
|
||||
const ptExtra = comps.length > 0 ? {platform_components: comps} : {};
|
||||
enqueueTask('platform-stress', ptExtra).then(d => { streamTask(d.task_id, 'platform-stress'); done(); });
|
||||
}
|
||||
|
||||
// Load GPU tool availability
|
||||
fetch('/api/gpu/tools').then(r => r.json()).then(tools => {
|
||||
const nvidiaMap = {'bee-gpu-burn':'burn-gpu-bee','john':'burn-gpu-john','nccl':'burn-gpu-nccl','rvs':'burn-gpu-rvs'};
|
||||
const noteMap = {'bee-gpu-burn':'note-bee','john':'note-john','nccl':'note-nccl','rvs':'note-rvs'};
|
||||
tools.forEach(t => {
|
||||
const cb = document.getElementById(nvidiaMap[t.id]);
|
||||
const note = document.getElementById(noteMap[t.id]);
|
||||
if (!cb) return;
|
||||
if (t.available) {
|
||||
cb.disabled = false;
|
||||
if (t.id === 'bee-gpu-burn') cb.checked = true;
|
||||
} else {
|
||||
const reason = t.vendor === 'nvidia' ? 'NVIDIA driver not running' : 'AMD driver not running';
|
||||
if (note) note.textContent = '— ' + reason;
|
||||
}
|
||||
}
|
||||
});
|
||||
}).catch(() => {});
|
||||
|
||||
// Load GPU presence for platform thermal cycling
|
||||
fetch('/api/gpu/presence').then(r => r.json()).then(gp => {
|
||||
const nvCb = document.getElementById('burn-pt-nvidia');
|
||||
const amCb = document.getElementById('burn-pt-amd');
|
||||
const nvNote = document.getElementById('note-pt-nvidia');
|
||||
const amNote = document.getElementById('note-pt-amd');
|
||||
if (gp.nvidia) {
|
||||
nvCb.disabled = false;
|
||||
nvCb.checked = true;
|
||||
} else {
|
||||
if (nvNote) nvNote.textContent = '— NVIDIA driver not running';
|
||||
}
|
||||
if (gp.amd) {
|
||||
amCb.disabled = false;
|
||||
amCb.checked = true;
|
||||
} else {
|
||||
if (amNote) amNote.textContent = '— AMD driver not running';
|
||||
}
|
||||
}).catch(() => {});
|
||||
</script>`
|
||||
}
|
||||
|
||||
@@ -768,6 +909,8 @@ func renderNetworkInline() string {
|
||||
</div>
|
||||
<script>
|
||||
var _netCountdownTimer = null;
|
||||
var _netRefreshTimer = null;
|
||||
const NET_ROLLBACK_SECS = 60;
|
||||
function loadNetwork() {
|
||||
fetch('/api/network').then(r=>r.json()).then(d => {
|
||||
const rows = (d.interfaces||[]).map(i =>
|
||||
@@ -778,21 +921,33 @@ function loadNetwork() {
|
||||
document.getElementById('iface-table').innerHTML =
|
||||
'<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
|
||||
(d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
|
||||
});
|
||||
if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||
else hideNetPending();
|
||||
}).catch(function() {});
|
||||
}
|
||||
function selectIface(iface) {
|
||||
document.getElementById('dhcp-iface').value = iface;
|
||||
document.getElementById('st-iface').value = iface;
|
||||
}
|
||||
function toggleIface(iface, currentState) {
|
||||
showNetPending(NET_ROLLBACK_SECS);
|
||||
fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
|
||||
.then(r=>r.json()).then(d => {
|
||||
if (d.error) { alert('Error: '+d.error); return; }
|
||||
if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
|
||||
loadNetwork();
|
||||
showNetPending(d.rollback_in || 60);
|
||||
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||
}).catch(function() {
|
||||
setTimeout(loadNetwork, 1500);
|
||||
});
|
||||
}
|
||||
function hideNetPending() {
|
||||
const el = document.getElementById('net-pending');
|
||||
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
||||
_netCountdownTimer = null;
|
||||
el.style.display = 'none';
|
||||
}
|
||||
function showNetPending(secs) {
|
||||
if (!secs || secs < 1) { hideNetPending(); return; }
|
||||
const el = document.getElementById('net-pending');
|
||||
el.style.display = 'block';
|
||||
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
||||
@@ -801,30 +956,33 @@ function showNetPending(secs) {
|
||||
_netCountdownTimer = setInterval(function() {
|
||||
remaining--;
|
||||
document.getElementById('net-countdown').textContent = remaining;
|
||||
if (remaining <= 0) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; el.style.display='none'; loadNetwork(); }
|
||||
if (remaining <= 0) { hideNetPending(); loadNetwork(); }
|
||||
}, 1000);
|
||||
}
|
||||
function confirmNetChange() {
|
||||
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; }
|
||||
document.getElementById('net-pending').style.display='none';
|
||||
fetch('/api/network/confirm',{method:'POST'});
|
||||
hideNetPending();
|
||||
fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
|
||||
}
|
||||
function rollbackNetChange() {
|
||||
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; }
|
||||
document.getElementById('net-pending').style.display='none';
|
||||
fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork());
|
||||
hideNetPending();
|
||||
fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
|
||||
}
|
||||
function runDHCP() {
|
||||
const iface = document.getElementById('dhcp-iface').value.trim();
|
||||
showNetPending(NET_ROLLBACK_SECS);
|
||||
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
|
||||
.then(r=>r.json()).then(d => {
|
||||
document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
|
||||
if (!d.error) showNetPending(d.rollback_in || 60);
|
||||
if (d.error) { hideNetPending(); return; }
|
||||
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||
loadNetwork();
|
||||
}).catch(function() {
|
||||
setTimeout(loadNetwork, 1500);
|
||||
});
|
||||
}
|
||||
function setStatic() {
|
||||
const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
|
||||
showNetPending(NET_ROLLBACK_SECS);
|
||||
fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
|
||||
interface: document.getElementById('st-iface').value,
|
||||
address: document.getElementById('st-addr').value,
|
||||
@@ -833,11 +991,16 @@ function setStatic() {
|
||||
dns: dns,
|
||||
})}).then(r=>r.json()).then(d => {
|
||||
document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
|
||||
if (!d.error) showNetPending(d.rollback_in || 60);
|
||||
if (d.error) { hideNetPending(); return; }
|
||||
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||
loadNetwork();
|
||||
}).catch(function() {
|
||||
setTimeout(loadNetwork, 1500);
|
||||
});
|
||||
}
|
||||
loadNetwork();
|
||||
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
|
||||
_netRefreshTimer = setInterval(loadNetwork, 5000);
|
||||
</script>`
|
||||
}
|
||||
|
||||
@@ -850,7 +1013,7 @@ func renderNetwork() string {
|
||||
// ── Services ──────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderServicesInline() string {
|
||||
return `<div style="display:flex;justify-content:flex-end;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
return `<div style="display:flex;justify-content:flex-end;gap:8px;flex-wrap:wrap;margin-bottom:8px"><button class="btn btn-sm btn-secondary" onclick="restartGPUDrivers()">Restart GPU Drivers</button><button class="btn btn-sm btn-secondary" onclick="loadServices()">↻ Refresh</button></div>
|
||||
<div id="svc-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<div id="svc-out" style="display:none;margin-top:8px" class="card">
|
||||
<div class="card-head">Output</div>
|
||||
@@ -891,6 +1054,9 @@ function svcAction(name, action) {
|
||||
setTimeout(loadServices, 1000);
|
||||
});
|
||||
}
|
||||
function restartGPUDrivers() {
|
||||
svcAction('bee-nvidia', 'restart');
|
||||
}
|
||||
loadServices();
|
||||
</script>`
|
||||
}
|
||||
@@ -916,7 +1082,7 @@ func renderExport(exportDir string) string {
|
||||
return `<div class="grid2">
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Creates a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
<a class="btn btn-primary" href="/export/support.tar.gz">↓ Download Support Bundle</a>
|
||||
` + renderSupportBundleInline() + `
|
||||
</div></div>
|
||||
<div class="card"><div class="card-head">Export Files</div><div class="card-body">
|
||||
<table><tr><th>File</th></tr>` + rows.String() + `</table>
|
||||
@@ -1014,6 +1180,50 @@ func listExportFiles(exportDir string) ([]string, error) {
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// renderSupportBundleInline returns the HTML fragment for the support-bundle
// download control: a button, a status line, and the inline script that
// defines window.supportBundleDownload.
//
// The script fetches /export/support.tar.gz, takes the file name from the
// Content-Disposition response header when one is present (falling back to
// "bee-support.tar.gz"), and hands the resulting blob to the browser through
// a temporary object URL and a synthetic <a download> click. The button is
// disabled and relabelled "Building..." while the request is in flight; the
// finally handler re-enables it and restores the same "↓" (U+2193) label the
// button is rendered with.
func renderSupportBundleInline() string {
	return `<button id="support-bundle-btn" class="btn btn-primary" onclick="supportBundleDownload()">↓ Download Support Bundle</button>
<div id="support-bundle-status" style="margin-top:10px;font-size:13px;color:var(--muted)"></div>
<script>
window.supportBundleDownload = function() {
  var btn = document.getElementById('support-bundle-btn');
  var status = document.getElementById('support-bundle-status');
  btn.disabled = true;
  btn.textContent = 'Building...';
  status.textContent = 'Collecting logs and export data\u2026';
  status.style.color = 'var(--muted)';
  var filename = 'bee-support.tar.gz';
  fetch('/export/support.tar.gz')
    .then(function(r) {
      if (!r.ok) throw new Error('HTTP ' + r.status);
      var cd = r.headers.get('Content-Disposition') || '';
      var m = cd.match(/filename="?([^";]+)"?/);
      if (m) filename = m[1];
      return r.blob();
    })
    .then(function(blob) {
      var url = URL.createObjectURL(blob);
      var a = document.createElement('a');
      a.href = url;
      a.download = filename;
      document.body.appendChild(a);
      a.click();
      document.body.removeChild(a);
      URL.revokeObjectURL(url);
      status.textContent = 'Download started.';
      status.style.color = 'var(--ok-fg)';
    })
    .catch(function(e) {
      status.textContent = 'Error: ' + e.message;
      status.style.color = 'var(--crit-fg)';
    })
    .finally(function() {
      btn.disabled = false;
      btn.textContent = '\u2193 Download Support Bundle';
    });
};
</script>`
}
|
||||
|
||||
// ── Display Resolution ────────────────────────────────────────────────────────
|
||||
|
||||
func renderDisplayInline() string {
|
||||
@@ -1072,6 +1282,7 @@ func renderTools() string {
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
@@ -1083,8 +1294,18 @@ func renderTools() string {
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let kind = d.kind || 'unknown';
|
||||
let label = source;
|
||||
if (kind === 'ram') label = 'RAM';
|
||||
else if (kind === 'usb') label = 'USB (' + source + ')';
|
||||
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
|
||||
else if (kind === 'disk') label = 'disk (' + source + ')';
|
||||
else label = source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
if (d.in_ram) {
|
||||
txt.textContent = '✓ Running from RAM — installation media can be safely disconnected.';
|
||||
txt.style.color = 'var(--ok, green)';
|
||||
@@ -1103,7 +1324,7 @@ function installToRAM() {
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
<a class="btn btn-primary" href="/export/support.tar.gz">↓ Download Support Bundle</a>
|
||||
` + renderSupportBundleInline() + `
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
@@ -1282,21 +1503,23 @@ function installStart() {
|
||||
headers: {'Content-Type': 'application/json'},
|
||||
body: JSON.stringify({device: _installSelected.device})
|
||||
}).then(function(r){
|
||||
if (r.status === 204) {
|
||||
installStreamLog();
|
||||
} else {
|
||||
return r.json().then(function(j){ throw new Error(j.error || r.statusText); });
|
||||
}
|
||||
return r.json().then(function(j){
|
||||
if (!r.ok) throw new Error(j.error || r.statusText);
|
||||
return j;
|
||||
});
|
||||
}).then(function(j){
|
||||
if (!j.task_id) throw new Error('missing task id');
|
||||
installStreamLog(j.task_id);
|
||||
}).catch(function(e){
|
||||
status.textContent = 'Error: ' + e;
|
||||
status.style.color = 'var(--crit-fg)';
|
||||
});
|
||||
}
|
||||
|
||||
function installStreamLog() {
|
||||
function installStreamLog(taskId) {
|
||||
var term = document.getElementById('install-terminal');
|
||||
var status = document.getElementById('install-status');
|
||||
var es = new EventSource('/api/install/stream');
|
||||
var es = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
es.onmessage = function(e) {
|
||||
term.textContent += e.data + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
@@ -1341,31 +1564,47 @@ func renderInstall() string {
|
||||
// ── Tasks ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
func renderTasks() string {
|
||||
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
|
||||
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
|
||||
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
|
||||
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">Tasks run one at a time. Logs persist after navigation.</span>
|
||||
</div>
|
||||
<div class="card">
|
||||
<div id="tasks-table"><p style="color:var(--muted);font-size:13px;padding:16px">Loading...</p></div>
|
||||
</div>
|
||||
<div id="task-log-section" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Logs — <span id="task-log-title"></span>
|
||||
<button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">✕</button>
|
||||
<div id="task-log-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.58);z-index:120;align-items:center;justify-content:center;padding:16px">
|
||||
<div style="background:#fff;border-radius:6px;box-shadow:0 24px 60px rgba(0,0,0,.35);width:calc(100vw - 32px);max-width:1600px;height:calc(100vh - 32px);display:flex;flex-direction:column;overflow:hidden;position:relative">
|
||||
<div class="card-head" style="padding:14px 18px;font-size:14px">Logs — <span id="task-log-title"></span>
|
||||
<button class="btn btn-sm btn-secondary" onclick="closeTaskLog()" style="margin-left:auto">✕</button>
|
||||
</div>
|
||||
<div class="card-body" style="padding:16px;flex:1;min-height:0"><div id="task-log-terminal" class="terminal" style="height:100%;max-height:none"></div></div>
|
||||
</div>
|
||||
<div class="card-body"><div id="task-log-terminal" class="terminal" style="max-height:500px"></div></div>
|
||||
</div>
|
||||
<script>
|
||||
var _taskLogES = null;
|
||||
var _taskRefreshTimer = null;
|
||||
var _tasksAll = [];
|
||||
var _taskPage = 1;
|
||||
var _taskPageSize = 50;
|
||||
var _taskLogID = '';
|
||||
|
||||
function loadTasks() {
|
||||
fetch('/api/tasks').then(r=>r.json()).then(tasks => {
|
||||
if (!tasks || tasks.length === 0) {
|
||||
_tasksAll = Array.isArray(tasks) ? tasks : [];
|
||||
if (_tasksAll.length === 0) {
|
||||
_taskPage = 1;
|
||||
document.getElementById('tasks-table').innerHTML = '<p style="color:var(--muted);font-size:13px;padding:16px">No tasks.</p>';
|
||||
syncTaskLogFromHash();
|
||||
return;
|
||||
}
|
||||
const rows = tasks.map(t => {
|
||||
const dur = t.started_at ? formatDur(t.started_at, t.done_at) : '';
|
||||
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
|
||||
if (_taskPage > totalPages) _taskPage = totalPages;
|
||||
if (_taskPage < 1) _taskPage = 1;
|
||||
const start = (_taskPage - 1) * _taskPageSize;
|
||||
const pageTasks = _tasksAll.slice(start, start + _taskPageSize);
|
||||
const rows = pageTasks.map(t => {
|
||||
const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
|
||||
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
|
||||
const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
|
||||
let actions = '<button class="btn btn-sm btn-secondary" onclick="viewLog(\''+t.id+'\',\''+escHtml(t.name)+'\')">Logs</button>';
|
||||
@@ -1383,21 +1622,35 @@ function loadTasks() {
|
||||
'<td>'+t.priority+'</td>' +
|
||||
'<td>'+actions+'</td></tr>';
|
||||
}).join('');
|
||||
const showingFrom = start + 1;
|
||||
const showingTo = Math.min(start + pageTasks.length, _tasksAll.length);
|
||||
const pager =
|
||||
'<div style="display:flex;align-items:center;justify-content:space-between;gap:12px;flex-wrap:wrap;padding:12px 14px;border-top:1px solid var(--border-lite);background:var(--surface-2)">' +
|
||||
'<div style="font-size:12px;color:var(--muted)">Showing '+showingFrom+'-'+showingTo+' of '+_tasksAll.length+' tasks</div>' +
|
||||
'<div style="display:flex;align-items:center;gap:8px">' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage-1)+')" '+(_taskPage <= 1 ? 'disabled' : '')+'>Previous</button>' +
|
||||
'<span style="font-size:12px;color:var(--muted)">Page '+_taskPage+' / '+totalPages+'</span>' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="setTaskPage('+(_taskPage+1)+')" '+(_taskPage >= totalPages ? 'disabled' : '')+'>Next</button>' +
|
||||
'</div>' +
|
||||
'</div>';
|
||||
document.getElementById('tasks-table').innerHTML =
|
||||
'<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>';
|
||||
'<table><tr><th>Name</th><th>Status</th><th>Created</th><th>Duration</th><th>Priority</th><th>Actions</th></tr>'+rows+'</table>' + pager;
|
||||
syncTaskLogFromHash();
|
||||
});
|
||||
}
|
||||
|
||||
// Escape &, <, > and " so arbitrary text can be embedded in HTML markup.
// Falsy input (null, undefined, '') yields an empty string.
function escHtml(s) { return String(s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
|
||||
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
|
||||
function formatDur(start, end) {
|
||||
try {
|
||||
const s = new Date(start), e = end ? new Date(end) : new Date();
|
||||
const sec = Math.round((e-s)/1000);
|
||||
if (sec < 60) return sec+'s';
|
||||
const m = Math.floor(sec/60), ss = sec%60;
|
||||
return m+'m '+ss+'s';
|
||||
} catch(e){ return ''; }
|
||||
// Format a duration given in seconds as "42s" (under a minute) or "3m 5s".
// The value is rounded to whole seconds; negative, missing or non-numeric
// input is treated as 0 so the table never shows "NaNm NaNs".
function formatDurSec(sec) {
  sec = Number(sec);
  if (!isFinite(sec)) sec = 0;
  sec = Math.max(0, Math.round(sec));
  if (sec < 60) return sec+'s';
  const m = Math.floor(sec/60), ss = sec%60;
  return m+'m '+ss+'s';
}
|
||||
function setTaskPage(page) {
|
||||
const totalPages = Math.max(1, Math.ceil(_tasksAll.length / _taskPageSize));
|
||||
_taskPage = Math.min(totalPages, Math.max(1, page));
|
||||
loadTasks();
|
||||
}
|
||||
|
||||
function cancelTask(id) {
|
||||
@@ -1406,28 +1659,78 @@ function cancelTask(id) {
|
||||
function cancelAll() {
|
||||
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
|
||||
}
|
||||
function killWorkers() {
|
||||
if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
|
||||
fetch('/api/tasks/kill-workers',{method:'POST'})
|
||||
.then(r=>r.json())
|
||||
.then(d=>{
|
||||
loadTasks();
|
||||
var toast = document.getElementById('kill-toast');
|
||||
var parts = [];
|
||||
if (d.cancelled > 0) parts.push(d.cancelled+' task'+(d.cancelled===1?'':'s')+' cancelled');
|
||||
if (d.killed > 0) parts.push(d.killed+' process'+(d.killed===1?'':'es')+' killed');
|
||||
toast.textContent = parts.length ? parts.join(', ')+'.' : 'No processes found.';
|
||||
toast.style.display = '';
|
||||
setTimeout(()=>{ toast.style.display='none'; }, 5000);
|
||||
});
|
||||
}
|
||||
function setPriority(id, delta) {
|
||||
fetch('/api/tasks/'+id+'/priority',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({delta:delta})})
|
||||
.then(()=>loadTasks());
|
||||
}
|
||||
function resetTaskLog(term, text) {
|
||||
term.textContent = text ? text + '\n' : '';
|
||||
if (text) term.dataset.placeholder = '1';
|
||||
else delete term.dataset.placeholder;
|
||||
}
|
||||
function prependTaskLogLine(term, line) {
|
||||
if (term.dataset.placeholder === '1') {
|
||||
term.textContent = '';
|
||||
delete term.dataset.placeholder;
|
||||
}
|
||||
term.prepend(document.createTextNode(line + '\n'));
|
||||
term.scrollTop = 0;
|
||||
}
|
||||
function viewLog(id, name) {
|
||||
if (_taskLogES) { _taskLogES.close(); _taskLogES = null; }
|
||||
document.getElementById('task-log-section').style.display = '';
|
||||
_taskLogID = id;
|
||||
window.location.hash = id;
|
||||
document.getElementById('task-log-overlay').style.display = 'flex';
|
||||
document.getElementById('task-log-title').textContent = name;
|
||||
const term = document.getElementById('task-log-terminal');
|
||||
term.textContent = 'Connecting...\n';
|
||||
resetTaskLog(term, 'Connecting...');
|
||||
_taskLogES = new EventSource('/api/tasks/'+id+'/stream');
|
||||
_taskLogES.onmessage = e => { term.textContent += e.data+'\n'; term.scrollTop=term.scrollHeight; };
|
||||
_taskLogES.onopen = () => {
|
||||
if (term.dataset.placeholder === '1') resetTaskLog(term, 'Connected. Waiting for output...');
|
||||
};
|
||||
_taskLogES.onmessage = e => { prependTaskLogLine(term, e.data); };
|
||||
_taskLogES.addEventListener('done', e => {
|
||||
_taskLogES.close(); _taskLogES=null;
|
||||
term.textContent += (e.data ? '\nERROR: '+e.data : '\nDone.')+'\n';
|
||||
prependTaskLogLine(term, e.data ? 'ERROR: '+e.data : 'Done.');
|
||||
});
|
||||
}
|
||||
function syncTaskLogFromHash() {
|
||||
const id = (window.location.hash || '').replace(/^#/, '');
|
||||
if (!id || id === _taskLogID) return;
|
||||
const task = _tasksAll.find(t => t.id === id);
|
||||
if (!task) return;
|
||||
viewLog(task.id, task.name || task.id);
|
||||
}
|
||||
function closeTaskLog() {
|
||||
if (_taskLogES) { _taskLogES.close(); _taskLogES=null; }
|
||||
document.getElementById('task-log-section').style.display='none';
|
||||
_taskLogID = '';
|
||||
if (window.location.hash) history.replaceState(null, '', '/tasks');
|
||||
document.getElementById('task-log-overlay').style.display='none';
|
||||
}
|
||||
|
||||
document.getElementById('task-log-overlay').addEventListener('click', function(e) {
|
||||
if (e.target === this) closeTaskLog();
|
||||
});
|
||||
window.addEventListener('hashchange', syncTaskLogFromHash);
|
||||
window.addEventListener('keydown', function(e) {
|
||||
if (e.key === 'Escape' && document.getElementById('task-log-overlay').style.display !== 'none') closeTaskLog();
|
||||
});
|
||||
|
||||
loadTasks();
|
||||
_taskRefreshTimer = setInterval(loadTasks, 2000);
|
||||
</script>`
|
||||
|
||||
@@ -5,10 +5,12 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"html"
|
||||
"log/slog"
|
||||
"mime"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
@@ -83,6 +85,15 @@ func (r *metricsRing) snapshot() ([]float64, []string) {
|
||||
return v, labels
|
||||
}
|
||||
|
||||
func (r *metricsRing) latest() (float64, bool) {
|
||||
r.mu.Lock()
|
||||
defer r.mu.Unlock()
|
||||
if len(r.vals) == 0 {
|
||||
return 0, false
|
||||
}
|
||||
return r.vals[len(r.vals)-1], true
|
||||
}
|
||||
|
||||
func timestampsSameLocalDay(times []time.Time) bool {
|
||||
if len(times) == 0 {
|
||||
return true
|
||||
@@ -117,9 +128,16 @@ type namedMetricsRing struct {
|
||||
Ring *metricsRing
|
||||
}
|
||||
|
||||
// metricsChartWindow is the number of samples kept in the live ring buffer.
|
||||
// At metricsCollectInterval = 5 s this covers 30 minutes of live history.
|
||||
const metricsChartWindow = 360
|
||||
|
||||
var metricsCollectInterval = 5 * time.Second
|
||||
|
||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||
type pendingNetChange struct {
|
||||
snapshot platform.NetworkSnapshot
|
||||
deadline time.Time
|
||||
timer *time.Timer
|
||||
mu sync.Mutex
|
||||
}
|
||||
@@ -143,12 +161,11 @@ type handler struct {
|
||||
latest *platform.LiveMetricSample
|
||||
// metrics persistence (nil if DB unavailable)
|
||||
metricsDB *MetricsDB
|
||||
// install job (at most one at a time)
|
||||
installJob *jobState
|
||||
installMu sync.Mutex
|
||||
// pending network change (rollback on timeout)
|
||||
pendingNet *pendingNetChange
|
||||
pendingNetMu sync.Mutex
|
||||
// kmsg hardware error watcher
|
||||
kmsg *kmsgWatcher
|
||||
}
|
||||
|
||||
// NewHandler creates the HTTP mux with all routes.
|
||||
@@ -173,17 +190,28 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Open metrics DB and pre-fill ring buffers from history.
|
||||
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
||||
h.metricsDB = db
|
||||
if samples, err := db.LoadRecent(120); err == nil {
|
||||
if samples, err := db.LoadRecent(metricsChartWindow); err == nil {
|
||||
for _, s := range samples {
|
||||
h.feedRings(s)
|
||||
}
|
||||
if len(samples) > 0 {
|
||||
h.setLatestMetric(samples[len(samples)-1])
|
||||
}
|
||||
} else {
|
||||
slog.Warn("metrics history unavailable", "path", metricsDBPath, "err", err)
|
||||
}
|
||||
} else {
|
||||
slog.Warn("metrics db disabled", "path", metricsDBPath, "err", err)
|
||||
}
|
||||
h.startMetricsCollector()
|
||||
|
||||
// Start kmsg hardware error watcher if the app (and its status DB) is available.
|
||||
if opts.App != nil {
|
||||
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
||||
h.kmsg.start()
|
||||
globalQueue.kmsgWatcher = h.kmsg
|
||||
}
|
||||
|
||||
globalQueue.startWorker(&opts)
|
||||
mux := http.NewServeMux()
|
||||
|
||||
@@ -206,6 +234,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
|
||||
// SAT
|
||||
mux.HandleFunc("POST /api/sat/nvidia/run", h.handleAPISATRun("nvidia"))
|
||||
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
|
||||
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
||||
@@ -222,6 +251,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Tasks
|
||||
mux.HandleFunc("GET /api/tasks", h.handleAPITasksList)
|
||||
mux.HandleFunc("POST /api/tasks/cancel-all", h.handleAPITasksCancelAll)
|
||||
mux.HandleFunc("POST /api/tasks/kill-workers", h.handleAPITasksKillWorkers)
|
||||
mux.HandleFunc("POST /api/tasks/{id}/cancel", h.handleAPITasksCancel)
|
||||
mux.HandleFunc("POST /api/tasks/{id}/priority", h.handleAPITasksPriority)
|
||||
mux.HandleFunc("GET /api/tasks/{id}/stream", h.handleAPITasksStream)
|
||||
@@ -240,7 +270,6 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
|
||||
// Export
|
||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||
mux.HandleFunc("POST /api/export/bundle", h.handleAPIExportBundle)
|
||||
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
||||
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
|
||||
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
|
||||
@@ -252,8 +281,9 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
||||
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
||||
|
||||
// GPU presence
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
mux.HandleFunc("GET /api/gpu/tools", h.handleAPIGPUTools)
|
||||
|
||||
// System
|
||||
mux.HandleFunc("GET /api/system/ram-status", h.handleAPIRAMStatus)
|
||||
@@ -265,10 +295,10 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Install
|
||||
mux.HandleFunc("GET /api/install/disks", h.handleAPIInstallDisks)
|
||||
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
|
||||
mux.HandleFunc("GET /api/install/stream", h.handleAPIInstallStream)
|
||||
|
||||
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
|
||||
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
||||
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
|
||||
mux.HandleFunc("GET /api/metrics/chart/", h.handleMetricsChartSVG)
|
||||
mux.HandleFunc("GET /api/metrics/export.csv", h.handleAPIMetricsExportCSV)
|
||||
|
||||
@@ -284,15 +314,15 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
|
||||
func (h *handler) startMetricsCollector() {
|
||||
go func() {
|
||||
ticker := time.NewTicker(1 * time.Second)
|
||||
ticker := time.NewTicker(metricsCollectInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
sample := platform.SampleLiveMetrics()
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
}
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
}
|
||||
}()
|
||||
}
|
||||
@@ -369,6 +399,7 @@ func (h *handler) handleSupportBundleDownload(w http.ResponseWriter, r *http.Req
|
||||
http.Error(w, fmt.Sprintf("build support bundle: %v", err), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
defer os.Remove(archive)
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "application/gzip")
|
||||
w.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=%q", filepath.Base(archive)))
|
||||
@@ -440,221 +471,13 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
||||
path = strings.TrimSuffix(path, ".svg")
|
||||
|
||||
if h.metricsDB != nil {
|
||||
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
|
||||
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "image/svg+xml")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(buf)
|
||||
return
|
||||
}
|
||||
if h.metricsDB == nil {
|
||||
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var labels []string
|
||||
var title string
|
||||
var yMin, yMax *float64 // nil = auto; for load charts fixed 0-100
|
||||
|
||||
switch {
|
||||
// ── Server sub-charts ─────────────────────────────────────────────────
|
||||
case path == "server-load":
|
||||
title = "CPU / Memory Load"
|
||||
vCPULoad, l := h.ringCPULoad.snapshot()
|
||||
vMemLoad, _ := h.ringMemLoad.snapshot()
|
||||
labels = l
|
||||
datasets = [][]float64{vCPULoad, vMemLoad}
|
||||
names = []string{"CPU Load %", "Mem Load %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "server-temp", path == "server-temp-cpu":
|
||||
title = "CPU Temperature"
|
||||
h.ringsMu.Lock()
|
||||
datasets, names, labels = snapshotNamedRings(h.cpuTempRings)
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-temp-gpu":
|
||||
title = "GPU Temperature"
|
||||
h.ringsMu.Lock()
|
||||
for idx, gr := range h.gpuRings {
|
||||
if gr == nil {
|
||||
continue
|
||||
}
|
||||
vTemp, l := gr.Temp.snapshot()
|
||||
datasets = append(datasets, vTemp)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-temp-ambient":
|
||||
title = "Ambient / Other Sensors"
|
||||
h.ringsMu.Lock()
|
||||
datasets, names, labels = snapshotNamedRings(h.ambientTempRings)
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-power":
|
||||
title = "System Power"
|
||||
vPower, l := h.ringPower.snapshot()
|
||||
labels = l
|
||||
datasets = [][]float64{vPower}
|
||||
names = []string{"Power W"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(vPower)
|
||||
|
||||
case path == "server-fans":
|
||||
title = "Fan RPM"
|
||||
h.ringsMu.Lock()
|
||||
for i, fr := range h.ringFans {
|
||||
fv, _ := fr.snapshot()
|
||||
datasets = append(datasets, fv)
|
||||
name := "Fan"
|
||||
if i < len(h.fanNames) {
|
||||
name = h.fanNames[i]
|
||||
}
|
||||
names = append(names, name+" RPM")
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
// ── Combined GPU charts (all GPUs on one chart) ───────────────────────
|
||||
case path == "gpu-all-load":
|
||||
title = "GPU Compute Load"
|
||||
h.ringsMu.Lock()
|
||||
for idx, gr := range h.gpuRings {
|
||||
if gr == nil {
|
||||
continue
|
||||
}
|
||||
vUtil, l := gr.Util.snapshot()
|
||||
datasets = append(datasets, vUtil)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "gpu-all-memload":
|
||||
title = "GPU Memory Load"
|
||||
h.ringsMu.Lock()
|
||||
for idx, gr := range h.gpuRings {
|
||||
if gr == nil {
|
||||
continue
|
||||
}
|
||||
vMem, l := gr.MemUtil.snapshot()
|
||||
datasets = append(datasets, vMem)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "gpu-all-power":
|
||||
title = "GPU Power"
|
||||
h.ringsMu.Lock()
|
||||
for idx, gr := range h.gpuRings {
|
||||
if gr == nil {
|
||||
continue
|
||||
}
|
||||
vPow, l := gr.Power.snapshot()
|
||||
datasets = append(datasets, vPow)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "gpu-all-temp":
|
||||
title = "GPU Temperature"
|
||||
h.ringsMu.Lock()
|
||||
for idx, gr := range h.gpuRings {
|
||||
if gr == nil {
|
||||
continue
|
||||
}
|
||||
vTemp, l := gr.Temp.snapshot()
|
||||
datasets = append(datasets, vTemp)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
// ── Per-GPU sub-charts ────────────────────────────────────────────────
|
||||
case strings.HasPrefix(path, "gpu/"):
|
||||
rest := strings.TrimPrefix(path, "gpu/")
|
||||
// rest is either "{idx}-load", "{idx}-temp", "{idx}-power", or legacy "{idx}"
|
||||
sub := ""
|
||||
if i := strings.LastIndex(rest, "-"); i > 0 {
|
||||
sub = rest[i+1:]
|
||||
rest = rest[:i]
|
||||
}
|
||||
idx := 0
|
||||
fmt.Sscanf(rest, "%d", &idx)
|
||||
h.ringsMu.Lock()
|
||||
var gr *gpuRings
|
||||
if idx < len(h.gpuRings) {
|
||||
gr = h.gpuRings[idx]
|
||||
}
|
||||
h.ringsMu.Unlock()
|
||||
if gr == nil {
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
switch sub {
|
||||
case "load":
|
||||
vUtil, l := gr.Util.snapshot()
|
||||
vMemUtil, _ := gr.MemUtil.snapshot()
|
||||
labels = l
|
||||
title = fmt.Sprintf("GPU %d Load", idx)
|
||||
datasets = [][]float64{vUtil, vMemUtil}
|
||||
names = []string{"Load %", "Mem %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
case "temp":
|
||||
vTemp, l := gr.Temp.snapshot()
|
||||
labels = l
|
||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
||||
datasets = [][]float64{vTemp}
|
||||
names = []string{"Temp °C"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(vTemp)
|
||||
default: // "power" or legacy (no sub)
|
||||
vPower, l := gr.Power.snapshot()
|
||||
labels = l
|
||||
title = fmt.Sprintf("GPU %d Power", idx)
|
||||
datasets = [][]float64{vPower}
|
||||
names = []string{"Power W"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(vPower)
|
||||
}
|
||||
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
|
||||
if !ok {
|
||||
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -721,9 +544,11 @@ func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][
|
||||
for i, s := range samples {
|
||||
power[i] = s.PowerW
|
||||
}
|
||||
power = normalizePowerSeries(power)
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
yMin, yMax = autoBounds120(power)
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(power)
|
||||
|
||||
case path == "server-fans":
|
||||
title = "Fan RPM"
|
||||
@@ -829,6 +654,7 @@ func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]f
|
||||
}
|
||||
}
|
||||
}
|
||||
sort.Strings(names)
|
||||
datasets := make([][]float64, 0, len(names))
|
||||
for _, name := range names {
|
||||
ds := make([]float64, len(samples))
|
||||
@@ -856,6 +682,7 @@ func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []strin
|
||||
}
|
||||
}
|
||||
}
|
||||
sort.Strings(names)
|
||||
datasets := make([][]float64, 0, len(names))
|
||||
for _, name := range names {
|
||||
ds := make([]float64, len(samples))
|
||||
@@ -867,7 +694,7 @@ func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []strin
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
datasets = append(datasets, normalizeFanSeries(ds))
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
@@ -883,6 +710,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
|
||||
}
|
||||
}
|
||||
}
|
||||
sort.Ints(indices)
|
||||
datasets := make([][]float64, 0, len(indices))
|
||||
names := make([]string, 0, len(indices))
|
||||
for _, idx := range indices {
|
||||
@@ -921,6 +749,48 @@ func coalesceDataset(ds []float64, n int) []float64 {
|
||||
return make([]float64, n)
|
||||
}
|
||||
|
||||
// normalizePowerSeries returns a copy of ds in which every non-positive
// reading that follows a positive one is replaced by the most recent positive
// value. Readings that precede the first positive value are kept unchanged.
// It returns nil for an empty input and never modifies ds.
func normalizePowerSeries(ds []float64) []float64 {
	if len(ds) == 0 {
		return nil
	}
	out := make([]float64, len(ds))
	last, haveLast := 0.0, false
	for i, v := range ds {
		switch {
		case v > 0:
			last, haveLast = v, true
			out[i] = v
		case haveLast:
			// Hold the previous good reading across the gap.
			out[i] = last
		default:
			// No positive reading seen yet: pass the raw value through.
			out[i] = v
		}
	}
	return out
}
|
||||
|
||||
// normalizeFanSeries returns a new series in which every non-positive reading
// is replaced by the most recent positive reading. Non-positive readings that
// precede the first positive one become 0. It returns nil for an empty input
// and never modifies ds.
func normalizeFanSeries(ds []float64) []float64 {
	if len(ds) == 0 {
		return nil
	}
	out := make([]float64, len(ds))
	// lastPositive stays 0 until the first positive reading, which is exactly
	// the value emitted for the leading gap.
	var lastPositive float64
	for i, v := range ds {
		if v > 0 {
			lastPositive = v
		}
		out[i] = lastPositive
	}
	return out
}
|
||||
|
||||
// floatPtr returns a pointer to a float64 value.
|
||||
func floatPtr(v float64) *float64 { return &v }
|
||||
|
||||
@@ -1012,15 +882,17 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
||||
opt.Title = gocharts.TitleOption{Text: title}
|
||||
opt.XAxis.Labels = sparse
|
||||
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
||||
if chartLegendVisible(len(names)) {
|
||||
opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
|
||||
opt.Legend.OverlayChart = gocharts.Ptr(false)
|
||||
} else {
|
||||
opt.Legend.Show = gocharts.Ptr(false)
|
||||
}
|
||||
opt.Symbol = gocharts.SymbolNone
|
||||
// Right padding: reserve space for the MarkLine label (library recommendation).
|
||||
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
|
||||
if yMin != nil || yMax != nil {
|
||||
opt.YAxis = []gocharts.YAxisOption{{
|
||||
Min: yMin,
|
||||
Max: yMax,
|
||||
ValueFormatter: chartLegendNumber,
|
||||
}}
|
||||
opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
|
||||
}
|
||||
|
||||
// Add a single peak mark line on the series that holds the global maximum.
|
||||
@@ -1032,7 +904,7 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
||||
p := gocharts.NewPainter(gocharts.PainterOptions{
|
||||
OutputFormat: gocharts.ChartOutputSVG,
|
||||
Width: 1400,
|
||||
Height: 240,
|
||||
Height: chartCanvasHeight(len(names)),
|
||||
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
|
||||
if err := p.LineChart(opt); err != nil {
|
||||
return nil, err
|
||||
@@ -1040,6 +912,26 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
||||
return p.Bytes()
|
||||
}
|
||||
|
||||
// chartLegendVisible reports whether a chart with seriesCount series should
// draw its legend. The legend is shown for up to 8 series.
func chartLegendVisible(seriesCount int) bool {
	return seriesCount <= 8
}

// chartCanvasHeight returns the canvas height for a chart with seriesCount
// series: 360 when the legend is visible, 288 when it is hidden.
func chartCanvasHeight(seriesCount int) int {
	if chartLegendVisible(seriesCount) {
		return 360
	}
	return 288
}
|
||||
|
||||
func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
|
||||
return gocharts.YAxisOption{
|
||||
Min: yMin,
|
||||
Max: yMax,
|
||||
LabelCount: 11,
|
||||
ValueFormatter: chartYAxisNumber,
|
||||
}
|
||||
}
|
||||
|
||||
// globalPeakSeries returns the index of the series containing the global maximum
|
||||
// value across all datasets, and that maximum value.
|
||||
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
|
||||
@@ -1127,6 +1019,28 @@ func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []str
|
||||
return datasets, names, labels
|
||||
}
|
||||
|
||||
func snapshotFanRings(rings []*metricsRing, fanNames []string) ([][]float64, []string, []string) {
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var labels []string
|
||||
for i, ring := range rings {
|
||||
if ring == nil {
|
||||
continue
|
||||
}
|
||||
vals, l := ring.snapshot()
|
||||
datasets = append(datasets, normalizeFanSeries(vals))
|
||||
name := "Fan"
|
||||
if i < len(fanNames) {
|
||||
name = fanNames[i]
|
||||
}
|
||||
names = append(names, name+" RPM")
|
||||
if len(labels) == 0 {
|
||||
labels = l
|
||||
}
|
||||
}
|
||||
return datasets, names, labels
|
||||
}
|
||||
|
||||
func chartLegendNumber(v float64) string {
|
||||
neg := v < 0
|
||||
if v < 0 {
|
||||
@@ -1149,6 +1063,30 @@ func chartLegendNumber(v float64) string {
|
||||
return out
|
||||
}
|
||||
|
||||
// chartYAxisNumber formats a Y-axis tick value compactly.
//
//   - |v| >= 10000: whole thousands, rounded half up, with a "к" suffix ("12к").
//   - |v| >= 1000:  thousands with at most one decimal, using a decimal comma
//     ("1,4к"), so nearby ticks such as 1400 and 1600 stay distinguishable.
//   - otherwise:    the value rounded to an integer ("850").
//
// Negative values get a leading "-", except when the magnitude rounds to "0",
// so the axis never shows "-0".
func chartYAxisNumber(v float64) string {
	neg := v < 0
	if neg {
		v = -v
	}
	var out string
	switch {
	case v >= 10000:
		out = fmt.Sprintf("%dк", int((v+500)/1000))
	case v >= 1000:
		// One decimal place, with a trailing ".0" trimmed: 1000 -> "1к".
		s := fmt.Sprintf("%.1f", v/1000)
		s = strings.TrimRight(strings.TrimRight(s, "0"), ".")
		out = strings.ReplaceAll(s, ".", ",") + "к"
	default:
		out = fmt.Sprintf("%.0f", v)
	}
	if neg && out != "0" {
		return "-" + out
	}
	return out
}
|
||||
|
||||
func sparseLabels(labels []string, n int) []string {
|
||||
out := make([]string, len(labels))
|
||||
step := len(labels) / n
|
||||
@@ -1229,13 +1167,6 @@ probe();
|
||||
func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
|
||||
page := strings.TrimPrefix(r.URL.Path, "/")
|
||||
if page == "" {
|
||||
// Serve loading page until audit snapshot exists
|
||||
if _, err := os.Stat(h.opts.AuditPath); err != nil {
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
_, _ = w.Write([]byte(loadingPageHTML))
|
||||
return
|
||||
}
|
||||
page = "dashboard"
|
||||
}
|
||||
// Redirect old routes to new names
|
||||
|
||||
@@ -89,6 +89,180 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 7, PowerW: 170},
|
||||
{GPUIndex: 2, PowerW: 120},
|
||||
{GPUIndex: 0, PowerW: 100},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, PowerW: 101},
|
||||
{GPUIndex: 7, PowerW: 171},
|
||||
{GPUIndex: 2, PowerW: 121},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
if title != "GPU Power" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
wantNames := []string{"GPU 0", "GPU 2", "GPU 7"}
|
||||
if len(names) != len(wantNames) {
|
||||
t.Fatalf("names len=%d want %d: %v", len(names), len(wantNames), names)
|
||||
}
|
||||
for i := range wantNames {
|
||||
if names[i] != wantNames[i] {
|
||||
t.Fatalf("names[%d]=%q want %q; full=%v", i, names[i], wantNames[i], names)
|
||||
}
|
||||
}
|
||||
if got := datasets[0]; len(got) != 2 || got[0] != 100 || got[1] != 101 {
|
||||
t.Fatalf("GPU 0 dataset=%v want [100 101]", got)
|
||||
}
|
||||
if got := datasets[1]; len(got) != 2 || got[0] != 120 || got[1] != 121 {
|
||||
t.Fatalf("GPU 2 dataset=%v want [120 121]", got)
|
||||
}
|
||||
if got := datasets[2]; len(got) != 2 || got[0] != 170 || got[1] != 171 {
|
||||
t.Fatalf("GPU 7 dataset=%v want [170 171]", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||
want := []float64{0, 480, 480, 480, 510, 510}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
||||
body := renderMetrics()
|
||||
if !strings.Contains(body, "const probe = new Image();") {
|
||||
t.Fatalf("metrics page should preload chart images before swap: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
||||
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartLegendVisible(t *testing.T) {
|
||||
if !chartLegendVisible(8) {
|
||||
t.Fatal("legend should stay visible for charts with up to 8 series")
|
||||
}
|
||||
if chartLegendVisible(9) {
|
||||
t.Fatal("legend should be hidden for charts with more than 8 series")
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartYAxisNumber(t *testing.T) {
|
||||
tests := []struct {
|
||||
in float64
|
||||
want string
|
||||
}{
|
||||
{in: 999, want: "999"},
|
||||
{in: 1000, want: "1к"},
|
||||
{in: 1370, want: "1,4к"},
|
||||
{in: 1500, want: "1,5к"},
|
||||
{in: 1700, want: "1,7к"},
|
||||
{in: 2000, want: "2к"},
|
||||
{in: 9999, want: "10к"},
|
||||
{in: 10200, want: "10к"},
|
||||
{in: -1500, want: "-1,5к"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := chartYAxisNumber(tc.in); got != tc.want {
|
||||
t.Fatalf("chartYAxisNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartCanvasHeight(t *testing.T) {
|
||||
if got := chartCanvasHeight(4); got != 360 {
|
||||
t.Fatalf("chartCanvasHeight(4)=%d want 360", got)
|
||||
}
|
||||
if got := chartCanvasHeight(12); got != 288 {
|
||||
t.Fatalf("chartCanvasHeight(12)=%d want 288", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||
if len(got) != len(want) {
|
||||
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||
}
|
||||
for i := range want {
|
||||
if got[i] != want[i] {
|
||||
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartYAxisOption(t *testing.T) {
|
||||
min := floatPtr(0)
|
||||
max := floatPtr(100)
|
||||
opt := chartYAxisOption(min, max)
|
||||
if opt.Min != min || opt.Max != max {
|
||||
t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
|
||||
}
|
||||
if opt.LabelCount != 11 {
|
||||
t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
|
||||
}
|
||||
if got := opt.ValueFormatter(1000); got != "1к" {
|
||||
t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
||||
r1 := newMetricsRing(4)
|
||||
r2 := newMetricsRing(4)
|
||||
r1.push(1000)
|
||||
r1.push(1100)
|
||||
r2.push(1200)
|
||||
r2.push(1300)
|
||||
|
||||
datasets, names, labels := snapshotFanRings([]*metricsRing{r1, r2}, []string{"FAN_A", "FAN_B"})
|
||||
if len(datasets) != 2 {
|
||||
t.Fatalf("datasets=%d want 2", len(datasets))
|
||||
}
|
||||
if len(names) != 2 || names[0] != "FAN_A RPM" || names[1] != "FAN_B RPM" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
if len(labels) != 2 {
|
||||
t.Fatalf("labels=%v want 2 entries", labels)
|
||||
}
|
||||
if labels[0] == "" || labels[1] == "" {
|
||||
t.Fatalf("labels should contain timeline values, got %v", labels)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRenderNetworkInlineSyncsPendingState(t *testing.T) {
|
||||
body := renderNetworkInline()
|
||||
if !strings.Contains(body, "d.pending_change") {
|
||||
t.Fatalf("network UI should read pending network state from API: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "setInterval(loadNetwork, 5000)") {
|
||||
t.Fatalf("network UI should periodically refresh network state: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, "showNetPending(NET_ROLLBACK_SECS)") {
|
||||
t.Fatalf("network UI should show pending confirmation immediately on apply: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootRendersDashboard(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -136,6 +310,33 @@ func TestRootRendersDashboard(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootShowsRunAuditButtonWhenSnapshotMissing(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
exportDir := filepath.Join(dir, "export")
|
||||
if err := os.MkdirAll(exportDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{
|
||||
Title: "Bee Hardware Audit",
|
||||
AuditPath: filepath.Join(dir, "missing-audit.json"),
|
||||
ExportDir: exportDir,
|
||||
})
|
||||
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Run Audit`) {
|
||||
t.Fatalf("dashboard missing run audit button: %s", body)
|
||||
}
|
||||
if strings.Contains(body, `No audit data`) {
|
||||
t.Fatalf("dashboard still shows empty audit badge: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -158,6 +359,44 @@ func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTasksPageRendersLogModalAndPaginationControls(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tasks", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `id="task-log-overlay"`) {
|
||||
t.Fatalf("tasks page missing log modal overlay: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `_taskPageSize = 50`) {
|
||||
t.Fatalf("tasks page missing pagination size config: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `Previous</button>`) || !strings.Contains(body, `Next</button>`) {
|
||||
t.Fatalf("tasks page missing pagination controls: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestToolsPageRendersRestartGPUDriversButton(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `svcAction('bee-nvidia', 'restart')`) {
|
||||
t.Fatalf("tools page missing bee-nvidia restart action: %s", body)
|
||||
}
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -232,6 +471,17 @@ func TestSupportBundleEndpointReturnsArchive(t *testing.T) {
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "bee-audit.log"), []byte("audit log"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
archive, err := os.CreateTemp(os.TempDir(), "bee-support-server-test-*.tar.gz")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
t.Cleanup(func() { _ = os.Remove(archive.Name()) })
|
||||
if _, err := archive.WriteString("support-bundle"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := archive.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
@@ -6,8 +6,10 @@ import (
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -24,36 +26,74 @@ const (
|
||||
TaskCancelled = "cancelled"
|
||||
)
|
||||
|
||||
// taskNames maps target → human-readable name.
|
||||
// taskNames maps target → human-readable name for validate (SAT) runs.
|
||||
var taskNames = map[string]string{
|
||||
"nvidia": "NVIDIA SAT",
|
||||
"memory": "Memory SAT",
|
||||
"storage": "Storage SAT",
|
||||
"cpu": "CPU SAT",
|
||||
"amd": "AMD GPU SAT",
|
||||
"amd-mem": "AMD GPU MEM Integrity",
|
||||
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||
"amd-stress": "AMD GPU Burn-in",
|
||||
"memory-stress": "Memory Burn-in",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
"nvidia": "NVIDIA SAT",
|
||||
"nvidia-stress": "NVIDIA GPU Stress",
|
||||
"memory": "Memory SAT",
|
||||
"storage": "Storage SAT",
|
||||
"cpu": "CPU SAT",
|
||||
"amd": "AMD GPU SAT",
|
||||
"amd-mem": "AMD GPU MEM Integrity",
|
||||
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||
"amd-stress": "AMD GPU Burn-in",
|
||||
"memory-stress": "Memory Burn-in",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
"platform-stress": "Platform Thermal Cycling",
|
||||
"audit": "Audit",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
"audit": "Audit",
|
||||
"support-bundle": "Support Bundle",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
}
|
||||
|
||||
// burnNames holds the display names used in place of the regular task names
// when a burn profile is set. Only targets listed here are overridden.
var burnNames = map[string]string{
	"amd":    "AMD GPU Burn-in",
	"cpu":    "CPU Burn-in",
	"memory": "Memory Burn-in",
	"nvidia": "NVIDIA Burn-in",
}
|
||||
|
||||
func nvidiaStressTaskName(loader string) string {
|
||||
switch strings.TrimSpace(strings.ToLower(loader)) {
|
||||
case platform.NvidiaStressLoaderJohn:
|
||||
return "NVIDIA GPU Stress (John/OpenCL)"
|
||||
case platform.NvidiaStressLoaderNCCL:
|
||||
return "NVIDIA GPU Stress (NCCL)"
|
||||
default:
|
||||
return "NVIDIA GPU Stress (bee-gpu-burn)"
|
||||
}
|
||||
}
|
||||
|
||||
func taskDisplayName(target, profile, loader string) string {
|
||||
name := taskNames[target]
|
||||
if profile != "" {
|
||||
if n, ok := burnNames[target]; ok {
|
||||
name = n
|
||||
}
|
||||
}
|
||||
if target == "nvidia-stress" {
|
||||
name = nvidiaStressTaskName(loader)
|
||||
}
|
||||
if name == "" {
|
||||
name = target
|
||||
}
|
||||
return name
|
||||
}
|
||||
|
||||
// Task represents one unit of work in the queue.
|
||||
type Task struct {
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
ID string `json:"id"`
|
||||
Name string `json:"name"`
|
||||
Target string `json:"target"`
|
||||
Priority int `json:"priority"`
|
||||
Status string `json:"status"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||
ElapsedSec int `json:"elapsed_sec,omitempty"`
|
||||
ErrMsg string `json:"error,omitempty"`
|
||||
LogPath string `json:"log_path,omitempty"`
|
||||
|
||||
// runtime fields (not serialised)
|
||||
job *jobState
|
||||
@@ -62,12 +102,15 @@ type Task struct {
|
||||
|
||||
// taskParams holds optional parameters parsed from the run request.
|
||||
type taskParams struct {
|
||||
Duration int `json:"duration,omitempty"`
|
||||
DiagLevel int `json:"diag_level,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
Duration int `json:"duration,omitempty"`
|
||||
DiagLevel int `json:"diag_level,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
BurnProfile string `json:"burn_profile,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
}
|
||||
|
||||
type persistedTask struct {
|
||||
@@ -130,13 +173,14 @@ func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions
|
||||
|
||||
// taskQueue manages a priority-ordered list of tasks and runs them one at a time.
|
||||
type taskQueue struct {
|
||||
mu sync.Mutex
|
||||
tasks []*Task
|
||||
trigger chan struct{}
|
||||
opts *HandlerOptions // set by startWorker
|
||||
statePath string
|
||||
logsDir string
|
||||
started bool
|
||||
mu sync.Mutex
|
||||
tasks []*Task
|
||||
trigger chan struct{}
|
||||
opts *HandlerOptions // set by startWorker
|
||||
statePath string
|
||||
logsDir string
|
||||
started bool
|
||||
kmsgWatcher *kmsgWatcher
|
||||
}
|
||||
|
||||
var globalQueue = &taskQueue{trigger: make(chan struct{}, 1)}
|
||||
@@ -162,6 +206,9 @@ var (
|
||||
runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runNvidiaStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
@@ -171,6 +218,10 @@ var (
|
||||
runSATStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunSATStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
buildSupportBundle = app.BuildSupportBundle
|
||||
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||
return exec.CommandContext(ctx, "bee-install", device, logPath)
|
||||
}
|
||||
)
|
||||
|
||||
// enqueue adds a task to the queue and notifies the worker.
|
||||
@@ -241,6 +292,30 @@ func (q *taskQueue) findJob(id string) (*jobState, bool) {
|
||||
return t.job, true
|
||||
}
|
||||
|
||||
type taskStreamSource struct {
|
||||
status string
|
||||
errMsg string
|
||||
logPath string
|
||||
job *jobState
|
||||
}
|
||||
|
||||
func (q *taskQueue) taskStreamSource(id string) (taskStreamSource, bool) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
for _, t := range q.tasks {
|
||||
if t.ID != id {
|
||||
continue
|
||||
}
|
||||
return taskStreamSource{
|
||||
status: t.Status,
|
||||
errMsg: t.ErrMsg,
|
||||
logPath: t.LogPath,
|
||||
job: t.job,
|
||||
}, true
|
||||
}
|
||||
return taskStreamSource{}, false
|
||||
}
|
||||
|
||||
func (q *taskQueue) hasActiveTarget(target string) bool {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
@@ -255,15 +330,19 @@ func (q *taskQueue) hasActiveTarget(target string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// snapshot returns a copy of all tasks sorted for display (running first, then pending by priority, then done by doneAt desc).
|
||||
// snapshot returns a copy of all tasks sorted for display with newest tasks first.
|
||||
func (q *taskQueue) snapshot() []Task {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
out := make([]Task, len(q.tasks))
|
||||
for i, t := range q.tasks {
|
||||
out[i] = *t
|
||||
out[i].ElapsedSec = taskElapsedSec(&out[i], time.Now())
|
||||
}
|
||||
sort.SliceStable(out, func(i, j int) bool {
|
||||
if !out[i].CreatedAt.Equal(out[j].CreatedAt) {
|
||||
return out[i].CreatedAt.After(out[j].CreatedAt)
|
||||
}
|
||||
si := statusOrder(out[i].Status)
|
||||
sj := statusOrder(out[j].Status)
|
||||
if si != sj {
|
||||
@@ -272,7 +351,7 @@ func (q *taskQueue) snapshot() []Task {
|
||||
if out[i].Priority != out[j].Priority {
|
||||
return out[i].Priority > out[j].Priority
|
||||
}
|
||||
return out[i].CreatedAt.Before(out[j].CreatedAt)
|
||||
return out[i].Name < out[j].Name
|
||||
})
|
||||
return out
|
||||
}
|
||||
@@ -333,8 +412,16 @@ func (q *taskQueue) worker() {
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
|
||||
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||
}
|
||||
|
||||
q.runTask(t, j, ctx)
|
||||
|
||||
if q.kmsgWatcher != nil {
|
||||
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||
}
|
||||
|
||||
q.mu.Lock()
|
||||
now2 := time.Now()
|
||||
t.DoneAt = &now2
|
||||
@@ -368,9 +455,9 @@ func setCPUGovernor(governor string) {
|
||||
|
||||
// runTask executes the work for a task, writing output to j.
|
||||
func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if q.opts == nil || q.opts.App == nil {
|
||||
j.append("ERROR: app not configured")
|
||||
j.finish("app not configured")
|
||||
if q.opts == nil {
|
||||
j.append("ERROR: handler options not configured")
|
||||
j.finish("handler options not configured")
|
||||
return
|
||||
}
|
||||
a := q.opts.App
|
||||
@@ -387,6 +474,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
|
||||
switch t.Target {
|
||||
case "nvidia":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
diagLevel := t.params.DiagLevel
|
||||
if t.params.BurnProfile != "" && diagLevel <= 0 {
|
||||
diagLevel = resolveBurnPreset(t.params.BurnProfile).NvidiaDiag
|
||||
@@ -403,11 +494,38 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
} else {
|
||||
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||
}
|
||||
case "nvidia-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
}, j.append)
|
||||
case "memory":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "storage":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "cpu":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
@@ -415,35 +533,69 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if dur <= 0 {
|
||||
dur = 60
|
||||
}
|
||||
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||
case "amd":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "amd-mem":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||
case "amd-bandwidth":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||
case "amd-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "memory-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "sat-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "platform-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
opts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||
opts.Components = t.params.PlatformComponents
|
||||
archive, err = a.RunPlatformStress(ctx, "", opts, j.append)
|
||||
case "audit":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
result, e := a.RunAuditNow(q.opts.RuntimeMode)
|
||||
if e != nil {
|
||||
err = e
|
||||
@@ -452,7 +604,22 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
j.append(line)
|
||||
}
|
||||
}
|
||||
case "support-bundle":
|
||||
j.append("Building support bundle...")
|
||||
archive, err = buildSupportBundle(q.opts.ExportDir)
|
||||
case "install":
|
||||
if strings.TrimSpace(t.params.Device) == "" {
|
||||
err = fmt.Errorf("device is required")
|
||||
break
|
||||
}
|
||||
installLogPath := platform.InstallLogPath(t.params.Device)
|
||||
j.append("Install log: " + installLogPath)
|
||||
err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
|
||||
case "install-to-ram":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
err = a.RunInstallToRAM(ctx, j.append)
|
||||
default:
|
||||
j.append("ERROR: unknown target: " + t.Target)
|
||||
@@ -460,6 +627,19 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// If the SAT archive was produced, check overall_status and write to component DB.
|
||||
if archive != "" {
|
||||
archivePath := app.ExtractArchivePath(archive)
|
||||
if err == nil {
|
||||
if app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||
}
|
||||
}
|
||||
if db := q.statusDB(); db != nil {
|
||||
app.ApplySATResultToDB(db, t.Target, archivePath)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
j.append("Aborted.")
|
||||
@@ -476,6 +656,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
j.finish("")
|
||||
}
|
||||
|
||||
func (q *taskQueue) statusDB() *app.ComponentStatusDB {
|
||||
if q.opts == nil || q.opts.App == nil {
|
||||
return nil
|
||||
}
|
||||
return q.opts.App.StatusDB
|
||||
}
|
||||
|
||||
func splitLines(s string) []string {
|
||||
var out []string
|
||||
for _, l := range splitNL(s) {
|
||||
@@ -585,23 +772,83 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
||||
writeJSON(w, map[string]int{"cancelled": n})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksKillWorkers(w http.ResponseWriter, _ *http.Request) {
|
||||
// Cancel all queued/running tasks in the queue first.
|
||||
globalQueue.mu.Lock()
|
||||
now := time.Now()
|
||||
cancelled := 0
|
||||
for _, t := range globalQueue.tasks {
|
||||
switch t.Status {
|
||||
case TaskPending:
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
cancelled++
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
cancelled++
|
||||
}
|
||||
}
|
||||
globalQueue.persistLocked()
|
||||
globalQueue.mu.Unlock()
|
||||
|
||||
// Kill orphaned test worker processes at the OS level.
|
||||
killed := platform.KillTestWorkers()
|
||||
writeJSON(w, map[string]any{
|
||||
"cancelled": cancelled,
|
||||
"killed": len(killed),
|
||||
"processes": killed,
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPITasksStream(w http.ResponseWriter, r *http.Request) {
|
||||
id := r.PathValue("id")
|
||||
// Wait up to 5s for the task to get a job (it may be pending)
|
||||
deadline := time.Now().Add(5 * time.Second)
|
||||
var j *jobState
|
||||
for time.Now().Before(deadline) {
|
||||
if jj, ok := globalQueue.findJob(id); ok {
|
||||
j = jj
|
||||
break
|
||||
}
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
}
|
||||
if j == nil {
|
||||
http.Error(w, "task not found or not yet started", http.StatusNotFound)
|
||||
src, ok := globalQueue.taskStreamSource(id)
|
||||
if !ok {
|
||||
http.Error(w, "task not found", http.StatusNotFound)
|
||||
return
|
||||
}
|
||||
streamJob(w, r, j)
|
||||
if src.job != nil {
|
||||
streamJob(w, r, src.job)
|
||||
return
|
||||
}
|
||||
if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
|
||||
j := newTaskJobState(src.logPath)
|
||||
j.finish(src.errMsg)
|
||||
streamJob(w, r, j)
|
||||
return
|
||||
}
|
||||
if !sseStart(w) {
|
||||
return
|
||||
}
|
||||
sseWrite(w, "", "Task is queued. Waiting for worker...")
|
||||
ticker := time.NewTicker(200 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
src, ok = globalQueue.taskStreamSource(id)
|
||||
if !ok {
|
||||
sseWrite(w, "done", "task not found")
|
||||
return
|
||||
}
|
||||
if src.job != nil {
|
||||
streamSubscribedJob(w, r, src.job)
|
||||
return
|
||||
}
|
||||
if src.status == TaskDone || src.status == TaskFailed || src.status == TaskCancelled {
|
||||
j := newTaskJobState(src.logPath)
|
||||
j.finish(src.errMsg)
|
||||
streamSubscribedJob(w, r, j)
|
||||
return
|
||||
}
|
||||
case <-r.Context().Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) assignTaskLogPathLocked(t *Task) {
|
||||
@@ -638,8 +885,18 @@ func (q *taskQueue) loadLocked() {
|
||||
params: pt.Params,
|
||||
}
|
||||
q.assignTaskLogPathLocked(t)
|
||||
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||
t.Status = TaskPending
|
||||
if t.Status == TaskRunning {
|
||||
// The task was interrupted by a bee-web restart. Child processes
|
||||
// (e.g. bee-gpu-burn-worker) survive the restart in their own
|
||||
// process groups and cannot be cancelled retroactively. Mark the
|
||||
// task as failed so the user can decide whether to re-run it
|
||||
// rather than blindly re-launching duplicate workers.
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.DoneAt = &now
|
||||
t.ErrMsg = "interrupted by bee-web restart"
|
||||
} else if t.Status == TaskPending {
|
||||
t.StartedAt = nil
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
}
|
||||
@@ -679,3 +936,21 @@ func (q *taskQueue) persistLocked() {
|
||||
}
|
||||
_ = os.Rename(tmp, q.statePath)
|
||||
}
|
||||
|
||||
func taskElapsedSec(t *Task, now time.Time) int {
|
||||
if t == nil || t.StartedAt == nil || t.StartedAt.IsZero() {
|
||||
return 0
|
||||
}
|
||||
start := *t.StartedAt
|
||||
if !t.CreatedAt.IsZero() && start.Before(t.CreatedAt) {
|
||||
start = t.CreatedAt
|
||||
}
|
||||
end := now
|
||||
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||
end = *t.DoneAt
|
||||
}
|
||||
if end.Before(start) {
|
||||
return 0
|
||||
}
|
||||
return int(end.Sub(start).Round(time.Second) / time.Second)
|
||||
}
|
||||
|
||||
@@ -2,8 +2,12 @@ package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -22,21 +26,34 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
}
|
||||
|
||||
started := time.Now().Add(-time.Minute)
|
||||
task := &Task{
|
||||
ID: "task-1",
|
||||
|
||||
// A task that was pending (not yet started) must be re-queued on restart.
|
||||
pendingTask := &Task{
|
||||
ID: "task-pending",
|
||||
Name: "Memory Burn-in",
|
||||
Target: "memory-stress",
|
||||
Priority: 2,
|
||||
Status: TaskRunning,
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now().Add(-2 * time.Minute),
|
||||
StartedAt: &started,
|
||||
params: taskParams{
|
||||
Duration: 300,
|
||||
BurnProfile: "smoke",
|
||||
},
|
||||
params: taskParams{Duration: 300, BurnProfile: "smoke"},
|
||||
}
|
||||
// A task that was running when bee-web crashed must NOT be re-queued —
|
||||
// its child processes (e.g. gpu-burn-worker) survive the restart in
|
||||
// their own process groups and can't be cancelled retroactively.
|
||||
runningTask := &Task{
|
||||
ID: "task-running",
|
||||
Name: "NVIDIA GPU Stress",
|
||||
Target: "nvidia-stress",
|
||||
Priority: 1,
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now().Add(-3 * time.Minute),
|
||||
StartedAt: &started,
|
||||
params: taskParams{Duration: 86400},
|
||||
}
|
||||
for _, task := range []*Task{pendingTask, runningTask} {
|
||||
q.tasks = append(q.tasks, task)
|
||||
q.assignTaskLogPathLocked(task)
|
||||
}
|
||||
q.tasks = append(q.tasks, task)
|
||||
q.assignTaskLogPathLocked(task)
|
||||
q.persistLocked()
|
||||
|
||||
recovered := &taskQueue{
|
||||
@@ -46,18 +63,47 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
||||
}
|
||||
recovered.loadLocked()
|
||||
|
||||
if len(recovered.tasks) != 1 {
|
||||
t.Fatalf("tasks=%d want 1", len(recovered.tasks))
|
||||
if len(recovered.tasks) != 2 {
|
||||
t.Fatalf("tasks=%d want 2", len(recovered.tasks))
|
||||
}
|
||||
got := recovered.tasks[0]
|
||||
if got.Status != TaskPending {
|
||||
t.Fatalf("status=%q want %q", got.Status, TaskPending)
|
||||
|
||||
byID := map[string]*Task{}
|
||||
for i := range recovered.tasks {
|
||||
byID[recovered.tasks[i].ID] = recovered.tasks[i]
|
||||
}
|
||||
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
|
||||
t.Fatalf("params=%+v", got.params)
|
||||
|
||||
// Pending task must be re-queued as pending with params intact.
|
||||
p := byID["task-pending"]
|
||||
if p == nil {
|
||||
t.Fatal("task-pending not found")
|
||||
}
|
||||
if got.LogPath == "" {
|
||||
t.Fatal("expected log path")
|
||||
if p.Status != TaskPending {
|
||||
t.Fatalf("pending task: status=%q want %q", p.Status, TaskPending)
|
||||
}
|
||||
if p.StartedAt != nil {
|
||||
t.Fatalf("pending task: started_at=%v want nil", p.StartedAt)
|
||||
}
|
||||
if p.params.Duration != 300 || p.params.BurnProfile != "smoke" {
|
||||
t.Fatalf("pending task: params=%+v", p.params)
|
||||
}
|
||||
if p.LogPath == "" {
|
||||
t.Fatal("pending task: expected log path")
|
||||
}
|
||||
|
||||
// Running task must be marked failed, not re-queued, to prevent
|
||||
// launching duplicate workers (e.g. a second set of gpu-burn-workers).
|
||||
r := byID["task-running"]
|
||||
if r == nil {
|
||||
t.Fatal("task-running not found")
|
||||
}
|
||||
if r.Status != TaskFailed {
|
||||
t.Fatalf("running task: status=%q want %q", r.Status, TaskFailed)
|
||||
}
|
||||
if r.ErrMsg == "" {
|
||||
t.Fatal("running task: expected non-empty error message")
|
||||
}
|
||||
if r.DoneAt == nil {
|
||||
t.Fatal("running task: expected done_at to be set")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,6 +124,130 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
||||
q := &taskQueue{
|
||||
tasks: []*Task{
|
||||
{
|
||||
ID: "old-running",
|
||||
Name: "Old Running",
|
||||
Status: TaskRunning,
|
||||
Priority: 10,
|
||||
CreatedAt: now.Add(-3 * time.Minute),
|
||||
},
|
||||
{
|
||||
ID: "new-done",
|
||||
Name: "New Done",
|
||||
Status: TaskDone,
|
||||
Priority: 0,
|
||||
CreatedAt: now.Add(-1 * time.Minute),
|
||||
},
|
||||
{
|
||||
ID: "mid-pending",
|
||||
Name: "Mid Pending",
|
||||
Status: TaskPending,
|
||||
Priority: 1,
|
||||
CreatedAt: now.Add(-2 * time.Minute),
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
got := q.snapshot()
|
||||
if len(got) != 3 {
|
||||
t.Fatalf("snapshot len=%d want 3", len(got))
|
||||
}
|
||||
if got[0].ID != "new-done" || got[1].ID != "mid-pending" || got[2].ID != "old-running" {
|
||||
t.Fatalf("snapshot order=%q,%q,%q", got[0].ID, got[1].ID, got[2].ID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamReplaysPersistedLogWithoutLiveJob(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
logPath := filepath.Join(dir, "task.log")
|
||||
if err := os.WriteFile(logPath, []byte("line1\nline2\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "done-1",
|
||||
Name: "Done Task",
|
||||
Status: TaskDone,
|
||||
CreatedAt: time.Now(),
|
||||
LogPath: logPath,
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/done-1/stream", nil)
|
||||
req.SetPathValue("id", "done-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, "data: line1\n\n") || !strings.Contains(body, "data: line2\n\n") {
|
||||
t.Fatalf("body=%q", body)
|
||||
}
|
||||
if !strings.Contains(body, "event: done\n") {
|
||||
t.Fatalf("missing done event: %q", body)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPITasksStreamPendingTaskStartsSSEImmediately(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
origTasks := globalQueue.tasks
|
||||
globalQueue.tasks = []*Task{{
|
||||
ID: "pending-1",
|
||||
Name: "Pending Task",
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
}}
|
||||
globalQueue.mu.Unlock()
|
||||
t.Cleanup(func() {
|
||||
globalQueue.mu.Lock()
|
||||
globalQueue.tasks = origTasks
|
||||
globalQueue.mu.Unlock()
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
req := httptest.NewRequest(http.MethodGet, "/api/tasks/pending-1/stream", nil).WithContext(ctx)
|
||||
req.SetPathValue("id", "pending-1")
|
||||
rec := httptest.NewRecorder()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
h := &handler{}
|
||||
h.handleAPITasksStream(rec, req)
|
||||
close(done)
|
||||
}()
|
||||
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if strings.Contains(rec.Body.String(), "Task is queued. Waiting for worker...") {
|
||||
cancel()
|
||||
<-done
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
return
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
cancel()
|
||||
<-done
|
||||
t.Fatalf("stream did not emit queued status promptly, body=%q", rec.Body.String())
|
||||
}
|
||||
|
||||
func TestResolveBurnPreset(t *testing.T) {
|
||||
tests := []struct {
|
||||
profile string
|
||||
@@ -95,9 +265,24 @@ func TestResolveBurnPreset(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
t.Parallel()
|
||||
func TestTaskDisplayNameUsesNvidiaStressLoader(t *testing.T) {
|
||||
tests := []struct {
|
||||
loader string
|
||||
want string
|
||||
}{
|
||||
{loader: "", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||
{loader: "builtin", want: "NVIDIA GPU Stress (bee-gpu-burn)"},
|
||||
{loader: "john", want: "NVIDIA GPU Stress (John/OpenCL)"},
|
||||
{loader: "nccl", want: "NVIDIA GPU Stress (NCCL)"},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
if got := taskDisplayName("nvidia-stress", "acceptance", tc.loader); got != tc.want {
|
||||
t.Fatalf("taskDisplayName(loader=%q)=%q want %q", tc.loader, got, tc.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
blocked := make(chan struct{})
|
||||
released := make(chan struct{})
|
||||
aRun := func(_ any, ctx context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
@@ -154,3 +339,131 @@ func TestRunTaskHonorsCancel(t *testing.T) {
|
||||
t.Fatal("runTask did not return after cancel")
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskUsesBurnProfileDurationForCPU(t *testing.T) {
|
||||
var gotDuration int
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{App: &app.App{}},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-burn-1",
|
||||
Name: "CPU Burn-in",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{BurnProfile: "smoke"},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
orig := runCPUAcceptancePackCtx
|
||||
runCPUAcceptancePackCtx = func(_ *app.App, _ context.Context, _ string, durationSec int, _ func(string)) (string, error) {
|
||||
gotDuration = durationSec
|
||||
return "/tmp/cpu-burn.tar.gz", nil
|
||||
}
|
||||
defer func() { runCPUAcceptancePackCtx = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotDuration != 5*60 {
|
||||
t.Fatalf("duration=%d want %d", gotDuration, 5*60)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{ExportDir: dir},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "support-bundle-1",
|
||||
Name: "Support Bundle",
|
||||
Target: "support-bundle",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
var gotExportDir string
|
||||
orig := buildSupportBundle
|
||||
buildSupportBundle = func(exportDir string) (string, error) {
|
||||
gotExportDir = exportDir
|
||||
return filepath.Join(exportDir, "bundle.tar.gz"), nil
|
||||
}
|
||||
defer func() { buildSupportBundle = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotExportDir != dir {
|
||||
t.Fatalf("exportDir=%q want %q", gotExportDir, dir)
|
||||
}
|
||||
if j.err != "" {
|
||||
t.Fatalf("unexpected error: %q", j.err)
|
||||
}
|
||||
if !strings.Contains(strings.Join(j.lines, "\n"), "Archive: "+filepath.Join(dir, "bundle.tar.gz")) {
|
||||
t.Fatalf("lines=%v", j.lines)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
|
||||
now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
|
||||
created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
|
||||
started := time.Time{}
|
||||
task := &Task{
|
||||
Status: TaskRunning,
|
||||
CreatedAt: created,
|
||||
StartedAt: &started,
|
||||
}
|
||||
if got := taskElapsedSec(task, now); got != 0 {
|
||||
t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
|
||||
}
|
||||
|
||||
stale := created.Add(-24 * time.Hour)
|
||||
task.StartedAt = &stale
|
||||
if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
|
||||
t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{},
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "install-1",
|
||||
Name: "Install to Disk",
|
||||
Target: "install",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{Device: "/dev/sda"},
|
||||
}
|
||||
j := &jobState{}
|
||||
|
||||
var gotDevice string
|
||||
var gotLogPath string
|
||||
orig := installCommand
|
||||
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||
gotDevice = device
|
||||
gotLogPath = logPath
|
||||
return exec.CommandContext(ctx, "sh", "-c", "printf 'line1\nline2\n'")
|
||||
}
|
||||
defer func() { installCommand = orig }()
|
||||
|
||||
q.runTask(tk, j, context.Background())
|
||||
|
||||
if gotDevice != "/dev/sda" {
|
||||
t.Fatalf("device=%q want /dev/sda", gotDevice)
|
||||
}
|
||||
if gotLogPath == "" {
|
||||
t.Fatal("expected install log path")
|
||||
}
|
||||
logs := strings.Join(j.lines, "\n")
|
||||
if !strings.Contains(logs, "Install log: ") {
|
||||
t.Fatalf("missing install log line: %v", j.lines)
|
||||
}
|
||||
if !strings.Contains(logs, "line1") || !strings.Contains(logs, "line2") {
|
||||
t.Fatalf("missing streamed output: %v", j.lines)
|
||||
}
|
||||
if j.err != "" {
|
||||
t.Fatalf("unexpected error: %q", j.err)
|
||||
}
|
||||
}
|
||||
|
||||
2
bible
2
bible
Submodule bible updated: 456c1f022c...688b87e98d
@@ -9,6 +9,34 @@ All live metrics charts in the web UI are server-side SVG images served by Go
|
||||
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
||||
There is no client-side canvas or JS chart library.
|
||||
|
||||
## Rule: live charts must be visually uniform
|
||||
|
||||
Live charts are a single UI family, not a set of one-off widgets. New charts and
|
||||
changes to existing charts must keep the same rendering model and presentation
|
||||
rules unless there is an explicit architectural decision to diverge.
|
||||
|
||||
Default expectations:
|
||||
|
||||
- same server-side SVG pipeline for all live metrics charts
|
||||
- same refresh behaviour and failure handling in the browser
|
||||
- same canvas size class and card layout
|
||||
- same legend placement policy across charts
|
||||
- same axis, title, and summary conventions
|
||||
- no chart-specific visual exceptions added as a quick fix
|
||||
|
||||
Current default for live charts:
|
||||
|
||||
- legend below the plot area when a chart has 8 series or fewer
|
||||
- legend hidden when a chart has more than 8 series
|
||||
- 10 equal Y-axis steps across the chart height
|
||||
- 1400 x 360 SVG canvas with legend
|
||||
- 1400 x 288 SVG canvas without legend
|
||||
- full-width card rendering in a single-column stack
|
||||
|
||||
If one chart needs a different layout or legend behaviour, treat that as a
|
||||
design-level decision affecting the whole chart family, not as a local tweak to
|
||||
just one endpoint.
|
||||
|
||||
### Why go-analyze/charts
|
||||
|
||||
- Pure Go, no CGO — builds cleanly inside the live-build container
|
||||
@@ -29,7 +57,8 @@ self-contained SVG renderer used **only** for completed SAT run reports
|
||||
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
||||
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
||||
|
||||
Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a
|
||||
Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
|
||||
the legend is hidden. The page renders them at `width: 100%` in a
|
||||
single-column layout so they always fill the viewport width.
|
||||
|
||||
### Ring buffers
|
||||
|
||||
@@ -60,6 +60,8 @@ Rules:
|
||||
- Chromium opens `http://localhost/` — the full interactive web UI
|
||||
- SSH is independent from the desktop path
|
||||
- serial console support is enabled for VM boot debugging
|
||||
- Default boot keeps the server-safe graphics path (`nomodeset` + forced `fbdev`) for IPMI/BMC consoles
|
||||
- Higher-resolution mode selection is expected only when booting through an explicit `bee.display=kms` menu entry, which disables the forced `fbdev` Xorg config before `lightdm`
|
||||
|
||||
## ISO build sequence
|
||||
|
||||
@@ -81,9 +83,9 @@ build-in-container.sh [--authorized-keys /path/to/keys]
|
||||
7. `build-cublas.sh`:
|
||||
a. download `libcublas`, `libcublasLt`, `libcudart` runtime + dev packages from the NVIDIA CUDA Debian repo
|
||||
b. verify packages against repo `Packages.gz`
|
||||
c. extract headers for `bee-gpu-stress` build
|
||||
c. extract headers for `bee-gpu-burn` worker build
|
||||
d. cache userspace libs in `dist/cublas-<version>+cuda<series>/`
|
||||
8. build `bee-gpu-stress` against extracted cuBLASLt/cudart headers
|
||||
8. build `bee-gpu-burn` worker against extracted cuBLASLt/cudart headers
|
||||
9. inject NVIDIA `.ko` → staged `/usr/local/lib/nvidia/`
|
||||
10. inject `nvidia-smi` → staged `/usr/local/bin/nvidia-smi`
|
||||
11. inject `libnvidia-ml` + `libcuda` + `libcublas` + `libcublasLt` + `libcudart` → staged `/usr/lib/`
|
||||
@@ -104,7 +106,7 @@ Build host notes:
|
||||
1. `build-in-container.sh` / `build-nvidia-module.sh` — Debian kernel headers for module build
|
||||
2. `auto/config` — `linux-image-${DEBIAN_KERNEL_ABI}` in the ISO
|
||||
- NVIDIA modules go to staged `usr/local/lib/nvidia/` — NOT to `/lib/modules/<kver>/extra/`.
|
||||
- `bee-gpu-stress` must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
||||
- `bee-gpu-burn` worker must be built against cached CUDA userspace headers from `build-cublas.sh`, not against random host-installed CUDA headers.
|
||||
- The live ISO must ship `libcublas`, `libcublasLt`, and `libcudart` together with `libcuda` so tensor-core stress works without internet or package installs at boot.
|
||||
- The source overlay in `iso/overlay/` is treated as immutable source. Build-time files are injected only into the staged overlay.
|
||||
- The live-build workdir under `dist/` is disposable; source files under `iso/builder/` stay clean.
|
||||
@@ -153,18 +155,17 @@ Current validation state:
|
||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||
|
||||
Acceptance flows:
|
||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + mixed-precision `bee-gpu-stress`
|
||||
- `bee sat nvidia` → diagnostic archive with `nvidia-smi -q` + `nvidia-bug-report` + lightweight `bee-gpu-burn`
|
||||
- NVIDIA GPU burn-in can use either `bee-gpu-burn` or `bee-john-gpu-stress` (John the Ripper jumbo via OpenCL)
|
||||
- `bee sat memory` → `memtester` archive
|
||||
- `bee sat storage` → SMART/NVMe diagnostic archive and short self-test trigger where supported
|
||||
- SAT `summary.txt` now includes `overall_status` and per-job `*_status` values (`OK`, `FAILED`, `UNSUPPORTED`)
|
||||
- `bee-gpu-stress` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
||||
- `bee-gpu-burn` should prefer cuBLASLt GEMM load over the old integer/PTX burn path:
|
||||
- Ampere: `fp16` + `fp32`/TF32 tensor-core load
|
||||
- Ada / Hopper: add `fp8`
|
||||
- Blackwell+: add `fp4`
|
||||
- PTX fallback is only for missing cuBLASLt/userspace or unsupported narrow datatypes
|
||||
- Runtime overrides:
|
||||
- `BEE_GPU_STRESS_SECONDS`
|
||||
- `BEE_GPU_STRESS_SIZE_MB`
|
||||
- `BEE_MEMTESTER_SIZE_MB`
|
||||
- `BEE_MEMTESTER_PASSES`
|
||||
|
||||
@@ -179,6 +180,6 @@ Web UI: Acceptance Tests page → Run Test button
|
||||
```
|
||||
|
||||
**Critical invariants:**
|
||||
- `bee-gpu-stress` uses `exec.CommandContext` — killed on job context cancel.
|
||||
- `bee-gpu-burn` / `bee-john-gpu-stress` use `exec.CommandContext` — killed on job context cancel.
|
||||
- Metric goroutine uses stopCh/doneCh pattern; main goroutine waits `<-doneCh` before reading rows (no mutex needed).
|
||||
- SVG chart is fully offline: no JS, no external CSS, pure inline SVG.
|
||||
|
||||
@@ -21,8 +21,8 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- Read-only hardware inventory: board, CPU, memory, storage, PCIe, PSU, GPU, NIC, RAID
|
||||
- Machine-readable health summary derived from collector verdicts
|
||||
- Operator-triggered acceptance tests for NVIDIA, memory, and storage
|
||||
- NVIDIA SAT includes both diagnostic collection and mixed-precision GPU stress via `bee-gpu-stress`
|
||||
- `bee-gpu-stress` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
||||
- NVIDIA SAT includes diagnostic collection plus a lightweight in-image GPU stress step via `bee-gpu-burn`
|
||||
- `bee-gpu-burn` should exercise tensor/inference paths (`fp16`, `fp32`/TF32, `fp8`, `fp4` when supported by the GPU/userspace stack) and fall back to Driver API PTX burn only if cuBLASLt is unavailable
|
||||
- Automatic boot audit with operator-facing local console and SSH access
|
||||
- NVIDIA proprietary driver loaded at boot for GPU enrichment via `nvidia-smi`
|
||||
- SSH access (OpenSSH) always available for inspection and debugging
|
||||
@@ -70,7 +70,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
| SSH | OpenSSH server |
|
||||
| NVIDIA driver | Proprietary `.run` installer, built against Debian kernel headers |
|
||||
| NVIDIA modules | Loaded via `insmod` from `/usr/local/lib/nvidia/` |
|
||||
| GPU stress backend | `bee-gpu-stress` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
||||
| GPU stress backend | `bee-gpu-burn` + cuBLASLt/cuBLAS/cudart mixed-precision GEMM, with Driver API PTX fallback |
|
||||
| Builder | Debian 12 host/VM or Debian 12 container image |
|
||||
|
||||
## Operator UX
|
||||
|
||||
@@ -18,6 +18,8 @@ Use the official proprietary NVIDIA `.run` installer for both kernel modules and
|
||||
- Kernel modules and nvidia-smi come from a single verified source.
|
||||
- NVIDIA publishes `.sha256sum` alongside each installer — download and verify before use.
|
||||
- Driver version pinned in `iso/builder/VERSIONS` as `NVIDIA_DRIVER_VERSION`.
|
||||
- DCGM must track the CUDA user-mode driver major version exposed by `nvidia-smi`.
|
||||
- For NVIDIA driver branch `590` with CUDA `13.x`, use DCGM 4 package family `datacenter-gpu-manager-4-cuda13`; legacy `datacenter-gpu-manager` 3.x does not provide a working path for this stack.
|
||||
- Build process: download `.run`, extract, compile `kernel/` sources against `linux-lts-dev`.
|
||||
- Modules cached in `dist/nvidia-<version>-<kver>/` — rebuild only on version or kernel change.
|
||||
- ISO size increases by ~50MB for .ko files + nvidia-smi.
|
||||
|
||||
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
224
bible-local/decisions/2026-04-01-memtest-build-strategy.md
Normal file
@@ -0,0 +1,224 @@
|
||||
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
|
||||
|
||||
**Date:** 2026-04-01
|
||||
**Status:** resolved
|
||||
|
||||
## Context
|
||||
|
||||
We have already iterated on `memtest` multiple times and kept cycling between the same ideas.
|
||||
The commit history shows several distinct attempts:
|
||||
|
||||
- `f91bce8` — fixed Bookworm memtest file names to `memtest86+x64.bin` / `memtest86+x64.efi`
|
||||
- `5857805` — added a binary hook to copy memtest files from the build tree into the ISO root
|
||||
- `f96b149` — added fallback extraction from the cached `.deb` when `chroot/boot/` stayed empty
|
||||
- `d43a9ae` — removed the custom hook and switched back to live-build built-in memtest integration
|
||||
- `60cb8f8` — restored explicit memtest menu entries and added ISO validation
|
||||
- `3dbc218` / `3869788` — added archived build logs and better memtest diagnostics
|
||||
|
||||
Current evidence from the archived `easy-bee-nvidia-v3.14-amd64` logs dated 2026-04-01:
|
||||
|
||||
- `lb binary_memtest` does run and installs `memtest86+`
|
||||
- but the final ISO still does **not** contain `boot/memtest86+x64.bin`
|
||||
- the final ISO also does **not** contain memtest menu entries in `boot/grub/grub.cfg` or `isolinux/live.cfg`
|
||||
|
||||
So the assumption "live-build built-in memtest integration is enough on this stack" must be treated as false for this project until a real built ISO proves otherwise.
|
||||
|
||||
Additional evidence from the archived `easy-bee-nvidia-v3.17-dirty-amd64` logs dated 2026-04-01:
|
||||
|
||||
- the build now completes successfully because memtest is non-blocking by default
|
||||
- `lb binary_memtest` still runs and installs `memtest86+`
|
||||
- the project-owned hook `config/hooks/normal/9100-memtest.hook.binary` does execute
|
||||
- but it executes too early for its current target paths:
|
||||
- `binary/boot/grub/grub.cfg` is still missing at hook time
|
||||
- `binary/isolinux/live.cfg` is still missing at hook time
|
||||
- memtest binaries are also still absent in `binary/boot/`
|
||||
- later in the build, live-build does create intermediate bootloader configs with memtest lines in the workdir
|
||||
- but the final ISO still lacks memtest binaries and still lacks memtest lines in extracted ISO `boot/grub/grub.cfg` and `isolinux/live.cfg`
|
||||
|
||||
So the assumption "the current normal binary hook path is late enough to patch final memtest artifacts" is also false.
|
||||
|
||||
Correction after inspecting the real `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||
artifact dated 2026-04-01:
|
||||
|
||||
- the final ISO does contain `boot/memtest86+x64.bin`
|
||||
- the final ISO does contain `boot/memtest86+x64.efi`
|
||||
- the final ISO does contain memtest menu entries in both `boot/grub/grub.cfg`
|
||||
and `isolinux/live.cfg`
|
||||
- so `v3.20-5-g76a9100` was **not** another real memtest regression in the
|
||||
shipped ISO
|
||||
- the regression was in the build-time validator/debug path in `build.sh`
|
||||
|
||||
Root cause of the false alarm:
|
||||
|
||||
- `build.sh` treated "ISO reader command exists" as equivalent to "ISO reader
|
||||
successfully listed/extracted members"
|
||||
- `iso_list_files` / `iso_extract_file` failures were collapsed into the same
|
||||
observable output as "memtest content missing"
|
||||
- this made a reader failure look identical to a missing memtest payload
|
||||
- as a result, we re-entered the same memtest investigation loop even though
|
||||
the real ISO was already correct
|
||||
|
||||
Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
|
||||
|
||||
- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
|
||||
still carried live-build's default memtest layout (`live/memtest.bin`,
|
||||
`live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
|
||||
- that mismatch is expected to trigger project recovery, because `bee` requires
|
||||
`boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
|
||||
- however, `build.sh` exited before recovery because `set -e` treated a direct
|
||||
`iso_memtest_present` return code of `1` as fatal
|
||||
- so the next repeated loop was caused by shell control flow, not by proof that
|
||||
the recovery design itself was wrong
|
||||
|
||||
## Known Failed Attempts
|
||||
|
||||
These approaches were already tried and should not be repeated blindly:
|
||||
|
||||
1. Built-in live-build memtest only.
|
||||
Reason it failed:
|
||||
- `lb binary_memtest` runs, but the final ISO still misses memtest binaries and menu entries.
|
||||
|
||||
2. Fixing only the memtest file names for Debian Bookworm.
|
||||
Reason it failed:
|
||||
- correct file names alone do not make the files appear in the final ISO.
|
||||
|
||||
3. Copying memtest from `chroot/boot/` into `binary/boot/` via a binary hook.
|
||||
Reason it failed:
|
||||
- in this stack `chroot/boot/` is often empty for memtest payloads at the relevant time.
|
||||
|
||||
4. Fallback extraction from cached `memtest86+` `.deb`.
|
||||
Reason it failed:
|
||||
- this was explored already and was not enough to stabilize the final ISO path end-to-end.
|
||||
|
||||
5. Restoring explicit memtest menu entries in source bootloader templates only.
|
||||
Reason it failed:
|
||||
- memtest lines in source templates or intermediate workdir configs do not guarantee the final ISO contains them.
|
||||
|
||||
6. Patching `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` from the current `config/hooks/normal/9100-memtest.hook.binary`.
|
||||
Reason it failed:
|
||||
- the hook runs before those files exist, so the hook cannot patch them there.
|
||||
|
||||
## What This Means
|
||||
|
||||
When revisiting memtest later, start from the constraints above rather than retrying the same patterns:
|
||||
|
||||
- do not assume the built-in memtest stage is sufficient
|
||||
- do not assume `chroot/boot/` will contain memtest payloads
|
||||
- do not assume source bootloader templates are the last writer of final ISO configs
|
||||
- do not assume the current normal binary hook timing is late enough for final patching
|
||||
|
||||
Any future memtest fix must explicitly identify:
|
||||
|
||||
- where the memtest binaries are reliably available at build time
|
||||
- which exact build stage writes the final bootloader configs that land in the ISO
|
||||
- a post-build proof from a real ISO, not only from intermediate workdir files
|
||||
- whether the ISO inspection step itself succeeded, rather than merely whether
|
||||
the validator printed a memtest warning
|
||||
- whether a non-zero probe is intentionally handled inside an `if` / `case`
|
||||
context rather than accidentally tripping `set -e`
|
||||
|
||||
## Decision
|
||||
|
||||
For `bee`, memtest must be treated as an explicit ISO artifact with explicit post-build validation.
|
||||
|
||||
Project rules from now on:
|
||||
|
||||
- Do **not** trust `--memtest memtest86+` by itself.
|
||||
- A memtest implementation is considered valid only if the produced ISO actually contains:
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- a GRUB menu entry
|
||||
- an isolinux menu entry
|
||||
- If live-build built-in integration does not produce those artifacts, use an explicit project-owned mechanism such as:
|
||||
- a binary hook copying files into `binary/boot/`
|
||||
- extraction from the cached `memtest86+` `.deb`
|
||||
- another deterministic build-time copy step
|
||||
- Do **not** remove such explicit logic later unless a fresh real ISO build proves that built-in integration alone produces all required files and menu entries.
|
||||
|
||||
Current implementation direction:
|
||||
|
||||
- keep the live-build memtest stage enabled if it helps package acquisition
|
||||
- do not rely on the current early `binary_hooks` timing for final patching
|
||||
- prefer a post-`lb build` recovery step in `build.sh` that:
|
||||
- patches the fully materialized `LB_DIR/binary` tree
|
||||
- injects memtest binaries there
|
||||
- ensures final bootloader entries there
|
||||
- reruns late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) after the patch
|
||||
- also treat ISO validation tooling as part of the critical path:
|
||||
- install a stable ISO reader in the builder image
|
||||
- fail with an explicit reader error if ISO listing/extraction fails
|
||||
- do not treat reader failure as evidence that memtest is missing
|
||||
- do not call a probe that may return "needs recovery" as a bare command under
|
||||
`set -e`; wrap it in explicit control flow
|
||||
|
||||
## Consequences
|
||||
|
||||
- Future memtest changes must begin by reading this ADR and the commits listed above.
|
||||
- Future memtest changes must also begin by reading the failed-attempt list above.
|
||||
- We should stop re-introducing "prefer built-in live-build memtest" as a default assumption without new evidence.
|
||||
- Memtest validation in `build.sh` is not optional; it is the acceptance gate that prevents another silent regression.
|
||||
- But validation output is only trustworthy if ISO reading itself succeeded. A
|
||||
"missing memtest" warning without a successful ISO read is not evidence.
|
||||
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
|
||||
|
||||
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
|
||||
|
||||
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
|
||||
|
||||
### Components
|
||||
|
||||
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
|
||||
|
||||
Runs inside the live-build binary phase. It does not rely on patching bootloader files at hook time —
|
||||
those files may not exist yet. Instead:
|
||||
|
||||
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
|
||||
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
|
||||
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
|
||||
If they do not exist, the hook warns and continues (does not fail).
|
||||
|
||||
Set the `BEE_REQUIRE_MEMTEST=1` environment variable to turn these warnings into hard errors when needed.
|
||||
|
||||
**2. Post-`lb build` recovery step in `build.sh`**
|
||||
|
||||
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
|
||||
contains all required memtest artifacts. If not:
|
||||
|
||||
- Copies/extracts memtest binaries into `binary/boot/`.
|
||||
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
|
||||
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
|
||||
the ISO with the patched tree.
|
||||
|
||||
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
|
||||
step handles the final `binary/` tree after live-build has written all bootloader configs.
|
||||
|
||||
**3. ISO validation hardening**
|
||||
|
||||
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
|
||||
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
|
||||
handled — it does not abort the build prematurely.
|
||||
|
||||
ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
|
||||
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
|
||||
This prevents the false-negative loop (an ISO reader failure misreported as missing memtest) that cost us builds v3.14–v3.19 on 2026-04-01.
|
||||
|
||||
### Why this works when earlier attempts did not
|
||||
|
||||
The earlier patterns all shared a single flaw: they assumed a single build-time point
|
||||
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
|
||||
In live-build on Debian Bookworm that assumption is false — live-build continues writing
|
||||
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
|
||||
|
||||
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
|
||||
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
|
||||
There is no ordering dependency to get wrong.
|
||||
|
||||
### Do not revert
|
||||
|
||||
Do not remove the recovery step or the hook without a fresh real ISO build proving
|
||||
live-build alone produces all four required artifacts:
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- memtest entry in `boot/grub/grub.cfg`
|
||||
- memtest entry in `isolinux/live.cfg`
|
||||
@@ -5,3 +5,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
||||
| Date | Decision | Status |
|
||||
|---|---|---|
|
||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||
|
||||
@@ -13,9 +13,50 @@ Use one of:
|
||||
|
||||
This applies to:
|
||||
- `iso/builder/config/package-lists/*.list.chroot`
|
||||
- Any package referenced in `grub.cfg`, hooks, or overlay scripts (e.g. file paths like `/boot/memtest86+x64.bin`)
|
||||
- Any package referenced in bootloader configs, hooks, or overlay scripts
|
||||
|
||||
## Example of what goes wrong without this
|
||||
## Memtest rule
|
||||
|
||||
`memtest86+` in Debian bookworm installs `/boot/memtest86+x64.bin`, not `/boot/memtest86+.bin`.
|
||||
Guessing the filename caused a broken GRUB entry that only surfaced at boot time, after a full rebuild.
|
||||
Do not assume live-build's built-in memtest integration is sufficient for `bee`.
|
||||
We already tried that path and regressed again on 2026-04-01: `lb binary_memtest`
|
||||
ran, but the final ISO still lacked memtest binaries and menu entries.
|
||||
|
||||
For this project, memtest is accepted only when the produced ISO actually
|
||||
contains all of the following:
|
||||
|
||||
- `boot/memtest86+x64.bin`
|
||||
- `boot/memtest86+x64.efi`
|
||||
- a memtest entry in `boot/grub/grub.cfg`
|
||||
- a memtest entry in `isolinux/live.cfg`
|
||||
|
||||
Rules:
|
||||
|
||||
- Keep explicit post-build memtest validation in `build.sh`.
|
||||
- Treat ISO reader success as a separate prerequisite from memtest content.
|
||||
If the reader cannot list or extract from the ISO, that is a validator
|
||||
failure, not proof that memtest is missing.
|
||||
- If built-in integration does not produce the artifacts above, use a
|
||||
deterministic project-owned copy/extract step instead of hoping live-build
|
||||
will "start working".
|
||||
- Do not switch back to built-in-only memtest without fresh build evidence from
|
||||
a real ISO.
|
||||
- If you reference memtest files manually, verify the exact package file list
|
||||
first for the target Debian release.
|
||||
|
||||
Known bad loops for this repository:
|
||||
|
||||
- Do not retry built-in-only memtest without new evidence. We already proved
|
||||
that `lb binary_memtest` can run while the final ISO still has no memtest.
|
||||
- Do not assume fixing memtest file names is enough. Correct names did not fix
|
||||
the final artifact path.
|
||||
- Do not assume `chroot/boot/` contains memtest payloads at the time hooks run.
|
||||
- Do not assume source `grub.cfg` / `live.cfg.in` are the final writers of ISO
|
||||
bootloader configs.
|
||||
- Do not assume the current `config/hooks/normal/9100-memtest.hook.binary`
|
||||
timing is late enough to patch final `binary/boot/grub/grub.cfg` or
|
||||
`binary/isolinux/live.cfg`; logs from 2026-04-01 showed those files were not
|
||||
present yet when the hook executed.
|
||||
- Do not treat a validator warning as ground truth until you have confirmed the
|
||||
ISO reader actually succeeded. On 2026-04-01 we misdiagnosed another memtest
|
||||
regression because the final ISO was correct but the validator produced a
|
||||
false negative.
|
||||
|
||||
@@ -48,6 +48,7 @@ sh iso/builder/build-in-container.sh --cache-dir /path/to/cache
|
||||
- The builder image is automatically rebuilt if the local tag exists for the wrong architecture.
|
||||
- The live ISO boots with Debian `live-boot` `toram`, so the read-only medium is copied into RAM during boot and the runtime no longer depends on the original USB/BMC virtual media staying present.
|
||||
- Target systems need enough RAM for the full compressed live medium plus normal runtime overhead, or boot may fail before reaching the TUI.
|
||||
- The NVIDIA variant installs DCGM 4 packages matched to the CUDA user-mode driver major version. For driver branch `590` / CUDA `13.x`, the package family is `datacenter-gpu-manager-4-cuda13` rather than legacy `datacenter-gpu-manager`.
|
||||
- Override the container platform only if you know why:
|
||||
|
||||
```sh
|
||||
|
||||
@@ -17,12 +17,23 @@ RUN apt-get update -qq && apt-get install -y \
|
||||
wget \
|
||||
curl \
|
||||
tar \
|
||||
libarchive-tools \
|
||||
xz-utils \
|
||||
rsync \
|
||||
build-essential \
|
||||
gcc \
|
||||
make \
|
||||
perl \
|
||||
pkg-config \
|
||||
yasm \
|
||||
libssl-dev \
|
||||
zlib1g-dev \
|
||||
libbz2-dev \
|
||||
libgmp-dev \
|
||||
libpcap-dev \
|
||||
libsqlite3-dev \
|
||||
libcurl4-openssl-dev \
|
||||
ocl-icd-opencl-dev \
|
||||
linux-headers-amd64 \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
@@ -8,7 +8,8 @@ NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.0.2.14-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=3.3.9
|
||||
DCGM_VERSION=4.5.3-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
ROCM_VERSION=6.3.4
|
||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||
ROCM_BANDWIDTH_TEST_VERSION=1.4.0.60304-76~22.04
|
||||
|
||||
@@ -29,10 +29,10 @@ lb config noauto \
|
||||
--security true \
|
||||
--linux-flavours "amd64" \
|
||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||
--memtest none \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=6 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
"${@}"
|
||||
|
||||
@@ -29,8 +29,14 @@ typedef void *CUfunction;
|
||||
typedef void *CUstream;
|
||||
|
||||
#define CU_SUCCESS 0
|
||||
#define CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT 16
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR 75
|
||||
#define CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR 76
|
||||
#define MAX_STRESS_STREAMS 16
|
||||
#define MAX_CUBLAS_PROFILES 5
|
||||
#define MIN_PROFILE_BUDGET_BYTES ((size_t)4u * 1024u * 1024u)
|
||||
#define MIN_STREAM_BUDGET_BYTES ((size_t)64u * 1024u * 1024u)
|
||||
#define STRESS_LAUNCH_DEPTH 8
|
||||
|
||||
static const char *ptx_source =
|
||||
".version 6.0\n"
|
||||
@@ -97,6 +103,9 @@ typedef CUresult (*cuLaunchKernel_fn)(CUfunction,
|
||||
CUstream,
|
||||
void **,
|
||||
void **);
|
||||
typedef CUresult (*cuMemGetInfo_fn)(size_t *, size_t *);
|
||||
typedef CUresult (*cuStreamCreate_fn)(CUstream *, unsigned int);
|
||||
typedef CUresult (*cuStreamDestroy_fn)(CUstream);
|
||||
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
|
||||
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
|
||||
|
||||
@@ -118,6 +127,9 @@ struct cuda_api {
|
||||
cuModuleLoadDataEx_fn cuModuleLoadDataEx;
|
||||
cuModuleGetFunction_fn cuModuleGetFunction;
|
||||
cuLaunchKernel_fn cuLaunchKernel;
|
||||
cuMemGetInfo_fn cuMemGetInfo;
|
||||
cuStreamCreate_fn cuStreamCreate;
|
||||
cuStreamDestroy_fn cuStreamDestroy;
|
||||
cuGetErrorName_fn cuGetErrorName;
|
||||
cuGetErrorString_fn cuGetErrorString;
|
||||
};
|
||||
@@ -128,9 +140,10 @@ struct stress_report {
|
||||
int cc_major;
|
||||
int cc_minor;
|
||||
int buffer_mb;
|
||||
int stream_count;
|
||||
unsigned long iterations;
|
||||
uint64_t checksum;
|
||||
char details[1024];
|
||||
char details[16384];
|
||||
};
|
||||
|
||||
static int load_symbol(void *lib, const char *name, void **out) {
|
||||
@@ -144,7 +157,7 @@ static int load_cuda(struct cuda_api *api) {
|
||||
if (!api->lib) {
|
||||
return 0;
|
||||
}
|
||||
return
|
||||
if (!(
|
||||
load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
|
||||
load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
|
||||
load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
|
||||
@@ -160,7 +173,17 @@ static int load_cuda(struct cuda_api *api) {
|
||||
load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
|
||||
load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
|
||||
load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
|
||||
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel);
|
||||
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel))) {
|
||||
dlclose(api->lib);
|
||||
memset(api, 0, sizeof(*api));
|
||||
return 0;
|
||||
}
|
||||
load_symbol(api->lib, "cuMemGetInfo_v2", (void **)&api->cuMemGetInfo);
|
||||
load_symbol(api->lib, "cuStreamCreate", (void **)&api->cuStreamCreate);
|
||||
if (!load_symbol(api->lib, "cuStreamDestroy_v2", (void **)&api->cuStreamDestroy)) {
|
||||
load_symbol(api->lib, "cuStreamDestroy", (void **)&api->cuStreamDestroy);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
|
||||
@@ -193,14 +216,12 @@ static double now_seconds(void) {
|
||||
return (double)ts.tv_sec + ((double)ts.tv_nsec / 1000000000.0);
|
||||
}
|
||||
|
||||
#if HAVE_CUBLASLT_HEADERS
|
||||
static size_t round_down_size(size_t value, size_t multiple) {
|
||||
if (multiple == 0 || value < multiple) {
|
||||
return value;
|
||||
}
|
||||
return value - (value % multiple);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *major, int *minor) {
|
||||
int cc_major = 0;
|
||||
@@ -220,6 +241,75 @@ static int query_compute_capability(struct cuda_api *api, CUdevice dev, int *maj
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Reads the multiprocessor-count attribute of `dev` through the driver API.
 *
 * On success the value is stored in *count and 1 is returned. When check_rc()
 * rejects the attribute query, 0 is returned and *count is left untouched.
 */
static int query_multiprocessor_count(struct cuda_api *api, CUdevice dev, int *count) {
    int value = 0;
    CUresult rc = api->cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);

    if (check_rc(api, "cuDeviceGetAttribute(multiprocessors)", rc)) {
        *count = value;
        return 1;
    }
    return 0;
}
|
||||
|
||||
static size_t clamp_budget_to_free_memory(struct cuda_api *api, size_t requested_bytes) {
|
||||
size_t free_bytes = 0;
|
||||
size_t total_bytes = 0;
|
||||
size_t max_bytes = requested_bytes;
|
||||
|
||||
if (!api->cuMemGetInfo) {
|
||||
return requested_bytes;
|
||||
}
|
||||
if (api->cuMemGetInfo(&free_bytes, &total_bytes) != CU_SUCCESS || free_bytes == 0) {
|
||||
return requested_bytes;
|
||||
}
|
||||
|
||||
max_bytes = (free_bytes * 9u) / 10u;
|
||||
if (max_bytes < (size_t)4u * 1024u * 1024u) {
|
||||
max_bytes = (size_t)4u * 1024u * 1024u;
|
||||
}
|
||||
if (requested_bytes > max_bytes) {
|
||||
return max_bytes;
|
||||
}
|
||||
return requested_bytes;
|
||||
}
|
||||
|
||||
static int choose_stream_count(int mp_count, int planned_profiles, size_t total_budget, int have_streams) {
|
||||
int stream_count = 1;
|
||||
if (!have_streams || mp_count <= 0 || planned_profiles <= 0) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
stream_count = mp_count / 8;
|
||||
if (stream_count < 2) {
|
||||
stream_count = 2;
|
||||
}
|
||||
if (stream_count > MAX_STRESS_STREAMS) {
|
||||
stream_count = MAX_STRESS_STREAMS;
|
||||
}
|
||||
|
||||
while (stream_count > 1) {
|
||||
size_t per_stream_budget = total_budget / ((size_t)planned_profiles * (size_t)stream_count);
|
||||
if (per_stream_budget >= MIN_STREAM_BUDGET_BYTES) {
|
||||
break;
|
||||
}
|
||||
stream_count--;
|
||||
}
|
||||
return stream_count;
|
||||
}
|
||||
|
||||
/*
 * Destroys the first `count` entries of `streams` that are non-NULL and resets
 * each destroyed slot to NULL. Does nothing when the cuStreamDestroy entry
 * point was not loaded.
 */
static void destroy_streams(struct cuda_api *api, CUstream *streams, int count) {
    int idx;

    if (api->cuStreamDestroy == NULL) {
        return;
    }

    for (idx = 0; idx < count; ++idx) {
        CUstream *slot = &streams[idx];
        if (*slot == NULL) {
            continue;
        }
        api->cuStreamDestroy(*slot);
        *slot = NULL;
    }
}
|
||||
|
||||
#if HAVE_CUBLASLT_HEADERS
|
||||
static void append_detail(char *buf, size_t cap, const char *fmt, ...) {
|
||||
size_t len = strlen(buf);
|
||||
@@ -242,12 +332,19 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
int size_mb,
|
||||
struct stress_report *report) {
|
||||
CUcontext ctx = NULL;
|
||||
CUdeviceptr device_mem = 0;
|
||||
CUmodule module = NULL;
|
||||
CUfunction kernel = NULL;
|
||||
uint32_t sample[256];
|
||||
uint32_t words = 0;
|
||||
CUdeviceptr device_mem[MAX_STRESS_STREAMS] = {0};
|
||||
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||
uint32_t words[MAX_STRESS_STREAMS] = {0};
|
||||
uint32_t rounds[MAX_STRESS_STREAMS] = {0};
|
||||
void *params[MAX_STRESS_STREAMS][3];
|
||||
size_t bytes_per_stream[MAX_STRESS_STREAMS] = {0};
|
||||
unsigned long iterations = 0;
|
||||
int mp_count = 0;
|
||||
int stream_count = 1;
|
||||
int launches_per_wave = 0;
|
||||
|
||||
memset(report, 0, sizeof(*report));
|
||||
snprintf(report->backend, sizeof(report->backend), "driver-ptx");
|
||||
@@ -260,64 +357,109 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t bytes = (size_t)size_mb * 1024u * 1024u;
|
||||
if (bytes < 4u * 1024u * 1024u) {
|
||||
bytes = 4u * 1024u * 1024u;
|
||||
size_t requested_bytes = (size_t)size_mb * 1024u * 1024u;
|
||||
if (requested_bytes < MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_bytes = MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
if (bytes > (size_t)1024u * 1024u * 1024u) {
|
||||
bytes = (size_t)1024u * 1024u * 1024u;
|
||||
size_t total_bytes = clamp_budget_to_free_memory(api, requested_bytes);
|
||||
if (total_bytes < MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_bytes = MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
words = (uint32_t)(bytes / sizeof(uint32_t));
|
||||
report->buffer_mb = (int)(total_bytes / (1024u * 1024u));
|
||||
|
||||
if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem, bytes))) {
|
||||
api->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
if (query_multiprocessor_count(api, dev, &mp_count) &&
|
||||
api->cuStreamCreate &&
|
||||
api->cuStreamDestroy) {
|
||||
stream_count = choose_stream_count(mp_count, 1, total_bytes, 1);
|
||||
}
|
||||
if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem, 0, bytes))) {
|
||||
api->cuMemFree(device_mem);
|
||||
api->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
if (stream_count > 1) {
|
||||
int created = 0;
|
||||
for (; created < stream_count; created++) {
|
||||
if (!check_rc(api, "cuStreamCreate", api->cuStreamCreate(&streams[created], 0))) {
|
||||
destroy_streams(api, streams, created);
|
||||
stream_count = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
report->stream_count = stream_count;
|
||||
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
size_t slice = total_bytes / (size_t)stream_count;
|
||||
if (lane == stream_count - 1) {
|
||||
slice = total_bytes - ((size_t)lane * (total_bytes / (size_t)stream_count));
|
||||
}
|
||||
slice = round_down_size(slice, sizeof(uint32_t));
|
||||
if (slice < MIN_PROFILE_BUDGET_BYTES) {
|
||||
slice = MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
bytes_per_stream[lane] = slice;
|
||||
words[lane] = (uint32_t)(slice / sizeof(uint32_t));
|
||||
|
||||
if (!check_rc(api, "cuMemAlloc", api->cuMemAlloc(&device_mem[lane], slice))) {
|
||||
goto fail;
|
||||
}
|
||||
if (!check_rc(api, "cuMemsetD8", api->cuMemsetD8(device_mem[lane], 0, slice))) {
|
||||
goto fail;
|
||||
}
|
||||
rounds[lane] = 2048;
|
||||
params[lane][0] = &device_mem[lane];
|
||||
params[lane][1] = &words[lane];
|
||||
params[lane][2] = &rounds[lane];
|
||||
}
|
||||
|
||||
if (!check_rc(api,
|
||||
"cuModuleLoadDataEx",
|
||||
api->cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
|
||||
api->cuMemFree(device_mem);
|
||||
api->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
goto fail;
|
||||
}
|
||||
if (!check_rc(api, "cuModuleGetFunction", api->cuModuleGetFunction(&kernel, module, "burn"))) {
|
||||
api->cuMemFree(device_mem);
|
||||
api->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
unsigned int threads = 256;
|
||||
unsigned int blocks = (unsigned int)((words + threads - 1) / threads);
|
||||
uint32_t rounds = 1024;
|
||||
void *params[] = {&device_mem, &words, &rounds};
|
||||
|
||||
double start = now_seconds();
|
||||
double deadline = start + (double)seconds;
|
||||
while (now_seconds() < deadline) {
|
||||
if (!check_rc(api,
|
||||
"cuLaunchKernel",
|
||||
api->cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) {
|
||||
api->cuMemFree(device_mem);
|
||||
api->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
launches_per_wave = 0;
|
||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||
int launched_this_batch = 0;
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
unsigned int blocks = (unsigned int)((words[lane] + threads - 1) / threads);
|
||||
if (!check_rc(api,
|
||||
"cuLaunchKernel",
|
||||
api->cuLaunchKernel(kernel,
|
||||
blocks,
|
||||
1,
|
||||
1,
|
||||
threads,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
streams[lane],
|
||||
params[lane],
|
||||
NULL))) {
|
||||
goto fail;
|
||||
}
|
||||
launches_per_wave++;
|
||||
launched_this_batch++;
|
||||
}
|
||||
if (launched_this_batch <= 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
iterations++;
|
||||
if (launches_per_wave <= 0) {
|
||||
goto fail;
|
||||
}
|
||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||
goto fail;
|
||||
}
|
||||
iterations += (unsigned long)launches_per_wave;
|
||||
}
|
||||
|
||||
if (!check_rc(api, "cuCtxSynchronize", api->cuCtxSynchronize())) {
|
||||
api->cuMemFree(device_mem);
|
||||
api->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem, sizeof(sample)))) {
|
||||
api->cuMemFree(device_mem);
|
||||
api->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
if (!check_rc(api, "cuMemcpyDtoH", api->cuMemcpyDtoH(sample, device_mem[0], sizeof(sample)))) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < sizeof(sample) / sizeof(sample[0]); i++) {
|
||||
@@ -326,12 +468,34 @@ static int run_ptx_fallback(struct cuda_api *api,
|
||||
report->iterations = iterations;
|
||||
snprintf(report->details,
|
||||
sizeof(report->details),
|
||||
"profile_int32_fallback=OK iterations=%lu\n",
|
||||
"fallback_int32=OK requested_mb=%d actual_mb=%d streams=%d queue_depth=%d per_stream_mb=%zu iterations=%lu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
STRESS_LAUNCH_DEPTH,
|
||||
bytes_per_stream[0] / (1024u * 1024u),
|
||||
iterations);
|
||||
|
||||
api->cuMemFree(device_mem);
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
if (device_mem[lane]) {
|
||||
api->cuMemFree(device_mem[lane]);
|
||||
}
|
||||
}
|
||||
destroy_streams(api, streams, stream_count);
|
||||
api->cuCtxDestroy(ctx);
|
||||
return 1;
|
||||
|
||||
fail:
|
||||
for (int lane = 0; lane < MAX_STRESS_STREAMS; lane++) {
|
||||
if (device_mem[lane]) {
|
||||
api->cuMemFree(device_mem[lane]);
|
||||
}
|
||||
}
|
||||
destroy_streams(api, streams, MAX_STRESS_STREAMS);
|
||||
if (ctx) {
|
||||
api->cuCtxDestroy(ctx);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if HAVE_CUBLASLT_HEADERS
|
||||
@@ -418,6 +582,7 @@ struct profile_desc {
|
||||
|
||||
struct prepared_profile {
|
||||
struct profile_desc desc;
|
||||
CUstream stream;
|
||||
cublasLtMatmulDesc_t op_desc;
|
||||
cublasLtMatrixLayout_t a_layout;
|
||||
cublasLtMatrixLayout_t b_layout;
|
||||
@@ -617,8 +782,8 @@ static uint64_t choose_square_dim(size_t budget_bytes, size_t bytes_per_cell, in
|
||||
if (dim < (uint64_t)multiple) {
|
||||
dim = (uint64_t)multiple;
|
||||
}
|
||||
if (dim > 8192u) {
|
||||
dim = 8192u;
|
||||
if (dim > 65536u) {
|
||||
dim = 65536u;
|
||||
}
|
||||
return dim;
|
||||
}
|
||||
@@ -704,10 +869,12 @@ static int prepare_profile(struct cublaslt_api *cublas,
|
||||
cublasLtHandle_t handle,
|
||||
struct cuda_api *cuda,
|
||||
const struct profile_desc *desc,
|
||||
CUstream stream,
|
||||
size_t profile_budget_bytes,
|
||||
struct prepared_profile *out) {
|
||||
memset(out, 0, sizeof(*out));
|
||||
out->desc = *desc;
|
||||
out->stream = stream;
|
||||
|
||||
size_t bytes_per_cell = 0;
|
||||
bytes_per_cell += bytes_for_elements(desc->a_type, 1);
|
||||
@@ -935,7 +1102,7 @@ static int run_cublas_profile(cublasLtHandle_t handle,
|
||||
&profile->heuristic.algo,
|
||||
(void *)(uintptr_t)profile->workspace_dev,
|
||||
profile->workspace_size,
|
||||
(cudaStream_t)0));
|
||||
profile->stream));
|
||||
}
|
||||
|
||||
static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
@@ -947,13 +1114,22 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int size_mb,
|
||||
struct stress_report *report) {
|
||||
struct cublaslt_api cublas;
|
||||
struct prepared_profile prepared[sizeof(k_profiles) / sizeof(k_profiles[0])];
|
||||
struct prepared_profile prepared[MAX_STRESS_STREAMS * MAX_CUBLAS_PROFILES];
|
||||
cublasLtHandle_t handle = NULL;
|
||||
CUcontext ctx = NULL;
|
||||
CUstream streams[MAX_STRESS_STREAMS] = {0};
|
||||
uint16_t sample[256];
|
||||
int cc = cc_major * 10 + cc_minor;
|
||||
int planned = 0;
|
||||
int active = 0;
|
||||
int mp_count = 0;
|
||||
int stream_count = 1;
|
||||
int profile_count = (int)(sizeof(k_profiles) / sizeof(k_profiles[0]));
|
||||
int prepared_count = 0;
|
||||
int wave_launches = 0;
|
||||
size_t requested_budget = 0;
|
||||
size_t total_budget = 0;
|
||||
size_t per_profile_budget = 0;
|
||||
|
||||
memset(report, 0, sizeof(*report));
|
||||
snprintf(report->backend, sizeof(report->backend), "cublasLt");
|
||||
@@ -986,16 +1162,46 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t total_budget = (size_t)size_mb * 1024u * 1024u;
|
||||
if (total_budget < (size_t)planned * 4u * 1024u * 1024u) {
|
||||
total_budget = (size_t)planned * 4u * 1024u * 1024u;
|
||||
requested_budget = (size_t)size_mb * 1024u * 1024u;
|
||||
if (requested_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
requested_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
size_t per_profile_budget = total_budget / (size_t)planned;
|
||||
if (per_profile_budget < 4u * 1024u * 1024u) {
|
||||
per_profile_budget = 4u * 1024u * 1024u;
|
||||
total_budget = clamp_budget_to_free_memory(cuda, requested_budget);
|
||||
if (total_budget < (size_t)planned * MIN_PROFILE_BUDGET_BYTES) {
|
||||
total_budget = (size_t)planned * MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
if (query_multiprocessor_count(cuda, dev, &mp_count) &&
|
||||
cuda->cuStreamCreate &&
|
||||
cuda->cuStreamDestroy) {
|
||||
stream_count = choose_stream_count(mp_count, planned, total_budget, 1);
|
||||
}
|
||||
if (stream_count > 1) {
|
||||
int created = 0;
|
||||
for (; created < stream_count; created++) {
|
||||
if (!check_rc(cuda, "cuStreamCreate", cuda->cuStreamCreate(&streams[created], 0))) {
|
||||
destroy_streams(cuda, streams, created);
|
||||
stream_count = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
report->stream_count = stream_count;
|
||||
per_profile_budget = total_budget / ((size_t)planned * (size_t)stream_count);
|
||||
if (per_profile_budget < MIN_PROFILE_BUDGET_BYTES) {
|
||||
per_profile_budget = MIN_PROFILE_BUDGET_BYTES;
|
||||
}
|
||||
report->buffer_mb = (int)(total_budget / (1024u * 1024u));
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"requested_mb=%d actual_mb=%d streams=%d queue_depth=%d mp_count=%d per_worker_mb=%zu\n",
|
||||
size_mb,
|
||||
report->buffer_mb,
|
||||
report->stream_count,
|
||||
STRESS_LAUNCH_DEPTH,
|
||||
mp_count,
|
||||
per_profile_budget / (1024u * 1024u));
|
||||
|
||||
for (size_t i = 0; i < sizeof(k_profiles) / sizeof(k_profiles[0]); i++) {
|
||||
for (int i = 0; i < profile_count; i++) {
|
||||
const struct profile_desc *desc = &k_profiles[i];
|
||||
if (!(desc->enabled && cc >= desc->min_cc)) {
|
||||
append_detail(report->details,
|
||||
@@ -1005,63 +1211,87 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
desc->min_cc);
|
||||
continue;
|
||||
}
|
||||
if (prepare_profile(&cublas, handle, cuda, desc, per_profile_budget, &prepared[i])) {
|
||||
active++;
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=READY dim=%llux%llux%llu block=%s\n",
|
||||
desc->name,
|
||||
(unsigned long long)prepared[i].m,
|
||||
(unsigned long long)prepared[i].n,
|
||||
(unsigned long long)prepared[i].k,
|
||||
desc->block_label);
|
||||
} else {
|
||||
append_detail(report->details, sizeof(report->details), "%s=SKIPPED unsupported\n", desc->name);
|
||||
for (int lane = 0; lane < stream_count; lane++) {
|
||||
CUstream stream = streams[lane];
|
||||
if (prepared_count >= (int)(sizeof(prepared) / sizeof(prepared[0]))) {
|
||||
break;
|
||||
}
|
||||
if (prepare_profile(&cublas, handle, cuda, desc, stream, per_profile_budget, &prepared[prepared_count])) {
|
||||
active++;
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s[%d]=READY dim=%llux%llux%llu block=%s stream=%d\n",
|
||||
desc->name,
|
||||
lane,
|
||||
(unsigned long long)prepared[prepared_count].m,
|
||||
(unsigned long long)prepared[prepared_count].n,
|
||||
(unsigned long long)prepared[prepared_count].k,
|
||||
desc->block_label,
|
||||
lane);
|
||||
prepared_count++;
|
||||
} else {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s[%d]=SKIPPED unsupported\n",
|
||||
desc->name,
|
||||
lane);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (active <= 0) {
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
|
||||
double deadline = now_seconds() + (double)seconds;
|
||||
while (now_seconds() < deadline) {
|
||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
}
|
||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=FAILED runtime\n",
|
||||
prepared[i].desc.name);
|
||||
for (size_t j = 0; j < sizeof(prepared) / sizeof(prepared[0]); j++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||
wave_launches = 0;
|
||||
for (int depth = 0; depth < STRESS_LAUNCH_DEPTH && now_seconds() < deadline; depth++) {
|
||||
int launched_this_batch = 0;
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
if (!run_cublas_profile(handle, &cublas, &prepared[i])) {
|
||||
append_detail(report->details,
|
||||
sizeof(report->details),
|
||||
"%s=FAILED runtime\n",
|
||||
prepared[i].desc.name);
|
||||
for (int j = 0; j < prepared_count; j++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[j]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
prepared[i].iterations++;
|
||||
report->iterations++;
|
||||
wave_launches++;
|
||||
launched_this_batch++;
|
||||
}
|
||||
prepared[i].iterations++;
|
||||
report->iterations++;
|
||||
if (now_seconds() >= deadline) {
|
||||
if (launched_this_batch <= 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||
if (wave_launches <= 0) {
|
||||
break;
|
||||
}
|
||||
if (!check_rc(cuda, "cuCtxSynchronize", cuda->cuCtxSynchronize())) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (!prepared[i].ready) {
|
||||
continue;
|
||||
}
|
||||
@@ -1072,7 +1302,7 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
prepared[i].iterations);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
if (prepared[i].ready) {
|
||||
if (check_rc(cuda, "cuMemcpyDtoH", cuda->cuMemcpyDtoH(sample, prepared[i].d_dev, sizeof(sample)))) {
|
||||
for (size_t j = 0; j < sizeof(sample) / sizeof(sample[0]); j++) {
|
||||
@@ -1083,10 +1313,11 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < sizeof(prepared) / sizeof(prepared[0]); i++) {
|
||||
for (int i = 0; i < prepared_count; i++) {
|
||||
destroy_profile(&cublas, cuda, &prepared[i]);
|
||||
}
|
||||
cublas.cublasLtDestroy(handle);
|
||||
destroy_streams(cuda, streams, stream_count);
|
||||
cuda->cuCtxDestroy(ctx);
|
||||
return 1;
|
||||
}
|
||||
@@ -1095,13 +1326,16 @@ static int run_cublaslt_stress(struct cuda_api *cuda,
|
||||
int main(int argc, char **argv) {
|
||||
int seconds = 5;
|
||||
int size_mb = 64;
|
||||
int device_index = 0;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||
seconds = atoi(argv[++i]);
|
||||
} else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
|
||||
size_mb = atoi(argv[++i]);
|
||||
} else if ((strcmp(argv[i], "--device") == 0 || strcmp(argv[i], "-d") == 0) && i + 1 < argc) {
|
||||
device_index = atoi(argv[++i]);
|
||||
} else {
|
||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
|
||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N] [--device N]\n", argv[0]);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
@@ -1111,6 +1345,9 @@ int main(int argc, char **argv) {
|
||||
if (size_mb <= 0) {
|
||||
size_mb = 64;
|
||||
}
|
||||
if (device_index < 0) {
|
||||
device_index = 0;
|
||||
}
|
||||
|
||||
struct cuda_api cuda;
|
||||
if (!load_cuda(&cuda)) {
|
||||
@@ -1133,8 +1370,13 @@ int main(int argc, char **argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (device_index >= count) {
|
||||
fprintf(stderr, "device index %d out of range (found %d CUDA device(s))\n", device_index, count);
|
||||
return 1;
|
||||
}
|
||||
|
||||
CUdevice dev = 0;
|
||||
if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, 0))) {
|
||||
if (!check_rc(&cuda, "cuDeviceGet", cuda.cuDeviceGet(&dev, device_index))) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1162,10 +1404,12 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
|
||||
printf("device=%s\n", report.device);
|
||||
printf("device_index=%d\n", device_index);
|
||||
printf("compute_capability=%d.%d\n", report.cc_major, report.cc_minor);
|
||||
printf("backend=%s\n", report.backend);
|
||||
printf("duration_s=%d\n", seconds);
|
||||
printf("buffer_mb=%d\n", report.buffer_mb);
|
||||
printf("streams=%d\n", report.stream_count);
|
||||
printf("iterations=%lu\n", report.iterations);
|
||||
printf("checksum=%llu\n", (unsigned long long)report.checksum);
|
||||
if (report.details[0] != '\0') {
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#!/bin/sh
|
||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-stress.
|
||||
# build-cublas.sh — download cuBLASLt/cuBLAS/cudart runtime + headers for bee-gpu-burn worker.
|
||||
#
|
||||
# Downloads .deb packages from NVIDIA's CUDA apt repository (Debian 12, x86_64),
|
||||
# verifies them against Packages.gz, and extracts the small subset we need:
|
||||
# - headers for compiling bee-gpu-stress against cuBLASLt
|
||||
# - headers for compiling bee-gpu-burn worker against cuBLASLt
|
||||
# - runtime libs for libcublas, libcublasLt, libcudart inside the ISO
|
||||
|
||||
set -e
|
||||
|
||||
55
iso/builder/build-john.sh
Normal file
55
iso/builder/build-john.sh
Normal file
@@ -0,0 +1,55 @@
|
||||
#!/bin/sh
# build-john.sh — build John the Ripper jumbo with OpenCL support for the LiveCD.
#
# Downloads a pinned source snapshot from the official openwall/john repository,
# builds it inside the builder container, and caches the resulting run/ tree.
#
# Usage: build-john.sh <john-commit> <dist-dir>
#
# Environment:
#   BEE_CACHE_DIR  optional root for the download cache (default: <dist-dir>/cache)

set -e

JOHN_COMMIT="$1"
DIST_DIR="$2"

[ -n "$JOHN_COMMIT" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }
[ -n "$DIST_DIR" ] || { echo "usage: $0 <john-commit> <dist-dir>"; exit 1; }

echo "=== John the Ripper jumbo ${JOHN_COMMIT} ==="

CACHE_DIR="${DIST_DIR}/john-${JOHN_COMMIT}"
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/john-downloads"
SRC_TAR="${DOWNLOAD_CACHE_DIR}/john-${JOHN_COMMIT}.tar.gz"
SRC_URL="https://github.com/openwall/john/archive/${JOHN_COMMIT}.tar.gz"

# A previous successful build leaves both the binary and its config behind.
if [ -x "${CACHE_DIR}/run/john" ] && [ -f "${CACHE_DIR}/run/john.conf" ]; then
    echo "=== john cached, skipping build ==="
    echo "run dir: ${CACHE_DIR}/run"
    exit 0
fi

mkdir -p "${DOWNLOAD_CACHE_DIR}"
# -s (not -f): an empty file left behind by an aborted download is not a cache hit.
if [ ! -s "${SRC_TAR}" ]; then
    echo "=== downloading john source snapshot ==="
    # Download to a side file and rename only on success, so an interrupted or
    # failed transfer can never be mistaken for a cached tarball on the next run.
    rm -f "${SRC_TAR}" "${SRC_TAR}.part"
    if ! wget --show-progress -O "${SRC_TAR}.part" "${SRC_URL}"; then
        rm -f "${SRC_TAR}.part"
        echo "ERROR: failed to download ${SRC_URL}"
        exit 1
    fi
    mv "${SRC_TAR}.part" "${SRC_TAR}"
fi

BUILD_TMP=$(mktemp -d)
# The EXIT trap does the cleanup; the signal traps only turn the signal into an
# exit so the script does not keep running after the handler returns.
trap 'rm -rf "${BUILD_TMP}"' EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

cd "${BUILD_TMP}"
tar xf "${SRC_TAR}"
SRC_DIR=$(find . -maxdepth 1 -type d -name 'john-*' | head -1)
[ -n "${SRC_DIR}" ] || { echo "ERROR: john source directory not found"; exit 1; }

cd "${SRC_DIR}/src"
echo "=== configuring john ==="
./configure
echo "=== building john ==="
make clean >/dev/null 2>&1 || true
make -j"$(nproc)"

mkdir -p "${CACHE_DIR}"
# Drop any stale/partial run/ first; otherwise cp -a would nest the new tree
# as run/run and the cache check above would keep failing.
rm -rf "${CACHE_DIR}/run"
cp -a "../run" "${CACHE_DIR}/run"
chmod +x "${CACHE_DIR}/run/john"

echo "=== john build complete ==="
echo "run dir: ${CACHE_DIR}/run"
|
||||
@@ -9,6 +9,7 @@
|
||||
#
|
||||
# Output layout:
|
||||
# $CACHE_DIR/bin/all_reduce_perf
|
||||
# $CACHE_DIR/lib/libcudart.so* copied from the nvcc toolchain used to build nccl-tests
|
||||
|
||||
set -e
|
||||
|
||||
@@ -30,7 +31,7 @@ CACHE_DIR="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nccl-tests-downloads"
|
||||
|
||||
if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ]; then
|
||||
if [ -f "${CACHE_DIR}/bin/all_reduce_perf" ] && [ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== nccl-tests cached, skipping build ==="
|
||||
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||
exit 0
|
||||
@@ -52,6 +53,23 @@ echo "nvcc: $NVCC"
|
||||
CUDA_HOME="$(dirname "$(dirname "$NVCC")")"
|
||||
echo "CUDA_HOME: $CUDA_HOME"
|
||||
|
||||
# find_cudart_dir — print the first directory under $CUDA_HOME that directly
# contains a regular file named libcudart.so*.
# Candidates are probed in a fixed order; returns 0 after printing a match,
# 1 (printing nothing) when no candidate qualifies.
find_cudart_dir() {
    for dir in \
        "${CUDA_HOME}/targets/x86_64-linux/lib" \
        "${CUDA_HOME}/targets/x86_64-linux/lib/stubs" \
        "${CUDA_HOME}/lib64" \
        "${CUDA_HOME}/lib"; do
        # Skip candidates that do not exist at all.
        [ -d "$dir" ] || continue
        # grep -q . succeeds only when find printed at least one path.
        find "$dir" -maxdepth 1 -name 'libcudart.so*' -type f | grep -q . || continue
        printf '%s\n' "$dir"
        return 0
    done
    return 1
}
|
||||
|
||||
CUDART_DIR="$(find_cudart_dir)" || { echo "ERROR: libcudart.so* not found under ${CUDA_HOME}"; exit 1; }
|
||||
echo "cudart dir: $CUDART_DIR"
|
||||
|
||||
# Download libnccl-dev for nccl.h
|
||||
REPO_BASE="https://developer.download.nvidia.com/compute/cuda/repos/debian${DEBIAN_VERSION}/x86_64"
|
||||
DEV_PKG="libnccl-dev_${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}_amd64.deb"
|
||||
@@ -136,6 +154,11 @@ mkdir -p "${CACHE_DIR}/bin"
|
||||
cp "./build/all_reduce_perf" "${CACHE_DIR}/bin/all_reduce_perf"
|
||||
chmod +x "${CACHE_DIR}/bin/all_reduce_perf"
|
||||
|
||||
mkdir -p "${CACHE_DIR}/lib"
|
||||
find "${CUDART_DIR}" -maxdepth 1 -name 'libcudart.so*' -type f -exec cp -a {} "${CACHE_DIR}/lib/" \;
|
||||
[ "$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libcudart.so*' -type f | wc -l)" -gt 0 ] || { echo "ERROR: libcudart runtime copy failed"; exit 1; }
|
||||
|
||||
echo "=== nccl-tests build complete ==="
|
||||
echo "binary: ${CACHE_DIR}/bin/all_reduce_perf"
|
||||
ls -lh "${CACHE_DIR}/bin/all_reduce_perf"
|
||||
ls -lh "${CACHE_DIR}/lib/"libcudart.so* 2>/dev/null || true
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
# Output layout:
|
||||
# $CACHE_DIR/modules/ — nvidia*.ko files
|
||||
# $CACHE_DIR/bin/ — nvidia-smi, nvidia-debugdump
|
||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so* (for nvidia-smi)
|
||||
# $CACHE_DIR/lib/ — libnvidia-ml.so*, libcuda.so*, OpenCL-related libs
|
||||
|
||||
set -e
|
||||
|
||||
@@ -46,7 +46,10 @@ CACHE_DIR="${DIST_DIR}/nvidia-${NVIDIA_VERSION}-${KVER}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/nvidia-downloads"
|
||||
EXTRACT_CACHE_DIR="${CACHE_ROOT}/nvidia-extract"
|
||||
CACHE_LAYOUT_VERSION="2"
|
||||
CACHE_LAYOUT_MARKER="${CACHE_DIR}/.cache-layout-v${CACHE_LAYOUT_VERSION}"
|
||||
if [ -d "$CACHE_DIR/modules" ] && [ -f "$CACHE_DIR/bin/nvidia-smi" ] \
|
||||
&& [ -f "$CACHE_LAYOUT_MARKER" ] \
|
||||
&& [ "$(ls "$CACHE_DIR/lib/libnvidia-ptxjitcompiler.so."* 2>/dev/null | wc -l)" -gt 0 ]; then
|
||||
echo "=== NVIDIA cached, skipping build ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
@@ -130,17 +133,30 @@ else
|
||||
echo "WARNING: no firmware/ dir found in installer (may be needed for Hopper GPUs)"
|
||||
fi
|
||||
|
||||
# Copy ALL userspace library files.
|
||||
# libnvidia-ptxjitcompiler is required by libcuda for PTX JIT compilation
|
||||
# (cuModuleLoadDataEx with PTX source) — without it CUDA_ERROR_JIT_COMPILER_NOT_FOUND.
|
||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||
count=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 -name "${lib}.so.*" 2>/dev/null); do
|
||||
cp "$f" "$CACHE_DIR/lib/" && count=$((count+1))
|
||||
done
|
||||
if [ "$count" -eq 0 ]; then
|
||||
echo "ERROR: ${lib}.so.* not found in $EXTRACT_DIR"
|
||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -20 || true
|
||||
# Copy NVIDIA userspace libraries broadly instead of whitelisting a few names.
|
||||
# Newer driver branches add extra runtime deps (for example OpenCL/compiler side
|
||||
# libraries). If we only copy a narrow allowlist, clinfo/John can see nvidia.icd
|
||||
# but still fail with "no OpenCL platforms" because one dependent .so is absent.
|
||||
copied_libs=0
|
||||
for f in $(find "$EXTRACT_DIR" -maxdepth 1 \( -name 'libnvidia*.so.*' -o -name 'libcuda.so.*' \) -type f 2>/dev/null | sort); do
|
||||
cp "$f" "$CACHE_DIR/lib/"
|
||||
copied_libs=$((copied_libs+1))
|
||||
done
|
||||
|
||||
if [ "$copied_libs" -eq 0 ]; then
|
||||
echo "ERROR: no NVIDIA userspace libraries found in $EXTRACT_DIR"
|
||||
ls "$EXTRACT_DIR/"*.so* 2>/dev/null | head -40 || true
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for lib in \
|
||||
libnvidia-ml \
|
||||
libcuda \
|
||||
libnvidia-ptxjitcompiler \
|
||||
libnvidia-opencl; do
|
||||
if ! ls "$CACHE_DIR/lib/${lib}.so."* >/dev/null 2>&1; then
|
||||
echo "ERROR: required ${lib}.so.* not found in extracted userspace libs"
|
||||
ls "$CACHE_DIR/lib/" | sort >&2 || true
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
@@ -149,16 +165,17 @@ done
|
||||
ko_count=$(ls "$CACHE_DIR/modules/"*.ko 2>/dev/null | wc -l)
|
||||
[ "$ko_count" -gt 0 ] || { echo "ERROR: no .ko files built in $CACHE_DIR/modules/"; exit 1; }
|
||||
|
||||
# Create soname symlinks: use [0-9][0-9]* to avoid circular symlink (.so.1 has single digit)
|
||||
for lib in libnvidia-ml libcuda libnvidia-ptxjitcompiler; do
|
||||
versioned=$(ls "$CACHE_DIR/lib/${lib}.so."[0-9][0-9]* 2>/dev/null | head -1)
|
||||
[ -n "$versioned" ] || continue
|
||||
# Create soname symlinks for every copied versioned library.
|
||||
for versioned in "$CACHE_DIR"/lib/*.so.*; do
|
||||
[ -f "$versioned" ] || continue
|
||||
base=$(basename "$versioned")
|
||||
ln -sf "$base" "$CACHE_DIR/lib/${lib}.so.1"
|
||||
ln -sf "${lib}.so.1" "$CACHE_DIR/lib/${lib}.so" 2>/dev/null || true
|
||||
echo "${lib}: .so.1 -> $base"
|
||||
stem=${base%%.so.*}
|
||||
ln -sf "$base" "$CACHE_DIR/lib/${stem}.so.1"
|
||||
ln -sf "${stem}.so.1" "$CACHE_DIR/lib/${stem}.so" 2>/dev/null || true
|
||||
done
|
||||
|
||||
touch "$CACHE_LAYOUT_MARKER"
|
||||
|
||||
echo "=== NVIDIA build complete ==="
|
||||
echo "cache: $CACHE_DIR"
|
||||
echo "modules: $ko_count .ko files"
|
||||
|
||||
@@ -38,6 +38,7 @@ export BEE_GPU_VENDOR
|
||||
|
||||
. "${BUILDER_DIR}/VERSIONS"
|
||||
export PATH="$PATH:/usr/local/go/bin"
|
||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||
|
||||
# Allow git to read the bind-mounted repo (different UID inside container).
|
||||
git config --global safe.directory "${REPO_ROOT}"
|
||||
@@ -111,8 +112,629 @@ resolve_iso_version() {
|
||||
resolve_audit_version
|
||||
}
|
||||
|
||||
# iso_list_files ISO — write the ISO's file listing to stdout.
# Uses bsdtar when available, otherwise xorriso (whose leading "/" is stripped
# by sed). Returns the status of the tool pipeline that ran, or 127 when
# neither tool is installed.
iso_list_files() {
    iso_path="$1"

    if command -v bsdtar >/dev/null 2>&1; then
        bsdtar -tf "$iso_path"
    elif command -v xorriso >/dev/null 2>&1; then
        # NOTE(review): the pipeline status here is sed's, not xorriso's.
        xorriso -indev "$iso_path" -find / -type f -print 2>/dev/null | sed 's#^/##'
    else
        return 127
    fi
}
|
||||
|
||||
# iso_extract_file ISO MEMBER — stream one member of the ISO to stdout.
# MEMBER is given without a leading slash (one is added for xorriso).
# Returns the extractor's status, or 127 when neither bsdtar nor xorriso exists.
iso_extract_file() {
    iso_path="$1"
    iso_member="$2"

    if command -v bsdtar >/dev/null 2>&1; then
        bsdtar -xOf "$iso_path" "$iso_member"
    elif command -v xorriso >/dev/null 2>&1; then
        xorriso -osirrox on -indev "$iso_path" -cat "/$iso_member" 2>/dev/null
    else
        return 127
    fi
}
|
||||
|
||||
# iso_read_file_list ISO OUT — save the ISO file listing into OUT.
# Returns 0 only when listing succeeded and OUT is non-empty; 1 otherwise.
iso_read_file_list() {
    iso_path="$1"
    out_path="$2"

    if iso_list_files "$iso_path" > "$out_path" && [ -s "$out_path" ]; then
        return 0
    fi
    return 1
}
|
||||
|
||||
# iso_read_member ISO MEMBER OUT — extract MEMBER from the ISO into OUT.
# Returns 0 only when extraction succeeded and OUT is non-empty; 1 otherwise.
iso_read_member() {
    iso_path="$1"
    iso_member="$2"
    out_path="$3"

    if iso_extract_file "$iso_path" "$iso_member" > "$out_path" && [ -s "$out_path" ]; then
        return 0
    fi
    return 1
}
|
||||
|
||||
# require_iso_reader [ISO] — succeed when bsdtar or xorriso is installed;
# otherwise report the problem through memtest_fail (passing ISO along).
# NOTE(review): when no reader exists the return status is whatever
# memtest_fail returns, which is 0 in non-strict mode — confirm callers
# using "|| return" expect that.
require_iso_reader() {
    if command -v bsdtar >/dev/null 2>&1 || command -v xorriso >/dev/null 2>&1; then
        return 0
    fi
    memtest_fail "ISO reader is required for validation/debug (expected bsdtar or xorriso)" "${1:-}"
}
|
||||
|
||||
# dump_memtest_debug PHASE [LB_DIR] [ISO] — print a diagnostic report about
# where memtest shows up (or does not) across the build inputs and outputs.
#
#   PHASE   free-form label; spaces and slashes become "_" in the log name
#   LB_DIR  live-build work dir; its sections are skipped when empty/missing
#   ISO     ISO image to inspect; its sections are skipped when empty/missing
#
# The report goes to stdout and, when $LOG_DIR is an existing directory, is
# also written to $LOG_DIR/memtest-<phase>.log.
dump_memtest_debug() {
    phase="$1"
    lb_dir="${2:-}"
    iso_path="${3:-}"
    phase_slug="$(printf '%s' "${phase}" | tr ' /' '__')"
    memtest_log="${LOG_DIR:-}/memtest-${phase_slug}.log"

    # The report body runs in a subshell so its temp-file variables do not
    # leak into the caller and its whole output can be piped as one stream.
    (
        echo "=== memtest debug: ${phase} ==="

        echo "-- auto/config --"
        if [ -f "${BUILDER_DIR}/auto/config" ]; then
            grep -n -- '--memtest' "${BUILDER_DIR}/auto/config" || echo " (no --memtest line found)"
        else
            echo " (missing ${BUILDER_DIR}/auto/config)"
        fi

        echo "-- source bootloader templates --"
        for cfg in \
            "${BUILDER_DIR}/config/bootloaders/grub-pc/grub.cfg" \
            "${BUILDER_DIR}/config/bootloaders/isolinux/live.cfg.in"; do
            if [ -f "$cfg" ]; then
                echo " file: $cfg"
                grep -n 'Memory Test\|memtest' "$cfg" || echo " (no memtest lines)"
            fi
        done

        echo "-- source binary hooks --"
        for hook in \
            "${BUILDER_DIR}/config/hooks/normal/9100-memtest.hook.binary"; do
            if [ -f "$hook" ]; then
                echo " hook: $hook"
            else
                echo " (missing $hook)"
            fi
        done

        if [ -n "$lb_dir" ] && [ -d "$lb_dir" ]; then
            echo "-- live-build workdir package lists --"
            for pkg in \
                "$lb_dir/config/package-lists/bee.list.chroot" \
                "$lb_dir/config/package-lists/bee-gpu.list.chroot" \
                "$lb_dir/config/package-lists/bee-nvidia.list.chroot"; do
                if [ -f "$pkg" ]; then
                    echo " file: $pkg"
                    grep -n 'memtest' "$pkg" || echo " (no memtest lines)"
                fi
            done

            echo "-- live-build chroot/boot --"
            if [ -d "$lb_dir/chroot/boot" ]; then
                find "$lb_dir/chroot/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
            else
                echo " (missing $lb_dir/chroot/boot)"
            fi

            echo "-- live-build binary/boot --"
            if [ -d "$lb_dir/binary/boot" ]; then
                find "$lb_dir/binary/boot" -maxdepth 1 -name 'memtest*' -print | sed 's/^/ /' || true
            else
                echo " (missing $lb_dir/binary/boot)"
            fi

            echo "-- live-build binary grub cfg --"
            if [ -f "$lb_dir/binary/boot/grub/grub.cfg" ]; then
                grep -n 'Memory Test\|memtest' "$lb_dir/binary/boot/grub/grub.cfg" || echo " (no memtest lines)"
            else
                echo " (missing $lb_dir/binary/boot/grub/grub.cfg)"
            fi

            echo "-- live-build binary isolinux cfg --"
            if [ -f "$lb_dir/binary/isolinux/live.cfg" ]; then
                grep -n 'Memory Test\|memtest' "$lb_dir/binary/isolinux/live.cfg" || echo " (no memtest lines)"
            else
                echo " (missing $lb_dir/binary/isolinux/live.cfg)"
            fi

            echo "-- live-build package cache --"
            if [ -d "$lb_dir/cache/packages.chroot" ]; then
                find "$lb_dir/cache/packages.chroot" -maxdepth 1 -name 'memtest86+*.deb' -print | sed 's/^/ /' || true
            else
                echo " (missing $lb_dir/cache/packages.chroot)"
            fi
        fi

        if [ -n "$iso_path" ] && [ -f "$iso_path" ]; then
            iso_files="$(mktemp)"
            iso_grub_cfg="$(mktemp)"
            iso_isolinux_cfg="$(mktemp)"

            echo "-- ISO memtest files --"
            if iso_read_file_list "$iso_path" "$iso_files"; then
                # Test grep on its own: in "grep | sed || echo" the || would
                # see sed's status and the fallback line could never print.
                if grep -q 'memtest' "$iso_files"; then
                    grep 'memtest' "$iso_files" | sed 's/^/ /'
                else
                    echo " (no memtest files in ISO)"
                fi
            else
                echo " (failed to list ISO contents)"
            fi

            echo "-- ISO GRUB memtest lines --"
            if iso_read_member "$iso_path" boot/grub/grub.cfg "$iso_grub_cfg"; then
                grep -n 'Memory Test\|memtest' "$iso_grub_cfg" || echo " (no memtest lines in boot/grub/grub.cfg)"
            else
                echo " (failed to read boot/grub/grub.cfg from ISO)"
            fi

            echo "-- ISO isolinux memtest lines --"
            if iso_read_member "$iso_path" isolinux/live.cfg "$iso_isolinux_cfg"; then
                grep -n 'Memory Test\|memtest' "$iso_isolinux_cfg" || echo " (no memtest lines in isolinux/live.cfg)"
            else
                echo " (failed to read isolinux/live.cfg from ISO)"
            fi

            rm -f "$iso_files" "$iso_grub_cfg" "$iso_isolinux_cfg"
        fi

        echo "=== end memtest debug: ${phase} ==="
    ) | {
        # Mirror the report into the per-phase log only when a log dir exists.
        if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ]; then
            tee "${memtest_log}"
        else
            cat
        fi
    }
}
|
||||
|
||||
# memtest_fail MSG [ISO] — report a memtest problem on stderr, followed by a
# "failure" debug dump for $LB_DIR and ISO.
# With BEE_REQUIRE_MEMTEST=1 the message is an ERROR and the script exits 1;
# otherwise it is a WARNING and the function returns 0.
memtest_fail() {
    msg="$1"
    iso_path="${2:-}"

    case "${BEE_REQUIRE_MEMTEST:-0}" in
        1) level="ERROR" ;;
        *) level="WARNING" ;;
    esac

    echo "${level}: ${msg}" >&2
    dump_memtest_debug "failure" "${LB_DIR:-}" "$iso_path" >&2

    # Strict mode turns every memtest problem into a hard build failure.
    case "${BEE_REQUIRE_MEMTEST:-0}" in
        1) exit 1 ;;
    esac
    return 0
}
|
||||
|
||||
# iso_memtest_present ISO — silently check that the ISO ships memtest86+.
#
# Checks, in order: both payload files are in the ISO listing, then the GRUB
# and isolinux configs contain the menu entry and the payload paths.
#
# Returns:
#   0  everything is present
#   1  ISO missing, or a payload/menu entry/path is absent
#   2  the ISO cannot be inspected (no reader tool, or listing/extract failed)
iso_memtest_present() {
    iso_path="$1"

    # Run the cheap guards before creating any temp file so the early
    # returns cannot leak one.
    [ -f "$iso_path" ] || return 1
    if ! command -v bsdtar >/dev/null 2>&1 && ! command -v xorriso >/dev/null 2>&1; then
        return 2
    fi

    iso_files="$(mktemp)"
    memtest_present_rc=0
    if ! iso_read_file_list "$iso_path" "$iso_files"; then
        memtest_present_rc=2
    elif ! grep -q '^boot/memtest86+x64\.bin$' "$iso_files" \
        || ! grep -q '^boot/memtest86+x64\.efi$' "$iso_files"; then
        memtest_present_rc=1
    fi
    if [ "$memtest_present_rc" -ne 0 ]; then
        rm -f "$iso_files"
        return "$memtest_present_rc"
    fi

    grub_cfg="$(mktemp)"
    isolinux_cfg="$(mktemp)"
    if ! iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg" \
        || ! iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg"; then
        memtest_present_rc=2
    elif ! grep -q 'Memory Test (memtest86+)' "$grub_cfg" \
        || ! grep -q '/boot/memtest86+x64\.efi' "$grub_cfg" \
        || ! grep -q '/boot/memtest86+x64\.bin' "$grub_cfg" \
        || ! grep -q 'Memory Test (memtest86+)' "$isolinux_cfg" \
        || ! grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg"; then
        memtest_present_rc=1
    fi

    # Single cleanup point for every outcome past the guards.
    rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"
    return "$memtest_present_rc"
}
|
||||
|
||||
# validate_iso_memtest ISO — verify that the ISO ships memtest86+ and that
# both boot menus reference it.
#
# The first problem found is reported through memtest_fail, which exits the
# script in strict mode (BEE_REQUIRE_MEMTEST=1) and only warns otherwise.
# The function itself always returns 0.
validate_iso_memtest() {
    iso_path="$1"
    echo "=== validating memtest in ISO ==="

    [ -f "$iso_path" ] || {
        memtest_fail "ISO not found for validation: $iso_path" "$iso_path"
        return 0
    }
    require_iso_reader "$iso_path" || return 0

    iso_files="$(mktemp)"
    grub_cfg="$(mktemp)"
    isolinux_cfg="$(mktemp)"

    # Find the first failing check but do not report it yet: memtest_fail may
    # exit the script, so the temp files have to be removed before calling it.
    memtest_problem=""
    if ! iso_read_file_list "$iso_path" "$iso_files"; then
        memtest_problem="failed to list ISO contents while validating memtest"
    elif ! grep -q '^boot/memtest86+x64\.bin$' "$iso_files"; then
        memtest_problem="memtest BIOS binary missing in ISO: boot/memtest86+x64.bin"
    elif ! grep -q '^boot/memtest86+x64\.efi$' "$iso_files"; then
        memtest_problem="memtest EFI binary missing in ISO: boot/memtest86+x64.efi"
    elif ! iso_read_member "$iso_path" boot/grub/grub.cfg "$grub_cfg"; then
        memtest_problem="failed to read boot/grub/grub.cfg from ISO"
    elif ! iso_read_member "$iso_path" isolinux/live.cfg "$isolinux_cfg"; then
        memtest_problem="failed to read isolinux/live.cfg from ISO"
    elif ! grep -q 'Memory Test (memtest86+)' "$grub_cfg"; then
        memtest_problem="GRUB menu entry for memtest is missing"
    elif ! grep -q '/boot/memtest86+x64\.efi' "$grub_cfg"; then
        memtest_problem="GRUB memtest EFI path is missing"
    elif ! grep -q '/boot/memtest86+x64\.bin' "$grub_cfg"; then
        memtest_problem="GRUB memtest BIOS path is missing"
    elif ! grep -q 'Memory Test (memtest86+)' "$isolinux_cfg"; then
        memtest_problem="isolinux menu entry for memtest is missing"
    elif ! grep -q '/boot/memtest86+x64\.bin' "$isolinux_cfg"; then
        memtest_problem="isolinux memtest path is missing"
    fi

    rm -f "$iso_files" "$grub_cfg" "$isolinux_cfg"

    if [ -n "$memtest_problem" ]; then
        memtest_fail "$memtest_problem" "$iso_path"
        return 0
    fi

    echo "=== memtest validation OK ==="
}
|
||||
|
||||
# append_memtest_grub_entry GRUB_CFG — append the memtest86+ menu block to an
# existing GRUB config.
# Returns 1 when the file does not exist. Does nothing (returns 0) when the
# file already contains the menu title or the BEE MEMTEST marker.
append_memtest_grub_entry() {
    grub_cfg="$1"
    [ -f "$grub_cfg" ] || return 1

    # Either pattern means the entry is already there.
    if grep -q -e 'Memory Test (memtest86+)' -e '### BEE MEMTEST ###' "$grub_cfg"; then
        return 0
    fi

    cat >> "$grub_cfg" <<'EOF'

### BEE MEMTEST ###
if [ "${grub_platform}" = "efi" ]; then
menuentry "Memory Test (memtest86+)" {
chainloader /boot/memtest86+x64.efi
}
else
menuentry "Memory Test (memtest86+)" {
linux16 /boot/memtest86+x64.bin
}
fi
### /BEE MEMTEST ###
EOF
}
|
||||
|
||||
# append_memtest_isolinux_entry ISOLINUX_CFG — append the memtest86+ label
# block to an existing isolinux config.
# Returns 1 when the file does not exist. Does nothing (returns 0) when the
# file already contains the menu title or the BEE MEMTEST marker.
append_memtest_isolinux_entry() {
    isolinux_cfg="$1"
    [ -f "$isolinux_cfg" ] || return 1

    # Either pattern means the entry is already there.
    if grep -q -e 'Memory Test (memtest86+)' -e '### BEE MEMTEST ###' "$isolinux_cfg"; then
        return 0
    fi

    cat >> "$isolinux_cfg" <<'EOF'

# ### BEE MEMTEST ###
label memtest
menu label ^Memory Test (memtest86+)
linux /boot/memtest86+x64.bin
# ### /BEE MEMTEST ###
EOF
}
|
||||
|
||||
# copy_memtest_from_deb DEB DST_BOOT — unpack DEB into a scratch directory and
# copy whichever of boot/memtest86+x64.bin and boot/memtest86+x64.efi it
# contains into DST_BOOT.
# Returns non-zero when unpacking or copying fails; the scratch directory is
# removed on every path.
copy_memtest_from_deb() {
    deb="$1"
    dst_boot="$2"
    tmpdir="$(mktemp -d)"

    # Handle failures explicitly: under `set -e` a bare failing command would
    # abort the script here and leave the scratch directory behind.
    if ! dpkg-deb -x "$deb" "$tmpdir"; then
        rm -rf "$tmpdir"
        return 1
    fi

    for f in memtest86+x64.bin memtest86+x64.efi; do
        if [ -f "$tmpdir/boot/$f" ]; then
            if ! cp "$tmpdir/boot/$f" "$dst_boot/$f"; then
                rm -rf "$tmpdir"
                return 1
            fi
        fi
    done

    rm -rf "$tmpdir"
}
|
||||
|
||||
# reset_live_build_stage LB_DIR STAGE — delete the entries directly inside
# LB_DIR/.build, LB_DIR/.stage and LB_DIR/auto whose name contains STAGE.
# Missing directories are skipped and deletion errors are ignored.
reset_live_build_stage() {
    lb_dir="$1"
    stage="$2"

    # An empty stage would turn the pattern into "**", which matches every
    # entry; refuse it instead of wiping all three directories.
    [ -n "$stage" ] || return 0

    for root in \
        "$lb_dir/.build" \
        "$lb_dir/.stage" \
        "$lb_dir/auto"; do
        [ -d "$root" ] || continue
        # "*STAGE*" also covers the exact name and "STAGE.*". -mindepth 1
        # keeps the directory itself from being a deletion candidate.
        find "$root" -mindepth 1 -maxdepth 1 -name "*${stage}*" -exec rm -rf {} + 2>/dev/null || true
    done
}
|
||||
|
||||
# recover_iso_memtest LB_DIR ISO — try to put memtest86+ back into the
# live-build binary tree and rebuild the ISO.
#
# Payload sources, tried in order until both files exist in binary/boot:
#   1. files already present in the chroot's /boot or the builder's /boot
#   2. a memtest86+ .deb found in one of the known package cache directories
#   3. a fresh `apt-get download memtest86+`
# Afterwards the GRUB and isolinux menu entries are ensured, the checksum, ISO
# and zsync stages are reset and re-run, and a still-missing ISO is reported
# through memtest_fail.
recover_iso_memtest() {
    lb_dir="$1"
    iso_path="$2"
    binary_boot="$lb_dir/binary/boot"
    grub_cfg="$lb_dir/binary/boot/grub/grub.cfg"
    isolinux_cfg="$lb_dir/binary/isolinux/live.cfg"

    echo "=== attempting memtest recovery in binary tree ==="
    mkdir -p "$binary_boot"

    # Source 1: payload files lying around in a boot directory.
    for root in \
        "$lb_dir/chroot/boot" \
        "/boot"; do
        for f in memtest86+x64.bin memtest86+x64.efi; do
            [ -f "$binary_boot/$f" ] && continue
            [ -f "$root/$f" ] || continue
            cp "$root/$f" "$binary_boot/$f"
            echo "memtest recovery: copied $f from $root"
        done
    done

    # Source 2: the first cached memtest86+ package found.
    if ! { [ -f "$binary_boot/memtest86+x64.bin" ] && [ -f "$binary_boot/memtest86+x64.efi" ]; }; then
        for dir in \
            "$lb_dir/cache/packages.binary" \
            "$lb_dir/cache/packages.chroot" \
            "$lb_dir/chroot/var/cache/apt/archives" \
            "${BEE_CACHE_DIR:-${DIST_DIR}/cache}/lb-packages" \
            "/var/cache/apt/archives"; do
            [ -d "$dir" ] || continue
            deb="$(find "$dir" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
            [ -n "$deb" ] || continue
            echo "memtest recovery: extracting payload from $deb"
            copy_memtest_from_deb "$deb" "$binary_boot"
            break
        done
    fi

    # Source 3: download the package into a throw-away directory.
    if ! { [ -f "$binary_boot/memtest86+x64.bin" ] && [ -f "$binary_boot/memtest86+x64.efi" ]; }; then
        tmpdl="$(mktemp -d)"
        if (
            cd "$tmpdl" && apt-get download memtest86+ >/dev/null 2>&1
        ); then
            deb="$(find "$tmpdl" -maxdepth 1 -type f -name 'memtest86+*.deb' 2>/dev/null | head -1)"
            [ -z "$deb" ] || {
                echo "memtest recovery: downloaded $deb"
                copy_memtest_from_deb "$deb" "$binary_boot"
            }
        fi
        rm -rf "$tmpdl"
    fi

    # Menu entries: both helpers leave an existing entry untouched.
    if [ ! -f "$grub_cfg" ]; then
        echo "memtest recovery: WARNING: missing $grub_cfg"
    else
        append_memtest_grub_entry "$grub_cfg" && echo "memtest recovery: ensured GRUB entry"
    fi

    if [ ! -f "$isolinux_cfg" ]; then
        echo "memtest recovery: WARNING: missing $isolinux_cfg"
    else
        append_memtest_isolinux_entry "$isolinux_cfg" && echo "memtest recovery: ensured isolinux entry"
    fi

    # Clear the stage markers so live-build actually re-runs these stages.
    reset_live_build_stage "$lb_dir" "binary_checksums"
    reset_live_build_stage "$lb_dir" "binary_iso"
    reset_live_build_stage "$lb_dir" "binary_zsync"

    run_optional_step_sh "rebuild live-build checksums after memtest recovery" "91-lb-checksums" "lb binary_checksums 2>&1"
    run_optional_step_sh "rebuild ISO after memtest recovery" "92-lb-binary-iso" "rm -f '$iso_path' && lb binary_iso 2>&1"
    run_optional_step_sh "rebuild zsync after memtest recovery" "93-lb-zsync" "lb binary_zsync 2>&1"

    [ -f "$iso_path" ] || memtest_fail "ISO rebuild was skipped or failed after memtest recovery: $iso_path" "$iso_path"
}
|
||||
|
||||
# Effective versions are resolved once; every artefact name derives from them.
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"

# Versioned output directory (e.g. dist/easy-bee-v4.1/) — all final artefacts live here.
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
mkdir -p "${OUT_DIR}"

# Artefact paths: the ISO, the per-build log directory and its archive.
ISO_BASENAME="easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64"
ISO_OUT="${OUT_DIR}/${ISO_BASENAME}.iso"
LOG_DIR="${OUT_DIR}/${ISO_BASENAME}.logs"
LOG_OUT="${LOG_DIR}/build.log"
LOG_ARCHIVE="${OUT_DIR}/${ISO_BASENAME}.logs.tar.gz"
|
||||
|
||||
# cleanup_build_log [STATUS] — final teardown of build logging; installed as
# the trap handler by start_build_log.
#
# Closes an active step log, restores the original stdout/stderr (fds 3/4),
# waits for the build-log tee, removes the FIFO, packs $LOG_DIR into
# $LOG_ARCHIVE and exits with STATUS (default: the status of the last command).
cleanup_build_log() {
    status="${1:-$?}"
    # Disarm the traps first so the exit below cannot re-enter this handler.
    trap - EXIT INT TERM HUP

    if [ "${STEP_LOG_ACTIVE:-0}" = "1" ]; then
        cleanup_step_log "${status}" || true
    fi

    if [ "${BUILD_LOG_ACTIVE:-0}" = "1" ]; then
        BUILD_LOG_ACTIVE=0
        # Closing the FIFO writers lets tee see EOF and finish.
        exec 1>&3 2>&4
        exec 3>&- 4>&-
        if [ -n "${BUILD_TEE_PID:-}" ]; then
            wait "${BUILD_TEE_PID}" 2>/dev/null || true
        fi
        rm -f "${BUILD_LOG_PIPE}"
    fi

    if [ -n "${LOG_DIR:-}" ] && [ -d "${LOG_DIR}" ] && command -v tar >/dev/null 2>&1; then
        rm -f "${LOG_ARCHIVE}"
        # Delete the raw logs only once they are safely archived; if tar fails
        # the directory is the only remaining copy of the build logs.
        if tar -czf "${LOG_ARCHIVE}" -C "$(dirname "${LOG_DIR}")" "$(basename "${LOG_DIR}")" 2>/dev/null; then
            rm -rf "${LOG_DIR}"
        else
            echo "WARNING: failed to archive build logs; keeping ${LOG_DIR}" >&2
        fi
    fi

    exit "${status}"
}
|
||||
|
||||
# start_build_log — start mirroring all further script output into $LOG_OUT.
#
# Recreates $LOG_DIR, saves the original stdout/stderr on fds 3/4, starts a
# background tee reading from a FIFO, points stdout and stderr at that FIFO
# and installs cleanup_build_log as the exit/signal handler.
# Exits 1 when tee is not available.
start_build_log() {
    command -v tee >/dev/null 2>&1 || {
        echo "ERROR: tee is required for build logging" >&2
        exit 1
    }

    rm -rf "${LOG_DIR}"
    rm -f "${LOG_ARCHIVE}"
    mkdir -p "${LOG_DIR}"
    # mktemp -u only reserves a name; mkfifo creates the node itself.
    BUILD_LOG_PIPE="$(mktemp -u "${TMPDIR:-/tmp}/bee-build-log.XXXXXX")"
    mkfifo "${BUILD_LOG_PIPE}"

    exec 3>&1 4>&2
    tee "${LOG_OUT}" < "${BUILD_LOG_PIPE}" &
    BUILD_TEE_PID=$!
    exec > "${BUILD_LOG_PIPE}" 2>&1
    BUILD_LOG_ACTIVE=1

    # In a signal trap "$?" is the status of whatever command finished last,
    # which may be 0; pass the conventional 128+signal status instead so an
    # interrupted build never reports success.
    trap 'cleanup_build_log "$?"' EXIT
    trap 'cleanup_build_log 129' HUP
    trap 'cleanup_build_log 130' INT
    trap 'cleanup_build_log 143' TERM

    echo "=== build log dir: ${LOG_DIR} ==="
    echo "=== build log: ${LOG_OUT} ==="
    echo "=== build log archive: ${LOG_ARCHIVE} ==="
}
|
||||
|
||||
# cleanup_step_log [STATUS] — undo the per-step output redirection set up by
# run_step, if one is active: restore stdout/stderr from fds 5/6, wait for the
# step's tee and remove its FIFO.
# Always returns STATUS (default: the status of the last command).
cleanup_step_log() {
    status="${1:-$?}"

    # Nothing to tear down when no step log is active.
    [ "${STEP_LOG_ACTIVE:-0}" = "1" ] || return "${status}"

    STEP_LOG_ACTIVE=0
    exec 1>&5 2>&6
    exec 5>&- 6>&-
    if [ -n "${STEP_TEE_PID:-}" ]; then
        wait "${STEP_TEE_PID}" 2>/dev/null || true
    fi
    rm -f "${STEP_LOG_PIPE}"

    return "${status}"
}
|
||||
|
||||
# run_step NAME SLUG COMMAND [ARGS...] — run one mandatory build step.
#
# The command's stdout and stderr are captured in $LOG_DIR/SLUG.log and also
# forwarded to the current stdout. If the command fails, an error naming the
# step log is printed and the script exits with the command's status.
run_step() {
    step_name="$1"
    step_slug="$2"
    shift 2

    step_log="${LOG_DIR}/${step_slug}.log"
    echo ""
    echo "=== step: ${step_name} ==="
    echo "=== step log: ${step_log} ==="

    # mktemp -u only reserves a name; mkfifo creates the node itself.
    STEP_LOG_PIPE="$(mktemp -u "${TMPDIR:-/tmp}/bee-step-log.XXXXXX")"
    mkfifo "${STEP_LOG_PIPE}"

    # Save the current stdout/stderr on fds 5/6, then route everything through
    # a tee that writes the step log and forwards to the saved stdout.
    exec 5>&1 6>&2
    tee "${step_log}" < "${STEP_LOG_PIPE}" >&5 &
    STEP_TEE_PID=$!
    exec > "${STEP_LOG_PIPE}" 2>&1
    STEP_LOG_ACTIVE=1

    # errexit is suspended so a failing step can be reported before exiting.
    set +e
    "$@"
    step_status=$?
    set -e

    # cleanup_step_log returns the status it was given. Without "|| true" a
    # failed step would trip `set -e` right here and the error message below
    # would never be printed.
    cleanup_step_log "${step_status}" || true
    if [ "${step_status}" -ne 0 ]; then
        echo "ERROR: step failed: ${step_name} (see ${step_log})" >&2
        exit "${step_status}"
    fi

    echo "=== step OK: ${step_name} ==="
}
|
||||
|
||||
# run_step_sh NAME SLUG SCRIPT — run SCRIPT through `sh -c` as a mandatory
# build step; logging and failure handling are those of run_step.
run_step_sh() {
    step_name="$1"
    step_slug="$2"
    step_script="$3"

    # Rebuild the argument list in the shape run_step expects.
    set -- "${step_name}" "${step_slug}" sh -c "${step_script}"
    run_step "$@"
}
|
||||
|
||||
# run_optional_step_sh NAME SLUG SCRIPT — run SCRIPT through `sh -c` as a
# best-effort step.
#
# With BEE_REQUIRE_MEMTEST=1 the step is delegated to run_step_sh and is
# therefore mandatory. Otherwise the output is captured in $LOG_DIR/SLUG.log,
# replayed to stdout, and a failure only produces a WARNING on stderr.
run_optional_step_sh() {
    step_name="$1"
    step_slug="$2"
    step_script="$3"

    case "${BEE_REQUIRE_MEMTEST:-0}" in
        1)
            run_step_sh "${step_name}" "${step_slug}" "${step_script}"
            return 0
            ;;
    esac

    step_log="${LOG_DIR}/${step_slug}.log"
    echo ""
    echo "=== optional step: ${step_name} ==="
    echo "=== optional step log: ${step_log} ==="

    # errexit is suspended so a failing script is tolerated and reported.
    set +e
    sh -c "${step_script}" > "${step_log}" 2>&1
    step_status=$?
    set -e

    cat "${step_log}"
    if [ "${step_status}" -eq 0 ]; then
        echo "=== optional step OK: ${step_name} ==="
    else
        echo "WARNING: optional step failed: ${step_name} (see ${step_log})" >&2
    fi
}
|
||||
|
||||
start_build_log
|
||||
|
||||
# Auto-detect kernel ABI: refresh apt index, then query current linux-image-amd64 dependency.
|
||||
# If headers for the detected ABI are not yet installed (kernel updated since image build),
|
||||
@@ -147,8 +769,8 @@ echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERS
|
||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
||||
echo ""
|
||||
|
||||
echo "=== syncing git submodules ==="
|
||||
git -C "${REPO_ROOT}" submodule update --init --recursive
|
||||
run_step "sync git submodules" "05-git-submodules" \
|
||||
git -C "${REPO_ROOT}" submodule update --init --recursive
|
||||
|
||||
# --- compile bee binary (static, Linux amd64) ---
|
||||
# Shared between variants — built once, reused on second pass.
|
||||
@@ -160,13 +782,13 @@ if [ -f "$BEE_BIN" ]; then
|
||||
fi
|
||||
|
||||
if [ "$NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee binary ==="
|
||||
cd "${REPO_ROOT}/audit"
|
||||
GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
||||
go build \
|
||||
-ldflags "-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}" \
|
||||
-o "$BEE_BIN" \
|
||||
./cmd/bee
|
||||
run_step_sh "build bee binary" "10-build-bee" \
|
||||
"cd '${REPO_ROOT}/audit' && \
|
||||
env GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
||||
go build \
|
||||
-ldflags '-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}' \
|
||||
-o '${BEE_BIN}' \
|
||||
./cmd/bee"
|
||||
echo "binary: $BEE_BIN"
|
||||
if command -v stat >/dev/null 2>&1; then
|
||||
BEE_SIZE_BYTES="$(stat -c '%s' "$BEE_BIN" 2>/dev/null || stat -f '%z' "$BEE_BIN")"
|
||||
@@ -183,11 +805,10 @@ else
|
||||
fi
|
||||
|
||||
# --- NVIDIA-only build steps ---
|
||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== downloading cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace ==="
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
run_step "download cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace" "20-cublas" \
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
"${CUBLAS_VERSION}" \
|
||||
"${CUDA_USERSPACE_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
@@ -196,20 +817,20 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
GPU_STRESS_NEED_BUILD=1
|
||||
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
|
||||
if [ -f "$GPU_BURN_WORKER_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_BURN_WORKER_BIN" ]; then
|
||||
GPU_STRESS_NEED_BUILD=0
|
||||
fi
|
||||
|
||||
if [ "$GPU_STRESS_NEED_BUILD" = "1" ]; then
|
||||
echo "=== building bee-gpu-stress ==="
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
run_step "build bee-gpu-burn worker" "21-gpu-burn-worker" \
|
||||
gcc -O2 -s -Wall -Wextra \
|
||||
-I"${CUBLAS_CACHE}/include" \
|
||||
-o "$GPU_STRESS_BIN" \
|
||||
-o "$GPU_BURN_WORKER_BIN" \
|
||||
"${BUILDER_DIR}/bee-gpu-stress.c" \
|
||||
-ldl -lm
|
||||
echo "binary: $GPU_STRESS_BIN"
|
||||
echo "binary: $GPU_BURN_WORKER_BIN"
|
||||
else
|
||||
echo "=== bee-gpu-stress up to date, skipping build ==="
|
||||
echo "=== bee-gpu-burn worker up to date, skipping build ==="
|
||||
fi
|
||||
fi
|
||||
|
||||
@@ -245,9 +866,12 @@ rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
rm -rf \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||
|
||||
# Remove NVIDIA-specific overlay files for non-nvidia variants
|
||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
|
||||
@@ -293,9 +917,13 @@ mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_STRESS_BIN" ]; then
|
||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee" "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${GPU_BURN_WORKER_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||
@@ -315,9 +943,8 @@ done
|
||||
|
||||
# --- NVIDIA kernel modules and userspace libs ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo ""
|
||||
echo "=== building NVIDIA ${NVIDIA_DRIVER_VERSION} modules ==="
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}"
|
||||
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
@@ -334,6 +961,8 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
cp "${NVIDIA_CACHE}/bin/nvidia-bug-report.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/" 2>/dev/null || true
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/nvidia-bug-report.sh" 2>/dev/null || true
|
||||
cp "${NVIDIA_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors"
|
||||
printf 'libnvidia-opencl.so.1\n' > "${OVERLAY_STAGE_DIR}/etc/OpenCL/vendors/nvidia.icd"
|
||||
|
||||
# Inject GSP firmware into /lib/firmware/nvidia/<version>/
|
||||
if [ -d "${NVIDIA_CACHE}/firmware" ] && [ "$(ls -A "${NVIDIA_CACHE}/firmware" 2>/dev/null)" ]; then
|
||||
@@ -343,9 +972,8 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
fi
|
||||
|
||||
# --- build / download NCCL ---
|
||||
echo ""
|
||||
echo "=== downloading NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||
run_step "download NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" "50-nccl" \
|
||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||
|
||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
@@ -353,14 +981,13 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== NCCL: $(ls "${NCCL_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by bee-gpu-stress tensor-core GEMM path
|
||||
# Inject cuBLAS/cuBLASLt/cudart runtime libs used by the bee-gpu-burn worker tensor-core GEMM path
|
||||
cp "${CUBLAS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
echo "=== cuBLAS: $(ls "${CUBLAS_CACHE}/lib/" | wc -l) files injected into /usr/lib/ ==="
|
||||
|
||||
# --- build nccl-tests ---
|
||||
echo ""
|
||||
echo "=== building nccl-tests ${NCCL_TESTS_VERSION} ==="
|
||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||
run_step "build nccl-tests ${NCCL_TESTS_VERSION}" "60-nccl-tests" \
|
||||
sh "${BUILDER_DIR}/build-nccl-tests.sh" \
|
||||
"${NCCL_TESTS_VERSION}" \
|
||||
"${NCCL_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
@@ -371,7 +998,17 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
echo "=== all_reduce_perf injected ==="
|
||||
|
||||
run_step "build john jumbo ${JOHN_JUMBO_COMMIT}" "70-john" \
|
||||
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
||||
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
|
||||
ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/john"
|
||||
echo "=== john injected ==="
|
||||
fi
|
||||
|
||||
# --- embed build metadata ---
|
||||
@@ -385,7 +1022,8 @@ NCCL_VERSION=${NCCL_VERSION}
|
||||
NCCL_CUDA_VERSION=${NCCL_CUDA_VERSION}
|
||||
CUBLAS_VERSION=${CUBLAS_VERSION}
|
||||
CUDA_USERSPACE_VERSION=${CUDA_USERSPACE_VERSION}
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}"
|
||||
NCCL_TESTS_VERSION=${NCCL_TESTS_VERSION}
|
||||
JOHN_JUMBO_COMMIT=${JOHN_JUMBO_COMMIT}"
|
||||
GPU_BUILD_INFO="nvidia:${NVIDIA_DRIVER_VERSION}"
|
||||
elif [ "$BEE_GPU_VENDOR" = "amd" ]; then
|
||||
GPU_VERSION_LINE="ROCM_VERSION=${ROCM_VERSION}"
|
||||
@@ -485,9 +1123,10 @@ BEE_GPU_VENDOR_UPPER="$(echo "${BEE_GPU_VENDOR}" | tr 'a-z' 'A-Z')"
|
||||
export BEE_GPU_VENDOR_UPPER
|
||||
|
||||
cd "${LB_DIR}"
|
||||
lb clean 2>&1 | tail -3
|
||||
lb config 2>&1 | tail -5
|
||||
lb build 2>&1
|
||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean 2>&1 | tail -3"
|
||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||
|
||||
# --- persist deb package cache back to shared location ---
|
||||
# This allows the second variant to reuse all downloaded packages.
|
||||
@@ -498,8 +1137,20 @@ fi
|
||||
|
||||
# live-build outputs live-image-amd64.hybrid.iso in LB_DIR
|
||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||
ISO_OUT="${DIST_DIR}/easy-bee-${BEE_GPU_VENDOR}-v${ISO_VERSION_EFFECTIVE}-amd64.iso"
|
||||
if [ -f "$ISO_RAW" ]; then
|
||||
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
|
||||
if iso_memtest_present "$ISO_RAW"; then
|
||||
:
|
||||
else
|
||||
memtest_status=$?
|
||||
if [ "$memtest_status" -eq 1 ]; then
|
||||
recover_iso_memtest "${LB_DIR}" "$ISO_RAW"
|
||||
dump_memtest_debug "post-recovery" "${LB_DIR}" "$ISO_RAW"
|
||||
elif [ "$memtest_status" -eq 2 ]; then
|
||||
memtest_fail "failed to inspect ISO for memtest before recovery" "$ISO_RAW"
|
||||
fi
|
||||
fi
|
||||
validate_iso_memtest "$ISO_RAW"
|
||||
cp "$ISO_RAW" "$ISO_OUT"
|
||||
echo ""
|
||||
echo "=== done (${BEE_GPU_VENDOR}) ==="
|
||||
|
||||
@@ -14,6 +14,11 @@ menuentry "EASY-BEE" {
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (graphics/KMS)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (load to RAM)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
@@ -24,6 +29,11 @@ menuentry "EASY-BEE (NVIDIA GSP=off)" {
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (graphics/KMS, GSP=off)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE (fail-safe)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
|
||||
@@ -5,6 +5,12 @@ label live-@FLAVOUR@-normal
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||
|
||||
label live-@FLAVOUR@-kms
|
||||
menu label EASY-BEE (^graphics/KMS)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
|
||||
|
||||
label live-@FLAVOUR@-toram
|
||||
menu label EASY-BEE (^load to RAM)
|
||||
linux @LINUX@
|
||||
@@ -17,8 +23,18 @@ label live-@FLAVOUR@-gsp-off
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
|
||||
|
||||
label live-@FLAVOUR@-kms-gsp-off
|
||||
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off
|
||||
|
||||
label live-@FLAVOUR@-failsafe
|
||||
menu label EASY-BEE (^fail-safe)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=gsp-off memtest noapic noapm nodma nomce nolapic nosmp vga=normal
|
||||
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
linux /boot/memtest86+x64.bin
|
||||
|
||||
@@ -60,6 +60,9 @@ chmod +x /usr/local/bin/bee 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-john-gpu-stress 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-nccl-gpu-stress 2>/dev/null || true
|
||||
fi
|
||||
|
||||
# Reload udev rules
|
||||
|
||||
@@ -1,76 +1,139 @@
|
||||
#!/bin/sh
|
||||
# Copy memtest86+ binaries from chroot /boot into the ISO boot directory
|
||||
# so GRUB can chainload them directly (they must be on the ISO filesystem,
|
||||
# not inside the squashfs).
|
||||
#
|
||||
# Primary: copy from chroot/boot/ (populated by package postinst).
|
||||
# Naming fallbacks:
|
||||
# Debian Bookworm: /boot/memtest86+ — EFI PE64 (no extension)
|
||||
# /boot/memtest86+.bin — legacy binary
|
||||
# Upstream/Ubuntu: /boot/memtest86+x64.efi, /boot/memtest86+x64.bin, etc.
|
||||
# Last resort: extract directly from the cached .deb if postinst didn't place
|
||||
# the files (happens in chroot environments without grub triggers).
|
||||
# Ensure memtest is present in the final ISO even if live-build's built-in
|
||||
# memtest stage does not copy the binaries or expose menu entries.
|
||||
set -e
|
||||
|
||||
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi memtest86+ia32.bin memtest86+ia32.efi"
|
||||
: "${BEE_REQUIRE_MEMTEST:=0}"
|
||||
|
||||
# Ensure destination directory exists (absence caused silent copy failures).
|
||||
mkdir -p binary/boot
|
||||
MEMTEST_FILES="memtest86+x64.bin memtest86+x64.efi"
|
||||
BINARY_BOOT_DIR="binary/boot"
|
||||
GRUB_CFG="binary/boot/grub/grub.cfg"
|
||||
ISOLINUX_CFG="binary/isolinux/live.cfg"
|
||||
|
||||
echo "memtest: scanning chroot/boot/ for memtest files:"
|
||||
ls chroot/boot/memtest* 2>/dev/null || echo "memtest: WARNING: no memtest files in chroot/boot/"
|
||||
# Print the given arguments on stdout as one line prefixed with "memtest hook: ".
log() {
|
||||
echo "memtest hook: $*"
|
||||
}
|
||||
|
||||
# Primary path: copy upstream-named files from chroot/boot/
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
src="chroot/boot/${f}"
|
||||
if [ -f "${src}" ]; then
|
||||
cp "${src}" "binary/boot/${f}"
|
||||
echo "memtest: copied ${f} from chroot/boot/"
|
||||
fail_or_warn() {
|
||||
msg="$1"
|
||||
if [ "${BEE_REQUIRE_MEMTEST}" = "1" ]; then
|
||||
log "ERROR: ${msg}"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
log "WARNING: ${msg}"
|
||||
return 0
|
||||
}
|
||||
|
||||
# Debian Bookworm naming fallback: /boot/memtest86+ (no extension) is the EFI binary.
|
||||
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "chroot/boot/memtest86+" ]; then
|
||||
cp "chroot/boot/memtest86+" "binary/boot/memtest86+x64.efi"
|
||||
echo "memtest: copied /boot/memtest86+ as memtest86+x64.efi (Debian naming)"
|
||||
fi
|
||||
if [ ! -f "binary/boot/memtest86+x64.bin" ] && [ -f "chroot/boot/memtest86+.bin" ]; then
|
||||
cp "chroot/boot/memtest86+.bin" "binary/boot/memtest86+x64.bin"
|
||||
echo "memtest: copied /boot/memtest86+.bin as memtest86+x64.bin (Debian naming)"
|
||||
fi
|
||||
copy_memtest_file() {
|
||||
src="$1"
|
||||
base="$(basename "$src")"
|
||||
dst="${BINARY_BOOT_DIR}/${base}"
|
||||
|
||||
# Last resort: if EFI binary still missing, extract from cached .deb
|
||||
if [ ! -f "binary/boot/memtest86+x64.efi" ]; then
|
||||
echo "memtest: EFI binary missing — attempting extraction from .deb cache"
|
||||
deb=$(find chroot/var/cache/apt/archives/ chroot/var/lib/apt/lists/ \
|
||||
-name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null \
|
||||
| head -1)
|
||||
if [ -z "$deb" ]; then
|
||||
deb=$(find cache/ -name 'memtest86+_*.deb' -o -name 'memtest86+*.deb' 2>/dev/null | head -1)
|
||||
fi
|
||||
if [ -n "$deb" ]; then
|
||||
echo "memtest: extracting from ${deb}"
|
||||
EXTRACT_DIR="$(mktemp -d)"
|
||||
dpkg-deb -x "${deb}" "${EXTRACT_DIR}"
|
||||
echo "memtest: files found in .deb:"
|
||||
find "${EXTRACT_DIR}/boot" -type f 2>/dev/null || echo " (none in /boot)"
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
src="${EXTRACT_DIR}/boot/${f}"
|
||||
if [ -f "${src}" ]; then
|
||||
cp "${src}" "binary/boot/${f}"
|
||||
echo "memtest: extracted ${f} from .deb"
|
||||
fi
|
||||
done
|
||||
# Debian naming fallback inside .deb as well
|
||||
if [ ! -f "binary/boot/memtest86+x64.efi" ] && [ -f "${EXTRACT_DIR}/boot/memtest86+" ]; then
|
||||
cp "${EXTRACT_DIR}/boot/memtest86+" "binary/boot/memtest86+x64.efi"
|
||||
echo "memtest: extracted /boot/memtest86+ as memtest86+x64.efi from .deb"
|
||||
[ -f "$src" ] || return 1
|
||||
mkdir -p "${BINARY_BOOT_DIR}"
|
||||
cp "$src" "$dst"
|
||||
log "copied ${base} from ${src}"
|
||||
}
|
||||
|
||||
extract_memtest_from_deb() {
|
||||
deb="$1"
|
||||
tmpdir="$(mktemp -d)"
|
||||
|
||||
log "extracting memtest payload from ${deb}"
|
||||
dpkg-deb -x "$deb" "$tmpdir"
|
||||
for f in ${MEMTEST_FILES}; do
|
||||
if [ -f "${tmpdir}/boot/${f}" ]; then
|
||||
copy_memtest_file "${tmpdir}/boot/${f}"
|
||||
fi
|
||||
rm -rf "${EXTRACT_DIR}"
|
||||
else
|
||||
echo "memtest: WARNING: no memtest86+ .deb found in cache — memtest will not be available"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
rm -rf "$tmpdir"
|
||||
}
|
||||
|
||||
echo "memtest: binary/boot/ contents:"
|
||||
ls binary/boot/memtest* 2>/dev/null || echo " (none)"
|
||||
# Return 0 when at least one file named in MEMTEST_FILES is absent from
# BINARY_BOOT_DIR, 1 when all of them are present.
memtest_binaries_missing() {
    for f in ${MEMTEST_FILES}; do
        [ -f "${BINARY_BOOT_DIR}/${f}" ] || return 0
    done
    return 1
}

# Make sure every file in MEMTEST_FILES exists under BINARY_BOOT_DIR.
#
# Sources are tried in order and the search stops as soon as nothing is missing:
#   1. chroot/boot and /boot (plain copy via copy_memtest_file)
#   2. the first memtest86+ .deb found in cache, chroot/var/cache/apt/archives
#      or /var/cache/apt/archives (unpacked via extract_memtest_from_deb)
# Anything still missing afterwards is reported through fail_or_warn, which
# decides whether that is fatal. The function itself always returns 0.
ensure_memtest_binaries() {
    memtest_binaries_missing || return 0

    for root in chroot/boot /boot; do
        for f in ${MEMTEST_FILES}; do
            # A source that does not carry this file is not an error here.
            [ -f "${BINARY_BOOT_DIR}/${f}" ] || copy_memtest_file "${root}/${f}" || true
        done
    done
    memtest_binaries_missing || return 0

    for root in cache chroot/var/cache/apt/archives /var/cache/apt/archives; do
        [ -d "$root" ] || continue
        # sort makes the pick deterministic when several package versions are cached.
        deb="$(find "$root" -type f -name 'memtest86+*.deb' 2>/dev/null | sort | head -n 1)"
        [ -n "$deb" ] || continue
        # A broken package must not abort the hook under `set -e`; whatever is
        # still missing is reported by the final check below.
        extract_memtest_from_deb "$deb" || true
        break
    done

    for f in ${MEMTEST_FILES}; do
        [ -f "${BINARY_BOOT_DIR}/${f}" ] || fail_or_warn "missing ${BINARY_BOOT_DIR}/${f}"
    done
    return 0
}
|
||||
|
||||
# Append a "Memory Test (memtest86+)" menu block to GRUB_CFG.
# The block is wrapped in "### BEE MEMTEST ###" markers; if the opening marker
# is already in the file nothing is written, so repeated runs do not duplicate it.
# A missing GRUB_CFG is reported through fail_or_warn and the function returns 0.
ensure_grub_entry() {
|
||||
# No config file to patch: report it and stop.
[ -f "$GRUB_CFG" ] || {
|
||||
fail_or_warn "missing ${GRUB_CFG}"
|
||||
return 0
|
||||
}
|
||||
|
||||
# Marker already present: the entry was appended earlier.
grep -q '### BEE MEMTEST ###' "$GRUB_CFG" && return 0
|
||||
|
||||
# Quoted heredoc delimiter: ${grub_platform} is written literally for GRUB to evaluate.
cat >> "$GRUB_CFG" <<'EOF'
|
||||
|
||||
### BEE MEMTEST ###
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
}
|
||||
else
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
linux16 /boot/memtest86+x64.bin
|
||||
}
|
||||
fi
|
||||
### /BEE MEMTEST ###
|
||||
EOF
|
||||
|
||||
log "appended memtest entry to ${GRUB_CFG}"
|
||||
}
|
||||
|
||||
# Append a "memtest" label to ISOLINUX_CFG that boots /boot/memtest86+x64.bin.
# The block is wrapped in "### BEE MEMTEST ###" marker comments; if the marker
# is already in the file nothing is written.
# A missing ISOLINUX_CFG is reported through fail_or_warn and the function returns 0.
ensure_isolinux_entry() {
|
||||
# No config file to patch: report it and stop.
[ -f "$ISOLINUX_CFG" ] || {
|
||||
fail_or_warn "missing ${ISOLINUX_CFG}"
|
||||
return 0
|
||||
}
|
||||
|
||||
# Marker already present: the entry was appended earlier.
grep -q '### BEE MEMTEST ###' "$ISOLINUX_CFG" && return 0
|
||||
|
||||
cat >> "$ISOLINUX_CFG" <<'EOF'
|
||||
|
||||
# ### BEE MEMTEST ###
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
linux /boot/memtest86+x64.bin
|
||||
# ### /BEE MEMTEST ###
|
||||
EOF
|
||||
|
||||
log "appended memtest entry to ${ISOLINUX_CFG}"
|
||||
}
|
||||
|
||||
log "ensuring memtest binaries and menu entries in binary image"
|
||||
ensure_memtest_binaries
|
||||
ensure_grub_entry
|
||||
ensure_isolinux_entry
|
||||
log "memtest assets ready"
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# AMD GPU firmware
|
||||
firmware-amd-graphics
|
||||
|
||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||
|
||||
@@ -1,2 +1,8 @@
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing
|
||||
datacenter-gpu-manager=1:%%DCGM_VERSION%%
|
||||
# NVIDIA DCGM (Data Center GPU Manager) — dcgmi diag for acceptance testing.
|
||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with CUDA 13 userspace,
|
||||
# so install the CUDA 13 build plus proprietary diagnostic components explicitly.
|
||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary-cuda13=1:%%DCGM_VERSION%%
|
||||
ocl-icd-libopencl1
|
||||
clinfo
|
||||
|
||||
@@ -21,14 +21,15 @@ openssh-server
|
||||
# Disk installer
|
||||
squashfs-tools
|
||||
parted
|
||||
# grub-pc / grub-efi-amd64 provide grub-install + grub2-common (required for chroot install).
|
||||
# The -bin variants only carry binary modules and do NOT include grub-install itself.
|
||||
grub-pc
|
||||
# Keep GRUB install tools without selecting a single active platform package.
|
||||
# grub-pc and grub-efi-amd64 conflict with each other, but grub2-common
|
||||
# provides grub-install/update-grub and the *-bin packages provide BIOS/UEFI modules.
|
||||
grub2-common
|
||||
grub-pc-bin
|
||||
grub-efi-amd64
|
||||
grub-efi-amd64-bin
|
||||
grub-efi-amd64-signed
|
||||
shim-signed
|
||||
efibootmgr
|
||||
|
||||
# Filesystem support for USB export targets
|
||||
exfatprogs
|
||||
@@ -50,7 +51,6 @@ sudo
|
||||
zstd
|
||||
mstflint
|
||||
memtester
|
||||
memtest86+
|
||||
stress-ng
|
||||
stressapptest
|
||||
|
||||
@@ -71,9 +71,7 @@ lightdm
|
||||
firmware-linux-free
|
||||
firmware-linux-nonfree
|
||||
firmware-misc-nonfree
|
||||
firmware-amd-graphics
|
||||
firmware-realtek
|
||||
firmware-intel-sound
|
||||
firmware-bnx2
|
||||
firmware-bnx2x
|
||||
firmware-cavium
|
||||
|
||||
@@ -52,6 +52,14 @@ else
|
||||
fail "nvidia-smi: NOT FOUND"
|
||||
fi
|
||||
|
||||
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
|
||||
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||
ok "$tool found: $p"
|
||||
else
|
||||
fail "$tool: NOT FOUND"
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "-- NVIDIA modules --"
|
||||
KO_DIR="/usr/local/lib/nvidia"
|
||||
@@ -109,6 +117,40 @@ else
|
||||
fail "nvidia-smi: not found in PATH"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "-- OpenCL / John --"
|
||||
if [ -f /etc/OpenCL/vendors/nvidia.icd ]; then
|
||||
ok "OpenCL ICD present: /etc/OpenCL/vendors/nvidia.icd"
|
||||
else
|
||||
fail "OpenCL ICD missing: /etc/OpenCL/vendors/nvidia.icd"
|
||||
fi
|
||||
|
||||
if ldconfig -p 2>/dev/null | grep -q "libnvidia-opencl.so.1"; then
|
||||
ok "libnvidia-opencl.so.1 present in linker cache"
|
||||
else
|
||||
fail "libnvidia-opencl.so.1 missing from linker cache"
|
||||
fi
|
||||
|
||||
if command -v clinfo >/dev/null 2>&1; then
|
||||
if clinfo -l 2>/dev/null | grep -q "Platform"; then
|
||||
ok "clinfo: OpenCL platform detected"
|
||||
else
|
||||
fail "clinfo: no OpenCL platform detected"
|
||||
fi
|
||||
else
|
||||
fail "clinfo: not found in PATH"
|
||||
fi
|
||||
|
||||
if command -v john >/dev/null 2>&1; then
|
||||
if john --list=opencl-devices 2>/dev/null | grep -q "Device #"; then
|
||||
ok "john: OpenCL devices detected"
|
||||
else
|
||||
fail "john: no OpenCL devices detected"
|
||||
fi
|
||||
else
|
||||
fail "john: not found in PATH"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "-- lib symlinks --"
|
||||
for lib in libnvidia-ml libcuda; do
|
||||
|
||||
@@ -1,23 +1,12 @@
|
||||
[Unit]
|
||||
Description=Bee: schedule startup hardware audit via task queue
|
||||
# Start AFTER bee-web, not before — bee-web must not wait for audit.
|
||||
After=bee-web.service
|
||||
Wants=bee-web.service
|
||||
Description=Bee: hardware audit
|
||||
After=bee-preflight.service bee-network.service bee-nvidia.service
|
||||
Before=bee-web.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
RemainAfterExit=yes
|
||||
# Wait up to 90s for bee-web to respond on /healthz, then sleep 60s for
|
||||
# the system to settle (GPU drivers, sensors), then enqueue the audit as
|
||||
# a background task so it appears in the task list and logs.
|
||||
ExecStart=/bin/sh -c '\
|
||||
i=0; \
|
||||
while [ $i -lt 90 ]; do \
|
||||
if curl -sf http://localhost/healthz >/dev/null 2>&1; then break; fi; \
|
||||
sleep 1; i=$((i+1)); \
|
||||
done; \
|
||||
sleep 60; \
|
||||
curl -sf -X POST http://localhost/api/audit/run >/dev/null'
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-audit.log /usr/local/bin/bee audit --runtime auto --output file:/appdata/bee/export/bee-audit.json
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
[Unit]
|
||||
Description=Bee: hardware audit web viewer
|
||||
After=bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
[Unit]
|
||||
Wants=bee-preflight.service
|
||||
After=bee-preflight.service
|
||||
|
||||
[Service]
|
||||
ExecStartPre=/usr/local/bin/bee-display-mode
|
||||
54
iso/overlay/usr/local/bin/bee-display-mode
Executable file
54
iso/overlay/usr/local/bin/bee-display-mode
Executable file
@@ -0,0 +1,54 @@
|
||||
#!/bin/sh
|
||||
# Select Xorg display mode based on kernel cmdline.
|
||||
# Default is the current server-safe path: keep forced fbdev.
|
||||
set -eu
|
||||
|
||||
# Print the value of the first "<key>=<value>" token on the kernel command line.
#
#   $1  parameter name, e.g. "bee.display"
#
# Prints everything after the first "=" of the matching token and returns 0.
# Returns 1 without output when no token matches or /proc/cmdline is unreadable.
cmdline_param() {
    key="$1"
    found=1
    value=""

    # The command line is split on whitespace by the unquoted expansion below.
    # Pathname expansion must be off while that happens, otherwise a token
    # containing glob characters could be replaced by matching file names from
    # the current directory. Remember the caller's setting and restore it.
    case "$-" in
        *f*) glob_was_off=1 ;;
        *)   glob_was_off=0; set -f ;;
    esac

    for token in $(cat /proc/cmdline 2>/dev/null); do
        case "$token" in
            "$key"=*)
                value="${token#*=}"
                found=0
                break
                ;;
        esac
    done

    [ "$glob_was_off" -eq 1 ] || set +f

    [ "$found" -eq 0 ] || return 1
    echo "$value"
}
|
||||
|
||||
# Print the given arguments on stdout as one line prefixed with "bee-display-mode: ".
log() {
|
||||
echo "bee-display-mode: $*"
|
||||
}
|
||||
|
||||
# Requested mode comes from bee.display= on the kernel command line;
# an absent or empty value means "safe".
mode="$(cmdline_param bee.display || true)"
: "${mode:=safe}"

xorg_dir="/etc/X11/xorg.conf.d"
fbdev_conf="${xorg_dir}/10-fbdev.conf"
fbdev_park="${xorg_dir}/10-fbdev.conf.disabled"

mkdir -p "$xorg_dir"

if [ "$mode" = "kms" ] || [ "$mode" = "auto" ]; then
    # Move the forced-fbdev snippet aside under its ".disabled" name.
    if [ -f "$fbdev_conf" ]; then
        mv "$fbdev_conf" "$fbdev_park"
        log "mode=${mode}; disabled forced fbdev config"
    else
        log "mode=${mode}; fbdev config already disabled"
    fi
elif [ "$mode" = "safe" ] || [ "$mode" = "fbdev" ]; then
    # Put a previously parked snippet back, but never overwrite an active one.
    if [ -f "$fbdev_park" ] && [ ! -f "$fbdev_conf" ]; then
        mv "$fbdev_park" "$fbdev_conf"
        log "mode=${mode}; restored forced fbdev config"
    else
        log "mode=${mode}; keeping forced fbdev config"
    fi
else
    # Unrecognised value: leave the configuration untouched.
    log "unknown bee.display=${mode}; keeping forced fbdev config"
fi
|
||||
102
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file
102
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file
@@ -0,0 +1,102 @@
|
||||
#!/bin/sh
# bee-gpu-burn: start one bee-gpu-burn worker per selected NVIDIA GPU, wait for
# all of them, and print a per-GPU verdict followed by each worker's log.
#
# Options:
#   --seconds N | -t N   duration passed to every worker (default 5)
#   --size-mb N | -m N   size passed to every worker; a value <= 0 (default 0)
#                        means "95% of the GPU's total memory as reported by
#                        nvidia-smi", or 512 when that query yields nothing
#   --devices LIST       comma-separated GPU indices to run on (default: all)
#   --exclude LIST       comma-separated GPU indices to leave out
#
# Exit status: 0 when every worker succeeded; 1 when a worker failed, the
# worker binary is missing, or no GPU is left after filtering; 2 on usage
# errors; 130/143 when interrupted by INT/TERM.
set -eu

# Deliberately not called SECONDS: bash and ksh treat that name as a timer that
# keeps counting up, which would inflate the duration if this script were ever
# run by one of them instead of a plain POSIX sh.
DURATION_SEC=5
SIZE_MB=0
DEVICES=""
EXCLUDE=""
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
TMP_DIR=""
WORKER_PIDS=""

# Print the synopsis on stderr and exit with status 2.
usage() {
    echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
    exit 2
}

# Return 0 when $1 is a non-empty string of decimal digits.
is_uint() {
    case "${1:-}" in
        ''|*[!0-9]*) return 1 ;;
    esac
    return 0
}

# Return 0 when $1 is a decimal integer with an optional leading minus sign.
is_int() {
    is_uint "${1#-}"
}

# Turn a comma-separated list into a whitespace-free, numerically sorted,
# de-duplicated comma-separated list. Empty input yields empty output.
normalize_list() {
    echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
}

# Return 0 when $1 is an element of the comma-separated list $2.
# The comparison is literal; nothing in the needle is treated as a pattern.
contains_csv() {
    case ",${2:-}," in
        *",${1},"*) return 0 ;;
    esac
    return 1
}

# Stop any worker that is still running and remove the scratch directory.
cleanup() {
    for worker_pid in ${WORKER_PIDS}; do
        kill "${worker_pid}" 2>/dev/null || true
    done
    [ -z "${TMP_DIR}" ] || rm -rf "${TMP_DIR}"
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
        --size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
        --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
done

# Reject garbage early instead of handing it to every worker.
is_uint "${DURATION_SEC}" || { echo "--seconds expects a non-negative integer: ${DURATION_SEC}" >&2; usage; }
is_int "${SIZE_MB}" || { echo "--size-mb expects an integer: ${SIZE_MB}" >&2; usage; }

[ -x "${WORKER}" ] || { echo "bee-gpu-burn worker not found: ${WORKER}" >&2; exit 1; }

ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
fi

# Apply the exclusion list to the selection.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

echo "loader=bee-gpu-burn"
echo "selected_gpus=${FINAL}"

# Install the handlers before anything that needs cleaning up exists.
# INT/TERM exit explicitly so the EXIT handler stops the workers; without the
# exit the script would resume after the handler and leave them running.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

TMP_DIR=$(mktemp -d)

WORKERS=""
for id in $(echo "${FINAL}" | tr ',' ' '); do
    log_file="${TMP_DIR}/gpu-${id}.log"
    gpu_size_mb="${SIZE_MB}"
    if [ "${gpu_size_mb}" -le 0 ]; then
        total_mb=$(nvidia-smi --id="${id}" --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | tr -d '[:space:]')
        if is_uint "${total_mb}" && [ "${total_mb}" -gt 0 ]; then
            gpu_size_mb=$(( total_mb * 95 / 100 ))
        else
            gpu_size_mb=512
        fi
    fi
    echo "starting gpu ${id} size=${gpu_size_mb}MB"
    "${WORKER}" --device "${id}" --seconds "${DURATION_SEC}" --size-mb "${gpu_size_mb}" >"${log_file}" 2>&1 &
    pid=$!
    WORKER_PIDS="${WORKER_PIDS} ${pid}"
    WORKERS="${WORKERS} ${pid}:${id}:${log_file}"
done

status=0
for spec in ${WORKERS}; do
    # spec is "<pid>:<gpu id>:<log path>"
    pid=${spec%%:*}
    rest=${spec#*:}
    id=${rest%%:*}
    log_file=${rest#*:}
    if wait "${pid}"; then
        echo "gpu ${id} finished: OK"
    else
        rc=$?
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi
    sed "s/^/[gpu ${id}] /" "${log_file}" || true
done

# Every worker has been reaped; nothing left for cleanup to signal.
WORKER_PIDS=""

exit "${status}"
|
||||
@@ -12,17 +12,55 @@
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
usage() {
|
||||
cat >&2 <<'EOF'
|
||||
Usage: bee-install <device> [logfile]
|
||||
|
||||
Installs the live system to a local disk (WIPES the target).
|
||||
|
||||
device Target block device, e.g. /dev/sda or /dev/nvme0n1
|
||||
Must be a hard disk or NVMe — NOT a CD-ROM (/dev/sr*)
|
||||
logfile Optional path for progress log (default: /tmp/bee-install.log)
|
||||
|
||||
Examples:
|
||||
bee-install /dev/sda
|
||||
bee-install /dev/nvme0n1
|
||||
bee-install /dev/sdb /tmp/my-install.log
|
||||
|
||||
WARNING: ALL DATA ON <device> WILL BE ERASED.
|
||||
|
||||
Layout (UEFI): GPT — partition 1: EFI 512MB vfat, partition 2: root ext4
|
||||
Layout (BIOS): MBR — partition 1: root ext4
|
||||
EOF
|
||||
exit 1
|
||||
}
|
||||
|
||||
DEVICE="${1:-}"
|
||||
LOGFILE="${2:-/tmp/bee-install.log}"
|
||||
|
||||
if [ -z "$DEVICE" ]; then
|
||||
echo "Usage: bee-install <device> [logfile]" >&2
|
||||
exit 1
|
||||
if [ -z "$DEVICE" ] || [ "$DEVICE" = "--help" ] || [ "$DEVICE" = "-h" ]; then
|
||||
usage
|
||||
fi
|
||||
if [ ! -b "$DEVICE" ]; then
|
||||
echo "ERROR: $DEVICE is not a block device" >&2
|
||||
echo "Run 'lsblk' to list available disks." >&2
|
||||
exit 1
|
||||
fi
|
||||
# Block CD-ROM devices
|
||||
case "$DEVICE" in
|
||||
/dev/sr*|/dev/scd*)
|
||||
echo "ERROR: $DEVICE is a CD-ROM/optical device — cannot install to it." >&2
|
||||
echo "Run 'lsblk' to find the target disk (e.g. /dev/sda, /dev/nvme0n1)." >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
# Check required tools
|
||||
for tool in parted mkfs.vfat mkfs.ext4 unsquashfs grub-install update-grub; do
|
||||
if ! command -v "$tool" >/dev/null 2>&1; then
|
||||
echo "ERROR: required tool not found: $tool" >&2
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
||||
if [ ! -f "$SQUASHFS" ]; then
|
||||
|
||||
205
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file
205
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file
@@ -0,0 +1,205 @@
|
||||
#!/bin/sh
|
||||
set -eu
|
||||
|
||||
# Default settings. The usage text lists --seconds, --devices, --exclude and
# --format, so these are presumably overridden by option parsing further down
# (not visible here — confirm).
# NOTE(review): SECONDS is a special self-incrementing variable in bash/ksh;
# this is only safe while /bin/sh is a shell without that feature — confirm.
SECONDS=300
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
FORMAT=""
|
||||
# Location of the bundled john run directory and binary.
JOHN_DIR="/usr/local/lib/bee/john/run"
|
||||
JOHN_BIN="${JOHN_DIR}/john"
|
||||
# Exported for child processes: the OpenCL ICD vendor directory, and a library
# search path that puts /usr/lib and /usr/local/lib ahead of any inherited value.
export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
||||
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||
|
||||
# Write the command-line synopsis to stderr and terminate with exit status 2.
usage() {
    >&2 echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]"
    exit 2
}
|
||||
|
||||
# Canonicalise a comma-separated list of GPU indices.
#   $1  raw list such as " 3,1, 1,2" (may be empty or omitted)
# Prints the entries with all whitespace removed, empty entries dropped,
# sorted numerically, de-duplicated and re-joined with commas.
# An empty input produces an empty result.
normalize_list() {
    # printf rather than echo: depending on the shell, echo treats a leading
    # "-n" as an option and expands backslash escapes in its argument.
    printf '%s\n' "${1:-}" \
        | tr ',' '\n' \
        | sed 's/[[:space:]]//g' \
        | awk 'NF' \
        | sort -n \
        | uniq \
        | paste -sd, -
}
|
||||
|
||||
# Test whether a value is one of the entries of a comma-separated list.
#   $1  value to look for
#   $2  comma-separated list (may be empty or omitted)
# Returns 0 when $1 is an entry of $2, 1 otherwise.
contains_csv() {
    # Wrapping the list in commas lets one pattern match first, middle and
    # last entries alike. The quoted part of a case pattern is matched
    # literally, so characters such as "." or "*" in $1 are not wildcards.
    case ",${2:-}," in
        *",$1,"*) return 0 ;;
        *) return 1 ;;
    esac
}
|
||||
|
||||
# Dump OpenCL troubleshooting information to stderr: the ICD vendor directory
# and the contents of each .icd file, the /dev/nvidia* nodes, the OpenCL/CUDA
# libraries known to ldconfig, `clinfo -l` when clinfo is installed, and
# john's own OpenCL device list. Every probe is best-effort; a failing probe
# never aborts the function.
# Must be called with the john run directory as the current directory,
# because john is invoked as ./john.
show_opencl_diagnostics() {
    # One redirection for the whole report instead of one per command.
    {
        echo "-- OpenCL ICD vendors --"
        if [ ! -d /etc/OpenCL/vendors ]; then
            echo " /etc/OpenCL/vendors is missing"
        else
            ls -l /etc/OpenCL/vendors || true
            for icd_file in /etc/OpenCL/vendors/*.icd; do
                # An unmatched glob stays literal; skip anything that is not a file.
                if [ -f "${icd_file}" ]; then
                    echo " file: ${icd_file}"
                    sed 's/^/ /' "${icd_file}" || true
                fi
            done
        fi

        echo "-- NVIDIA device nodes --"
        ls -l /dev/nvidia* || true

        echo "-- ldconfig OpenCL/NVIDIA --"
        ldconfig -p 2>/dev/null | grep 'libOpenCL\|libcuda\|libnvidia-opencl' || true

        if command -v clinfo >/dev/null 2>&1; then
            echo "-- clinfo -l --"
            clinfo -l || true
        fi

        echo "-- john --list=opencl-devices --"
        ./john --list=opencl-devices || true
    } >&2
}
|
||||
|
||||
# Re-run the NVIDIA bring-up helper (when installed) and refresh the dynamic
# linker cache. Both steps are best-effort and their output is discarded.
# Returns 1 without doing anything when not running as root, 0 otherwise.
refresh_nvidia_runtime() {
    [ "$(id -u)" = "0" ] || return 1

    if command -v bee-nvidia-load >/dev/null 2>&1; then
        bee-nvidia-load >/dev/null 2>&1 || true
    fi
    ldconfig >/dev/null 2>&1 || true

    return 0
}
|
||||
|
||||
# Make sure the nvidia_uvm kernel module is loaded.
# Returns 0 when the module was already loaded or was inserted successfully.
# Returns 1 when it is not loaded and cannot be inserted: not root, module
# file missing, or insmod failed.
# After a successful insmod the /dev/nvidia-uvm and /dev/nvidia-uvm-tools
# character devices are created on a best-effort basis.
ensure_nvidia_uvm() {
    if lsmod 2>/dev/null | grep -q '^nvidia_uvm '; then
        return 0
    fi

    # Everything below needs root.
    [ "$(id -u)" = "0" ] || return 1

    ko="/usr/local/lib/nvidia/nvidia-uvm.ko"
    [ -f "${ko}" ] || return 1
    insmod "${ko}" >/dev/null 2>&1 || return 1

    # The major number is assigned when the module registers; read it back
    # from /proc/devices to create the nodes.
    uvm_major=$(grep -m1 ' nvidia-uvm$' /proc/devices | awk '{print $1}')
    if [ -z "${uvm_major}" ]; then
        return 0
    fi

    # mknod fails when the node already exists; that is fine.
    mknod -m 666 /dev/nvidia-uvm c "${uvm_major}" 0 2>/dev/null || true
    mknod -m 666 /dev/nvidia-uvm-tools c "${uvm_major}" 1 2>/dev/null || true
    return 0
}
|
||||
|
||||
# Succeeds when `./john --list=opencl-devices` reports at least one device.
# The combined stdout/stderr of john is left in the global variable `out`.
john_lists_opencl_device() {
    out=$(./john --list=opencl-devices 2>&1 || true)
    echo "${out}" | grep -q "Device #"
}

# Verify that john can see an OpenCL device, repairing the NVIDIA runtime if not.
# Probes up to three times: as-is, after refresh_nvidia_runtime, and after
# ensure_nvidia_uvm. A repair step that fails skips its re-probe.
# Returns 0 as soon as a probe finds a device. Otherwise prints an explanation
# plus full diagnostics to stderr and returns 1.
# Must be called with the john run directory as the current directory.
ensure_opencl_ready() {
    if john_lists_opencl_device; then
        return 0
    fi

    if refresh_nvidia_runtime && john_lists_opencl_device; then
        return 0
    fi

    if ensure_nvidia_uvm && john_lists_opencl_device; then
        return 0
    fi

    echo "OpenCL devices are not available for John." >&2
    lsmod 2>/dev/null | grep -q '^nvidia_uvm ' || echo "nvidia_uvm is not loaded." >&2
    [ -e /dev/nvidia-uvm ] || echo "/dev/nvidia-uvm is missing." >&2
    show_opencl_diagnostics
    return 1
}
|
||||
|
||||
# Parse command-line options. Every option takes exactly one value; an unknown
# option or a missing value prints the synopsis and exits with status 2.
while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
        --devices)    [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude)    [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        --format)     [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
        *) usage ;;
    esac
done

# The duration is handed to john as --test=N much later. Reject anything that
# is not a plain non-negative integer now, before any GPU work has started.
case "${SECONDS}" in
    ''|*[!0-9]*)
        echo "invalid --seconds value: ${SECONDS}" >&2
        usage
        ;;
esac
|
||||
|
||||
# Nothing can run without the john binary.
[ -x "${JOHN_BIN}" ] || { echo "john binary not found: ${JOHN_BIN}" >&2; exit 1; }

# Every GPU index known to the driver, as a comma-separated list.
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")

# No explicit --devices means "all GPUs".
SELECTED="${DEVICES:-${ALL_DEVICES}}"

# FINAL = SELECTED minus EXCLUDE, order preserved.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    # The index is used in shell arithmetic below; a non-numeric value would
    # abort the script there with an unhelpful arithmetic error.
    case "${id}" in
        *[!0-9]*) echo "invalid GPU index: ${id}" >&2; exit 2 ;;
    esac
    # An index the driver does not report cannot be mapped to a john device.
    if ! contains_csv "${id}" "${ALL_DEVICES}"; then
        echo "GPU index not reported by nvidia-smi: ${id}" >&2
        exit 1
    fi
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    FINAL="${FINAL:+${FINAL},}${id}"
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

# The john device id is taken to be the nvidia-smi index plus one.
# NOTE(review): this assumes john enumerates the NVIDIA GPUs in nvidia-smi
# order with no other OpenCL device ahead of them — confirm on target hardware.
JOHN_DEVICES=""
for id in $(echo "${FINAL}" | tr ',' ' '); do
    opencl_id=$((id + 1))
    JOHN_DEVICES="${JOHN_DEVICES:+${JOHN_DEVICES},}${opencl_id}"
done

# Machine-readable summary of the run parameters on stdout.
echo "loader=john"
echo "selected_gpus=${FINAL}"
echo "john_devices=${JOHN_DEVICES}"

# From here on john is invoked as ./john.
cd "${JOHN_DIR}"

ensure_opencl_ready || exit 1
|
||||
|
||||
# Print the john format to benchmark with.
# An explicit FORMAT (from --format) is printed unchanged and never probed.
# Otherwise each built-in candidate is tried in order with a one-second
# `john --test` on the selected devices, and the first one that succeeds is
# printed. Returns 1, printing nothing, when every candidate fails.
# Must be called with the john run directory as the current directory.
choose_format() {
    if [ -n "${FORMAT}" ]; then
        echo "${FORMAT}"
        return 0
    fi

    for candidate in \
        sha512crypt-opencl \
        pbkdf2-hmac-sha512-opencl \
        7z-opencl \
        sha256crypt-opencl \
        md5crypt-opencl
    do
        ./john --test=1 --format="${candidate}" --devices="${JOHN_DEVICES}" >/dev/null 2>&1 || continue
        echo "${candidate}"
        return 0
    done

    return 1
}
|
||||
|
||||
# Pick the benchmark format; without one there is nothing to run.
CHOSEN_FORMAT=$(choose_format) || {
    echo "no suitable john OpenCL format found" >&2
    ./john --list=opencl-devices >&2 || true
    exit 1
}

echo "format=${CHOSEN_FORMAT}"

PIDS=""

# Terminate every john worker started so far. Workers that have already
# exited are ignored.
kill_john_workers() {
    for worker_pid in ${PIDS}; do
        kill "${worker_pid}" 2>/dev/null || true
    done
}

# Background jobs of a non-interactive shell are started with SIGINT ignored,
# and a SIGTERM sent to this script does not reach them. Without these traps
# an interrupted run leaves the workers loading the GPUs.
trap 'kill_john_workers; exit 130' INT
trap 'kill_john_workers; exit 143' TERM

# One john process per device, started three seconds apart.
_first=1
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
    [ "${_first}" = "1" ] || sleep 3
    _first=0
    ./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" &
    PIDS="${PIDS} $!"
done

# Collect every worker and count the ones that exited non-zero.
FAIL=0
for pid in ${PIDS}; do
    wait "${pid}" || FAIL=$((FAIL+1))
done

trap - INT TERM

[ "${FAIL}" -eq 0 ] || { echo "john: ${FAIL} device(s) failed" >&2; exit 1; }
|
||||
@@ -17,7 +17,7 @@ mkdir -p "$(dirname "$log_file")"
|
||||
serial_sink() {
|
||||
local tty="$1"
|
||||
if [ -w "$tty" ]; then
|
||||
cat > "$tty"
|
||||
cat > "$tty" 2>/dev/null || true
|
||||
else
|
||||
cat > /dev/null
|
||||
fi
|
||||
|
||||
91
iso/overlay/usr/local/bin/bee-nccl-gpu-stress
Normal file
91
iso/overlay/usr/local/bin/bee-nccl-gpu-stress
Normal file
@@ -0,0 +1,91 @@
|
||||
#!/bin/sh
|
||||
set -eu
|
||||
|
||||
# Default total run time in seconds; replaced by the --seconds/-t argument.
# NOTE(review): SECONDS is a special, self-incrementing variable in bash and ksh.
# Harmless when /bin/sh is dash or busybox ash — confirm for the target image.
SECONDS=300
|
||||
# Comma-separated GPU indices to stress (--devices). Empty means every GPU
# reported by nvidia-smi.
DEVICES=""
|
||||
# Comma-separated GPU indices to leave out of the run (--exclude).
EXCLUDE=""
|
||||
# Passed to all_reduce_perf as -b; presumably the smallest message size.
MIN_BYTES="512M"
|
||||
# Passed to all_reduce_perf as -e; presumably the largest message size.
MAX_BYTES="4G"
|
||||
# Passed to all_reduce_perf as -f; presumably the size multiplier between steps.
FACTOR="2"
|
||||
# Passed to all_reduce_perf as --iters.
ITERS="20"
|
||||
# Benchmark binary that is run repeatedly until the deadline is reached.
ALL_REDUCE_BIN="/usr/local/bin/all_reduce_perf"
|
||||
|
||||
# Write the command-line synopsis to stderr and terminate with exit status 2.
usage() {
    >&2 echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3]"
    exit 2
}
|
||||
|
||||
# Canonicalise a comma-separated list of GPU indices.
#   $1  raw list such as " 3,1, 1,2" (may be empty or omitted)
# Prints the entries with all whitespace removed, empty entries dropped,
# sorted numerically, de-duplicated and re-joined with commas.
# An empty input produces an empty result.
normalize_list() {
    # printf rather than echo: depending on the shell, echo treats a leading
    # "-n" as an option and expands backslash escapes in its argument.
    printf '%s\n' "${1:-}" \
        | tr ',' '\n' \
        | sed 's/[[:space:]]//g' \
        | awk 'NF' \
        | sort -n \
        | uniq \
        | paste -sd, -
}
|
||||
|
||||
# Test whether a value is one of the entries of a comma-separated list.
#   $1  value to look for
#   $2  comma-separated list (may be empty or omitted)
# Returns 0 when $1 is an entry of $2, 1 otherwise.
contains_csv() {
    # Wrapping the list in commas lets one pattern match first, middle and
    # last entries alike. The quoted part of a case pattern is matched
    # literally, so characters such as "." or "*" in $1 are not wildcards.
    case ",${2:-}," in
        *",$1,"*) return 0 ;;
        *) return 1 ;;
    esac
}
|
||||
|
||||
# Parse command-line options. Every option takes exactly one value; an unknown
# option or a missing value prints the synopsis and exits with status 2.
while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
        --devices)    [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude)    [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
done

# The duration is used in shell arithmetic to compute the deadline. Reject
# anything that is not a plain non-negative integer now, so a bad value gives
# a usage error and not an arithmetic failure later.
case "${SECONDS}" in
    ''|*[!0-9]*)
        echo "invalid --seconds value: ${SECONDS}" >&2
        usage
        ;;
esac
|
||||
|
||||
# Nothing can run without the benchmark binary.
[ -x "${ALL_REDUCE_BIN}" ] || { echo "all_reduce_perf not found: ${ALL_REDUCE_BIN}" >&2; exit 1; }

# Every GPU index known to the driver, as a comma-separated list.
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")

# No explicit --devices means "all GPUs".
SELECTED="${DEVICES:-${ALL_DEVICES}}"

# FINAL = SELECTED minus EXCLUDE, order preserved.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    # FINAL is exported verbatim through CUDA_VISIBLE_DEVICES, so only indices
    # that the driver actually reports are allowed in.
    case "${id}" in
        *[!0-9]*) echo "invalid GPU index: ${id}" >&2; exit 2 ;;
    esac
    if ! contains_csv "${id}" "${ALL_DEVICES}"; then
        echo "GPU index not reported by nvidia-smi: ${id}" >&2
        exit 1
    fi
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    FINAL="${FINAL:+${FINAL},}${id}"
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

# Number of entries in FINAL; passed to all_reduce_perf as -g.
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | awk '{print $1}')
[ "${GPU_COUNT}" -gt 0 ] || { echo "selected GPU count is zero" >&2; exit 1; }

# Machine-readable summary of the run parameters on stdout.
echo "loader=nccl"
echo "selected_gpus=${FINAL}"
echo "gpu_count=${GPU_COUNT}"
echo "range=${MIN_BYTES}..${MAX_BYTES}"
echo "iters=${ITERS}"
|
||||
|
||||
# Run all_reduce_perf back to back until the wall-clock deadline has passed.
# The deadline is only checked between rounds, so a round that has started
# always runs to completion. Because of `set -e`, a failing round ends the
# script with that round's exit status.
deadline=$(( $(date +%s) + SECONDS ))
round=0

now=$(date +%s)
while [ "${now}" -lt "${deadline}" ]; do
    round=$((round + 1))
    echo "round=${round} remaining_sec=$((deadline - now))"

    # Restrict the benchmark to the selected GPUs.
    CUDA_VISIBLE_DEVICES="${FINAL}" "${ALL_REDUCE_BIN}" \
        -b "${MIN_BYTES}" \
        -e "${MAX_BYTES}" \
        -f "${FACTOR}" \
        -g "${GPU_COUNT}" \
        --iters "${ITERS}"

    now=$(date +%s)
done
|
||||
@@ -6,25 +6,66 @@ LOG_PREFIX="bee-network"
|
||||
|
||||
log() { echo "[$LOG_PREFIX] $*"; }
|
||||
|
||||
# find physical interfaces: exclude lo and virtual (docker/virbr/veth/tun/tap)
|
||||
interfaces=$(ip -o link show \
|
||||
| awk -F': ' '{print $2}' \
|
||||
| grep -v '^lo$' \
|
||||
| grep -vE '^(docker|virbr|veth|tun|tap|br-|bond|dummy)' \
|
||||
| sort)
|
||||
# Print the names of candidate network interfaces, one per line, sorted.
# Names are taken from `ip -o link show`; the loopback device and any name
# starting with docker, virbr, veth, tun, tap, br-, bond or dummy are dropped.
list_interfaces() {
    ip -o link show \
        | awk -F': ' '{print $2}' \
        | grep -vE '^(lo$|docker|virbr|veth|tun|tap|br-|bond|dummy)' \
        | sort
}
|
||||
|
||||
if [ -z "$interfaces" ]; then
|
||||
# Give udev a short chance to expose late NICs before the first scan.
|
||||
if command -v udevadm >/dev/null 2>&1; then
|
||||
udevadm settle --timeout=5 >/dev/null 2>&1 || log "WARN: udevadm settle timed out"
|
||||
fi
|
||||
|
||||
started_ifaces=""
|
||||
started_count=0
|
||||
scan_pass=1
|
||||
|
||||
# Some server NICs appear a bit later after module/firmware init. Do a small
|
||||
# bounded rescan window without turning network bring-up into a boot blocker.
|
||||
while [ "$scan_pass" -le 3 ]; do
|
||||
interfaces=$(list_interfaces)
|
||||
|
||||
if [ -n "$interfaces" ]; then
|
||||
for iface in $interfaces; do
|
||||
case " $started_ifaces " in
|
||||
*" $iface "*) continue ;;
|
||||
esac
|
||||
|
||||
log "bringing up $iface"
|
||||
if ! ip link set "$iface" up; then
|
||||
log "WARN: could not bring up $iface"
|
||||
continue
|
||||
fi
|
||||
|
||||
carrier=$(cat "/sys/class/net/$iface/carrier" 2>/dev/null || true)
|
||||
if [ "$carrier" = "1" ]; then
|
||||
log "carrier detected on $iface"
|
||||
else
|
||||
log "carrier not detected yet on $iface"
|
||||
fi
|
||||
|
||||
# DHCP in background — non-blocking, keep dhclient verbose output in the service log.
|
||||
dhclient -4 -v -nw "$iface" &
|
||||
log "DHCP started for $iface (pid $!)"
|
||||
|
||||
started_ifaces="$started_ifaces $iface"
|
||||
started_count=$((started_count + 1))
|
||||
done
|
||||
fi
|
||||
|
||||
if [ "$scan_pass" -ge 3 ]; then
|
||||
break
|
||||
fi
|
||||
scan_pass=$((scan_pass + 1))
|
||||
sleep 2
|
||||
done
|
||||
|
||||
if [ "$started_count" -eq 0 ]; then
|
||||
log "no physical interfaces found"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
for iface in $interfaces; do
|
||||
log "bringing up $iface"
|
||||
ip link set "$iface" up || { log "WARN: could not bring up $iface"; continue; }
|
||||
|
||||
# DHCP in background — non-blocking, keep dhclient verbose output in the service log.
|
||||
dhclient -4 -v -nw "$iface" &
|
||||
log "DHCP started for $iface (pid $!)"
|
||||
done
|
||||
|
||||
log "done"
|
||||
log "done (interfaces started: $started_count)"
|
||||
|
||||
@@ -59,11 +59,24 @@ load_module() {
|
||||
return 1
|
||||
}
|
||||
|
||||
# Load a kernel module by name with modprobe, discarding modprobe's output.
#   $1  module name
# Logs a confirmation and returns 0 on success; returns 1 silently on failure.
load_host_module() {
    mod="$1"
    modprobe "$mod" >/dev/null 2>&1 || return 1
    log "host module loaded: $mod"
    return 0
}
|
||||
|
||||
case "$nvidia_mode" in
|
||||
normal|full)
|
||||
if ! load_module nvidia; then
|
||||
exit 1
|
||||
fi
|
||||
# nvidia-modeset on some server kernels needs ACPI video helper symbols
|
||||
# exported by the generic "video" module. Best-effort only; compute paths
|
||||
# remain functional even if display-related modules stay absent.
|
||||
load_host_module video || true
|
||||
load_module nvidia-modeset || true
|
||||
load_module nvidia-uvm || true
|
||||
;;
|
||||
@@ -114,4 +127,19 @@ fi
|
||||
ldconfig 2>/dev/null || true
|
||||
log "ldconfig refreshed"
|
||||
|
||||
# Start DCGM host engine so dcgmi can discover GPUs.
|
||||
# nv-hostengine must run before any dcgmi command — without it, dcgmi reports
|
||||
# "group is empty" even when GPUs and modules are present.
|
||||
# Skip if already running (e.g. started by a dcgm systemd service or prior boot).
|
||||
if command -v nv-hostengine >/dev/null 2>&1; then
|
||||
if pgrep -x nv-hostengine >/dev/null 2>&1; then
|
||||
log "nv-hostengine already running — skipping"
|
||||
else
|
||||
nv-hostengine
|
||||
log "nv-hostengine started"
|
||||
fi
|
||||
else
|
||||
log "WARN: nv-hostengine not found — dcgmi diagnostics will not work"
|
||||
fi
|
||||
|
||||
log "done"
|
||||
|
||||
@@ -24,7 +24,7 @@ chromium \
|
||||
--no-first-run \
|
||||
--disable-session-crashed-bubble \
|
||||
--disable-features=TranslateUI \
|
||||
--start-fullscreen \
|
||||
--start-maximized \
|
||||
http://localhost/ &
|
||||
|
||||
exec openbox
|
||||
|
||||
@@ -3,6 +3,11 @@
|
||||
# Type 'a' at any prompt to abort, 'b' to go back.
|
||||
set -e
|
||||
|
||||
# Requires root for ip/dhclient/resolv.conf — re-exec via sudo if needed.
|
||||
if [ "$(id -u)" -ne 0 ]; then
|
||||
exec sudo "$0" "$@"
|
||||
fi
|
||||
|
||||
abort() { echo "Aborted."; exit 0; }
|
||||
|
||||
ask() {
|
||||
|
||||
Reference in New Issue
Block a user