Compare commits

...

16 Commits
v6.5 ... v7.5

Author SHA1 Message Date
05241f2e0e Redesign dashboard: split Runtime Health and Hardware Summary
- Runtime Health now shows only LiveCD system status (services, tools,
  drivers, network, CUDA/ROCm) — hardware component rows removed
- Hardware Summary now shows server components with readable descriptions
  (model, count×size) and component-status.json health badges
- Add Network Adapters row to Hardware Summary
- SFP module static info (vendor, PN, SN, connector, type, wavelength)
  now collected via ethtool -m regardless of carrier state
- PSU statuses from IPMI audit written to component-status.json so PSU
  badge shows actual status after first audit instead of UNKNOWN

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-09 23:41:23 +03:00
Mikhail Chusavitin
c1690a084b Fix app tests that mutate global defaults 2026-04-09 15:28:25 +03:00
Mikhail Chusavitin
9481ca2805 Add staged NVIDIA burn ramp-up mode 2026-04-09 15:21:14 +03:00
Mikhail Chusavitin
a78fdadd88 Refine validate and burn profile layout 2026-04-09 15:14:48 +03:00
Mikhail Chusavitin
4ef403898f Tighten NVIDIA GPU PCI detection 2026-04-09 15:14:48 +03:00
025548ab3c UI: amber accents, smaller wallpaper logo, new support bundle name, drop display resolution
- Bootloader: GRUB fallback text colors → yellow/brown (amber tone)
- CLI charts: all GPU metric series use single amber color (xterm-256 #214)
- Wallpaper: logo width scaled to 400 px dynamically, shadow scales with font size
- Support bundle: renamed to YYYY-MM-DD (BEE-SP vX.X) SRV_MODEL SRV_SN ToD.tar.gz
  using dmidecode for server model (spaces→underscores) and serial number
- Remove display resolution feature (UI card, API routes, handlers, tests)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 21:37:01 +03:00
Mikhail Chusavitin
e0d94d7f47 Remove HPL from build and audit flows 2026-04-08 10:00:23 +03:00
Mikhail Chusavitin
13899aa864 Drop incompatible HPL git fallback 2026-04-08 09:50:58 +03:00
Mikhail Chusavitin
f345d8a89d Build HPL serially to avoid upstream make races 2026-04-08 09:47:35 +03:00
Mikhail Chusavitin
4715059ac0 Fix HPL MPI stub header and keep full build logs 2026-04-08 09:45:14 +03:00
Mikhail Chusavitin
0660a40287 Harden HPL builder cache and runtime libs 2026-04-08 09:40:18 +03:00
Mikhail Chusavitin
67369d9b7b Fix OpenBLAS package lookup in HPL build 2026-04-08 09:32:49 +03:00
Mikhail Chusavitin
3f41a026ca Add resilient HPL source fallbacks 2026-04-08 09:25:31 +03:00
Mikhail Chusavitin
0ee4f46537 Restore MOTD-style ASCII wallpaper 2026-04-08 09:14:27 +03:00
8db40b098a Update bible submodule
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 07:14:31 +03:00
16e7ae00e7 Add HPL (LINPACK) benchmark as validate/stress task
HPL 2.3 from netlib compiled against OpenBLAS with a minimal
single-process MPI stub — no MPI package required in the ISO.
Matrix size is auto-sized to 80% of total RAM at runtime.

Build:
- VERSIONS: HPL_VERSION=2.3, HPL_SHA256=32c5c17d…
- build-hpl.sh: downloads HPL + OpenBLAS from Debian 12 repo,
  compiles xhpl with a self-contained mpi_stub.c
- build.sh: step 80-hpl, injects xhpl + libopenblas into overlay

Runtime:
- bee-hpl: generates HPL.dat (N auto from /proc/meminfo, NB=256,
  P=1 Q=1), runs xhpl, prints standard WR... Gflops output
- platform/hpl.go: RunHPL(), parses WR line → GFlops + PASSED/FAILED
- tasks.go: target "hpl"
- pages.go: LINPACK (HPL) card in validate/stress grid (stress-only)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 07:08:18 +03:00
25 changed files with 780 additions and 450 deletions

View File

@@ -117,7 +117,7 @@ type satRunner interface {
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
@@ -190,6 +190,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
} }
result := collector.Run(runtimeMode) result := collector.Run(runtimeMode)
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB) applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil { if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
result.Runtime = &health result.Runtime = &health
} }
@@ -566,11 +567,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
} }
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" { if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir baseDir = DefaultSATBaseDir
} }
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc) return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
} }
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
@@ -926,6 +927,41 @@ func bodyOr(body, fallback string) string {
return body return body
} }
// writePSUStatusesToDB records PSU statuses collected during audit into the
// component-status DB so they are visible in the Hardware Summary card.
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
if db == nil || len(psus) == 0 {
return
}
const source = "audit:ipmi"
worstStatus := "OK"
for _, psu := range psus {
if psu.Status == nil {
continue
}
slot := "?"
if psu.Slot != nil {
slot = *psu.Slot
}
st := *psu.Status
detail := ""
if psu.ErrorDescription != nil {
detail = *psu.ErrorDescription
}
db.Record("psu:"+slot, source, st, detail)
switch st {
case "Critical":
worstStatus = "Critical"
case "Warning":
if worstStatus != "Critical" {
worstStatus = "Warning"
}
}
}
db.Record("psu:all", source, worstStatus, "")
}
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) { func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
raw, err := os.ReadFile(path) raw, err := os.ReadFile(path)
if err != nil { if err != nil {

View File

@@ -161,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
return f.runNvidiaFn(baseDir) return f.runNvidiaFn(baseDir)
} }
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) { func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
if f.runNvidiaComputeFn != nil { if f.runNvidiaComputeFn != nil {
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices) return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
} }
@@ -542,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
} }
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) { func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
t.Parallel()
tmp := t.TempDir() tmp := t.TempDir()
oldExportDir := DefaultExportDir oldExportDir := DefaultExportDir
DefaultExportDir = tmp DefaultExportDir = tmp
@@ -580,8 +578,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
} }
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) { func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
t.Parallel()
tmp := t.TempDir() tmp := t.TempDir()
oldExportDir := DefaultExportDir oldExportDir := DefaultExportDir
DefaultExportDir = tmp DefaultExportDir = tmp
@@ -643,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
} }
func TestRunSATDefaultsToExportDir(t *testing.T) { func TestRunSATDefaultsToExportDir(t *testing.T) {
t.Parallel()
oldSATBaseDir := DefaultSATBaseDir oldSATBaseDir := DefaultSATBaseDir
DefaultSATBaseDir = "/tmp/export/bee-sat" DefaultSATBaseDir = "/tmp/export/bee-sat"
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir }) t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })

View File

@@ -54,7 +54,7 @@ if ! command -v lspci >/dev/null 2>&1; then
exit 0 exit 0
fi fi
found=0 found=0
for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
found=1 found=1
echo "=== GPU $gpu ===" echo "=== GPU $gpu ==="
lspci -s "$gpu" -vv 2>&1 || true lspci -s "$gpu" -vv 2>&1 || true
@@ -73,8 +73,13 @@ fi
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", ` {name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
for d in /sys/bus/pci/devices/*/; do for d in /sys/bus/pci/devices/*/; do
vendor=$(cat "$d/vendor" 2>/dev/null) vendor=$(cat "$d/vendor" 2>/dev/null)
[ "$vendor" = "0x10de" ] || continue [ "$vendor" = "0x10de" ] || continue
dev=$(basename "$d") class=$(cat "$d/class" 2>/dev/null)
case "$class" in
0x030000|0x030200) ;;
*) continue ;;
esac
dev=$(basename "$d")
echo "=== $dev ===" echo "=== $dev ==="
for f in current_link_speed current_link_width max_link_speed max_link_width; do for f in current_link_speed current_link_width max_link_speed max_link_width; do
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)" printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
@@ -192,7 +197,7 @@ var supportBundleOptionalFiles = []struct {
{name: "system/syslog.txt", src: "/var/log/syslog"}, {name: "system/syslog.txt", src: "/var/log/syslog"},
} }
const supportBundleGlob = "bee-support-*.tar.gz" const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
func BuildSupportBundle(exportDir string) (string, error) { func BuildSupportBundle(exportDir string) (string, error) {
exportDir = strings.TrimSpace(exportDir) exportDir = strings.TrimSpace(exportDir)
@@ -206,9 +211,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
return "", err return "", err
} }
host := sanitizeFilename(hostnameOr("unknown")) now := time.Now().UTC()
ts := time.Now().UTC().Format("20060102-150405") date := now.Format("2006-01-02")
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts)) tod := now.Format("15:04:05")
ver := bundleVersion()
model := serverModelForBundle()
sn := serverSerialForBundle()
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
if err := os.MkdirAll(stageRoot, 0755); err != nil { if err := os.MkdirAll(stageRoot, 0755); err != nil {
return "", err return "", err
} }
@@ -240,7 +250,8 @@ func BuildSupportBundle(exportDir string) (string, error) {
return "", err return "", err
} }
archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts)) archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
archivePath := filepath.Join(os.TempDir(), archiveName)
if err := createSupportTarGz(archivePath, stageRoot); err != nil { if err := createSupportTarGz(archivePath, stageRoot); err != nil {
return "", err return "", err
} }
@@ -397,6 +408,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
return os.WriteFile(dst, []byte(body.String()), 0644) return os.WriteFile(dst, []byte(body.String()), 0644)
} }
func bundleVersion() string {
v := buildVersion()
v = strings.TrimPrefix(v, "v")
v = strings.TrimPrefix(v, "V")
if v == "" || v == "unknown" {
return "0.0"
}
return v
}
func serverModelForBundle() string {
raw, err := exec.Command("dmidecode", "-t", "1").Output()
if err != nil {
return "unknown"
}
for _, line := range strings.Split(string(raw), "\n") {
line = strings.TrimSpace(line)
key, val, ok := strings.Cut(line, ": ")
if !ok {
continue
}
if strings.TrimSpace(key) == "Product Name" {
val = strings.TrimSpace(val)
if val == "" {
return "unknown"
}
return strings.ReplaceAll(val, " ", "_")
}
}
return "unknown"
}
func serverSerialForBundle() string {
raw, err := exec.Command("dmidecode", "-t", "1").Output()
if err != nil {
return "unknown"
}
for _, line := range strings.Split(string(raw), "\n") {
line = strings.TrimSpace(line)
key, val, ok := strings.Cut(line, ": ")
if !ok {
continue
}
if strings.TrimSpace(key) == "Serial Number" {
val = strings.TrimSpace(val)
if val == "" {
return "unknown"
}
return val
}
}
return "unknown"
}
func buildVersion() string { func buildVersion() string {
raw, err := exec.Command("bee", "version").CombinedOutput() raw, err := exec.Command("bee", "version").CombinedOutput()
if err != nil { if err != nil {

View File

@@ -179,11 +179,3 @@ func commandOutputWithTimeout(timeout time.Duration, name string, args ...string
defer cancel() defer cancel()
return exec.CommandContext(ctx, name, args...).Output() return exec.CommandContext(ctx, name, args...).Output()
} }
func interfaceHasCarrier(iface string) bool {
raw, err := readNetCarrierFile(iface)
if err != nil {
return false
}
return strings.TrimSpace(raw) == "1"
}

View File

@@ -58,12 +58,10 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
} }
} }
if interfaceHasCarrier(iface) { if out, err := ethtoolModuleQuery(iface); err == nil {
if out, err := ethtoolModuleQuery(iface); err == nil { if injectSFPDOMTelemetry(&devs[i], out) {
if injectSFPDOMTelemetry(&devs[i], out) { enriched++
enriched++ continue
continue
}
} }
} }
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil { if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
@@ -115,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
} }
key := strings.ToLower(strings.TrimSpace(trimmed[:idx])) key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
val := strings.TrimSpace(trimmed[idx+1:]) val := strings.TrimSpace(trimmed[idx+1:])
if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
continue
}
switch { switch {
case key == "identifier":
s := parseSFPIdentifier(val)
dev.SFPIdentifier = &s
t := true
dev.SFPPresent = &t
changed = true
case key == "connector":
s := parseSFPConnector(val)
dev.SFPConnector = &s
changed = true
case key == "vendor name":
s := strings.TrimSpace(val)
dev.SFPVendor = &s
changed = true
case key == "vendor pn":
s := strings.TrimSpace(val)
dev.SFPPartNumber = &s
changed = true
case key == "vendor sn":
s := strings.TrimSpace(val)
dev.SFPSerialNumber = &s
changed = true
case strings.Contains(key, "laser wavelength"):
if f, ok := firstFloat(val); ok {
dev.SFPWavelengthNM = &f
changed = true
}
case strings.Contains(key, "module temperature"): case strings.Contains(key, "module temperature"):
if f, ok := firstFloat(val); ok { if f, ok := firstFloat(val); ok {
dev.SFPTemperatureC = &f dev.SFPTemperatureC = &f
@@ -147,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
return changed return changed
} }
// parseSFPIdentifier extracts the human-readable transceiver type from the
// raw ethtool identifier line, e.g. "0x03 (SFP)" → "SFP".
func parseSFPIdentifier(val string) string {
if s := extractParens(val); s != "" {
return s
}
return val
}
// parseSFPConnector extracts the connector type from the raw ethtool line,
// e.g. "0x07 (LC)" → "LC".
func parseSFPConnector(val string) string {
if s := extractParens(val); s != "" {
return s
}
return val
}
var parenRe = regexp.MustCompile(`\(([^)]+)\)`)
func extractParens(s string) string {
m := parenRe.FindStringSubmatch(s)
if len(m) < 2 {
return ""
}
return strings.TrimSpace(m[1])
}
func parseSFPDOM(raw string) map[string]any { func parseSFPDOM(raw string) map[string]any {
dev := schema.HardwarePCIeDevice{} dev := schema.HardwarePCIeDevice{}
if !injectSFPDOMTelemetry(&dev, raw) { if !injectSFPDOMTelemetry(&dev, raw) {
return map[string]any{} return map[string]any{}
} }
out := map[string]any{} out := map[string]any{}
if dev.SFPPresent != nil {
out["sfp_present"] = *dev.SFPPresent
}
if dev.SFPIdentifier != nil {
out["sfp_identifier"] = *dev.SFPIdentifier
}
if dev.SFPConnector != nil {
out["sfp_connector"] = *dev.SFPConnector
}
if dev.SFPVendor != nil {
out["sfp_vendor"] = *dev.SFPVendor
}
if dev.SFPPartNumber != nil {
out["sfp_part_number"] = *dev.SFPPartNumber
}
if dev.SFPSerialNumber != nil {
out["sfp_serial_number"] = *dev.SFPSerialNumber
}
if dev.SFPWavelengthNM != nil {
out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
}
if dev.SFPTemperatureC != nil { if dev.SFPTemperatureC != nil {
out["sfp_temperature_c"] = *dev.SFPTemperatureC out["sfp_temperature_c"] = *dev.SFPTemperatureC
} }

View File

@@ -122,10 +122,7 @@ func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T)
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil } readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
readNetCarrierFile = func(string) (string, error) { return "0", nil } readNetCarrierFile = func(string) (string, error) { return "0", nil }
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") } ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
ethtoolModuleQuery = func(string) (string, error) { ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
t.Fatal("ethtool -m should not be called without carrier")
return "", nil
}
class := "EthernetController" class := "EthernetController"
bdf := "0000:18:00.0" bdf := "0000:18:00.0"

View File

@@ -383,10 +383,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
} }
const ( const (
ansiRed = "\033[31m" ansiAmber = "\033[38;5;214m"
ansiBlue = "\033[34m"
ansiGreen = "\033[32m"
ansiYellow = "\033[33m"
ansiReset = "\033[0m" ansiReset = "\033[0m"
) )
@@ -415,10 +412,10 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
fn func(GPUMetricRow) float64 fn func(GPUMetricRow) float64
} }
defs := []seriesDef{ defs := []seriesDef{
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }}, {"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }}, {"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }}, {"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }}, {"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
} }
var b strings.Builder var b strings.Builder

View File

@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
"--seconds", strconv.Itoa(opts.DurationSec), "--seconds", strconv.Itoa(opts.DurationSec),
"--size-mb", strconv.Itoa(opts.SizeMB), "--size-mb", strconv.Itoa(opts.SizeMB),
} }
if opts.StaggerSeconds > 0 && len(selected) > 1 {
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
}
if len(selected) > 0 { if len(selected) > 0 {
cmd = append(cmd, "--devices", joinIndexList(selected)) cmd = append(cmd, "--devices", joinIndexList(selected))
} }
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
"bee-john-gpu-stress", "bee-john-gpu-stress",
"--seconds", strconv.Itoa(opts.DurationSec), "--seconds", strconv.Itoa(opts.DurationSec),
} }
if opts.StaggerSeconds > 0 && len(selected) > 1 {
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
}
if len(selected) > 0 { if len(selected) > 0 {
cmd = append(cmd, "--devices", joinIndexList(selected)) cmd = append(cmd, "--devices", joinIndexList(selected))
} }

View File

@@ -384,25 +384,39 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
), logFunc) ), logFunc)
} }
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
selected, err := resolveDCGMGPUIndices(gpuIndices) selected, err := resolveDCGMGPUIndices(gpuIndices)
if err != nil { if err != nil {
return "", err return "", err
} }
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec))) var (
if err != nil { profCmd []string
return "", err profEnv []string
)
if staggerSec > 0 && len(selected) > 1 {
profCmd = []string{
"bee-dcgmproftester-staggered",
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
"--stagger-seconds", strconv.Itoa(staggerSec),
"--devices", joinIndexList(selected),
}
} else {
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
if err != nil {
return "", err
}
profEnv = nvidiaVisibleDevicesEnv(selected)
} }
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode( return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}}, satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}}, satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
satJob{ satJob{
name: "03-dcgmproftester.log", name: "03-dcgmproftester.log",
cmd: profCmd, cmd: profCmd,
env: nvidiaVisibleDevicesEnv(selected), env: profEnv,
collectGPU: true, collectGPU: true,
gpuIndices: selected, gpuIndices: selected,
}, },
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}}, satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
), logFunc) ), logFunc)
} }

View File

@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
Loader string Loader string
GPUIndices []int GPUIndices []int
ExcludeGPUIndices []int ExcludeGPUIndices []int
StaggerSeconds int
} }
func New() *System { func New() *System {

View File

@@ -183,6 +183,13 @@ type HardwarePCIeDevice struct {
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"` BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"` BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"` BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
SFPPresent *bool `json:"sfp_present,omitempty"`
SFPIdentifier *string `json:"sfp_identifier,omitempty"`
SFPConnector *string `json:"sfp_connector,omitempty"`
SFPVendor *string `json:"sfp_vendor,omitempty"`
SFPPartNumber *string `json:"sfp_part_number,omitempty"`
SFPSerialNumber *string `json:"sfp_serial_number,omitempty"`
SFPWavelengthNM *float64 `json:"sfp_wavelength_nm,omitempty"`
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"` SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"` SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"` SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`

View File

@@ -482,12 +482,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
return return
} }
var body struct { var body struct {
Duration int `json:"duration"` Duration int `json:"duration"`
StressMode bool `json:"stress_mode"` StressMode bool `json:"stress_mode"`
GPUIndices []int `json:"gpu_indices"` GPUIndices []int `json:"gpu_indices"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices"` ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
Loader string `json:"loader"` StaggerGPUStart bool `json:"stagger_gpu_start"`
Loader string `json:"loader"`
Profile string `json:"profile"` Profile string `json:"profile"`
DisplayName string `json:"display_name"` DisplayName string `json:"display_name"`
PlatformComponents []string `json:"platform_components"` PlatformComponents []string `json:"platform_components"`
@@ -503,12 +504,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
if strings.TrimSpace(body.DisplayName) != "" { if strings.TrimSpace(body.DisplayName) != "" {
name = body.DisplayName name = body.DisplayName
} }
params := taskParams{ params := taskParams{
Duration: body.Duration, Duration: body.Duration,
StressMode: body.StressMode, StressMode: body.StressMode,
GPUIndices: body.GPUIndices, GPUIndices: body.GPUIndices,
ExcludeGPUIndices: body.ExcludeGPUIndices, ExcludeGPUIndices: body.ExcludeGPUIndices,
Loader: body.Loader, StaggerGPUStart: body.StaggerGPUStart,
Loader: body.Loader,
BurnProfile: body.Profile, BurnProfile: body.Profile,
DisplayName: body.DisplayName, DisplayName: body.DisplayName,
PlatformComponents: body.PlatformComponents, PlatformComponents: body.PlatformComponents,
@@ -1376,107 +1378,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
return nil return nil
} }
// ── Display / Screen Resolution ───────────────────────────────────────────────
type displayMode struct {
Output string `json:"output"`
Mode string `json:"mode"`
Current bool `json:"current"`
}
type displayInfo struct {
Output string `json:"output"`
Modes []displayMode `json:"modes"`
Current string `json:"current"`
}
var xrandrOutputRE = regexp.MustCompile(`^(\S+)\s+connected`)
var xrandrModeRE = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
var xrandrCurrentRE = regexp.MustCompile(`\*`)
func parseXrandrOutput(out string) []displayInfo {
var infos []displayInfo
var cur *displayInfo
for _, line := range strings.Split(out, "\n") {
if m := xrandrOutputRE.FindStringSubmatch(line); m != nil {
if cur != nil {
infos = append(infos, *cur)
}
cur = &displayInfo{Output: m[1]}
continue
}
if cur == nil {
continue
}
if m := xrandrModeRE.FindStringSubmatch(line); m != nil {
isCurrent := xrandrCurrentRE.MatchString(line)
mode := displayMode{Output: cur.Output, Mode: m[1], Current: isCurrent}
cur.Modes = append(cur.Modes, mode)
if isCurrent {
cur.Current = m[1]
}
}
}
if cur != nil {
infos = append(infos, *cur)
}
return infos
}
func xrandrCommand(args ...string) *exec.Cmd {
cmd := exec.Command("xrandr", args...)
env := append([]string{}, os.Environ()...)
hasDisplay := false
hasXAuthority := false
for _, kv := range env {
if strings.HasPrefix(kv, "DISPLAY=") && strings.TrimPrefix(kv, "DISPLAY=") != "" {
hasDisplay = true
}
if strings.HasPrefix(kv, "XAUTHORITY=") && strings.TrimPrefix(kv, "XAUTHORITY=") != "" {
hasXAuthority = true
}
}
if !hasDisplay {
env = append(env, "DISPLAY=:0")
}
if !hasXAuthority {
env = append(env, "XAUTHORITY=/home/bee/.Xauthority")
}
cmd.Env = env
return cmd
}
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
out, err := xrandrCommand().Output()
if err != nil {
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
return
}
writeJSON(w, parseXrandrOutput(string(out)))
}
func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
var req struct {
Output string `json:"output"`
Mode string `json:"mode"`
}
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
writeError(w, http.StatusBadRequest, "output and mode are required")
return
}
// Validate mode looks like WxH to prevent injection
if !regexp.MustCompile(`^\d+x\d+$`).MatchString(req.Mode) {
writeError(w, http.StatusBadRequest, "invalid mode format")
return
}
// Validate output name (no special chars)
if !regexp.MustCompile(`^[A-Za-z0-9_\-]+$`).MatchString(req.Output) {
writeError(w, http.StatusBadRequest, "invalid output name")
return
}
if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
return
}
writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
}

View File

@@ -10,30 +10,6 @@ import (
"bee/audit/internal/platform" "bee/audit/internal/platform"
) )
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
t.Setenv("DISPLAY", "")
t.Setenv("XAUTHORITY", "")
cmd := xrandrCommand("--query")
var hasDisplay bool
var hasXAuthority bool
for _, kv := range cmd.Env {
if kv == "DISPLAY=:0" {
hasDisplay = true
}
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
hasXAuthority = true
}
}
if !hasDisplay {
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
}
if !hasXAuthority {
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
}
}
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) { func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
globalQueue.mu.Lock() globalQueue.mu.Lock()
originalTasks := globalQueue.tasks originalTasks := globalQueue.tasks

View File

@@ -317,106 +317,271 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
if err != nil { if err != nil {
return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>` return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
} }
// Parse just enough fields for the summary banner var ingest schema.HardwareIngestRequest
var snap struct { if err := json.Unmarshal(data, &ingest); err != nil {
Summary struct {
CPU struct{ Model string }
Memory struct{ TotalGB float64 }
Storage []struct{ Device, Model, Size string }
GPUs []struct{ Model string }
PSUs []struct{ Model string }
}
Network struct {
Interfaces []struct {
Name string
IPv4 []string
State string
}
}
}
// Try to extract top-level fields loosely
var raw map[string]json.RawMessage
if err := json.Unmarshal(data, &raw); err != nil {
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>` return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
} }
_ = snap hw := ingest.Hardware
// Also load runtime-health for badges var records []app.ComponentStatusRecord
type componentHealth struct { if db, err := app.OpenComponentStatusDB(filepath.Join(opts.ExportDir, "component-status.json")); err == nil {
FailCount int `json:"fail_count"` records = db.All()
WarnCount int `json:"warn_count"`
} }
type healthSummary struct {
CPU componentHealth `json:"cpu"`
Memory componentHealth `json:"memory"`
Storage componentHealth `json:"storage"`
GPU componentHealth `json:"gpu"`
PSU componentHealth `json:"psu"`
Network componentHealth `json:"network"`
}
var health struct {
HardwareHealth healthSummary `json:"hardware_health"`
}
if hdata, herr := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json")); herr == nil {
_ = json.Unmarshal(hdata, &health)
}
badge := func(h componentHealth) string {
if h.FailCount > 0 {
return `<span class="badge badge-err">FAIL</span>`
}
if h.WarnCount > 0 {
return `<span class="badge badge-warn">WARN</span>`
}
return `<span class="badge badge-ok">OK</span>`
}
// Extract readable strings from raw JSON
getString := func(key string) string {
v, ok := raw[key]
if !ok {
return ""
}
var s string
if err := json.Unmarshal(v, &s); err == nil {
return s
}
return ""
}
cpuModel := getString("cpu_model")
memStr := getString("memory_summary")
gpuSummary := getString("gpu_summary")
var b strings.Builder var b strings.Builder
b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`) b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
b.WriteString(`<table style="width:auto">`) b.WriteString(`<table style="width:auto">`)
writeRow := func(label, value, badgeHTML string) { writeRow := func(label, value, badgeHTML string) {
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`, b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
html.EscapeString(label), html.EscapeString(value), badgeHTML)) html.EscapeString(label), html.EscapeString(value), badgeHTML))
} }
if cpuModel != "" {
writeRow("CPU", cpuModel, badge(health.HardwareHealth.CPU)) cpuRow := aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil)
} else { writeRow("CPU", hwDescribeCPU(hw), runtimeStatusBadge(cpuRow.Status))
writeRow("CPU", "—", badge(health.HardwareHealth.CPU))
memRow := aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"})
writeRow("Memory", hwDescribeMemory(hw), runtimeStatusBadge(memRow.Status))
storageRow := aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"})
writeRow("Storage", hwDescribeStorage(hw), runtimeStatusBadge(storageRow.Status))
gpuRow := aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"})
writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
writeRow("Network", nicDesc, "")
} }
if memStr != "" {
writeRow("Memory", memStr, badge(health.HardwareHealth.Memory))
} else {
writeRow("Memory", "—", badge(health.HardwareHealth.Memory))
}
if gpuSummary != "" {
writeRow("GPU", gpuSummary, badge(health.HardwareHealth.GPU))
} else {
writeRow("GPU", "—", badge(health.HardwareHealth.GPU))
}
writeRow("Storage", "—", badge(health.HardwareHealth.Storage))
writeRow("PSU", "—", badge(health.HardwareHealth.PSU))
b.WriteString(`</table>`) b.WriteString(`</table>`)
b.WriteString(`</div></div>`) b.WriteString(`</div></div>`)
return b.String() return b.String()
} }
// hwDescribeCPU returns a human-readable CPU summary, e.g. "2× Intel Xeon Gold 6338".
func hwDescribeCPU(hw schema.HardwareSnapshot) string {
counts := map[string]int{}
order := []string{}
for _, cpu := range hw.CPUs {
model := "Unknown CPU"
if cpu.Model != nil && *cpu.Model != "" {
model = *cpu.Model
}
if counts[model] == 0 {
order = append(order, model)
}
counts[model]++
}
if len(order) == 0 {
return "—"
}
parts := make([]string, 0, len(order))
for _, m := range order {
if counts[m] > 1 {
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
} else {
parts = append(parts, m)
}
}
return strings.Join(parts, ", ")
}
// hwDescribeMemory returns a summary like "16× 32 GB DDR4".
func hwDescribeMemory(hw schema.HardwareSnapshot) string {
type key struct {
sizeMB int
typ string
}
counts := map[key]int{}
order := []key{}
for _, dimm := range hw.Memory {
if dimm.SizeMB == nil || *dimm.SizeMB == 0 {
continue
}
t := ""
if dimm.Type != nil {
t = *dimm.Type
}
k := key{*dimm.SizeMB, t}
if counts[k] == 0 {
order = append(order, k)
}
counts[k]++
}
if len(order) == 0 {
return "—"
}
parts := make([]string, 0, len(order))
for _, k := range order {
gb := k.sizeMB / 1024
desc := fmt.Sprintf("%d× %d GB", counts[k], gb)
if k.typ != "" {
desc += " " + k.typ
}
parts = append(parts, desc)
}
return strings.Join(parts, ", ")
}
// hwDescribeStorage returns a summary like "4× 3.84 TB NVMe, 2× 1.92 TB SATA".
func hwDescribeStorage(hw schema.HardwareSnapshot) string {
type key struct {
sizeGB int
iface string
}
counts := map[key]int{}
order := []key{}
for _, disk := range hw.Storage {
sz := 0
if disk.SizeGB != nil {
sz = *disk.SizeGB
}
iface := ""
if disk.Interface != nil {
iface = *disk.Interface
} else if disk.Type != nil {
iface = *disk.Type
}
k := key{sz, iface}
if counts[k] == 0 {
order = append(order, k)
}
counts[k]++
}
if len(order) == 0 {
return "—"
}
parts := make([]string, 0, len(order))
for _, k := range order {
var sizeStr string
if k.sizeGB >= 1000 {
sizeStr = fmt.Sprintf("%.2g TB", float64(k.sizeGB)/1000)
} else if k.sizeGB > 0 {
sizeStr = fmt.Sprintf("%d GB", k.sizeGB)
} else {
sizeStr = "?"
}
desc := fmt.Sprintf("%d× %s", counts[k], sizeStr)
if k.iface != "" {
desc += " " + k.iface
}
parts = append(parts, desc)
}
return strings.Join(parts, ", ")
}
// hwDescribeGPU returns a summary like "8× NVIDIA H100 80GB".
func hwDescribeGPU(hw schema.HardwareSnapshot) string {
counts := map[string]int{}
order := []string{}
for _, dev := range hw.PCIeDevices {
if dev.DeviceClass == nil {
continue
}
if !isGPUDeviceClass(*dev.DeviceClass) {
continue
}
model := "Unknown GPU"
if dev.Model != nil && *dev.Model != "" {
model = *dev.Model
}
if counts[model] == 0 {
order = append(order, model)
}
counts[model]++
}
if len(order) == 0 {
return "—"
}
parts := make([]string, 0, len(order))
for _, m := range order {
if counts[m] > 1 {
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
} else {
parts = append(parts, m)
}
}
return strings.Join(parts, ", ")
}
// hwDescribePSU returns a summary like "2× 1600 W" or "2× PSU".
func hwDescribePSU(hw schema.HardwareSnapshot) string {
n := len(hw.PowerSupplies)
if n == 0 {
return "—"
}
// Try to get a consistent wattage
watt := 0
consistent := true
for _, psu := range hw.PowerSupplies {
if psu.WattageW == nil {
consistent = false
break
}
if watt == 0 {
watt = *psu.WattageW
} else if *psu.WattageW != watt {
consistent = false
break
}
}
if consistent && watt > 0 {
return fmt.Sprintf("%d× %d W", n, watt)
}
return fmt.Sprintf("%d× PSU", n)
}
// hwDescribeNIC returns a summary like "2× Mellanox ConnectX-6".
func hwDescribeNIC(hw schema.HardwareSnapshot) string {
counts := map[string]int{}
order := []string{}
for _, dev := range hw.PCIeDevices {
isNIC := false
if dev.DeviceClass != nil {
c := strings.ToLower(strings.TrimSpace(*dev.DeviceClass))
isNIC = c == "ethernetcontroller" || c == "networkcontroller" || strings.Contains(c, "fibrechannel")
}
if !isNIC && len(dev.MacAddresses) == 0 {
continue
}
model := ""
if dev.Model != nil && *dev.Model != "" {
model = *dev.Model
} else if dev.Manufacturer != nil && *dev.Manufacturer != "" {
model = *dev.Manufacturer + " NIC"
} else {
model = "NIC"
}
if counts[model] == 0 {
order = append(order, model)
}
counts[model]++
}
if len(order) == 0 {
return ""
}
parts := make([]string, 0, len(order))
for _, m := range order {
if counts[m] > 1 {
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
} else {
parts = append(parts, m)
}
}
return strings.Join(parts, ", ")
}
func isGPUDeviceClass(class string) bool {
switch strings.TrimSpace(class) {
case "VideoController", "DisplayController", "ProcessingAccelerator":
return true
default:
return false
}
}
func renderAuditModal() string { func renderAuditModal() string {
return `<div id="audit-modal-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.5);z-index:100;align-items:center;justify-content:center"> return `<div id="audit-modal-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.5);z-index:100;align-items:center;justify-content:center">
<div style="background:#fff;border-radius:6px;padding:24px;min-width:480px;max-width:1100px;width:min(1100px,92vw);max-height:92vh;overflow:auto;position:relative"> <div style="background:#fff;border-radius:6px;padding:24px;min-width:480px;max-width:1100px;width:min(1100px,92vw);max-height:92vh;overflow:auto;position:relative">
@@ -482,7 +647,6 @@ func renderHealthCard(opts HandlerOptions) string {
buildRuntimeToolsRow(health), buildRuntimeToolsRow(health),
buildRuntimeServicesRow(health), buildRuntimeServicesRow(health),
} }
rows = append(rows, buildHardwareComponentRows(opts.ExportDir)...)
b.WriteString(`<table><thead><tr><th>Check</th><th>Status</th><th>Source</th><th>Issue</th></tr></thead><tbody>`) b.WriteString(`<table><thead><tr><th>Check</th><th>Status</th><th>Source</th><th>Issue</th></tr></thead><tbody>`)
for _, row := range rows { for _, row := range rows {
b.WriteString(`<tr><td>` + html.EscapeString(row.Title) + `</td><td>` + runtimeStatusBadge(row.Status) + `</td><td>` + html.EscapeString(row.Source) + `</td><td>` + rowIssueHTML(row.Issue) + `</td></tr>`) b.WriteString(`<tr><td>` + html.EscapeString(row.Title) + `</td><td>` + runtimeStatusBadge(row.Status) + `</td><td>` + html.EscapeString(row.Source) + `</td><td>` + rowIssueHTML(row.Issue) + `</td></tr>`)
@@ -1031,25 +1195,26 @@ func renderValidate(opts HandlerOptions) string {
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div> return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p> <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px"> <div class="card" style="margin-bottom:16px">
<div class="card-head">Validate Profile</div> <div class="card-head">Validate Profile</div>
<div class="card-body validate-profile-body"> <div class="card-body validate-profile-body">
<div class="validate-profile-col"> <div class="validate-profile-col">
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div> <div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div> </div>
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label> <div class="validate-profile-col">
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~3060 min)</span></label> <div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
</div> <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
<div class="validate-profile-col validate-profile-action"> <label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~3060 min)</span></label>
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~515 min total); Stress is thorough (~3060 min total).</p> </div>
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button> <div class="validate-profile-col validate-profile-action">
</div> <p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~515 min total); Stress is thorough (~3060 min total).</p>
<div class="validate-profile-col"></div> <button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
</div> <div style="margin-top:12px">
<div class="card-body" style="padding-top:0;display:flex;justify-content:center"> <span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span> </div>
</div> </div>
</div> </div>
</div>
<div class="grid3"> <div class="grid3">
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody( ` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
@@ -1156,7 +1321,7 @@ func renderValidate(opts HandlerOptions) string {
</div> </div>
<style> <style>
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; } .validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
.validate-profile-col { min-width:0; } .validate-profile-col { min-width:0; display:flex; flex-direction:column; }
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; } .validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
.validate-card-body { padding:0; } .validate-card-body { padding:0; }
.validate-card-section { padding:12px 16px 0; } .validate-card-section { padding:12px 16px 0; }
@@ -1188,7 +1353,7 @@ function satModeChanged() {
}); });
} }
function satLabels() { function satLabels() {
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'}; return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
} }
let satNvidiaGPUsPromise = null; let satNvidiaGPUsPromise = null;
function loadSatNvidiaGPUs() { function loadSatNvidiaGPUs() {
@@ -1437,8 +1602,8 @@ function runAllSAT() {
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1); const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
const status = document.getElementById('sat-all-status'); const status = document.getElementById('sat-all-status');
status.textContent = 'Enqueuing...'; status.textContent = 'Enqueuing...';
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth']; const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth', 'hpl'];
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets()); const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','hpl','memory','storage','cpu'].concat(selectedAMDValidateTargets());
const activeTargets = baseTargets.filter(target => { const activeTargets = baseTargets.filter(target => {
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false; if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
const btn = document.getElementById('sat-btn-' + target); const btn = document.getElementById('sat-btn-' + target);
@@ -2082,7 +2247,7 @@ func benchmarkHistoryParallelLabel(serverModel, gpuName string, count int) strin
func renderBurn() string { func renderBurn() string {
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div> return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>&#9888; Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics and ` + "targeted_stress" + ` remain in <a href="/validate">Validate</a>. Burn exposes official NVIDIA load recipes by test goal plus separate custom stress tools.</div> <div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `), NCCL, NVBandwidth, and LINPACK remain in <a href="/validate">Validate → Stress mode</a>. Burn exposes sustained GPU compute load recipes.</div>
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p> <p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
<div class="card" style="margin-bottom:16px"> <div class="card" style="margin-bottom:16px">
@@ -2095,11 +2260,11 @@ func renderBurn() string {
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label> <label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
</div> </div>
<div class="burn-profile-col burn-profile-action"> <div class="burn-profile-col burn-profile-action">
<button class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button> <button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
<p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p> <p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
</div> </div>
<div class="burn-profile-col burn-profile-action"> <div class="burn-profile-col burn-profile-action">
<button class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button> <button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p> <p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
</div> </div>
</div> </div>
@@ -2116,12 +2281,16 @@ func renderBurn() string {
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button> <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button> <button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
</div> </div>
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px"> <div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p> <p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
</div> </div>
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p> <p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
</div> <label class="cb-row" style="margin-top:10px">
</div> <input type="checkbox" id="burn-stagger-nvidia">
<span>Ramp selected NVIDIA GPUs one by one before full-load hold. Uses a 3-minute stabilization window per GPU, then keeps all selected GPUs under load for the chosen Burn Profile duration.</span>
</label>
</div>
</div>
<div class="burn-section">Core Burn Paths</div> <div class="burn-section">Core Burn Paths</div>
<div class="grid2 burn-grid" style="margin-bottom:16px"> <div class="grid2 burn-grid" style="margin-bottom:16px">
@@ -2147,10 +2316,6 @@ func renderBurn() string {
</div> </div>
</div> </div>
<div class="burn-section">GPU-Specific Tests</div>
<div class="grid2 burn-grid" style="margin-bottom:16px">
</div>
<div id="bi-output" style="display:none;margin-top:16px" class="card"> <div id="bi-output" style="display:none;margin-top:16px" class="card">
<div class="card-head">Output <span id="bi-title"></span></div> <div class="card-head">Output <span id="bi-title"></span></div>
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div> <div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
@@ -2199,6 +2364,11 @@ function burnSelectedGPUIndices() {
.sort(function(a, b) { return a - b; }); .sort(function(a, b) { return a - b; });
} }
function burnUseNvidiaRampUp() {
const el = document.getElementById('burn-stagger-nvidia');
return !!(el && el.checked);
}
function burnUpdateSelectionNote() { function burnUpdateSelectionNote() {
const note = document.getElementById('burn-selection-note'); const note = document.getElementById('burn-selection-note');
const selected = burnSelectedGPUIndices(); const selected = burnSelectedGPUIndices();
@@ -2258,6 +2428,9 @@ function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
return Promise.reject(new Error('Select at least one NVIDIA GPU.')); return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
} }
body.gpu_indices = selected; body.gpu_indices = selected;
if (burnUseNvidiaRampUp() && selected.length > 1) {
body.stagger_gpu_start = true;
}
} }
return fetch('/api/sat/' + target + '/run', { return fetch('/api/sat/' + target + '/run', {
method: 'POST', method: 'POST',
@@ -2849,55 +3022,6 @@ usbRefresh();
</script>` </script>`
} }
// ── Display Resolution ────────────────────────────────────────────────────────
func renderDisplayInline() string {
return `<div id="display-status" style="color:var(--muted);font-size:13px;margin-bottom:12px">Loading displays...</div>
<div id="display-controls"></div>
<script>
(function(){
function loadDisplays() {
fetch('/api/display/resolutions').then(r=>r.json()).then(displays => {
const status = document.getElementById('display-status');
const ctrl = document.getElementById('display-controls');
if (!displays || displays.length === 0) {
status.textContent = 'No connected displays found or xrandr not available.';
return;
}
status.textContent = '';
ctrl.innerHTML = displays.map(d => {
const opts = (d.modes||[]).map(m =>
'<option value="'+m.mode+'"'+(m.current?' selected':'')+'>'+m.mode+(m.current?' (current)':'')+'</option>'
).join('');
return '<div style="margin-bottom:12px">'
+'<span style="font-weight:600;margin-right:8px">'+d.output+'</span>'
+'<span style="color:var(--muted);font-size:12px;margin-right:12px">Current: '+d.current+'</span>'
+'<select id="res-sel-'+d.output+'" style="margin-right:8px">'+opts+'</select>'
+'<button class="btn btn-sm btn-primary" onclick="applyResolution(\''+d.output+'\')">Apply</button>'
+'</div>';
}).join('');
}).catch(()=>{
document.getElementById('display-status').textContent = 'xrandr not available on this system.';
});
}
window.applyResolution = function(output) {
const sel = document.getElementById('res-sel-'+output);
if (!sel) return;
const mode = sel.value;
const btn = sel.nextElementSibling;
btn.disabled = true;
btn.textContent = 'Applying...';
fetch('/api/display/set', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify({output:output,mode:mode})})
.then(r=>r.json()).then(d=>{
if (d.error) { alert('Error: '+d.error); }
loadDisplays();
}).catch(e=>{ alert('Error: '+e); })
.finally(()=>{ btn.disabled=false; btn.textContent='Apply'; });
};
loadDisplays();
})();
</script>`
}
func renderNvidiaSelfHealInline() string { func renderNvidiaSelfHealInline() string {
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p> return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
@@ -3086,8 +3210,6 @@ function installToRAM() {
<div class="card"><div class="card-head">Services</div><div class="card-body">` + <div class="card"><div class="card-head">Services</div><div class="card-body">` +
renderServicesInline() + `</div></div> renderServicesInline() + `</div></div>
<div class="card"><div class="card-head">Display Resolution</div><div class="card-body">` +
renderDisplayInline() + `</div></div>
<script> <script>
function checkTools() { function checkTools() {

View File

@@ -295,10 +295,6 @@ func NewHandler(opts HandlerOptions) http.Handler {
// Tools // Tools
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck) mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
// Display
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
// GPU presence / tools // GPU presence / tools
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence) mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs) mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)

View File

@@ -741,8 +741,8 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
for _, needle := range []string{ for _, needle := range []string{
`NVIDIA Max Compute Load`, `NVIDIA Max Compute Load`,
`dcgmproftester`, `dcgmproftester`,
`targeted_stress remain in <a href="/validate">Validate</a>`, `NCCL`,
`NVIDIA Interconnect Test (NCCL all_reduce_perf)`, `Validate → Stress mode`,
`id="burn-gpu-list"`, `id="burn-gpu-list"`,
} { } {
if !strings.Contains(body, needle) { if !strings.Contains(body, needle) {
@@ -1094,6 +1094,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
} }
body := rec.Body.String() body := rec.Body.String()
for _, needle := range []string{ for _, needle := range []string{
// Runtime Health card — LiveCD checks only
`Runtime Health`, `Runtime Health`,
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`, `<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
`Export Directory`, `Export Directory`,
@@ -1102,16 +1103,18 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
`CUDA / ROCm`, `CUDA / ROCm`,
`Required Utilities`, `Required Utilities`,
`Bee Services`, `Bee Services`,
`<td>CPU</td>`,
`<td>Memory</td>`,
`<td>Storage</td>`,
`<td>GPU</td>`,
`CUDA runtime is not ready for GPU SAT.`, `CUDA runtime is not ready for GPU SAT.`,
`Missing: nvidia-smi`, `Missing: nvidia-smi`,
`bee-nvidia=inactive`, `bee-nvidia=inactive`,
`cpu SAT: FAILED`, // Hardware Summary card — component health badges
`storage SAT: FAILED`, `Hardware Summary`,
`sat:nvidia`, `>CPU<`,
`>Memory<`,
`>Storage<`,
`>GPU<`,
`>PSU<`,
`badge-warn`, // cpu Warning badge
`badge-err`, // storage Critical badge
} { } {
if !strings.Contains(body, needle) { if !strings.Contains(body, needle) {
t.Fatalf("dashboard missing %q: %s", needle, body) t.Fatalf("dashboard missing %q: %s", needle, body)

View File

@@ -118,6 +118,7 @@ type taskParams struct {
StressMode bool `json:"stress_mode,omitempty"` StressMode bool `json:"stress_mode,omitempty"`
GPUIndices []int `json:"gpu_indices,omitempty"` GPUIndices []int `json:"gpu_indices,omitempty"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"` ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
StaggerGPUStart bool `json:"stagger_gpu_start,omitempty"`
SizeMB int `json:"size_mb,omitempty"` SizeMB int `json:"size_mb,omitempty"`
Passes int `json:"passes,omitempty"` Passes int `json:"passes,omitempty"`
Loader string `json:"loader,omitempty"` Loader string `json:"loader,omitempty"`
@@ -162,6 +163,13 @@ func resolveBurnPreset(profile string) burnPreset {
} }
} }
func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
if enabled && len(selected) > 1 {
return 180
}
return 0
}
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions { func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
acceptanceCycles := []platform.PlatformStressCycle{ acceptanceCycles := []platform.PlatformStressCycle{
{LoadSec: 85, IdleSec: 5}, {LoadSec: 85, IdleSec: 5},
@@ -592,7 +600,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
RunNCCL: t.params.RunNCCL, RunNCCL: t.params.RunNCCL,
ParallelGPUs: t.params.ParallelGPUs, ParallelGPUs: t.params.ParallelGPUs,
}, j.append) }, j.append)
case "nvidia-compute": case "nvidia-compute":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
break break
@@ -601,7 +609,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
if t.params.BurnProfile != "" && dur <= 0 { if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
} }
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append) staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
if staggerSec > 0 {
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
}
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
case "nvidia-targeted-power": case "nvidia-targeted-power":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")
@@ -651,12 +663,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
if t.params.BurnProfile != "" && dur <= 0 { if t.params.BurnProfile != "" && dur <= 0 {
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
} }
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{ archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
DurationSec: dur, DurationSec: dur,
Loader: t.params.Loader, Loader: t.params.Loader,
GPUIndices: t.params.GPUIndices, GPUIndices: t.params.GPUIndices,
ExcludeGPUIndices: t.params.ExcludeGPUIndices, ExcludeGPUIndices: t.params.ExcludeGPUIndices,
}, j.append) StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
}, j.append)
case "memory": case "memory":
if a == nil { if a == nil {
err = fmt.Errorf("app not configured") err = fmt.Errorf("app not configured")

2
bible

Submodule bible updated: 688b87e98d...1d89a4918e

View File

@@ -1,9 +1,9 @@
set color_normal=light-gray/black set color_normal=light-gray/black
set color_highlight=white/dark-gray set color_highlight=yellow/black
if [ -e /boot/grub/splash.png ]; then if [ -e /boot/grub/splash.png ]; then
set theme=/boot/grub/live-theme/theme.txt set theme=/boot/grub/live-theme/theme.txt
else else
set menu_color_normal=cyan/black set menu_color_normal=yellow/black
set menu_color_highlight=white/dark-gray set menu_color_highlight=white/brown
fi fi

View File

@@ -10,20 +10,15 @@ import os
W, H = 1920, 1080 W, H = 1920, 1080
GLYPHS = { ASCII_ART = [
'E': ["11111", "10000", "11110", "10000", "10000", "10000", "11111"], " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
'A': ["01110", "10001", "10001", "11111", "10001", "10001", "10001"], " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
'S': ["01111", "10000", "10000", "01110", "00001", "00001", "11110"], " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
'Y': ["10001", "10001", "01010", "00100", "00100", "00100", "00100"], " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
'B': ["11110", "10001", "10001", "11110", "10001", "10001", "11110"], " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
'-': ["00000", "00000", "11111", "00000", "00000", "00000", "00000"], " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
} ]
SUBTITLE = " Hardware Audit LiveCD"
TITLE = "EASY-BEE"
SUBTITLE = "Hardware Audit LiveCD"
CELL = 30
GLYPH_GAP = 18
ROW_GAP = 6
FG = (0xF6, 0xD0, 0x47) FG = (0xF6, 0xD0, 0x47)
FG_DIM = (0xD4, 0xA9, 0x1C) FG_DIM = (0xD4, 0xA9, 0x1C)
@@ -31,6 +26,12 @@ SHADOW = (0x5E, 0x47, 0x05)
SUB = (0x96, 0x7A, 0x17) SUB = (0x96, 0x7A, 0x17)
BG = (0x05, 0x05, 0x05) BG = (0x05, 0x05, 0x05)
MONO_FONT_CANDIDATES = [
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
]
SUB_FONT_CANDIDATES = [ SUB_FONT_CANDIDATES = [
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf', '/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf', '/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
@@ -39,43 +40,34 @@ SUB_FONT_CANDIDATES = [
] ]
def load_font(size): def load_font(candidates, size):
for path in SUB_FONT_CANDIDATES: for path in candidates:
if os.path.exists(path): if os.path.exists(path):
return ImageFont.truetype(path, size) return ImageFont.truetype(path, size)
return ImageFont.load_default() return ImageFont.load_default()
def glyph_width(ch): def mono_metrics(font):
return len(GLYPHS[ch][0]) probe = Image.new('L', (W, H), 0)
draw = ImageDraw.Draw(probe)
char_w = int(round(draw.textlength("M", font=font)))
bb = draw.textbbox((0, 0), "Mg", font=font)
char_h = bb[3] - bb[1]
return char_w, char_h
def render_logo_mask(): def render_ascii_mask(font, lines, char_w, char_h, line_gap):
width_cells = 0 width = max(len(line) for line in lines) * char_w
for idx, ch in enumerate(TITLE): height = len(lines) * char_h + line_gap * (len(lines) - 1)
width_cells += glyph_width(ch) mask = Image.new('L', (width, height), 0)
if idx != len(TITLE) - 1:
width_cells += 1
mask_w = width_cells * CELL + (len(TITLE) - 1) * GLYPH_GAP
mask_h = 7 * CELL + 6 * ROW_GAP
mask = Image.new('L', (mask_w, mask_h), 0)
draw = ImageDraw.Draw(mask) draw = ImageDraw.Draw(mask)
for row, line in enumerate(lines):
cx = 0 y = row * (char_h + line_gap)
for idx, ch in enumerate(TITLE): for col, ch in enumerate(line):
glyph = GLYPHS[ch] if ch == ' ':
for row_idx, row in enumerate(glyph): continue
for col_idx, cell in enumerate(row): x = col * char_w
if cell != '1': draw.text((x, y), ch, font=font, fill=255)
continue
x0 = cx + col_idx * CELL
y0 = row_idx * (CELL + ROW_GAP)
x1 = x0 + CELL - 4
y1 = y0 + CELL - 4
draw.rounded_rectangle((x0, y0, x1, y1), radius=4, fill=255)
cx += glyph_width(ch) * CELL
if idx != len(TITLE) - 1:
cx += CELL + GLYPH_GAP
return mask return mask
@@ -90,20 +82,28 @@ glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
glow = glow.filter(ImageFilter.GaussianBlur(60)) glow = glow.filter(ImageFilter.GaussianBlur(60))
img = Image.alpha_composite(img.convert('RGBA'), glow) img = Image.alpha_composite(img.convert('RGBA'), glow)
logo_mask = render_logo_mask() TARGET_LOGO_W = 400
max_chars = max(len(line) for line in ASCII_ART)
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
_probe_cw, _ = mono_metrics(_probe_font)
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
char_w, char_h = mono_metrics(font_logo)
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
logo_w, logo_h = logo_mask.size logo_w, logo_h = logo_mask.size
logo_x = (W - logo_w) // 2 logo_x = (W - logo_w) // 2
logo_y = 290 logo_y = 380
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2)) sh_off = max(1, font_size_logo // 6)
img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask) shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask) img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
img.paste(FG, (logo_x, logo_y), logo_mask) img.paste(FG, (logo_x, logo_y), logo_mask)
font_sub = load_font(30) font_sub = load_font(SUB_FONT_CANDIDATES, 30)
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub) sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2 sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
sub_y = logo_y + logo_h + 54 sub_y = logo_y + logo_h + 48
draw = ImageDraw.Draw(img) draw = ImageDraw.Draw(img)
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6)) draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB) draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)

View File

@@ -0,0 +1,110 @@
#!/bin/sh
set -eu
SECONDS=300
STAGGER_SECONDS=180
DEVICES=""
EXCLUDE=""
usage() {
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]" >&2
exit 2
}
normalize_list() {
echo "${1:-}" | tr ',' '\n' | sed 's/[[:space:]]//g' | awk 'NF' | sort -n | uniq | paste -sd, -
}
contains_csv() {
needle="$1"
haystack="${2:-}"
echo ",${haystack}," | grep -q ",${needle},"
}
resolve_dcgmproftester() {
for candidate in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
if command -v "${candidate}" >/dev/null 2>&1; then
command -v "${candidate}"
return 0
fi
done
return 1
}
while [ "$#" -gt 0 ]; do
case "$1" in
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
*) usage ;;
esac
done
PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }
DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
SELECTED="${ALL_DEVICES}"
fi
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
[ -n "${id}" ] || continue
if contains_csv "${id}" "${EXCLUDE}"; then
continue
fi
if [ -z "${FINAL}" ]; then
FINAL="${id}"
else
FINAL="${FINAL},${id}"
fi
done
[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }
echo "loader=dcgmproftester-staggered"
echo "selected_gpus=${FINAL}"
echo "stagger_seconds=${STAGGER_SECONDS}"
TMP_DIR=$(mktemp -d)
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
gpu_pos=0
WORKERS=""
for id in $(echo "${FINAL}" | tr ',' ' '); do
gpu_pos=$((gpu_pos + 1))
log="${TMP_DIR}/gpu-${id}.log"
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
gpu_seconds=$(( SECONDS + extra_sec ))
echo "starting gpu ${id} seconds=${gpu_seconds}"
CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
pid=$!
WORKERS="${WORKERS} ${pid}:${id}:${log}"
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
sleep "${STAGGER_SECONDS}"
fi
done
status=0
for spec in ${WORKERS}; do
pid=${spec%%:*}
rest=${spec#*:}
id=${rest%%:*}
log=${rest#*:}
if wait "${pid}"; then
echo "gpu ${id} finished: OK"
else
rc=$?
echo "gpu ${id} finished: FAILED rc=${rc}"
status=1
fi
sed "s/^/[gpu ${id}] /" "${log}" || true
done
exit "${status}"

17
iso/overlay/usr/local/bin/bee-gpu-burn Normal file → Executable file
View File

@@ -2,13 +2,14 @@
set -eu set -eu
SECONDS=5 SECONDS=5
STAGGER_SECONDS=0
SIZE_MB=0 SIZE_MB=0
DEVICES="" DEVICES=""
EXCLUDE="" EXCLUDE=""
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker" WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
usage() { usage() {
echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2 echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
exit 2 exit 2
} }
@@ -25,6 +26,7 @@ contains_csv() {
while [ "$#" -gt 0 ]; do while [ "$#" -gt 0 ]; do
case "$1" in case "$1" in
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;; --seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;; --size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;; --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;; --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
@@ -61,14 +63,18 @@ done
echo "loader=bee-gpu-burn" echo "loader=bee-gpu-burn"
echo "selected_gpus=${FINAL}" echo "selected_gpus=${FINAL}"
echo "stagger_seconds=${STAGGER_SECONDS}"
export CUDA_DEVICE_ORDER="PCI_BUS_ID" export CUDA_DEVICE_ORDER="PCI_BUS_ID"
TMP_DIR=$(mktemp -d) TMP_DIR=$(mktemp -d)
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
gpu_pos=0
WORKERS="" WORKERS=""
for id in $(echo "${FINAL}" | tr ',' ' '); do for id in $(echo "${FINAL}" | tr ',' ' '); do
gpu_pos=$((gpu_pos + 1))
log="${TMP_DIR}/gpu-${id}.log" log="${TMP_DIR}/gpu-${id}.log"
gpu_size_mb="${SIZE_MB}" gpu_size_mb="${SIZE_MB}"
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
@@ -79,11 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
gpu_size_mb=512 gpu_size_mb=512
fi fi
fi fi
echo "starting gpu ${id} size=${gpu_size_mb}MB" extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
gpu_seconds=$(( SECONDS + extra_sec ))
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
CUDA_VISIBLE_DEVICES="${id}" \ CUDA_VISIBLE_DEVICES="${id}" \
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 & "${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
pid=$! pid=$!
WORKERS="${WORKERS} ${pid}:${id}:${log}" WORKERS="${WORKERS} ${pid}:${id}:${log}"
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
sleep "${STAGGER_SECONDS}"
fi
done done
status=0 status=0

16
iso/overlay/usr/local/bin/bee-john-gpu-stress Normal file → Executable file
View File

@@ -2,6 +2,7 @@
set -eu set -eu
DURATION_SEC=300 DURATION_SEC=300
STAGGER_SECONDS=0
DEVICES="" DEVICES=""
EXCLUDE="" EXCLUDE=""
FORMAT="" FORMAT=""
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
usage() { usage() {
echo "usage: $0 [--seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2 echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
exit 2 exit 2
} }
@@ -118,6 +119,7 @@ ensure_opencl_ready() {
while [ "$#" -gt 0 ]; do while [ "$#" -gt 0 ]; do
case "$1" in case "$1" in
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;; --seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;; --devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;; --exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;; --format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
@@ -170,6 +172,7 @@ done
echo "loader=john" echo "loader=john"
echo "selected_gpus=${FINAL}" echo "selected_gpus=${FINAL}"
echo "john_devices=${JOHN_DEVICES}" echo "john_devices=${JOHN_DEVICES}"
echo "stagger_seconds=${STAGGER_SECONDS}"
cd "${JOHN_DIR}" cd "${JOHN_DIR}"
@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
echo "format=${CHOSEN_FORMAT}" echo "format=${CHOSEN_FORMAT}"
echo "target_seconds=${DURATION_SEC}" echo "target_seconds=${DURATION_SEC}"
echo "slice_seconds=${TEST_SLICE_SECONDS}" echo "slice_seconds=${TEST_SLICE_SECONDS}"
DEADLINE=$(( $(date +%s) + DURATION_SEC )) TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
_first=1 _first=1
pos=0
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
pos=$((pos + 1))
[ "${_first}" = "1" ] || sleep 3 [ "${_first}" = "1" ] || sleep 3
_first=0 _first=0
run_john_loop "${opencl_id}" "${DEADLINE}" & extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
run_john_loop "${opencl_id}" "${deadline}" &
pid=$! pid=$!
PIDS="${PIDS} ${pid}" PIDS="${PIDS} ${pid}"
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
sleep "${STAGGER_SECONDS}"
fi
done done
FAIL=0 FAIL=0
for pid in ${PIDS}; do for pid in ${PIDS}; do

View File

@@ -21,8 +21,13 @@ read_nvidia_modules_flavor() {
log "kernel: $(uname -r)" log "kernel: $(uname -r)"
# Skip if no NVIDIA GPU present (PCI vendor 10de) # Skip if no NVIDIA display/compute GPU is present.
if ! lspci -nn 2>/dev/null | grep -qi '10de:'; then # Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
have_nvidia_gpu() {
lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
}
if ! have_nvidia_gpu; then
log "no NVIDIA GPU detected — skipping module load" log "no NVIDIA GPU detected — skipping module load"
exit 0 exit 0
fi fi

View File

@@ -14,7 +14,7 @@ log() {
} }
have_nvidia_gpu() { have_nvidia_gpu() {
lspci -nn 2>/dev/null | grep -qi '10de:' lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
} }
service_active() { service_active() {