Compare commits
20 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4f94ebcb2c | ||
|
|
05c1fde233 | ||
| 825ef6b98a | |||
| ba16021cdb | |||
|
|
bb1218ddd4 | ||
|
|
65faae8ede | ||
| 05241f2e0e | |||
|
|
c1690a084b | ||
|
|
9481ca2805 | ||
|
|
a78fdadd88 | ||
|
|
4ef403898f | ||
| 025548ab3c | |||
|
|
e0d94d7f47 | ||
|
|
13899aa864 | ||
|
|
f345d8a89d | ||
|
|
4715059ac0 | ||
|
|
0660a40287 | ||
|
|
67369d9b7b | ||
|
|
3f41a026ca | ||
|
|
0ee4f46537 |
@@ -117,7 +117,7 @@ type satRunner interface {
|
||||
RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBenchmark(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error)
|
||||
RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error)
|
||||
@@ -139,7 +139,6 @@ type satRunner interface {
|
||||
RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error)
|
||||
RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error)
|
||||
RunNCCLTests(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error)
|
||||
}
|
||||
|
||||
type runtimeChecker interface {
|
||||
@@ -191,6 +190,7 @@ func (a *App) RunAudit(runtimeMode runtimeenv.Mode, output string) (string, erro
|
||||
}
|
||||
result := collector.Run(runtimeMode)
|
||||
applyLatestSATStatuses(&result.Hardware, DefaultSATBaseDir, a.StatusDB)
|
||||
writePSUStatusesToDB(a.StatusDB, result.Hardware.PowerSupplies)
|
||||
if health, err := ReadRuntimeHealth(DefaultRuntimeJSONPath); err == nil {
|
||||
result.Runtime = &health
|
||||
}
|
||||
@@ -567,11 +567,11 @@ func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts pl
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
@@ -738,13 +738,6 @@ func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunHPL(ctx context.Context, baseDir string, opts platform.HPLOptions, logFunc func(string)) (string, *platform.HPLResult, error) {
|
||||
if a == nil {
|
||||
return "", nil, fmt.Errorf("app not configured")
|
||||
}
|
||||
return a.sat.RunHPL(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||
body := formatFanStressResult(path)
|
||||
@@ -934,6 +927,41 @@ func bodyOr(body, fallback string) string {
|
||||
return body
|
||||
}
|
||||
|
||||
// writePSUStatusesToDB records PSU statuses collected during audit into the
|
||||
// component-status DB so they are visible in the Hardware Summary card.
|
||||
// PSU status is sourced from IPMI (ipmitool fru + sdr) during audit.
|
||||
func writePSUStatusesToDB(db *ComponentStatusDB, psus []schema.HardwarePowerSupply) {
|
||||
if db == nil || len(psus) == 0 {
|
||||
return
|
||||
}
|
||||
const source = "audit:ipmi"
|
||||
worstStatus := "OK"
|
||||
for _, psu := range psus {
|
||||
if psu.Status == nil {
|
||||
continue
|
||||
}
|
||||
slot := "?"
|
||||
if psu.Slot != nil {
|
||||
slot = *psu.Slot
|
||||
}
|
||||
st := *psu.Status
|
||||
detail := ""
|
||||
if psu.ErrorDescription != nil {
|
||||
detail = *psu.ErrorDescription
|
||||
}
|
||||
db.Record("psu:"+slot, source, st, detail)
|
||||
switch st {
|
||||
case "Critical":
|
||||
worstStatus = "Critical"
|
||||
case "Warning":
|
||||
if worstStatus != "Critical" {
|
||||
worstStatus = "Warning"
|
||||
}
|
||||
}
|
||||
}
|
||||
db.Record("psu:all", source, worstStatus, "")
|
||||
}
|
||||
|
||||
func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
|
||||
@@ -161,7 +161,7 @@ func (f fakeSAT) RunNvidiaTargetedStressValidatePack(_ context.Context, baseDir
|
||||
return f.runNvidiaFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ func(string)) (string, error) {
|
||||
func (f fakeSAT) RunNvidiaOfficialComputePack(_ context.Context, baseDir string, durationSec int, gpuIndices []int, _ int, _ func(string)) (string, error) {
|
||||
if f.runNvidiaComputeFn != nil {
|
||||
return f.runNvidiaComputeFn(baseDir, durationSec, gpuIndices)
|
||||
}
|
||||
@@ -282,9 +282,6 @@ func (f fakeSAT) RunPlatformStress(_ context.Context, _ string, _ platform.Platf
|
||||
func (f fakeSAT) RunNCCLTests(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
func (f fakeSAT) RunHPL(_ context.Context, _ string, _ platform.HPLOptions, _ func(string)) (string, *platform.HPLResult, error) {
|
||||
return "", nil, nil
|
||||
}
|
||||
|
||||
func TestNetworkStatusFormatsInterfacesAndRoute(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -545,8 +542,6 @@ func TestActionResultsUseFallbackBody(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
@@ -583,8 +578,6 @@ func TestExportSupportBundleResultMentionsUnmountedUSB(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestExportSupportBundleResultDoesNotPretendSuccessOnError(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
oldExportDir := DefaultExportDir
|
||||
DefaultExportDir = tmp
|
||||
@@ -646,8 +639,6 @@ func TestRunNvidiaAcceptancePackResult(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestRunSATDefaultsToExportDir(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldSATBaseDir := DefaultSATBaseDir
|
||||
DefaultSATBaseDir = "/tmp/export/bee-sat"
|
||||
t.Cleanup(func() { DefaultSATBaseDir = oldSATBaseDir })
|
||||
|
||||
@@ -54,7 +54,7 @@ if ! command -v lspci >/dev/null 2>&1; then
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for gpu in $(lspci -Dn | awk '$3 ~ /^10de:/ {print $1}'); do
|
||||
for gpu in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ {print $1}'); do
|
||||
found=1
|
||||
echo "=== GPU $gpu ==="
|
||||
lspci -s "$gpu" -vv 2>&1 || true
|
||||
@@ -73,8 +73,13 @@ fi
|
||||
{name: "system/pcie-nvidia-link.txt", cmd: []string{"sh", "-c", `
|
||||
for d in /sys/bus/pci/devices/*/; do
|
||||
vendor=$(cat "$d/vendor" 2>/dev/null)
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
dev=$(basename "$d")
|
||||
[ "$vendor" = "0x10de" ] || continue
|
||||
class=$(cat "$d/class" 2>/dev/null)
|
||||
case "$class" in
|
||||
0x030000|0x030200) ;;
|
||||
*) continue ;;
|
||||
esac
|
||||
dev=$(basename "$d")
|
||||
echo "=== $dev ==="
|
||||
for f in current_link_speed current_link_width max_link_speed max_link_width; do
|
||||
printf " %-22s %s\n" "$f" "$(cat "$d/$f" 2>/dev/null)"
|
||||
@@ -192,7 +197,7 @@ var supportBundleOptionalFiles = []struct {
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
}
|
||||
|
||||
const supportBundleGlob = "bee-support-*.tar.gz"
|
||||
const supportBundleGlob = "????-??-?? (BEE-SP*)*.tar.gz"
|
||||
|
||||
func BuildSupportBundle(exportDir string) (string, error) {
|
||||
exportDir = strings.TrimSpace(exportDir)
|
||||
@@ -206,9 +211,14 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
host := sanitizeFilename(hostnameOr("unknown"))
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s", host, ts))
|
||||
now := time.Now().UTC()
|
||||
date := now.Format("2006-01-02")
|
||||
tod := now.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -240,7 +250,8 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archivePath := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-%s-%s.tar.gz", host, ts))
|
||||
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
||||
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||
return "", err
|
||||
}
|
||||
@@ -397,6 +408,60 @@ func writeManifest(dst, exportDir, stageRoot string) error {
|
||||
return os.WriteFile(dst, []byte(body.String()), 0644)
|
||||
}
|
||||
|
||||
func bundleVersion() string {
|
||||
v := buildVersion()
|
||||
v = strings.TrimPrefix(v, "v")
|
||||
v = strings.TrimPrefix(v, "V")
|
||||
if v == "" || v == "unknown" {
|
||||
return "0.0"
|
||||
}
|
||||
return v
|
||||
}
|
||||
|
||||
// serverModelForBundle returns the "Product Name" reported by `dmidecode -t 1`
// in a form that is safe to embed in the support-bundle file name: spaces,
// path separators and control characters become "_". It returns "unknown"
// when dmidecode fails or the field is missing or empty.
func serverModelForBundle() string {
	return sanitizeBundleToken(readDMISystemField("Product Name"), true)
}

// serverSerialForBundle returns the "Serial Number" reported by
// `dmidecode -t 1`. Spaces are kept, but path separators and control
// characters become "_" so the value cannot break the bundle file path.
// It returns "unknown" when dmidecode fails or the field is missing or empty.
func serverSerialForBundle() string {
	return sanitizeBundleToken(readDMISystemField("Serial Number"), false)
}

// readDMISystemField runs `dmidecode -t 1` and returns the value of key, or ""
// when the command fails or the key is not present.
func readDMISystemField(key string) string {
	raw, err := exec.Command("dmidecode", "-t", "1").Output()
	if err != nil {
		return ""
	}
	return dmiSystemField(string(raw), key)
}

// dmiSystemField returns the trimmed value of the first "<key>: <value>" line
// in raw dmidecode output, or "" when no such line exists.
func dmiSystemField(raw, key string) string {
	for _, line := range strings.Split(raw, "\n") {
		k, v, ok := strings.Cut(strings.TrimSpace(line), ": ")
		if !ok || strings.TrimSpace(k) != key {
			continue
		}
		return strings.TrimSpace(v)
	}
	return ""
}

// sanitizeBundleToken makes val usable as one component of a file name.
// Path separators and ASCII control characters are always replaced with "_";
// spaces are replaced too when spacesToUnderscore is set. An empty value is
// reported as "unknown".
func sanitizeBundleToken(val string, spacesToUnderscore bool) string {
	cleaned := strings.Map(func(r rune) rune {
		switch {
		case r == '/' || r == '\\':
			// A separator would make filepath.Join point into a subdirectory.
			return '_'
		case r < 0x20 || r == 0x7f:
			return '_'
		case r == ' ' && spacesToUnderscore:
			return '_'
		}
		return r
	}, val)
	if cleaned == "" {
		return "unknown"
	}
	return cleaned
}
|
||||
|
||||
func buildVersion() string {
|
||||
raw, err := exec.Command("bee", "version").CombinedOutput()
|
||||
if err != nil {
|
||||
|
||||
@@ -179,11 +179,3 @@ func commandOutputWithTimeout(timeout time.Duration, name string, args ...string
|
||||
defer cancel()
|
||||
return exec.CommandContext(ctx, name, args...).Output()
|
||||
}
|
||||
|
||||
func interfaceHasCarrier(iface string) bool {
|
||||
raw, err := readNetCarrierFile(iface)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return strings.TrimSpace(raw) == "1"
|
||||
}
|
||||
|
||||
@@ -58,12 +58,10 @@ func enrichPCIeWithNICTelemetry(devs []schema.HardwarePCIeDevice) []schema.Hardw
|
||||
}
|
||||
}
|
||||
|
||||
if interfaceHasCarrier(iface) {
|
||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||
enriched++
|
||||
continue
|
||||
}
|
||||
if out, err := ethtoolModuleQuery(iface); err == nil {
|
||||
if injectSFPDOMTelemetry(&devs[i], out) {
|
||||
enriched++
|
||||
continue
|
||||
}
|
||||
}
|
||||
if len(devs[i].MacAddresses) > 0 || devs[i].Firmware != nil {
|
||||
@@ -115,8 +113,38 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||
}
|
||||
key := strings.ToLower(strings.TrimSpace(trimmed[:idx]))
|
||||
val := strings.TrimSpace(trimmed[idx+1:])
|
||||
if val == "" || strings.EqualFold(val, "not supported") || strings.EqualFold(val, "unknown") {
|
||||
continue
|
||||
}
|
||||
|
||||
switch {
|
||||
case key == "identifier":
|
||||
s := parseSFPIdentifier(val)
|
||||
dev.SFPIdentifier = &s
|
||||
t := true
|
||||
dev.SFPPresent = &t
|
||||
changed = true
|
||||
case key == "connector":
|
||||
s := parseSFPConnector(val)
|
||||
dev.SFPConnector = &s
|
||||
changed = true
|
||||
case key == "vendor name":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPVendor = &s
|
||||
changed = true
|
||||
case key == "vendor pn":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPPartNumber = &s
|
||||
changed = true
|
||||
case key == "vendor sn":
|
||||
s := strings.TrimSpace(val)
|
||||
dev.SFPSerialNumber = &s
|
||||
changed = true
|
||||
case strings.Contains(key, "laser wavelength"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
dev.SFPWavelengthNM = &f
|
||||
changed = true
|
||||
}
|
||||
case strings.Contains(key, "module temperature"):
|
||||
if f, ok := firstFloat(val); ok {
|
||||
dev.SFPTemperatureC = &f
|
||||
@@ -147,12 +175,61 @@ func injectSFPDOMTelemetry(dev *schema.HardwarePCIeDevice, raw string) bool {
|
||||
return changed
|
||||
}
|
||||
|
||||
// parenRe matches the first parenthesised group with non-empty content.
var parenRe = regexp.MustCompile(`\(([^)]+)\)`)

// extractParens returns the trimmed content of the first parenthesised group
// in s, or "" when s has no such group.
func extractParens(s string) string {
	if groups := parenRe.FindStringSubmatch(s); len(groups) >= 2 {
		return strings.TrimSpace(groups[1])
	}
	return ""
}

// parseSFPIdentifier reduces a raw ethtool identifier value to its
// human-readable transceiver type, e.g. "0x03 (SFP)" → "SFP". Values without
// a usable parenthesised label are returned unchanged.
func parseSFPIdentifier(val string) string {
	label := extractParens(val)
	if label == "" {
		return val
	}
	return label
}

// parseSFPConnector reduces a raw ethtool connector value to the connector
// type, e.g. "0x07 (LC)" → "LC". Values without a usable parenthesised label
// are returned unchanged.
func parseSFPConnector(val string) string {
	label := extractParens(val)
	if label == "" {
		return val
	}
	return label
}
|
||||
|
||||
func parseSFPDOM(raw string) map[string]any {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
if !injectSFPDOMTelemetry(&dev, raw) {
|
||||
return map[string]any{}
|
||||
}
|
||||
out := map[string]any{}
|
||||
if dev.SFPPresent != nil {
|
||||
out["sfp_present"] = *dev.SFPPresent
|
||||
}
|
||||
if dev.SFPIdentifier != nil {
|
||||
out["sfp_identifier"] = *dev.SFPIdentifier
|
||||
}
|
||||
if dev.SFPConnector != nil {
|
||||
out["sfp_connector"] = *dev.SFPConnector
|
||||
}
|
||||
if dev.SFPVendor != nil {
|
||||
out["sfp_vendor"] = *dev.SFPVendor
|
||||
}
|
||||
if dev.SFPPartNumber != nil {
|
||||
out["sfp_part_number"] = *dev.SFPPartNumber
|
||||
}
|
||||
if dev.SFPSerialNumber != nil {
|
||||
out["sfp_serial_number"] = *dev.SFPSerialNumber
|
||||
}
|
||||
if dev.SFPWavelengthNM != nil {
|
||||
out["sfp_wavelength_nm"] = *dev.SFPWavelengthNM
|
||||
}
|
||||
if dev.SFPTemperatureC != nil {
|
||||
out["sfp_temperature_c"] = *dev.SFPTemperatureC
|
||||
}
|
||||
|
||||
@@ -122,10 +122,7 @@ func TestEnrichPCIeWithNICTelemetrySkipsModuleQueryWithoutCarrier(t *testing.T)
|
||||
readNetAddressFile = func(string) (string, error) { return "aa:bb:cc:dd:ee:ff", nil }
|
||||
readNetCarrierFile = func(string) (string, error) { return "0", nil }
|
||||
ethtoolInfoQuery = func(string) (string, error) { return "", fmt.Errorf("skip firmware") }
|
||||
ethtoolModuleQuery = func(string) (string, error) {
|
||||
t.Fatal("ethtool -m should not be called without carrier")
|
||||
return "", nil
|
||||
}
|
||||
ethtoolModuleQuery = func(string) (string, error) { return "", fmt.Errorf("no module") }
|
||||
|
||||
class := "EthernetController"
|
||||
bdf := "0000:18:00.0"
|
||||
|
||||
@@ -15,6 +15,7 @@ const nvidiaVendorID = 0x10de
|
||||
type nvidiaGPUInfo struct {
|
||||
Index int
|
||||
BDF string
|
||||
Name string
|
||||
Serial string
|
||||
VBIOS string
|
||||
TemperatureC *float64
|
||||
@@ -73,6 +74,9 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
continue
|
||||
}
|
||||
|
||||
if v := strings.TrimSpace(info.Name); v != "" {
|
||||
devs[i].Model = &v
|
||||
}
|
||||
if v := strings.TrimSpace(info.Serial); v != "" {
|
||||
devs[i].SerialNumber = &v
|
||||
}
|
||||
@@ -99,7 +103,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||
out, err := exec.Command(
|
||||
"nvidia-smi",
|
||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--query-gpu=index,pci.bus_id,name,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||
"--format=csv,noheader,nounits",
|
||||
).Output()
|
||||
if err != nil {
|
||||
@@ -123,8 +127,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
if len(rec) == 0 {
|
||||
continue
|
||||
}
|
||||
if len(rec) < 13 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
||||
if len(rec) < 14 {
|
||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 14", len(rec))
|
||||
}
|
||||
|
||||
bdf := normalizePCIeBDF(rec[1])
|
||||
@@ -135,17 +139,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
||||
info := nvidiaGPUInfo{
|
||||
Index: parseRequiredInt(rec[0]),
|
||||
BDF: bdf,
|
||||
Serial: strings.TrimSpace(rec[2]),
|
||||
VBIOS: strings.TrimSpace(rec[3]),
|
||||
TemperatureC: parseMaybeFloat(rec[4]),
|
||||
PowerW: parseMaybeFloat(rec[5]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||
HWSlowdown: parseMaybeBool(rec[8]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
||||
Name: strings.TrimSpace(rec[2]),
|
||||
Serial: strings.TrimSpace(rec[3]),
|
||||
VBIOS: strings.TrimSpace(rec[4]),
|
||||
TemperatureC: parseMaybeFloat(rec[5]),
|
||||
PowerW: parseMaybeFloat(rec[6]),
|
||||
ECCUncorrected: parseMaybeInt64(rec[7]),
|
||||
ECCCorrected: parseMaybeInt64(rec[8]),
|
||||
HWSlowdown: parseMaybeBool(rec[9]),
|
||||
PCIeLinkGenCurrent: parseMaybeInt(rec[10]),
|
||||
PCIeLinkGenMax: parseMaybeInt(rec[11]),
|
||||
PCIeLinkWidthCur: parseMaybeInt(rec[12]),
|
||||
PCIeLinkWidthMax: parseMaybeInt(rec[13]),
|
||||
}
|
||||
result[bdf] = info
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ import (
|
||||
)
|
||||
|
||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
raw := "0, 00000000:65:00.0, NVIDIA H100 80GB HBM3, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||
if err != nil {
|
||||
t.Fatalf("parse failed: %v", err)
|
||||
@@ -16,6 +16,9 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||
if !ok {
|
||||
t.Fatalf("gpu by normalized bdf not found")
|
||||
}
|
||||
if gpu.Name != "NVIDIA H100 80GB HBM3" {
|
||||
t.Fatalf("name: got %q", gpu.Name)
|
||||
}
|
||||
if gpu.Serial != "GPU-SERIAL-1" {
|
||||
t.Fatalf("serial: got %q", gpu.Serial)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
@@ -79,6 +80,25 @@ func shouldIncludePCIeDevice(class, vendor, device string) bool {
|
||||
}
|
||||
}
|
||||
|
||||
// Exclude BMC/management virtual VGA adapters — these are firmware video chips,
|
||||
// not real GPUs, and pollute the GPU inventory (e.g. iBMC, iDRAC, iLO VGA).
|
||||
if strings.Contains(c, "vga") || strings.Contains(c, "display") || strings.Contains(c, "3d") {
|
||||
bmcPatterns := []string{
|
||||
"management system chip",
|
||||
"management controller",
|
||||
"ibmc",
|
||||
"idrac",
|
||||
"ilo vga",
|
||||
"aspeed",
|
||||
"matrox",
|
||||
}
|
||||
for _, bad := range bmcPatterns {
|
||||
if strings.Contains(d, bad) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if strings.Contains(v, "advanced micro devices") || strings.Contains(v, "[amd]") {
|
||||
internalAMDPatterns := []string{
|
||||
"dummy function",
|
||||
@@ -153,6 +173,9 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
|
||||
// SVendor/SDevice available but not in schema — skip
|
||||
|
||||
// Warn if PCIe link is running below its maximum negotiated speed.
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
return dev
|
||||
}
|
||||
|
||||
@@ -222,6 +245,41 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
||||
// speed is below the maximum negotiated speed supported by both ends.
|
||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||
return
|
||||
}
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
warn := statusWarning
|
||||
dev.Status = &warn
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
}
|
||||
}
|
||||
|
||||
// pcieLinkSpeedRank maps a normalized generation string to its number
// ("Gen1" → 1 … "Gen6" → 6). The match is exact and case-sensitive; any other
// input yields 0.
func pcieLinkSpeedRank(gen string) int {
	known := [...]string{"Gen1", "Gen2", "Gen3", "Gen4", "Gen5", "Gen6"}
	for i, name := range known {
		if gen == name {
			return i + 1
		}
	}
	return 0
}
|
||||
|
||||
func normalizePCILinkSpeed(raw string) string {
|
||||
raw = strings.TrimSpace(strings.ToLower(raw))
|
||||
switch {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/json"
|
||||
"strings"
|
||||
"testing"
|
||||
@@ -29,6 +30,8 @@ func TestShouldIncludePCIeDevice(t *testing.T) {
|
||||
{name: "raid", class: "RAID bus controller", want: true},
|
||||
{name: "nvme", class: "Non-Volatile memory controller", want: true},
|
||||
{name: "vga", class: "VGA compatible controller", want: true},
|
||||
{name: "ibmc vga", class: "VGA compatible controller", vendor: "Huawei Technologies Co., Ltd.", device: "Hi171x Series [iBMC Intelligent Management system chip w/VGA support]", want: false},
|
||||
{name: "aspeed vga", class: "VGA compatible controller", vendor: "ASPEED Technology, Inc.", device: "ASPEED Graphics Family", want: false},
|
||||
{name: "other encryption controller", class: "Encryption controller", vendor: "Intel Corporation", device: "QuickAssist", want: true},
|
||||
}
|
||||
|
||||
@@ -139,3 +142,77 @@ func TestNormalizePCILinkSpeed(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyPCIeLinkSpeedWarning(t *testing.T) {
|
||||
ptr := func(s string) *string { return &s }
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
linkSpeed *string
|
||||
maxSpeed *string
|
||||
wantWarning bool
|
||||
wantGenIn string // substring expected in ErrorDescription when warning
|
||||
}{
|
||||
{
|
||||
name: "degraded Gen1 vs Gen5",
|
||||
linkSpeed: ptr("Gen1"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: true,
|
||||
wantGenIn: "Gen1",
|
||||
},
|
||||
{
|
||||
name: "at max Gen5",
|
||||
linkSpeed: ptr("Gen5"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: false,
|
||||
},
|
||||
{
|
||||
name: "degraded Gen4 vs Gen5",
|
||||
linkSpeed: ptr("Gen4"),
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: true,
|
||||
wantGenIn: "Gen4",
|
||||
},
|
||||
{
|
||||
name: "missing current speed — no warning",
|
||||
linkSpeed: nil,
|
||||
maxSpeed: ptr("Gen5"),
|
||||
wantWarning: false,
|
||||
},
|
||||
{
|
||||
name: "missing max speed — no warning",
|
||||
linkSpeed: ptr("Gen1"),
|
||||
maxSpeed: nil,
|
||||
wantWarning: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
ok := statusOK
|
||||
dev.Status = &ok
|
||||
dev.LinkSpeed = tt.linkSpeed
|
||||
dev.MaxLinkSpeed = tt.maxSpeed
|
||||
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
gotWarn := dev.Status != nil && *dev.Status == statusWarning
|
||||
if gotWarn != tt.wantWarning {
|
||||
t.Fatalf("wantWarning=%v gotWarning=%v (status=%v)", tt.wantWarning, gotWarn, dev.Status)
|
||||
}
|
||||
if tt.wantWarning {
|
||||
if dev.ErrorDescription == nil {
|
||||
t.Fatal("expected ErrorDescription to be set")
|
||||
}
|
||||
if !strings.Contains(*dev.ErrorDescription, tt.wantGenIn) {
|
||||
t.Fatalf("ErrorDescription %q does not contain %q", *dev.ErrorDescription, tt.wantGenIn)
|
||||
}
|
||||
} else {
|
||||
if dev.ErrorDescription != nil {
|
||||
t.Fatalf("unexpected ErrorDescription: %s", *dev.ErrorDescription)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -335,11 +335,7 @@ func (s *System) RunNvidiaBenchmark(ctx context.Context, baseDir string, opts Nv
|
||||
return "", fmt.Errorf("write summary.txt: %w", err)
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, "gpu-benchmark-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", fmt.Errorf("pack benchmark archive: %w", err)
|
||||
}
|
||||
return archive, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func normalizeNvidiaBenchmarkOptionsForBenchmark(opts NvidiaBenchmarkOptions) NvidiaBenchmarkOptions {
|
||||
|
||||
@@ -90,7 +90,7 @@ func renderBenchmarkReportWithCharts(result NvidiaBenchmarkResult, charts []benc
|
||||
for _, gpu := range result.GPUs {
|
||||
name := strings.TrimSpace(gpu.Name)
|
||||
if name == "" {
|
||||
name = "Unknown"
|
||||
name = "Unknown GPU"
|
||||
}
|
||||
interconnect := "-"
|
||||
if gpu.Scores.InterconnectScore > 0 {
|
||||
|
||||
@@ -383,10 +383,7 @@ func drawGPUChartSVG(rows []GPUMetricRow, gpuIdx int) string {
|
||||
}
|
||||
|
||||
const (
|
||||
ansiRed = "\033[31m"
|
||||
ansiBlue = "\033[34m"
|
||||
ansiGreen = "\033[32m"
|
||||
ansiYellow = "\033[33m"
|
||||
ansiAmber = "\033[38;5;214m"
|
||||
ansiReset = "\033[0m"
|
||||
)
|
||||
|
||||
@@ -415,10 +412,10 @@ func RenderGPUTerminalChart(rows []GPUMetricRow) string {
|
||||
fn func(GPUMetricRow) float64
|
||||
}
|
||||
defs := []seriesDef{
|
||||
{"Temperature (°C)", ansiRed, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||
{"GPU Usage (%)", ansiBlue, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||
{"Power (W)", ansiGreen, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||
{"Clock (MHz)", ansiYellow, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||
{"Temperature (°C)", ansiAmber, func(r GPUMetricRow) float64 { return r.TempC }},
|
||||
{"GPU Usage (%)", ansiAmber, func(r GPUMetricRow) float64 { return r.UsagePct }},
|
||||
{"Power (W)", ansiAmber, func(r GPUMetricRow) float64 { return r.PowerW }},
|
||||
{"Clock (MHz)", ansiAmber, func(r GPUMetricRow) float64 { return r.ClockMHz }},
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
|
||||
@@ -1,142 +0,0 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// HPLOptions configures the HPL (LINPACK) benchmark run.
type HPLOptions struct {
	MemFraction float64 // fraction of RAM to use (default 0.80)
	NB          int     // block size (default 256)
}

// HPLResult holds the parsed result of an HPL run.
type HPLResult struct {
	N         int     // matrix dimension
	NB        int     // block size
	P         int     // process grid rows
	Q         int     // process grid cols
	TimeSec   float64 // wall time in seconds
	GFlops    float64 // achieved performance
	Residual  float64 // backward error residual (from HPL verification line)
	Status    string  // "PASSED" or "FAILED"
	RawOutput string  // full xhpl output
}

// applyHPLDefaults replaces unusable option values in place: MemFraction
// outside (0, 1] — including NaN — becomes 0.80, and a non-positive NB
// becomes 256. A nil opts is ignored.
func applyHPLDefaults(opts *HPLOptions) {
	if opts == nil {
		return
	}
	// Written as a negated range check on purpose: every comparison with NaN
	// is false, so "x <= 0 || x > 1" would let NaN through.
	if !(opts.MemFraction > 0 && opts.MemFraction <= 1) {
		opts.MemFraction = 0.80
	}
	if opts.NB <= 0 {
		opts.NB = 256
	}
}
|
||||
|
||||
// RunHPL runs bee-hpl and returns parsed results plus a tar.gz artifact path.
|
||||
func (s *System) RunHPL(ctx context.Context, baseDir string, opts HPLOptions, logFunc func(string)) (string, *HPLResult, error) {
|
||||
applyHPLDefaults(&opts)
|
||||
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "hpl-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", nil, fmt.Errorf("mkdir %s: %w", runDir, err)
|
||||
}
|
||||
|
||||
logPath := filepath.Join(runDir, "hpl.log")
|
||||
|
||||
cmd := []string{
|
||||
"bee-hpl",
|
||||
"--mem-fraction", strconv.FormatFloat(opts.MemFraction, 'f', 2, 64),
|
||||
"--nb", strconv.Itoa(opts.NB),
|
||||
}
|
||||
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("HPL: N will be auto-sized to %.0f%% of RAM, NB=%d", opts.MemFraction*100, opts.NB))
|
||||
}
|
||||
|
||||
out, err := runSATCommandCtx(ctx, "", "hpl", cmd, nil, logFunc)
|
||||
_ = os.WriteFile(logPath, out, 0644)
|
||||
|
||||
result := parseHPLOutput(string(out))
|
||||
result.RawOutput = string(out)
|
||||
|
||||
if err != nil && err != context.Canceled {
|
||||
return "", result, fmt.Errorf("bee-hpl failed: %w", err)
|
||||
}
|
||||
if err == nil && result.GFlops <= 0 {
|
||||
return "", result, fmt.Errorf("HPL completed but no Gflops result found in output")
|
||||
}
|
||||
|
||||
// Write summary
|
||||
summary := fmt.Sprintf("N=%d NB=%d time=%.2fs gflops=%.3f status=%s\n",
|
||||
result.N, result.NB, result.TimeSec, result.GFlops, result.Status)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("HPL result: N=%d NB=%d %.2fs %.3f Gflops %s",
|
||||
result.N, result.NB, result.TimeSec, result.GFlops, result.Status))
|
||||
}
|
||||
|
||||
ts2 := time.Now().UTC().Format("20060102-150405")
|
||||
archive := filepath.Join(baseDir, "hpl-"+ts2+".tar.gz")
|
||||
if archErr := createTarGz(archive, runDir); archErr != nil {
|
||||
return runDir, result, err
|
||||
}
|
||||
return archive, result, err
|
||||
}
|
||||
|
||||
// parseHPLOutput extracts N, NB, time, and Gflops from standard HPL output.
|
||||
//
|
||||
// HPL prints a result line of the form:
|
||||
//
|
||||
// WR00L2L2 45312 256 1 1 1234.56 5.678e+01
|
||||
// T/V N NB P Q Time Gflops
|
||||
func parseHPLOutput(output string) *HPLResult {
|
||||
result := &HPLResult{Status: "FAILED"}
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
// Result line starts with WR
|
||||
if strings.HasPrefix(line, "WR") {
|
||||
fields := strings.Fields(line)
|
||||
// WR00L2L2 N NB P Q Time Gflops
|
||||
if len(fields) >= 7 {
|
||||
result.N, _ = strconv.Atoi(fields[1])
|
||||
result.NB, _ = strconv.Atoi(fields[2])
|
||||
result.P, _ = strconv.Atoi(fields[3])
|
||||
result.Q, _ = strconv.Atoi(fields[4])
|
||||
result.TimeSec, _ = strconv.ParseFloat(fields[5], 64)
|
||||
result.GFlops, _ = strconv.ParseFloat(fields[6], 64)
|
||||
}
|
||||
}
|
||||
// Verification line: "||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ... PASSED"
|
||||
if strings.Contains(line, "PASSED") {
|
||||
result.Status = "PASSED"
|
||||
fields := strings.Fields(line)
|
||||
for i, f := range fields {
|
||||
if f == "PASSED" && i > 0 {
|
||||
result.Residual, _ = strconv.ParseFloat(fields[i-1], 64)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// hplAvailable reports whether HPL can be run on this system: bee-hpl must
// resolve through PATH and /usr/local/lib/bee/xhpl must exist.
func hplAvailable() bool {
	_, lookErr := exec.LookPath("bee-hpl")
	if lookErr != nil {
		return false
	}
	if _, statErr := os.Stat("/usr/local/lib/bee/xhpl"); statErr != nil {
		return false
	}
	return true
}
|
||||
@@ -49,6 +49,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
"--size-mb", strconv.Itoa(opts.SizeMB),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
@@ -63,6 +66,9 @@ func buildNvidiaStressJob(opts NvidiaStressOptions) (satJob, error) {
|
||||
"bee-john-gpu-stress",
|
||||
"--seconds", strconv.Itoa(opts.DurationSec),
|
||||
}
|
||||
if opts.StaggerSeconds > 0 && len(selected) > 1 {
|
||||
cmd = append(cmd, "--stagger-seconds", strconv.Itoa(opts.StaggerSeconds))
|
||||
}
|
||||
if len(selected) > 0 {
|
||||
cmd = append(cmd, "--devices", joinIndexList(selected))
|
||||
}
|
||||
|
||||
@@ -161,13 +161,7 @@ func (s *System) RunPlatformStress(
|
||||
}
|
||||
_ = os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644)
|
||||
|
||||
// Pack tar.gz
|
||||
archivePath := filepath.Join(baseDir, "platform-stress-"+stamp+".tar.gz")
|
||||
if err := packPlatformDir(runDir, archivePath); err != nil {
|
||||
return "", fmt.Errorf("pack archive: %w", err)
|
||||
}
|
||||
_ = os.RemoveAll(runDir)
|
||||
return archivePath, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
// collectPhase samples live metrics every second until ctx is done.
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strings"
|
||||
@@ -114,6 +115,8 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
}
|
||||
|
||||
s.collectGPURuntimeHealth(vendor, &health)
|
||||
s.collectToRAMHealth(&health)
|
||||
s.collectUSBExportHealth(&health)
|
||||
|
||||
if health.Status != "FAILED" && len(health.Issues) > 0 {
|
||||
health.Status = "PARTIAL"
|
||||
@@ -168,6 +171,90 @@ func resolvedToolStatus(display string, candidates ...string) ToolStatus {
|
||||
return ToolStatus{Name: display}
|
||||
}
|
||||
|
||||
// collectToRAMHealth checks whether the LiveCD ISO has been copied to RAM.
|
||||
// Status values: "ok" = in RAM, "warning" = toram not active (no copy attempted),
|
||||
// "failed" = toram was requested but medium is not in RAM (copy failed or in progress).
|
||||
func (s *System) collectToRAMHealth(health *schema.RuntimeHealth) {
|
||||
inRAM := s.IsLiveMediaInRAM()
|
||||
active := toramActive()
|
||||
switch {
|
||||
case inRAM:
|
||||
health.ToRAMStatus = "ok"
|
||||
case active:
|
||||
// toram was requested but medium is not yet/no longer in RAM
|
||||
health.ToRAMStatus = "failed"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "toram_copy_failed",
|
||||
Severity: "warning",
|
||||
Description: "toram boot parameter is set but the live medium is not mounted from RAM.",
|
||||
})
|
||||
default:
|
||||
health.ToRAMStatus = "warning"
|
||||
}
|
||||
}
|
||||
|
||||
// collectUSBExportHealth scans /proc/mounts for a writable USB-backed filesystem
|
||||
// suitable for log export. Sets USBExportPath to the first match found.
|
||||
func (s *System) collectUSBExportHealth(health *schema.RuntimeHealth) {
|
||||
health.USBExportPath = findUSBExportMount()
|
||||
}
|
||||
|
||||
// findUSBExportMount returns the mount point of the first writable USB filesystem
|
||||
// found in /proc/mounts (vfat, exfat, ext2/3/4, ntfs) whose backing block device
|
||||
// has USB transport. Returns "" if none found.
|
||||
func findUSBExportMount() string {
|
||||
f, err := os.Open("/proc/mounts")
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// fs types that are expected on USB export drives
|
||||
exportFSTypes := map[string]bool{
|
||||
"vfat": true,
|
||||
"exfat": true,
|
||||
"ext2": true,
|
||||
"ext3": true,
|
||||
"ext4": true,
|
||||
"ntfs": true,
|
||||
"ntfs3": true,
|
||||
"fuseblk": true,
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
// fields: device mountpoint fstype options dump pass
|
||||
fields := strings.Fields(scanner.Text())
|
||||
if len(fields) < 4 {
|
||||
continue
|
||||
}
|
||||
device, mountPoint, fsType, options := fields[0], fields[1], fields[2], fields[3]
|
||||
if !exportFSTypes[strings.ToLower(fsType)] {
|
||||
continue
|
||||
}
|
||||
// Skip read-only mounts
|
||||
opts := strings.Split(options, ",")
|
||||
readOnly := false
|
||||
for _, o := range opts {
|
||||
if strings.TrimSpace(o) == "ro" {
|
||||
readOnly = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if readOnly {
|
||||
continue
|
||||
}
|
||||
// Check USB transport via lsblk on the device
|
||||
if !strings.HasPrefix(device, "/dev/") {
|
||||
continue
|
||||
}
|
||||
if blockDeviceTransport(device) == "usb" {
|
||||
return mountPoint
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (s *System) collectGPURuntimeHealth(vendor string, health *schema.RuntimeHealth) {
|
||||
lsmodText := commandText("lsmod")
|
||||
|
||||
|
||||
@@ -384,25 +384,39 @@ func (s *System) RunNCCLTests(ctx context.Context, baseDir string, logFunc func(
|
||||
), logFunc)
|
||||
}
|
||||
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
func (s *System) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
selected, err := resolveDCGMGPUIndices(gpuIndices)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
profCmd, err := resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
var (
|
||||
profCmd []string
|
||||
profEnv []string
|
||||
)
|
||||
if staggerSec > 0 && len(selected) > 1 {
|
||||
profCmd = []string{
|
||||
"bee-dcgmproftester-staggered",
|
||||
"--seconds", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)),
|
||||
"--stagger-seconds", strconv.Itoa(staggerSec),
|
||||
"--devices", joinIndexList(selected),
|
||||
}
|
||||
} else {
|
||||
profCmd, err = resolveDCGMProfTesterCommand("--no-dcgm-validation", "-t", "1004", "-d", strconv.Itoa(normalizeNvidiaBurnDuration(durationSec)))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
profEnv = nvidiaVisibleDevicesEnv(selected)
|
||||
}
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-nvidia-compute", withNvidiaPersistenceMode(
|
||||
satJob{name: "01-nvidia-smi-q.log", cmd: []string{"nvidia-smi", "-q"}},
|
||||
satJob{name: "02-dcgmi-version.log", cmd: []string{"dcgmi", "-v"}},
|
||||
satJob{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: nvidiaVisibleDevicesEnv(selected),
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{
|
||||
name: "03-dcgmproftester.log",
|
||||
cmd: profCmd,
|
||||
env: profEnv,
|
||||
collectGPU: true,
|
||||
gpuIndices: selected,
|
||||
},
|
||||
satJob{name: "04-nvidia-smi-after.log", cmd: []string{"nvidia-smi", "--query-gpu=index,name,temperature.gpu,power.draw,utilization.gpu,memory.used,memory.total", "--format=csv,noheader,nounits"}},
|
||||
), logFunc)
|
||||
}
|
||||
@@ -648,11 +662,7 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary.String()), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
archive := filepath.Join(baseDir, "storage-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
type satJob struct {
|
||||
@@ -838,11 +848,7 @@ func runAcceptancePackCtx(ctx context.Context, baseDir, prefix string, jobs []sa
|
||||
}
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, prefix+"-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func updateNvidiaGPUStatus(perGPU map[int]*nvidiaGPUStatusFile, idx int, status, jobName, detail string) {
|
||||
@@ -905,7 +911,7 @@ func writeNvidiaGPUStatusFiles(runDir, overall string, perGPU map[int]*nvidiaGPU
|
||||
entry.Health = "UNKNOWN"
|
||||
}
|
||||
if entry.Name == "" {
|
||||
entry.Name = "unknown"
|
||||
entry.Name = "Unknown GPU"
|
||||
}
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "gpu_index=%d\n", entry.Index)
|
||||
|
||||
@@ -223,11 +223,7 @@ func (s *System) RunFanStressTest(ctx context.Context, baseDir string, opts FanS
|
||||
return "", err
|
||||
}
|
||||
|
||||
archive := filepath.Join(baseDir, "fan-stress-"+ts+".tar.gz")
|
||||
if err := createTarGz(archive, runDir); err != nil {
|
||||
return "", err
|
||||
}
|
||||
return archive, nil
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
func applyFanStressDefaults(opts *FanStressOptions) {
|
||||
|
||||
@@ -20,6 +20,7 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "dmidecode", Args: []string{"-t", "4"}, File: "dmidecode-type4.txt"},
|
||||
{Name: "dmidecode", Args: []string{"-t", "17"}, File: "dmidecode-type17.txt"},
|
||||
{Name: "lspci", Args: []string{"-vmm", "-D"}, File: "lspci-vmm.txt"},
|
||||
{Name: "lspci", Args: []string{"-vvv"}, File: "lspci-vvv.txt"},
|
||||
{Name: "lsblk", Args: []string{"-J", "-d", "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL"}, File: "lsblk.json"},
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
|
||||
@@ -70,6 +70,7 @@ type NvidiaStressOptions struct {
|
||||
Loader string
|
||||
GPUIndices []int
|
||||
ExcludeGPUIndices []int
|
||||
StaggerSeconds int
|
||||
}
|
||||
|
||||
func New() *System {
|
||||
|
||||
@@ -22,6 +22,10 @@ type RuntimeHealth struct {
|
||||
CUDAReady bool `json:"cuda_ready,omitempty"`
|
||||
NvidiaGSPMode string `json:"nvidia_gsp_mode,omitempty"` // "gsp-on", "gsp-off", "gsp-stuck"
|
||||
NetworkStatus string `json:"network_status,omitempty"`
|
||||
// ToRAMStatus: "ok" (ISO in RAM), "warning" (toram not active), "failed" (toram active but copy failed)
|
||||
ToRAMStatus string `json:"toram_status,omitempty"`
|
||||
// USBExportPath: mount point of the first writable USB drive found, empty if none.
|
||||
USBExportPath string `json:"usb_export_path,omitempty"`
|
||||
Issues []RuntimeIssue `json:"issues,omitempty"`
|
||||
Tools []RuntimeToolStatus `json:"tools,omitempty"`
|
||||
Services []RuntimeServiceStatus `json:"services,omitempty"`
|
||||
@@ -183,6 +187,13 @@ type HardwarePCIeDevice struct {
|
||||
BatteryTemperatureC *float64 `json:"battery_temperature_c,omitempty"`
|
||||
BatteryVoltageV *float64 `json:"battery_voltage_v,omitempty"`
|
||||
BatteryReplaceRequired *bool `json:"battery_replace_required,omitempty"`
|
||||
SFPPresent *bool `json:"sfp_present,omitempty"`
|
||||
SFPIdentifier *string `json:"sfp_identifier,omitempty"`
|
||||
SFPConnector *string `json:"sfp_connector,omitempty"`
|
||||
SFPVendor *string `json:"sfp_vendor,omitempty"`
|
||||
SFPPartNumber *string `json:"sfp_part_number,omitempty"`
|
||||
SFPSerialNumber *string `json:"sfp_serial_number,omitempty"`
|
||||
SFPWavelengthNM *float64 `json:"sfp_wavelength_nm,omitempty"`
|
||||
SFPTemperatureC *float64 `json:"sfp_temperature_c,omitempty"`
|
||||
SFPTXPowerDBM *float64 `json:"sfp_tx_power_dbm,omitempty"`
|
||||
SFPRXPowerDBM *float64 `json:"sfp_rx_power_dbm,omitempty"`
|
||||
|
||||
@@ -482,12 +482,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
return
|
||||
}
|
||||
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
StressMode bool `json:"stress_mode"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
Loader string `json:"loader"`
|
||||
var body struct {
|
||||
Duration int `json:"duration"`
|
||||
StressMode bool `json:"stress_mode"`
|
||||
GPUIndices []int `json:"gpu_indices"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start"`
|
||||
Loader string `json:"loader"`
|
||||
Profile string `json:"profile"`
|
||||
DisplayName string `json:"display_name"`
|
||||
PlatformComponents []string `json:"platform_components"`
|
||||
@@ -503,12 +504,13 @@ func (h *handler) handleAPISATRun(target string) http.HandlerFunc {
|
||||
if strings.TrimSpace(body.DisplayName) != "" {
|
||||
name = body.DisplayName
|
||||
}
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
StressMode: body.StressMode,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
Loader: body.Loader,
|
||||
params := taskParams{
|
||||
Duration: body.Duration,
|
||||
StressMode: body.StressMode,
|
||||
GPUIndices: body.GPUIndices,
|
||||
ExcludeGPUIndices: body.ExcludeGPUIndices,
|
||||
StaggerGPUStart: body.StaggerGPUStart,
|
||||
Loader: body.Loader,
|
||||
BurnProfile: body.Profile,
|
||||
DisplayName: body.DisplayName,
|
||||
PlatformComponents: body.PlatformComponents,
|
||||
@@ -1376,107 +1378,3 @@ func (h *handler) rollbackPendingNetworkChange() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ── Display / Screen Resolution ───────────────────────────────────────────────
|
||||
|
||||
// displayMode is one resolution entry parsed from an xrandr mode line.
type displayMode struct {
	// Output is the name of the output the mode belongs to.
	Output string `json:"output"`
	// Mode is the resolution as printed by xrandr, e.g. "1920x1080".
	Mode string `json:"mode"`
	// Current is true when the mode line carries the "*" marker.
	Current bool `json:"current"`
}
|
||||
|
||||
type displayInfo struct {
|
||||
Output string `json:"output"`
|
||||
Modes []displayMode `json:"modes"`
|
||||
Current string `json:"current"`
|
||||
}
|
||||
|
||||
// Patterns used to pick apart `xrandr` query output.
var (
	// xrandrOutputRE matches the header line of a connected output and
	// captures its name.
	xrandrOutputRE = regexp.MustCompile(`^(\S+)\s+connected`)
	// xrandrModeRE matches a mode line (three leading whitespace characters)
	// and captures the WxH resolution.
	xrandrModeRE = regexp.MustCompile(`^\s{3}(\d+x\d+)\s`)
	// xrandrCurrentRE matches the "*" that flags the active mode.
	xrandrCurrentRE = regexp.MustCompile(`\*`)
)
|
||||
|
||||
func parseXrandrOutput(out string) []displayInfo {
|
||||
var infos []displayInfo
|
||||
var cur *displayInfo
|
||||
for _, line := range strings.Split(out, "\n") {
|
||||
if m := xrandrOutputRE.FindStringSubmatch(line); m != nil {
|
||||
if cur != nil {
|
||||
infos = append(infos, *cur)
|
||||
}
|
||||
cur = &displayInfo{Output: m[1]}
|
||||
continue
|
||||
}
|
||||
if cur == nil {
|
||||
continue
|
||||
}
|
||||
if m := xrandrModeRE.FindStringSubmatch(line); m != nil {
|
||||
isCurrent := xrandrCurrentRE.MatchString(line)
|
||||
mode := displayMode{Output: cur.Output, Mode: m[1], Current: isCurrent}
|
||||
cur.Modes = append(cur.Modes, mode)
|
||||
if isCurrent {
|
||||
cur.Current = m[1]
|
||||
}
|
||||
}
|
||||
}
|
||||
if cur != nil {
|
||||
infos = append(infos, *cur)
|
||||
}
|
||||
return infos
|
||||
}
|
||||
|
||||
// xrandrCommand builds an `xrandr` invocation with the given arguments whose
// environment is the current process environment, extended with DISPLAY=:0
// and XAUTHORITY=/home/bee/.Xauthority whenever the respective variable is
// missing or empty.
func xrandrCommand(args ...string) *exec.Cmd {
	environ := os.Environ()

	// isSet reports whether environ carries a non-empty value for key.
	isSet := func(key string) bool {
		prefix := key + "="
		found := false
		for _, entry := range environ {
			if strings.HasPrefix(entry, prefix) && len(entry) > len(prefix) {
				found = true
			}
		}
		return found
	}
	displaySet, xauthSet := isSet("DISPLAY"), isSet("XAUTHORITY")

	if !displaySet {
		environ = append(environ, "DISPLAY=:0")
	}
	if !xauthSet {
		environ = append(environ, "XAUTHORITY=/home/bee/.Xauthority")
	}

	c := exec.Command("xrandr", args...)
	c.Env = environ
	return c
}
|
||||
|
||||
func (h *handler) handleAPIDisplayResolutions(w http.ResponseWriter, _ *http.Request) {
|
||||
out, err := xrandrCommand().Output()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, parseXrandrOutput(string(out)))
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIDisplaySet(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Output string `json:"output"`
|
||||
Mode string `json:"mode"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil || req.Output == "" || req.Mode == "" {
|
||||
writeError(w, http.StatusBadRequest, "output and mode are required")
|
||||
return
|
||||
}
|
||||
// Validate mode looks like WxH to prevent injection
|
||||
if !regexp.MustCompile(`^\d+x\d+$`).MatchString(req.Mode) {
|
||||
writeError(w, http.StatusBadRequest, "invalid mode format")
|
||||
return
|
||||
}
|
||||
// Validate output name (no special chars)
|
||||
if !regexp.MustCompile(`^[A-Za-z0-9_\-]+$`).MatchString(req.Output) {
|
||||
writeError(w, http.StatusBadRequest, "invalid output name")
|
||||
return
|
||||
}
|
||||
if out, err := xrandrCommand("--output", req.Output, "--mode", req.Mode).CombinedOutput(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "xrandr: "+strings.TrimSpace(string(out)))
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "output": req.Output, "mode": req.Mode})
|
||||
}
|
||||
|
||||
@@ -10,30 +10,6 @@ import (
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||
t.Setenv("DISPLAY", "")
|
||||
t.Setenv("XAUTHORITY", "")
|
||||
|
||||
cmd := xrandrCommand("--query")
|
||||
|
||||
var hasDisplay bool
|
||||
var hasXAuthority bool
|
||||
for _, kv := range cmd.Env {
|
||||
if kv == "DISPLAY=:0" {
|
||||
hasDisplay = true
|
||||
}
|
||||
if kv == "XAUTHORITY=/home/bee/.Xauthority" {
|
||||
hasXAuthority = true
|
||||
}
|
||||
}
|
||||
if !hasDisplay {
|
||||
t.Fatalf("DISPLAY not injected: %v", cmd.Env)
|
||||
}
|
||||
if !hasXAuthority {
|
||||
t.Fatalf("XAUTHORITY not injected: %v", cmd.Env)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
|
||||
@@ -83,6 +83,10 @@ func renderMetricChartSVG(title string, labels []string, times []time.Time, data
|
||||
}
|
||||
}
|
||||
|
||||
// Downsample to at most ~1400 points (one per pixel) before building SVG.
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
|
||||
statsLabel := chartStatsLabel(datasets)
|
||||
|
||||
legendItems := []metricChartSeries{}
|
||||
@@ -196,6 +200,19 @@ func drawGPUOverviewChartSVG(title string, labels []string, times []time.Time, s
|
||||
}
|
||||
}
|
||||
|
||||
// Downsample to at most ~1400 points before building SVG.
|
||||
{
|
||||
datasets := make([][]float64, len(series))
|
||||
for i := range series {
|
||||
datasets[i] = series[i].Values
|
||||
}
|
||||
times, datasets = downsampleTimeSeries(times, datasets, 1400)
|
||||
pointCount = len(times)
|
||||
for i := range series {
|
||||
series[i].Values = datasets[i]
|
||||
}
|
||||
}
|
||||
|
||||
scales := make([]chartScale, len(series))
|
||||
for i := range series {
|
||||
min, max := chartSeriesBounds(series[i].Values)
|
||||
@@ -626,6 +643,87 @@ func writeTimelineBoundaries(b *strings.Builder, layout chartLayout, start, end
|
||||
b.WriteString(`</g>` + "\n")
|
||||
}
|
||||
|
||||
// downsampleTimeSeries reduces the time series to at most maxPts points using
// min-max bucketing.
//
// The samples are split into maxPts/2 (at least one) contiguous buckets. Each
// bucket contributes the indices of its minimum and maximum value, in
// chronological order, measured on the reference series: the first dataset
// whose length equals len(times). When no dataset qualifies, the first and
// last index of each bucket are kept instead. All datasets with
// len(times) entries are sampled at the same indices so they stay aligned;
// datasets of any other length are returned unchanged.
//
// If len(times) <= maxPts or maxPts <= 0, the inputs are returned unchanged.
func downsampleTimeSeries(times []time.Time, datasets [][]float64, maxPts int) ([]time.Time, [][]float64) {
	n := len(times)
	if n <= maxPts || maxPts <= 0 {
		return times, datasets
	}
	buckets := maxPts / 2
	if buckets < 1 {
		buckets = 1
	}

	// Use the first dataset that has the same length as times as the
	// reference for deciding which two indices to keep per bucket.
	var ref []float64
	for _, ds := range datasets {
		if len(ds) == n {
			ref = ds
			break
		}
	}

	selected := make([]int, 0, 2*buckets)
	bucketSize := float64(n) / float64(buckets)
	for b := 0; b < buckets; b++ {
		lo := int(math.Round(float64(b) * bucketSize))
		hi := int(math.Round(float64(b+1) * bucketSize))
		if hi > n {
			hi = n
		}
		if lo >= hi {
			continue
		}

		first, second := lo, hi-1
		if ref != nil {
			minIdx, maxIdx := lo, lo
			for i := lo + 1; i < hi; i++ {
				if ref[i] < ref[minIdx] {
					minIdx = i
				}
				if ref[i] > ref[maxIdx] {
					maxIdx = i
				}
			}
			// Emit the two extremes in chronological order.
			first, second = minIdx, maxIdx
			if first > second {
				first, second = second, first
			}
		}
		selected = append(selected, first)
		if second != first {
			selected = append(selected, second)
		}
	}
	// A bucket yields up to two points, so a single bucket would overshoot a
	// limit of one; trim to keep the documented upper bound.
	if len(selected) > maxPts {
		selected = selected[:maxPts]
	}

	outTimes := make([]time.Time, len(selected))
	for i, idx := range selected {
		outTimes[i] = times[idx]
	}
	outDatasets := make([][]float64, len(datasets))
	for d, ds := range datasets {
		if len(ds) != n {
			outDatasets[d] = ds
			continue
		}
		out := make([]float64, len(selected))
		for i, idx := range selected {
			out[i] = ds[idx]
		}
		outDatasets[d] = out
	}
	return outTimes, outDatasets
}
|
||||
|
||||
func chartXForTime(ts, start, end time.Time, left, right int) float64 {
|
||||
if !end.After(start) {
|
||||
return float64(left+right) / 2
|
||||
|
||||
@@ -317,106 +317,299 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
if err != nil {
|
||||
return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
||||
}
|
||||
// Parse just enough fields for the summary banner
|
||||
var snap struct {
|
||||
Summary struct {
|
||||
CPU struct{ Model string }
|
||||
Memory struct{ TotalGB float64 }
|
||||
Storage []struct{ Device, Model, Size string }
|
||||
GPUs []struct{ Model string }
|
||||
PSUs []struct{ Model string }
|
||||
}
|
||||
Network struct {
|
||||
Interfaces []struct {
|
||||
Name string
|
||||
IPv4 []string
|
||||
State string
|
||||
}
|
||||
}
|
||||
}
|
||||
// Try to extract top-level fields loosely
|
||||
var raw map[string]json.RawMessage
|
||||
if err := json.Unmarshal(data, &raw); err != nil {
|
||||
var ingest schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(data, &ingest); err != nil {
|
||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||
}
|
||||
_ = snap
|
||||
hw := ingest.Hardware
|
||||
|
||||
// Also load runtime-health for badges
|
||||
type componentHealth struct {
|
||||
FailCount int `json:"fail_count"`
|
||||
WarnCount int `json:"warn_count"`
|
||||
var records []app.ComponentStatusRecord
|
||||
if db, err := app.OpenComponentStatusDB(filepath.Join(opts.ExportDir, "component-status.json")); err == nil {
|
||||
records = db.All()
|
||||
}
|
||||
type healthSummary struct {
|
||||
CPU componentHealth `json:"cpu"`
|
||||
Memory componentHealth `json:"memory"`
|
||||
Storage componentHealth `json:"storage"`
|
||||
GPU componentHealth `json:"gpu"`
|
||||
PSU componentHealth `json:"psu"`
|
||||
Network componentHealth `json:"network"`
|
||||
}
|
||||
var health struct {
|
||||
HardwareHealth healthSummary `json:"hardware_health"`
|
||||
}
|
||||
if hdata, herr := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json")); herr == nil {
|
||||
_ = json.Unmarshal(hdata, &health)
|
||||
}
|
||||
|
||||
badge := func(h componentHealth) string {
|
||||
if h.FailCount > 0 {
|
||||
return `<span class="badge badge-err">FAIL</span>`
|
||||
}
|
||||
if h.WarnCount > 0 {
|
||||
return `<span class="badge badge-warn">WARN</span>`
|
||||
}
|
||||
return `<span class="badge badge-ok">OK</span>`
|
||||
}
|
||||
|
||||
// Extract readable strings from raw JSON
|
||||
getString := func(key string) string {
|
||||
v, ok := raw[key]
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
var s string
|
||||
if err := json.Unmarshal(v, &s); err == nil {
|
||||
return s
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
cpuModel := getString("cpu_model")
|
||||
memStr := getString("memory_summary")
|
||||
gpuSummary := getString("gpu_summary")
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
||||
b.WriteString(`<table style="width:auto">`)
|
||||
writeRow := func(label, value, badgeHTML string) {
|
||||
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
||||
}
|
||||
if cpuModel != "" {
|
||||
writeRow("CPU", cpuModel, badge(health.HardwareHealth.CPU))
|
||||
} else {
|
||||
writeRow("CPU", "—", badge(health.HardwareHealth.CPU))
|
||||
|
||||
cpuRow := aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil)
|
||||
writeRow("CPU", hwDescribeCPU(hw), runtimeStatusBadge(cpuRow.Status))
|
||||
|
||||
memRow := aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"})
|
||||
writeRow("Memory", hwDescribeMemory(hw), runtimeStatusBadge(memRow.Status))
|
||||
|
||||
storageRow := aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"})
|
||||
writeRow("Storage", hwDescribeStorage(hw), runtimeStatusBadge(storageRow.Status))
|
||||
|
||||
gpuRow := aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"})
|
||||
writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
|
||||
|
||||
psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
|
||||
if psuRow.Status == "UNKNOWN" && len(hw.PowerSupplies) > 0 {
|
||||
psuRow.Status = hwPSUStatus(hw.PowerSupplies)
|
||||
}
|
||||
if memStr != "" {
|
||||
writeRow("Memory", memStr, badge(health.HardwareHealth.Memory))
|
||||
} else {
|
||||
writeRow("Memory", "—", badge(health.HardwareHealth.Memory))
|
||||
writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
|
||||
|
||||
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
||||
writeRow("Network", nicDesc, "")
|
||||
}
|
||||
if gpuSummary != "" {
|
||||
writeRow("GPU", gpuSummary, badge(health.HardwareHealth.GPU))
|
||||
} else {
|
||||
writeRow("GPU", "—", badge(health.HardwareHealth.GPU))
|
||||
}
|
||||
writeRow("Storage", "—", badge(health.HardwareHealth.Storage))
|
||||
writeRow("PSU", "—", badge(health.HardwareHealth.PSU))
|
||||
|
||||
b.WriteString(`</table>`)
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// hwDescribeCPU returns a human-readable CPU summary, e.g. "2× Intel Xeon Gold 6338".
|
||||
func hwDescribeCPU(hw schema.HardwareSnapshot) string {
|
||||
counts := map[string]int{}
|
||||
order := []string{}
|
||||
for _, cpu := range hw.CPUs {
|
||||
model := "Unknown CPU"
|
||||
if cpu.Model != nil && *cpu.Model != "" {
|
||||
model = *cpu.Model
|
||||
}
|
||||
if counts[model] == 0 {
|
||||
order = append(order, model)
|
||||
}
|
||||
counts[model]++
|
||||
}
|
||||
if len(order) == 0 {
|
||||
return "—"
|
||||
}
|
||||
parts := make([]string, 0, len(order))
|
||||
for _, m := range order {
|
||||
if counts[m] > 1 {
|
||||
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
|
||||
} else {
|
||||
parts = append(parts, m)
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// hwDescribeMemory returns a summary like "16× 32 GB DDR4".
|
||||
func hwDescribeMemory(hw schema.HardwareSnapshot) string {
|
||||
type key struct {
|
||||
sizeMB int
|
||||
typ string
|
||||
}
|
||||
counts := map[key]int{}
|
||||
order := []key{}
|
||||
for _, dimm := range hw.Memory {
|
||||
if dimm.SizeMB == nil || *dimm.SizeMB == 0 {
|
||||
continue
|
||||
}
|
||||
t := ""
|
||||
if dimm.Type != nil {
|
||||
t = *dimm.Type
|
||||
}
|
||||
k := key{*dimm.SizeMB, t}
|
||||
if counts[k] == 0 {
|
||||
order = append(order, k)
|
||||
}
|
||||
counts[k]++
|
||||
}
|
||||
if len(order) == 0 {
|
||||
return "—"
|
||||
}
|
||||
parts := make([]string, 0, len(order))
|
||||
for _, k := range order {
|
||||
gb := k.sizeMB / 1024
|
||||
desc := fmt.Sprintf("%d× %d GB", counts[k], gb)
|
||||
if k.typ != "" {
|
||||
desc += " " + k.typ
|
||||
}
|
||||
parts = append(parts, desc)
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// hwDescribeStorage returns a summary like "4× 3.84 TB NVMe, 2× 1.92 TB SATA".
|
||||
func hwDescribeStorage(hw schema.HardwareSnapshot) string {
|
||||
type key struct {
|
||||
sizeGB int
|
||||
iface string
|
||||
}
|
||||
counts := map[key]int{}
|
||||
order := []key{}
|
||||
for _, disk := range hw.Storage {
|
||||
sz := 0
|
||||
if disk.SizeGB != nil {
|
||||
sz = *disk.SizeGB
|
||||
}
|
||||
iface := ""
|
||||
if disk.Interface != nil {
|
||||
iface = *disk.Interface
|
||||
} else if disk.Type != nil {
|
||||
iface = *disk.Type
|
||||
}
|
||||
k := key{sz, iface}
|
||||
if counts[k] == 0 {
|
||||
order = append(order, k)
|
||||
}
|
||||
counts[k]++
|
||||
}
|
||||
if len(order) == 0 {
|
||||
return "—"
|
||||
}
|
||||
parts := make([]string, 0, len(order))
|
||||
for _, k := range order {
|
||||
var sizeStr string
|
||||
if k.sizeGB >= 1000 {
|
||||
sizeStr = fmt.Sprintf("%.2g TB", float64(k.sizeGB)/1000)
|
||||
} else if k.sizeGB > 0 {
|
||||
sizeStr = fmt.Sprintf("%d GB", k.sizeGB)
|
||||
} else {
|
||||
sizeStr = "?"
|
||||
}
|
||||
desc := fmt.Sprintf("%d× %s", counts[k], sizeStr)
|
||||
if k.iface != "" {
|
||||
desc += " " + k.iface
|
||||
}
|
||||
parts = append(parts, desc)
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// hwDescribeGPU returns a summary like "8× NVIDIA H100 80GB".
|
||||
func hwDescribeGPU(hw schema.HardwareSnapshot) string {
|
||||
counts := map[string]int{}
|
||||
order := []string{}
|
||||
for _, dev := range hw.PCIeDevices {
|
||||
if dev.DeviceClass == nil {
|
||||
continue
|
||||
}
|
||||
if !isGPUDeviceClass(*dev.DeviceClass) {
|
||||
continue
|
||||
}
|
||||
model := "Unknown GPU"
|
||||
if dev.Model != nil && *dev.Model != "" {
|
||||
model = *dev.Model
|
||||
}
|
||||
if counts[model] == 0 {
|
||||
order = append(order, model)
|
||||
}
|
||||
counts[model]++
|
||||
}
|
||||
if len(order) == 0 {
|
||||
return "—"
|
||||
}
|
||||
parts := make([]string, 0, len(order))
|
||||
for _, m := range order {
|
||||
if counts[m] > 1 {
|
||||
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
|
||||
} else {
|
||||
parts = append(parts, m)
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// hwPSUStatus returns "OK", "CRITICAL", "WARNING", or "UNKNOWN" based on
|
||||
// PSU statuses from the audit snapshot. Used as fallback when component-status.json
|
||||
// has no psu: records yet (e.g. first boot before audit writes them).
|
||||
func hwPSUStatus(psus []schema.HardwarePowerSupply) string {
|
||||
worst := "UNKNOWN"
|
||||
for _, psu := range psus {
|
||||
if psu.Status == nil {
|
||||
continue
|
||||
}
|
||||
switch strings.ToUpper(strings.TrimSpace(*psu.Status)) {
|
||||
case "CRITICAL":
|
||||
return "CRITICAL"
|
||||
case "WARNING":
|
||||
if worst != "CRITICAL" {
|
||||
worst = "WARNING"
|
||||
}
|
||||
case "OK":
|
||||
if worst == "UNKNOWN" {
|
||||
worst = "OK"
|
||||
}
|
||||
}
|
||||
}
|
||||
return worst
|
||||
}
|
||||
|
||||
// hwDescribePSU returns a summary like "2× 1600 W" or "2× PSU".
|
||||
func hwDescribePSU(hw schema.HardwareSnapshot) string {
|
||||
n := len(hw.PowerSupplies)
|
||||
if n == 0 {
|
||||
return "—"
|
||||
}
|
||||
// Try to get a consistent wattage
|
||||
watt := 0
|
||||
consistent := true
|
||||
for _, psu := range hw.PowerSupplies {
|
||||
if psu.WattageW == nil {
|
||||
consistent = false
|
||||
break
|
||||
}
|
||||
if watt == 0 {
|
||||
watt = *psu.WattageW
|
||||
} else if *psu.WattageW != watt {
|
||||
consistent = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if consistent && watt > 0 {
|
||||
return fmt.Sprintf("%d× %d W", n, watt)
|
||||
}
|
||||
return fmt.Sprintf("%d× PSU", n)
|
||||
}
|
||||
|
||||
// hwDescribeNIC returns a summary like "2× Mellanox ConnectX-6".
|
||||
func hwDescribeNIC(hw schema.HardwareSnapshot) string {
|
||||
counts := map[string]int{}
|
||||
order := []string{}
|
||||
for _, dev := range hw.PCIeDevices {
|
||||
isNIC := false
|
||||
if dev.DeviceClass != nil {
|
||||
c := strings.ToLower(strings.TrimSpace(*dev.DeviceClass))
|
||||
isNIC = c == "ethernetcontroller" || c == "networkcontroller" || strings.Contains(c, "fibrechannel")
|
||||
}
|
||||
if !isNIC && len(dev.MacAddresses) == 0 {
|
||||
continue
|
||||
}
|
||||
model := ""
|
||||
if dev.Model != nil && *dev.Model != "" {
|
||||
model = *dev.Model
|
||||
} else if dev.Manufacturer != nil && *dev.Manufacturer != "" {
|
||||
model = *dev.Manufacturer + " NIC"
|
||||
} else {
|
||||
model = "NIC"
|
||||
}
|
||||
if counts[model] == 0 {
|
||||
order = append(order, model)
|
||||
}
|
||||
counts[model]++
|
||||
}
|
||||
if len(order) == 0 {
|
||||
return ""
|
||||
}
|
||||
parts := make([]string, 0, len(order))
|
||||
for _, m := range order {
|
||||
if counts[m] > 1 {
|
||||
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
|
||||
} else {
|
||||
parts = append(parts, m)
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// isGPUDeviceClass reports whether class names a PCIe device class treated as
// a GPU: "VideoController", "DisplayController" or "ProcessingAccelerator".
// Surrounding whitespace is ignored and the comparison is case-insensitive,
// matching how hwDescribeNIC normalizes device classes before comparing.
func isGPUDeviceClass(class string) bool {
	switch strings.ToLower(strings.TrimSpace(class)) {
	case "videocontroller", "displaycontroller", "processingaccelerator":
		return true
	default:
		return false
	}
}
|
||||
|
||||
func renderAuditModal() string {
|
||||
return `<div id="audit-modal-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.5);z-index:100;align-items:center;justify-content:center">
|
||||
<div style="background:#fff;border-radius:6px;padding:24px;min-width:480px;max-width:1100px;width:min(1100px,92vw);max-height:92vh;overflow:auto;position:relative">
|
||||
@@ -481,8 +674,9 @@ func renderHealthCard(opts HandlerOptions) string {
|
||||
buildRuntimeAccelerationRow(health),
|
||||
buildRuntimeToolsRow(health),
|
||||
buildRuntimeServicesRow(health),
|
||||
buildRuntimeUSBExportRow(health),
|
||||
buildRuntimeToRAMRow(health),
|
||||
}
|
||||
rows = append(rows, buildHardwareComponentRows(opts.ExportDir)...)
|
||||
b.WriteString(`<table><thead><tr><th>Check</th><th>Status</th><th>Source</th><th>Issue</th></tr></thead><tbody>`)
|
||||
for _, row := range rows {
|
||||
b.WriteString(`<tr><td>` + html.EscapeString(row.Title) + `</td><td>` + runtimeStatusBadge(row.Status) + `</td><td>` + html.EscapeString(row.Source) + `</td><td>` + rowIssueHTML(row.Issue) + `</td></tr>`)
|
||||
@@ -578,7 +772,13 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
nonActive := make([]string, 0)
|
||||
for _, svc := range health.Services {
|
||||
state := strings.TrimSpace(strings.ToLower(svc.Status))
|
||||
if state != "active" {
|
||||
// "activating" and "deactivating" are transient states for oneshot services
|
||||
// (RemainAfterExit=yes) — the service is running normally, not failed.
|
||||
// Only "failed" and "inactive" (after services should be running) are problems.
|
||||
switch state {
|
||||
case "active", "activating", "deactivating", "reloading":
|
||||
// OK — service is running or transitioning normally
|
||||
default:
|
||||
nonActive = append(nonActive, svc.Name+"="+svc.Status)
|
||||
}
|
||||
}
|
||||
@@ -591,6 +791,51 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
return runtimeHealthRow{Title: "Bee Services", Status: status, Source: "ServiceState", Issue: issue}
|
||||
}
|
||||
|
||||
func buildRuntimeUSBExportRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
path := strings.TrimSpace(health.USBExportPath)
|
||||
if path != "" {
|
||||
return runtimeHealthRow{
|
||||
Title: "USB Export Drive",
|
||||
Status: "OK",
|
||||
Source: "/proc/mounts + lsblk",
|
||||
Issue: path,
|
||||
}
|
||||
}
|
||||
return runtimeHealthRow{
|
||||
Title: "USB Export Drive",
|
||||
Status: "WARNING",
|
||||
Source: "/proc/mounts + lsblk",
|
||||
Issue: "No writable USB drive mounted. Plug in a USB drive to enable log export.",
|
||||
}
|
||||
}
|
||||
|
||||
func buildRuntimeToRAMRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
switch strings.ToLower(strings.TrimSpace(health.ToRAMStatus)) {
|
||||
case "ok":
|
||||
return runtimeHealthRow{
|
||||
Title: "LiveCD in RAM",
|
||||
Status: "OK",
|
||||
Source: "live-boot / /proc/mounts",
|
||||
Issue: "",
|
||||
}
|
||||
case "failed":
|
||||
return runtimeHealthRow{
|
||||
Title: "LiveCD in RAM",
|
||||
Status: "FAILED",
|
||||
Source: "live-boot / /proc/mounts",
|
||||
Issue: "toram boot parameter set but ISO is not mounted from RAM. Copy may have failed.",
|
||||
}
|
||||
default:
|
||||
// toram not active — ISO still on original boot media (USB/CD)
|
||||
return runtimeHealthRow{
|
||||
Title: "LiveCD in RAM",
|
||||
Status: "WARNING",
|
||||
Source: "live-boot / /proc/mounts",
|
||||
Issue: "ISO not copied to RAM. Use \u201cCopy to RAM\u201d to free the boot drive and improve performance.",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func buildHardwareComponentRows(exportDir string) []runtimeHealthRow {
|
||||
path := filepath.Join(exportDir, "component-status.json")
|
||||
db, err := app.OpenComponentStatusDB(path)
|
||||
@@ -1031,25 +1276,26 @@ func renderValidate(opts HandlerOptions) string {
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Validate Profile</div>
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
||||
<button class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
</div>
|
||||
<div class="validate-profile-col"></div>
|
||||
</div>
|
||||
<div class="card-body" style="padding-top:0;display:flex;justify-content:center">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Validate Profile</div>
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:0"><label>Cycles</label><input type="number" id="sat-cycles" value="1" min="1" max="100" style="width:100%"></div>
|
||||
</div>
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (~30–60 min)</span></label>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially with the selected cycle count and mode. Validate is quick (~5–15 min total); Stress is thorough (~30–60 min total).</p>
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
<div style="margin-top:12px">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
@@ -1143,16 +1389,6 @@ func renderValidate(opts HandlerOptions) string {
|
||||
`</div>` +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + `<div id="sat-card-hpl">` +
|
||||
renderSATCard("hpl", "LINPACK (HPL)", "runSAT('hpl')", "", renderValidateCardBody(
|
||||
``,
|
||||
`Standard High Performance LINPACK benchmark. Measures sustained FP64 GFLOPS and memory bandwidth of the CPU subsystem. Uses 80% of available RAM. Pass/fail based on HPL residual check.`,
|
||||
`<code>xhpl</code> (HPL 2.3, OpenBLAS)`,
|
||||
`Skipped in Validate mode. Runs in Stress mode only. Runtime scales with RAM — expect 5–30 min.<p id="sat-hpl-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
inv.AMD,
|
||||
`Runs the selected AMD checks only. GPU Validate collects inventory; MEM Integrity uses the RVS MEM module; MEM Bandwidth uses rocm-bandwidth-test and the RVS BABEL module.`,
|
||||
@@ -1166,7 +1402,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
</div>
|
||||
<style>
|
||||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||
.validate-profile-col { min-width:0; }
|
||||
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
@@ -1188,7 +1424,6 @@ function satModeChanged() {
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
{card: 'sat-card-nvidia-interconnect', hint: 'sat-ni-mode-hint'},
|
||||
{card: 'sat-card-nvidia-bandwidth', hint: 'sat-nb-mode-hint'},
|
||||
{card: 'sat-card-hpl', hint: 'sat-hpl-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
@@ -1199,7 +1434,7 @@ function satModeChanged() {
|
||||
});
|
||||
}
|
||||
function satLabels() {
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', hpl:'LINPACK (HPL)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
let satNvidiaGPUsPromise = null;
|
||||
function loadSatNvidiaGPUs() {
|
||||
@@ -1448,8 +1683,8 @@ function runAllSAT() {
|
||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth', 'hpl'];
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','hpl','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const stressOnlyTargets = ['nvidia-targeted-stress', 'nvidia-targeted-power', 'nvidia-pulse', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
const baseTargets = ['nvidia','nvidia-targeted-stress','nvidia-targeted-power','nvidia-pulse','nvidia-interconnect','nvidia-bandwidth','memory','storage','cpu'].concat(selectedAMDValidateTargets());
|
||||
const activeTargets = baseTargets.filter(target => {
|
||||
if (stressOnlyTargets.indexOf(target) >= 0 && !satStressMode()) return false;
|
||||
const btn = document.getElementById('sat-btn-' + target);
|
||||
@@ -1623,6 +1858,11 @@ func formatValidateDeviceSummary(total int, models map[string]int, unit string)
|
||||
if total != 1 {
|
||||
label += "s"
|
||||
}
|
||||
// If there is only one model the leading count duplicates the per-model
|
||||
// count already in parts (e.g. "4 GPU: 4 x RTX …" → "4 x RTX …").
|
||||
if len(parts) == 1 {
|
||||
return parts[0] + " " + label
|
||||
}
|
||||
return fmt.Sprintf("%d %s: %s", total, label, strings.Join(parts, ", "))
|
||||
}
|
||||
|
||||
@@ -2106,11 +2346,11 @@ func renderBurn() string {
|
||||
<label class="cb-row"><input type="radio" name="burn-profile" value="overnight"><span>Overnight — 8 hours</span></label>
|
||||
</div>
|
||||
<div class="burn-profile-col burn-profile-action">
|
||||
<button class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||
<button type="button" class="btn btn-primary" onclick="runAllBurnTasks()">Burn one by one</button>
|
||||
<p>Run checked tests one by one. Tests run without cooldown. Each test duration is determined by the Burn Profile. Total test duration is the sum of all selected tests multiplied by the Burn Profile duration.</p>
|
||||
</div>
|
||||
<div class="burn-profile-col burn-profile-action">
|
||||
<button class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||
<button type="button" class="btn btn-secondary" onclick="runPlatformStress()">Thermal Cycling</button>
|
||||
<p>Run checked core test modules (CPU, MEM, GPU). Tests start at the same time and run for a period with short cooldown phases to stress the server cooling system.</p>
|
||||
</div>
|
||||
</div>
|
||||
@@ -2127,12 +2367,16 @@ func renderBurn() string {
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectAll()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="burnSelectNone()">Clear</button>
|
||||
</div>
|
||||
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||
</div>
|
||||
</div>
|
||||
<div id="burn-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="burn-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA burn recipes.</p>
|
||||
<label class="cb-row" style="margin-top:10px">
|
||||
<input type="checkbox" id="burn-stagger-nvidia">
|
||||
<span>Ramp selected NVIDIA GPUs one by one before full-load hold. Uses a 3-minute stabilization window per GPU, then keeps all selected GPUs under load for the chosen Burn Profile duration.</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="burn-section">Core Burn Paths</div>
|
||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||
@@ -2158,10 +2402,6 @@ func renderBurn() string {
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="burn-section">GPU-Specific Tests</div>
|
||||
<div class="grid2 burn-grid" style="margin-bottom:16px">
|
||||
</div>
|
||||
|
||||
<div id="bi-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Output <span id="bi-title"></span></div>
|
||||
<div class="card-body"><div id="bi-terminal" class="terminal"></div></div>
|
||||
@@ -2210,6 +2450,11 @@ function burnSelectedGPUIndices() {
|
||||
.sort(function(a, b) { return a - b; });
|
||||
}
|
||||
|
||||
function burnUseNvidiaRampUp() {
|
||||
const el = document.getElementById('burn-stagger-nvidia');
|
||||
return !!(el && el.checked);
|
||||
}
|
||||
|
||||
function burnUpdateSelectionNote() {
|
||||
const note = document.getElementById('burn-selection-note');
|
||||
const selected = burnSelectedGPUIndices();
|
||||
@@ -2269,6 +2514,9 @@ function enqueueBurnTask(target, label, extra, useSelectedNvidia) {
|
||||
return Promise.reject(new Error('Select at least one NVIDIA GPU.'));
|
||||
}
|
||||
body.gpu_indices = selected;
|
||||
if (burnUseNvidiaRampUp() && selected.length > 1) {
|
||||
body.stagger_gpu_start = true;
|
||||
}
|
||||
}
|
||||
return fetch('/api/sat/' + target + '/run', {
|
||||
method: 'POST',
|
||||
@@ -2860,55 +3108,6 @@ usbRefresh();
|
||||
</script>`
|
||||
}
|
||||
|
||||
// ── Display Resolution ────────────────────────────────────────────────────────
|
||||
|
||||
// renderDisplayInline returns the HTML fragment for the display-resolution
// controls: a status line, an empty controls container, and an inline script.
//
// The script fetches GET /api/display/resolutions and renders, for each
// returned display, its output name, current mode, a <select> of available
// modes and an Apply button. It also defines window.applyResolution(output),
// which POSTs {output, mode} as JSON to /api/display/set, alerts on an error
// field in the response, and reloads the list.
//
// NOTE(review): d.output, d.current and m.mode are concatenated into innerHTML
// and into an inline onclick handler without escaping — confirm the API only
// returns values that are safe to embed.
func renderDisplayInline() string {
|
||||
return `<div id="display-status" style="color:var(--muted);font-size:13px;margin-bottom:12px">Loading displays...</div>
|
||||
<div id="display-controls"></div>
|
||||
<script>
|
||||
(function(){
|
||||
function loadDisplays() {
|
||||
fetch('/api/display/resolutions').then(r=>r.json()).then(displays => {
|
||||
const status = document.getElementById('display-status');
|
||||
const ctrl = document.getElementById('display-controls');
|
||||
if (!displays || displays.length === 0) {
|
||||
status.textContent = 'No connected displays found or xrandr not available.';
|
||||
return;
|
||||
}
|
||||
status.textContent = '';
|
||||
ctrl.innerHTML = displays.map(d => {
|
||||
const opts = (d.modes||[]).map(m =>
|
||||
'<option value="'+m.mode+'"'+(m.current?' selected':'')+'>'+m.mode+(m.current?' (current)':'')+'</option>'
|
||||
).join('');
|
||||
return '<div style="margin-bottom:12px">'
|
||||
+'<span style="font-weight:600;margin-right:8px">'+d.output+'</span>'
|
||||
+'<span style="color:var(--muted);font-size:12px;margin-right:12px">Current: '+d.current+'</span>'
|
||||
+'<select id="res-sel-'+d.output+'" style="margin-right:8px">'+opts+'</select>'
|
||||
+'<button class="btn btn-sm btn-primary" onclick="applyResolution(\''+d.output+'\')">Apply</button>'
|
||||
+'</div>';
|
||||
}).join('');
|
||||
}).catch(()=>{
|
||||
document.getElementById('display-status').textContent = 'xrandr not available on this system.';
|
||||
});
|
||||
}
|
||||
window.applyResolution = function(output) {
|
||||
const sel = document.getElementById('res-sel-'+output);
|
||||
if (!sel) return;
|
||||
const mode = sel.value;
|
||||
const btn = sel.nextElementSibling;
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Applying...';
|
||||
fetch('/api/display/set', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify({output:output,mode:mode})})
|
||||
.then(r=>r.json()).then(d=>{
|
||||
if (d.error) { alert('Error: '+d.error); }
|
||||
loadDisplays();
|
||||
}).catch(e=>{ alert('Error: '+e); })
|
||||
.finally(()=>{ btn.disabled=false; btn.textContent='Apply'; });
|
||||
};
|
||||
loadDisplays();
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderNvidiaSelfHealInline() string {
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Inspect NVIDIA GPU health, restart the bee-nvidia driver service, and issue a per-GPU reset when the driver reports reset required.</p>
|
||||
@@ -3097,8 +3296,6 @@ function installToRAM() {
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Display Resolution</div><div class="card-body">` +
|
||||
renderDisplayInline() + `</div></div>
|
||||
|
||||
<script>
|
||||
function checkTools() {
|
||||
|
||||
@@ -295,10 +295,6 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Tools
|
||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||
|
||||
// Display
|
||||
mux.HandleFunc("GET /api/display/resolutions", h.handleAPIDisplayResolutions)
|
||||
mux.HandleFunc("POST /api/display/set", h.handleAPIDisplaySet)
|
||||
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
mux.HandleFunc("GET /api/gpu/nvidia", h.handleAPIGNVIDIAGPUs)
|
||||
|
||||
@@ -1094,6 +1094,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
// Runtime Health card — LiveCD checks only
|
||||
`Runtime Health`,
|
||||
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
||||
`Export Directory`,
|
||||
@@ -1102,16 +1103,18 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
`CUDA / ROCm`,
|
||||
`Required Utilities`,
|
||||
`Bee Services`,
|
||||
`<td>CPU</td>`,
|
||||
`<td>Memory</td>`,
|
||||
`<td>Storage</td>`,
|
||||
`<td>GPU</td>`,
|
||||
`CUDA runtime is not ready for GPU SAT.`,
|
||||
`Missing: nvidia-smi`,
|
||||
`bee-nvidia=inactive`,
|
||||
`cpu SAT: FAILED`,
|
||||
`storage SAT: FAILED`,
|
||||
`sat:nvidia`,
|
||||
// Hardware Summary card — component health badges
|
||||
`Hardware Summary`,
|
||||
`>CPU<`,
|
||||
`>Memory<`,
|
||||
`>Storage<`,
|
||||
`>GPU<`,
|
||||
`>PSU<`,
|
||||
`badge-warn`, // cpu Warning badge
|
||||
`badge-err`, // storage Critical badge
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||
|
||||
@@ -39,7 +39,6 @@ var taskNames = map[string]string{
|
||||
"nvidia-interconnect": "NVIDIA Interconnect Test (NCCL all_reduce_perf)",
|
||||
"nvidia-bandwidth": "NVIDIA Bandwidth Test (NVBandwidth)",
|
||||
"nvidia-stress": "NVIDIA GPU Stress",
|
||||
"hpl": "LINPACK (HPL)",
|
||||
"memory": "Memory SAT",
|
||||
"storage": "Storage SAT",
|
||||
"cpu": "CPU SAT",
|
||||
@@ -119,6 +118,7 @@ type taskParams struct {
|
||||
StressMode bool `json:"stress_mode,omitempty"`
|
||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||
StaggerGPUStart bool `json:"stagger_gpu_start,omitempty"`
|
||||
SizeMB int `json:"size_mb,omitempty"`
|
||||
Passes int `json:"passes,omitempty"`
|
||||
Loader string `json:"loader,omitempty"`
|
||||
@@ -163,6 +163,13 @@ func resolveBurnPreset(profile string) burnPreset {
|
||||
}
|
||||
}
|
||||
|
||||
// boolToNvidiaStaggerSeconds converts the "stagger GPU start" flag into the
// per-GPU ramp-up delay in seconds. It returns 180 when staggering is enabled
// and more than one GPU is selected, and 0 (no staggering) otherwise.
func boolToNvidiaStaggerSeconds(enabled bool, selected []int) int {
	const perGPURampSec = 180

	if !enabled || len(selected) <= 1 {
		return 0
	}
	return perGPURampSec
}
|
||||
|
||||
func resolvePlatformStressPreset(profile string) platform.PlatformStressOptions {
|
||||
acceptanceCycles := []platform.PlatformStressCycle{
|
||||
{LoadSec: 85, IdleSec: 5},
|
||||
@@ -593,7 +600,7 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
RunNCCL: t.params.RunNCCL,
|
||||
ParallelGPUs: t.params.ParallelGPUs,
|
||||
}, j.append)
|
||||
case "nvidia-compute":
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
@@ -602,7 +609,11 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
staggerSec := boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if staggerSec > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU", staggerSec))
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, staggerSec, j.append)
|
||||
case "nvidia-targeted-power":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -652,12 +663,13 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
}, j.append)
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
StaggerSeconds: boolToNvidiaStaggerSeconds(t.params.StaggerGPUStart, t.params.GPUIndices),
|
||||
}, j.append)
|
||||
case "memory":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -740,19 +752,6 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "hpl":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
opts := platform.HPLOptions{
|
||||
MemFraction: 0.80,
|
||||
NB: 256,
|
||||
}
|
||||
archive, err = func() (string, error) {
|
||||
path, _, runErr := a.RunHPL(ctx, "", opts, j.append)
|
||||
return path, runErr
|
||||
}()
|
||||
case "platform-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
|
||||
117
bible-local/docs/gpu-model-propagation.md
Normal file
117
bible-local/docs/gpu-model-propagation.md
Normal file
@@ -0,0 +1,117 @@
|
||||
# GPU Model Name Propagation
|
||||
|
||||
How GPU model names are detected, stored, and displayed throughout the project.
|
||||
|
||||
---
|
||||
|
||||
## Detection Sources
|
||||
|
||||
There are **three separate pipelines** for GPU model names — they use different structs and don't share state.
|
||||
|
||||
### Pipeline A — Live / SAT (nvidia-smi query at runtime)
|
||||
|
||||
**File:** `audit/internal/platform/sat.go`
|
||||
|
||||
- `ListNvidiaGPUs()` → `NvidiaGPU.Name` (field: `name`, from `nvidia-smi --query-gpu=index,name,...`)
|
||||
- `ListNvidiaGPUStatuses()` → `NvidiaGPUStatus.Name`
|
||||
- Used by: GPU selection UI, live metrics labels, burn/stress test logic
|
||||
|
||||
### Pipeline B — Benchmark results
|
||||
|
||||
**File:** `audit/internal/platform/benchmark.go`, line 124
|
||||
|
||||
- `queryBenchmarkGPUInfo(selected)` → `benchmarkGPUInfo.Name`
|
||||
- Stored in `BenchmarkGPUResult.Name` (`json:"name,omitempty"`)
|
||||
- Used by: benchmark history table, benchmark report
|
||||
|
||||
### Pipeline C — Hardware audit JSON (PCIe schema)
|
||||
|
||||
**File:** `audit/internal/schema/hardware.go`
|
||||
|
||||
- `HardwarePCIeDevice.Model *string` (field name is **Model**, not Name)
|
||||
- For AMD GPUs: populated by `audit/internal/collector/amdgpu.go` from `info.Product`
|
||||
- For NVIDIA GPUs: **NOT populated** by `audit/internal/collector/nvidia.go` — the NVIDIA enricher sets telemetry/status but skips the Model field
|
||||
- Used by: hardware summary page (`hwDescribeGPU` in `pages.go:487`)
|
||||
|
||||
---
|
||||
|
||||
## Key Inconsistency: NVIDIA PCIe Model is Never Set
|
||||
|
||||
`audit/internal/collector/nvidia.go` — `enrichPCIeWithNVIDIAData()` enriches NVIDIA PCIe devices with telemetry and status but does **not** populate `HardwarePCIeDevice.Model`.
|
||||
|
||||
This means:
|
||||
- Hardware summary page shows "Unknown GPU" for all NVIDIA devices (falls back at `pages.go:486`)
|
||||
- AMD GPUs do have their model populated
|
||||
|
||||
The fix would be: copy `gpu.Name` from the SAT pipeline into `dev.Model` inside `enrichPCIeWithNVIDIAData`.
|
||||
|
||||
---
|
||||
|
||||
## Benchmark History "Unknown GPU" Issue
|
||||
|
||||
**Symptom:** Benchmark history table shows "GPU #N — Unknown GPU" columns instead of real GPU model names.
|
||||
|
||||
**Root cause:** `BenchmarkGPUResult.Name` has tag `json:"name,omitempty"`. If `queryBenchmarkGPUInfo()` fails (warns at `benchmark.go:126`) or returns empty names, the Name field is never set and is omitted from JSON. Loaded results have empty Name → falls back to "Unknown GPU" at `pages.go:2226, 2237`.
|
||||
|
||||
This happens for:
|
||||
- Older result files saved before the `Name` field was added
|
||||
- Runs where nvidia-smi query failed before the benchmark started
|
||||
|
||||
---
|
||||
|
||||
## Fallback Strings — Current State
|
||||
|
||||
| Location | File | Fallback string |
|
||||
|---|---|---|
|
||||
| Hardware summary (PCIe) | `pages.go:486` | `"Unknown GPU"` |
|
||||
| Benchmark report summary | `benchmark_report.go:43` | `"Unknown GPU"` |
|
||||
| Benchmark report scorecard | `benchmark_report.go:93` | `"Unknown"` ← inconsistent |
|
||||
| Benchmark report detail | `benchmark_report.go:122` | `"Unknown GPU"` |
|
||||
| Benchmark history per-GPU col | `pages.go:2226` | `"Unknown GPU"` |
|
||||
| Benchmark history parallel col | `pages.go:2237` | `"Unknown GPU"` |
|
||||
| SAT status file write | `sat.go:922` | `"unknown"` ← lowercase, inconsistent |
|
||||
| GPU selection API | `api.go:163` | `"GPU N"` (no "Unknown") |
|
||||
|
||||
**Rule:** all UI fallbacks should use `"Unknown GPU"`. The two outliers are `benchmark_report.go:93` (`"Unknown"`) and `sat.go:922` (`"unknown"`).
|
||||
|
||||
---
|
||||
|
||||
## GPU Selection UI
|
||||
|
||||
**File:** `audit/internal/webui/pages.go`
|
||||
|
||||
- Source: `GET /api/gpus` → `api.go` → `ListNvidiaGPUs()` → live nvidia-smi
|
||||
- Render: `'GPU ' + gpu.index + ' — ' + gpu.name + ' · ' + mem`
|
||||
- Fallback: `gpu.name || 'GPU ' + idx` (JS, line ~1432)
|
||||
|
||||
This always shows the correct model because it queries nvidia-smi live. It is **not** connected to benchmark result data.
|
||||
|
||||
---
|
||||
|
||||
## Data Flow Summary
|
||||
|
||||
```
|
||||
nvidia-smi (live)
|
||||
└─ ListNvidiaGPUs() → NvidiaGPU.Name
|
||||
├─ GPU selection UI (always correct)
|
||||
├─ Live metrics labels (charts_svg.go)
|
||||
└─ SAT/burn status file (sat.go)
|
||||
|
||||
nvidia-smi (at benchmark start)
|
||||
└─ queryBenchmarkGPUInfo() → benchmarkGPUInfo.Name
|
||||
└─ BenchmarkGPUResult.Name (json:"name,omitempty")
|
||||
├─ Benchmark report
|
||||
└─ Benchmark history table columns
|
||||
|
||||
nvidia-smi / lspci (audit collection)
|
||||
└─ HardwarePCIeDevice.Model (NVIDIA: NOT populated; AMD: populated)
|
||||
└─ Hardware summary page hwDescribeGPU()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## What Needs Fixing
|
||||
|
||||
1. **NVIDIA PCIe Model** — `enrichPCIeWithNVIDIAData()` should set `dev.Model = &gpu.Name`
|
||||
2. **Fallback consistency** — `benchmark_report.go:93` should say `"Unknown GPU"` not `"Unknown"`; `sat.go:922` should say `"Unknown GPU"` not `"unknown"`
|
||||
3. **Old benchmark JSONs** — no fix possible for already-saved results with missing names (display-only issue)
|
||||
@@ -19,7 +19,5 @@ ROCRAND_VERSION=3.2.0.60304-76~22.04
|
||||
HIP_RUNTIME_AMD_VERSION=6.3.42134.60304-76~22.04
|
||||
HIPBLASLT_VERSION=0.10.0.60304-76~22.04
|
||||
COMGR_VERSION=2.8.0.60304-76~22.04
|
||||
HPL_VERSION=2.3
|
||||
HPL_SHA256=32c5c17d22330e6f2337b681aded51637fb6008d3f0eb7c277b163fadd612830
|
||||
GO_VERSION=1.24.0
|
||||
AUDIT_VERSION=1.0.0
|
||||
|
||||
@@ -1,244 +0,0 @@
|
||||
#!/bin/sh
|
||||
# build-hpl.sh — build HPL (High Performance LINPACK) for the bee LiveCD.
|
||||
#
|
||||
# Downloads HPL 2.3 from netlib, downloads OpenBLAS runtime from the Debian 12
|
||||
# apt repo, and compiles xhpl using a minimal single-process MPI stub so that
|
||||
# no MPI package is required inside the ISO.
|
||||
#
|
||||
# The resulting xhpl binary is a standard HPL binary whose output is compatible
|
||||
# with the accepted HPL format (WR... Gflops lines).
|
||||
#
|
||||
# Output:
|
||||
# $CACHE_DIR/bin/xhpl
|
||||
# $CACHE_DIR/lib/libopenblas.so* (runtime, injected into ISO /usr/lib/)
|
||||
|
||||
set -e
|
||||
|
||||
HPL_VERSION="$1"
|
||||
HPL_SHA256="$2"
|
||||
DIST_DIR="$3"
|
||||
|
||||
[ -n "$HPL_VERSION" ] || { echo "usage: $0 <hpl-version> <sha256> <dist-dir>"; exit 1; }
|
||||
[ -n "$HPL_SHA256" ] || { echo "usage: $0 <hpl-version> <sha256> <dist-dir>"; exit 1; }
|
||||
[ -n "$DIST_DIR" ] || { echo "usage: $0 <hpl-version> <sha256> <dist-dir>"; exit 1; }
|
||||
|
||||
echo "=== HPL ${HPL_VERSION} ==="
|
||||
|
||||
CACHE_DIR="${DIST_DIR}/hpl-${HPL_VERSION}"
|
||||
CACHE_ROOT="${BEE_CACHE_DIR:-${DIST_DIR}/cache}"
|
||||
DOWNLOAD_CACHE_DIR="${CACHE_ROOT}/hpl-downloads"
|
||||
|
||||
if [ -x "${CACHE_DIR}/bin/xhpl" ]; then
|
||||
echo "=== HPL cached, skipping build ==="
|
||||
echo "binary: ${CACHE_DIR}/bin/xhpl"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
mkdir -p "${DOWNLOAD_CACHE_DIR}" "${CACHE_DIR}/bin" "${CACHE_DIR}/lib"
|
||||
|
||||
# ── download HPL source ────────────────────────────────────────────────────────
|
||||
HPL_TAR="${DOWNLOAD_CACHE_DIR}/hpl-${HPL_VERSION}.tar.gz"
|
||||
HPL_URL="https://www.netlib.org/benchmark/hpl/hpl-${HPL_VERSION}.tar.gz"
|
||||
|
||||
if [ ! -f "${HPL_TAR}" ]; then
|
||||
echo "=== downloading HPL ${HPL_VERSION} ==="
|
||||
wget --show-progress -O "${HPL_TAR}" "${HPL_URL}"
|
||||
fi
|
||||
|
||||
actual_sha="$(sha256sum "${HPL_TAR}" | awk '{print $1}')"
|
||||
if [ "${actual_sha}" != "${HPL_SHA256}" ]; then
|
||||
echo "ERROR: sha256 mismatch for hpl-${HPL_VERSION}.tar.gz" >&2
|
||||
echo " expected: ${HPL_SHA256}" >&2
|
||||
echo " actual: ${actual_sha}" >&2
|
||||
rm -f "${HPL_TAR}"
|
||||
exit 1
|
||||
fi
|
||||
echo "sha256 OK: hpl-${HPL_VERSION}.tar.gz"
|
||||
|
||||
# ── download OpenBLAS from Debian 12 apt repo ─────────────────────────────────
|
||||
REPO_BASE="https://deb.debian.org/debian/pool/main/o/openblas"
|
||||
PACKAGES_GZ="${DOWNLOAD_CACHE_DIR}/Packages.gz"
|
||||
OPENBLAS_PKG="libopenblas0-openmp"
|
||||
|
||||
echo "=== fetching Debian 12 Packages.gz ==="
|
||||
wget -q -O "${PACKAGES_GZ}" \
|
||||
"https://deb.debian.org/debian/dists/bookworm/main/binary-amd64/Packages.gz"
|
||||
|
||||
# lookup_deb PKG
#
# Scan the gzip-compressed Debian package index at ${PACKAGES_GZ} for the
# stanza whose "Package:" field equals PKG and print exactly one line:
#   "<Filename> <SHA256>"
# Prints nothing when PKG is not present in the index.
lookup_deb() {
    pkg="$1"
    gzip -dc "${PACKAGES_GZ}" | awk -v pkg="$pkg" '
        # Print the current stanza once if it is the requested package.
        function emit() {
            if (!printed && cur == pkg) {
                print file " " sha
                printed = 1
            }
        }
        /^Package: /  { cur = $2 }
        /^Filename: / { file = $2 }
        /^SHA256: /   { sha = $2 }
        /^$/ {
            emit()
            # "exit" still runs the END rule; the printed flag keeps END
            # from emitting the same stanza a second time.
            if (printed) exit
            cur = ""; file = ""; sha = ""
        }
        # The last stanza of the index may not be followed by a blank line.
        END { emit() }'
}
|
||||
|
||||
meta="$(lookup_deb "${OPENBLAS_PKG}")"
|
||||
[ -n "$meta" ] || { echo "ERROR: ${OPENBLAS_PKG} not found in Packages.gz"; exit 1; }
|
||||
repo_file="$(printf '%s' "$meta" | awk '{print $1}')"
|
||||
repo_sha="$(printf '%s' "$meta" | awk '{print $2}')"
|
||||
|
||||
OPENBLAS_DEB="${DOWNLOAD_CACHE_DIR}/$(basename "${repo_file}")"
|
||||
if [ -f "${OPENBLAS_DEB}" ]; then
|
||||
actual="$(sha256sum "${OPENBLAS_DEB}" | awk '{print $1}')"
|
||||
[ "$actual" = "$repo_sha" ] || rm -f "${OPENBLAS_DEB}"
|
||||
fi
|
||||
if [ ! -f "${OPENBLAS_DEB}" ]; then
|
||||
echo "=== downloading ${OPENBLAS_PKG} ==="
|
||||
wget --show-progress -O "${OPENBLAS_DEB}" "https://deb.debian.org/debian/${repo_file}"
|
||||
actual="$(sha256sum "${OPENBLAS_DEB}" | awk '{print $1}')"
|
||||
[ "$actual" = "$repo_sha" ] || { echo "ERROR: sha256 mismatch for ${OPENBLAS_PKG}"; rm -f "${OPENBLAS_DEB}"; exit 1; }
|
||||
fi
|
||||
|
||||
# extract libopenblas shared libs
|
||||
TMP_DEB=$(mktemp -d)
|
||||
trap 'rm -rf "${TMP_DEB}" "${BUILD_TMP:-}"' EXIT INT TERM
|
||||
(
|
||||
cd "${TMP_DEB}"
|
||||
ar x "${OPENBLAS_DEB}"
|
||||
tar xf data.tar.*
|
||||
)
|
||||
find "${TMP_DEB}" \( -name 'libopenblas*.so*' \) \( -type f -o -type l \) \
|
||||
-exec cp -a {} "${CACHE_DIR}/lib/" \;
|
||||
echo "=== OpenBLAS libs: $(ls "${CACHE_DIR}/lib/" | wc -l) files ==="
|
||||
|
||||
# The linker needs unversioned libopenblas.so / libblas.so names to resolve -lopenblas; create those symlinks here instead of installing libopenblas-dev (no headers are required).
|
||||
OPENBLAS_SO="$(find "${CACHE_DIR}/lib" -maxdepth 1 -name 'libopenblas.so.*' -type f | sort | head -1)"
|
||||
[ -n "${OPENBLAS_SO}" ] || { echo "ERROR: libopenblas.so not extracted"; exit 1; }
|
||||
SONAME="$(basename "${OPENBLAS_SO}")"
|
||||
ln -sf "${SONAME}" "${CACHE_DIR}/lib/libopenblas.so" 2>/dev/null || true
|
||||
ln -sf "${SONAME}" "${CACHE_DIR}/lib/libblas.so" 2>/dev/null || true
|
||||
|
||||
# ── build HPL ─────────────────────────────────────────────────────────────────
|
||||
BUILD_TMP=$(mktemp -d)
|
||||
|
||||
cd "${BUILD_TMP}"
|
||||
tar xf "${HPL_TAR}"
|
||||
SRC_DIR="$(find . -maxdepth 1 -type d -name 'hpl-*' | head -1)"
|
||||
[ -n "${SRC_DIR}" ] || { echo "ERROR: HPL source dir not found"; exit 1; }
|
||||
cd "${SRC_DIR}"
|
||||
|
||||
# Write a minimal single-process MPI stub so we don't need an MPI package.
|
||||
# HPL only needs these functions for single-process execution.
|
||||
cat > "${BUILD_TMP}/mpi_stub.c" <<'MPISTUB'
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
typedef int MPI_Comm;
|
||||
typedef int MPI_Datatype;
|
||||
typedef int MPI_Op;
|
||||
typedef int MPI_Status;
|
||||
typedef int MPI_Request;
|
||||
|
||||
#define MPI_COMM_WORLD 0
|
||||
#define MPI_SUCCESS 0
|
||||
#define MPI_DOUBLE 6
|
||||
#define MPI_INT 5
|
||||
#define MPI_SUM 0
|
||||
#define MPI_MAX 1
|
||||
#define MPI_MIN 2
|
||||
#define MPI_BYTE 1
|
||||
#define MPI_ANY_SOURCE -1
|
||||
#define MPI_ANY_TAG -1
|
||||
#define MPI_STATUS_IGNORE ((MPI_Status*)0)
|
||||
|
||||
int MPI_Init(int *argc, char ***argv) { (void)argc; (void)argv; return MPI_SUCCESS; }
|
||||
int MPI_Finalize(void) { return MPI_SUCCESS; }
|
||||
int MPI_Comm_rank(MPI_Comm c, int *rank) { (void)c; *rank = 0; return MPI_SUCCESS; }
|
||||
int MPI_Comm_size(MPI_Comm c, int *size) { (void)c; *size = 1; return MPI_SUCCESS; }
|
||||
int MPI_Bcast(void *b, int n, MPI_Datatype t, int r, MPI_Comm c)
|
||||
{ (void)b;(void)n;(void)t;(void)r;(void)c; return MPI_SUCCESS; }
|
||||
int MPI_Reduce(const void *s, void *r, int n, MPI_Datatype t, MPI_Op op, int root, MPI_Comm c) {
|
||||
(void)op;(void)root;(void)c;
|
||||
size_t sz = (t==MPI_DOUBLE)?sizeof(double):(t==MPI_INT)?sizeof(int):1;
|
||||
memcpy(r, s, (size_t)n * sz);
|
||||
return MPI_SUCCESS;
|
||||
}
|
||||
int MPI_Allreduce(const void *s, void *r, int n, MPI_Datatype t, MPI_Op op, MPI_Comm c)
|
||||
{ return MPI_Reduce(s,r,n,t,op,0,c); }
|
||||
int MPI_Send(const void *b, int n, MPI_Datatype t, int d, int tag, MPI_Comm c)
|
||||
{ (void)b;(void)n;(void)t;(void)d;(void)tag;(void)c; return MPI_SUCCESS; }
|
||||
int MPI_Recv(void *b, int n, MPI_Datatype t, int s, int tag, MPI_Comm c, MPI_Status *st)
|
||||
{ (void)b;(void)n;(void)t;(void)s;(void)tag;(void)c;(void)st; return MPI_SUCCESS; }
|
||||
int MPI_Sendrecv(const void *sb, int sn, MPI_Datatype st2, int dest, int stag,
|
||||
void *rb, int rn, MPI_Datatype rt, int src, int rtag,
|
||||
MPI_Comm c, MPI_Status *status)
|
||||
{ (void)sb;(void)sn;(void)st2;(void)dest;(void)stag;
|
||||
(void)rb;(void)rn;(void)rt;(void)src;(void)rtag;(void)c;(void)status;
|
||||
return MPI_SUCCESS; }
|
||||
int MPI_Irecv(void *b, int n, MPI_Datatype t, int s, int tag, MPI_Comm c, MPI_Request *req)
|
||||
{ (void)b;(void)n;(void)t;(void)s;(void)tag;(void)c;(void)req; return MPI_SUCCESS; }
|
||||
int MPI_Wait(MPI_Request *req, MPI_Status *st)
|
||||
{ (void)req;(void)st; return MPI_SUCCESS; }
|
||||
int MPI_Abort(MPI_Comm c, int code) { (void)c; exit(code); }
|
||||
double MPI_Wtime(void) {
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
|
||||
}
|
||||
MPISTUB
|
||||
|
||||
# Write Make.bee — HPL makefile configuration
|
||||
cat > Make.bee <<MAKEFILE
|
||||
SHELL = /bin/sh
|
||||
CD = cd
|
||||
CP = cp
|
||||
LN_S = ln -s
|
||||
MKDIR = mkdir -p
|
||||
RM = /bin/rm -f
|
||||
TOUCH = touch
|
||||
ARCH = bee
|
||||
|
||||
# Directories
|
||||
TOPdir = \$(shell pwd)
|
||||
INCdir = \$(TOPdir)/include
|
||||
BINdir = \$(TOPdir)/bin/\$(ARCH)
|
||||
LIBdir = \$(TOPdir)/lib/\$(ARCH)
|
||||
HPLlib = \$(LIBdir)/libhpl.a
|
||||
|
||||
# Compiler
|
||||
CC = gcc
|
||||
CCNOOPT = \$(HPL_DEFS)
|
||||
CCFLAGS = \$(HPL_DEFS) -O3 -march=native -funroll-loops -fomit-frame-pointer
|
||||
|
||||
# Linker
|
||||
LINKER = gcc
|
||||
LINKFLAGS = \$(CCFLAGS)
|
||||
|
||||
# MPI (single-process stub — no actual MPI needed)
|
||||
MPdir =
|
||||
MPinc = -I${BUILD_TMP}
|
||||
MPlib = ${BUILD_TMP}/mpi_stub.o
|
||||
|
||||
# BLAS (OpenBLAS)
|
||||
LAdir = ${CACHE_DIR}/lib
|
||||
LAinc =
|
||||
LAlib = -L\$(LAdir) -Wl,-rpath,/usr/lib -lopenblas
|
||||
|
||||
HPL_OPTS =
|
||||
HPL_DEFS = \$(HPL_OPTS) -DHPL_CALL_CBLAS
|
||||
MAKEFILE
|
||||
echo "=== Make.bee written ==="
|
||||
|
||||
# compile MPI stub
|
||||
gcc -O2 -c -o "${BUILD_TMP}/mpi_stub.o" "${BUILD_TMP}/mpi_stub.c"
|
||||
|
||||
# build HPL
|
||||
echo "=== building HPL ${HPL_VERSION} ==="
|
||||
make -j"$(nproc)" arch=bee 2>&1 | tail -20
|
||||
|
||||
XHPL_BIN="bin/bee/xhpl"
|
||||
[ -x "${XHPL_BIN}" ] || { echo "ERROR: xhpl not found after build"; exit 1; }
|
||||
|
||||
cp "${XHPL_BIN}" "${CACHE_DIR}/bin/xhpl"
|
||||
chmod +x "${CACHE_DIR}/bin/xhpl"
|
||||
echo "=== HPL build complete ==="
|
||||
echo "binary: ${CACHE_DIR}/bin/xhpl"
|
||||
echo "libs: $(ls "${CACHE_DIR}/lib/")"
|
||||
@@ -1148,19 +1148,6 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
echo "=== john injected ==="
|
||||
fi
|
||||
|
||||
# --- build HPL (CPU LINPACK) — runs on all variants ---
|
||||
run_step "build HPL ${HPL_VERSION}" "80-hpl" \
|
||||
sh "${BUILDER_DIR}/build-hpl.sh" "${HPL_VERSION}" "${HPL_SHA256}" "${DIST_DIR}"
|
||||
|
||||
HPL_CACHE="${DIST_DIR}/hpl-${HPL_VERSION}"
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee"
|
||||
cp "${HPL_CACHE}/bin/xhpl" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/xhpl"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/xhpl"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-hpl" 2>/dev/null || true
|
||||
# Inject OpenBLAS runtime libs needed by xhpl
|
||||
cp "${HPL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
echo "=== HPL injected: xhpl + $(ls "${HPL_CACHE}/lib/" | wc -l) OpenBLAS libs ==="
|
||||
|
||||
# --- embed build metadata ---
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/etc"
|
||||
BUILD_DATE="$(date +%Y-%m-%d)"
|
||||
@@ -1193,7 +1180,6 @@ BUILD_DATE=${BUILD_DATE}
|
||||
GIT_COMMIT=${GIT_COMMIT}
|
||||
DEBIAN_VERSION=${DEBIAN_VERSION}
|
||||
DEBIAN_KERNEL_ABI=${DEBIAN_KERNEL_ABI}
|
||||
HPL_VERSION=${HPL_VERSION}
|
||||
${GPU_VERSION_LINE}
|
||||
EOF
|
||||
|
||||
|
||||
@@ -11,18 +11,18 @@ echo " Hardware Audit LiveCD"
|
||||
echo ""
|
||||
|
||||
menuentry "EASY-BEE" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
submenu "EASY-BEE (advanced options) -->" {
|
||||
menuentry "EASY-BEE — GSP=off" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
menuentry "EASY-BEE — KMS (no nomodeset)" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable nowatchdog nosoftlockup
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
set color_normal=light-gray/black
|
||||
set color_highlight=white/dark-gray
|
||||
set color_highlight=yellow/black
|
||||
|
||||
if [ -e /boot/grub/splash.png ]; then
|
||||
set theme=/boot/grub/live-theme/theme.txt
|
||||
else
|
||||
set menu_color_normal=cyan/black
|
||||
set menu_color_highlight=white/dark-gray
|
||||
set menu_color_normal=yellow/black
|
||||
set menu_color_highlight=white/brown
|
||||
fi
|
||||
|
||||
@@ -3,31 +3,31 @@ label live-@FLAVOUR@-normal
|
||||
menu default
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=normal
|
||||
append @APPEND_LIVE@ bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
|
||||
label live-@FLAVOUR@-kms
|
||||
menu label EASY-BEE (^graphics/KMS)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
|
||||
label live-@FLAVOUR@-toram
|
||||
menu label EASY-BEE (^load to RAM)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal
|
||||
append @APPEND_LIVE@ toram bee.nvidia.mode=normal pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
|
||||
label live-@FLAVOUR@-gsp-off
|
||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
|
||||
label live-@FLAVOUR@-kms-gsp-off
|
||||
menu label EASY-BEE (g^raphics/KMS, GSP=off)
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off
|
||||
append @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=gsp-off pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1
|
||||
|
||||
label live-@FLAVOUR@-failsafe
|
||||
menu label EASY-BEE (^fail-safe)
|
||||
|
||||
@@ -25,6 +25,7 @@ ensure_bee_console_user() {
|
||||
ensure_bee_console_user
|
||||
|
||||
# Enable common bee services
|
||||
systemctl enable bee-hpc-tuning.service
|
||||
systemctl enable bee-network.service
|
||||
systemctl enable bee-preflight.service
|
||||
systemctl enable bee-audit.service
|
||||
@@ -55,6 +56,7 @@ fi
|
||||
# nogpu: no GPU services needed
|
||||
|
||||
# Ensure scripts are executable
|
||||
chmod +x /usr/local/bin/bee-hpc-tuning 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-network.sh 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-sshsetup 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-smoketest 2>/dev/null || true
|
||||
|
||||
@@ -10,20 +10,15 @@ import os
|
||||
|
||||
W, H = 1920, 1080
|
||||
|
||||
GLYPHS = {
|
||||
'E': ["11111", "10000", "11110", "10000", "10000", "10000", "11111"],
|
||||
'A': ["01110", "10001", "10001", "11111", "10001", "10001", "10001"],
|
||||
'S': ["01111", "10000", "10000", "01110", "00001", "00001", "11110"],
|
||||
'Y': ["10001", "10001", "01010", "00100", "00100", "00100", "00100"],
|
||||
'B': ["11110", "10001", "10001", "11110", "10001", "10001", "11110"],
|
||||
'-': ["00000", "00000", "11111", "00000", "00000", "00000", "00000"],
|
||||
}
|
||||
|
||||
TITLE = "EASY-BEE"
|
||||
SUBTITLE = "Hardware Audit LiveCD"
|
||||
CELL = 30
|
||||
GLYPH_GAP = 18
|
||||
ROW_GAP = 6
|
||||
ASCII_ART = [
|
||||
" ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗",
|
||||
" ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝",
|
||||
" █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗",
|
||||
" ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝",
|
||||
" ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗",
|
||||
" ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝",
|
||||
]
|
||||
SUBTITLE = " Hardware Audit LiveCD"
|
||||
|
||||
FG = (0xF6, 0xD0, 0x47)
|
||||
FG_DIM = (0xD4, 0xA9, 0x1C)
|
||||
@@ -31,6 +26,12 @@ SHADOW = (0x5E, 0x47, 0x05)
|
||||
SUB = (0x96, 0x7A, 0x17)
|
||||
BG = (0x05, 0x05, 0x05)
|
||||
|
||||
MONO_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSansMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/freefont/FreeMonoBold.ttf',
|
||||
]
|
||||
SUB_FONT_CANDIDATES = [
|
||||
'/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf',
|
||||
'/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf',
|
||||
@@ -39,43 +40,34 @@ SUB_FONT_CANDIDATES = [
|
||||
]
|
||||
|
||||
|
||||
def load_font(size):
|
||||
for path in SUB_FONT_CANDIDATES:
|
||||
def load_font(candidates, size):
|
||||
for path in candidates:
|
||||
if os.path.exists(path):
|
||||
return ImageFont.truetype(path, size)
|
||||
return ImageFont.load_default()
|
||||
|
||||
|
||||
def glyph_width(ch):
|
||||
return len(GLYPHS[ch][0])
|
||||
def mono_metrics(font):
|
||||
probe = Image.new('L', (W, H), 0)
|
||||
draw = ImageDraw.Draw(probe)
|
||||
char_w = int(round(draw.textlength("M", font=font)))
|
||||
bb = draw.textbbox((0, 0), "Mg", font=font)
|
||||
char_h = bb[3] - bb[1]
|
||||
return char_w, char_h
|
||||
|
||||
|
||||
def render_logo_mask():
|
||||
width_cells = 0
|
||||
for idx, ch in enumerate(TITLE):
|
||||
width_cells += glyph_width(ch)
|
||||
if idx != len(TITLE) - 1:
|
||||
width_cells += 1
|
||||
mask_w = width_cells * CELL + (len(TITLE) - 1) * GLYPH_GAP
|
||||
mask_h = 7 * CELL + 6 * ROW_GAP
|
||||
mask = Image.new('L', (mask_w, mask_h), 0)
|
||||
def render_ascii_mask(font, lines, char_w, char_h, line_gap):
|
||||
width = max(len(line) for line in lines) * char_w
|
||||
height = len(lines) * char_h + line_gap * (len(lines) - 1)
|
||||
mask = Image.new('L', (width, height), 0)
|
||||
draw = ImageDraw.Draw(mask)
|
||||
|
||||
cx = 0
|
||||
for idx, ch in enumerate(TITLE):
|
||||
glyph = GLYPHS[ch]
|
||||
for row_idx, row in enumerate(glyph):
|
||||
for col_idx, cell in enumerate(row):
|
||||
if cell != '1':
|
||||
continue
|
||||
x0 = cx + col_idx * CELL
|
||||
y0 = row_idx * (CELL + ROW_GAP)
|
||||
x1 = x0 + CELL - 4
|
||||
y1 = y0 + CELL - 4
|
||||
draw.rounded_rectangle((x0, y0, x1, y1), radius=4, fill=255)
|
||||
cx += glyph_width(ch) * CELL
|
||||
if idx != len(TITLE) - 1:
|
||||
cx += CELL + GLYPH_GAP
|
||||
for row, line in enumerate(lines):
|
||||
y = row * (char_h + line_gap)
|
||||
for col, ch in enumerate(line):
|
||||
if ch == ' ':
|
||||
continue
|
||||
x = col * char_w
|
||||
draw.text((x, y), ch, font=font, fill=255)
|
||||
return mask
|
||||
|
||||
|
||||
@@ -90,20 +82,28 @@ glow_draw.ellipse((520, 340, 1400, 760), fill=(255, 190, 40, 36))
|
||||
glow = glow.filter(ImageFilter.GaussianBlur(60))
|
||||
img = Image.alpha_composite(img.convert('RGBA'), glow)
|
||||
|
||||
logo_mask = render_logo_mask()
|
||||
TARGET_LOGO_W = 400
|
||||
max_chars = max(len(line) for line in ASCII_ART)
|
||||
_probe_font = load_font(MONO_FONT_CANDIDATES, 64)
|
||||
_probe_cw, _ = mono_metrics(_probe_font)
|
||||
font_size_logo = max(6, int(64 * TARGET_LOGO_W / (_probe_cw * max_chars)))
|
||||
font_logo = load_font(MONO_FONT_CANDIDATES, font_size_logo)
|
||||
char_w, char_h = mono_metrics(font_logo)
|
||||
logo_mask = render_ascii_mask(font_logo, ASCII_ART, char_w, char_h, 2)
|
||||
logo_w, logo_h = logo_mask.size
|
||||
logo_x = (W - logo_w) // 2
|
||||
logo_y = 290
|
||||
logo_y = 380
|
||||
|
||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(2))
|
||||
img.paste(SHADOW, (logo_x + 16, logo_y + 14), shadow_mask)
|
||||
img.paste(FG_DIM, (logo_x + 8, logo_y + 7), logo_mask)
|
||||
sh_off = max(1, font_size_logo // 6)
|
||||
shadow_mask = logo_mask.filter(ImageFilter.GaussianBlur(1))
|
||||
img.paste(SHADOW, (logo_x + sh_off * 2, logo_y + sh_off * 2), shadow_mask)
|
||||
img.paste(FG_DIM, (logo_x + sh_off, logo_y + sh_off), logo_mask)
|
||||
img.paste(FG, (logo_x, logo_y), logo_mask)
|
||||
|
||||
font_sub = load_font(30)
|
||||
font_sub = load_font(SUB_FONT_CANDIDATES, 30)
|
||||
sub_bb = draw.textbbox((0, 0), SUBTITLE, font=font_sub)
|
||||
sub_x = (W - (sub_bb[2] - sub_bb[0])) // 2
|
||||
sub_y = logo_y + logo_h + 54
|
||||
sub_y = logo_y + logo_h + 48
|
||||
draw = ImageDraw.Draw(img)
|
||||
draw.text((sub_x + 2, sub_y + 2), SUBTITLE, font=font_sub, fill=(35, 28, 6))
|
||||
draw.text((sub_x, sub_y), SUBTITLE, font=font_sub, fill=SUB)
|
||||
|
||||
14
iso/overlay/etc/systemd/system/bee-hpc-tuning.service
Normal file
14
iso/overlay/etc/systemd/system/bee-hpc-tuning.service
Normal file
@@ -0,0 +1,14 @@
|
||||
[Unit]
|
||||
Description=Bee: HPC tuning (CPU governor, C-states)
|
||||
After=local-fs.target
|
||||
Before=bee-nvidia.service bee-audit.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/bee-log-run /appdata/bee/export/bee-hpc-tuning.log /usr/local/bin/bee-hpc-tuning
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
RemainAfterExit=yes
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
110
iso/overlay/usr/local/bin/bee-dcgmproftester-staggered
Executable file
@@ -0,0 +1,110 @@
|
||||
#!/bin/sh
|
||||
set -eu
|
||||
|
||||
SECONDS=300
|
||||
STAGGER_SECONDS=180
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
|
||||
# usage
#
# Write the command-line synopsis to stderr and terminate the script with
# exit status 2.
usage() {
    printf 'usage: %s [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3]\n' "$0" >&2
    exit 2
}
|
||||
|
||||
# normalize_list CSV
#
# Canonicalise a comma-separated list: split on commas, strip all
# whitespace from each entry, drop empty entries, sort numerically,
# remove adjacent duplicates and re-join with commas. An unset or empty
# argument yields an empty result.
normalize_list() {
    echo "${1:-}" \
        | tr ',' '\n' \
        | sed 's/[[:space:]]//g' \
        | awk 'NF' \
        | sort -n \
        | uniq \
        | paste -sd, -
}
|
||||
|
||||
# contains_csv NEEDLE HAYSTACK
#
# Succeed (status 0) when NEEDLE is one of the entries of the
# comma-separated list HAYSTACK, fail (status 1) otherwise.
#
# The comparison is a literal whole-entry match: both sides are wrapped in
# commas so "1" does not match "10", and the quoted case pattern keeps
# NEEDLE from being interpreted as a regular expression or glob.
contains_csv() {
    needle="$1"
    haystack="${2:-}"
    case ",${haystack}," in
        *",${needle},"*) return 0 ;;
    esac
    return 1
}
|
||||
|
||||
# resolve_dcgmproftester
#
# Print the location of the first dcgmproftester binary found via
# `command -v`, trying the unversioned name first and then the 13, 12 and
# 11 suffixed names. Returns 0 on success, 1 when none is available.
resolve_dcgmproftester() {
    for name in dcgmproftester dcgmproftester13 dcgmproftester12 dcgmproftester11; do
        # Skip names that do not resolve to a command.
        command -v "${name}" >/dev/null 2>&1 || continue
        command -v "${name}"
        return 0
    done
    return 1
}
|
||||
|
||||
# ── parse command line ──────────────────────────────────────────────────────
while [ "$#" -gt 0 ]; do
    case "$1" in
        --seconds|-t)      [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
        --stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
        --devices)         [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
        --exclude)         [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
        *) usage ;;
    esac
done

# Both durations feed shell arithmetic below; reject anything that is not a
# plain non-negative integer now instead of failing mid-run.
for value in "${SECONDS}" "${STAGGER_SECONDS}"; do
    case "${value}" in
        ''|*[!0-9]*) usage ;;
    esac
done

# Snapshot the requested duration immediately: bash and ksh treat SECONDS as
# an auto-incrementing timer, which would inflate the per-GPU durations
# computed after each stagger sleep if /bin/sh is one of those shells.
RUN_SECONDS="${SECONDS}"

# ── select GPUs ─────────────────────────────────────────────────────────────
PROF=$(resolve_dcgmproftester) || { echo "dcgmproftester not found in PATH" >&2; exit 1; }
ALL_DEVICES=$(nvidia-smi --query-gpu=index --format=csv,noheader,nounits 2>/dev/null | sed 's/[[:space:]]//g' | awk 'NF' | paste -sd, -)
[ -n "${ALL_DEVICES}" ] || { echo "nvidia-smi found no NVIDIA GPUs" >&2; exit 1; }

DEVICES=$(normalize_list "${DEVICES}")
EXCLUDE=$(normalize_list "${EXCLUDE}")
# No explicit --devices means "every GPU nvidia-smi reported".
SELECTED="${DEVICES}"
if [ -z "${SELECTED}" ]; then
    SELECTED="${ALL_DEVICES}"
fi

# Drop excluded indices while preserving the selection order.
FINAL=""
for id in $(echo "${SELECTED}" | tr ',' ' '); do
    [ -n "${id}" ] || continue
    if contains_csv "${id}" "${EXCLUDE}"; then
        continue
    fi
    if [ -z "${FINAL}" ]; then
        FINAL="${id}"
    else
        FINAL="${FINAL},${id}"
    fi
done

[ -n "${FINAL}" ] || { echo "no NVIDIA GPUs selected after filters" >&2; exit 1; }

echo "loader=dcgmproftester-staggered"
echo "selected_gpus=${FINAL}"
echo "stagger_seconds=${STAGGER_SECONDS}"

# The indices above come from nvidia-smi (PCI bus order). Make CUDA enumerate
# in the same order so CUDA_VISIBLE_DEVICES=<index> addresses the same GPU,
# matching what bee-gpu-burn does.
export CUDA_DEVICE_ORDER="PCI_BUS_ID"

TMP_DIR=$(mktemp -d)
WORKERS=""

# Stop every worker started so far (used when the script is interrupted).
kill_workers() {
    for worker in ${WORKERS}; do
        kill "${worker%%:*}" 2>/dev/null || true
    done
}

# Always remove the log directory; on INT/TERM also stop the GPU load and
# exit, so an interrupted run does not leave workers burning in the background.
trap 'rm -rf "${TMP_DIR}"' EXIT
trap 'kill_workers; exit 130' INT
trap 'kill_workers; exit 143' TERM

# ── launch one worker per GPU, staggered ────────────────────────────────────
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
gpu_pos=0
for id in $(echo "${FINAL}" | tr ',' ' '); do
    gpu_pos=$((gpu_pos + 1))
    log="${TMP_DIR}/gpu-${id}.log"
    # Earlier GPUs run longer by the total remaining stagger time, so that
    # all workers finish together after the last one has run RUN_SECONDS.
    extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
    gpu_seconds=$(( RUN_SECONDS + extra_sec ))
    echo "starting gpu ${id} seconds=${gpu_seconds}"
    CUDA_VISIBLE_DEVICES="${id}" "${PROF}" --no-dcgm-validation -t 1004 -d "${gpu_seconds}" >"${log}" 2>&1 &
    pid=$!
    WORKERS="${WORKERS} ${pid}:${id}:${log}"
    # No delay after the last GPU.
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
        sleep "${STAGGER_SECONDS}"
    fi
done

# ── collect results ─────────────────────────────────────────────────────────
status=0
for spec in ${WORKERS}; do
    # Each spec is "<pid>:<gpu index>:<log path>".
    pid=${spec%%:*}
    rest=${spec#*:}
    id=${rest%%:*}
    log=${rest#*:}
    if wait "${pid}"; then
        echo "gpu ${id} finished: OK"
    else
        rc=$?
        echo "gpu ${id} finished: FAILED rc=${rc}"
        status=1
    fi
    # Replay the worker's output with a per-GPU prefix.
    sed "s/^/[gpu ${id}] /" "${log}" || true
done

exit "${status}"
|
||||
17
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
17
iso/overlay/usr/local/bin/bee-gpu-burn
Normal file → Executable file
@@ -2,13 +2,14 @@
|
||||
set -eu
|
||||
|
||||
SECONDS=5
|
||||
STAGGER_SECONDS=0
|
||||
SIZE_MB=0
|
||||
DEVICES=""
|
||||
EXCLUDE=""
|
||||
WORKER="/usr/local/lib/bee/bee-gpu-burn-worker"
|
||||
|
||||
usage() {
|
||||
echo "usage: $0 [--seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||
echo "usage: $0 [--seconds N] [--stagger-seconds N] [--size-mb N] [--devices 0,1] [--exclude 2,3]" >&2
|
||||
exit 2
|
||||
}
|
||||
|
||||
@@ -25,6 +26,7 @@ contains_csv() {
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; SECONDS="$2"; shift 2 ;;
|
||||
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||
--size-mb|-m) [ "$#" -ge 2 ] || usage; SIZE_MB="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
@@ -61,14 +63,18 @@ done
|
||||
|
||||
echo "loader=bee-gpu-burn"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||
|
||||
export CUDA_DEVICE_ORDER="PCI_BUS_ID"
|
||||
|
||||
TMP_DIR=$(mktemp -d)
|
||||
trap 'rm -rf "${TMP_DIR}"' EXIT INT TERM
|
||||
|
||||
GPU_COUNT=$(echo "${FINAL}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
|
||||
gpu_pos=0
|
||||
WORKERS=""
|
||||
for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
gpu_pos=$((gpu_pos + 1))
|
||||
log="${TMP_DIR}/gpu-${id}.log"
|
||||
gpu_size_mb="${SIZE_MB}"
|
||||
if [ "${gpu_size_mb}" -le 0 ] 2>/dev/null; then
|
||||
@@ -79,11 +85,16 @@ for id in $(echo "${FINAL}" | tr ',' ' '); do
|
||||
gpu_size_mb=512
|
||||
fi
|
||||
fi
|
||||
echo "starting gpu ${id} size=${gpu_size_mb}MB"
|
||||
extra_sec=$(( STAGGER_SECONDS * (GPU_COUNT - gpu_pos) ))
|
||||
gpu_seconds=$(( SECONDS + extra_sec ))
|
||||
echo "starting gpu ${id} size=${gpu_size_mb}MB seconds=${gpu_seconds}"
|
||||
CUDA_VISIBLE_DEVICES="${id}" \
|
||||
"${WORKER}" --device 0 --seconds "${SECONDS}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||
"${WORKER}" --device 0 --seconds "${gpu_seconds}" --size-mb "${gpu_size_mb}" >"${log}" 2>&1 &
|
||||
pid=$!
|
||||
WORKERS="${WORKERS} ${pid}:${id}:${log}"
|
||||
if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${gpu_pos}" -lt "${GPU_COUNT}" ]; then
|
||||
sleep "${STAGGER_SECONDS}"
|
||||
fi
|
||||
done
|
||||
|
||||
status=0
|
||||
|
||||
41
iso/overlay/usr/local/bin/bee-hpc-tuning
Normal file
41
iso/overlay/usr/local/bin/bee-hpc-tuning
Normal file
@@ -0,0 +1,41 @@
|
||||
#!/bin/sh
# bee-hpc-tuning — apply HPC tuning for deterministic benchmarking.
# Invoked by bee-hpc-tuning.service at boot.

# log MESSAGE... — print a message tagged with this script's name.
log() {
    echo "[bee-hpc-tuning] $*"
}

# ── CPU governor ────────────────────────────────────────────────────────────
# cpupower is not available, so the performance governor is selected by
# writing to each core's scaling_governor file in sysfs.
set_ok=0
set_failed=0
for sysfs_node in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do
    # An unmatched glob stays literal; skip anything that is not a real file.
    if [ ! -f "$sysfs_node" ]; then
        continue
    fi
    if echo performance > "$sysfs_node" 2>/dev/null; then
        set_ok=$((set_ok + 1))
    else
        set_failed=$((set_failed + 1))
    fi
done

# Report exactly one summary line for the governor step.
if [ "$set_ok" -eq 0 ] && [ "$set_failed" -eq 0 ]; then
    log "WARN: no cpufreq scaling_governor paths found (C-state governor or HW-controlled)"
elif [ "$set_ok" -eq 0 ]; then
    log "WARN: failed to set CPU governor on ${set_failed} core(s)"
elif [ "$set_failed" -eq 0 ]; then
    log "CPU governor set to performance on ${set_ok} core(s)"
else
    log "WARN: CPU governor: ${set_ok} OK, ${set_failed} failed"
fi

# ── Transparent Huge Pages ───────────────────────────────────────────────────
# The kernel command line is expected to set transparent_hugepage=always;
# this step only reads the current setting back and logs it.
thp_sysfs=/sys/kernel/mm/transparent_hugepage/enabled
if [ ! -f "$thp_sysfs" ]; then
    log "WARN: transparent_hugepage sysfs path not found"
else
    thp_state=$(cat "$thp_sysfs" 2>/dev/null)
    log "transparent_hugepage: ${thp_state}"
fi

log "done"
|
||||
@@ -1,97 +0,0 @@
|
||||
#!/bin/sh
# bee-hpl — run HPL (High Performance LINPACK) with auto-sized problem.
#
# Generates HPL.dat based on available RAM, runs xhpl, and prints standard
# HPL output. The WR... line with Gflops is parsed by the bee audit tool.
#
# Usage: bee-hpl [--mem-fraction 0.80] [--nb 256] [--seconds N]
#
# --mem-fraction fraction of total RAM to use for the matrix (default 0.80);
#                must be a number in (0, 1]
# --nb block size; 256 is good for modern CPUs (default 256);
#                must be a positive integer
# --seconds ignored — HPL runtime is determined by problem size; kept
# for interface compatibility with other bee stress tools
#
# Exit status: 2 on bad arguments, 1 when xhpl or MemTotal is unavailable,
# otherwise the exit status of xhpl.

set -eu

XHPL="/usr/local/lib/bee/xhpl"
MEM_FRACTION="0.80"
NB=256

# usage — print the accepted flags to stderr and exit with status 2.
usage() {
    echo "usage: $0 [--mem-fraction 0.80] [--nb 256] [--seconds N]" >&2
    exit 2
}

while [ "$#" -gt 0 ]; do
    case "$1" in
        --mem-fraction) [ "$#" -ge 2 ] || usage; MEM_FRACTION="$2"; shift 2 ;;
        --nb) [ "$#" -ge 2 ] || usage; NB="$2"; shift 2 ;;
        --seconds) [ "$#" -ge 2 ] || usage; shift 2 ;; # accepted, ignored
        *) usage ;;
    esac
done

# Reject values that would break the sizing arithmetic below: a zero or
# non-numeric block size divides by zero in awk, and a fraction outside
# (0, 1] yields a meaningless matrix size.
case "${NB}" in
    ''|*[!0-9]*) echo "ERROR: --nb must be a positive integer" >&2; usage ;;
esac
[ "${NB}" -gt 0 ] || { echo "ERROR: --nb must be a positive integer" >&2; usage; }
awk -v frac="${MEM_FRACTION}" 'BEGIN { exit !(frac ~ /^[0-9]*\.?[0-9]+$/ && frac + 0 > 0 && frac + 0 <= 1) }' \
    || { echo "ERROR: --mem-fraction must be a number in (0, 1]" >&2; usage; }

[ -x "${XHPL}" ] || { echo "ERROR: xhpl not found at ${XHPL}" >&2; exit 1; }

# Detect total RAM (MemTotal is reported in kB) and convert to bytes.
TOTAL_KB=$(awk '/^MemTotal:/ { print $2; exit }' /proc/meminfo)
[ -n "${TOTAL_KB}" ] || { echo "ERROR: cannot read MemTotal from /proc/meminfo" >&2; exit 1; }
TOTAL_BYTES=$(( TOTAL_KB * 1024 ))

# N = floor(sqrt(fraction * total_bytes / 8)) rounded down to multiple of NB
# (8 bytes per double-precision matrix element), never smaller than one block.
# Use awk for floating-point sqrt
N=$(awk -v total="${TOTAL_BYTES}" -v frac="${MEM_FRACTION}" -v nb="${NB}" '
BEGIN {
    raw = int(sqrt(total * frac / 8.0))
    n = int(raw / nb) * nb
    if (n < nb) n = nb
    print n
}')

echo "loader=bee-hpl"
echo "total_ram_mb=$(( TOTAL_KB / 1024 ))"
echo "matrix_n=${N}"
echo "block_nb=${NB}"
echo "mem_fraction=${MEM_FRACTION}"

# Generate HPL.dat in a temp directory and run from there
RUNDIR=$(mktemp -d)
trap 'rm -rf "${RUNDIR}"' EXIT INT TERM

# Single problem size, single block size, 1x1 process grid.
cat > "${RUNDIR}/HPL.dat" <<DAT
HPLinpack benchmark input file
Innovative Computing Laboratory, University of Tennessee
HPL.out output file name (if any)
6 device out (6=stdout, 7=stderr, file)
1 # of problems sizes (N)
${N} Ns
1 # of NBs
${NB} NBs
0 PMAP process mapping (0=Row-,1=Column-major)
1 # of process grids (P x Q)
1 Ps
1 Qs
16.0 threshold
1 # of panel fact
2 PFACTs (0=left, 1=Crout, 2=Right)
1 # of recursive stopping criterium
4 NBMINs (>= 1)
1 # of panels in recursion
2 NDIVs
1 # of recursive panel fact.
1 RFACTs (0=left, 1=Crout, 2=Right)
1 # of broadcast
1 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM)
1 # of lookahead depth
1 DEPTHs (>=0)
2 SWAP (0=bin-exch,1=long,2=mix)
64 swapping threshold
0 L1 in (0=transposed,1=no-transposed) form
0 U in (0=transposed,1=no-transposed) form
1 Equilibration (0=no,1=yes)
8 memory alignment in double (> 0)
DAT

# xhpl is started from the directory that holds the generated HPL.dat.
cd "${RUNDIR}"
echo "---"
"${XHPL}"
|
||||
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
16
iso/overlay/usr/local/bin/bee-john-gpu-stress
Normal file → Executable file
@@ -2,6 +2,7 @@
|
||||
# Abort on the first failing command or on use of an unset variable.
set -eu
|
||||
|
||||
# Target stress duration in seconds; overridden by --seconds / -t.
DURATION_SEC=300
|
||||
# Delay between successive per-device launches; overridden by --stagger-seconds.
STAGGER_SECONDS=0
|
||||
# GPU index lists (e.g. "0,1") set by --devices and --exclude.
DEVICES=""
|
||||
EXCLUDE=""
|
||||
# Format name set by --format; empty unless the caller supplies one.
FORMAT=""
|
||||
@@ -12,7 +13,7 @@ export OCL_ICD_VENDORS="/etc/OpenCL/vendors"
|
||||
export LD_LIBRARY_PATH="/usr/lib:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
|
||||
|
||||
# usage — print the supported command-line flags to stderr and exit with
# status 2. Only the current flag set (including --stagger-seconds) is shown.
usage() {
    echo "usage: $0 [--seconds N] [--stagger-seconds N] [--devices 0,1] [--exclude 2,3] [--format name]" >&2
    exit 2
}
|
||||
|
||||
@@ -118,6 +119,7 @@ ensure_opencl_ready() {
|
||||
while [ "$#" -gt 0 ]; do
|
||||
case "$1" in
|
||||
--seconds|-t) [ "$#" -ge 2 ] || usage; DURATION_SEC="$2"; shift 2 ;;
|
||||
--stagger-seconds) [ "$#" -ge 2 ] || usage; STAGGER_SECONDS="$2"; shift 2 ;;
|
||||
--devices) [ "$#" -ge 2 ] || usage; DEVICES="$2"; shift 2 ;;
|
||||
--exclude) [ "$#" -ge 2 ] || usage; EXCLUDE="$2"; shift 2 ;;
|
||||
--format) [ "$#" -ge 2 ] || usage; FORMAT="$2"; shift 2 ;;
|
||||
@@ -170,6 +172,7 @@ done
|
||||
echo "loader=john"
|
||||
echo "selected_gpus=${FINAL}"
|
||||
echo "john_devices=${JOHN_DEVICES}"
|
||||
echo "stagger_seconds=${STAGGER_SECONDS}"
|
||||
|
||||
cd "${JOHN_DIR}"
|
||||
|
||||
@@ -232,14 +235,21 @@ trap cleanup EXIT INT TERM
|
||||
echo "format=${CHOSEN_FORMAT}"
echo "target_seconds=${DURATION_SEC}"
echo "slice_seconds=${TEST_SLICE_SECONDS}"

# NOTE(review): DEADLINE is no longer passed to any worker in this loop; it is
# kept only in case code outside this section still reads it — confirm and drop.
DEADLINE=$(( $(date +%s) + DURATION_SEC ))

TOTAL_DEVICES=$(echo "${JOHN_DEVICES}" | tr ',' '\n' | awk 'NF' | wc -l | tr -d '[:space:]')
_first=1
pos=0

# Start exactly one run_john_loop per OpenCL device. Each worker gets its own
# deadline: the target duration plus STAGGER_SECONDS for every device that is
# still to be launched after it, so earlier starters run longer.
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
    pos=$((pos + 1))
    # Fixed 3 s gap before every launch except the first.
    [ "${_first}" = "1" ] || sleep 3
    _first=0
    extra_sec=$(( STAGGER_SECONDS * (TOTAL_DEVICES - pos) ))
    deadline=$(( $(date +%s) + DURATION_SEC + extra_sec ))
    run_john_loop "${opencl_id}" "${deadline}" &
    pid=$!
    PIDS="${PIDS} ${pid}"
    # No stagger delay after the last device has been started.
    if [ "${STAGGER_SECONDS}" -gt 0 ] && [ "${pos}" -lt "${TOTAL_DEVICES}" ]; then
        sleep "${STAGGER_SECONDS}"
    fi
done
|
||||
FAIL=0
|
||||
for pid in ${PIDS}; do
|
||||
|
||||
@@ -21,8 +21,13 @@ read_nvidia_modules_flavor() {
|
||||
|
||||
log "kernel: $(uname -r)"
|
||||
|
||||
# Skip if no NVIDIA display/compute GPU is present.
# Match only display-class PCI functions (0300 VGA, 0302 3D controller) from vendor 10de.
# Returns 0 when at least one such function is listed by lspci, 1 otherwise
# (including when lspci produces no output).
have_nvidia_gpu() {
    lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
}

if ! have_nvidia_gpu; then
    log "no NVIDIA GPU detected — skipping module load"
    exit 0
fi
|
||||
|
||||
@@ -14,7 +14,7 @@ log() {
|
||||
}
|
||||
|
||||
# have_nvidia_gpu — succeed only when lspci lists a display-class PCI function
# (class 0300 or 0302) whose vendor ID is 10de. A vendor-only match is not
# used, so 10de functions of other classes do not count.
# Returns 0 when such a function is found, 1 otherwise.
have_nvidia_gpu() {
    lspci -Dn 2>/dev/null | awk '$2 ~ /^03(00|02):$/ && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
}
|
||||
|
||||
service_active() {
|
||||
|
||||
Reference in New Issue
Block a user