Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4262c5b798 | ||
|
|
b2e177af31 | ||
|
|
271dadda03 | ||
|
|
20766ccc76 | ||
|
|
966944d6d8 | ||
| ce6b1e0eb7 | |||
| 4066e842a9 | |||
| 7d2e904d14 | |||
| 2320925433 | |||
| e169a7722c | |||
| 74a3c65f64 | |||
| 884988cb2a |
File diff suppressed because it is too large
Load Diff
405
audit/internal/app/app_format.go
Normal file
405
audit/internal/app/app_format.go
Normal file
@@ -0,0 +1,405 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// hostnameOr returns the hostname reported by the OS. When the lookup fails
// or the reported name is blank, fallback is returned instead.
func hostnameOr(fallback string) string {
	name, err := os.Hostname()
	switch {
	case err != nil:
		return fallback
	case strings.TrimSpace(name) == "":
		return fallback
	}
	return name
}
|
||||
|
||||
// sanitizeFilename maps v onto the character set [A-Za-z0-9._-]; every other
// rune is replaced by '-'. An empty input yields "unknown".
func sanitizeFilename(v string) string {
	if v == "" {
		return "unknown"
	}
	keepOrDash := func(r rune) rune {
		switch {
		case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
			return r
		case r == '-' || r == '_' || r == '.':
			return r
		}
		return '-'
	}
	return strings.Map(keepOrDash, v)
}
|
||||
|
||||
// bodyOr returns body with surrounding whitespace removed, or fallback when
// nothing is left after trimming.
func bodyOr(body, fallback string) string {
	if trimmed := strings.TrimSpace(body); trimmed != "" {
		return trimmed
	}
	return fallback
}
|
||||
|
||||
// trimPtr dereferences value and trims surrounding whitespace. A nil pointer
// yields the empty string.
func trimPtr(value *string) string {
	var s string
	if value != nil {
		s = strings.TrimSpace(*value)
	}
	return s
}
|
||||
|
||||
// joinSortedKeys returns the keys of values in ascending order, joined with
// "/". An empty or nil set yields the empty string.
func joinSortedKeys(values map[string]struct{}) string {
	names := make([]string, 0, len(values))
	for name := range values {
		names = append(names, name)
	}
	sort.Strings(names)
	// Joining zero elements produces "", matching the empty-set result.
	return strings.Join(names, "/")
}
|
||||
|
||||
// humanizeMB formats a size given in MB using 1024-based units.
// Non-positive sizes yield "". Sizes of at least 1024 GB are shown in TB with
// one decimal; smaller sizes are shown in GB, without decimals when the value
// is a whole number of GB and with one decimal otherwise.
func humanizeMB(totalMB int) string {
	const unit = 1024
	if totalMB <= 0 {
		return ""
	}
	gigs := float64(totalMB) / unit
	switch {
	case gigs >= unit:
		return fmt.Sprintf("%.1f TB", gigs/unit)
	case totalMB%unit == 0:
		// A whole number of GB is printed without a fractional part.
		return fmt.Sprintf("%d GB", totalMB/unit)
	}
	return fmt.Sprintf("%.1f GB", gigs)
}
|
||||
|
||||
// humanizeGB formats a size given in GB. Non-positive sizes yield "", sizes
// of at least 1024 GB are shown in TB with one decimal, and everything else
// is shown as a whole number of GB.
func humanizeGB(totalGB int) string {
	const gbPerTB = 1024
	switch {
	case totalGB <= 0:
		return ""
	case totalGB >= gbPerTB:
		return fmt.Sprintf("%.1f TB", float64(totalGB)/gbPerTB)
	}
	return fmt.Sprintf("%d GB", totalGB)
}
|
||||
|
||||
// parseKeyValueSummary parses newline-separated "key=value" text into a map.
// Each line is trimmed and split at its first "="; lines without "=" are
// ignored. Keys and values are trimmed, and later duplicates overwrite
// earlier ones.
func parseKeyValueSummary(raw string) map[string]string {
	pairs := make(map[string]string)
	for _, entry := range strings.Split(raw, "\n") {
		k, v, found := strings.Cut(strings.TrimSpace(entry), "=")
		if !found {
			// Covers blank lines as well as lines that carry no "=".
			continue
		}
		pairs[strings.TrimSpace(k)] = strings.TrimSpace(v)
	}
	return pairs
}
|
||||
|
||||
// firstNonEmpty returns the first argument that is non-blank after trimming,
// in its trimmed form. It returns "" when every argument is blank.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if candidate := strings.TrimSpace(values[i]); candidate != "" {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
// cleanSummaryKey strips a leading "<digits>-" prefix from key, so that
// "01-foo" becomes "foo". Keys without "-", keys starting with "-", and keys
// whose text before the first "-" contains a non-digit are returned unchanged.
func cleanSummaryKey(key string) string {
	head, tail, found := strings.Cut(key, "-")
	if !found || head == "" {
		return key
	}
	nonDigit := func(r rune) bool { return r < '0' || r > '9' }
	if strings.IndexFunc(head, nonDigit) >= 0 {
		return key
	}
	return tail
}
|
||||
|
||||
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
// Exclude Aspeed BMC VGA adapters (not compute GPUs).
|
||||
if dev.VendorID != nil && *dev.VendorID == collector.AspeedVendorID {
|
||||
return false
|
||||
}
|
||||
class := trimPtr(dev.DeviceClass)
|
||||
// AMD Instinct / Radeon compute GPUs always carry ProcessingAccelerator or DisplayController.
|
||||
// Do NOT match AMD vendor alone — CPU chipset PCIe devices share that vendor ID.
|
||||
if class == "VideoController" || class == "DisplayController" || class == "ProcessingAccelerator" {
|
||||
return true
|
||||
}
|
||||
// NVIDIA devices sometimes expose class values outside the standard GPU set.
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||
}
|
||||
|
||||
func formatSystemLine(board schema.HardwareBoard) string {
|
||||
model := strings.TrimSpace(strings.Join([]string{
|
||||
trimPtr(board.Manufacturer),
|
||||
trimPtr(board.ProductName),
|
||||
}, " "))
|
||||
serial := strings.TrimSpace(board.SerialNumber)
|
||||
switch {
|
||||
case model != "" && serial != "":
|
||||
return fmt.Sprintf("System: %s | S/N %s", model, serial)
|
||||
case model != "":
|
||||
return "System: " + model
|
||||
case serial != "":
|
||||
return "System S/N: " + serial
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
func formatCPULine(cpus []schema.HardwareCPU) string {
|
||||
if len(cpus) == 0 {
|
||||
return ""
|
||||
}
|
||||
modelCounts := map[string]int{}
|
||||
unknown := 0
|
||||
for _, cpu := range cpus {
|
||||
model := trimPtr(cpu.Model)
|
||||
if model == "" {
|
||||
unknown++
|
||||
continue
|
||||
}
|
||||
modelCounts[model]++
|
||||
}
|
||||
if len(modelCounts) == 1 && unknown == 0 {
|
||||
for model, count := range modelCounts {
|
||||
return fmt.Sprintf("CPU: %d x %s", count, model)
|
||||
}
|
||||
}
|
||||
parts := make([]string, 0, len(modelCounts)+1)
|
||||
if len(modelCounts) > 0 {
|
||||
keys := make([]string, 0, len(modelCounts))
|
||||
for key := range modelCounts {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key))
|
||||
}
|
||||
}
|
||||
if unknown > 0 {
|
||||
parts = append(parts, fmt.Sprintf("%d x unknown", unknown))
|
||||
}
|
||||
return "CPU: " + strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
func formatMemoryLine(dimms []schema.HardwareMemory) string {
|
||||
totalMB := 0
|
||||
present := 0
|
||||
types := map[string]struct{}{}
|
||||
for _, dimm := range dimms {
|
||||
if dimm.Present != nil && !*dimm.Present {
|
||||
continue
|
||||
}
|
||||
if dimm.SizeMB == nil || *dimm.SizeMB <= 0 {
|
||||
continue
|
||||
}
|
||||
present++
|
||||
totalMB += *dimm.SizeMB
|
||||
if value := trimPtr(dimm.Type); value != "" {
|
||||
types[value] = struct{}{}
|
||||
}
|
||||
}
|
||||
if totalMB == 0 {
|
||||
return ""
|
||||
}
|
||||
typeText := joinSortedKeys(types)
|
||||
line := fmt.Sprintf("Memory: %s", humanizeMB(totalMB))
|
||||
if typeText != "" {
|
||||
line += " " + typeText
|
||||
}
|
||||
if present > 0 {
|
||||
line += fmt.Sprintf(" (%d DIMMs)", present)
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
func formatStorageLine(disks []schema.HardwareStorage) string {
|
||||
count := 0
|
||||
totalGB := 0
|
||||
for _, disk := range disks {
|
||||
if disk.Present != nil && !*disk.Present {
|
||||
continue
|
||||
}
|
||||
count++
|
||||
if disk.SizeGB != nil && *disk.SizeGB > 0 {
|
||||
totalGB += *disk.SizeGB
|
||||
}
|
||||
}
|
||||
if count == 0 {
|
||||
return ""
|
||||
}
|
||||
line := fmt.Sprintf("Storage: %d drives", count)
|
||||
if totalGB > 0 {
|
||||
line += fmt.Sprintf(" / %s", humanizeGB(totalGB))
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
func formatGPULine(devices []schema.HardwarePCIeDevice) string {
|
||||
gpus := map[string]int{}
|
||||
for _, dev := range devices {
|
||||
if !isGPUDevice(dev) {
|
||||
continue
|
||||
}
|
||||
name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown")
|
||||
gpus[name]++
|
||||
}
|
||||
if len(gpus) == 0 {
|
||||
return ""
|
||||
}
|
||||
keys := make([]string, 0, len(gpus))
|
||||
for key := range gpus {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
parts := make([]string, 0, len(keys))
|
||||
for _, key := range keys {
|
||||
parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key))
|
||||
}
|
||||
return "GPU: " + strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
|
||||
if list == nil {
|
||||
return ""
|
||||
}
|
||||
ifaces, err := list()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
seen := map[string]struct{}{}
|
||||
var ips []string
|
||||
for _, iface := range ifaces {
|
||||
for _, ip := range iface.IPv4 {
|
||||
ip = strings.TrimSpace(ip)
|
||||
if ip == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[ip]; ok {
|
||||
continue
|
||||
}
|
||||
seen[ip] = struct{}{}
|
||||
ips = append(ips, ip)
|
||||
}
|
||||
}
|
||||
if len(ips) == 0 {
|
||||
return ""
|
||||
}
|
||||
sort.Strings(ips)
|
||||
return "IP: " + strings.Join(ips, ", ")
|
||||
}
|
||||
|
||||
func formatSATDetail(raw string) string {
|
||||
var b strings.Builder
|
||||
kv := parseKeyValueSummary(raw)
|
||||
|
||||
if t, ok := kv["run_at_utc"]; ok {
|
||||
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
||||
}
|
||||
|
||||
lines := strings.Split(raw, "\n")
|
||||
var stepKeys []string
|
||||
seenStep := map[string]bool{}
|
||||
for _, line := range lines {
|
||||
if idx := strings.Index(line, "_status="); idx >= 0 {
|
||||
key := line[:idx]
|
||||
if !seenStep[key] && key != "overall" {
|
||||
seenStep[key] = true
|
||||
stepKeys = append(stepKeys, key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, key := range stepKeys {
|
||||
status := kv[key+"_status"]
|
||||
display := cleanSummaryKey(key)
|
||||
switch status {
|
||||
case "OK":
|
||||
fmt.Fprintf(&b, "PASS %s\n", display)
|
||||
case "FAILED":
|
||||
fmt.Fprintf(&b, "FAIL %s\n", display)
|
||||
case "UNSUPPORTED":
|
||||
fmt.Fprintf(&b, "SKIP %s\n", display)
|
||||
default:
|
||||
fmt.Fprintf(&b, "? %s\n", display)
|
||||
}
|
||||
}
|
||||
|
||||
if overall, ok := kv["overall_status"]; ok {
|
||||
ok2 := kv["job_ok"]
|
||||
failed := kv["job_failed"]
|
||||
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
||||
}
|
||||
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
func formatSATSummary(label, raw string) string {
|
||||
values := parseKeyValueSummary(raw)
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s:", label)
|
||||
if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" {
|
||||
fmt.Fprintf(&body, " %s", overall)
|
||||
}
|
||||
if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" {
|
||||
fmt.Fprintf(&body, " ok=%s", ok)
|
||||
}
|
||||
if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" {
|
||||
fmt.Fprintf(&body, " failed=%s", failed)
|
||||
}
|
||||
if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" {
|
||||
fmt.Fprintf(&body, " unsupported=%s", unsupported)
|
||||
}
|
||||
if devices := strings.TrimSpace(values["devices"]); devices != "" {
|
||||
fmt.Fprintf(&body, "\nDevices: %s", devices)
|
||||
}
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func latestSATSummaries() []string {
|
||||
patterns := []struct {
|
||||
label string
|
||||
prefix string
|
||||
}{
|
||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||
{label: "Memory SAT", prefix: "memory-"},
|
||||
{label: "Storage SAT", prefix: "storage-"},
|
||||
{label: "CPU SAT", prefix: "cpu-"},
|
||||
}
|
||||
var out []string
|
||||
for _, item := range patterns {
|
||||
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out = append(out, formatSATSummary(item.label, string(raw)))
|
||||
}
|
||||
return out
|
||||
}
|
||||
76
audit/internal/app/app_install.go
Normal file
76
audit/internal/app/app_install.go
Normal file
@@ -0,0 +1,76 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||
return a.exports.ListRemovableTargets()
|
||||
}
|
||||
|
||||
func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error) {
|
||||
if _, err := os.Stat(DefaultAuditJSONPath); err != nil {
|
||||
return "", err
|
||||
}
|
||||
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||
data = normalized
|
||||
}
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(tmpPath)
|
||||
return a.exports.ExportFileToTarget(tmpPath, target)
|
||||
}
|
||||
|
||||
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportLatestAudit(target)
|
||||
body := "Audit export failed."
|
||||
if err == nil {
|
||||
body = "Audit exported."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Audit exported to " + path
|
||||
}
|
||||
return ActionResult{Title: "Export audit", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, error) {
|
||||
archive, err := BuildSupportBundle(DefaultExportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(archive)
|
||||
return a.exports.ExportFileToTarget(archive, target)
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportSupportBundle(target)
|
||||
body := "Support bundle export failed."
|
||||
if err == nil {
|
||||
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||
}
|
||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
|
||||
return a.installer.ListInstallDisks()
|
||||
}
|
||||
|
||||
func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||
return a.installer.InstallToDisk(ctx, device, logFile)
|
||||
}
|
||||
106
audit/internal/app/app_network.go
Normal file
106
audit/internal/app/app_network.go
Normal file
@@ -0,0 +1,106 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) {
|
||||
return a.network.ListInterfaces()
|
||||
}
|
||||
|
||||
func (a *App) DefaultRoute() string {
|
||||
return a.network.DefaultRoute()
|
||||
}
|
||||
|
||||
func (a *App) DHCPOne(iface string) (string, error) {
|
||||
return a.network.DHCPOne(iface)
|
||||
}
|
||||
|
||||
func (a *App) DHCPOneResult(iface string) (ActionResult, error) {
|
||||
body, err := a.network.DHCPOne(iface)
|
||||
return ActionResult{Title: "DHCP: " + iface, Body: bodyOr(body, "DHCP completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) DHCPAll() (string, error) {
|
||||
return a.network.DHCPAll()
|
||||
}
|
||||
|
||||
func (a *App) DHCPAllResult() (ActionResult, error) {
|
||||
body, err := a.network.DHCPAll()
|
||||
return ActionResult{Title: "DHCP: all interfaces", Body: bodyOr(body, "DHCP completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
||||
return a.network.SetStaticIPv4(cfg)
|
||||
}
|
||||
|
||||
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||
return a.network.SetInterfaceState(iface, up)
|
||||
}
|
||||
|
||||
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||
return a.network.GetInterfaceState(iface)
|
||||
}
|
||||
|
||||
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||
return a.network.CaptureNetworkSnapshot()
|
||||
}
|
||||
|
||||
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||
body, err := a.network.SetStaticIPv4(cfg)
|
||||
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||
}
|
||||
|
||||
func (a *App) NetworkStatus() (ActionResult, error) {
|
||||
ifaces, err := a.network.ListInterfaces()
|
||||
if err != nil {
|
||||
return ActionResult{Title: "Network status"}, err
|
||||
}
|
||||
if len(ifaces) == 0 {
|
||||
return ActionResult{Title: "Network status", Body: "No physical interfaces found."}, nil
|
||||
}
|
||||
var body strings.Builder
|
||||
for _, iface := range ifaces {
|
||||
ipv4 := "(no IPv4)"
|
||||
if len(iface.IPv4) > 0 {
|
||||
ipv4 = strings.Join(iface.IPv4, ", ")
|
||||
}
|
||||
fmt.Fprintf(&body, "- %s: state=%s ip=%s\n", iface.Name, iface.State, ipv4)
|
||||
}
|
||||
if gw := a.network.DefaultRoute(); gw != "" {
|
||||
fmt.Fprintf(&body, "\nDefault route: %s\n", gw)
|
||||
}
|
||||
return ActionResult{Title: "Network status", Body: strings.TrimSpace(body.String())}, nil
|
||||
}
|
||||
|
||||
func (a *App) DefaultStaticIPv4FormFields(iface string) []string {
|
||||
return []string{
|
||||
"",
|
||||
"24",
|
||||
strings.TrimSpace(a.network.DefaultRoute()),
|
||||
"77.88.8.8 77.88.8.1 1.1.1.1 8.8.8.8",
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) ParseStaticIPv4Config(iface string, fields []string) platform.StaticIPv4Config {
|
||||
get := func(index int) string {
|
||||
if index >= 0 && index < len(fields) {
|
||||
return strings.TrimSpace(fields[index])
|
||||
}
|
||||
return ""
|
||||
}
|
||||
return platform.StaticIPv4Config{
|
||||
Interface: iface,
|
||||
Address: get(0),
|
||||
Prefix: get(1),
|
||||
Gateway: get(2),
|
||||
DNS: strings.Fields(get(3)),
|
||||
}
|
||||
}
|
||||
370
audit/internal/app/app_packs.go
Normal file
370
audit/internal/app/app_packs.go
Normal file
@@ -0,0 +1,370 @@
|
||||
package app
|
||||
|
||||
import (
	"context"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"bee/audit/internal/platform"
)
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
return a.sat.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||
out, err := a.sat.ResetNvidiaGPU(index)
|
||||
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPerfDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPowerDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchAutotuneDir
|
||||
}
|
||||
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||
}
|
||||
|
||||
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
if logFunc != nil {
|
||||
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||
}
|
||||
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||
if err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) DetectGPUVendor() string {
|
||||
return a.sat.DetectGPUVendor()
|
||||
}
|
||||
|
||||
func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
return a.sat.ListAMDGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||
body := formatFanStressResult(path)
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||
}
|
||||
|
||||
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||
func formatFanStressResult(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
content := strings.TrimSpace(string(raw))
|
||||
kv := parseKeyValueSummary(content)
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(formatSATDetail(content))
|
||||
|
||||
// Append analysis section.
|
||||
var analysis []string
|
||||
if v, ok := kv["throttling_detected"]; ok {
|
||||
label := "NO"
|
||||
if v == "true" {
|
||||
label = "YES ← throttling detected during load"
|
||||
}
|
||||
analysis = append(analysis, "Throttling: "+label)
|
||||
}
|
||||
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||
analysis = append(analysis, "Fan response: "+v+"s")
|
||||
}
|
||||
|
||||
if len(analysis) > 0 {
|
||||
b.WriteString("\n\n=== Analysis ===\n")
|
||||
for _, line := range analysis {
|
||||
b.WriteString(line + "\n")
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||
func satResultBody(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
return formatSATDetail(strings.TrimSpace(string(raw)))
|
||||
}
|
||||
67
audit/internal/app/app_services.go
Normal file
67
audit/internal/app/app_services.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListBeeServices() ([]string, error) {
|
||||
return a.services.ListBeeServices()
|
||||
}
|
||||
|
||||
func (a *App) ServiceState(name string) string {
|
||||
return a.services.ServiceState(name)
|
||||
}
|
||||
|
||||
func (a *App) ServiceStatus(name string) (string, error) {
|
||||
return a.services.ServiceStatus(name)
|
||||
}
|
||||
|
||||
func (a *App) ServiceStatusResult(name string) (ActionResult, error) {
|
||||
body, err := a.services.ServiceStatus(name)
|
||||
return ActionResult{Title: "service status: " + name, Body: bodyOr(body, "No status output.")}, err
|
||||
}
|
||||
|
||||
func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, error) {
|
||||
return a.services.ServiceDo(name, action)
|
||||
}
|
||||
|
||||
func (a *App) ServiceActionResult(name string, action platform.ServiceAction) (ActionResult, error) {
|
||||
body, err := a.services.ServiceDo(name, action)
|
||||
return ActionResult{Title: "service " + string(action) + ": " + name, Body: bodyOr(body, "Action completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) TailFile(path string, lines int) string {
|
||||
return a.tools.TailFile(path, lines)
|
||||
}
|
||||
|
||||
func (a *App) CheckTools(names []string) []platform.ToolStatus {
|
||||
return a.tools.CheckTools(names)
|
||||
}
|
||||
|
||||
func (a *App) ToolCheckResult(names []string) ActionResult {
|
||||
if len(names) == 0 {
|
||||
return ActionResult{Title: "Required tools", Body: "No tools checked."}
|
||||
}
|
||||
var body strings.Builder
|
||||
for _, tool := range a.tools.CheckTools(names) {
|
||||
status := "MISSING"
|
||||
if tool.OK {
|
||||
status = "OK (" + tool.Path + ")"
|
||||
}
|
||||
fmt.Fprintf(&body, "- %s: %s\n", tool.Name, status)
|
||||
}
|
||||
return ActionResult{Title: "Required tools", Body: strings.TrimSpace(body.String())}
|
||||
}
|
||||
|
||||
func (a *App) AuditLogTailResult() ActionResult {
|
||||
logTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditLogPath, 40))
|
||||
jsonTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditJSONPath, 20))
|
||||
body := strings.TrimSpace(logTail + "\n\n" + jsonTail)
|
||||
if body == "" {
|
||||
body = "No audit logs found."
|
||||
}
|
||||
return ActionResult{Title: "Audit log tail", Body: body}
|
||||
}
|
||||
@@ -3,10 +3,11 @@ package app
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
@@ -313,17 +314,20 @@ func statusSeverity(status string) int {
|
||||
}
|
||||
|
||||
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
||||
return false
|
||||
}
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
class := strings.TrimSpace(*dev.DeviceClass)
|
||||
isGPUClass := strings.Contains(class, "Controller") || strings.Contains(class, "Accelerator") ||
|
||||
strings.Contains(class, "Display") || strings.Contains(class, "Video")
|
||||
if !isGPUClass {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
||||
switch vendor {
|
||||
case "amd":
|
||||
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.AMDVendorID
|
||||
case "nvidia":
|
||||
return strings.Contains(manufacturer, "nvidia")
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||
default:
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
@@ -46,10 +47,12 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
|
||||
class := "DisplayController"
|
||||
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
||||
amdVendorID := collector.AMDVendorID
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
VendorID: &amdVendorID,
|
||||
}},
|
||||
}
|
||||
|
||||
|
||||
@@ -84,11 +84,10 @@ func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
}
|
||||
|
||||
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
||||
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
return dev.VendorID != nil && *dev.VendorID == AMDVendorID && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
}
|
||||
|
||||
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
||||
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
const mellanoxVendorID = 0x15b3
|
||||
const nicProbeTimeout = 2 * time.Second
|
||||
|
||||
var (
|
||||
@@ -80,16 +79,7 @@ func enrichPCIeWithMellanox(devs []schema.HardwarePCIeDevice) []schema.HardwareP
|
||||
}
|
||||
|
||||
func isMellanoxDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.VendorID != nil && *dev.VendorID == mellanoxVendorID {
|
||||
return true
|
||||
}
|
||||
if dev.Manufacturer != nil {
|
||||
m := strings.ToLower(*dev.Manufacturer)
|
||||
if strings.Contains(m, "mellanox") || strings.Contains(m, "nvidia networking") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
return dev.VendorID != nil && *dev.VendorID == MellanoxVendorID
|
||||
}
|
||||
|
||||
func queryMellanoxFromMstflint(bdf string) (firmware, serial string) {
|
||||
|
||||
@@ -55,7 +55,7 @@ func TestEnrichPCIeWithMellanox_mstflint(t *testing.T) {
|
||||
}
|
||||
netIfacesByBDF = func(string) []string { return nil }
|
||||
|
||||
vendorID := mellanoxVendorID
|
||||
vendorID := MellanoxVendorID
|
||||
bdf := "0000:18:00.0"
|
||||
manufacturer := "Mellanox Technologies"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
@@ -99,7 +99,7 @@ func TestEnrichPCIeWithMellanox_fallbackEthtool(t *testing.T) {
|
||||
return "driver: mlx5_core\nfirmware-version: 28.40.1000\n", nil
|
||||
}
|
||||
|
||||
vendorID := mellanoxVendorID
|
||||
vendorID := MellanoxVendorID
|
||||
bdf := "0000:18:00.0"
|
||||
manufacturer := "NVIDIA Networking"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
|
||||
@@ -10,8 +10,6 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
Index int
|
||||
BDF string
|
||||
@@ -240,13 +238,7 @@ func normalizePCIeBDF(bdf string) string {
|
||||
}
|
||||
|
||||
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.VendorID != nil && *dev.VendorID == nvidiaVendorID {
|
||||
return true
|
||||
}
|
||||
if dev.Manufacturer != nil && strings.Contains(strings.ToLower(*dev.Manufacturer), "nvidia") {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
return dev.VendorID != nil && *dev.VendorID == NvidiaVendorID
|
||||
}
|
||||
|
||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||
|
||||
@@ -57,7 +57,7 @@ func TestNormalizePCIeBDF(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
vendorID := nvidiaVendorID
|
||||
vendorID := NvidiaVendorID
|
||||
bdf := "0000:65:00.0"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
status := "OK"
|
||||
@@ -104,7 +104,7 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
||||
vendorID := nvidiaVendorID
|
||||
vendorID := NvidiaVendorID
|
||||
bdf := "0000:17:00.0"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
devices := []schema.HardwarePCIeDevice{
|
||||
|
||||
11
audit/internal/collector/pci_vendors.go
Normal file
11
audit/internal/collector/pci_vendors.go
Normal file
@@ -0,0 +1,11 @@
|
||||
package collector
|
||||
|
||||
// Well-known PCI vendor IDs used for hardware classification.
// Source: https://pcisig.com / https://pci-ids.ucw.cz/
const (
	AMDVendorID      = 0x1002 // Advanced Micro Devices, Inc. [AMD/ATI]
	AspeedVendorID   = 0x1a03 // ASPEED Technology, Inc.
	IntelVendorID    = 0x8086 // Intel Corporation
	MellanoxVendorID = 0x15b3 // Mellanox Technologies
	NvidiaVendorID   = 0x10de // NVIDIA Corporation
)
|
||||
@@ -278,6 +278,11 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
||||
// below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards
|
||||
// get Critical because they are fixed internal connectors that must always train
|
||||
// to max speed — any downgrade signals a hardware fault.
|
||||
//
|
||||
// Disabled devices (sysfs enable==0) are skipped: they carry no data traffic and
|
||||
// their link state has no operational impact. This covers management endpoints
|
||||
// (e.g. PCIe switch fabric controllers on HGX baseboards) that the kernel never
|
||||
// activates but that lspci still reports with link stats.
|
||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||
return
|
||||
@@ -285,6 +290,11 @@ func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
return
|
||||
}
|
||||
if dev.BDF != nil {
|
||||
if enabled, ok := readPCIIntAttribute(*dev.BDF, "enable"); ok && enabled == 0 {
|
||||
return
|
||||
}
|
||||
}
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
|
||||
|
||||
@@ -58,7 +58,6 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
||||
|
||||
for _, chip := range chips {
|
||||
features := doc[chip]
|
||||
location := sensorLocation(chip)
|
||||
|
||||
keys := make([]string, 0, len(features))
|
||||
for key := range features {
|
||||
@@ -80,25 +79,25 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
||||
}
|
||||
switch classifySensorFeature(feature) {
|
||||
case "fan":
|
||||
item := buildFanSensor(name, location, feature)
|
||||
item := buildFanSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "fan", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Fans = append(result.Fans, *item)
|
||||
case "temp":
|
||||
item := buildTempSensor(name, location, feature)
|
||||
item := buildTempSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "temp", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Temperatures = append(result.Temperatures, *item)
|
||||
case "power":
|
||||
item := buildPowerSensor(name, location, feature)
|
||||
item := buildPowerSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "power", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Power = append(result.Power, *item)
|
||||
default:
|
||||
item := buildOtherSensor(name, location, feature)
|
||||
item := buildOtherSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "other", item.Name) {
|
||||
continue
|
||||
}
|
||||
@@ -128,14 +127,6 @@ func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func sensorLocation(chip string) *string {
|
||||
chip = strings.TrimSpace(chip)
|
||||
if chip == "" {
|
||||
return nil
|
||||
}
|
||||
return &chip
|
||||
}
|
||||
|
||||
func classifySensorFeature(feature map[string]any) string {
|
||||
for key := range feature {
|
||||
switch {
|
||||
@@ -154,24 +145,24 @@ func classifySensorFeature(feature map[string]any) string {
|
||||
return "other"
|
||||
}
|
||||
|
||||
func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
|
||||
func buildFanSensor(name string, feature map[string]any) *schema.HardwareFanSensor {
|
||||
rpm, ok := firstFeatureInt(feature, "_input")
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &rpm}
|
||||
item := &schema.HardwareFanSensor{Name: name, RPM: &rpm}
|
||||
if status := sensorStatusFromFeature(feature); status != nil {
|
||||
item.Status = status
|
||||
}
|
||||
return item
|
||||
}
|
||||
|
||||
func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||
func buildTempSensor(name string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||
celsius, ok := firstFeatureFloat(feature, "_input")
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &celsius}
|
||||
item := &schema.HardwareTemperatureSensor{Name: name, Celsius: &celsius}
|
||||
if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
|
||||
item.ThresholdWarningCelsius = &warning
|
||||
}
|
||||
@@ -186,8 +177,8 @@ func buildTempSensor(name string, location *string, feature map[string]any) *sch
|
||||
return item
|
||||
}
|
||||
|
||||
func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||
item := &schema.HardwarePowerSensor{Name: name, Location: location}
|
||||
func buildPowerSensor(name string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||
item := &schema.HardwarePowerSensor{Name: name}
|
||||
if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
|
||||
item.PowerW = &v
|
||||
}
|
||||
@@ -206,12 +197,12 @@ func buildPowerSensor(name string, location *string, feature map[string]any) *sc
|
||||
return item
|
||||
}
|
||||
|
||||
func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||
func buildOtherSensor(name string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||
value, unit, ok := firstGenericSensorValue(feature)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &value}
|
||||
item := &schema.HardwareOtherSensor{Name: name, Value: &value}
|
||||
if unit != "" {
|
||||
item.Unit = &unit
|
||||
}
|
||||
|
||||
@@ -36,6 +36,24 @@ func bestEffortRescanHotplugStorage() {
|
||||
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
|
||||
} else {
|
||||
for _, path := range hostPaths {
|
||||
// SAS HBAs (e.g. smartpqi) block indefinitely in sas_user_scan when
|
||||
// written to — SAS topology is discovered by the driver itself.
|
||||
// Detect via two methods: (1) sas_host class registration, and
|
||||
// (2) driver proc_name — smartpqi uses scsi_transport_sas but does
|
||||
// not register a sas_host object, so (1) alone misses it.
|
||||
host := filepath.Base(filepath.Dir(path))
|
||||
if _, err := os.Stat("/sys/class/sas_host/" + host); err == nil {
|
||||
slog.Info("storage: scsi host scan skipped (SAS host)", "path", path)
|
||||
continue
|
||||
}
|
||||
if procName, err := os.ReadFile("/sys/class/scsi_host/" + host + "/proc_name"); err == nil {
|
||||
switch strings.TrimSpace(string(procName)) {
|
||||
case "smartpqi", "hpsa":
|
||||
slog.Info("storage: scsi host scan skipped (SAS transport driver)",
|
||||
"path", path, "driver", strings.TrimSpace(string(procName)))
|
||||
continue
|
||||
}
|
||||
}
|
||||
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
|
||||
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
|
||||
continue
|
||||
@@ -406,20 +424,23 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
|
||||
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
||||
// nvme-cli emits most counters as JSON strings (e.g. "power_on_hours":"49"),
|
||||
// so all numeric fields use jsonInt64 which accepts both bare numbers and
|
||||
// quoted strings. Field names match nvme-cli JSON output, not NVMe spec prose.
|
||||
type nvmeSmartLog struct {
|
||||
CriticalWarning int `json:"critical_warning"`
|
||||
PercentageUsed int `json:"percentage_used"`
|
||||
AvailableSpare int `json:"available_spare"`
|
||||
SpareThreshold int `json:"spare_thresh"`
|
||||
Temperature int64 `json:"temperature"`
|
||||
PowerOnHours int64 `json:"power_on_hours"`
|
||||
PowerCycles int64 `json:"power_cycles"`
|
||||
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
|
||||
DataUnitsRead int64 `json:"data_units_read"`
|
||||
DataUnitsWritten int64 `json:"data_units_written"`
|
||||
ControllerBusy int64 `json:"controller_busy_time"`
|
||||
MediaErrors int64 `json:"media_errors"`
|
||||
NumErrLogEntries int64 `json:"num_err_log_entries"`
|
||||
CriticalWarning jsonInt64 `json:"critical_warning"`
|
||||
PercentageUsed jsonInt64 `json:"percent_used"`
|
||||
AvailableSpare jsonInt64 `json:"avail_spare"`
|
||||
SpareThreshold jsonInt64 `json:"spare_thresh"`
|
||||
Temperature jsonInt64 `json:"temperature"`
|
||||
PowerOnHours jsonInt64 `json:"power_on_hours"`
|
||||
PowerCycles jsonInt64 `json:"power_cycles"`
|
||||
UnsafeShutdowns jsonInt64 `json:"unsafe_shutdowns"`
|
||||
DataUnitsRead jsonInt64 `json:"data_units_read"`
|
||||
DataUnitsWritten jsonInt64 `json:"data_units_written"`
|
||||
ControllerBusy jsonInt64 `json:"controller_busy_time"`
|
||||
MediaErrors jsonInt64 `json:"media_errors"`
|
||||
NumErrLogEntries jsonInt64 `json:"num_err_log_entries"`
|
||||
}
|
||||
|
||||
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
||||
@@ -484,13 +505,16 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
var log nvmeSmartLog
|
||||
if json.Unmarshal(out, &log) == nil {
|
||||
if log.PowerOnHours > 0 {
|
||||
s.PowerOnHours = &log.PowerOnHours
|
||||
v := int64(log.PowerOnHours)
|
||||
s.PowerOnHours = &v
|
||||
}
|
||||
if log.PowerCycles > 0 {
|
||||
s.PowerCycles = &log.PowerCycles
|
||||
v := int64(log.PowerCycles)
|
||||
s.PowerCycles = &v
|
||||
}
|
||||
if log.UnsafeShutdowns > 0 {
|
||||
s.UnsafeShutdowns = &log.UnsafeShutdowns
|
||||
v := int64(log.UnsafeShutdowns)
|
||||
s.UnsafeShutdowns = &v
|
||||
}
|
||||
if log.PercentageUsed > 0 {
|
||||
v := float64(log.PercentageUsed)
|
||||
@@ -499,11 +523,11 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
s.LifeRemainingPct = &remaining
|
||||
}
|
||||
if log.DataUnitsWritten > 0 {
|
||||
v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
|
||||
v := nvmeDataUnitsToBytes(int64(log.DataUnitsWritten))
|
||||
s.WrittenBytes = &v
|
||||
}
|
||||
if log.DataUnitsRead > 0 {
|
||||
v := nvmeDataUnitsToBytes(log.DataUnitsRead)
|
||||
v := nvmeDataUnitsToBytes(int64(log.DataUnitsRead))
|
||||
s.ReadBytes = &v
|
||||
}
|
||||
if log.AvailableSpare > 0 {
|
||||
@@ -511,23 +535,25 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
s.AvailableSparePct = &v
|
||||
}
|
||||
if log.MediaErrors > 0 {
|
||||
s.MediaErrors = &log.MediaErrors
|
||||
v := int64(log.MediaErrors)
|
||||
s.MediaErrors = &v
|
||||
}
|
||||
if log.NumErrLogEntries > 0 {
|
||||
s.ErrorLogEntries = &log.NumErrLogEntries
|
||||
v := int64(log.NumErrLogEntries)
|
||||
s.ErrorLogEntries = &v
|
||||
}
|
||||
if log.Temperature > 0 {
|
||||
v := float64(log.Temperature - 273)
|
||||
s.TemperatureC = &v
|
||||
}
|
||||
setStorageHealthStatus(&s, storageHealthStatus{
|
||||
criticalWarning: log.CriticalWarning,
|
||||
criticalWarning: int(log.CriticalWarning),
|
||||
percentageUsed: int64(log.PercentageUsed),
|
||||
availableSpare: int64(log.AvailableSpare),
|
||||
spareThreshold: int64(log.SpareThreshold),
|
||||
unsafeShutdowns: log.UnsafeShutdowns,
|
||||
mediaErrors: log.MediaErrors,
|
||||
errorLogEntries: log.NumErrLogEntries,
|
||||
unsafeShutdowns: int64(log.UnsafeShutdowns),
|
||||
mediaErrors: int64(log.MediaErrors),
|
||||
errorLogEntries: int64(log.NumErrLogEntries),
|
||||
})
|
||||
return s
|
||||
}
|
||||
|
||||
@@ -1,11 +1,65 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most
|
||||
// counters are quoted strings and field names differ from NVMe spec prose)
|
||||
// is correctly parsed into nvmeSmartLog.
|
||||
func TestNVMeSmartLogUnmarshal(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// Real nvme-cli output: counters are JSON strings, spare is "avail_spare",
|
||||
// percentage used is "percent_used".
|
||||
raw := `{
|
||||
"critical_warning": 0,
|
||||
"temperature": 310,
|
||||
"avail_spare": 100,
|
||||
"spare_thresh": 5,
|
||||
"percent_used": 0,
|
||||
"data_units_read": "10925415",
|
||||
"data_units_written": "8497672",
|
||||
"controller_busy_time": "305",
|
||||
"power_cycles": "53",
|
||||
"power_on_hours": "49",
|
||||
"unsafe_shutdowns": "22",
|
||||
"media_errors": "0",
|
||||
"num_err_log_entries": "0"
|
||||
}`
|
||||
var log nvmeSmartLog
|
||||
if err := json.Unmarshal([]byte(raw), &log); err != nil {
|
||||
t.Fatalf("json.Unmarshal failed: %v", err)
|
||||
}
|
||||
if log.PowerOnHours != 49 {
|
||||
t.Errorf("PowerOnHours=%d want 49", log.PowerOnHours)
|
||||
}
|
||||
if log.PowerCycles != 53 {
|
||||
t.Errorf("PowerCycles=%d want 53", log.PowerCycles)
|
||||
}
|
||||
if log.AvailableSpare != 100 {
|
||||
t.Errorf("AvailableSpare=%d want 100", log.AvailableSpare)
|
||||
}
|
||||
if log.SpareThreshold != 5 {
|
||||
t.Errorf("SpareThreshold=%d want 5", log.SpareThreshold)
|
||||
}
|
||||
if log.PercentageUsed != 0 {
|
||||
t.Errorf("PercentageUsed=%d want 0", log.PercentageUsed)
|
||||
}
|
||||
if log.Temperature != 310 {
|
||||
t.Errorf("Temperature=%d want 310", log.Temperature)
|
||||
}
|
||||
if log.MediaErrors != 0 {
|
||||
t.Errorf("MediaErrors=%d want 0", log.MediaErrors)
|
||||
}
|
||||
if log.UnsafeShutdowns != 22 {
|
||||
t.Errorf("UnsafeShutdowns=%d want 22", log.UnsafeShutdowns)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSetStorageHealthStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -258,7 +258,7 @@ func (s *System) GetInterfaceState(iface string) (bool, error) {
|
||||
func interfaceAdminState(iface string) (bool, error) {
|
||||
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||
if err != nil {
|
||||
return false, err
|
||||
return false, fmt.Errorf("ip link show dev %s: %w", iface, err)
|
||||
}
|
||||
return parseInterfaceAdminState(string(raw))
|
||||
}
|
||||
@@ -288,7 +288,7 @@ func interfaceIPv4Addrs(iface string) ([]string, error) {
|
||||
if errors.As(err, &exitErr) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("ip addr show dev %s: %w", iface, err)
|
||||
}
|
||||
var ipv4 []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
// core/internal/ingest/parser_hardware.go. No import dependency on core.
|
||||
package schema
|
||||
|
||||
import "encoding/json"
|
||||
|
||||
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
||||
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
||||
type HardwareIngestRequest struct {
|
||||
@@ -64,9 +66,10 @@ type HardwareSnapshot struct {
|
||||
Storage []HardwareStorage `json:"storage,omitempty"`
|
||||
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
PlatformConfig *json.RawMessage `json:"platform_config,omitempty"`
|
||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareHealthSummary struct {
|
||||
@@ -123,7 +126,7 @@ type HardwareCPU struct {
|
||||
type HardwareMemory struct {
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Location *string `json:"-"` // internal: used for DIMM telemetry matching only
|
||||
Present *bool `json:"present,omitempty"`
|
||||
SizeMB *int `json:"size_mb,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
@@ -261,15 +264,13 @@ type HardwareSensors struct {
|
||||
}
|
||||
|
||||
type HardwareFanSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
RPM *int `json:"rpm,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
Name string `json:"name"`
|
||||
RPM *int `json:"rpm,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
type HardwarePowerSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
VoltageV *float64 `json:"voltage_v,omitempty"`
|
||||
CurrentA *float64 `json:"current_a,omitempty"`
|
||||
PowerW *float64 `json:"power_w,omitempty"`
|
||||
@@ -278,7 +279,6 @@ type HardwarePowerSensor struct {
|
||||
|
||||
type HardwareTemperatureSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Celsius *float64 `json:"celsius,omitempty"`
|
||||
ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"`
|
||||
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
|
||||
@@ -286,11 +286,10 @@ type HardwareTemperatureSensor struct {
|
||||
}
|
||||
|
||||
type HardwareOtherSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Value *float64 `json:"value,omitempty"`
|
||||
Unit *string `json:"unit,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
Name string `json:"name"`
|
||||
Value *float64 `json:"value,omitempty"`
|
||||
Unit *string `json:"unit,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareEventLog struct {
|
||||
|
||||
@@ -1297,7 +1297,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
||||
var standardTools = []string{
|
||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
||||
"mstflint",
|
||||
"mstflint", "saa",
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
@@ -68,6 +68,11 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||
/* Tasks nav badge */
|
||||
.tasks-nav-btn{display:flex;justify-content:space-between;align-items:center;padding:10px 16px;color:rgba(255,255,255,.55);font-size:12px;text-decoration:none;border-top:1px solid rgba(255,255,255,.12);margin-top:auto;transition:color .15s}
|
||||
.tasks-nav-btn:hover{color:#fff}
|
||||
.tasks-nav-count{background:var(--accent);color:#fff;border-radius:10px;padding:1px 7px;font-size:11px;font-weight:700;display:none}
|
||||
.tasks-nav-count.active{display:inline}
|
||||
/* Output terminal */
|
||||
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||
@@ -93,14 +98,15 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
}
|
||||
|
||||
func layoutNav(active string, buildLabel string) string {
|
||||
items := []struct{ id, label, href, onclick string }{
|
||||
{"dashboard", "Dashboard", "/", ""},
|
||||
{"audit", "Audit", "/audit", ""},
|
||||
{"validate", "Validate", "/validate", ""},
|
||||
{"burn", "Burn", "/burn", ""},
|
||||
{"benchmark", "Benchmark", "/benchmark", ""},
|
||||
{"tasks", "Tasks", "/tasks", ""},
|
||||
{"tools", "Tools", "/tools", ""},
|
||||
items := []struct{ id, label, href string }{
|
||||
{"dashboard", "Dashboard", "/"},
|
||||
{"audit", "1. Audit", "/audit"},
|
||||
{"check", "2. Check", "/check"},
|
||||
{"load", "3. Load", "/load"},
|
||||
{"speed", "4. Speed", "/speed"},
|
||||
{"endurance", "5. Endurance", "/endurance"},
|
||||
{"tools", "6. Tools", "/tools"},
|
||||
{"settings", "7. Settings", "/settings"},
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<aside class="sidebar">`)
|
||||
@@ -124,15 +130,16 @@ func layoutNav(active string, buildLabel string) string {
|
||||
if item.id == active {
|
||||
cls += " active"
|
||||
}
|
||||
if item.onclick != "" {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" onclick="%s">%s</a>`,
|
||||
cls, item.href, item.onclick, item.label))
|
||||
} else {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`,
|
||||
cls, item.href, item.label))
|
||||
}
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`, cls, item.href, item.label))
|
||||
}
|
||||
b.WriteString(`</nav>`)
|
||||
b.WriteString(`<a href="/tasks" class="tasks-nav-btn" id="tasks-nav-btn">`)
|
||||
b.WriteString(`<span>Tasks</span>`)
|
||||
b.WriteString(`<span class="tasks-nav-count" id="tasks-nav-count"></span>`)
|
||||
b.WriteString(`</a>`)
|
||||
b.WriteString(`<script>`)
|
||||
b.WriteString(`(function(){function u(){fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(d){var n=Array.isArray(d)?d.filter(function(t){return t.status==='pending'||t.status==='running';}).length:0;var c=document.getElementById('tasks-nav-count');var b=document.getElementById('tasks-nav-btn');if(c){c.textContent=n>0?String(n):'';c.className='tasks-nav-count'+(n>0?' active':'');}if(b){b.style.color=n>0?'#f6c90e':'';}}).catch(function(){});}u();setInterval(u,5000);})();`)
|
||||
b.WriteString(`</script>`)
|
||||
b.WriteString(`</aside>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
@@ -611,3 +611,20 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// renderSpeed renders the Speed page (step 4): performance benchmarks.
|
||||
// Uses the same benchmark infrastructure; defaults to Standard profile (throughput/bandwidth).
|
||||
// For long-duration stability/overnight runs, see Endurance (step 5).
|
||||
func renderSpeed(opts HandlerOptions) string {
|
||||
base := renderBenchmark(opts)
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Speed:</strong> Measures GPU compute throughput and memory bandwidth. For overnight stability testing, go to <a href="/endurance">5. Endurance</a>.</div>` + base
|
||||
}
|
||||
|
||||
// renderEndurance renders the Endurance page (step 5): long-duration reliability tests.
|
||||
// Focuses on Stability and Overnight profiles for multi-hour burn validation.
|
||||
// For short load tests, see Load (step 3). For throughput measurement, see Speed (step 4).
|
||||
func renderEndurance(opts HandlerOptions) string {
|
||||
base := renderBenchmark(opts)
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>Endurance:</strong> Long-duration reliability tests — Stability (several hours) and Overnight (8+ h) profiles. These profiles run hardware at sustained load; results show whether the server holds its performance envelope over time.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px">Use the <strong>Stability</strong> or <strong>Overnight</strong> profile in the setup card below. The Standard profile is available too but is better suited for the <a href="/speed">4. Speed</a> page.</div>` + base
|
||||
}
|
||||
|
||||
@@ -1,8 +1,13 @@
|
||||
package webui
|
||||
|
||||
// renderLoad renders the Load page (step 3): sustained stress tests.
|
||||
// For non-destructive status checks, see Check (step 2).
|
||||
// For DCGM targeted diagnostics (targeted_stress, targeted_power, pulse), see Check → Validate mode.
|
||||
func renderLoad() string { return renderBurn() }
|
||||
|
||||
func renderBurn() string {
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Load runs sustained GPU compute and CPU/memory stress recipes. DCGM diagnostics (<code>targeted_stress</code>, <code>targeted_power</code>, <code>pulse_test</code>) and NCCL/NVBandwidth are on the <a href="/check">2. Check</a> page. For overnight endurance runs, see <a href="/endurance">5. Endurance</a>.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
|
||||
@@ -477,6 +477,8 @@ function installToRAM() {
|
||||
|
||||
` + renderNVMeFormatCard() + `
|
||||
|
||||
` + renderSAADMICard() + `
|
||||
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
|
||||
77
audit/internal/webui/page_settings.go
Normal file
77
audit/internal/webui/page_settings.go
Normal file
@@ -0,0 +1,77 @@
|
||||
package webui
|
||||
|
||||
import "html"
|
||||
|
||||
func renderSettings(opts HandlerOptions) string {
|
||||
version := opts.BuildLabel
|
||||
if version == "" {
|
||||
version = "dev"
|
||||
}
|
||||
return `<div class="grid2">
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Blackbox Logging</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:14px">Continuous hardware monitoring that writes a rolling log of sensor readings to the export directory. Useful for capturing thermal or power anomalies during long runs.</p>
|
||||
<div style="display:flex;gap:8px;align-items:center">
|
||||
<button class="btn btn-primary btn-sm" onclick="blackboxToggle('enable')">Enable</button>
|
||||
<button class="btn btn-secondary btn-sm" onclick="blackboxToggle('disable')">Disable</button>
|
||||
<span id="blackbox-status" style="font-size:12px;color:var(--muted)">Loading...</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">NVIDIA Recovery</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:14px">Reset NVIDIA GPU driver state. Use when <code>nvidia-smi</code> reports errors or GPUs appear stuck after a failed test.</p>
|
||||
<div style="display:flex;gap:8px;align-items:center">
|
||||
<button class="btn btn-danger btn-sm" onclick="nvidiaReset()">Reset NVIDIA Driver</button>
|
||||
<span id="nvidia-reset-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-top:0">
|
||||
<div class="card-head">Build Info</div>
|
||||
<div class="card-body">
|
||||
<table style="width:auto">
|
||||
<tbody>
|
||||
<tr><td style="color:var(--muted);padding-right:24px">Version</td><td>` + html.EscapeString(version) + `</td></tr>
|
||||
<tr><td style="color:var(--muted);padding-right:24px">Title</td><td>` + html.EscapeString(opts.Title) + `</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
(function() {
|
||||
fetch('/api/blackbox/status', {cache:'no-store'}).then(r => r.json()).then(d => {
|
||||
var el = document.getElementById('blackbox-status');
|
||||
if (el) el.textContent = d.enabled ? 'Enabled' : 'Disabled';
|
||||
}).catch(() => {
|
||||
var el = document.getElementById('blackbox-status');
|
||||
if (el) el.textContent = 'Status unavailable';
|
||||
});
|
||||
})();
|
||||
function blackboxToggle(action) {
|
||||
var el = document.getElementById('blackbox-status');
|
||||
if (el) el.textContent = 'Updating...';
|
||||
fetch('/api/blackbox/' + action, {method:'POST', cache:'no-store'})
|
||||
.then(r => r.json())
|
||||
.then(d => { if (el) el.textContent = d.enabled ? 'Enabled' : 'Disabled'; })
|
||||
.catch(err => { if (el) el.textContent = 'Error: ' + err.message; });
|
||||
}
|
||||
function nvidiaReset() {
|
||||
var el = document.getElementById('nvidia-reset-status');
|
||||
if (!confirm('Reset NVIDIA driver? This will interrupt any running GPU tasks.')) return;
|
||||
if (el) el.textContent = 'Resetting...';
|
||||
fetch('/api/gpu/nvidia-reset', {method:'POST', cache:'no-store'})
|
||||
.then(r => r.json())
|
||||
.then(d => { if (el) el.textContent = d.error ? ('Error: ' + d.error) : 'Done — driver reset.'; })
|
||||
.catch(err => { if (el) el.textContent = 'Error: ' + err.message; });
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
@@ -11,6 +11,13 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// PCI vendor IDs used for GPU classification (source: pci-ids.ucw.cz).
const (
	pciVendorNvidia = 0x10de // NVIDIA
	pciVendorAMD    = 0x1002 // AMD
	pciVendorAspeed = 0x1a03 // ASPEED
)
|
||||
|
||||
type validateInventory struct {
|
||||
CPU string
|
||||
Memory string
|
||||
@@ -634,25 +641,307 @@ func validateFirstNonEmpty(values ...string) string {
|
||||
}
|
||||
|
||||
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||||
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||||
if dev.VendorID != nil && *dev.VendorID == pciVendorAspeed {
|
||||
return false
|
||||
}
|
||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||
isGPUClass := class == "videocontroller" || class == "processingaccelerator" || class == "displaycontroller"
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||||
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorNvidia
|
||||
case "amd":
|
||||
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||||
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||||
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||||
return isGPUClass && (isAMDVendor || isAMDModel)
|
||||
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorAMD
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// renderCheck renders the non-destructive Check page (step 2).
|
||||
// Shows validate-mode tests only: CPU, Memory, Storage, NVIDIA L2, NCCL, NVBandwidth, AMD.
|
||||
// Stress-mode tests (targeted-stress, targeted-power, pulse) are on the Load page.
|
||||
func renderCheck(opts HandlerOptions) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
gpuNote := ""
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Check tests collect diagnostics only — no writes to disks, no sustained load, no hardware wear counters incremented. For stress testing, go to <a href="/load">3. Load</a>.</div>
|
||||
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||
<button type="button" class="btn btn-primary" onclick="runAllCheckSAT()">Run All Checks</button>
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">est. ` + validateTotalStr + gpuNote + `</span>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` (stress-ng 60 s).`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` (256 MB × 1 pass).`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds (NVMe: instant device query; SATA/SAS: short self-test).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||
</div>
|
||||
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA check tasks.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks (DCGM Level 2).`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec)+` (Level 2, all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
inv.AMD,
|
||||
`Runs AMD GPU inventory, MEM integrity, and MEM bandwidth checks.`,
|
||||
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||
)) +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satLabels() {
|
||||
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
let satNvidiaGPUsPromise = null;
|
||||
function loadSatNvidiaGPUs() {
|
||||
if (!satNvidiaGPUsPromise) {
|
||||
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia').then(r => {
|
||||
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||
return r.json();
|
||||
}).then(list => Array.isArray(list) ? list : []);
|
||||
}
|
||||
return satNvidiaGPUsPromise;
|
||||
}
|
||||
function satSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||||
.filter(el => el.checked && !el.disabled)
|
||||
.map(el => parseInt(el.value, 10))
|
||||
.filter(v => !Number.isNaN(v))
|
||||
.sort((a, b) => a - b);
|
||||
}
|
||||
function satUpdateGPUSelectionNote() {
|
||||
const note = document.getElementById('sat-gpu-selection-note');
|
||||
if (!note) return;
|
||||
const sel = satSelectedGPUIndices();
|
||||
note.textContent = sel.length
|
||||
? 'Selected GPUs: ' + sel.join(', ') + '. Multi-GPU tests will use all selected GPUs.'
|
||||
: 'Select at least one NVIDIA GPU to enable NVIDIA check tasks.';
|
||||
}
|
||||
function satRenderGPUList(gpus) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (!root) return;
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
satUpdateGPUSelectionNote(); return;
|
||||
}
|
||||
root.innerHTML = gpus.map(gpu => {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="sat-gpu-row"><input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()"><span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span></label>';
|
||||
}).join('');
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectAllGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = true; }); satUpdateGPUSelectionNote(); }
|
||||
function satSelectNoGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = false; }); satUpdateGPUSelectionNote(); }
|
||||
function satGPULoadInit() {
|
||||
loadSatNvidiaGPUs().then(satRenderGPUList).catch(err => {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (root) root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
satUpdateGPUSelectionNote();
|
||||
});
|
||||
}
|
||||
function satRequestBody(target, overrides) {
|
||||
const body = {};
|
||||
const labels = satLabels();
|
||||
body.display_name = labels[target] || ('Check ' + target);
|
||||
body.stress_mode = false;
|
||||
if (target === 'cpu') body.duration = 60;
|
||||
if (overrides) Object.keys(overrides).forEach(k => { body[k] = overrides[k]; });
|
||||
return body;
|
||||
}
|
||||
function enqueueSATTarget(target, overrides) {
|
||||
return fetch('/api/sat/' + target + '/run', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify(satRequestBody(target, overrides))}).then(r => r.json());
|
||||
}
|
||||
function streamSATTask(taskId, title, resetTerminal) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display = 'block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
if (resetTerminal) term.textContent = '';
|
||||
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||
return new Promise(resolve => {
|
||||
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
satES.onmessage = e => { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
satES.addEventListener('done', e => {
|
||||
satES.close(); satES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: !e.data, error: e.data || ''});
|
||||
});
|
||||
satES.onerror = () => {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: false, error: 'stream disconnected'});
|
||||
};
|
||||
});
|
||||
}
|
||||
function selectedAMDValidateTargets() {
|
||||
const targets = [];
|
||||
const gpu = document.getElementById('sat-amd-target');
|
||||
const mem = document.getElementById('sat-amd-mem-target');
|
||||
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||
return targets;
|
||||
}
|
||||
function runSAT(target) { return runSATWithOverrides(target, null); }
|
||||
function runSATWithOverrides(target, overrides) {
|
||||
const title = (overrides && overrides.display_name) || target;
|
||||
document.getElementById('sat-output').style.display = 'block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||
return enqueueSATTarget(target, overrides).then(d => streamSATTask(d.task_id, title, false));
|
||||
}
|
||||
function runNvidiaFabricValidate(target) {
|
||||
const indices = satSelectedGPUIndices();
|
||||
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||
}
|
||||
function runNvidiaValidateSet(target) {
|
||||
const sel = satSelectedGPUIndices();
|
||||
if (!sel.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||
return runSATWithOverrides(target, {gpu_indices: sel, display_name: satLabels()[target] || target});
|
||||
}
|
||||
function runAMDValidateSet() {
|
||||
const targets = selectedAMDValidateTargets();
|
||||
if (!targets.length) return;
|
||||
if (targets.length === 1) return runSAT(targets[0]);
|
||||
const term = document.getElementById('sat-terminal');
|
||||
document.getElementById('sat-output').style.display = 'block';
|
||||
document.getElementById('sat-title').textContent = '— amd';
|
||||
term.textContent = 'Running AMD check set...\n';
|
||||
const labels = satLabels();
|
||||
const runNext = idx => {
|
||||
if (idx >= targets.length) return Promise.resolve();
|
||||
const t = targets[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[t] + '\n';
|
||||
return enqueueSATTarget(t).then(d => streamSATTask(d.task_id, labels[t], false)).then(() => runNext(idx + 1));
|
||||
};
|
||||
return runNext(0);
|
||||
}
|
||||
function runAllCheckSAT() {
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const nvidiaIndices = satSelectedGPUIndices();
|
||||
const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
const baseTargets = ['cpu', 'memory', 'storage'];
|
||||
const amdTargets = selectedAMDValidateTargets();
|
||||
const expanded = [];
|
||||
baseTargets.forEach(t => expanded.push({target: t}));
|
||||
if (nvidiaIndices.length) {
|
||||
nvidiaAllTargets.forEach(t => {
|
||||
const btn = document.getElementById('sat-btn-' + t);
|
||||
if (!(btn && btn.disabled)) expanded.push({target: t, overrides: {gpu_indices: nvidiaIndices, display_name: satLabels()[t] || t}});
|
||||
});
|
||||
}
|
||||
amdTargets.forEach(t => expanded.push({target: t}));
|
||||
if (!expanded.length) { status.textContent = 'No tasks selected.'; return; }
|
||||
const total = expanded.length;
|
||||
const runNext = idx => {
|
||||
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||
const item = expanded[idx];
|
||||
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||
return enqueueSATTarget(item.target, item.overrides).then(() => runNext(idx + 1));
|
||||
};
|
||||
runNext(0).catch(err => { status.textContent = 'Error: ' + err.message; });
|
||||
}
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
if (!btn) return;
|
||||
btn.disabled = true; btn.title = reason; btn.style.opacity = '0.4';
|
||||
const card = btn.closest('.card');
|
||||
if (card) {
|
||||
let note = card.querySelector('.sat-unavail');
|
||||
if (!note) {
|
||||
note = document.createElement('p');
|
||||
note.className = 'sat-unavail';
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||
const body = card.querySelector('.card-body');
|
||||
if (body) body.insertBefore(note, body.firstChild);
|
||||
}
|
||||
note.textContent = reason;
|
||||
}
|
||||
}
|
||||
fetch('/api/gpu/presence').then(r => r.json()).then(gp => {
|
||||
if (!gp.nvidia) ['nvidia','nvidia-interconnect','nvidia-bandwidth'].forEach(t => disableSATCard(t, 'No NVIDIA GPU detected'));
|
||||
if (!gp.amd) {
|
||||
disableSATCard('amd', 'No AMD GPU detected');
|
||||
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(id => {
|
||||
const cb = document.getElementById(id);
|
||||
if (cb) { cb.disabled = true; cb.checked = false; }
|
||||
});
|
||||
}
|
||||
});
|
||||
satGPULoadInit();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||
actions := `<button id="sat-btn-` + id + `" class="btn btn-primary btn-sm" onclick="` + runAction + `">Run</button>`
|
||||
if strings.TrimSpace(headerActions) != "" {
|
||||
|
||||
@@ -24,41 +24,54 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
body = renderDashboard(opts)
|
||||
case "audit":
|
||||
pageID = "audit"
|
||||
title = "Audit"
|
||||
title = "1. Audit"
|
||||
body = renderAudit()
|
||||
case "validate":
|
||||
pageID = "validate"
|
||||
title = "Validate"
|
||||
body = renderValidate(opts)
|
||||
case "burn":
|
||||
pageID = "burn"
|
||||
title = "Burn"
|
||||
body = renderBurn()
|
||||
case "check":
|
||||
pageID = "check"
|
||||
title = "2. Check"
|
||||
body = renderCheck(opts)
|
||||
case "load":
|
||||
pageID = "load"
|
||||
title = "3. Load"
|
||||
body = renderLoad()
|
||||
case "speed":
|
||||
pageID = "speed"
|
||||
title = "4. Speed"
|
||||
body = renderSpeed(opts)
|
||||
case "endurance":
|
||||
pageID = "endurance"
|
||||
title = "5. Endurance"
|
||||
body = renderEndurance(opts)
|
||||
case "tools":
|
||||
pageID = "tools"
|
||||
title = "6. Tools"
|
||||
body = renderTools()
|
||||
case "settings":
|
||||
pageID = "settings"
|
||||
title = "7. Settings"
|
||||
body = renderSettings(opts)
|
||||
// Legacy routes (redirected at HTTP level in handlePage; these are fallbacks)
|
||||
case "validate", "tests":
|
||||
pageID = "check"
|
||||
title = "2. Check"
|
||||
body = renderCheck(opts)
|
||||
case "burn", "burn-in":
|
||||
pageID = "load"
|
||||
title = "3. Load"
|
||||
body = renderLoad()
|
||||
case "benchmark":
|
||||
pageID = "benchmark"
|
||||
title = "Benchmark"
|
||||
body = renderBenchmark(opts)
|
||||
pageID = "speed"
|
||||
title = "4. Speed"
|
||||
body = renderSpeed(opts)
|
||||
case "tasks":
|
||||
pageID = "tasks"
|
||||
title = "Tasks"
|
||||
body = renderTasks()
|
||||
case "tools":
|
||||
pageID = "tools"
|
||||
title = "Tools"
|
||||
body = renderTools()
|
||||
// Legacy routes kept accessible but not in nav
|
||||
// Hidden pages (not in nav, accessible by direct URL)
|
||||
case "metrics":
|
||||
pageID = "metrics"
|
||||
title = "Live Metrics"
|
||||
body = renderMetrics()
|
||||
case "tests":
|
||||
pageID = "validate"
|
||||
title = "Acceptance Tests"
|
||||
body = renderValidate(opts)
|
||||
case "burn-in":
|
||||
pageID = "burn"
|
||||
title = "Burn-in Tests"
|
||||
body = renderBurn()
|
||||
case "network":
|
||||
pageID = "network"
|
||||
title = "Network"
|
||||
|
||||
287
audit/internal/webui/saa_dmi.go
Normal file
287
audit/internal/webui/saa_dmi.go
Normal file
@@ -0,0 +1,287 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// dmiField is one entry parsed from the DMI.txt file written by
// "saa GetDmiInfo".
type dmiField struct {
	Name  string `json:"name"`  // display name; equals Shn for key=value lines
	Shn   string `json:"shn"`   // short name, validated against shnRE
	Value string `json:"value"` // value with surrounding whitespace trimmed
}

// saaChange is a single short-name/value pair exchanged as JSON.
type saaChange struct {
	Shn   string `json:"shn"`
	Value string `json:"value"`
}

// shnRE matches a valid short name: 1 to 16 ASCII letters, digits or
// underscores.
var shnRE = regexp.MustCompile(`^[A-Za-z0-9_]{1,16}$`)

// parseDMIFile parses the DMI.txt produced by "saa GetDmiInfo".
// Supports two formats:
//   - Name|Shn|Value (pipe-separated, primary)
//   - Shn=Value (key=value fallback)
//
// Lines starting with '#', empty lines, "version=..." (case-insensitive) and
// section headers (lines starting with '[') are skipped. A line is accepted
// only when its short name matches shnRE; anything else is silently ignored.
// The result is nil when no field was found.
func parseDMIFile(content string) []dmiField {
	var fields []dmiField
	for _, line := range strings.Split(content, "\n") {
		// TrimSpace also strips a trailing '\r' from CRLF files.
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") || strings.HasPrefix(line, "[") {
			continue
		}
		if strings.HasPrefix(strings.ToLower(line), "version=") {
			continue
		}

		// Primary format: Name|Shn|Value. The value may itself contain '|'.
		if parts := strings.SplitN(line, "|", 3); len(parts) == 3 {
			shn := strings.TrimSpace(parts[1])
			if shnRE.MatchString(shn) {
				fields = append(fields, dmiField{
					Name:  strings.TrimSpace(parts[0]),
					Shn:   shn,
					Value: strings.TrimSpace(parts[2]),
				})
				continue
			}
			// Invalid short name: fall through and try the key=value form.
		}

		// Fallback format: Shn=Value, split at the first '='.
		if idx := strings.IndexByte(line, '='); idx > 0 {
			shn := strings.TrimSpace(line[:idx])
			if shnRE.MatchString(shn) {
				fields = append(fields, dmiField{
					Name:  shn,
					Shn:   shn,
					Value: strings.TrimSpace(line[idx+1:]),
				})
			}
		}
	}
	return fields
}
|
||||
|
||||
func (h *handler) handleAPISAADMIRead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
tmpDir, err := os.MkdirTemp("", "bee-saa-*")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "create temp dir: "+err.Error())
|
||||
return
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
dmiFile := filepath.Join(tmpDir, "DMI.txt")
|
||||
out, err := exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite").CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, "saa GetDmiInfo: "+msg)
|
||||
return
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(dmiFile)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "read DMI file: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
fields := parseDMIFile(string(raw))
|
||||
if len(fields) == 0 {
|
||||
writeError(w, http.StatusInternalServerError, "no DMI fields found (file may be empty — reboot the server and try again)")
|
||||
return
|
||||
}
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISAADMIWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []saaChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
for _, c := range req.Changes {
|
||||
if !shnRE.MatchString(c.Shn) {
|
||||
writeError(w, http.StatusUnprocessableEntity, "invalid shn: "+c.Shn)
|
||||
return
|
||||
}
|
||||
if len(c.Value) == 0 || len(c.Value) > 64 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value length out of range for shn: "+c.Shn)
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch < 0x20 || ch > 0x7E {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value contains non-printable character for shn: "+c.Shn)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("saa-dmi-write"),
|
||||
Name: fmt.Sprintf("SAA DMI Write (%d field(s))", len(req.Changes)),
|
||||
Target: "saa-dmi-write",
|
||||
Priority: defaultTaskPriority("saa-dmi-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
SAADmiChanges: req.Changes,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runSAADMIWriteTask(ctx context.Context, j *jobState, exportDir string, p taskParams) error {
|
||||
tmpDir, err := os.MkdirTemp("", "bee-saa-*")
|
||||
if err != nil {
|
||||
return fmt.Errorf("create temp dir: %w", err)
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
dmiFile := filepath.Join(tmpDir, "DMI.txt")
|
||||
|
||||
j.append("Reading current DMI configuration...")
|
||||
if err := streamCmdJob(j, exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite")); err != nil {
|
||||
return fmt.Errorf("GetDmiInfo: %w", err)
|
||||
}
|
||||
|
||||
backupDir := filepath.Join(exportDir, "dmi-backups")
|
||||
if err := os.MkdirAll(backupDir, 0o755); err != nil {
|
||||
return fmt.Errorf("create backup dir: %w", err)
|
||||
}
|
||||
backupName := "dmi-" + time.Now().UTC().Format("20060102-150405") + ".txt"
|
||||
backupPath := filepath.Join(backupDir, backupName)
|
||||
raw, err := os.ReadFile(dmiFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read DMI file: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(backupPath, raw, 0o644); err != nil {
|
||||
return fmt.Errorf("write backup: %w", err)
|
||||
}
|
||||
j.append("Backup saved: dmi-backups/" + backupName)
|
||||
|
||||
for _, c := range p.SAADmiChanges {
|
||||
j.append("Setting " + c.Shn + " = " + c.Value)
|
||||
cmd := exec.CommandContext(ctx, "saa", "-c", "EditDmiInfo", "--file", dmiFile, "--shn", c.Shn, "--value", c.Value)
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("EditDmiInfo %s: %w", c.Shn, err)
|
||||
}
|
||||
}
|
||||
|
||||
j.append("Applying changes to hardware...")
|
||||
if err := streamCmdJob(j, exec.CommandContext(ctx, "saa", "-c", "ChangeDmiInfo", "--file", dmiFile)); err != nil {
|
||||
return fmt.Errorf("ChangeDmiInfo: %w", err)
|
||||
}
|
||||
|
||||
j.append("Done. Reboot the server for changes to take effect.")
|
||||
return nil
|
||||
}
|
||||
|
||||
// renderSAADMICard returns the HTML and inline JavaScript for the "SAA — DMI"
// tools card.
//
// The script reads fields from GET /api/tools/saa-dmi, renders them as an
// editable table, tracks which inputs differ from their loaded value, and
// posts the changed ones to POST /api/tools/saa-dmi/write, then polls
// /api/tasks until the returned task finishes.
//
// saaDMIEsc must map the five HTML-special characters to their entities:
// field names and values come from the DMI file and are inserted into the
// page through innerHTML and attribute values.
func renderSAADMICard() string {
	return `<div class="card"><div class="card-head">SAA — DMI <button class="btn btn-sm btn-secondary" onclick="saaDMIRead()" style="margin-left:auto">Read</button></div><div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits DMI fields via SAA (In-Band). Requires <code>saa</code> on PATH.</p>
<div id="saa-dmi-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
<div id="saa-dmi-table"></div>
<div id="saa-dmi-save-row" style="display:none;margin-top:12px">
<button class="btn btn-primary" id="saa-dmi-save-btn" onclick="saaDMISave()">Save</button>
<span id="saa-dmi-save-msg" style="font-size:13px;color:var(--muted);margin-left:10px"></span>
</div>
<script>
function saaDMIEsc(s) {
return String(s==null?'':s).replace(/[&<>"']/g,function(c){return{'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];});
}
function saaDMIUpdateSaveBtn() {
var inputs = document.querySelectorAll('#saa-dmi-table input[data-original]');
var dirty = [];
inputs.forEach(function(inp){if(inp.value!==inp.dataset.original)dirty.push(inp);});
var row = document.getElementById('saa-dmi-save-row');
var btn = document.getElementById('saa-dmi-save-btn');
if(dirty.length>0){row.style.display='';btn.textContent='Save ('+dirty.length+' changed)';}
else{row.style.display='none';}
}
function saaDMIRead() {
var status = document.getElementById('saa-dmi-status');
var table = document.getElementById('saa-dmi-table');
var saveRow = document.getElementById('saa-dmi-save-row');
status.textContent = 'Reading...';
status.style.color = 'var(--muted)';
table.innerHTML = '';
saveRow.style.display = 'none';
fetch('/api/tools/saa-dmi').then(function(r){return r.json().then(function(d){if(!r.ok)throw new Error(d.error||('HTTP '+r.status));return d;});}).then(function(fields){
status.textContent = fields.length+' field(s) loaded.';
var rows = fields.map(function(f){
return '<tr>'
+'<td style="font-size:13px;white-space:nowrap;padding-right:8px">'+saaDMIEsc(f.name)+'</td>'
+'<td style="font-family:monospace;font-size:13px;white-space:nowrap;padding-right:8px">'+saaDMIEsc(f.shn)+'</td>'
+'<td><input type="text" value="'+saaDMIEsc(f.value)+'" data-shn="'+saaDMIEsc(f.shn)+'" data-original="'+saaDMIEsc(f.value)+'" oninput="saaDMIMarkDirty(this)" style="width:100%;font-family:monospace;font-size:13px;border:1px solid var(--line);padding:3px 6px;border-radius:3px"></td>'
+'<td id="saa-dmi-dirty-'+saaDMIEsc(f.shn)+'" style="font-size:12px;color:var(--warn,#b45309);width:50px;padding-left:6px"></td>'
+'</tr>';
}).join('');
table.innerHTML = '<table style="width:100%;border-collapse:collapse"><tr><th style="text-align:left;font-size:13px;padding-bottom:6px">Field</th><th style="text-align:left;font-size:13px;padding-bottom:6px">Shn</th><th style="text-align:left;font-size:13px;padding-bottom:6px">Value</th><th></th></tr>'+rows+'</table>';
}).catch(function(e){
status.textContent = 'Error: '+e.message;
status.style.color = 'var(--crit-fg,#9f3a38)';
});
}
function saaDMIMarkDirty(inp) {
var shn = inp.dataset.shn;
var cell = document.getElementById('saa-dmi-dirty-'+shn);
if(cell)cell.textContent = inp.value!==inp.dataset.original?'changed':'';
saaDMIUpdateSaveBtn();
}
function saaDMIWaitTask(taskID) {
var msg = document.getElementById('saa-dmi-save-msg');
msg.textContent = 'Task '+taskID+' queued...';
msg.style.color = 'var(--muted)';
var timer = setInterval(function(){
fetch('/api/tasks').then(function(r){return r.json();}).then(function(tasks){
var task = (tasks||[]).find(function(t){return t.id===taskID;});
if(!task)return;
if(task.status==='done'||task.status==='failed'||task.status==='cancelled'){
clearInterval(timer);
msg.textContent = task.status==='done'?'Saved. Reboot to apply.':'Failed: '+(task.error||task.status);
msg.style.color = task.status==='done'?'var(--ok,green)':'var(--crit-fg,#9f3a38)';
document.getElementById('saa-dmi-save-btn').disabled = false;
}
}).catch(function(){});
}, 1500);
}
function saaDMISave() {
var inputs = document.querySelectorAll('#saa-dmi-table input[data-original]');
var changes = [];
inputs.forEach(function(inp){if(inp.value!==inp.dataset.original)changes.push({shn:inp.dataset.shn,value:inp.value});});
if(!changes.length)return;
var names = changes.map(function(c){return c.shn;}).join(', ');
if(!window.confirm('Apply DMI changes for: '+names+'?\n\nThe server will need to be rebooted for changes to take effect.'))return;
var btn = document.getElementById('saa-dmi-save-btn');
var msg = document.getElementById('saa-dmi-save-msg');
btn.disabled = true;
msg.textContent = 'Submitting...';
msg.style.color = 'var(--muted)';
fetch('/api/tools/saa-dmi/write',{
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({changes:changes})
}).then(function(r){return r.json().then(function(d){if(!r.ok)throw new Error(d.error||('HTTP '+r.status));return d;});}).then(function(d){
saaDMIWaitTask(d.task_id);
}).catch(function(e){
msg.textContent = 'Error: '+e.message;
msg.style.color = 'var(--crit-fg,#9f3a38)';
btn.disabled = false;
});
}
</script>
</div></div>`
}
|
||||
@@ -314,6 +314,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||
mux.HandleFunc("GET /api/tools/nvme-formats", h.handleAPINVMeFormats)
|
||||
mux.HandleFunc("POST /api/tools/nvme-format/run", h.handleAPINVMeFormatRun)
|
||||
mux.HandleFunc("GET /api/tools/saa-dmi", h.handleAPISAADMIRead)
|
||||
mux.HandleFunc("POST /api/tools/saa-dmi/write", h.handleAPISAADMIWrite)
|
||||
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
@@ -1419,13 +1421,16 @@ func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
|
||||
if page == "" {
|
||||
page = "dashboard"
|
||||
}
|
||||
// Redirect old routes to new names
|
||||
// Redirect legacy routes to new named pages
|
||||
switch page {
|
||||
case "tests":
|
||||
http.Redirect(w, r, "/validate", http.StatusMovedPermanently)
|
||||
case "validate", "tests":
|
||||
http.Redirect(w, r, "/check", http.StatusMovedPermanently)
|
||||
return
|
||||
case "burn-in":
|
||||
http.Redirect(w, r, "/burn", http.StatusMovedPermanently)
|
||||
case "burn", "burn-in":
|
||||
http.Redirect(w, r, "/load", http.StatusMovedPermanently)
|
||||
return
|
||||
case "benchmark":
|
||||
http.Redirect(w, r, "/speed", http.StatusMovedPermanently)
|
||||
return
|
||||
}
|
||||
body := renderPage(page, h.opts)
|
||||
|
||||
@@ -707,13 +707,13 @@ func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/speed", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`href="/benchmark"`,
|
||||
`href="/speed"`,
|
||||
`id="benchmark-gpu-list"`,
|
||||
`/api/gpu/nvidia`,
|
||||
`/api/bee-bench/nvidia/perf/run`,
|
||||
@@ -769,7 +769,7 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
|
||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/speed", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
@@ -791,54 +791,53 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
func TestCheckPageRendersGPUSelectionAndNvidiaCards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA GPU Targeted Stress`,
|
||||
`nvidia-targeted-stress`,
|
||||
`controlled NVIDIA DCGM load`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
`NVIDIA GPU Selection`,
|
||||
`All NVIDIA validate tasks use only the GPUs selected here.`,
|
||||
`Select All`,
|
||||
`id="sat-gpu-list"`,
|
||||
`Select All`,
|
||||
`id="sat-btn-nvidia"`,
|
||||
`NVIDIA Interconnect (NCCL)`,
|
||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||
`Non-destructive`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
t.Fatalf("check page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
||||
func TestCheckPageRendersNvidiaFabricCards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA Interconnect (NCCL)`,
|
||||
`Validate and Stress:`,
|
||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||
`nvbandwidth runs all built-in tests without a time limit`,
|
||||
`nvbandwidth`,
|
||||
`all_reduce_perf`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
t.Fatalf("check page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
func TestLoadPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/load", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
@@ -847,7 +846,6 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
`NVIDIA Max Compute Load`,
|
||||
`dcgmproftester`,
|
||||
`NCCL`,
|
||||
`Validate → Stress mode`,
|
||||
`id="burn-gpu-list"`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
|
||||
@@ -382,6 +382,12 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
|
||||
break
|
||||
}
|
||||
err = runNVMeFormatTask(ctx, j, t.params.Device, t.params.LBAF)
|
||||
case "saa-dmi-write":
|
||||
if len(t.params.SAADmiChanges) == 0 {
|
||||
err = fmt.Errorf("no changes provided")
|
||||
break
|
||||
}
|
||||
err = runSAADMIWriteTask(ctx, j, opts.ExportDir, t.params)
|
||||
default:
|
||||
j.append("ERROR: unknown target: " + t.Target)
|
||||
j.finish("unknown target")
|
||||
|
||||
@@ -137,9 +137,10 @@ type taskParams struct {
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
LBAF int `json:"lbaf,omitempty"`
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
LBAF int `json:"lbaf,omitempty"`
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
SAADmiChanges []saaChange `json:"saa_dmi_changes,omitempty"`
|
||||
}
|
||||
|
||||
type persistedTask struct {
|
||||
|
||||
2
bible
2
bible
Submodule bible updated: d2600f1279...1977730d93
185
bible-local/architecture/api-surface.md
Normal file
185
bible-local/architecture/api-surface.md
Normal file
@@ -0,0 +1,185 @@
|
||||
# API Surface
|
||||
|
||||
HTTP endpoints exposed by `bee web` (binds `0.0.0.0:80`).
|
||||
Handler registration: `audit/internal/webui/server.go` → `NewHandler()`.
|
||||
|
||||
---
|
||||
|
||||
## Health & readiness
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------|-----------------------------------------------------|
|
||||
| GET | `/healthz` | Always 200. Used by load balancers / boot scripts. |
|
||||
| GET | `/api/ready` | 200 when audit JSON exists and is readable. |
|
||||
| GET | `/loading` | HTML loading page shown before first audit. |
|
||||
|
||||
---
|
||||
|
||||
## Audit
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-----------------------|--------------------------------------------------------------|
|
||||
| GET | `/audit.json` | Latest audit JSON with SAT overlay applied. |
|
||||
| GET | `/runtime-health.json`| Latest runtime preflight JSON. |
|
||||
| POST | `/api/audit/run` | Enqueue a full `bee audit` run. Returns task ID. |
|
||||
| GET | `/api/audit/stream` | SSE: audit run log lines (`data:` + newline per line). |
|
||||
| GET | `/api/preflight` | Run runtime preflight check (synchronous, returns JSON). |
|
||||
| GET | `/api/hardware-summary` | Hardware health summary (status counts + failures). |
|
||||
| GET | `/api/components/{type}` | HTML fragment for component detail dialog (e.g. `cpu`, `memory`, `storage`, `pcie`). |
|
||||
|
||||
---
|
||||
|
||||
## SAT (System Acceptance Testing)
|
||||
|
||||
All SAT run endpoints enqueue an async task. Response: `{"task_id": "..."}`.
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|--------------------------------------------|-----------------------------------|
|
||||
| POST | `/api/sat/nvidia/run` | NVIDIA DCGM SAT |
|
||||
| POST | `/api/sat/nvidia-targeted-stress/run` | NVIDIA targeted stress validate |
|
||||
| POST | `/api/sat/nvidia-compute/run` | NVIDIA max compute load |
|
||||
| POST | `/api/sat/nvidia-targeted-power/run` | NVIDIA targeted power |
|
||||
| POST | `/api/sat/nvidia-pulse/run` | NVIDIA pulse test |
|
||||
| POST | `/api/sat/nvidia-interconnect/run` | NCCL all_reduce_perf |
|
||||
| POST | `/api/sat/nvidia-bandwidth/run` | NVBandwidth test |
|
||||
| POST | `/api/sat/nvidia-stress/run` | NVIDIA stress pack |
|
||||
| POST | `/api/sat/memory/run` | Memory acceptance |
|
||||
| POST | `/api/sat/storage/run` | Storage acceptance (smartctl) |
|
||||
| POST | `/api/sat/cpu/run` | CPU acceptance (stress-ng) |
|
||||
| POST | `/api/sat/amd/run` | AMD GPU SAT (ROCm) |
|
||||
| POST | `/api/sat/amd-mem/run` | AMD memory integrity + bandwidth |
|
||||
| POST | `/api/sat/amd-bandwidth/run` | AMD memory bandwidth |
|
||||
| POST | `/api/sat/amd-stress/run` | AMD GPU stress |
|
||||
| POST | `/api/sat/memory-stress/run` | Memory stress |
|
||||
| POST | `/api/sat/sat-stress/run` | Combined storage+memory stress |
|
||||
| POST | `/api/sat/platform-stress/run` | Fan + thermal stress |
|
||||
| GET | `/api/sat/stream` | SSE: live SAT log stream |
|
||||
| POST | `/api/sat/abort` | Abort the running SAT task |
|
||||
|
||||
---
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-----------------------------------------|----------------------------------------------|
|
||||
| POST | `/api/bee-bench/nvidia/perf/run` | NVIDIA performance benchmark |
|
||||
| POST | `/api/bee-bench/nvidia/power/run` | NVIDIA power benchmark |
|
||||
| POST | `/api/bee-bench/nvidia/autotune/run` | Power source autotune (prerequisite for benchmarks) |
|
||||
| GET | `/api/bee-bench/nvidia/autotune/status` | Current autotune result / status |
|
||||
| GET | `/api/benchmark/results` | List completed benchmark result archives |
|
||||
|
||||
---
|
||||
|
||||
## Tasks (async job queue)
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-----------------------------|----------------------------------------------------|
|
||||
| GET | `/api/tasks` | List all tasks with status |
|
||||
| POST | `/api/tasks/cancel-all` | Cancel all pending/running tasks |
|
||||
| POST | `/api/tasks/kill-workers` | Force-kill worker goroutines |
|
||||
| POST | `/api/tasks/{id}/cancel` | Cancel a specific task |
|
||||
| POST | `/api/tasks/{id}/priority` | Elevate task priority |
|
||||
| GET | `/api/tasks/{id}/stream` | SSE: live log stream for a task |
|
||||
| GET | `/api/tasks/{id}/charts` | List chart names for a task |
|
||||
| GET | `/api/tasks/{id}/chart/` | SVG chart for a task result |
|
||||
| GET | `/tasks/{id}` | HTML task detail page |
|
||||
|
||||
---
|
||||
|
||||
## Services
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|---------------------------|--------------------------------------------------|
|
||||
| GET | `/api/services` | List bee-* systemd services and their states |
|
||||
| POST | `/api/services/action` | start/stop/restart a service |
|
||||
|
||||
---
|
||||
|
||||
## Network
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------------------|-----------------------------------------------------|
|
||||
| GET | `/api/network` | List interfaces with state and IPv4 addresses |
|
||||
| POST | `/api/network/dhcp` | Run dhclient on one or all interfaces |
|
||||
| POST | `/api/network/static` | Set static IPv4 address |
|
||||
| POST | `/api/network/toggle` | Bring interface up or down |
|
||||
| POST | `/api/network/confirm` | Confirm pending network change (clears rollback) |
|
||||
| POST | `/api/network/rollback` | Restore pre-change network snapshot |
|
||||
|
||||
---
|
||||
|
||||
## Export
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-------------------------------|---------------------------------------------------|
|
||||
| GET | `/export/support.tar.gz` | Download support bundle (live-generated) |
|
||||
| GET | `/export/file` | Download a file from the export dir by path param |
|
||||
| GET | `/export/` | Browse export dir (HTML index) |
|
||||
| GET | `/api/export/list` | JSON list of files in export dir |
|
||||
| GET | `/api/export/usb` | List removable USB targets available for export |
|
||||
|
||||
---
|
||||
|
||||
## GPU
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------------------|----------------------------------------------------|
|
||||
| GET | `/api/gpu/presence` | `{"nvidia": bool, "amd": bool}` |
|
||||
| GET | `/api/gpu/nvidia` | List NVIDIA GPUs from nvidia-smi |
|
||||
| GET | `/api/gpu/nvidia-status` | Per-GPU status (ECC, power, throttle) |
|
||||
| POST | `/api/gpu/nvidia-reset` | GPU reset by index |
|
||||
| GET | `/api/gpu/tools` | nvidia-smi / rocm-smi tool availability |
|
||||
|
||||
---
|
||||
|
||||
## System
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|------------------------------|---------------------------------------------------|
|
||||
| GET | `/api/system/ram-status` | toram boot state and ISO copy status |
|
||||
| POST | `/api/system/install-to-ram` | Copy ISO to RAM (background task) |
|
||||
| GET | `/api/install/disks` | List block devices suitable for disk installation |
|
||||
| POST | `/api/install/run` | Install bee to disk (background task) |
|
||||
|
||||
---
|
||||
|
||||
## Tools & NVMe
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-------------------------------|--------------------------------------------------|
|
||||
| GET | `/api/tools/check` | Check availability of required CLI tools |
|
||||
| GET | `/api/tools/nvme-formats` | List NVMe format options for a device |
|
||||
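| POST   | `/api/tools/nvme-format/run`  | Run nvme-format on a device                      |
| GET    | `/api/tools/saa-dmi`          | Read DMI fields via `saa GetDmiInfo` (synchronous, returns JSON) |
| POST   | `/api/tools/saa-dmi/write`    | Enqueue a DMI write task. Returns task ID.       |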
|
||||
|
||||
---
|
||||
|
||||
## Live metrics
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|------------------------------|---------------------------------------------------|
|
||||
| GET | `/api/metrics/stream` | SSE: live metrics (GPU power, temp, utilization) |
|
||||
| GET | `/api/metrics/latest` | Latest metrics snapshot (JSON) |
|
||||
| GET | `/api/metrics/chart/` | SVG chart for a metric over time |
|
||||
| GET | `/api/metrics/export.csv` | Download metrics history as CSV |
|
||||
|
||||
---
|
||||
|
||||
## Blackbox logging
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------------------|-----------------------------------------------|
|
||||
| GET | `/api/blackbox/status` | Blackbox log state (enabled, size, path) |
|
||||
| POST | `/api/blackbox/enable` | Start recording blackbox log |
|
||||
| POST | `/api/blackbox/disable` | Stop recording, flush to disk |
|
||||
|
||||
---
|
||||
|
||||
## UI pages
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|------------|-----------------------------------------------|
|
||||
| GET | `/` | Main dashboard (serves all page routes) |
|
||||
| GET | `/viewer` | Standalone JSON viewer for uploaded audit files |
|
||||
|
||||
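All pages are rendered server-side as HTML. The `/` route handles sub-paths such as
`/network`, `/services`, `/sat`, `/check`, `/load`, `/speed`, `/install`, `/export`.
Legacy paths redirect permanently to their new names: `/validate` and `/tests` → `/check`,
`/burn` and `/burn-in` → `/load`, `/benchmark` → `/speed`.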
|
||||
137
bible-local/architecture/data-model.md
Normal file
137
bible-local/architecture/data-model.md
Normal file
@@ -0,0 +1,137 @@
|
||||
# Data Model
|
||||
|
||||
The canonical output of `bee audit` is a `HardwareIngestRequest` JSON document accepted
|
||||
by the Reanimator `/api/ingest/hardware` endpoint. The ingest endpoint uses a strict
|
||||
decoder — unknown fields cause `400 Bad Request`.
|
||||
|
||||
Source of truth: `audit/internal/schema/hardware.go`
|
||||
|
||||
---
|
||||
|
||||
## Top-level: HardwareIngestRequest
|
||||
|
||||
```
|
||||
HardwareIngestRequest
|
||||
├── collected_at string RFC3339 UTC timestamp of collection
|
||||
├── hardware HardwareSnapshot
|
||||
├── runtime RuntimeHealth? from bee-runtime-preflight service
|
||||
├── filename string?
|
||||
├── source_type string?
|
||||
├── protocol string?
|
||||
└── target_host string?
|
||||
```
|
||||
|
||||
`collected_at` is the primary sort key used by Reanimator to deduplicate ingests.
|
||||
|
||||
---
|
||||
|
||||
## HardwareSnapshot
|
||||
|
||||
All component arrays are `omitempty` — absent when the collector finds nothing.
|
||||
|
||||
| JSON key | Go type | Source |
|
||||
|-------------------|----------------------------|------------------------------|
|
||||
| `board` | HardwareBoard | dmidecode type 1/2 |
|
||||
| `firmware` | []HardwareFirmwareRecord | dmidecode type 0/13 |
|
||||
| `cpus` | []HardwareCPU | dmidecode type 4 |
|
||||
| `memory` | []HardwareMemory | dmidecode type 17 |
|
||||
| `storage` | []HardwareStorage | lsblk + nvme-cli + smartctl |
|
||||
| `pcie_devices` | []HardwarePCIeDevice | lspci |
|
||||
| `power_supplies` | []HardwarePowerSupply | ipmitool fru + sdr |
|
||||
| `sensors` | *HardwareSensors | sensors -j |
|
||||
| `event_logs` | []HardwareEventLog | ipmitool sel + journald |
|
||||
| `platform_config` | *json.RawMessage | reserved, nil until used |
|
||||
| `vroc_license` | *string | vroc-cli |
|
||||
|
||||
---
|
||||
|
||||
## Identity keys
|
||||
|
||||
Reanimator uses these fields to match components across successive audits:
|
||||
|
||||
| Component | Identity key |
|
||||
|----------------|------------------------------------------------|
|
||||
| Board | `board.serial_number` (required, never empty) |
|
||||
| CPU | `serial_number` if present; else generated key |
|
||||
| Memory DIMM | `serial_number` — absent DIMMs have `present: false` |
|
||||
| Storage | `serial_number` if present; else `linux_device` from Telemetry |
|
||||
| PCIe device | `bdf` (Bus:Device.Function address) |
|
||||
| PSU | `slot` |
|
||||
|
||||
Components without a stable identity are still emitted but may not be matched across runs.
|
||||
|
||||
---
|
||||
|
||||
## HardwareComponentStatus (embedded in all components)
|
||||
|
||||
```go
|
||||
type HardwareComponentStatus struct {
|
||||
Status *string `json:"status,omitempty"` // OK | Warning | Critical | Unknown
|
||||
ErrorDescription *string `json:"error_description,omitempty"`
|
||||
}
|
||||
```
|
||||
|
||||
Status is set by collectors and overwritten at render time by `ApplySATOverlay`
|
||||
(latest SAT run results are always merged on top before display).
|
||||
|
||||
---
|
||||
|
||||
## HardwarePCIeDevice
|
||||
|
||||
The most enriched component type. Key fields:
|
||||
|
||||
| JSON key | Meaning |
|
||||
|----------------------|------------------------------------------------|
|
||||
| `bdf` | PCI address (identity key), e.g. `0000:4b:00.0` |
|
||||
| `vendor_id` | Numeric PCI vendor ID (hex). Use this for classification — not `manufacturer`. |
|
||||
| `device_id` | Numeric PCI device ID (hex) |
|
||||
| `device_class` | Human-readable class, e.g. `VideoController` |
|
||||
| `manufacturer` | String label from lspci — for display only |
|
||||
| `model` | From nvidia-smi / rocm-smi — display name |
|
||||
| `link_speed` | Current PCIe link speed, e.g. `Gen4` |
|
||||
| `max_link_speed` | Max negotiated speed |
|
||||
| `link_width` | Current lane count |
|
||||
| `max_link_width` | Max lane count |
|
||||
| `temperature_c` | From nvidia-smi / rocm-smi |
|
||||
| `power_w` | Current power draw |
|
||||
| `ecc_uncorrected_total` | Cumulative ECC uncorrected errors (NVIDIA) |
|
||||
| `ecc_corrected_total` | Cumulative ECC corrected errors (NVIDIA) |
|
||||
| `hw_slowdown` | HW throttle active (NVIDIA) |
|
||||
| `telemetry` | Free-form map for vendor-specific extras |
|
||||
|
||||
**Classification rule**: use `vendor_id` (numeric PCI ID), never `manufacturer` string.
|
||||
|
||||
| Vendor | vendor_id |
|
||||
|-----------|-----------|
|
||||
| NVIDIA | `0x10de` |
|
||||
| AMD | `0x1002` |
|
||||
| Mellanox | `0x15b3` |
|
||||
| Aspeed | `0x1a03` |
|
||||
| Intel | `0x8086` |
|
||||
|
||||
Constants live in `audit/internal/collector/pci_vendors.go`.
|
||||
|
||||
---
|
||||
|
||||
## HardwareMemory
|
||||
|
||||
`location` field exists in the Go struct with `json:"-"` — it is intentionally excluded
|
||||
from JSON output because the Reanimator schema does not include it. It is used internally
|
||||
for DIMM telemetry matching only (`collector/memory_telemetry.go`).
|
||||
|
||||
---
|
||||
|
||||
## HardwareSensors
|
||||
|
||||
Sensor structs (`HardwareFanSensor`, `HardwareTemperatureSensor`,
|
||||
`HardwarePowerSensor`, `HardwareOtherSensor`) do **not** have a `location` field.
|
||||
Location was removed in contract v2.8. The Go types mirror the schema exactly.
|
||||
|
||||
---
|
||||
|
||||
## JSON naming convention
|
||||
|
||||
All JSON keys are `snake_case`. Go field names are `CamelCase`. The mapping is
|
||||
maintained by struct tags in `audit/internal/schema/hardware.go`.
|
||||
|
||||
All pointer fields use `omitempty` — absent means not collected (not zero).
|
||||
@@ -0,0 +1,41 @@
|
||||
# Decision: Skip PCIe link-speed warnings for disabled devices
|
||||
|
||||
**Date:** 2026-06-12
|
||||
**Status:** active
|
||||
|
||||
## Context
|
||||
|
||||
On HGX H100 SXM5 baseboards, the Microchip Switchtec PM41028 PSX PCIe switch
|
||||
(vendor 11F8, device 4128, NVIDIA subsystem 10DE:1643) appears in `lspci` as a
|
||||
"Memory controller". Its upstream link trains at Gen3 x2 while the device is
|
||||
capable of Gen4 x16. The device is permanently in a disabled state: memory access
|
||||
and bus-mastering are both off (Mem-, BusMaster-); `/sys/bus/pci/devices/<bdf>/enable`
|
||||
reads `0`.
|
||||
|
||||
This chip is the PCIe fabric management endpoint for the NVSwitch interconnect — it
|
||||
carries only management traffic at low bandwidth and is intentionally not activated
|
||||
by any Linux driver. The bee audit was reporting a `statusWarning` with message
|
||||
"PCIe link speed degraded" for this device, which is misleading because the device
|
||||
is not in the data path.
|
||||
|
||||
## Decision
|
||||
|
||||
`applyPCIeLinkSpeedWarning` reads `/sys/bus/pci/devices/<bdf>/enable` via the
|
||||
existing `readPCIIntAttribute` helper. If the value is `0` the function returns
|
||||
early without setting any warning status.
|
||||
|
||||
The check is vendor-agnostic: it applies to any PCIe device that Linux has not
|
||||
activated, regardless of make or model. This is consistent with the
|
||||
`no-hardcoded-vendors` contract — no vendor ID, device ID, or name string is
|
||||
used as a condition.
|
||||
|
||||
## Consequences
|
||||
|
||||
- PCIe fabric management endpoints, IPMI virtual devices, and other permanently
|
||||
disabled PCIe functions no longer produce spurious link-degradation warnings.
|
||||
- Real link degradation on active devices (GPUs, NICs, NVMe, NVLink bridges)
|
||||
continues to be detected and reported as before.
|
||||
- NVLink bridge cards retain their existing `statusCritical` path (they are always
|
||||
enabled, so the early return is never taken for them).
|
||||
- The Switchtec device on HGX H100 boards shows `statusOK` with no
|
||||
`error_description` in the audit JSON.
|
||||
@@ -7,3 +7,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||
| 2026-04-29 | Treat embedded submodules as read-only | active |
|
||||
| 2026-06-12 | Skip PCIe link-speed warnings for disabled devices | active |
|
||||
|
||||
312
bible-local/docs/grub-bitmap-error-history.md
Normal file
312
bible-local/docs/grub-bitmap-error-history.md
Normal file
@@ -0,0 +1,312 @@
|
||||
# GRUB Bitmap Error History
|
||||
|
||||
## Symptom
|
||||
|
||||
On some servers GRUB prints:
|
||||
|
||||
```text
|
||||
error: null src bitmap in grub_video_bitmap_create_scaled.
|
||||
Press any key to continue...
|
||||
```
|
||||
|
||||
The important new observation as of `v10.7` is:
|
||||
|
||||
- the error still appears even when the logo image block is removed from
|
||||
`iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt`
|
||||
- therefore the current error can no longer be explained only by
|
||||
`bee-logo.png` / `bee-logo.tga`
|
||||
|
||||
That does not prove the theme system is healthy. It proves only that the
|
||||
currently remaining failure is deeper than "bad logo file".
|
||||
|
||||
## Current State
|
||||
|
||||
Current source files:
|
||||
|
||||
- [iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt](../../iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt)
|
||||
has no `image` block anymore
|
||||
- [iso/builder/config/bootloaders/grub-efi/config.cfg](../../iso/builder/config/bootloaders/grub-efi/config.cfg)
|
||||
still does `insmod tga` and then `source /boot/grub/theme.cfg`
|
||||
|
||||
Implication:
|
||||
|
||||
- if the error still fires, the trigger is likely elsewhere in GRUB theme
|
||||
rendering or in the assets/config GRUB resolves while sourcing `theme.cfg`
|
||||
- the old "PNG parser fragility" story is no longer a sufficient explanation
|
||||
for the current failure mode
|
||||
|
||||
Current artifact facts:
|
||||
|
||||
- the provided `easy-bee-nvidia-v10.7-amd64.logs` build logs reference
|
||||
`linux-image-6.1.0-45`
|
||||
- the provided `easy-bee-nvidia-v10.7-amd64.iso` contains
|
||||
`live/initrd.img-6.1.0-45-amd64` and `live/vmlinuz-6.1.0-45-amd64`
|
||||
- a later `BOOT FAILED!` screenshot showed `live/initrd.img-6.1.0-44-amd64`
|
||||
and `live/vmlinuz-6.1.0-44-amd64`
|
||||
|
||||
Implication:
|
||||
|
||||
- the `BOOT FAILED!` screenshot is not from the same artifact as the provided
|
||||
`v10.7` ISO/log set
|
||||
- until the exact ISO filename and checksum are tied to that failure, the
|
||||
GRUB bitmap issue and the live-boot failure must be treated as separate
|
||||
problems
|
||||
|
||||
## Chronology
|
||||
|
||||
### 1. Initial bee GRUB theme introduction
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `d52ec67` `Stability hardening, build script fixes, GRUB bee logo`
|
||||
|
||||
What changed:
|
||||
|
||||
- bee-branded GRUB theme introduced
|
||||
- image block with explicit `width` / `height`
|
||||
|
||||
Observed result:
|
||||
|
||||
- bitmap error appeared
|
||||
|
||||
### 2. Remove explicit scaling dimensions
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `aa284ae` `fix(iso): avoid grub logo scaling error`
|
||||
|
||||
What changed:
|
||||
|
||||
- removed `width = 400`
|
||||
- removed `height = 400`
|
||||
|
||||
Reason stated by the change:
|
||||
|
||||
- try to avoid the scaling path
|
||||
|
||||
Observed result:
|
||||
|
||||
- error persisted
|
||||
|
||||
Conclusion:
|
||||
|
||||
- explicit width/height were not the sole trigger
|
||||
|
||||
### 3. Rework PNG handling and menu rendering
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `6112094` `fix(grub): fix bitmap error and menu rendering`
|
||||
|
||||
Commit message says the change was intended to:
|
||||
|
||||
- convert `bee-logo.png` to RGBA and strip metadata
|
||||
- move `terminal_output gfxterm` before `insmod png` / theme load
|
||||
- remove ASCII banner from GRUB menu area
|
||||
- fix theme typography/layout fields
|
||||
|
||||
Observed result:
|
||||
|
||||
- error persisted
|
||||
|
||||
Notes:
|
||||
|
||||
- this was still operating under the assumption that the issue was the PNG
|
||||
payload or the order of gfxterm/theme init
|
||||
|
||||
### 4. Convert logo PNG back to RGB
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `333c44f` `Fix GRUB splash: convert bee-logo.png from RGBA to RGB`
|
||||
|
||||
Intended reason:
|
||||
|
||||
- GRUB might dislike RGBA PNG and want RGB PNG
|
||||
|
||||
Observed result:
|
||||
|
||||
- error still persisted according to later project notes
|
||||
|
||||
### 5. Add post-build canonical GRUB/isolinux sync
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `0cdfbc5` `fix(iso): restore boot UX and boot logs`
|
||||
|
||||
What this introduced:
|
||||
|
||||
- post-`lb build` rewriting of `binary/boot/grub/grub.cfg`
|
||||
- post-`lb build` rewriting of `binary/isolinux/live.cfg`
|
||||
- forced rebuild of `binary_checksums`, `binary_iso`, `binary_zsync`
|
||||
|
||||
Why it was added:
|
||||
|
||||
- restore canonical EASY-BEE boot UX after live-build wrote its own bootloader
|
||||
outputs
|
||||
- restore expected boot menu and logs
|
||||
|
||||
Important note:
|
||||
|
||||
- this commit did not directly solve the bitmap issue
|
||||
- it added a second layer of bootloader mutation after live-build
|
||||
|
||||
### 6. Switch from PNG to TGA
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `626763e` `Fix GRUB bitmap error: switch from PNG to TGA for splash logo`
|
||||
|
||||
Commit message says:
|
||||
|
||||
- GRUB PNG reader was considered fragile
|
||||
- switch to uncompressed 24-bit TGA
|
||||
- `config.cfg`: `insmod png` -> `insmod tga`
|
||||
- `theme.txt`: `bee-logo.png` -> `bee-logo.tga`
|
||||
|
||||
Observed result:
|
||||
|
||||
- this did not eliminate the problem in the current lineage
|
||||
- today the system still errors even after the entire image block was removed
|
||||
|
||||
Conclusion:
|
||||
|
||||
- switching PNG -> TGA was not a durable root-cause fix
|
||||
|
||||
### 7. Patch EFI image after build
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `4f20c92` `Make UEFI boot safe and remove GRUB logo`
|
||||
|
||||
What this introduced:
|
||||
|
||||
- `sync_efi_grub_theme_assets`
|
||||
- direct `mtools` patching of `efi.img`
|
||||
- copying `config.cfg`, `theme.cfg`, and `live-theme/*` into the EFI FAT image
|
||||
- removal of the logo image block from `theme.txt`
|
||||
|
||||
Why it was added:
|
||||
|
||||
- make UEFI path "safe"
|
||||
- keep EFI GRUB image aligned with canonical bootloader assets
|
||||
|
||||
Observed result:
|
||||
|
||||
- later this became the direct cause of `Disk full` during build once
|
||||
`bee-logo.tga` was large enough
|
||||
- and even with the logo removed from `theme.txt`, the bitmap error still
|
||||
remained
|
||||
|
||||
Conclusion:
|
||||
|
||||
- EFI post-build patching increased build complexity
|
||||
- removing the logo alone did not remove the runtime GRUB error
|
||||
|
||||
### 8. Remove ASCII logo banners
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `14505ef` `Remove easy bee ASCII logo banners`
|
||||
|
||||
What changed:
|
||||
|
||||
- web loading page ASCII cleanup only
|
||||
|
||||
Relevance here:
|
||||
|
||||
- none for GRUB bitmap error
|
||||
- included here only to avoid confusion with other "logo removal" work
|
||||
|
||||
### 9. Remove EFI post-build patching
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `5dc022d` `Drop post-build EFI bootloader patching`
|
||||
|
||||
Why it was done:
|
||||
|
||||
- stop mutating `efi.img` post-build
|
||||
- remove dependence on `mtools` for EFI patching
|
||||
- remove the `Disk full` failure mode
|
||||
|
||||
Impact:
|
||||
|
||||
- this did not target the GRUB bitmap error directly
|
||||
- it targeted build-system complexity and EFI image overflow
|
||||
|
||||
### 10. Restore only GRUB/isolinux post-build sync
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `42774d4` `Restore post-build GRUB and isolinux sync`
|
||||
|
||||
Why it was needed:
|
||||
|
||||
- removing all post-build sync caused final ISO validation to fail with
|
||||
missing canonical EASY-BEE boot entries
|
||||
- memtest was still fine, but final GRUB menu was no longer canonical
|
||||
|
||||
What it restored:
|
||||
|
||||
- only `binary/boot/grub/grub.cfg`
|
||||
- only `binary/isolinux/live.cfg`
|
||||
|
||||
What it did not restore:
|
||||
|
||||
- no EFI FAT image patching
|
||||
- no `mtools` path
|
||||
|
||||
## What Is Proven False
|
||||
|
||||
The current evidence rules out several simplistic explanations:
|
||||
|
||||
- "the error is only caused by explicit image scaling"
|
||||
- "the error is only caused by PNG vs TGA"
|
||||
- "the error is only caused by the logo file itself"
|
||||
|
||||
Why:
|
||||
|
||||
- scaling dimensions were removed and error persisted
|
||||
- PNG was replaced with TGA and the error still persisted in the current lineage
|
||||
- the image block itself is now absent, and the error still occurs
|
||||
|
||||
## Working Hypotheses Left
|
||||
|
||||
The remaining plausible layers are:
|
||||
|
||||
- GRUB theme engine still tries to render some bitmap-related element even
|
||||
without the logo image block
|
||||
- GRUB is resolving stale theme assets from the built EFI/ISO path rather than
|
||||
what we think the source tree says
|
||||
- `theme.cfg` / `theme.txt` / GRUB module loading order still triggers a bitmap
|
||||
code path elsewhere
|
||||
- live-build may still package a stale `theme.txt` or stale `live-theme`
|
||||
directory into the final image
|
||||
- the GRUB environment on the failing hardware may behave differently from the
|
||||
assumptions in our source tree
|
||||
|
||||
## Decision Boundary
|
||||
|
||||
Before making another change, the next step should be evidence gathering from
|
||||
the real built artifact, not another speculative edit.
|
||||
|
||||
That means checking on the actual built ISO or EFI image:
|
||||
|
||||
- exact `boot/grub/theme.cfg`
|
||||
- exact `boot/grub/live-theme/theme.txt`
|
||||
- exact contents of `boot/grub/live-theme/`
|
||||
- whether the final image still contains a stale logo reference
|
||||
- whether the EFI path and non-EFI path differ
|
||||
|
||||
## Relevant Commits
|
||||
|
||||
- `d52ec67` `Stability hardening, build script fixes, GRUB bee logo`
|
||||
- `aa284ae` `fix(iso): avoid grub logo scaling error`
|
||||
- `6112094` `fix(grub): fix bitmap error and menu rendering`
|
||||
- `333c44f` `Fix GRUB splash: convert bee-logo.png from RGBA to RGB`
|
||||
- `0cdfbc5` `fix(iso): restore boot UX and boot logs`
|
||||
- `626763e` `Fix GRUB bitmap error: switch from PNG to TGA for splash logo`
|
||||
- `4f20c92` `Make UEFI boot safe and remove GRUB logo`
|
||||
- `5dc022d` `Drop post-build EFI bootloader patching`
|
||||
- `42774d4` `Restore post-build GRUB and isolinux sync`
|
||||
Submodule internal/chart updated: 2a15bc87f1...8105c7ec08
@@ -9,7 +9,7 @@ NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.1.1.3-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=4.5.3-1
|
||||
DCGM_VERSION=4.6.0-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
ROCM_VERSION=6.3.4
|
||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# AMD GPU firmware
|
||||
firmware-amd-graphics
|
||||
nvtop
|
||||
|
||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||
# explicitly.
|
||||
nvtop
|
||||
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||
|
||||
@@ -47,7 +47,6 @@ less
|
||||
vim-tiny
|
||||
mc
|
||||
htop
|
||||
nvtop
|
||||
sudo
|
||||
zstd
|
||||
mstflint
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
[Unit]
|
||||
# bee-nvidia.service loads the NVIDIA kernel modules; fabricmanager must wait
|
||||
# for them to be fully initialized before attempting to open /dev/nvidiactl.
|
||||
After=bee-nvidia.service
|
||||
|
||||
[Service]
|
||||
# Skip fabricmanager on systems without NVSwitch hardware.
|
||||
# ExecCondition exits 1-254 → unit is silently skipped (inactive, not failed).
|
||||
|
||||
Reference in New Issue
Block a user