A (hardware-ingest-json v2.8-2.9): remove sensor location fields from schema and collector; tag HardwareMemory.Location as json:"-"; add PlatformConfig to HardwareSnapshot. B (no-hardcoded-vendors): consolidate PCI vendor IDs into collector/pci_vendors.go; replace all vendor-name string checks in isGPUDevice, isNVIDIADevice, isMellanoxDevice, isAMDGPUDevice, matchesGPUVendor (sat_overlay), and validateIsVendorGPU (page_validate) with numeric vendor_id comparisons. C (module-structure): split app/app.go (1413 lines) into app.go + app_format.go, app_network.go, app_services.go, app_packs.go, app_install.go — no logic changes. D (go-code-style): wrap bare return err in interfaceAdminState and interfaceIPv4Addrs (platform/network.go) with fmt.Errorf context including the interface name. E (go-project-bible): add bible-local/architecture/data-model.md and bible-local/architecture/api-surface.md. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
406 lines
9.5 KiB
Go
406 lines
9.5 KiB
Go
package app
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
|
|
"bee/audit/internal/collector"
|
|
"bee/audit/internal/platform"
|
|
"bee/audit/internal/schema"
|
|
)
|
|
|
|
// hostnameOr returns the hostname reported by the operating system.
// When the lookup fails, or the reported name is empty or whitespace-only,
// fallback is returned instead.
func hostnameOr(fallback string) string {
	name, err := os.Hostname()
	if err == nil && strings.TrimSpace(name) != "" {
		// The name is returned exactly as reported (not trimmed).
		return name
	}
	return fallback
}
|
|
|
|
// sanitizeFilename maps v onto a conservative filename alphabet.
// ASCII letters, ASCII digits, '-', '_' and '.' are kept; every other rune
// is replaced by a single '-'. An empty input yields "unknown".
func sanitizeFilename(v string) string {
	if v == "" {
		return "unknown"
	}
	// allowed reports whether r may appear in the result unchanged.
	allowed := func(r rune) bool {
		if r == '-' || r == '_' || r == '.' {
			return true
		}
		return ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') || ('0' <= r && r <= '9')
	}
	runes := []rune(v)
	for i, r := range runes {
		if !allowed(r) {
			runes[i] = '-'
		}
	}
	return string(runes)
}
|
|
|
|
// bodyOr returns body with surrounding whitespace removed, or fallback
// when nothing remains after trimming.
func bodyOr(body, fallback string) string {
	if trimmed := strings.TrimSpace(body); trimmed != "" {
		return trimmed
	}
	return fallback
}
|
|
|
|
// trimPtr dereferences value and strips surrounding whitespace.
// A nil pointer is treated as the empty string.
func trimPtr(value *string) string {
	if value != nil {
		return strings.TrimSpace(*value)
	}
	return ""
}
|
|
|
|
// joinSortedKeys returns the keys of values in ascending order, joined
// with "/". A nil or empty set yields the empty string.
func joinSortedKeys(values map[string]struct{}) string {
	names := make([]string, len(values))
	next := 0
	for name := range values {
		names[next] = name
		next++
	}
	// Map iteration order is random; sort for deterministic output.
	sort.Strings(names)
	return strings.Join(names, "/")
}
|
|
|
|
// humanizeMB renders a size given in mebibytes as a short human-readable
// string. Non-positive sizes yield "". Sizes of 1024 GB and above are shown
// in TB with one decimal; smaller sizes are shown in GB, without a decimal
// when the value is a whole number of GB and with one decimal otherwise.
func humanizeMB(totalMB int) string {
	if totalMB <= 0 {
		return ""
	}
	gb := float64(totalMB) / 1024.0
	switch {
	case gb >= 1024.0:
		return fmt.Sprintf("%.1f TB", gb/1024.0)
	case gb == float64(int64(gb)):
		// Whole number of GB: omit the fractional part.
		return fmt.Sprintf("%.0f GB", gb)
	default:
		return fmt.Sprintf("%.1f GB", gb)
	}
}
|
|
|
|
// humanizeGB renders a size given in GB as a short human-readable string.
// Non-positive sizes yield "". Values below 1024 are printed as whole GB;
// values of 1024 and above are printed in TB with one decimal.
func humanizeGB(totalGB int) string {
	switch {
	case totalGB <= 0:
		return ""
	case totalGB < 1024:
		return fmt.Sprintf("%d GB", totalGB)
	default:
		return fmt.Sprintf("%.1f TB", float64(totalGB)/1024.0)
	}
}
|
|
|
|
// parseKeyValueSummary parses newline-separated "key=value" text into a map.
// Each line is split at its first '='; key and value are whitespace-trimmed.
// Lines without '=' (including blank lines) are ignored, and a later
// occurrence of a key overwrites an earlier one.
func parseKeyValueSummary(raw string) map[string]string {
	pairs := make(map[string]string)
	for _, entry := range strings.Split(raw, "\n") {
		eq := strings.IndexByte(entry, '=')
		if eq < 0 {
			continue
		}
		pairs[strings.TrimSpace(entry[:eq])] = strings.TrimSpace(entry[eq+1:])
	}
	return pairs
}
|
|
|
|
// firstNonEmpty returns the first argument that is non-empty after
// whitespace trimming, in its trimmed form. It returns "" when every
// argument is blank or no arguments are given.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
|
|
|
|
// cleanSummaryKey strips a leading "<digits>-" prefix from key, so that
// "01-foo" becomes "foo". The key is returned unchanged when it has no '-',
// starts with '-', or has any non-digit before the first '-'.
func cleanSummaryKey(key string) string {
	prefix, rest, found := strings.Cut(key, "-")
	if !found || prefix == "" {
		return key
	}
	for i := 0; i < len(prefix); i++ {
		if b := prefix[i]; b < '0' || b > '9' {
			return key
		}
	}
	return rest
}
|
|
|
|
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
|
// Exclude Aspeed BMC VGA adapters (not compute GPUs).
|
|
if dev.VendorID != nil && *dev.VendorID == collector.AspeedVendorID {
|
|
return false
|
|
}
|
|
class := trimPtr(dev.DeviceClass)
|
|
// AMD Instinct / Radeon compute GPUs always carry ProcessingAccelerator or DisplayController.
|
|
// Do NOT match AMD vendor alone — CPU chipset PCIe devices share that vendor ID.
|
|
if class == "VideoController" || class == "DisplayController" || class == "ProcessingAccelerator" {
|
|
return true
|
|
}
|
|
// NVIDIA devices sometimes expose class values outside the standard GPU set.
|
|
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
|
}
|
|
|
|
func formatSystemLine(board schema.HardwareBoard) string {
|
|
model := strings.TrimSpace(strings.Join([]string{
|
|
trimPtr(board.Manufacturer),
|
|
trimPtr(board.ProductName),
|
|
}, " "))
|
|
serial := strings.TrimSpace(board.SerialNumber)
|
|
switch {
|
|
case model != "" && serial != "":
|
|
return fmt.Sprintf("System: %s | S/N %s", model, serial)
|
|
case model != "":
|
|
return "System: " + model
|
|
case serial != "":
|
|
return "System S/N: " + serial
|
|
default:
|
|
return ""
|
|
}
|
|
}
|
|
|
|
func formatCPULine(cpus []schema.HardwareCPU) string {
|
|
if len(cpus) == 0 {
|
|
return ""
|
|
}
|
|
modelCounts := map[string]int{}
|
|
unknown := 0
|
|
for _, cpu := range cpus {
|
|
model := trimPtr(cpu.Model)
|
|
if model == "" {
|
|
unknown++
|
|
continue
|
|
}
|
|
modelCounts[model]++
|
|
}
|
|
if len(modelCounts) == 1 && unknown == 0 {
|
|
for model, count := range modelCounts {
|
|
return fmt.Sprintf("CPU: %d x %s", count, model)
|
|
}
|
|
}
|
|
parts := make([]string, 0, len(modelCounts)+1)
|
|
if len(modelCounts) > 0 {
|
|
keys := make([]string, 0, len(modelCounts))
|
|
for key := range modelCounts {
|
|
keys = append(keys, key)
|
|
}
|
|
sort.Strings(keys)
|
|
for _, key := range keys {
|
|
parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key))
|
|
}
|
|
}
|
|
if unknown > 0 {
|
|
parts = append(parts, fmt.Sprintf("%d x unknown", unknown))
|
|
}
|
|
return "CPU: " + strings.Join(parts, ", ")
|
|
}
|
|
|
|
func formatMemoryLine(dimms []schema.HardwareMemory) string {
|
|
totalMB := 0
|
|
present := 0
|
|
types := map[string]struct{}{}
|
|
for _, dimm := range dimms {
|
|
if dimm.Present != nil && !*dimm.Present {
|
|
continue
|
|
}
|
|
if dimm.SizeMB == nil || *dimm.SizeMB <= 0 {
|
|
continue
|
|
}
|
|
present++
|
|
totalMB += *dimm.SizeMB
|
|
if value := trimPtr(dimm.Type); value != "" {
|
|
types[value] = struct{}{}
|
|
}
|
|
}
|
|
if totalMB == 0 {
|
|
return ""
|
|
}
|
|
typeText := joinSortedKeys(types)
|
|
line := fmt.Sprintf("Memory: %s", humanizeMB(totalMB))
|
|
if typeText != "" {
|
|
line += " " + typeText
|
|
}
|
|
if present > 0 {
|
|
line += fmt.Sprintf(" (%d DIMMs)", present)
|
|
}
|
|
return line
|
|
}
|
|
|
|
func formatStorageLine(disks []schema.HardwareStorage) string {
|
|
count := 0
|
|
totalGB := 0
|
|
for _, disk := range disks {
|
|
if disk.Present != nil && !*disk.Present {
|
|
continue
|
|
}
|
|
count++
|
|
if disk.SizeGB != nil && *disk.SizeGB > 0 {
|
|
totalGB += *disk.SizeGB
|
|
}
|
|
}
|
|
if count == 0 {
|
|
return ""
|
|
}
|
|
line := fmt.Sprintf("Storage: %d drives", count)
|
|
if totalGB > 0 {
|
|
line += fmt.Sprintf(" / %s", humanizeGB(totalGB))
|
|
}
|
|
return line
|
|
}
|
|
|
|
func formatGPULine(devices []schema.HardwarePCIeDevice) string {
|
|
gpus := map[string]int{}
|
|
for _, dev := range devices {
|
|
if !isGPUDevice(dev) {
|
|
continue
|
|
}
|
|
name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown")
|
|
gpus[name]++
|
|
}
|
|
if len(gpus) == 0 {
|
|
return ""
|
|
}
|
|
keys := make([]string, 0, len(gpus))
|
|
for key := range gpus {
|
|
keys = append(keys, key)
|
|
}
|
|
sort.Strings(keys)
|
|
parts := make([]string, 0, len(keys))
|
|
for _, key := range keys {
|
|
parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key))
|
|
}
|
|
return "GPU: " + strings.Join(parts, ", ")
|
|
}
|
|
|
|
func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
|
|
if list == nil {
|
|
return ""
|
|
}
|
|
ifaces, err := list()
|
|
if err != nil {
|
|
return ""
|
|
}
|
|
seen := map[string]struct{}{}
|
|
var ips []string
|
|
for _, iface := range ifaces {
|
|
for _, ip := range iface.IPv4 {
|
|
ip = strings.TrimSpace(ip)
|
|
if ip == "" {
|
|
continue
|
|
}
|
|
if _, ok := seen[ip]; ok {
|
|
continue
|
|
}
|
|
seen[ip] = struct{}{}
|
|
ips = append(ips, ip)
|
|
}
|
|
}
|
|
if len(ips) == 0 {
|
|
return ""
|
|
}
|
|
sort.Strings(ips)
|
|
return "IP: " + strings.Join(ips, ", ")
|
|
}
|
|
|
|
func formatSATDetail(raw string) string {
|
|
var b strings.Builder
|
|
kv := parseKeyValueSummary(raw)
|
|
|
|
if t, ok := kv["run_at_utc"]; ok {
|
|
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
|
}
|
|
|
|
lines := strings.Split(raw, "\n")
|
|
var stepKeys []string
|
|
seenStep := map[string]bool{}
|
|
for _, line := range lines {
|
|
if idx := strings.Index(line, "_status="); idx >= 0 {
|
|
key := line[:idx]
|
|
if !seenStep[key] && key != "overall" {
|
|
seenStep[key] = true
|
|
stepKeys = append(stepKeys, key)
|
|
}
|
|
}
|
|
}
|
|
|
|
for _, key := range stepKeys {
|
|
status := kv[key+"_status"]
|
|
display := cleanSummaryKey(key)
|
|
switch status {
|
|
case "OK":
|
|
fmt.Fprintf(&b, "PASS %s\n", display)
|
|
case "FAILED":
|
|
fmt.Fprintf(&b, "FAIL %s\n", display)
|
|
case "UNSUPPORTED":
|
|
fmt.Fprintf(&b, "SKIP %s\n", display)
|
|
default:
|
|
fmt.Fprintf(&b, "? %s\n", display)
|
|
}
|
|
}
|
|
|
|
if overall, ok := kv["overall_status"]; ok {
|
|
ok2 := kv["job_ok"]
|
|
failed := kv["job_failed"]
|
|
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
|
}
|
|
|
|
return strings.TrimSpace(b.String())
|
|
}
|
|
|
|
func formatSATSummary(label, raw string) string {
|
|
values := parseKeyValueSummary(raw)
|
|
var body strings.Builder
|
|
fmt.Fprintf(&body, "%s:", label)
|
|
if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" {
|
|
fmt.Fprintf(&body, " %s", overall)
|
|
}
|
|
if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" {
|
|
fmt.Fprintf(&body, " ok=%s", ok)
|
|
}
|
|
if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" {
|
|
fmt.Fprintf(&body, " failed=%s", failed)
|
|
}
|
|
if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" {
|
|
fmt.Fprintf(&body, " unsupported=%s", unsupported)
|
|
}
|
|
if devices := strings.TrimSpace(values["devices"]); devices != "" {
|
|
fmt.Fprintf(&body, "\nDevices: %s", devices)
|
|
}
|
|
return body.String()
|
|
}
|
|
|
|
func latestSATSummaries() []string {
|
|
patterns := []struct {
|
|
label string
|
|
prefix string
|
|
}{
|
|
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
|
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
|
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
|
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
|
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
|
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
|
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
|
{label: "Memory SAT", prefix: "memory-"},
|
|
{label: "Storage SAT", prefix: "storage-"},
|
|
{label: "CPU SAT", prefix: "cpu-"},
|
|
}
|
|
var out []string
|
|
for _, item := range patterns {
|
|
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
|
|
if err != nil || len(matches) == 0 {
|
|
continue
|
|
}
|
|
sort.Strings(matches)
|
|
raw, err := os.ReadFile(matches[len(matches)-1])
|
|
if err != nil {
|
|
continue
|
|
}
|
|
out = append(out, formatSATSummary(item.label, string(raw)))
|
|
}
|
|
return out
|
|
}
|