Bring codebase into compliance with bible contracts (A–E)
A (hardware-ingest-json v2.8-2.9): remove sensor location fields from schema and collector; tag HardwareMemory.Location as json:"-"; add PlatformConfig to HardwareSnapshot. B (no-hardcoded-vendors): consolidate PCI vendor IDs into collector/pci_vendors.go; replace all vendor-name string checks in isGPUDevice, isNVIDIADevice, isMellanoxDevice, isAMDGPUDevice, matchesGPUVendor (sat_overlay), and validateIsVendorGPU (page_validate) with numeric vendor_id comparisons. C (module-structure): split app/app.go (1413 lines) into app.go + app_format.go, app_network.go, app_services.go, app_packs.go, app_install.go — no logic changes. D (go-code-style): wrap bare return err in interfaceAdminState and interfaceIPv4Addrs (platform/network.go) with fmt.Errorf context including the interface name. E (go-project-bible): add bible-local/architecture/data-model.md and bible-local/architecture/api-surface.md. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because it is too large
Load Diff
405
audit/internal/app/app_format.go
Normal file
405
audit/internal/app/app_format.go
Normal file
@@ -0,0 +1,405 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/collector"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// hostnameOr returns the hostname reported by the OS, or fallback when the
// lookup fails or the reported name is blank.
func hostnameOr(fallback string) string {
	name, err := os.Hostname()
	if err != nil {
		return fallback
	}
	if strings.TrimSpace(name) == "" {
		return fallback
	}
	return name
}
|
||||||
|
|
||||||
|
// sanitizeFilename maps every rune outside [A-Za-z0-9._-] to '-' and returns
// "unknown" when the input is empty.
func sanitizeFilename(v string) string {
	allowed := func(r rune) bool {
		switch {
		case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
			return true
		}
		return r == '-' || r == '_' || r == '.'
	}
	cleaned := strings.Map(func(r rune) rune {
		if allowed(r) {
			return r
		}
		return '-'
	}, v)
	if cleaned == "" {
		return "unknown"
	}
	return cleaned
}
|
||||||
|
|
||||||
|
// bodyOr returns body with surrounding whitespace removed, or fallback when
// nothing is left after trimming.
func bodyOr(body, fallback string) string {
	if trimmed := strings.TrimSpace(body); trimmed != "" {
		return trimmed
	}
	return fallback
}
|
||||||
|
|
||||||
|
// trimPtr dereferences value and trims surrounding whitespace; a nil pointer
// yields the empty string.
func trimPtr(value *string) string {
	if value != nil {
		return strings.TrimSpace(*value)
	}
	return ""
}
|
||||||
|
|
||||||
|
// joinSortedKeys returns the keys of values in ascending order joined by "/".
// An empty or nil set produces the empty string.
func joinSortedKeys(values map[string]struct{}) string {
	names := make([]string, 0, len(values))
	for name := range values {
		names = append(names, name)
	}
	sort.Strings(names)
	// Joining zero elements already yields "", so no separate empty check is needed.
	return strings.Join(names, "/")
}
|
||||||
|
|
||||||
|
// humanizeMB renders a size given in MB as a short label using 1024-based
// units: "N.N TB" from 1024 GB upward, "N GB" for whole gigabytes and
// "N.N GB" otherwise. Non-positive sizes produce the empty string.
func humanizeMB(totalMB int) string {
	if totalMB <= 0 {
		return ""
	}
	gb := float64(totalMB) / 1024.0
	switch {
	case gb >= 1024.0:
		return fmt.Sprintf("%.1f TB", gb/1024.0)
	case totalMB%1024 == 0:
		// Below the TB threshold the division is exact, so a whole number of
		// gigabytes is equivalent to totalMB being a multiple of 1024.
		return fmt.Sprintf("%.0f GB", gb)
	default:
		return fmt.Sprintf("%.1f GB", gb)
	}
}
|
||||||
|
|
||||||
|
// humanizeGB renders a size given in GB as "N GB" below 1024 GB and as
// "N.N TB" from 1024 GB upward. Non-positive sizes produce the empty string.
func humanizeGB(totalGB int) string {
	switch {
	case totalGB <= 0:
		return ""
	case totalGB < 1024:
		return fmt.Sprintf("%d GB", totalGB)
	default:
		return fmt.Sprintf("%.1f TB", float64(totalGB)/1024.0)
	}
}
|
||||||
|
|
||||||
|
// parseKeyValueSummary parses newline-separated "key=value" text into a map.
// Keys and values are whitespace-trimmed, lines without "=" are ignored, and
// only the first "=" on a line separates key from value. A later duplicate
// key overwrites an earlier one.
func parseKeyValueSummary(raw string) map[string]string {
	pairs := make(map[string]string)
	for _, entry := range strings.Split(raw, "\n") {
		// Blank lines contain no "=" and are skipped by the same check.
		name, val, found := strings.Cut(strings.TrimSpace(entry), "=")
		if !found {
			continue
		}
		pairs[strings.TrimSpace(name)] = strings.TrimSpace(val)
	}
	return pairs
}
|
||||||
|
|
||||||
|
// firstNonEmpty returns the first argument that is non-blank after trimming,
// in its trimmed form, or "" when every argument is blank.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
|
||||||
|
|
||||||
|
// cleanSummaryKey strips a leading all-digit prefix and its "-" separator
// from key (for example "01-cpu" becomes "cpu"). Keys without such a prefix
// are returned unchanged.
func cleanSummaryKey(key string) string {
	prefix, rest, found := strings.Cut(key, "-")
	if !found || prefix == "" {
		return key
	}
	for i := 0; i < len(prefix); i++ {
		if prefix[i] < '0' || prefix[i] > '9' {
			return key
		}
	}
	return rest
}
|
||||||
|
|
||||||
|
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||||
|
// Exclude Aspeed BMC VGA adapters (not compute GPUs).
|
||||||
|
if dev.VendorID != nil && *dev.VendorID == collector.AspeedVendorID {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
class := trimPtr(dev.DeviceClass)
|
||||||
|
// AMD Instinct / Radeon compute GPUs always carry ProcessingAccelerator or DisplayController.
|
||||||
|
// Do NOT match AMD vendor alone — CPU chipset PCIe devices share that vendor ID.
|
||||||
|
if class == "VideoController" || class == "DisplayController" || class == "ProcessingAccelerator" {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
// NVIDIA devices sometimes expose class values outside the standard GPU set.
|
||||||
|
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatSystemLine(board schema.HardwareBoard) string {
|
||||||
|
model := strings.TrimSpace(strings.Join([]string{
|
||||||
|
trimPtr(board.Manufacturer),
|
||||||
|
trimPtr(board.ProductName),
|
||||||
|
}, " "))
|
||||||
|
serial := strings.TrimSpace(board.SerialNumber)
|
||||||
|
switch {
|
||||||
|
case model != "" && serial != "":
|
||||||
|
return fmt.Sprintf("System: %s | S/N %s", model, serial)
|
||||||
|
case model != "":
|
||||||
|
return "System: " + model
|
||||||
|
case serial != "":
|
||||||
|
return "System S/N: " + serial
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatCPULine(cpus []schema.HardwareCPU) string {
|
||||||
|
if len(cpus) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
modelCounts := map[string]int{}
|
||||||
|
unknown := 0
|
||||||
|
for _, cpu := range cpus {
|
||||||
|
model := trimPtr(cpu.Model)
|
||||||
|
if model == "" {
|
||||||
|
unknown++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
modelCounts[model]++
|
||||||
|
}
|
||||||
|
if len(modelCounts) == 1 && unknown == 0 {
|
||||||
|
for model, count := range modelCounts {
|
||||||
|
return fmt.Sprintf("CPU: %d x %s", count, model)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
parts := make([]string, 0, len(modelCounts)+1)
|
||||||
|
if len(modelCounts) > 0 {
|
||||||
|
keys := make([]string, 0, len(modelCounts))
|
||||||
|
for key := range modelCounts {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
for _, key := range keys {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if unknown > 0 {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d x unknown", unknown))
|
||||||
|
}
|
||||||
|
return "CPU: " + strings.Join(parts, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatMemoryLine(dimms []schema.HardwareMemory) string {
|
||||||
|
totalMB := 0
|
||||||
|
present := 0
|
||||||
|
types := map[string]struct{}{}
|
||||||
|
for _, dimm := range dimms {
|
||||||
|
if dimm.Present != nil && !*dimm.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if dimm.SizeMB == nil || *dimm.SizeMB <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
present++
|
||||||
|
totalMB += *dimm.SizeMB
|
||||||
|
if value := trimPtr(dimm.Type); value != "" {
|
||||||
|
types[value] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if totalMB == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
typeText := joinSortedKeys(types)
|
||||||
|
line := fmt.Sprintf("Memory: %s", humanizeMB(totalMB))
|
||||||
|
if typeText != "" {
|
||||||
|
line += " " + typeText
|
||||||
|
}
|
||||||
|
if present > 0 {
|
||||||
|
line += fmt.Sprintf(" (%d DIMMs)", present)
|
||||||
|
}
|
||||||
|
return line
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatStorageLine(disks []schema.HardwareStorage) string {
|
||||||
|
count := 0
|
||||||
|
totalGB := 0
|
||||||
|
for _, disk := range disks {
|
||||||
|
if disk.Present != nil && !*disk.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
count++
|
||||||
|
if disk.SizeGB != nil && *disk.SizeGB > 0 {
|
||||||
|
totalGB += *disk.SizeGB
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if count == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
line := fmt.Sprintf("Storage: %d drives", count)
|
||||||
|
if totalGB > 0 {
|
||||||
|
line += fmt.Sprintf(" / %s", humanizeGB(totalGB))
|
||||||
|
}
|
||||||
|
return line
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatGPULine(devices []schema.HardwarePCIeDevice) string {
|
||||||
|
gpus := map[string]int{}
|
||||||
|
for _, dev := range devices {
|
||||||
|
if !isGPUDevice(dev) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown")
|
||||||
|
gpus[name]++
|
||||||
|
}
|
||||||
|
if len(gpus) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
keys := make([]string, 0, len(gpus))
|
||||||
|
for key := range gpus {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
parts := make([]string, 0, len(keys))
|
||||||
|
for _, key := range keys {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key))
|
||||||
|
}
|
||||||
|
return "GPU: " + strings.Join(parts, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
|
||||||
|
if list == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
ifaces, err := list()
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
var ips []string
|
||||||
|
for _, iface := range ifaces {
|
||||||
|
for _, ip := range iface.IPv4 {
|
||||||
|
ip = strings.TrimSpace(ip)
|
||||||
|
if ip == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := seen[ip]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[ip] = struct{}{}
|
||||||
|
ips = append(ips, ip)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(ips) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
sort.Strings(ips)
|
||||||
|
return "IP: " + strings.Join(ips, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatSATDetail(raw string) string {
|
||||||
|
var b strings.Builder
|
||||||
|
kv := parseKeyValueSummary(raw)
|
||||||
|
|
||||||
|
if t, ok := kv["run_at_utc"]; ok {
|
||||||
|
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
||||||
|
}
|
||||||
|
|
||||||
|
lines := strings.Split(raw, "\n")
|
||||||
|
var stepKeys []string
|
||||||
|
seenStep := map[string]bool{}
|
||||||
|
for _, line := range lines {
|
||||||
|
if idx := strings.Index(line, "_status="); idx >= 0 {
|
||||||
|
key := line[:idx]
|
||||||
|
if !seenStep[key] && key != "overall" {
|
||||||
|
seenStep[key] = true
|
||||||
|
stepKeys = append(stepKeys, key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, key := range stepKeys {
|
||||||
|
status := kv[key+"_status"]
|
||||||
|
display := cleanSummaryKey(key)
|
||||||
|
switch status {
|
||||||
|
case "OK":
|
||||||
|
fmt.Fprintf(&b, "PASS %s\n", display)
|
||||||
|
case "FAILED":
|
||||||
|
fmt.Fprintf(&b, "FAIL %s\n", display)
|
||||||
|
case "UNSUPPORTED":
|
||||||
|
fmt.Fprintf(&b, "SKIP %s\n", display)
|
||||||
|
default:
|
||||||
|
fmt.Fprintf(&b, "? %s\n", display)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if overall, ok := kv["overall_status"]; ok {
|
||||||
|
ok2 := kv["job_ok"]
|
||||||
|
failed := kv["job_failed"]
|
||||||
|
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
||||||
|
}
|
||||||
|
|
||||||
|
return strings.TrimSpace(b.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatSATSummary(label, raw string) string {
|
||||||
|
values := parseKeyValueSummary(raw)
|
||||||
|
var body strings.Builder
|
||||||
|
fmt.Fprintf(&body, "%s:", label)
|
||||||
|
if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" {
|
||||||
|
fmt.Fprintf(&body, " %s", overall)
|
||||||
|
}
|
||||||
|
if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" {
|
||||||
|
fmt.Fprintf(&body, " ok=%s", ok)
|
||||||
|
}
|
||||||
|
if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" {
|
||||||
|
fmt.Fprintf(&body, " failed=%s", failed)
|
||||||
|
}
|
||||||
|
if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" {
|
||||||
|
fmt.Fprintf(&body, " unsupported=%s", unsupported)
|
||||||
|
}
|
||||||
|
if devices := strings.TrimSpace(values["devices"]); devices != "" {
|
||||||
|
fmt.Fprintf(&body, "\nDevices: %s", devices)
|
||||||
|
}
|
||||||
|
return body.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func latestSATSummaries() []string {
|
||||||
|
patterns := []struct {
|
||||||
|
label string
|
||||||
|
prefix string
|
||||||
|
}{
|
||||||
|
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||||
|
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||||
|
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||||
|
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||||
|
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||||
|
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||||
|
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||||
|
{label: "Memory SAT", prefix: "memory-"},
|
||||||
|
{label: "Storage SAT", prefix: "storage-"},
|
||||||
|
{label: "CPU SAT", prefix: "cpu-"},
|
||||||
|
}
|
||||||
|
var out []string
|
||||||
|
for _, item := range patterns {
|
||||||
|
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
|
||||||
|
if err != nil || len(matches) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sort.Strings(matches)
|
||||||
|
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, formatSATSummary(item.label, string(raw)))
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
76
audit/internal/app/app_install.go
Normal file
76
audit/internal/app/app_install.go
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (a *App) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||||
|
return a.exports.ListRemovableTargets()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error) {
|
||||||
|
if _, err := os.Stat(DefaultAuditJSONPath); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||||
|
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||||
|
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||||
|
data = normalized
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer os.Remove(tmpPath)
|
||||||
|
return a.exports.ExportFileToTarget(tmpPath, target)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||||
|
path, err := a.ExportLatestAudit(target)
|
||||||
|
body := "Audit export failed."
|
||||||
|
if err == nil {
|
||||||
|
body = "Audit exported."
|
||||||
|
}
|
||||||
|
if err == nil && path != "" {
|
||||||
|
body = "Audit exported to " + path
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "Export audit", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, error) {
|
||||||
|
archive, err := BuildSupportBundle(DefaultExportDir)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer os.Remove(archive)
|
||||||
|
return a.exports.ExportFileToTarget(archive, target)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||||
|
path, err := a.ExportSupportBundle(target)
|
||||||
|
body := "Support bundle export failed."
|
||||||
|
if err == nil {
|
||||||
|
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||||
|
}
|
||||||
|
if err == nil && path != "" {
|
||||||
|
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
|
||||||
|
return a.installer.ListInstallDisks()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||||
|
return a.installer.InstallToDisk(ctx, device, logFile)
|
||||||
|
}
|
||||||
106
audit/internal/app/app_network.go
Normal file
106
audit/internal/app/app_network.go
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) {
|
||||||
|
return a.network.ListInterfaces()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DefaultRoute() string {
|
||||||
|
return a.network.DefaultRoute()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DHCPOne(iface string) (string, error) {
|
||||||
|
return a.network.DHCPOne(iface)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DHCPOneResult(iface string) (ActionResult, error) {
|
||||||
|
body, err := a.network.DHCPOne(iface)
|
||||||
|
return ActionResult{Title: "DHCP: " + iface, Body: bodyOr(body, "DHCP completed.")}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DHCPAll() (string, error) {
|
||||||
|
return a.network.DHCPAll()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DHCPAllResult() (ActionResult, error) {
|
||||||
|
body, err := a.network.DHCPAll()
|
||||||
|
return ActionResult{Title: "DHCP: all interfaces", Body: bodyOr(body, "DHCP completed.")}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
||||||
|
return a.network.SetStaticIPv4(cfg)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||||
|
return a.network.SetInterfaceState(iface, up)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||||
|
return a.network.GetInterfaceState(iface)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||||
|
return a.network.CaptureNetworkSnapshot()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||||
|
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||||
|
body, err := a.network.SetStaticIPv4(cfg)
|
||||||
|
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) NetworkStatus() (ActionResult, error) {
|
||||||
|
ifaces, err := a.network.ListInterfaces()
|
||||||
|
if err != nil {
|
||||||
|
return ActionResult{Title: "Network status"}, err
|
||||||
|
}
|
||||||
|
if len(ifaces) == 0 {
|
||||||
|
return ActionResult{Title: "Network status", Body: "No physical interfaces found."}, nil
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
for _, iface := range ifaces {
|
||||||
|
ipv4 := "(no IPv4)"
|
||||||
|
if len(iface.IPv4) > 0 {
|
||||||
|
ipv4 = strings.Join(iface.IPv4, ", ")
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&body, "- %s: state=%s ip=%s\n", iface.Name, iface.State, ipv4)
|
||||||
|
}
|
||||||
|
if gw := a.network.DefaultRoute(); gw != "" {
|
||||||
|
fmt.Fprintf(&body, "\nDefault route: %s\n", gw)
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "Network status", Body: strings.TrimSpace(body.String())}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DefaultStaticIPv4FormFields(iface string) []string {
|
||||||
|
return []string{
|
||||||
|
"",
|
||||||
|
"24",
|
||||||
|
strings.TrimSpace(a.network.DefaultRoute()),
|
||||||
|
"77.88.8.8 77.88.8.1 1.1.1.1 8.8.8.8",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ParseStaticIPv4Config(iface string, fields []string) platform.StaticIPv4Config {
|
||||||
|
get := func(index int) string {
|
||||||
|
if index >= 0 && index < len(fields) {
|
||||||
|
return strings.TrimSpace(fields[index])
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return platform.StaticIPv4Config{
|
||||||
|
Interface: iface,
|
||||||
|
Address: get(0),
|
||||||
|
Prefix: get(1),
|
||||||
|
Gateway: get(2),
|
||||||
|
DNS: strings.Fields(get(3)),
|
||||||
|
}
|
||||||
|
}
|
||||||
370
audit/internal/app/app_packs.go
Normal file
370
audit/internal/app/app_packs.go
Normal file
@@ -0,0 +1,370 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
|
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||||
|
body := "Archive written."
|
||||||
|
if path != "" {
|
||||||
|
body = "Archive written to " + path
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||||
|
return a.sat.ListNvidiaGPUs()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
return a.sat.ListNvidiaGPUStatuses()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||||
|
out, err := a.sat.ResetNvidiaGPU(index)
|
||||||
|
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||||
|
body := "Archive written."
|
||||||
|
if path != "" {
|
||||||
|
body = "Archive written to " + path
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchPerfDir
|
||||||
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchPowerDir
|
||||||
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
|
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchAutotuneDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||||
|
}
|
||||||
|
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||||
|
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||||
|
if err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
|
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||||
|
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||||
|
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||||
|
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
|
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||||
|
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DetectGPUVendor() string {
|
||||||
|
return a.sat.DetectGPUVendor()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||||
|
return a.sat.ListAMDGPUs()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
|
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||||
|
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||||
|
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||||
|
body := "Results: " + path
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
body += "\nERROR: " + err.Error()
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||||
|
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||||
|
body := formatFanStressResult(path)
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
body += "\nERROR: " + err.Error()
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||||
|
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||||
|
func formatFanStressResult(archivePath string) string {
|
||||||
|
if archivePath == "" {
|
||||||
|
return "No output produced."
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return "Archive written to " + archivePath
|
||||||
|
}
|
||||||
|
content := strings.TrimSpace(string(raw))
|
||||||
|
kv := parseKeyValueSummary(content)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(formatSATDetail(content))
|
||||||
|
|
||||||
|
// Append analysis section.
|
||||||
|
var analysis []string
|
||||||
|
if v, ok := kv["throttling_detected"]; ok {
|
||||||
|
label := "NO"
|
||||||
|
if v == "true" {
|
||||||
|
label = "YES ← throttling detected during load"
|
||||||
|
}
|
||||||
|
analysis = append(analysis, "Throttling: "+label)
|
||||||
|
}
|
||||||
|
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||||
|
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||||
|
}
|
||||||
|
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||||
|
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||||
|
}
|
||||||
|
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||||
|
analysis = append(analysis, "Fan response: "+v+"s")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(analysis) > 0 {
|
||||||
|
b.WriteString("\n\n=== Analysis ===\n")
|
||||||
|
for _, line := range analysis {
|
||||||
|
b.WriteString(line + "\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(b.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||||
|
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||||
|
func satResultBody(archivePath string) string {
|
||||||
|
if archivePath == "" {
|
||||||
|
return "No output produced."
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return "Archive written to " + archivePath
|
||||||
|
}
|
||||||
|
return formatSATDetail(strings.TrimSpace(string(raw)))
|
||||||
|
}
|
||||||
67
audit/internal/app/app_services.go
Normal file
67
audit/internal/app/app_services.go
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (a *App) ListBeeServices() ([]string, error) {
|
||||||
|
return a.services.ListBeeServices()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ServiceState(name string) string {
|
||||||
|
return a.services.ServiceState(name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ServiceStatus(name string) (string, error) {
|
||||||
|
return a.services.ServiceStatus(name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ServiceStatusResult(name string) (ActionResult, error) {
|
||||||
|
body, err := a.services.ServiceStatus(name)
|
||||||
|
return ActionResult{Title: "service status: " + name, Body: bodyOr(body, "No status output.")}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, error) {
|
||||||
|
return a.services.ServiceDo(name, action)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ServiceActionResult(name string, action platform.ServiceAction) (ActionResult, error) {
|
||||||
|
body, err := a.services.ServiceDo(name, action)
|
||||||
|
return ActionResult{Title: "service " + string(action) + ": " + name, Body: bodyOr(body, "Action completed.")}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) TailFile(path string, lines int) string {
|
||||||
|
return a.tools.TailFile(path, lines)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) CheckTools(names []string) []platform.ToolStatus {
|
||||||
|
return a.tools.CheckTools(names)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ToolCheckResult(names []string) ActionResult {
|
||||||
|
if len(names) == 0 {
|
||||||
|
return ActionResult{Title: "Required tools", Body: "No tools checked."}
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
for _, tool := range a.tools.CheckTools(names) {
|
||||||
|
status := "MISSING"
|
||||||
|
if tool.OK {
|
||||||
|
status = "OK (" + tool.Path + ")"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&body, "- %s: %s\n", tool.Name, status)
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "Required tools", Body: strings.TrimSpace(body.String())}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) AuditLogTailResult() ActionResult {
|
||||||
|
logTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditLogPath, 40))
|
||||||
|
jsonTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditJSONPath, 20))
|
||||||
|
body := strings.TrimSpace(logTail + "\n\n" + jsonTail)
|
||||||
|
if body == "" {
|
||||||
|
body = "No audit logs found."
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "Audit log tail", Body: body}
|
||||||
|
}
|
||||||
@@ -3,10 +3,11 @@ package app
|
|||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strconv"
|
|
||||||
"sort"
|
"sort"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/collector"
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -313,17 +314,20 @@ func statusSeverity(status string) int {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
if dev.DeviceClass == nil {
|
||||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
return false
|
||||||
return false
|
}
|
||||||
}
|
class := strings.TrimSpace(*dev.DeviceClass)
|
||||||
|
isGPUClass := strings.Contains(class, "Controller") || strings.Contains(class, "Accelerator") ||
|
||||||
|
strings.Contains(class, "Display") || strings.Contains(class, "Video")
|
||||||
|
if !isGPUClass {
|
||||||
|
return false
|
||||||
}
|
}
|
||||||
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
|
||||||
switch vendor {
|
switch vendor {
|
||||||
case "amd":
|
case "amd":
|
||||||
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
return dev.VendorID != nil && *dev.VendorID == collector.AMDVendorID
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
return strings.Contains(manufacturer, "nvidia")
|
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||||
default:
|
default:
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/collector"
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -46,10 +47,12 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
|||||||
|
|
||||||
class := "DisplayController"
|
class := "DisplayController"
|
||||||
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
||||||
|
amdVendorID := collector.AMDVendorID
|
||||||
snap := schema.HardwareSnapshot{
|
snap := schema.HardwareSnapshot{
|
||||||
PCIeDevices: []schema.HardwarePCIeDevice{{
|
PCIeDevices: []schema.HardwarePCIeDevice{{
|
||||||
DeviceClass: &class,
|
DeviceClass: &class,
|
||||||
Manufacturer: &manufacturer,
|
Manufacturer: &manufacturer,
|
||||||
|
VendorID: &amdVendorID,
|
||||||
}},
|
}},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -84,11 +84,10 @@ func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||||
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
if dev.DeviceClass == nil {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
return dev.VendorID != nil && *dev.VendorID == AMDVendorID && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||||
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
const mellanoxVendorID = 0x15b3
|
|
||||||
const nicProbeTimeout = 2 * time.Second
|
const nicProbeTimeout = 2 * time.Second
|
||||||
|
|
||||||
var (
|
var (
|
||||||
@@ -80,16 +79,7 @@ func enrichPCIeWithMellanox(devs []schema.HardwarePCIeDevice) []schema.HardwareP
|
|||||||
}
|
}
|
||||||
|
|
||||||
func isMellanoxDevice(dev schema.HardwarePCIeDevice) bool {
|
func isMellanoxDevice(dev schema.HardwarePCIeDevice) bool {
|
||||||
if dev.VendorID != nil && *dev.VendorID == mellanoxVendorID {
|
return dev.VendorID != nil && *dev.VendorID == MellanoxVendorID
|
||||||
return true
|
|
||||||
}
|
|
||||||
if dev.Manufacturer != nil {
|
|
||||||
m := strings.ToLower(*dev.Manufacturer)
|
|
||||||
if strings.Contains(m, "mellanox") || strings.Contains(m, "nvidia networking") {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func queryMellanoxFromMstflint(bdf string) (firmware, serial string) {
|
func queryMellanoxFromMstflint(bdf string) (firmware, serial string) {
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ func TestEnrichPCIeWithMellanox_mstflint(t *testing.T) {
|
|||||||
}
|
}
|
||||||
netIfacesByBDF = func(string) []string { return nil }
|
netIfacesByBDF = func(string) []string { return nil }
|
||||||
|
|
||||||
vendorID := mellanoxVendorID
|
vendorID := MellanoxVendorID
|
||||||
bdf := "0000:18:00.0"
|
bdf := "0000:18:00.0"
|
||||||
manufacturer := "Mellanox Technologies"
|
manufacturer := "Mellanox Technologies"
|
||||||
devs := []schema.HardwarePCIeDevice{{
|
devs := []schema.HardwarePCIeDevice{{
|
||||||
@@ -99,7 +99,7 @@ func TestEnrichPCIeWithMellanox_fallbackEthtool(t *testing.T) {
|
|||||||
return "driver: mlx5_core\nfirmware-version: 28.40.1000\n", nil
|
return "driver: mlx5_core\nfirmware-version: 28.40.1000\n", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
vendorID := mellanoxVendorID
|
vendorID := MellanoxVendorID
|
||||||
bdf := "0000:18:00.0"
|
bdf := "0000:18:00.0"
|
||||||
manufacturer := "NVIDIA Networking"
|
manufacturer := "NVIDIA Networking"
|
||||||
devs := []schema.HardwarePCIeDevice{{
|
devs := []schema.HardwarePCIeDevice{{
|
||||||
|
|||||||
@@ -10,8 +10,6 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
const nvidiaVendorID = 0x10de
|
|
||||||
|
|
||||||
type nvidiaGPUInfo struct {
|
type nvidiaGPUInfo struct {
|
||||||
Index int
|
Index int
|
||||||
BDF string
|
BDF string
|
||||||
@@ -240,13 +238,7 @@ func normalizePCIeBDF(bdf string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
||||||
if dev.VendorID != nil && *dev.VendorID == nvidiaVendorID {
|
return dev.VendorID != nil && *dev.VendorID == NvidiaVendorID
|
||||||
return true
|
|
||||||
}
|
|
||||||
if dev.Manufacturer != nil && strings.Contains(strings.ToLower(*dev.Manufacturer), "nvidia") {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ func TestNormalizePCIeBDF(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||||
vendorID := nvidiaVendorID
|
vendorID := NvidiaVendorID
|
||||||
bdf := "0000:65:00.0"
|
bdf := "0000:65:00.0"
|
||||||
manufacturer := "NVIDIA Corporation"
|
manufacturer := "NVIDIA Corporation"
|
||||||
status := "OK"
|
status := "OK"
|
||||||
@@ -104,7 +104,7 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
||||||
vendorID := nvidiaVendorID
|
vendorID := NvidiaVendorID
|
||||||
bdf := "0000:17:00.0"
|
bdf := "0000:17:00.0"
|
||||||
manufacturer := "NVIDIA Corporation"
|
manufacturer := "NVIDIA Corporation"
|
||||||
devices := []schema.HardwarePCIeDevice{
|
devices := []schema.HardwarePCIeDevice{
|
||||||
|
|||||||
11
audit/internal/collector/pci_vendors.go
Normal file
11
audit/internal/collector/pci_vendors.go
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
// PCI vendor IDs for hardware classification.
// Source: https://pcisig.com / https://pci-ids.ucw.cz/
const (
	// NvidiaVendorID is the PCI vendor ID used to match NVIDIA devices.
	NvidiaVendorID = 0x10de
	// AMDVendorID is the PCI vendor ID used to match AMD devices.
	AMDVendorID = 0x1002
	// AspeedVendorID is the PCI vendor ID used to match ASPEED devices.
	AspeedVendorID = 0x1a03
	// MellanoxVendorID is the PCI vendor ID used to match Mellanox devices.
	MellanoxVendorID = 0x15b3
	// IntelVendorID is the PCI vendor ID used to match Intel devices.
	IntelVendorID = 0x8086
)
|
||||||
@@ -58,7 +58,6 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
|||||||
|
|
||||||
for _, chip := range chips {
|
for _, chip := range chips {
|
||||||
features := doc[chip]
|
features := doc[chip]
|
||||||
location := sensorLocation(chip)
|
|
||||||
|
|
||||||
keys := make([]string, 0, len(features))
|
keys := make([]string, 0, len(features))
|
||||||
for key := range features {
|
for key := range features {
|
||||||
@@ -80,25 +79,25 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
|||||||
}
|
}
|
||||||
switch classifySensorFeature(feature) {
|
switch classifySensorFeature(feature) {
|
||||||
case "fan":
|
case "fan":
|
||||||
item := buildFanSensor(name, location, feature)
|
item := buildFanSensor(name, feature)
|
||||||
if item == nil || duplicateSensor(seen, "fan", item.Name) {
|
if item == nil || duplicateSensor(seen, "fan", item.Name) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
result.Fans = append(result.Fans, *item)
|
result.Fans = append(result.Fans, *item)
|
||||||
case "temp":
|
case "temp":
|
||||||
item := buildTempSensor(name, location, feature)
|
item := buildTempSensor(name, feature)
|
||||||
if item == nil || duplicateSensor(seen, "temp", item.Name) {
|
if item == nil || duplicateSensor(seen, "temp", item.Name) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
result.Temperatures = append(result.Temperatures, *item)
|
result.Temperatures = append(result.Temperatures, *item)
|
||||||
case "power":
|
case "power":
|
||||||
item := buildPowerSensor(name, location, feature)
|
item := buildPowerSensor(name, feature)
|
||||||
if item == nil || duplicateSensor(seen, "power", item.Name) {
|
if item == nil || duplicateSensor(seen, "power", item.Name) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
result.Power = append(result.Power, *item)
|
result.Power = append(result.Power, *item)
|
||||||
default:
|
default:
|
||||||
item := buildOtherSensor(name, location, feature)
|
item := buildOtherSensor(name, feature)
|
||||||
if item == nil || duplicateSensor(seen, "other", item.Name) {
|
if item == nil || duplicateSensor(seen, "other", item.Name) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -128,14 +127,6 @@ func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func sensorLocation(chip string) *string {
|
|
||||||
chip = strings.TrimSpace(chip)
|
|
||||||
if chip == "" {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return &chip
|
|
||||||
}
|
|
||||||
|
|
||||||
func classifySensorFeature(feature map[string]any) string {
|
func classifySensorFeature(feature map[string]any) string {
|
||||||
for key := range feature {
|
for key := range feature {
|
||||||
switch {
|
switch {
|
||||||
@@ -154,24 +145,24 @@ func classifySensorFeature(feature map[string]any) string {
|
|||||||
return "other"
|
return "other"
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
|
func buildFanSensor(name string, feature map[string]any) *schema.HardwareFanSensor {
|
||||||
rpm, ok := firstFeatureInt(feature, "_input")
|
rpm, ok := firstFeatureInt(feature, "_input")
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
item := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &rpm}
|
item := &schema.HardwareFanSensor{Name: name, RPM: &rpm}
|
||||||
if status := sensorStatusFromFeature(feature); status != nil {
|
if status := sensorStatusFromFeature(feature); status != nil {
|
||||||
item.Status = status
|
item.Status = status
|
||||||
}
|
}
|
||||||
return item
|
return item
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
func buildTempSensor(name string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||||
celsius, ok := firstFeatureFloat(feature, "_input")
|
celsius, ok := firstFeatureFloat(feature, "_input")
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
item := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &celsius}
|
item := &schema.HardwareTemperatureSensor{Name: name, Celsius: &celsius}
|
||||||
if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
|
if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
|
||||||
item.ThresholdWarningCelsius = &warning
|
item.ThresholdWarningCelsius = &warning
|
||||||
}
|
}
|
||||||
@@ -186,8 +177,8 @@ func buildTempSensor(name string, location *string, feature map[string]any) *sch
|
|||||||
return item
|
return item
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
|
func buildPowerSensor(name string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||||
item := &schema.HardwarePowerSensor{Name: name, Location: location}
|
item := &schema.HardwarePowerSensor{Name: name}
|
||||||
if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
|
if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
|
||||||
item.PowerW = &v
|
item.PowerW = &v
|
||||||
}
|
}
|
||||||
@@ -206,12 +197,12 @@ func buildPowerSensor(name string, location *string, feature map[string]any) *sc
|
|||||||
return item
|
return item
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
|
func buildOtherSensor(name string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||||
value, unit, ok := firstGenericSensorValue(feature)
|
value, unit, ok := firstGenericSensorValue(feature)
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
item := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &value}
|
item := &schema.HardwareOtherSensor{Name: name, Value: &value}
|
||||||
if unit != "" {
|
if unit != "" {
|
||||||
item.Unit = &unit
|
item.Unit = &unit
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -258,7 +258,7 @@ func (s *System) GetInterfaceState(iface string) (bool, error) {
|
|||||||
func interfaceAdminState(iface string) (bool, error) {
|
func interfaceAdminState(iface string) (bool, error) {
|
||||||
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, err
|
return false, fmt.Errorf("ip link show dev %s: %w", iface, err)
|
||||||
}
|
}
|
||||||
return parseInterfaceAdminState(string(raw))
|
return parseInterfaceAdminState(string(raw))
|
||||||
}
|
}
|
||||||
@@ -288,7 +288,7 @@ func interfaceIPv4Addrs(iface string) ([]string, error) {
|
|||||||
if errors.As(err, &exitErr) {
|
if errors.As(err, &exitErr) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
return nil, err
|
return nil, fmt.Errorf("ip addr show dev %s: %w", iface, err)
|
||||||
}
|
}
|
||||||
var ipv4 []string
|
var ipv4 []string
|
||||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
// core/internal/ingest/parser_hardware.go. No import dependency on core.
|
// core/internal/ingest/parser_hardware.go. No import dependency on core.
|
||||||
package schema
|
package schema
|
||||||
|
|
||||||
|
import "encoding/json"
|
||||||
|
|
||||||
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
||||||
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
||||||
type HardwareIngestRequest struct {
|
type HardwareIngestRequest struct {
|
||||||
@@ -64,9 +66,10 @@ type HardwareSnapshot struct {
|
|||||||
Storage []HardwareStorage `json:"storage,omitempty"`
|
Storage []HardwareStorage `json:"storage,omitempty"`
|
||||||
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
||||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
PlatformConfig *json.RawMessage `json:"platform_config,omitempty"`
|
||||||
|
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareHealthSummary struct {
|
type HardwareHealthSummary struct {
|
||||||
@@ -123,7 +126,7 @@ type HardwareCPU struct {
|
|||||||
type HardwareMemory struct {
|
type HardwareMemory struct {
|
||||||
HardwareComponentStatus
|
HardwareComponentStatus
|
||||||
Slot *string `json:"slot,omitempty"`
|
Slot *string `json:"slot,omitempty"`
|
||||||
Location *string `json:"location,omitempty"`
|
Location *string `json:"-"` // internal: used for DIMM telemetry matching only
|
||||||
Present *bool `json:"present,omitempty"`
|
Present *bool `json:"present,omitempty"`
|
||||||
SizeMB *int `json:"size_mb,omitempty"`
|
SizeMB *int `json:"size_mb,omitempty"`
|
||||||
Type *string `json:"type,omitempty"`
|
Type *string `json:"type,omitempty"`
|
||||||
@@ -261,15 +264,13 @@ type HardwareSensors struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type HardwareFanSensor struct {
|
type HardwareFanSensor struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Location *string `json:"location,omitempty"`
|
RPM *int `json:"rpm,omitempty"`
|
||||||
RPM *int `json:"rpm,omitempty"`
|
Status *string `json:"status,omitempty"`
|
||||||
Status *string `json:"status,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwarePowerSensor struct {
|
type HardwarePowerSensor struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Location *string `json:"location,omitempty"`
|
|
||||||
VoltageV *float64 `json:"voltage_v,omitempty"`
|
VoltageV *float64 `json:"voltage_v,omitempty"`
|
||||||
CurrentA *float64 `json:"current_a,omitempty"`
|
CurrentA *float64 `json:"current_a,omitempty"`
|
||||||
PowerW *float64 `json:"power_w,omitempty"`
|
PowerW *float64 `json:"power_w,omitempty"`
|
||||||
@@ -278,7 +279,6 @@ type HardwarePowerSensor struct {
|
|||||||
|
|
||||||
type HardwareTemperatureSensor struct {
|
type HardwareTemperatureSensor struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Location *string `json:"location,omitempty"`
|
|
||||||
Celsius *float64 `json:"celsius,omitempty"`
|
Celsius *float64 `json:"celsius,omitempty"`
|
||||||
ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"`
|
ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"`
|
||||||
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
|
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
|
||||||
@@ -286,11 +286,10 @@ type HardwareTemperatureSensor struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type HardwareOtherSensor struct {
|
type HardwareOtherSensor struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Location *string `json:"location,omitempty"`
|
Value *float64 `json:"value,omitempty"`
|
||||||
Value *float64 `json:"value,omitempty"`
|
Unit *string `json:"unit,omitempty"`
|
||||||
Unit *string `json:"unit,omitempty"`
|
Status *string `json:"status,omitempty"`
|
||||||
Status *string `json:"status,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareEventLog struct {
|
type HardwareEventLog struct {
|
||||||
|
|||||||
@@ -11,6 +11,13 @@ import (
|
|||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// PCI vendor IDs used for GPU classification (source: pci-ids.ucw.cz).
|
||||||
|
const (
|
||||||
|
pciVendorNvidia = 0x10de
|
||||||
|
pciVendorAMD = 0x1002
|
||||||
|
pciVendorAspeed = 0x1a03
|
||||||
|
)
|
||||||
|
|
||||||
type validateInventory struct {
|
type validateInventory struct {
|
||||||
CPU string
|
CPU string
|
||||||
Memory string
|
Memory string
|
||||||
@@ -634,22 +641,16 @@ func validateFirstNonEmpty(values ...string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||||
model := strings.ToLower(validateTrimPtr(dev.Model))
|
if dev.VendorID != nil && *dev.VendorID == pciVendorAspeed {
|
||||||
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
|
||||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
|
||||||
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||||
|
isGPUClass := class == "videocontroller" || class == "processingaccelerator" || class == "displaycontroller"
|
||||||
switch vendor {
|
switch vendor {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
isNVIDIAVendor := strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorNvidia
|
||||||
isGPUClass := class == "videocontroller" || class == "processingaccelerator" || class == "displaycontroller"
|
|
||||||
return isNVIDIAVendor && isGPUClass
|
|
||||||
case "amd":
|
case "amd":
|
||||||
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorAMD
|
||||||
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
|
||||||
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
|
||||||
return isGPUClass && (isAMDVendor || isAMDModel)
|
|
||||||
default:
|
default:
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|||||||
185
bible-local/architecture/api-surface.md
Normal file
185
bible-local/architecture/api-surface.md
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
# API Surface
|
||||||
|
|
||||||
|
HTTP endpoints exposed by `bee web` (binds `0.0.0.0:80`).
|
||||||
|
Handler registration: `audit/internal/webui/server.go` → `NewHandler()`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Health & readiness
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|----------------|-----------------------------------------------------|
|
||||||
|
| GET | `/healthz` | Always 200. Used by load balancers / boot scripts. |
|
||||||
|
| GET | `/api/ready` | 200 when audit JSON exists and is readable. |
|
||||||
|
| GET | `/loading` | HTML loading page shown before first audit. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Audit
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|-----------------------|--------------------------------------------------------------|
|
||||||
|
| GET | `/audit.json` | Latest audit JSON with SAT overlay applied. |
|
||||||
|
| GET | `/runtime-health.json`| Latest runtime preflight JSON. |
|
||||||
|
| POST | `/api/audit/run` | Enqueue a full `bee audit` run. Returns task ID. |
|
||||||
|
| GET | `/api/audit/stream` | SSE: audit run log lines (`data:` + newline per line). |
|
||||||
|
| GET | `/api/preflight` | Run runtime preflight check (synchronous, returns JSON). |
|
||||||
|
| GET | `/api/hardware-summary` | Hardware health summary (status counts + failures). |
|
||||||
|
| GET | `/api/components/{type}` | HTML fragment for component detail dialog (e.g. `cpu`, `memory`, `storage`, `pcie`). |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## SAT (System Acceptance Testing)
|
||||||
|
|
||||||
|
All SAT run endpoints enqueue an async task. Response: `{"task_id": "..."}`.
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|--------------------------------------------|-----------------------------------|
|
||||||
|
| POST | `/api/sat/nvidia/run` | NVIDIA DCGM SAT |
|
||||||
|
| POST | `/api/sat/nvidia-targeted-stress/run` | NVIDIA targeted stress validation |
|
||||||
|
| POST | `/api/sat/nvidia-compute/run` | NVIDIA max compute load |
|
||||||
|
| POST | `/api/sat/nvidia-targeted-power/run` | NVIDIA targeted power |
|
||||||
|
| POST | `/api/sat/nvidia-pulse/run` | NVIDIA pulse test |
|
||||||
|
| POST | `/api/sat/nvidia-interconnect/run` | NCCL all_reduce_perf |
|
||||||
|
| POST | `/api/sat/nvidia-bandwidth/run` | NVBandwidth test |
|
||||||
|
| POST | `/api/sat/nvidia-stress/run` | NVIDIA stress pack |
|
||||||
|
| POST | `/api/sat/memory/run` | Memory acceptance |
|
||||||
|
| POST | `/api/sat/storage/run` | Storage acceptance (smartctl) |
|
||||||
|
| POST | `/api/sat/cpu/run` | CPU acceptance (stress-ng) |
|
||||||
|
| POST | `/api/sat/amd/run` | AMD GPU SAT (ROCm) |
|
||||||
|
| POST | `/api/sat/amd-mem/run` | AMD memory integrity + bandwidth |
|
||||||
|
| POST | `/api/sat/amd-bandwidth/run` | AMD memory bandwidth |
|
||||||
|
| POST | `/api/sat/amd-stress/run` | AMD GPU stress |
|
||||||
|
| POST | `/api/sat/memory-stress/run` | Memory stress |
|
||||||
|
| POST | `/api/sat/sat-stress/run` | Combined storage+memory stress |
|
||||||
|
| POST | `/api/sat/platform-stress/run` | Fan + thermal stress |
|
||||||
|
| GET | `/api/sat/stream` | SSE: live SAT log stream |
|
||||||
|
| POST | `/api/sat/abort` | Abort the running SAT task |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|-----------------------------------------|----------------------------------------------|
|
||||||
|
| POST | `/api/bee-bench/nvidia/perf/run` | NVIDIA performance benchmark |
|
||||||
|
| POST | `/api/bee-bench/nvidia/power/run` | NVIDIA power benchmark |
|
||||||
|
| POST | `/api/bee-bench/nvidia/autotune/run` | Power source autotune (prerequisite for benchmarks) |
|
||||||
|
| GET | `/api/bee-bench/nvidia/autotune/status` | Current autotune result / status |
|
||||||
|
| GET | `/api/benchmark/results` | List completed benchmark result archives |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tasks (async job queue)
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|-----------------------------|----------------------------------------------------|
|
||||||
|
| GET | `/api/tasks` | List all tasks with status |
|
||||||
|
| POST | `/api/tasks/cancel-all` | Cancel all pending/running tasks |
|
||||||
|
| POST | `/api/tasks/kill-workers` | Force-kill worker goroutines |
|
||||||
|
| POST | `/api/tasks/{id}/cancel` | Cancel a specific task |
|
||||||
|
| POST | `/api/tasks/{id}/priority` | Elevate task priority |
|
||||||
|
| GET | `/api/tasks/{id}/stream` | SSE: live log stream for a task |
|
||||||
|
| GET | `/api/tasks/{id}/charts` | List chart names for a task |
|
||||||
|
| GET | `/api/tasks/{id}/chart/` | SVG chart for a task result |
|
||||||
|
| GET | `/tasks/{id}` | HTML task detail page |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Services
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|---------------------------|--------------------------------------------------|
|
||||||
|
| GET | `/api/services` | List bee-* systemd services and their states |
|
||||||
|
| POST | `/api/services/action` | start/stop/restart a service |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Network
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|----------------------------|-----------------------------------------------------|
|
||||||
|
| GET | `/api/network` | List interfaces with state and IPv4 addresses |
|
||||||
|
| POST | `/api/network/dhcp` | Run dhclient on one or all interfaces |
|
||||||
|
| POST | `/api/network/static` | Set static IPv4 address |
|
||||||
|
| POST | `/api/network/toggle` | Bring interface up or down |
|
||||||
|
| POST | `/api/network/confirm` | Confirm pending network change (clears rollback) |
|
||||||
|
| POST | `/api/network/rollback` | Restore pre-change network snapshot |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Export
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|-------------------------------|---------------------------------------------------|
|
||||||
|
| GET | `/export/support.tar.gz` | Download support bundle (live-generated) |
|
||||||
|
| GET | `/export/file` | Download a file from the export dir by path param |
|
||||||
|
| GET | `/export/` | Browse export dir (HTML index) |
|
||||||
|
| GET | `/api/export/list` | JSON list of files in export dir |
|
||||||
|
| GET | `/api/export/usb` | List removable USB targets available for export |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## GPU
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|----------------------------|----------------------------------------------------|
|
||||||
|
| GET | `/api/gpu/presence` | `{"nvidia": bool, "amd": bool}` |
|
||||||
|
| GET | `/api/gpu/nvidia` | List NVIDIA GPUs from nvidia-smi |
|
||||||
|
| GET | `/api/gpu/nvidia-status` | Per-GPU status (ECC, power, throttle) |
|
||||||
|
| POST | `/api/gpu/nvidia-reset` | GPU reset by index |
|
||||||
|
| GET | `/api/gpu/tools` | nvidia-smi / rocm-smi tool availability |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## System
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|------------------------------|---------------------------------------------------|
|
||||||
|
| GET | `/api/system/ram-status` | toram boot state and ISO copy status |
|
||||||
|
| POST | `/api/system/install-to-ram` | Copy ISO to RAM (background task) |
|
||||||
|
| GET | `/api/install/disks` | List block devices suitable for disk installation |
|
||||||
|
| POST | `/api/install/run` | Install bee to disk (background task) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tools & NVMe
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|-------------------------------|--------------------------------------------------|
|
||||||
|
| GET | `/api/tools/check` | Check availability of required CLI tools |
|
||||||
|
| GET | `/api/tools/nvme-formats` | List NVMe format options for a device |
|
||||||
|
| POST | `/api/tools/nvme-format/run` | Run nvme-format on a device |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Live metrics
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|------------------------------|---------------------------------------------------|
|
||||||
|
| GET | `/api/metrics/stream` | SSE: live metrics (GPU power, temp, utilization) |
|
||||||
|
| GET | `/api/metrics/latest` | Latest metrics snapshot (JSON) |
|
||||||
|
| GET | `/api/metrics/chart/` | SVG chart for a metric over time |
|
||||||
|
| GET | `/api/metrics/export.csv` | Download metrics history as CSV |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Blackbox logging
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|----------------------------|-----------------------------------------------|
|
||||||
|
| GET | `/api/blackbox/status` | Blackbox log state (enabled, size, path) |
|
||||||
|
| POST | `/api/blackbox/enable` | Start recording blackbox log |
|
||||||
|
| POST | `/api/blackbox/disable` | Stop recording, flush to disk |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## UI pages
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|------------|-----------------------------------------------|
|
||||||
|
| GET | `/` | Main dashboard (serves all page routes) |
|
||||||
|
| GET | `/viewer` | Standalone JSON viewer for uploaded audit files |
|
||||||
|
|
||||||
|
All pages are rendered server-side as HTML. The `/` route handles sub-paths such as
|
||||||
|
`/network`, `/services`, `/sat`, `/benchmark`, `/install`, `/validate`, `/export`.
|
||||||
137
bible-local/architecture/data-model.md
Normal file
137
bible-local/architecture/data-model.md
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
# Data Model
|
||||||
|
|
||||||
|
The canonical output of `bee audit` is a `HardwareIngestRequest` JSON document accepted
|
||||||
|
by the Reanimator `/api/ingest/hardware` endpoint. The ingest endpoint uses a strict
|
||||||
|
decoder — unknown fields cause `400 Bad Request`.
|
||||||
|
|
||||||
|
Source of truth: `audit/internal/schema/hardware.go`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Top-level: HardwareIngestRequest
|
||||||
|
|
||||||
|
```
|
||||||
|
HardwareIngestRequest
|
||||||
|
├── collected_at string RFC3339 UTC timestamp of collection
|
||||||
|
├── hardware HardwareSnapshot
|
||||||
|
├── runtime RuntimeHealth? from bee-runtime-preflight service
|
||||||
|
├── filename string?
|
||||||
|
├── source_type string?
|
||||||
|
├── protocol string?
|
||||||
|
└── target_host string?
|
||||||
|
```
|
||||||
|
|
||||||
|
`collected_at` is the primary sort key used by Reanimator to deduplicate ingests.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## HardwareSnapshot
|
||||||
|
|
||||||
|
All component arrays are `omitempty` — absent when the collector finds nothing.
|
||||||
|
|
||||||
|
| JSON key | Go type | Source |
|
||||||
|
|-------------------|----------------------------|------------------------------|
|
||||||
|
| `board` | HardwareBoard | dmidecode type 1/2 |
|
||||||
|
| `firmware` | []HardwareFirmwareRecord | dmidecode type 0/13 |
|
||||||
|
| `cpus` | []HardwareCPU | dmidecode type 4 |
|
||||||
|
| `memory` | []HardwareMemory | dmidecode type 17 |
|
||||||
|
| `storage` | []HardwareStorage | lsblk + nvme-cli + smartctl |
|
||||||
|
| `pcie_devices` | []HardwarePCIeDevice | lspci |
|
||||||
|
| `power_supplies` | []HardwarePowerSupply | ipmitool fru + sdr |
|
||||||
|
| `sensors` | *HardwareSensors | sensors -j |
|
||||||
|
| `event_logs` | []HardwareEventLog | ipmitool sel + journald |
|
||||||
|
| `platform_config` | *json.RawMessage | reserved, nil until used |
|
||||||
|
| `vroc_license` | *string | vroc-cli |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Identity keys
|
||||||
|
|
||||||
|
Reanimator uses these fields to match components across successive audits:
|
||||||
|
|
||||||
|
| Component | Identity key |
|
||||||
|
|----------------|------------------------------------------------|
|
||||||
|
| Board | `board.serial_number` (required, never empty) |
|
||||||
|
| CPU | `serial_number` if present; else generated key |
|
||||||
|
| Memory DIMM | `serial_number` — absent DIMMs have `present: false` |
|
||||||
|
| Storage | `serial_number` if present; else `linux_device` from Telemetry |
|
||||||
|
| PCIe device | `bdf` (Bus:Device.Function address) |
|
||||||
|
| PSU | `slot` |
|
||||||
|
|
||||||
|
Components without a stable identity are still emitted but may not be matched across runs.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## HardwareComponentStatus (embedded in all components)
|
||||||
|
|
||||||
|
```go
|
||||||
|
type HardwareComponentStatus struct {
|
||||||
|
Status *string `json:"status,omitempty"` // OK | Warning | Critical | Unknown
|
||||||
|
ErrorDescription *string `json:"error_description,omitempty"`
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Status is set by collectors and overwritten at render time by `ApplySATOverlay`
|
||||||
|
(latest SAT run results are always merged on top before display).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## HardwarePCIeDevice
|
||||||
|
|
||||||
|
The most enriched component type. Key fields:
|
||||||
|
|
||||||
|
| JSON key | Meaning |
|
||||||
|
|----------------------|------------------------------------------------|
|
||||||
|
| `bdf` | PCI address (identity key), e.g. `0000:4b:00.0` |
|
||||||
|
| `vendor_id` | Numeric PCI vendor ID (hex). Use this for classification — not `manufacturer`. |
|
||||||
|
| `device_id` | Numeric PCI device ID (hex) |
|
||||||
|
| `device_class` | Human-readable class, e.g. `VideoController` |
|
||||||
|
| `manufacturer` | String label from lspci — for display only |
|
||||||
|
| `model` | From nvidia-smi / rocm-smi — display name |
|
||||||
|
| `link_speed` | Current PCIe link speed, e.g. `Gen4` |
|
||||||
|
| `max_link_speed` | Maximum supported link speed |
|
||||||
|
| `link_width` | Current lane count |
|
||||||
|
| `max_link_width` | Max lane count |
|
||||||
|
| `temperature_c` | From nvidia-smi / rocm-smi |
|
||||||
|
| `power_w` | Current power draw |
|
||||||
|
| `ecc_uncorrected_total` | Cumulative ECC uncorrected errors (NVIDIA) |
|
||||||
|
| `ecc_corrected_total` | Cumulative ECC corrected errors (NVIDIA) |
|
||||||
|
| `hw_slowdown` | HW throttle active (NVIDIA) |
|
||||||
|
| `telemetry` | Free-form map for vendor-specific extras |
|
||||||
|
|
||||||
|
**Classification rule**: use `vendor_id` (numeric PCI ID), never `manufacturer` string.
|
||||||
|
|
||||||
|
| Vendor | vendor_id |
|
||||||
|
|-----------|-----------|
|
||||||
|
| NVIDIA | `0x10de` |
|
||||||
|
| AMD | `0x1002` |
|
||||||
|
| Mellanox | `0x15b3` |
|
||||||
|
| Aspeed | `0x1a03` |
|
||||||
|
| Intel | `0x8086` |
|
||||||
|
|
||||||
|
Constants live in `audit/internal/collector/pci_vendors.go`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## HardwareMemory
|
||||||
|
|
||||||
|
The `location` field exists in the Go struct with `json:"-"` — it is intentionally excluded
|
||||||
|
from JSON output because the Reanimator schema does not include it. It is used internally
|
||||||
|
for DIMM telemetry matching only (`collector/memory_telemetry.go`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## HardwareSensors
|
||||||
|
|
||||||
|
Sensor structs (`HardwareFanSensor`, `HardwareTemperatureSensor`,
|
||||||
|
`HardwarePowerSensor`, `HardwareOtherSensor`) do **not** have a `location` field.
|
||||||
|
Location was removed in contract v2.8. The Go types mirror the schema exactly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## JSON naming convention
|
||||||
|
|
||||||
|
All JSON keys are `snake_case`. Go field names are `CamelCase`. The mapping is
|
||||||
|
maintained by struct tags in `audit/internal/schema/hardware.go`.
|
||||||
|
|
||||||
|
All pointer fields use `omitempty` — absent means not collected (not zero).
|
||||||
Reference in New Issue
Block a user