diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index 1bc1356..c9b09ee 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -6,11 +6,8 @@ import ( "fmt" "log/slog" "os" - "path/filepath" - "sort" "strconv" "strings" - "time" "bee/audit/internal/collector" "bee/audit/internal/platform" @@ -301,579 +298,6 @@ func (a *App) RunAuditToDefaultFile(runtimeMode runtimeenv.Mode) (string, error) return a.RunAudit(runtimeMode, "file:"+DefaultAuditJSONPath) } -func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error) { - if _, err := os.Stat(DefaultAuditJSONPath); err != nil { - return "", err - } - filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405")) - tmpPath := filepath.Join(os.TempDir(), filename) - data, err := readFileLimited(DefaultAuditJSONPath, 100<<20) - if err != nil { - return "", err - } - if normalized, normErr := ApplySATOverlay(data); normErr == nil { - data = normalized - } - if err := os.WriteFile(tmpPath, data, 0644); err != nil { - return "", err - } - defer os.Remove(tmpPath) - return a.exports.ExportFileToTarget(tmpPath, target) -} - -func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) { - path, err := a.ExportLatestAudit(target) - body := "Audit export failed." - if err == nil { - body = "Audit exported." 
- } - if err == nil && path != "" { - body = "Audit exported to " + path - } - return ActionResult{Title: "Export audit", Body: body}, err -} - -func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, error) { - archive, err := BuildSupportBundle(DefaultExportDir) - if err != nil { - return "", err - } - defer os.Remove(archive) - return a.exports.ExportFileToTarget(archive, target) -} - -func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) { - path, err := a.ExportSupportBundle(target) - body := "Support bundle export failed." - if err == nil { - body = "Support bundle exported. USB target unmounted and safe to remove." - } - if err == nil && path != "" { - body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove." - } - return ActionResult{Title: "Export support bundle", Body: body}, err -} - -func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) { - return a.network.ListInterfaces() -} - -func (a *App) DefaultRoute() string { - return a.network.DefaultRoute() -} - -func (a *App) DHCPOne(iface string) (string, error) { - return a.network.DHCPOne(iface) -} - -func (a *App) DHCPOneResult(iface string) (ActionResult, error) { - body, err := a.network.DHCPOne(iface) - return ActionResult{Title: "DHCP: " + iface, Body: bodyOr(body, "DHCP completed.")}, err -} - -func (a *App) DHCPAll() (string, error) { - return a.network.DHCPAll() -} - -func (a *App) DHCPAllResult() (ActionResult, error) { - body, err := a.network.DHCPAll() - return ActionResult{Title: "DHCP: all interfaces", Body: bodyOr(body, "DHCP completed.")}, err -} - -func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) { - return a.network.SetStaticIPv4(cfg) -} - -func (a *App) SetInterfaceState(iface string, up bool) error { - return a.network.SetInterfaceState(iface, up) -} - -func (a *App) GetInterfaceState(iface string) (bool, error) { - return 
a.network.GetInterfaceState(iface) -} - -func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) { - return a.network.CaptureNetworkSnapshot() -} - -func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error { - return a.network.RestoreNetworkSnapshot(snapshot) -} - -func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) { - body, err := a.network.SetStaticIPv4(cfg) - return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err -} - -func (a *App) NetworkStatus() (ActionResult, error) { - ifaces, err := a.network.ListInterfaces() - if err != nil { - return ActionResult{Title: "Network status"}, err - } - if len(ifaces) == 0 { - return ActionResult{Title: "Network status", Body: "No physical interfaces found."}, nil - } - var body strings.Builder - for _, iface := range ifaces { - ipv4 := "(no IPv4)" - if len(iface.IPv4) > 0 { - ipv4 = strings.Join(iface.IPv4, ", ") - } - fmt.Fprintf(&body, "- %s: state=%s ip=%s\n", iface.Name, iface.State, ipv4) - } - if gw := a.network.DefaultRoute(); gw != "" { - fmt.Fprintf(&body, "\nDefault route: %s\n", gw) - } - return ActionResult{Title: "Network status", Body: strings.TrimSpace(body.String())}, nil -} - -func (a *App) DefaultStaticIPv4FormFields(iface string) []string { - return []string{ - "", - "24", - strings.TrimSpace(a.network.DefaultRoute()), - "77.88.8.8 77.88.8.1 1.1.1.1 8.8.8.8", - } -} - -func (a *App) ParseStaticIPv4Config(iface string, fields []string) platform.StaticIPv4Config { - get := func(index int) string { - if index >= 0 && index < len(fields) { - return strings.TrimSpace(fields[index]) - } - return "" - } - return platform.StaticIPv4Config{ - Interface: iface, - Address: get(0), - Prefix: get(1), - Gateway: get(2), - DNS: strings.Fields(get(3)), - } -} - -func (a *App) ListBeeServices() ([]string, error) { - return a.services.ListBeeServices() -} - -func (a *App) ServiceState(name string) 
string { - return a.services.ServiceState(name) -} - -func (a *App) ServiceStatus(name string) (string, error) { - return a.services.ServiceStatus(name) -} - -func (a *App) ServiceStatusResult(name string) (ActionResult, error) { - body, err := a.services.ServiceStatus(name) - return ActionResult{Title: "service status: " + name, Body: bodyOr(body, "No status output.")}, err -} - -func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, error) { - return a.services.ServiceDo(name, action) -} - -func (a *App) ServiceActionResult(name string, action platform.ServiceAction) (ActionResult, error) { - body, err := a.services.ServiceDo(name, action) - return ActionResult{Title: "service " + string(action) + ": " + name, Body: bodyOr(body, "Action completed.")}, err -} - -func (a *App) ListRemovableTargets() ([]platform.RemovableTarget, error) { - return a.exports.ListRemovableTargets() -} - -func (a *App) TailFile(path string, lines int) string { - return a.tools.TailFile(path, lines) -} - -func (a *App) CheckTools(names []string) []platform.ToolStatus { - return a.tools.CheckTools(names) -} - -func (a *App) ToolCheckResult(names []string) ActionResult { - if len(names) == 0 { - return ActionResult{Title: "Required tools", Body: "No tools checked."} - } - var body strings.Builder - for _, tool := range a.tools.CheckTools(names) { - status := "MISSING" - if tool.OK { - status = "OK (" + tool.Path + ")" - } - fmt.Fprintf(&body, "- %s: %s\n", tool.Name, status) - } - return ActionResult{Title: "Required tools", Body: strings.TrimSpace(body.String())} -} - -func (a *App) AuditLogTailResult() ActionResult { - logTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditLogPath, 40)) - jsonTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditJSONPath, 20)) - body := strings.TrimSpace(logTail + "\n\n" + jsonTail) - if body == "" { - body = "No audit logs found." 
- } - return ActionResult{Title: "Audit log tail", Body: body} -} - -func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc) -} - -func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) { - path, err := a.RunNvidiaAcceptancePack(baseDir, nil) - body := "Archive written." - if path != "" { - body = "Archive written to " + path - } - return ActionResult{Title: "NVIDIA SAT", Body: body}, err -} - -func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) { - return a.sat.ListNvidiaGPUs() -} - -func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) { - return a.sat.ListNvidiaGPUStatuses() -} - -func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) { - out, err := a.sat.ResetNvidiaGPU(index) - return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err -} - -func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc) - body := "Archive written." 
- if path != "" { - body = "Archive written to " + path - } - return ActionResult{Title: "NVIDIA DCGM", Body: body}, err -} - -func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc) -} - -func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { - return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc) -} - -func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { - return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc) -} - -func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultBeeBenchPerfDir - } - resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc) - if err != nil { - return "", err - } - opts.ServerPowerSource = resolved.SelectedSource - return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) -} - -func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultBeeBenchPowerDir - } - resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc) - if err != nil { - return "", err - } - opts.ServerPowerSource = resolved.SelectedSource - return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc) -} - -func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind 
string, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultBeeBenchAutotuneDir - } - return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc) -} - -func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) { - return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath) -} - -func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) { - cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir) - if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil { - if logFunc != nil { - logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource)) - } - return *cfg, nil - } - if logFunc != nil { - logFunc("benchmark autotune: no saved power source config, running autotune first") - } - autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune") - if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil { - return platform.BenchmarkPowerAutotuneConfig{}, err - } - cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath) - if err != nil { - return platform.BenchmarkPowerAutotuneConfig{}, err - } - return *cfg, nil -} - -func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc) -} - -func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = 
DefaultSATBaseDir - } - return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc) -} - -func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc) -} - -func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc) -} - -func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc) -} - -func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) { - return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc) -} - -func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc) -} - -func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) { - path, err := a.RunMemoryAcceptancePack(baseDir, nil) - return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err -} - -func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) { - return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc) -} - -func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, 
durationSec int, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc) -} - -func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) { - path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil) - return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err -} - -func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) { - return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc) -} - -func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc) -} - -func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) { - path, err := a.RunStorageAcceptancePack(baseDir, nil) - return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err -} - -func (a *App) DetectGPUVendor() string { - return a.sat.DetectGPUVendor() -} - -func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { - return a.sat.ListAMDGPUs() -} - -func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) { - return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc) -} - -func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc) -} - -func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) { - path, err := a.RunAMDAcceptancePack(baseDir, nil) - return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err -} - -func (a *App) 
RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc) -} - -func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc) -} - -func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) { - return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc) -} - -func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) { - return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc) -} - -func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) { - return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc) -} - -func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) { - return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc) -} - -func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) { - return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc) -} - -func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc) -} - -func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return 
a.sat.RunFanStressTest(ctx, baseDir, opts) -} - -func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc) -} - -func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) { - if strings.TrimSpace(baseDir) == "" { - baseDir = DefaultSATBaseDir - } - return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc) -} - -func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) { - path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil) - body := "Results: " + path - if err != nil && err != context.Canceled { - body += "\nERROR: " + err.Error() - } - return ActionResult{Title: "NCCL bandwidth test", Body: body}, err -} - -func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) { - path, err := a.RunFanStressTest(ctx, "", opts) - body := formatFanStressResult(path) - if err != nil && err != context.Canceled { - body += "\nERROR: " + err.Error() - } - return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err -} - -// formatFanStressResult formats the summary.txt from a fan-stress run, including -// the per-step pass/fail display and the analysis section (throttling, max temps, fan response). -func formatFanStressResult(archivePath string) string { - if archivePath == "" { - return "No output produced." - } - runDir := strings.TrimSuffix(archivePath, ".tar.gz") - raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt")) - if err != nil { - return "Archive written to " + archivePath - } - content := strings.TrimSpace(string(raw)) - kv := parseKeyValueSummary(content) - - var b strings.Builder - b.WriteString(formatSATDetail(content)) - - // Append analysis section. 
- var analysis []string - if v, ok := kv["throttling_detected"]; ok { - label := "NO" - if v == "true" { - label = "YES ← throttling detected during load" - } - analysis = append(analysis, "Throttling: "+label) - } - if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" { - analysis = append(analysis, "Max GPU temp: "+v+"°C") - } - if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" { - analysis = append(analysis, "Max CPU temp: "+v+"°C") - } - if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" { - analysis = append(analysis, "Fan response: "+v+"s") - } - - if len(analysis) > 0 { - b.WriteString("\n\n=== Analysis ===\n") - for _, line := range analysis { - b.WriteString(line + "\n") - } - } - return strings.TrimSpace(b.String()) -} - -// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz) -// and returns a formatted human-readable result. Falls back to a plain message if unreadable. -func satResultBody(archivePath string) string { - if archivePath == "" { - return "No output produced." 
- } - runDir := strings.TrimSuffix(archivePath, ".tar.gz") - raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt")) - if err != nil { - return "Archive written to " + archivePath - } - return formatSATDetail(strings.TrimSpace(string(raw))) -} - func (a *App) HealthSummaryResult() ActionResult { raw, err := os.ReadFile(DefaultAuditJSONPath) if err != nil { @@ -963,38 +387,6 @@ func (a *App) ParsePrefix(raw string, fallback int) int { return value } -func hostnameOr(fallback string) string { - hn, err := os.Hostname() - if err != nil || strings.TrimSpace(hn) == "" { - return fallback - } - return hn -} - -func sanitizeFilename(v string) string { - var out []rune - for _, r := range v { - switch { - case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9', r == '-', r == '_', r == '.': - out = append(out, r) - default: - out = append(out, '-') - } - } - if len(out) == 0 { - return "unknown" - } - return string(out) -} - -func bodyOr(body, fallback string) string { - body = strings.TrimSpace(body) - if body == "" { - return fallback - } - return body -} - // writePSUStatusesToDB records PSU statuses collected during audit into the // component-status DB so they are visible in the Hardware Summary card. // PSU status is sourced from IPMI (ipmitool fru + sdr) during audit. 
@@ -1041,373 +433,3 @@ func ReadRuntimeHealth(path string) (schema.RuntimeHealth, error) { } return health, nil } - -func latestSATSummaries() []string { - patterns := []struct { - label string - prefix string - }{ - {label: "NVIDIA SAT", prefix: "gpu-nvidia-"}, - {label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"}, - {label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"}, - {label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"}, - {label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"}, - {label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"}, - {label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"}, - {label: "Memory SAT", prefix: "memory-"}, - {label: "Storage SAT", prefix: "storage-"}, - {label: "CPU SAT", prefix: "cpu-"}, - } - var out []string - for _, item := range patterns { - matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt")) - if err != nil || len(matches) == 0 { - continue - } - sort.Strings(matches) - raw, err := os.ReadFile(matches[len(matches)-1]) - if err != nil { - continue - } - out = append(out, formatSATSummary(item.label, string(raw))) - } - return out -} - -func formatSATSummary(label, raw string) string { - values := parseKeyValueSummary(raw) - var body strings.Builder - fmt.Fprintf(&body, "%s:", label) - if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" { - fmt.Fprintf(&body, " %s", overall) - } - if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" { - fmt.Fprintf(&body, " ok=%s", ok) - } - if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" { - fmt.Fprintf(&body, " failed=%s", failed) - } - if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" { - fmt.Fprintf(&body, " unsupported=%s", 
unsupported) - } - if devices := strings.TrimSpace(values["devices"]); devices != "" { - fmt.Fprintf(&body, "\nDevices: %s", devices) - } - return body.String() -} - -func formatSystemLine(board schema.HardwareBoard) string { - model := strings.TrimSpace(strings.Join([]string{ - trimPtr(board.Manufacturer), - trimPtr(board.ProductName), - }, " ")) - serial := strings.TrimSpace(board.SerialNumber) - switch { - case model != "" && serial != "": - return fmt.Sprintf("System: %s | S/N %s", model, serial) - case model != "": - return "System: " + model - case serial != "": - return "System S/N: " + serial - default: - return "" - } -} - -func formatCPULine(cpus []schema.HardwareCPU) string { - if len(cpus) == 0 { - return "" - } - modelCounts := map[string]int{} - unknown := 0 - for _, cpu := range cpus { - model := trimPtr(cpu.Model) - if model == "" { - unknown++ - continue - } - modelCounts[model]++ - } - if len(modelCounts) == 1 && unknown == 0 { - for model, count := range modelCounts { - return fmt.Sprintf("CPU: %d x %s", count, model) - } - } - parts := make([]string, 0, len(modelCounts)+1) - if len(modelCounts) > 0 { - keys := make([]string, 0, len(modelCounts)) - for key := range modelCounts { - keys = append(keys, key) - } - sort.Strings(keys) - for _, key := range keys { - parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key)) - } - } - if unknown > 0 { - parts = append(parts, fmt.Sprintf("%d x unknown", unknown)) - } - return "CPU: " + strings.Join(parts, ", ") -} - -func formatMemoryLine(dimms []schema.HardwareMemory) string { - totalMB := 0 - present := 0 - types := map[string]struct{}{} - for _, dimm := range dimms { - if dimm.Present != nil && !*dimm.Present { - continue - } - if dimm.SizeMB == nil || *dimm.SizeMB <= 0 { - continue - } - present++ - totalMB += *dimm.SizeMB - if value := trimPtr(dimm.Type); value != "" { - types[value] = struct{}{} - } - } - if totalMB == 0 { - return "" - } - typeText := joinSortedKeys(types) - line := 
fmt.Sprintf("Memory: %s", humanizeMB(totalMB)) - if typeText != "" { - line += " " + typeText - } - if present > 0 { - line += fmt.Sprintf(" (%d DIMMs)", present) - } - return line -} - -func formatStorageLine(disks []schema.HardwareStorage) string { - count := 0 - totalGB := 0 - for _, disk := range disks { - if disk.Present != nil && !*disk.Present { - continue - } - count++ - if disk.SizeGB != nil && *disk.SizeGB > 0 { - totalGB += *disk.SizeGB - } - } - if count == 0 { - return "" - } - line := fmt.Sprintf("Storage: %d drives", count) - if totalGB > 0 { - line += fmt.Sprintf(" / %s", humanizeGB(totalGB)) - } - return line -} - -func formatGPULine(devices []schema.HardwarePCIeDevice) string { - gpus := map[string]int{} - for _, dev := range devices { - if !isGPUDevice(dev) { - continue - } - name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown") - gpus[name]++ - } - if len(gpus) == 0 { - return "" - } - keys := make([]string, 0, len(gpus)) - for key := range gpus { - keys = append(keys, key) - } - sort.Strings(keys) - parts := make([]string, 0, len(keys)) - for _, key := range keys { - parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key)) - } - return "GPU: " + strings.Join(parts, ", ") -} - -func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string { - if list == nil { - return "" - } - ifaces, err := list() - if err != nil { - return "" - } - seen := map[string]struct{}{} - var ips []string - for _, iface := range ifaces { - for _, ip := range iface.IPv4 { - ip = strings.TrimSpace(ip) - if ip == "" { - continue - } - if _, ok := seen[ip]; ok { - continue - } - seen[ip] = struct{}{} - ips = append(ips, ip) - } - } - if len(ips) == 0 { - return "" - } - sort.Strings(ips) - return "IP: " + strings.Join(ips, ", ") -} - -func isGPUDevice(dev schema.HardwarePCIeDevice) bool { - class := trimPtr(dev.DeviceClass) - model := strings.ToLower(trimPtr(dev.Model)) - vendor := strings.ToLower(trimPtr(dev.Manufacturer)) - // 
Exclude ASPEED (BMC VGA adapter, not a compute GPU) - if strings.Contains(vendor, "aspeed") || strings.Contains(model, "aspeed") { - return false - } - // AMD Instinct / Radeon compute GPUs have class ProcessingAccelerator or DisplayController. - // Do NOT match by AMD vendor alone — chipset/CPU PCIe devices share that vendor. - return class == "VideoController" || - class == "DisplayController" || - class == "ProcessingAccelerator" || - strings.Contains(model, "nvidia") || - strings.Contains(vendor, "nvidia") -} - -func trimPtr(value *string) string { - if value == nil { - return "" - } - return strings.TrimSpace(*value) -} - -func joinSortedKeys(values map[string]struct{}) string { - if len(values) == 0 { - return "" - } - keys := make([]string, 0, len(values)) - for key := range values { - keys = append(keys, key) - } - sort.Strings(keys) - return strings.Join(keys, "/") -} - -func humanizeMB(totalMB int) string { - if totalMB <= 0 { - return "" - } - gb := float64(totalMB) / 1024.0 - if gb >= 1024.0 { - tb := gb / 1024.0 - return fmt.Sprintf("%.1f TB", tb) - } - if gb == float64(int64(gb)) { - return fmt.Sprintf("%.0f GB", gb) - } - return fmt.Sprintf("%.1f GB", gb) -} - -func humanizeGB(totalGB int) string { - if totalGB <= 0 { - return "" - } - tb := float64(totalGB) / 1024.0 - if tb >= 1.0 { - return fmt.Sprintf("%.1f TB", tb) - } - return fmt.Sprintf("%d GB", totalGB) -} - -func parseKeyValueSummary(raw string) map[string]string { - out := map[string]string{} - for _, line := range strings.Split(raw, "\n") { - line = strings.TrimSpace(line) - if line == "" { - continue - } - key, value, ok := strings.Cut(line, "=") - if !ok { - continue - } - out[strings.TrimSpace(key)] = strings.TrimSpace(value) - } - return out -} - -func firstNonEmpty(values ...string) string { - for _, value := range values { - value = strings.TrimSpace(value) - if value != "" { - return value - } - } - return "" -} - -func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) { - 
return a.installer.ListInstallDisks() -} - -func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error { - return a.installer.InstallToDisk(ctx, device, logFile) -} - -func formatSATDetail(raw string) string { - var b strings.Builder - kv := parseKeyValueSummary(raw) - - if t, ok := kv["run_at_utc"]; ok { - fmt.Fprintf(&b, "Run: %s\n\n", t) - } - - lines := strings.Split(raw, "\n") - var stepKeys []string - seenStep := map[string]bool{} - for _, line := range lines { - if idx := strings.Index(line, "_status="); idx >= 0 { - key := line[:idx] - if !seenStep[key] && key != "overall" { - seenStep[key] = true - stepKeys = append(stepKeys, key) - } - } - } - - for _, key := range stepKeys { - status := kv[key+"_status"] - display := cleanSummaryKey(key) - switch status { - case "OK": - fmt.Fprintf(&b, "PASS %s\n", display) - case "FAILED": - fmt.Fprintf(&b, "FAIL %s\n", display) - case "UNSUPPORTED": - fmt.Fprintf(&b, "SKIP %s\n", display) - default: - fmt.Fprintf(&b, "? 
%s\n", display) - } - } - - if overall, ok := kv["overall_status"]; ok { - ok2 := kv["job_ok"] - failed := kv["job_failed"] - fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed) - } - - return strings.TrimSpace(b.String()) -} - -func cleanSummaryKey(key string) string { - idx := strings.Index(key, "-") - if idx <= 0 { - return key - } - prefix := key[:idx] - for _, c := range prefix { - if c < '0' || c > '9' { - return key - } - } - return key[idx+1:] -} diff --git a/audit/internal/app/app_format.go b/audit/internal/app/app_format.go new file mode 100644 index 0000000..b9ad60e --- /dev/null +++ b/audit/internal/app/app_format.go @@ -0,0 +1,405 @@ +package app + +import ( + "fmt" + "os" + "path/filepath" + "sort" + "strings" + + "bee/audit/internal/collector" + "bee/audit/internal/platform" + "bee/audit/internal/schema" +) + +func hostnameOr(fallback string) string { + hn, err := os.Hostname() + if err != nil || strings.TrimSpace(hn) == "" { + return fallback + } + return hn +} + +func sanitizeFilename(v string) string { + var out []rune + for _, r := range v { + switch { + case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9', r == '-', r == '_', r == '.': + out = append(out, r) + default: + out = append(out, '-') + } + } + if len(out) == 0 { + return "unknown" + } + return string(out) +} + +func bodyOr(body, fallback string) string { + body = strings.TrimSpace(body) + if body == "" { + return fallback + } + return body +} + +func trimPtr(value *string) string { + if value == nil { + return "" + } + return strings.TrimSpace(*value) +} + +func joinSortedKeys(values map[string]struct{}) string { + if len(values) == 0 { + return "" + } + keys := make([]string, 0, len(values)) + for key := range values { + keys = append(keys, key) + } + sort.Strings(keys) + return strings.Join(keys, "/") +} + +func humanizeMB(totalMB int) string { + if totalMB <= 0 { + return "" + } + gb := float64(totalMB) / 1024.0 + if gb >= 1024.0 { + tb := gb / 
// cleanSummaryKey strips a leading all-digit ordering prefix ("NN-") from a
// summary key. Keys without a dash, keys starting with a dash, and keys whose
// prefix contains anything other than ASCII digits are returned unchanged.
func cleanSummaryKey(key string) string {
	digits, rest, found := strings.Cut(key, "-")
	if !found || digits == "" {
		return key
	}
	notDigit := func(r rune) bool { return r < '0' || r > '9' }
	if strings.IndexFunc(digits, notDigit) >= 0 {
		return key
	}
	return rest
}
+ return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID +} + +func formatSystemLine(board schema.HardwareBoard) string { + model := strings.TrimSpace(strings.Join([]string{ + trimPtr(board.Manufacturer), + trimPtr(board.ProductName), + }, " ")) + serial := strings.TrimSpace(board.SerialNumber) + switch { + case model != "" && serial != "": + return fmt.Sprintf("System: %s | S/N %s", model, serial) + case model != "": + return "System: " + model + case serial != "": + return "System S/N: " + serial + default: + return "" + } +} + +func formatCPULine(cpus []schema.HardwareCPU) string { + if len(cpus) == 0 { + return "" + } + modelCounts := map[string]int{} + unknown := 0 + for _, cpu := range cpus { + model := trimPtr(cpu.Model) + if model == "" { + unknown++ + continue + } + modelCounts[model]++ + } + if len(modelCounts) == 1 && unknown == 0 { + for model, count := range modelCounts { + return fmt.Sprintf("CPU: %d x %s", count, model) + } + } + parts := make([]string, 0, len(modelCounts)+1) + if len(modelCounts) > 0 { + keys := make([]string, 0, len(modelCounts)) + for key := range modelCounts { + keys = append(keys, key) + } + sort.Strings(keys) + for _, key := range keys { + parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key)) + } + } + if unknown > 0 { + parts = append(parts, fmt.Sprintf("%d x unknown", unknown)) + } + return "CPU: " + strings.Join(parts, ", ") +} + +func formatMemoryLine(dimms []schema.HardwareMemory) string { + totalMB := 0 + present := 0 + types := map[string]struct{}{} + for _, dimm := range dimms { + if dimm.Present != nil && !*dimm.Present { + continue + } + if dimm.SizeMB == nil || *dimm.SizeMB <= 0 { + continue + } + present++ + totalMB += *dimm.SizeMB + if value := trimPtr(dimm.Type); value != "" { + types[value] = struct{}{} + } + } + if totalMB == 0 { + return "" + } + typeText := joinSortedKeys(types) + line := fmt.Sprintf("Memory: %s", humanizeMB(totalMB)) + if typeText != "" { + line += " " + 
typeText + } + if present > 0 { + line += fmt.Sprintf(" (%d DIMMs)", present) + } + return line +} + +func formatStorageLine(disks []schema.HardwareStorage) string { + count := 0 + totalGB := 0 + for _, disk := range disks { + if disk.Present != nil && !*disk.Present { + continue + } + count++ + if disk.SizeGB != nil && *disk.SizeGB > 0 { + totalGB += *disk.SizeGB + } + } + if count == 0 { + return "" + } + line := fmt.Sprintf("Storage: %d drives", count) + if totalGB > 0 { + line += fmt.Sprintf(" / %s", humanizeGB(totalGB)) + } + return line +} + +func formatGPULine(devices []schema.HardwarePCIeDevice) string { + gpus := map[string]int{} + for _, dev := range devices { + if !isGPUDevice(dev) { + continue + } + name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown") + gpus[name]++ + } + if len(gpus) == 0 { + return "" + } + keys := make([]string, 0, len(gpus)) + for key := range gpus { + keys = append(keys, key) + } + sort.Strings(keys) + parts := make([]string, 0, len(keys)) + for _, key := range keys { + parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key)) + } + return "GPU: " + strings.Join(parts, ", ") +} + +func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string { + if list == nil { + return "" + } + ifaces, err := list() + if err != nil { + return "" + } + seen := map[string]struct{}{} + var ips []string + for _, iface := range ifaces { + for _, ip := range iface.IPv4 { + ip = strings.TrimSpace(ip) + if ip == "" { + continue + } + if _, ok := seen[ip]; ok { + continue + } + seen[ip] = struct{}{} + ips = append(ips, ip) + } + } + if len(ips) == 0 { + return "" + } + sort.Strings(ips) + return "IP: " + strings.Join(ips, ", ") +} + +func formatSATDetail(raw string) string { + var b strings.Builder + kv := parseKeyValueSummary(raw) + + if t, ok := kv["run_at_utc"]; ok { + fmt.Fprintf(&b, "Run: %s\n\n", t) + } + + lines := strings.Split(raw, "\n") + var stepKeys []string + seenStep := map[string]bool{} + for _, line 
:= range lines { + if idx := strings.Index(line, "_status="); idx >= 0 { + key := line[:idx] + if !seenStep[key] && key != "overall" { + seenStep[key] = true + stepKeys = append(stepKeys, key) + } + } + } + + for _, key := range stepKeys { + status := kv[key+"_status"] + display := cleanSummaryKey(key) + switch status { + case "OK": + fmt.Fprintf(&b, "PASS %s\n", display) + case "FAILED": + fmt.Fprintf(&b, "FAIL %s\n", display) + case "UNSUPPORTED": + fmt.Fprintf(&b, "SKIP %s\n", display) + default: + fmt.Fprintf(&b, "? %s\n", display) + } + } + + if overall, ok := kv["overall_status"]; ok { + ok2 := kv["job_ok"] + failed := kv["job_failed"] + fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed) + } + + return strings.TrimSpace(b.String()) +} + +func formatSATSummary(label, raw string) string { + values := parseKeyValueSummary(raw) + var body strings.Builder + fmt.Fprintf(&body, "%s:", label) + if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" { + fmt.Fprintf(&body, " %s", overall) + } + if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" { + fmt.Fprintf(&body, " ok=%s", ok) + } + if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" { + fmt.Fprintf(&body, " failed=%s", failed) + } + if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" { + fmt.Fprintf(&body, " unsupported=%s", unsupported) + } + if devices := strings.TrimSpace(values["devices"]); devices != "" { + fmt.Fprintf(&body, "\nDevices: %s", devices) + } + return body.String() +} + +func latestSATSummaries() []string { + patterns := []struct { + label string + prefix string + }{ + {label: "NVIDIA SAT", prefix: "gpu-nvidia-"}, + {label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"}, + {label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"}, + {label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", 
prefix: "gpu-nvidia-targeted-power-"}, + {label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"}, + {label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"}, + {label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"}, + {label: "Memory SAT", prefix: "memory-"}, + {label: "Storage SAT", prefix: "storage-"}, + {label: "CPU SAT", prefix: "cpu-"}, + } + var out []string + for _, item := range patterns { + matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt")) + if err != nil || len(matches) == 0 { + continue + } + sort.Strings(matches) + raw, err := os.ReadFile(matches[len(matches)-1]) + if err != nil { + continue + } + out = append(out, formatSATSummary(item.label, string(raw))) + } + return out +} diff --git a/audit/internal/app/app_install.go b/audit/internal/app/app_install.go new file mode 100644 index 0000000..c5783e2 --- /dev/null +++ b/audit/internal/app/app_install.go @@ -0,0 +1,76 @@ +package app + +import ( + "context" + "fmt" + "os" + "path/filepath" + "time" + + "bee/audit/internal/platform" +) + +func (a *App) ListRemovableTargets() ([]platform.RemovableTarget, error) { + return a.exports.ListRemovableTargets() +} + +func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error) { + if _, err := os.Stat(DefaultAuditJSONPath); err != nil { + return "", err + } + filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405")) + tmpPath := filepath.Join(os.TempDir(), filename) + data, err := readFileLimited(DefaultAuditJSONPath, 100<<20) + if err != nil { + return "", err + } + if normalized, normErr := ApplySATOverlay(data); normErr == nil { + data = normalized + } + if err := os.WriteFile(tmpPath, data, 0644); err != nil { + return "", err + } + defer os.Remove(tmpPath) + return a.exports.ExportFileToTarget(tmpPath, target) +} + +func (a *App) 
ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) { + path, err := a.ExportLatestAudit(target) + body := "Audit export failed." + if err == nil { + body = "Audit exported." + } + if err == nil && path != "" { + body = "Audit exported to " + path + } + return ActionResult{Title: "Export audit", Body: body}, err +} + +func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, error) { + archive, err := BuildSupportBundle(DefaultExportDir) + if err != nil { + return "", err + } + defer os.Remove(archive) + return a.exports.ExportFileToTarget(archive, target) +} + +func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) { + path, err := a.ExportSupportBundle(target) + body := "Support bundle export failed." + if err == nil { + body = "Support bundle exported. USB target unmounted and safe to remove." + } + if err == nil && path != "" { + body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove." 
+ } + return ActionResult{Title: "Export support bundle", Body: body}, err +} + +func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) { + return a.installer.ListInstallDisks() +} + +func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error { + return a.installer.InstallToDisk(ctx, device, logFile) +} diff --git a/audit/internal/app/app_network.go b/audit/internal/app/app_network.go new file mode 100644 index 0000000..f6f05bd --- /dev/null +++ b/audit/internal/app/app_network.go @@ -0,0 +1,106 @@ +package app + +import ( + "fmt" + "strings" + + "bee/audit/internal/platform" +) + +func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) { + return a.network.ListInterfaces() +} + +func (a *App) DefaultRoute() string { + return a.network.DefaultRoute() +} + +func (a *App) DHCPOne(iface string) (string, error) { + return a.network.DHCPOne(iface) +} + +func (a *App) DHCPOneResult(iface string) (ActionResult, error) { + body, err := a.network.DHCPOne(iface) + return ActionResult{Title: "DHCP: " + iface, Body: bodyOr(body, "DHCP completed.")}, err +} + +func (a *App) DHCPAll() (string, error) { + return a.network.DHCPAll() +} + +func (a *App) DHCPAllResult() (ActionResult, error) { + body, err := a.network.DHCPAll() + return ActionResult{Title: "DHCP: all interfaces", Body: bodyOr(body, "DHCP completed.")}, err +} + +func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) { + return a.network.SetStaticIPv4(cfg) +} + +func (a *App) SetInterfaceState(iface string, up bool) error { + return a.network.SetInterfaceState(iface, up) +} + +func (a *App) GetInterfaceState(iface string) (bool, error) { + return a.network.GetInterfaceState(iface) +} + +func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) { + return a.network.CaptureNetworkSnapshot() +} + +func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error { + return a.network.RestoreNetworkSnapshot(snapshot) +} + +func (a 
*App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) { + body, err := a.network.SetStaticIPv4(cfg) + return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err +} + +func (a *App) NetworkStatus() (ActionResult, error) { + ifaces, err := a.network.ListInterfaces() + if err != nil { + return ActionResult{Title: "Network status"}, err + } + if len(ifaces) == 0 { + return ActionResult{Title: "Network status", Body: "No physical interfaces found."}, nil + } + var body strings.Builder + for _, iface := range ifaces { + ipv4 := "(no IPv4)" + if len(iface.IPv4) > 0 { + ipv4 = strings.Join(iface.IPv4, ", ") + } + fmt.Fprintf(&body, "- %s: state=%s ip=%s\n", iface.Name, iface.State, ipv4) + } + if gw := a.network.DefaultRoute(); gw != "" { + fmt.Fprintf(&body, "\nDefault route: %s\n", gw) + } + return ActionResult{Title: "Network status", Body: strings.TrimSpace(body.String())}, nil +} + +func (a *App) DefaultStaticIPv4FormFields(iface string) []string { + return []string{ + "", + "24", + strings.TrimSpace(a.network.DefaultRoute()), + "77.88.8.8 77.88.8.1 1.1.1.1 8.8.8.8", + } +} + +func (a *App) ParseStaticIPv4Config(iface string, fields []string) platform.StaticIPv4Config { + get := func(index int) string { + if index >= 0 && index < len(fields) { + return strings.TrimSpace(fields[index]) + } + return "" + } + return platform.StaticIPv4Config{ + Interface: iface, + Address: get(0), + Prefix: get(1), + Gateway: get(2), + DNS: strings.Fields(get(3)), + } +} diff --git a/audit/internal/app/app_packs.go b/audit/internal/app/app_packs.go new file mode 100644 index 0000000..59ffc4c --- /dev/null +++ b/audit/internal/app/app_packs.go @@ -0,0 +1,370 @@ +package app + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + + "bee/audit/internal/platform" +) + +func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + 
baseDir = DefaultSATBaseDir + } + return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc) +} + +func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) { + path, err := a.RunNvidiaAcceptancePack(baseDir, nil) + body := "Archive written." + if path != "" { + body = "Archive written to " + path + } + return ActionResult{Title: "NVIDIA SAT", Body: body}, err +} + +func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) { + return a.sat.ListNvidiaGPUs() +} + +func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) { + return a.sat.ListNvidiaGPUStatuses() +} + +func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) { + out, err := a.sat.ResetNvidiaGPU(index) + return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err +} + +func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc) + body := "Archive written." 
+ if path != "" { + body = "Archive written to " + path + } + return ActionResult{Title: "NVIDIA DCGM", Body: body}, err +} + +func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc) +} + +func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { + return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc) +} + +func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { + return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc) +} + +func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultBeeBenchPerfDir + } + resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc) + if err != nil { + return "", err + } + opts.ServerPowerSource = resolved.SelectedSource + return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc) +} + +func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultBeeBenchPowerDir + } + resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc) + if err != nil { + return "", err + } + opts.ServerPowerSource = resolved.SelectedSource + return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc) +} + +func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind 
string, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultBeeBenchAutotuneDir + } + return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc) +} + +func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) { + return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath) +} + +func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) { + cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir) + if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil { + if logFunc != nil { + logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource)) + } + return *cfg, nil + } + if logFunc != nil { + logFunc("benchmark autotune: no saved power source config, running autotune first") + } + autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune") + if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil { + return platform.BenchmarkPowerAutotuneConfig{}, err + } + cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath) + if err != nil { + return platform.BenchmarkPowerAutotuneConfig{}, err + } + return *cfg, nil +} + +func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc) +} + +func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = 
DefaultSATBaseDir + } + return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc) +} + +func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc) +} + +func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc) +} + +func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc) +} + +func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) { + return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc) +} + +func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc) +} + +func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) { + path, err := a.RunMemoryAcceptancePack(baseDir, nil) + return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err +} + +func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) { + return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc) +} + +func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, 
durationSec int, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc) +} + +func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) { + path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil) + return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err +} + +func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) { + return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc) +} + +func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc) +} + +func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) { + path, err := a.RunStorageAcceptancePack(baseDir, nil) + return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err +} + +func (a *App) DetectGPUVendor() string { + return a.sat.DetectGPUVendor() +} + +func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) { + return a.sat.ListAMDGPUs() +} + +func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) { + return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc) +} + +func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc) +} + +func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) { + path, err := a.RunAMDAcceptancePack(baseDir, nil) + return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err +} + +func (a *App) 
RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc) +} + +func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc) +} + +func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) { + return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc) +} + +func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) { + return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc) +} + +func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) { + return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc) +} + +func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) { + return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc) +} + +func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) { + return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc) +} + +func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc) +} + +func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return 
a.sat.RunFanStressTest(ctx, baseDir, opts) +} + +func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc) +} + +func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc) +} + +func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) { + path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil) + body := "Results: " + path + if err != nil && err != context.Canceled { + body += "\nERROR: " + err.Error() + } + return ActionResult{Title: "NCCL bandwidth test", Body: body}, err +} + +func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) { + path, err := a.RunFanStressTest(ctx, "", opts) + body := formatFanStressResult(path) + if err != nil && err != context.Canceled { + body += "\nERROR: " + err.Error() + } + return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err +} + +// formatFanStressResult formats the summary.txt from a fan-stress run, including +// the per-step pass/fail display and the analysis section (throttling, max temps, fan response). +func formatFanStressResult(archivePath string) string { + if archivePath == "" { + return "No output produced." + } + runDir := strings.TrimSuffix(archivePath, ".tar.gz") + raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt")) + if err != nil { + return "Archive written to " + archivePath + } + content := strings.TrimSpace(string(raw)) + kv := parseKeyValueSummary(content) + + var b strings.Builder + b.WriteString(formatSATDetail(content)) + + // Append analysis section. 
+ var analysis []string + if v, ok := kv["throttling_detected"]; ok { + label := "NO" + if v == "true" { + label = "YES ← throttling detected during load" + } + analysis = append(analysis, "Throttling: "+label) + } + if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" { + analysis = append(analysis, "Max GPU temp: "+v+"°C") + } + if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" { + analysis = append(analysis, "Max CPU temp: "+v+"°C") + } + if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" { + analysis = append(analysis, "Fan response: "+v+"s") + } + + if len(analysis) > 0 { + b.WriteString("\n\n=== Analysis ===\n") + for _, line := range analysis { + b.WriteString(line + "\n") + } + } + return strings.TrimSpace(b.String()) +} + +// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz) +// and returns a formatted human-readable result. Falls back to a plain message if unreadable. +func satResultBody(archivePath string) string { + if archivePath == "" { + return "No output produced." 
+ } + runDir := strings.TrimSuffix(archivePath, ".tar.gz") + raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt")) + if err != nil { + return "Archive written to " + archivePath + } + return formatSATDetail(strings.TrimSpace(string(raw))) +} diff --git a/audit/internal/app/app_services.go b/audit/internal/app/app_services.go new file mode 100644 index 0000000..ecf9f3a --- /dev/null +++ b/audit/internal/app/app_services.go @@ -0,0 +1,67 @@ +package app + +import ( + "fmt" + "strings" + + "bee/audit/internal/platform" +) + +func (a *App) ListBeeServices() ([]string, error) { + return a.services.ListBeeServices() +} + +func (a *App) ServiceState(name string) string { + return a.services.ServiceState(name) +} + +func (a *App) ServiceStatus(name string) (string, error) { + return a.services.ServiceStatus(name) +} + +func (a *App) ServiceStatusResult(name string) (ActionResult, error) { + body, err := a.services.ServiceStatus(name) + return ActionResult{Title: "service status: " + name, Body: bodyOr(body, "No status output.")}, err +} + +func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, error) { + return a.services.ServiceDo(name, action) +} + +func (a *App) ServiceActionResult(name string, action platform.ServiceAction) (ActionResult, error) { + body, err := a.services.ServiceDo(name, action) + return ActionResult{Title: "service " + string(action) + ": " + name, Body: bodyOr(body, "Action completed.")}, err +} + +func (a *App) TailFile(path string, lines int) string { + return a.tools.TailFile(path, lines) +} + +func (a *App) CheckTools(names []string) []platform.ToolStatus { + return a.tools.CheckTools(names) +} + +func (a *App) ToolCheckResult(names []string) ActionResult { + if len(names) == 0 { + return ActionResult{Title: "Required tools", Body: "No tools checked."} + } + var body strings.Builder + for _, tool := range a.tools.CheckTools(names) { + status := "MISSING" + if tool.OK { + status = "OK (" + tool.Path + ")" + } + 
fmt.Fprintf(&body, "- %s: %s\n", tool.Name, status) + } + return ActionResult{Title: "Required tools", Body: strings.TrimSpace(body.String())} +} + +func (a *App) AuditLogTailResult() ActionResult { + logTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditLogPath, 40)) + jsonTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditJSONPath, 20)) + body := strings.TrimSpace(logTail + "\n\n" + jsonTail) + if body == "" { + body = "No audit logs found." + } + return ActionResult{Title: "Audit log tail", Body: body} +} diff --git a/audit/internal/app/sat_overlay.go b/audit/internal/app/sat_overlay.go index 12b46f0..c4d8675 100644 --- a/audit/internal/app/sat_overlay.go +++ b/audit/internal/app/sat_overlay.go @@ -3,10 +3,11 @@ package app import ( "os" "path/filepath" - "strconv" "sort" + "strconv" "strings" + "bee/audit/internal/collector" "bee/audit/internal/schema" ) @@ -313,17 +314,20 @@ func statusSeverity(status string) int { } func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool { - if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") { - if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") { - return false - } + if dev.DeviceClass == nil { + return false + } + class := strings.TrimSpace(*dev.DeviceClass) + isGPUClass := strings.Contains(class, "Controller") || strings.Contains(class, "Accelerator") || + strings.Contains(class, "Display") || strings.Contains(class, "Video") + if !isGPUClass { + return false } - manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer))) switch vendor { case "amd": - return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati") + return dev.VendorID != nil && *dev.VendorID == collector.AMDVendorID case "nvidia": - return 
strings.Contains(manufacturer, "nvidia") + return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID default: return false } diff --git a/audit/internal/app/sat_overlay_test.go b/audit/internal/app/sat_overlay_test.go index 1370e5c..85ad8ad 100644 --- a/audit/internal/app/sat_overlay_test.go +++ b/audit/internal/app/sat_overlay_test.go @@ -5,6 +5,7 @@ import ( "path/filepath" "testing" + "bee/audit/internal/collector" "bee/audit/internal/schema" ) @@ -46,10 +47,12 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) { class := "DisplayController" manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]" + amdVendorID := collector.AMDVendorID snap := schema.HardwareSnapshot{ PCIeDevices: []schema.HardwarePCIeDevice{{ DeviceClass: &class, Manufacturer: &manufacturer, + VendorID: &amdVendorID, }}, } diff --git a/audit/internal/collector/amdgpu.go b/audit/internal/collector/amdgpu.go index 59ad6af..3ed8abe 100644 --- a/audit/internal/collector/amdgpu.go +++ b/audit/internal/collector/amdgpu.go @@ -84,11 +84,10 @@ func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool { } func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool { - if dev.Manufacturer == nil || dev.DeviceClass == nil { + if dev.DeviceClass == nil { return false } - manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer)) - return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass)) + return dev.VendorID != nil && *dev.VendorID == AMDVendorID && isGPUClass(strings.TrimSpace(*dev.DeviceClass)) } func queryAMDGPUs() (map[string]amdGPUInfo, error) { diff --git a/audit/internal/collector/nic_mellanox.go b/audit/internal/collector/nic_mellanox.go index 3db3427..20215ab 100644 --- a/audit/internal/collector/nic_mellanox.go +++ b/audit/internal/collector/nic_mellanox.go @@ -11,7 +11,6 @@ import ( "time" ) -const mellanoxVendorID = 0x15b3 const nicProbeTimeout = 2 * time.Second var ( @@ -80,16 +79,7 @@ func 
enrichPCIeWithMellanox(devs []schema.HardwarePCIeDevice) []schema.HardwareP } func isMellanoxDevice(dev schema.HardwarePCIeDevice) bool { - if dev.VendorID != nil && *dev.VendorID == mellanoxVendorID { - return true - } - if dev.Manufacturer != nil { - m := strings.ToLower(*dev.Manufacturer) - if strings.Contains(m, "mellanox") || strings.Contains(m, "nvidia networking") { - return true - } - } - return false + return dev.VendorID != nil && *dev.VendorID == MellanoxVendorID } func queryMellanoxFromMstflint(bdf string) (firmware, serial string) { diff --git a/audit/internal/collector/nic_mellanox_test.go b/audit/internal/collector/nic_mellanox_test.go index 89ae2ad..009b10b 100644 --- a/audit/internal/collector/nic_mellanox_test.go +++ b/audit/internal/collector/nic_mellanox_test.go @@ -55,7 +55,7 @@ func TestEnrichPCIeWithMellanox_mstflint(t *testing.T) { } netIfacesByBDF = func(string) []string { return nil } - vendorID := mellanoxVendorID + vendorID := MellanoxVendorID bdf := "0000:18:00.0" manufacturer := "Mellanox Technologies" devs := []schema.HardwarePCIeDevice{{ @@ -99,7 +99,7 @@ func TestEnrichPCIeWithMellanox_fallbackEthtool(t *testing.T) { return "driver: mlx5_core\nfirmware-version: 28.40.1000\n", nil } - vendorID := mellanoxVendorID + vendorID := MellanoxVendorID bdf := "0000:18:00.0" manufacturer := "NVIDIA Networking" devs := []schema.HardwarePCIeDevice{{ diff --git a/audit/internal/collector/nvidia.go b/audit/internal/collector/nvidia.go index 246aafb..25639d5 100644 --- a/audit/internal/collector/nvidia.go +++ b/audit/internal/collector/nvidia.go @@ -10,8 +10,6 @@ import ( "strings" ) -const nvidiaVendorID = 0x10de - type nvidiaGPUInfo struct { Index int BDF string @@ -240,13 +238,7 @@ func normalizePCIeBDF(bdf string) string { } func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool { - if dev.VendorID != nil && *dev.VendorID == nvidiaVendorID { - return true - } - if dev.Manufacturer != nil && strings.Contains(strings.ToLower(*dev.Manufacturer), 
"nvidia") { - return true - } - return false + return dev.VendorID != nil && *dev.VendorID == NvidiaVendorID } func setPCIeFallback(dev *schema.HardwarePCIeDevice) { diff --git a/audit/internal/collector/nvidia_test.go b/audit/internal/collector/nvidia_test.go index 320dc8f..781d574 100644 --- a/audit/internal/collector/nvidia_test.go +++ b/audit/internal/collector/nvidia_test.go @@ -57,7 +57,7 @@ func TestNormalizePCIeBDF(t *testing.T) { } func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) { - vendorID := nvidiaVendorID + vendorID := NvidiaVendorID bdf := "0000:65:00.0" manufacturer := "NVIDIA Corporation" status := "OK" @@ -104,7 +104,7 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) { } func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) { - vendorID := nvidiaVendorID + vendorID := NvidiaVendorID bdf := "0000:17:00.0" manufacturer := "NVIDIA Corporation" devices := []schema.HardwarePCIeDevice{ diff --git a/audit/internal/collector/pci_vendors.go b/audit/internal/collector/pci_vendors.go new file mode 100644 index 0000000..796fcb4 --- /dev/null +++ b/audit/internal/collector/pci_vendors.go @@ -0,0 +1,11 @@ +package collector + +// PCI vendor IDs for hardware classification. 
+// Source: https://pcisig.com / https://pci-ids.ucw.cz/ +const ( + NvidiaVendorID = 0x10de + AMDVendorID = 0x1002 + AspeedVendorID = 0x1a03 + MellanoxVendorID = 0x15b3 + IntelVendorID = 0x8086 +) diff --git a/audit/internal/collector/sensors.go b/audit/internal/collector/sensors.go index 8a8c4f3..0a448b2 100644 --- a/audit/internal/collector/sensors.go +++ b/audit/internal/collector/sensors.go @@ -58,7 +58,6 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors { for _, chip := range chips { features := doc[chip] - location := sensorLocation(chip) keys := make([]string, 0, len(features)) for key := range features { @@ -80,25 +79,25 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors { } switch classifySensorFeature(feature) { case "fan": - item := buildFanSensor(name, location, feature) + item := buildFanSensor(name, feature) if item == nil || duplicateSensor(seen, "fan", item.Name) { continue } result.Fans = append(result.Fans, *item) case "temp": - item := buildTempSensor(name, location, feature) + item := buildTempSensor(name, feature) if item == nil || duplicateSensor(seen, "temp", item.Name) { continue } result.Temperatures = append(result.Temperatures, *item) case "power": - item := buildPowerSensor(name, location, feature) + item := buildPowerSensor(name, feature) if item == nil || duplicateSensor(seen, "power", item.Name) { continue } result.Power = append(result.Power, *item) default: - item := buildOtherSensor(name, location, feature) + item := buildOtherSensor(name, feature) if item == nil || duplicateSensor(seen, "other", item.Name) { continue } @@ -128,14 +127,6 @@ func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool { return false } -func sensorLocation(chip string) *string { - chip = strings.TrimSpace(chip) - if chip == "" { - return nil - } - return &chip -} - func classifySensorFeature(feature map[string]any) string { for key := range feature { switch { @@ -154,24 +145,24 @@ func 
classifySensorFeature(feature map[string]any) string { return "other" } -func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor { +func buildFanSensor(name string, feature map[string]any) *schema.HardwareFanSensor { rpm, ok := firstFeatureInt(feature, "_input") if !ok { return nil } - item := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &rpm} + item := &schema.HardwareFanSensor{Name: name, RPM: &rpm} if status := sensorStatusFromFeature(feature); status != nil { item.Status = status } return item } -func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor { +func buildTempSensor(name string, feature map[string]any) *schema.HardwareTemperatureSensor { celsius, ok := firstFeatureFloat(feature, "_input") if !ok { return nil } - item := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &celsius} + item := &schema.HardwareTemperatureSensor{Name: name, Celsius: &celsius} if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok { item.ThresholdWarningCelsius = &warning } @@ -186,8 +177,8 @@ func buildTempSensor(name string, location *string, feature map[string]any) *sch return item } -func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor { - item := &schema.HardwarePowerSensor{Name: name, Location: location} +func buildPowerSensor(name string, feature map[string]any) *schema.HardwarePowerSensor { + item := &schema.HardwarePowerSensor{Name: name} if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok { item.PowerW = &v } @@ -206,12 +197,12 @@ func buildPowerSensor(name string, location *string, feature map[string]any) *sc return item } -func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor { +func buildOtherSensor(name string, feature map[string]any) *schema.HardwareOtherSensor { 
value, unit, ok := firstGenericSensorValue(feature) if !ok { return nil } - item := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &value} + item := &schema.HardwareOtherSensor{Name: name, Value: &value} if unit != "" { item.Unit = &unit } diff --git a/audit/internal/platform/network.go b/audit/internal/platform/network.go index c6a86fb..4a818ae 100644 --- a/audit/internal/platform/network.go +++ b/audit/internal/platform/network.go @@ -258,7 +258,7 @@ func (s *System) GetInterfaceState(iface string) (bool, error) { func interfaceAdminState(iface string) (bool, error) { raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output() if err != nil { - return false, err + return false, fmt.Errorf("ip link show dev %s: %w", iface, err) } return parseInterfaceAdminState(string(raw)) } @@ -288,7 +288,7 @@ func interfaceIPv4Addrs(iface string) ([]string, error) { if errors.As(err, &exitErr) { return nil, nil } - return nil, err + return nil, fmt.Errorf("ip addr show dev %s: %w", iface, err) } var ipv4 []string for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") { diff --git a/audit/internal/schema/hardware.go b/audit/internal/schema/hardware.go index 2699563..d12bf8e 100644 --- a/audit/internal/schema/hardware.go +++ b/audit/internal/schema/hardware.go @@ -2,6 +2,8 @@ // core/internal/ingest/parser_hardware.go. No import dependency on core. package schema +import "encoding/json" + // HardwareIngestRequest is the top-level output document produced by `bee audit`. // It is accepted as-is by the core /api/ingest/hardware endpoint. 
type HardwareIngestRequest struct { @@ -64,9 +66,10 @@ type HardwareSnapshot struct { Storage []HardwareStorage `json:"storage,omitempty"` PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"` PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"` - Sensors *HardwareSensors `json:"sensors,omitempty"` - EventLogs []HardwareEventLog `json:"event_logs,omitempty"` - VROCLicense *string `json:"vroc_license,omitempty"` + Sensors *HardwareSensors `json:"sensors,omitempty"` + EventLogs []HardwareEventLog `json:"event_logs,omitempty"` + PlatformConfig *json.RawMessage `json:"platform_config,omitempty"` + VROCLicense *string `json:"vroc_license,omitempty"` } type HardwareHealthSummary struct { @@ -123,7 +126,7 @@ type HardwareCPU struct { type HardwareMemory struct { HardwareComponentStatus Slot *string `json:"slot,omitempty"` - Location *string `json:"location,omitempty"` + Location *string `json:"-"` // internal: used for DIMM telemetry matching only Present *bool `json:"present,omitempty"` SizeMB *int `json:"size_mb,omitempty"` Type *string `json:"type,omitempty"` @@ -261,15 +264,13 @@ type HardwareSensors struct { } type HardwareFanSensor struct { - Name string `json:"name"` - Location *string `json:"location,omitempty"` - RPM *int `json:"rpm,omitempty"` - Status *string `json:"status,omitempty"` + Name string `json:"name"` + RPM *int `json:"rpm,omitempty"` + Status *string `json:"status,omitempty"` } type HardwarePowerSensor struct { Name string `json:"name"` - Location *string `json:"location,omitempty"` VoltageV *float64 `json:"voltage_v,omitempty"` CurrentA *float64 `json:"current_a,omitempty"` PowerW *float64 `json:"power_w,omitempty"` @@ -278,7 +279,6 @@ type HardwarePowerSensor struct { type HardwareTemperatureSensor struct { Name string `json:"name"` - Location *string `json:"location,omitempty"` Celsius *float64 `json:"celsius,omitempty"` ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"` 
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"` @@ -286,11 +286,10 @@ type HardwareTemperatureSensor struct { } type HardwareOtherSensor struct { - Name string `json:"name"` - Location *string `json:"location,omitempty"` - Value *float64 `json:"value,omitempty"` - Unit *string `json:"unit,omitempty"` - Status *string `json:"status,omitempty"` + Name string `json:"name"` + Value *float64 `json:"value,omitempty"` + Unit *string `json:"unit,omitempty"` + Status *string `json:"status,omitempty"` } type HardwareEventLog struct { diff --git a/audit/internal/webui/page_validate.go b/audit/internal/webui/page_validate.go index 90fcace..6ad26e3 100644 --- a/audit/internal/webui/page_validate.go +++ b/audit/internal/webui/page_validate.go @@ -11,6 +11,13 @@ import ( "bee/audit/internal/schema" ) +// PCI vendor IDs used for GPU classification (source: pci-ids.ucw.cz). +const ( + pciVendorNvidia = 0x10de + pciVendorAMD = 0x1002 + pciVendorAspeed = 0x1a03 +) + type validateInventory struct { CPU string Memory string @@ -634,22 +641,16 @@ func validateFirstNonEmpty(values ...string) string { } func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool { - model := strings.ToLower(validateTrimPtr(dev.Model)) - manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer)) - class := strings.ToLower(validateTrimPtr(dev.DeviceClass)) - if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") { + if dev.VendorID != nil && *dev.VendorID == pciVendorAspeed { return false } + class := strings.ToLower(validateTrimPtr(dev.DeviceClass)) + isGPUClass := class == "videocontroller" || class == "processingaccelerator" || class == "displaycontroller" switch vendor { case "nvidia": - isNVIDIAVendor := strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia") - isGPUClass := class == "videocontroller" || class == "processingaccelerator" || class == "displaycontroller" - return isNVIDIAVendor && 
isGPUClass + return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorNvidia case "amd": - isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller" - isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati") - isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd") - return isGPUClass && (isAMDVendor || isAMDModel) + return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorAMD default: return false } diff --git a/bible-local/architecture/api-surface.md b/bible-local/architecture/api-surface.md new file mode 100644 index 0000000..75626f2 --- /dev/null +++ b/bible-local/architecture/api-surface.md @@ -0,0 +1,185 @@ +# API Surface + +HTTP endpoints exposed by `bee web` (binds `0.0.0.0:80`). +Handler registration: `audit/internal/webui/server.go` → `NewHandler()`. + +--- + +## Health & readiness + +| Method | Path | Description | +|--------|----------------|-----------------------------------------------------| +| GET | `/healthz` | Always 200. Used by load balancers / boot scripts. | +| GET | `/api/ready` | 200 when audit JSON exists and is readable. | +| GET | `/loading` | HTML loading page shown before first audit. | + +--- + +## Audit + +| Method | Path | Description | +|--------|-----------------------|--------------------------------------------------------------| +| GET | `/audit.json` | Latest audit JSON with SAT overlay applied. | +| GET | `/runtime-health.json`| Latest runtime preflight JSON. | +| POST | `/api/audit/run` | Enqueue a full `bee audit` run. Returns task ID. | +| GET | `/api/audit/stream` | SSE: audit run log lines (`data:` + newline per line). | +| GET | `/api/preflight` | Run runtime preflight check (synchronous, returns JSON). 
| +| GET | `/api/hardware-summary` | Hardware health summary (status counts + failures). | +| GET | `/api/components/{type}` | HTML fragment for component detail dialog (e.g. `cpu`, `memory`, `storage`, `pcie`). | + +--- + +## SAT (System Acceptance Testing) + +All SAT run endpoints enqueue an async task. Response: `{"task_id": "..."}`. + +| Method | Path | Description | +|--------|--------------------------------------------|-----------------------------------| +| POST | `/api/sat/nvidia/run` | NVIDIA DCGM SAT | +| POST | `/api/sat/nvidia-targeted-stress/run` | NVIDIA targeted stress validate | +| POST | `/api/sat/nvidia-compute/run` | NVIDIA max compute load | +| POST | `/api/sat/nvidia-targeted-power/run` | NVIDIA targeted power | +| POST | `/api/sat/nvidia-pulse/run` | NVIDIA pulse test | +| POST | `/api/sat/nvidia-interconnect/run` | NCCL all_reduce_perf | +| POST | `/api/sat/nvidia-bandwidth/run` | NVBandwidth test | +| POST | `/api/sat/nvidia-stress/run` | NVIDIA stress pack | +| POST | `/api/sat/memory/run` | Memory acceptance | +| POST | `/api/sat/storage/run` | Storage acceptance (smartctl) | +| POST | `/api/sat/cpu/run` | CPU acceptance (stress-ng) | +| POST | `/api/sat/amd/run` | AMD GPU SAT (ROCm) | +| POST | `/api/sat/amd-mem/run` | AMD memory integrity + bandwidth | +| POST | `/api/sat/amd-bandwidth/run` | AMD memory bandwidth | +| POST | `/api/sat/amd-stress/run` | AMD GPU stress | +| POST | `/api/sat/memory-stress/run` | Memory stress | +| POST | `/api/sat/sat-stress/run` | Combined storage+memory stress | +| POST | `/api/sat/platform-stress/run` | Fan + thermal stress | +| GET | `/api/sat/stream` | SSE: live SAT log stream | +| POST | `/api/sat/abort` | Abort the running SAT task | + +--- + +## Benchmarks + +| Method | Path | Description | +|--------|-----------------------------------------|----------------------------------------------| +| POST | `/api/bee-bench/nvidia/perf/run` | NVIDIA performance benchmark | +| POST | 
`/api/bee-bench/nvidia/power/run` | NVIDIA power benchmark | +| POST | `/api/bee-bench/nvidia/autotune/run` | Power source autotune (prerequisite for benchmarks) | +| GET | `/api/bee-bench/nvidia/autotune/status` | Current autotune result / status | +| GET | `/api/benchmark/results` | List completed benchmark result archives | + +--- + +## Tasks (async job queue) + +| Method | Path | Description | +|--------|-----------------------------|----------------------------------------------------| +| GET | `/api/tasks` | List all tasks with status | +| POST | `/api/tasks/cancel-all` | Cancel all pending/running tasks | +| POST | `/api/tasks/kill-workers` | Force-kill worker goroutines | +| POST | `/api/tasks/{id}/cancel` | Cancel a specific task | +| POST | `/api/tasks/{id}/priority` | Elevate task priority | +| GET | `/api/tasks/{id}/stream` | SSE: live log stream for a task | +| GET | `/api/tasks/{id}/charts` | List chart names for a task | +| GET | `/api/tasks/{id}/chart/` | SVG chart for a task result | +| GET | `/tasks/{id}` | HTML task detail page | + +--- + +## Services + +| Method | Path | Description | +|--------|---------------------------|--------------------------------------------------| +| GET | `/api/services` | List bee-* systemd services and their states | +| POST | `/api/services/action` | start/stop/restart a service | + +--- + +## Network + +| Method | Path | Description | +|--------|----------------------------|-----------------------------------------------------| +| GET | `/api/network` | List interfaces with state and IPv4 addresses | +| POST | `/api/network/dhcp` | Run dhclient on one or all interfaces | +| POST | `/api/network/static` | Set static IPv4 address | +| POST | `/api/network/toggle` | Bring interface up or down | +| POST | `/api/network/confirm` | Confirm pending network change (clears rollback) | +| POST | `/api/network/rollback` | Restore pre-change network snapshot | + +--- + +## Export + +| Method | Path | Description | 
+|--------|-------------------------------|---------------------------------------------------| +| GET | `/export/support.tar.gz` | Download support bundle (live-generated) | +| GET | `/export/file` | Download a file from the export dir by path param | +| GET | `/export/` | Browse export dir (HTML index) | +| GET | `/api/export/list` | JSON list of files in export dir | +| GET | `/api/export/usb` | List removable USB targets available for export | + +--- + +## GPU + +| Method | Path | Description | +|--------|----------------------------|----------------------------------------------------| +| GET | `/api/gpu/presence` | `{"nvidia": bool, "amd": bool}` | +| GET | `/api/gpu/nvidia` | List NVIDIA GPUs from nvidia-smi | +| GET | `/api/gpu/nvidia-status` | Per-GPU status (ECC, power, throttle) | +| POST | `/api/gpu/nvidia-reset` | GPU reset by index | +| GET | `/api/gpu/tools` | nvidia-smi / rocm-smi tool availability | + +--- + +## System + +| Method | Path | Description | +|--------|------------------------------|---------------------------------------------------| +| GET | `/api/system/ram-status` | toram boot state and ISO copy status | +| POST | `/api/system/install-to-ram` | Copy ISO to RAM (background task) | +| GET | `/api/install/disks` | List block devices suitable for disk installation | +| POST | `/api/install/run` | Install bee to disk (background task) | + +--- + +## Tools & NVMe + +| Method | Path | Description | +|--------|-------------------------------|--------------------------------------------------| +| GET | `/api/tools/check` | Check availability of required CLI tools | +| GET | `/api/tools/nvme-formats` | List NVMe format options for a device | +| POST | `/api/tools/nvme-format/run` | Run nvme-format on a device | + +--- + +## Live metrics + +| Method | Path | Description | +|--------|------------------------------|---------------------------------------------------| +| GET | `/api/metrics/stream` | SSE: live metrics (GPU power, temp, 
utilization) | +| GET | `/api/metrics/latest` | Latest metrics snapshot (JSON) | +| GET | `/api/metrics/chart/` | SVG chart for a metric over time | +| GET | `/api/metrics/export.csv` | Download metrics history as CSV | + +--- + +## Blackbox logging + +| Method | Path | Description | +|--------|----------------------------|-----------------------------------------------| +| GET | `/api/blackbox/status` | Blackbox log state (enabled, size, path) | +| POST | `/api/blackbox/enable` | Start recording blackbox log | +| POST | `/api/blackbox/disable` | Stop recording, flush to disk | + +--- + +## UI pages + +| Method | Path | Description | +|--------|------------|-----------------------------------------------| +| GET | `/` | Main dashboard (serves all page routes) | +| GET | `/viewer` | Standalone JSON viewer for uploaded audit files | + +All pages are rendered server-side as HTML. The `/` route handles sub-paths such as +`/network`, `/services`, `/sat`, `/benchmark`, `/install`, `/validate`, `/export`. diff --git a/bible-local/architecture/data-model.md b/bible-local/architecture/data-model.md new file mode 100644 index 0000000..333d009 --- /dev/null +++ b/bible-local/architecture/data-model.md @@ -0,0 +1,137 @@ +# Data Model + +The canonical output of `bee audit` is a `HardwareIngestRequest` JSON document accepted +by the Reanimator `/api/ingest/hardware` endpoint. The ingest endpoint uses a strict +decoder — unknown fields cause `400 Bad Request`. + +Source of truth: `audit/internal/schema/hardware.go` + +--- + +## Top-level: HardwareIngestRequest + +``` +HardwareIngestRequest +├── collected_at string RFC3339 UTC timestamp of collection +├── hardware HardwareSnapshot +├── runtime RuntimeHealth? from bee-runtime-preflight service +├── filename string? +├── source_type string? +├── protocol string? +└── target_host string? +``` + +`collected_at` is the primary sort key used by Reanimator to deduplicate ingests. 
+ +--- + +## HardwareSnapshot + +All component arrays are `omitempty` — absent when the collector finds nothing. + +| JSON key | Go type | Source | +|-------------------|----------------------------|------------------------------| +| `board` | HardwareBoard | dmidecode type 1/2 | +| `firmware` | []HardwareFirmwareRecord | dmidecode type 0/13 | +| `cpus` | []HardwareCPU | dmidecode type 4 | +| `memory` | []HardwareMemory | dmidecode type 17 | +| `storage` | []HardwareStorage | lsblk + nvme-cli + smartctl | +| `pcie_devices` | []HardwarePCIeDevice | lspci | +| `power_supplies` | []HardwarePowerSupply | ipmitool fru + sdr | +| `sensors` | *HardwareSensors | sensors -j | +| `event_logs` | []HardwareEventLog | ipmitool sel + journald | +| `platform_config` | *json.RawMessage | reserved, nil until used | +| `vroc_license` | *string | vroc-cli | + +--- + +## Identity keys + +Reanimator uses these fields to match components across successive audits: + +| Component | Identity key | +|----------------|------------------------------------------------| +| Board | `board.serial_number` (required, never empty) | +| CPU | `serial_number` if present; else generated key | +| Memory DIMM | `serial_number` — absent DIMMs have `present: false` | +| Storage | `serial_number` if present; else `linux_device` from Telemetry | +| PCIe device | `bdf` (Bus:Device.Function address) | +| PSU | `slot` | + +Components without a stable identity are still emitted but may not be matched across runs. + +--- + +## HardwareComponentStatus (embedded in all components) + +```go +type HardwareComponentStatus struct { + Status *string `json:"status,omitempty"` // OK | Warning | Critical | Unknown + ErrorDescription *string `json:"error_description,omitempty"` +} +``` + +Status is set by collectors and overwritten at render time by `ApplySATOverlay` +(latest SAT run results are always merged on top before display). + +--- + +## HardwarePCIeDevice + +The most enriched component type. 
Key fields: + +| JSON key | Meaning | +|----------------------|------------------------------------------------| +| `bdf` | PCI address (identity key), e.g. `0000:4b:00.0` | +| `vendor_id` | Numeric PCI vendor ID (hex). Use this for classification — not `manufacturer`. | +| `device_id` | Numeric PCI device ID (hex) | +| `device_class` | Human-readable class, e.g. `VideoController` | +| `manufacturer` | String label from lspci — for display only | +| `model` | From nvidia-smi / rocm-smi — display name | +| `link_speed` | Current PCIe link speed, e.g. `Gen4` | +| `max_link_speed` | Max negotiated speed | +| `link_width` | Current lane count | +| `max_link_width` | Max lane count | +| `temperature_c` | From nvidia-smi / rocm-smi | +| `power_w` | Current power draw | +| `ecc_uncorrected_total` | Cumulative ECC uncorrected errors (NVIDIA) | +| `ecc_corrected_total` | Cumulative ECC corrected errors (NVIDIA) | +| `hw_slowdown` | HW throttle active (NVIDIA) | +| `telemetry` | Free-form map for vendor-specific extras | + +**Classification rule**: use `vendor_id` (numeric PCI ID), never `manufacturer` string. + +| Vendor | vendor_id | +|-----------|-----------| +| NVIDIA | `0x10de` | +| AMD | `0x1002` | +| Mellanox | `0x15b3` | +| Aspeed | `0x1a03` | +| Intel | `0x8086` | + +Constants live in `audit/internal/collector/pci_vendors.go`. + +--- + +## HardwareMemory + +`location` field exists in the Go struct with `json:"-"` — it is intentionally excluded +from JSON output because the Reanimator schema does not include it. It is used internally +for DIMM telemetry matching only (`collector/memory_telemetry.go`). + +--- + +## HardwareSensors + +Sensor structs (`HardwareFanSensor`, `HardwareTemperatureSensor`, +`HardwarePowerSensor`, `HardwareOtherSensor`) do **not** have a `location` field. +Location was removed in contract v2.8. The Go types mirror the schema exactly. + +--- + +## JSON naming convention + +All JSON keys are `snake_case`. Go field names are `CamelCase`. 
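The mapping is +maintained by struct tags in `audit/internal/schema/hardware.go`. + +All serialized pointer fields use `omitempty` — absent means not collected (not zero). Fields tagged `json:"-"` (such as `HardwareMemory.Location`) are internal and never appear in the JSON output.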