A (hardware-ingest-json v2.8-2.9): remove sensor location fields from schema and collector; tag HardwareMemory.Location as json:"-"; add PlatformConfig to HardwareSnapshot. B (no-hardcoded-vendors): consolidate PCI vendor IDs into collector/pci_vendors.go; replace all vendor-name string checks in isGPUDevice, isNVIDIADevice, isMellanoxDevice, isAMDGPUDevice, matchesGPUVendor (sat_overlay), and validateIsVendorGPU (page_validate) with numeric vendor_id comparisons. C (module-structure): split app/app.go (1413 lines) into app.go + app_format.go, app_network.go, app_services.go, app_packs.go, app_install.go — no logic changes. D (go-code-style): wrap bare return err in interfaceAdminState and interfaceIPv4Addrs (platform/network.go) with fmt.Errorf context including the interface name. E (go-project-bible): add bible-local/architecture/data-model.md and bible-local/architecture/api-surface.md. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
371 lines
14 KiB
Go
371 lines
14 KiB
Go
package app
|
|
|
|
import (
	"context"
	"errors"
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"bee/audit/internal/platform"
)
|
|
|
|
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
|
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
|
body := "Archive written."
|
|
if path != "" {
|
|
body = "Archive written to " + path
|
|
}
|
|
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
|
}
|
|
|
|
func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
|
return a.sat.ListNvidiaGPUs()
|
|
}
|
|
|
|
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
|
return a.sat.ListNvidiaGPUStatuses()
|
|
}
|
|
|
|
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
|
out, err := a.sat.ResetNvidiaGPU(index)
|
|
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
|
}
|
|
|
|
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
|
body := "Archive written."
|
|
if path != "" {
|
|
body = "Archive written to " + path
|
|
}
|
|
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
|
}
|
|
|
|
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
|
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
|
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultBeeBenchPerfDir
|
|
}
|
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
opts.ServerPowerSource = resolved.SelectedSource
|
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultBeeBenchPowerDir
|
|
}
|
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
opts.ServerPowerSource = resolved.SelectedSource
|
|
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultBeeBenchAutotuneDir
|
|
}
|
|
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
|
}
|
|
|
|
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
|
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
|
}
|
|
|
|
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
|
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
|
if logFunc != nil {
|
|
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
|
}
|
|
return *cfg, nil
|
|
}
|
|
if logFunc != nil {
|
|
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
|
}
|
|
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
|
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
|
}
|
|
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
|
if err != nil {
|
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
|
}
|
|
return *cfg, nil
|
|
}
|
|
|
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
|
}
|
|
|
|
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
|
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
|
}
|
|
|
|
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
|
}
|
|
|
|
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
|
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
|
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
|
}
|
|
|
|
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
|
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
|
}
|
|
|
|
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
|
}
|
|
|
|
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
|
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
|
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
|
}
|
|
|
|
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
|
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
|
}
|
|
|
|
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
|
}
|
|
|
|
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
|
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
|
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
|
}
|
|
|
|
func (a *App) DetectGPUVendor() string {
|
|
return a.sat.DetectGPUVendor()
|
|
}
|
|
|
|
func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
|
return a.sat.ListAMDGPUs()
|
|
}
|
|
|
|
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
|
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
|
}
|
|
|
|
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
|
}
|
|
|
|
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
|
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
|
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
|
}
|
|
|
|
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
|
}
|
|
|
|
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
|
}
|
|
|
|
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
|
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
|
}
|
|
|
|
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
|
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
|
}
|
|
|
|
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
|
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
|
}
|
|
|
|
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
|
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
|
}
|
|
|
|
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
|
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
|
}
|
|
|
|
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
|
}
|
|
|
|
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
|
}
|
|
|
|
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
|
if strings.TrimSpace(baseDir) == "" {
|
|
baseDir = DefaultSATBaseDir
|
|
}
|
|
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
|
}
|
|
|
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
|
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
|
body := "Results: " + path
|
|
if err != nil && err != context.Canceled {
|
|
body += "\nERROR: " + err.Error()
|
|
}
|
|
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
|
}
|
|
|
|
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
|
path, err := a.RunFanStressTest(ctx, "", opts)
|
|
body := formatFanStressResult(path)
|
|
if err != nil && err != context.Canceled {
|
|
body += "\nERROR: " + err.Error()
|
|
}
|
|
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
|
}
|
|
|
|
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
|
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
|
func formatFanStressResult(archivePath string) string {
|
|
if archivePath == "" {
|
|
return "No output produced."
|
|
}
|
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
|
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
|
if err != nil {
|
|
return "Archive written to " + archivePath
|
|
}
|
|
content := strings.TrimSpace(string(raw))
|
|
kv := parseKeyValueSummary(content)
|
|
|
|
var b strings.Builder
|
|
b.WriteString(formatSATDetail(content))
|
|
|
|
// Append analysis section.
|
|
var analysis []string
|
|
if v, ok := kv["throttling_detected"]; ok {
|
|
label := "NO"
|
|
if v == "true" {
|
|
label = "YES ← throttling detected during load"
|
|
}
|
|
analysis = append(analysis, "Throttling: "+label)
|
|
}
|
|
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
|
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
|
}
|
|
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
|
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
|
}
|
|
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
|
analysis = append(analysis, "Fan response: "+v+"s")
|
|
}
|
|
|
|
if len(analysis) > 0 {
|
|
b.WriteString("\n\n=== Analysis ===\n")
|
|
for _, line := range analysis {
|
|
b.WriteString(line + "\n")
|
|
}
|
|
}
|
|
return strings.TrimSpace(b.String())
|
|
}
|
|
|
|
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
|
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
|
func satResultBody(archivePath string) string {
|
|
if archivePath == "" {
|
|
return "No output produced."
|
|
}
|
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
|
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
|
if err != nil {
|
|
return "Archive written to " + archivePath
|
|
}
|
|
return formatSATDetail(strings.TrimSpace(string(raw)))
|
|
}
|