Files
bee/audit/internal/app/app_packs.go
T
Mikhail Chusavitin abaeaea13f add Confidential Computing readiness check + collect nvidia-smi conf-compute -q
New read-only "Check" step reports whether this server can run NVIDIA
Confidential Computing: CPU TEE support (Intel TDX / AMD SEV-SNP, via
dmesg and kvm_amd sysfs params) and GPU firmware CC capability (via
`nvidia-smi conf-compute -q`). Also collect that command's output into
the techdump export bundle.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
2026-07-02 19:18:41 +03:00

387 lines
15 KiB
Go

package app
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"bee/audit/internal/platform"
)
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
}
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
body := "Archive written."
if path != "" {
body = "Archive written to " + path
}
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
}
func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
return a.sat.ListNvidiaGPUs()
}
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
return a.sat.ListNvidiaGPUStatuses()
}
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
out, err := a.sat.ResetNvidiaGPU(index)
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
}
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
body := "Archive written."
if path != "" {
body = "Archive written to " + path
}
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
}
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
}
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
}
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchPerfDir
}
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
if err != nil {
return "", err
}
opts.ServerPowerSource = resolved.SelectedSource
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchPowerDir
}
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
if err != nil {
return "", err
}
opts.ServerPowerSource = resolved.SelectedSource
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultBeeBenchAutotuneDir
}
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
}
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
}
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
if logFunc != nil {
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
}
return *cfg, nil
}
if logFunc != nil {
logFunc("benchmark autotune: no saved power source config, running autotune first")
}
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
return platform.BenchmarkPowerAutotuneConfig{}, err
}
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
if err != nil {
return platform.BenchmarkPowerAutotuneConfig{}, err
}
return *cfg, nil
}
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
}
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
}
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
}
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
}
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
}
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
}
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
}
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
}
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
}
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
}
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
}
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
path, err := a.RunStorageAcceptancePack(baseDir, nil)
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
}
func (a *App) RunConfidentialComputingCheckPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunConfidentialComputingCheckPack(ctx, baseDir, logFunc)
}
func (a *App) RunConfidentialComputingCheckPack(baseDir string, logFunc func(string)) (string, error) {
return a.RunConfidentialComputingCheckPackCtx(context.Background(), baseDir, logFunc)
}
func (a *App) RunConfidentialComputingCheckPackResult(baseDir string) (ActionResult, error) {
path, err := a.RunConfidentialComputingCheckPack(baseDir, nil)
return ActionResult{Title: "Confidential Computing Check", Body: satResultBody(path)}, err
}
func (a *App) DetectGPUVendor() string {
return a.sat.DetectGPUVendor()
}
func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
return a.sat.ListAMDGPUs()
}
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
}
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
}
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
path, err := a.RunAMDAcceptancePack(baseDir, nil)
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
}
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
}
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
}
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
}
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
}
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
}
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
}
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
}
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
}
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunFanStressTest(ctx, baseDir, opts)
}
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
}
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
if strings.TrimSpace(baseDir) == "" {
baseDir = DefaultSATBaseDir
}
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
}
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
body := "Results: " + path
if err != nil && err != context.Canceled {
body += "\nERROR: " + err.Error()
}
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
}
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
path, err := a.RunFanStressTest(ctx, "", opts)
body := formatFanStressResult(path)
if err != nil && err != context.Canceled {
body += "\nERROR: " + err.Error()
}
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
}
// formatFanStressResult formats the summary.txt from a fan-stress run, including
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
func formatFanStressResult(archivePath string) string {
if archivePath == "" {
return "No output produced."
}
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
if err != nil {
return "Archive written to " + archivePath
}
content := strings.TrimSpace(string(raw))
kv := parseKeyValueSummary(content)
var b strings.Builder
b.WriteString(formatSATDetail(content))
// Append analysis section.
var analysis []string
if v, ok := kv["throttling_detected"]; ok {
label := "NO"
if v == "true" {
label = "YES ← throttling detected during load"
}
analysis = append(analysis, "Throttling: "+label)
}
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
analysis = append(analysis, "Max GPU temp: "+v+"°C")
}
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
analysis = append(analysis, "Max CPU temp: "+v+"°C")
}
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
analysis = append(analysis, "Fan response: "+v+"s")
}
if len(analysis) > 0 {
b.WriteString("\n\n=== Analysis ===\n")
for _, line := range analysis {
b.WriteString(line + "\n")
}
}
return strings.TrimSpace(b.String())
}
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
func satResultBody(archivePath string) string {
if archivePath == "" {
return "No output produced."
}
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
if err != nil {
return "Archive written to " + archivePath
}
return formatSATDetail(strings.TrimSpace(string(raw)))
}