Compare commits

..

1 Commits

Author SHA1 Message Date
Mikhail Chusavitin abaeaea13f add Confidential Computing readiness check + collect nvidia-smi conf-compute -q
New read-only "Check" step reports whether this server can run NVIDIA
Confidential Computing: CPU TEE support (Intel TDX / AMD SEV-SNP, via
dmesg and kvm_amd sysfs params) and GPU firmware CC capability (via
`nvidia-smi conf-compute -q`). Also collect that command's output into
the techdump export bundle.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
2026-07-02 19:18:41 +03:00
10 changed files with 296 additions and 3 deletions
+1
View File
@@ -134,6 +134,7 @@ type satRunner interface {
ResetNvidiaGPU(index int) (string, error)
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
RunConfidentialComputingCheckPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
DetectGPUVendor() string
+16
View File
@@ -206,6 +206,22 @@ func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, erro
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
}
// RunConfidentialComputingCheckPackCtx runs the Confidential Computing
// readiness check through a.sat, using ctx for the call. A blank
// (empty or whitespace-only) baseDir is replaced by DefaultSATBaseDir. The
// returned string and error are passed through unchanged from a.sat.
func (a *App) RunConfidentialComputingCheckPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
	dir := baseDir
	if strings.TrimSpace(dir) == "" {
		dir = DefaultSATBaseDir
	}
	return a.sat.RunConfidentialComputingCheckPack(ctx, dir, logFunc)
}
// RunConfidentialComputingCheckPack is the context-free variant of
// RunConfidentialComputingCheckPackCtx: it runs the same check with a
// background context, so the call cannot be cancelled by the caller.
func (a *App) RunConfidentialComputingCheckPack(baseDir string, logFunc func(string)) (string, error) {
	ctx := context.Background()
	return a.RunConfidentialComputingCheckPackCtx(ctx, baseDir, logFunc)
}
// RunConfidentialComputingCheckPackResult runs the Confidential Computing
// check without a log callback and wraps the outcome in an ActionResult.
// The result body is built from the returned path via satResultBody even
// when the check fails; the check's error is returned alongside it.
func (a *App) RunConfidentialComputingCheckPackResult(baseDir string) (ActionResult, error) {
	runDir, err := a.RunConfidentialComputingCheckPack(baseDir, nil)
	result := ActionResult{
		Title: "Confidential Computing Check",
		Body:  satResultBody(runDir),
	}
	return result, err
}
// DetectGPUVendor returns the GPU vendor string reported by a.sat; it adds
// no logic of its own.
func (a *App) DetectGPUVendor() string {
return a.sat.DetectGPUVendor()
}
+4
View File
@@ -243,6 +243,10 @@ func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ b
return f.runStorageFn(baseDir)
}
// RunConfidentialComputingCheckPack is the fakeSAT stub for the Confidential
// Computing check. It ignores every argument and always reports success with
// an empty result path.
func (f fakeSAT) RunConfidentialComputingCheckPack(_ context.Context, baseDir string, _ func(string)) (string, error) {
return "", nil
}
func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
if f.runCPUFn != nil {
return f.runCPUFn(baseDir, durationSec)
@@ -0,0 +1,248 @@
package platform
import (
"bytes"
"context"
"fmt"
"os"
"path/filepath"
"strings"
"time"
)
// ConfidentialComputingStatus summarizes whether this server can run NVIDIA
// Confidential Computing: CPU-side TEE support (Intel TDX / AMD SEV-SNP) and
// GPU firmware CC capability, as reported by `nvidia-smi conf-compute -q`.
//
// It is filled in by RunConfidentialComputingCheckPack and rendered into the
// summary.txt and confidential-computing-report.txt files of a run.
type ConfidentialComputingStatus struct {
// CollectedAt is the UTC time at which the check was run.
CollectedAt time.Time `json:"collected_at"`
// GPU-reported fields, parsed from `nvidia-smi conf-compute -q`.
// NvidiaSMIAvailable is true when that command ran without error; the
// string fields below hold the values exactly as printed by the command and
// stay empty when it failed or did not print the field.
NvidiaSMIAvailable bool `json:"nvidia_smi_available"`
CCState string `json:"cc_state,omitempty"` // ON / OFF
MultiGPUMode string `json:"multi_gpu_mode,omitempty"` // Protected PCIe / ...
CPUCCCapability string `json:"cpu_cc_capability,omitempty"` // e.g. "INTEL TDX", "AMD SEV-SNP", "NONE"
GPUCCCapability string `json:"gpu_cc_capability,omitempty"` // e.g. "CC Capable", "Not Capable"
CCGPUsReadyState string `json:"cc_gpus_ready_state,omitempty"` // Ready / Not Ready
// Host-side evidence that the CPU's TEE is actually active in the running
// kernel (BIOS + kernel cmdline + firmware), independent of what the GPU
// driver reports. These flags come from dmesg and the kvm_amd module
// parameters under /sys/module/kvm_amd/parameters, and are OR-ed with the
// driver-reported CPU capability when computing CPUCanRunCC, so they also
// cover the case where the NVIDIA driver isn't loaded.
HostAMDSEVSupported bool `json:"host_amd_sev_supported"`
HostAMDSEVESSupported bool `json:"host_amd_sev_es_supported"`
HostAMDSEVSNPActive bool `json:"host_amd_sev_snp_active"`
HostIntelTDXActive bool `json:"host_intel_tdx_active"`
// GPUCanRunCC is true when the GPU firmware reports CC-capable, i.e.
// GPUCCCapability equals "CC Capable" (case-insensitive).
GPUCanRunCC bool `json:"gpu_can_run_cc"`
// CPUCanRunCC is true when either the GPU driver or the host kernel
// reports an active/available CPU TEE (SEV-SNP or TDX).
CPUCanRunCC bool `json:"cpu_can_run_cc"`
// Ready is true when both the CPU and the GPU support Confidential
// Computing, regardless of whether CC mode is currently enabled.
Ready bool `json:"ready"`
// Notes carries human-readable remarks, e.g. why GPU data is missing.
Notes []string `json:"notes,omitempty"`
}
// RunConfidentialComputingCheckPack runs a read-only check of whether this
// server can run NVIDIA Confidential Computing: it queries the GPU driver
// (`nvidia-smi conf-compute -q`) and inspects host kernel/dmesg evidence of
// AMD SEV-SNP / Intel TDX support. It changes nothing on the system apart
// from writing its own result files.
//
// A nil ctx is replaced by context.Background() and an empty baseDir by
// "/var/log/bee-sat". Results go into a fresh directory
// <baseDir>/confidential-computing-<UTC timestamp> containing:
//
//   - verbose.log (path handed to runSATCommandCtx for every command)
//   - 01-nvidia-smi-conf-compute-q.log
//   - 02-dmesg-cc-relevant.log
//   - 03-NN-sysfs-<param>.log for the kvm_amd sev, sev_es and sev_snp params
//   - summary.txt and confidential-computing-report.txt
//
// logFunc is forwarded only to the nvidia-smi invocation. The run directory
// path is returned on success. An error is returned only when the directory
// or one of the two final report files cannot be written; a failing probe
// command is not an error, it simply leaves the matching status fields unset.
func (s *System) RunConfidentialComputingCheckPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}

	// Sample the clock once so the directory name and CollectedAt can never
	// disagree (e.g. when the two reads would straddle a second boundary).
	now := time.Now().UTC()
	runDir := filepath.Join(baseDir, "confidential-computing-"+now.Format("20060102-150405"))
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", err
	}
	verboseLog := filepath.Join(runDir, "verbose.log")
	status := ConfidentialComputingStatus{CollectedAt: now}

	// GPU firmware / driver state.
	ccOut, ccErr := runSATCommandCtx(ctx, verboseLog, "nvidia-smi-conf-compute-q", []string{"nvidia-smi", "conf-compute", "-q"}, nil, logFunc)
	// Per-command logs are best-effort evidence; failing to write one must
	// not abort the check, so the error is deliberately ignored.
	_ = os.WriteFile(filepath.Join(runDir, "01-nvidia-smi-conf-compute-q.log"), ccOut, 0644)
	if ccErr == nil {
		status.NvidiaSMIAvailable = true
		fields := parseConfComputeFields(ccOut)
		status.CCState = fields["CC State"]
		status.MultiGPUMode = fields["Multi-GPU Mode"]
		status.CPUCCCapability = fields["CPU CC Capabilities"]
		status.GPUCCCapability = fields["GPU CC Capabilities"]
		status.CCGPUsReadyState = fields["CC GPUs Ready State"]
	} else {
		status.Notes = append(status.Notes, "nvidia-smi conf-compute -q unavailable (no NVIDIA driver, or GPU not present): "+firstLine(string(ccOut)))
	}

	// Host kernel evidence, independent of the GPU driver.
	dmesgOut, _ := runSATCommandCtx(ctx, verboseLog, "dmesg", []string{"dmesg"}, nil, nil)
	ccDmesgLines := filterConfComputeDmesgLines(dmesgOut)
	// Best-effort log, see above.
	_ = os.WriteFile(filepath.Join(runDir, "02-dmesg-cc-relevant.log"), []byte(strings.Join(ccDmesgLines, "\n")+"\n"), 0644)
	lowerDmesg := strings.ToLower(strings.Join(ccDmesgLines, "\n"))
	status.HostAMDSEVSNPActive = strings.Contains(lowerDmesg, "sev-snp enabled")
	status.HostIntelTDXActive = (strings.Contains(lowerDmesg, "tdx module") && strings.Contains(lowerDmesg, "module initialized")) ||
		strings.Contains(lowerDmesg, "virt/tdx: module initialized")

	// kvm_amd exposes these parameters as bools ("Y"/"N") on current
	// kernels, but older kernels declared sev/sev_es as ints and print
	// "1"/"0". Accept both spellings of "enabled".
	paramEnabled := func(v string) bool {
		return strings.EqualFold(v, "Y") || v == "1"
	}
	for i, path := range []string{
		"/sys/module/kvm_amd/parameters/sev",
		"/sys/module/kvm_amd/parameters/sev_es",
		"/sys/module/kvm_amd/parameters/sev_snp",
	} {
		param := filepath.Base(path)
		name := fmt.Sprintf("sysfs-%s", param)
		out, err := runSATCommandCtx(ctx, verboseLog, name, []string{"cat", path}, nil, nil)
		// Best-effort log, see above.
		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("03-%02d-%s.log", i+1, name)), out, 0644)
		if err != nil {
			// Parameter absent (module not loaded, non-AMD host, old
			// kernel): leave the corresponding flag at its current value.
			continue
		}
		val := strings.TrimSpace(string(out))
		switch param {
		case "sev":
			status.HostAMDSEVSupported = paramEnabled(val)
		case "sev_es":
			status.HostAMDSEVESSupported = paramEnabled(val)
		case "sev_snp":
			// Only ever raise the flag: a dmesg hit must not be undone
			// by the module parameter.
			if paramEnabled(val) {
				status.HostAMDSEVSNPActive = true
			}
		}
	}

	status.GPUCanRunCC = strings.EqualFold(strings.TrimSpace(status.GPUCCCapability), "CC Capable")
	cpuCapReported := strings.TrimSpace(status.CPUCCCapability)
	status.CPUCanRunCC = status.HostAMDSEVSNPActive || status.HostIntelTDXActive ||
		(cpuCapReported != "" && !strings.EqualFold(cpuCapReported, "NONE"))
	status.Ready = status.CPUCanRunCC && status.GPUCanRunCC
	if !status.NvidiaSMIAvailable {
		status.Notes = append(status.Notes, "GPU CC capability unknown — install the NVIDIA driver to query it with `nvidia-smi conf-compute -q`.")
	}

	// Unlike the per-command logs, the two reports are the product of this
	// check, so failing to write either one is a hard error.
	summary := renderConfidentialComputingSummary(status)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
		return "", err
	}
	report := renderConfidentialComputingReport(status)
	if err := os.WriteFile(filepath.Join(runDir, "confidential-computing-report.txt"), []byte(report), 0644); err != nil {
		return "", err
	}
	return runDir, nil
}
// parseConfComputeFields turns the indented "Key : Value" listing printed by
// `nvidia-smi conf-compute -q` into a map, e.g.:
//
//	CC State                   : OFF
//	Multi-GPU Mode             : Protected PCIe
//	CPU CC Capabilities        : INTEL TDX
//	GPU CC Capabilities        : CC Capable
//	CC GPUs Ready State        : Not Ready
//
// Every line is split at its first colon and both halves are trimmed of
// surrounding whitespace. Lines without a colon, or with an empty key or
// value, are skipped. When a key occurs more than once the last value wins.
// The returned map is never nil.
func parseConfComputeFields(out []byte) map[string]string {
	parsed := make(map[string]string)
	for _, row := range strings.Split(string(out), "\n") {
		rawKey, rawVal, found := strings.Cut(row, ":")
		if !found {
			continue
		}
		k, v := strings.TrimSpace(rawKey), strings.TrimSpace(rawVal)
		if k != "" && v != "" {
			parsed[k] = v
		}
	}
	return parsed
}
// filterConfComputeDmesgLines returns the dmesg lines relevant to CPU
// Confidential Computing support (AMD SEV/SEV-ES/SEV-SNP, Intel TDX), in
// their original order and spelling.
//
// Matching is case-insensitive. A line is kept when it contains "tdx"
// anywhere, or "sev" as a standalone token, i.e. not directly preceded or
// followed by another ASCII letter. "SEV-SNP", "sev_es", "kvm_amd.sev=1" and
// "SEV enabled" therefore match, while unrelated words such as "severity"
// (ubiquitous in PCIe AER messages) or "several" do not. The result is nil
// when nothing matches.
func filterConfComputeDmesgLines(dmesgOut []byte) []string {
	var lines []string
	for _, raw := range bytes.Split(dmesgOut, []byte("\n")) {
		text := string(raw)
		lower := strings.ToLower(text)
		if containsStandaloneToken(lower, "sev") || strings.Contains(lower, "tdx") {
			lines = append(lines, text)
		}
	}
	return lines
}

// containsStandaloneToken reports whether the lowercase string s contains
// the non-empty lowercase token at a position where it is not glued to an
// ASCII letter on either side.
func containsStandaloneToken(s, token string) bool {
	isLetter := func(c byte) bool { return c >= 'a' && c <= 'z' }
	for offset := 0; offset < len(s); {
		i := strings.Index(s[offset:], token)
		if i < 0 {
			return false
		}
		start := offset + i
		end := start + len(token)
		leftOK := start == 0 || !isLetter(s[start-1])
		rightOK := end == len(s) || !isLetter(s[end])
		if leftOK && rightOK {
			return true
		}
		// Embedded in a longer word; keep looking further right.
		offset = start + 1
	}
	return false
}
// renderConfidentialComputingSummary renders status as "key=value" lines,
// one per field, in a fixed order. The timestamp is formatted as RFC 3339,
// booleans as true/false, and the final overall_status line is OK when
// status.Ready is set and NOT_READY otherwise.
func renderConfidentialComputingSummary(status ConfidentialComputingStatus) string {
	overall := "NOT_READY"
	if status.Ready {
		overall = "OK"
	}

	// Order matters: it is the order of the lines in the output.
	entries := []struct {
		key   string
		value any
	}{
		{"run_at_utc", status.CollectedAt.Format(time.RFC3339)},
		{"nvidia_smi_available", status.NvidiaSMIAvailable},
		{"cc_state", status.CCState},
		{"multi_gpu_mode", status.MultiGPUMode},
		{"cpu_cc_capability", status.CPUCCCapability},
		{"gpu_cc_capability", status.GPUCCCapability},
		{"cc_gpus_ready_state", status.CCGPUsReadyState},
		{"host_amd_sev_supported", status.HostAMDSEVSupported},
		{"host_amd_sev_es_supported", status.HostAMDSEVESSupported},
		{"host_amd_sev_snp_active", status.HostAMDSEVSNPActive},
		{"host_intel_tdx_active", status.HostIntelTDXActive},
		{"cpu_can_run_cc", status.CPUCanRunCC},
		{"gpu_can_run_cc", status.GPUCanRunCC},
		{"ready", status.Ready},
		{"overall_status", overall},
	}

	var sb strings.Builder
	for _, e := range entries {
		fmt.Fprintf(&sb, "%s=%v\n", e.key, e.value)
	}
	return sb.String()
}
// renderConfidentialComputingReport renders status as a human-readable text
// report: a banner, the READY / NOT READY verdict, a CPU section, a GPU
// section, an optional Notes section (only when status.Notes is non-empty)
// and the collection time. Blank string fields are shown as "unknown".
func renderConfidentialComputingReport(status ConfidentialComputingStatus) string {
	rule := strings.Repeat("=", 80)
	orUnknown := func(v string) string { return nonEmptyOr(v, "unknown") }

	var verdict string
	if status.Ready {
		verdict = "READY"
	} else {
		verdict = "NOT READY"
	}

	var out strings.Builder

	// Banner and verdict.
	out.WriteString(rule + "\n")
	out.WriteString("Confidential Computing Readiness\n")
	out.WriteString(rule + "\n\n")
	fmt.Fprintf(&out, "Verdict: %s\n\n", verdict)

	// CPU section.
	out.WriteString("-- CPU ----------------------------------------------------------------------\n")
	fmt.Fprintf(&out, " Reported by GPU driver : %s\n", orUnknown(status.CPUCCCapability))
	fmt.Fprintf(&out, " AMD SEV supported : %t\n", status.HostAMDSEVSupported)
	fmt.Fprintf(&out, " AMD SEV-ES supported : %t\n", status.HostAMDSEVESSupported)
	fmt.Fprintf(&out, " AMD SEV-SNP active : %t\n", status.HostAMDSEVSNPActive)
	fmt.Fprintf(&out, " Intel TDX active : %t\n", status.HostIntelTDXActive)
	fmt.Fprintf(&out, " Can run CC : %t\n\n", status.CPUCanRunCC)

	// GPU section.
	out.WriteString("-- GPU ----------------------------------------------------------------------\n")
	fmt.Fprintf(&out, " nvidia-smi available : %t\n", status.NvidiaSMIAvailable)
	fmt.Fprintf(&out, " GPU CC Capabilities : %s\n", orUnknown(status.GPUCCCapability))
	fmt.Fprintf(&out, " CC State (current) : %s\n", orUnknown(status.CCState))
	fmt.Fprintf(&out, " Multi-GPU Mode : %s\n", orUnknown(status.MultiGPUMode))
	fmt.Fprintf(&out, " CC GPUs Ready State : %s\n", orUnknown(status.CCGPUsReadyState))
	fmt.Fprintf(&out, " Can run CC : %t\n\n", status.GPUCanRunCC)

	// Notes section, omitted entirely when there is nothing to say.
	if len(status.Notes) != 0 {
		out.WriteString("-- Notes ----------------------------------------------------------------------\n")
		for i := range status.Notes {
			fmt.Fprintf(&out, " - %s\n", status.Notes[i])
		}
		out.WriteString("\n")
	}

	// The layout prints a literal "UTC" suffix; it does not convert the time.
	collected := status.CollectedAt.Format("2006-01-02 15:04:05 UTC")
	fmt.Fprintf(&out, "Collected : %s\n", collected)
	out.WriteString(rule + "\n")
	return out.String()
}
// nonEmptyOr returns v unchanged unless it is empty or whitespace-only, in
// which case it returns fallback. v is not trimmed when it is returned.
func nonEmptyOr(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}
+1
View File
@@ -38,6 +38,7 @@ var techDumpNvidiaCommands = []struct {
}{
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
{Name: "nvidia-smi", Args: []string{"conf-compute", "-q"}, File: "nvidia-smi-conf-compute-q.txt"},
}
type lsblkDumpRoot struct {
+1 -1
View File
@@ -135,7 +135,7 @@ func defaultTaskPriority(target string, params taskParams) int {
return taskPriorityBurn
case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "memory", "storage", "cpu",
"amd", "amd-mem", "amd-bandwidth":
"amd", "amd-mem", "amd-bandwidth", "confidential-computing":
if params.StressMode {
return taskPriorityValidateStress
}
+8 -2
View File
@@ -676,6 +676,12 @@ func renderCheck(opts HandlerOptions) string {
`<code>lsblk</code>; NVMe: <code>nvme id-ctrl</code>, <code>nvme smart-log</code>; SATA/SAS: <code>smartctl -H -A</code>`,
`Seconds — instantaneous device query, no wear counters incremented.`,
)) +
renderSATCard("confidential-computing", "Confidential Computing", "runSAT('confidential-computing')", "", renderValidateCardBody(
inv.NVIDIA,
`Checks whether this server can run NVIDIA Confidential Computing: CPU TEE support (Intel TDX / AMD SEV-SNP) and GPU firmware CC capability. Read-only — changes nothing.`,
`<code>nvidia-smi conf-compute -q</code>, <code>dmesg</code>, <code>/sys/module/kvm_amd/parameters/*</code>`,
`Seconds — read-only query only.`,
)) +
`</div>
<div style="height:1px;background:var(--border);margin:16px 0"></div>
<div class="card" style="margin-bottom:16px">
@@ -737,7 +743,7 @@ func renderCheck(opts HandlerOptions) string {
<script>
let satES = null;
function satLabels() {
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth', 'confidential-computing':'Check Confidential Computing'};
}
let satNvidiaGPUsPromise = null;
function loadSatNvidiaGPUs() {
@@ -873,7 +879,7 @@ function runAllCheckSAT() {
status.textContent = 'Enqueuing...';
const nvidiaIndices = satSelectedGPUIndices();
const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
const baseTargets = ['cpu', 'memory', 'storage'];
const baseTargets = ['cpu', 'memory', 'storage', 'confidential-computing'];
const amdTargets = selectedAMDValidateTargets();
const expanded = [];
baseTargets.forEach(t => expanded.push({target: t}));
+1
View File
@@ -264,6 +264,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
mux.HandleFunc("POST /api/sat/confidential-computing/run", h.handleAPISATRun("confidential-computing"))
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
+6
View File
@@ -272,6 +272,12 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
break
}
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
case "confidential-computing":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runConfidentialComputingCheckPackCtx(a, ctx, "", j.append)
case "cpu":
if a == nil {
err = fmt.Errorf("app not configured")
+10
View File
@@ -45,6 +45,7 @@ var taskNames = map[string]string{
"nvidia-stress": "NVIDIA GPU Stress",
"memory": "Memory SAT",
"storage": "Storage SAT",
"confidential-computing": "Confidential Computing Check",
"cpu": "CPU SAT",
"amd": "AMD GPU SAT",
"amd-mem": "AMD GPU MEM Integrity",
@@ -312,6 +313,9 @@ var (
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
}
runConfidentialComputingCheckPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
return a.RunConfidentialComputingCheckPackCtx(ctx, baseDir, logFunc)
}
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
}
@@ -1025,6 +1029,12 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
break
}
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
case "confidential-computing":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runConfidentialComputingCheckPackCtx(a, ctx, "", j.append)
case "cpu":
if a == nil {
err = fmt.Errorf("app not configured")