gpu topology: detect cross-NUMA-only paths between GPU pairs

Flags GPUs that, according to `nvidia-smi topo -m`, reach one or more peer GPUs only via a SYS-class PCIe hop (one that crosses the CPU/NUMA-node boundary). On servers where GPUs are bridged only pairwise with NVLink bridges (no switched NVLink fabric), this is exactly the path that traffic between different bridge pairs has to cross, and it can cut multi-GPU throughput by 2x or more for workloads spanning more than one pair.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
add Confidential Computing readiness check + collect nvidia-smi conf-compute -q
2026-07-03 12:22:04 +03:00 · 2026-07-02 19:18:41 +03:00 · 2026-07-02 12:14:20 +03:00
16 changed files with 588 additions and 37 deletions
@@ -134,6 +134,7 @@ type satRunner interface {
 	ResetNvidiaGPU(index int) (string, error)
 	RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
 	RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
+	RunConfidentialComputingCheckPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
 	RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
 	ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
 	DetectGPUVendor() string
@@ -206,6 +206,22 @@ func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, erro
 	return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
 }

+// RunConfidentialComputingCheckPackCtx runs the Confidential Computing
+// readiness check through the configured SAT runner. An empty or
+// whitespace-only baseDir selects DefaultSATBaseDir; any other value is
+// passed through unchanged. It returns whatever the runner returns.
+func (a *App) RunConfidentialComputingCheckPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	dir := baseDir
+	if len(strings.TrimSpace(dir)) == 0 {
+		dir = DefaultSATBaseDir
+	}
+	return a.sat.RunConfidentialComputingCheckPack(ctx, dir, logFunc)
+}
+
+// RunConfidentialComputingCheckPack is the context-free variant of
+// RunConfidentialComputingCheckPackCtx: it runs the same check with
+// context.Background().
+func (a *App) RunConfidentialComputingCheckPack(baseDir string, logFunc func(string)) (string, error) {
+	ctx := context.Background()
+	return a.RunConfidentialComputingCheckPackCtx(ctx, baseDir, logFunc)
+}
+
+// RunConfidentialComputingCheckPackResult runs the check without a log
+// callback and wraps the returned path in an ActionResult. The ActionResult
+// is populated even when the returned error is non-nil.
+func (a *App) RunConfidentialComputingCheckPackResult(baseDir string) (ActionResult, error) {
+	path, err := a.RunConfidentialComputingCheckPack(baseDir, nil)
+	result := ActionResult{
+		Title: "Confidential Computing Check",
+		Body:  satResultBody(path),
+	}
+	return result, err
+}
+
 func (a *App) DetectGPUVendor() string {
 	return a.sat.DetectGPUVendor()
 }
@@ -243,6 +243,10 @@ func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ b
 	return f.runStorageFn(baseDir)
 }

+// RunConfidentialComputingCheckPack is a stub: it ignores every argument and
+// reports success with an empty path.
+func (fakeSAT) RunConfidentialComputingCheckPack(context.Context, string, func(string)) (string, error) {
+	return "", nil
+}
+
 func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
 	if f.runCPUFn != nil {
 		return f.runCPUFn(baseDir, durationSec)
@@ -41,6 +41,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
 	snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
 	snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
+	snap.PCIeDevices = enrichGPUCrossNUMATopology(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
 	snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
@@ -2,6 +2,7 @@ package collector

 import (
 	"bee/audit/internal/schema"
+	"fmt"
 	"log/slog"
 	"os/exec"
 	"regexp"
@@ -119,26 +120,12 @@ func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
 	return parseNVIDIATopologyMatrix(string(out)), nil
 }

-// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
-// nvidia-smi topo -m matrix.
-//
-// Format (abbreviated):
-//
-//	         GPU0  GPU1 ... NIC0 NIC1
-//	GPU0      X   NV18 ... NODE NODE
-//	GPU1     NV18   X  ... NODE NODE
-//	NIC0     NODE  NODE...   X   PIX
-//
-// The header row starts with "GPU0"; its columns may include non-GPU entries
-// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
-// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
-func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
-	lines := strings.Split(raw, "\n")
-
-	// Locate the header line and record which column indices are GPU columns.
-	headerIdx := -1
-	var gpuColIndices []int // 0-based indices within fields (excluding the row label)
-	var gpuCount int
+// locateGPUTopologyColumns finds the header line of a nvidia-smi topo -m
+// matrix and the 0-based field indices (excluding the row label) that
+// correspond to GPU columns. Returns headerIdx=-1 if fewer than 2 GPU columns
+// are found.
+func locateGPUTopologyColumns(lines []string) (headerIdx int, gpuColIndices []int, gpuCount int) {
+	headerIdx = -1
 	for i, line := range lines {
 		trimmed := strings.TrimSpace(line)
 		if strings.HasPrefix(trimmed, "GPU0") {
@@ -155,7 +142,30 @@ func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
 			break
 		}
 	}
-	if headerIdx < 0 || gpuCount == 0 {
+	if headerIdx < 0 {
+		gpuColIndices = nil
+		gpuCount = 0
+	}
+	return headerIdx, gpuColIndices, gpuCount
+}
+
+// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
+// nvidia-smi topo -m matrix.
+//
+// Format (abbreviated):
+//
+//	         GPU0  GPU1 ... NIC0 NIC1
+//	GPU0      X   NV18 ... NODE NODE
+//	GPU1     NV18   X  ... NODE NODE
+//	NIC0     NODE  NODE...   X   PIX
+//
+// The header row starts with "GPU0"; its columns may include non-GPU entries
+// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
+// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
+func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
+	lines := strings.Split(raw, "\n")
+	headerIdx, gpuColIndices, gpuCount := locateGPUTopologyColumns(lines)
+	if headerIdx < 0 {
 		return nvlinkTopoResult{}
 	}

@@ -204,3 +214,110 @@ func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
 		MinNVLinks: minLinks,
 	}
 }
+
+// parseCrossNUMAPeers reads an `nvidia-smi topo -m` matrix and collects, for
+// each GPU row, the peer GPUs whose cell is "SYS" (compared
+// case-insensitively). "SYS" means the pair's only path traverses PCIe as
+// well as the SMP interconnect between NUMA nodes (e.g. QPI/UPI) — the
+// slowest possible GPU-GPU path. On servers where GPUs are bridged only
+// pairwise via NVLink bridge (no switched NVLink fabric) it is exactly the
+// hop that traffic between different bridge pairs has to cross.
+//
+// The result maps a GPU index to the peer GPU indices reachable only via
+// that cross-NUMA path. It is nil when no header is located or when no SYS
+// cell is found.
+func parseCrossNUMAPeers(raw string) map[int][]int {
+	lines := strings.Split(raw, "\n")
+	headerIdx, gpuColIndices, _ := locateGPUTopologyColumns(lines)
+	if headerIdx < 0 {
+		return nil
+	}
+
+	// Header column position -> GPU ordinal, following header order.
+	gpuAtCol := make(map[int]int, len(gpuColIndices))
+	for ordinal, col := range gpuColIndices {
+		gpuAtCol[col] = ordinal
+	}
+
+	result := make(map[int][]int)
+	row := -1
+	for _, line := range lines[headerIdx+1:] {
+		text := strings.TrimSpace(line)
+		if !strings.HasPrefix(text, "GPU") {
+			continue
+		}
+		// Rows are numbered by order of appearance; the label's own number
+		// is not parsed.
+		row++
+		fields := strings.Fields(text)
+		for _, col := range gpuColIndices {
+			// +1 skips the row label that precedes the data cells.
+			cell := col + 1
+			peer := gpuAtCol[col]
+			if cell < len(fields) && peer != row && strings.EqualFold(fields[cell], "SYS") {
+				result[row] = append(result[row], peer)
+			}
+		}
+	}
+	if len(result) > 0 {
+		return result
+	}
+	return nil
+}
+
+// enrichGPUCrossNUMATopology marks every GPU that has at least one peer GPU
+// reachable only across the NUMA-node boundary ("SYS" in nvidia-smi topo -m).
+// Unlike enrichNVLinkBridgesWithGPUTopo it does not need an NVLink bridge
+// PCIe device to be present: the weak point it looks for is the path
+// *between* NVLink-bridged pairs (or between GPUs with no NVLink at all), not
+// the bridge itself, so it applies to any multi-GPU box.
+//
+// For each affected device it stores the peer list under the
+// "nvlink_cross_numa_peers" telemetry key, replaces a nil or OK status with
+// a warning (any other status is kept), and sets ErrorDescription when none
+// is set yet. devs is modified in place and returned. It is returned
+// untouched when the GPU query fails or yields fewer than two GPUs, when
+// nvidia-smi topo fails, or when no SYS pairs are found.
+func enrichGPUCrossNUMATopology(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
+	gpuByBDF, err := queryNVIDIAGPUs()
+	if err != nil || len(gpuByBDF) < 2 {
+		return devs
+	}
+
+	topoOut, err := exec.Command("nvidia-smi", "topo", "-m").Output()
+	if err != nil {
+		slog.Info("gpu-topology: nvidia-smi topo unavailable, skipping cross-NUMA check", "err", err)
+		return devs
+	}
+	crossNUMA := parseCrossNUMAPeers(string(topoOut))
+	if len(crossNUMA) == 0 {
+		return devs
+	}
+
+	gpuIndexByBDF := make(map[string]int, len(gpuByBDF))
+	for bdf, gpu := range gpuByBDF {
+		gpuIndexByBDF[bdf] = gpu.Index
+	}
+
+	for i := range devs {
+		dev := &devs[i]
+		if dev.BDF == nil {
+			continue
+		}
+		gpuIdx, isGPU := gpuIndexByBDF[normalizePCIeBDF(*dev.BDF)]
+		if !isGPU {
+			continue
+		}
+		peerList, affected := crossNUMA[gpuIdx]
+		if !affected {
+			continue
+		}
+
+		if dev.Telemetry == nil {
+			dev.Telemetry = map[string]any{}
+		}
+		dev.Telemetry["nvlink_cross_numa_peers"] = peerList
+
+		// Only a missing or OK status is overwritten.
+		if dev.Status == nil || *dev.Status == statusOK {
+			escalated := statusWarning
+			dev.Status = &escalated
+		}
+		if dev.ErrorDescription == nil {
+			msg := fmt.Sprintf(
+				"GPU %d reaches GPU(s) %v only via a cross-NUMA-node PCIe path (SYS) — expect reduced bandwidth/increased latency for tensor-parallel workloads spanning these GPUs",
+				gpuIdx, peerList)
+			dev.ErrorDescription = stringPtr(msg)
+		}
+	}
+
+	slog.Info("gpu-topology: cross-NUMA peers detected", "affected_gpus", len(crossNUMA))
+	return devs
+}
@@ -80,6 +80,42 @@ func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
 	}
 }

+// TestParseCrossNUMAPeersDetectsSYS checks the complete peer map for a 4-GPU
+// matrix, so a regression in any row (not just GPU0) is caught.
+func TestParseCrossNUMAPeersDetectsSYS(t *testing.T) {
+	t.Parallel()
+
+	// 4-GPU box, two NVLink-bridged pairs (GPU0-GPU1, GPU2-GPU3); the pairs
+	// themselves only reach each other via SYS (cross-NUMA PCIe hop) — the
+	// exact topology of a server using pairwise NVLink bridge cards instead
+	// of a switched NVLink fabric.
+	input := `	GPU0	GPU1	GPU2	GPU3
+GPU0	 X 	NV4	SYS	SYS
+GPU1	NV4	 X 	SYS	SYS
+GPU2	SYS	SYS	 X 	NV4
+GPU3	SYS	SYS	NV4	 X
+`
+	peers := parseCrossNUMAPeers(input)
+
+	// Every GPU sees exactly the two GPUs of the other bridge pair via SYS,
+	// listed in column order.
+	want := map[int][]int{
+		0: {2, 3},
+		1: {2, 3},
+		2: {0, 1},
+		3: {0, 1},
+	}
+	if len(peers) != len(want) {
+		t.Fatalf("peers=%v want %v", peers, want)
+	}
+	for gpu, wantPeers := range want {
+		got := peers[gpu]
+		if len(got) != len(wantPeers) {
+			t.Fatalf("peers[%d]=%v want %v", gpu, got, wantPeers)
+		}
+		for i := range wantPeers {
+			if got[i] != wantPeers[i] {
+				t.Fatalf("peers[%d]=%v want %v", gpu, got, wantPeers)
+			}
+		}
+	}
+}
+
+func TestParseCrossNUMAPeersNoSYS(t *testing.T) {
+	t.Parallel()
+
+	// Full NVSwitch fabric: every GPU pair connects via NVLink, no SYS hops.
+	input := `	GPU0	GPU1
+GPU0	 X 	NV18
+GPU1	NV18	 X
+`
+	if peers := parseCrossNUMAPeers(input); peers != nil {
+		t.Fatalf("peers=%v want nil (no SYS pairs)", peers)
+	}
+}
+
 func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
 	t.Parallel()

@@ -0,0 +1,248 @@
+package platform
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+// ConfidentialComputingStatus is the result of the Confidential Computing
+// readiness check: what `nvidia-smi conf-compute -q` reported, what the host
+// kernel exposes about CPU TEE support (Intel TDX / AMD SEV-SNP), and the
+// verdicts derived from both.
+type ConfidentialComputingStatus struct {
+	// CollectedAt is the time the check was run.
+	CollectedAt time.Time `json:"collected_at"`
+
+	// NvidiaSMIAvailable records whether `nvidia-smi conf-compute -q` ran
+	// successfully. The five string fields that follow are parsed from its
+	// output.
+	NvidiaSMIAvailable bool `json:"nvidia_smi_available"`
+	// CCState is the reported CC state (ON / OFF).
+	CCState string `json:"cc_state,omitempty"`
+	// MultiGPUMode is the reported multi-GPU mode (Protected PCIe / ...).
+	MultiGPUMode string `json:"multi_gpu_mode,omitempty"`
+	// CPUCCCapability is the CPU capability as seen by the GPU driver,
+	// e.g. "INTEL TDX", "AMD SEV-SNP", "NONE".
+	CPUCCCapability string `json:"cpu_cc_capability,omitempty"`
+	// GPUCCCapability is the GPU capability, e.g. "CC Capable", "Not Capable".
+	GPUCCCapability string `json:"gpu_cc_capability,omitempty"`
+	// CCGPUsReadyState is the reported ready state (Ready / Not Ready).
+	CCGPUsReadyState string `json:"cc_gpus_ready_state,omitempty"`
+
+	// The Host* fields are host-side evidence that the CPU's TEE is actually
+	// active in the running kernel (BIOS + kernel cmdline + firmware). They
+	// do not depend on what the GPU driver reports and serve as a fallback
+	// when the NVIDIA driver isn't loaded.
+	HostAMDSEVSupported   bool `json:"host_amd_sev_supported"`
+	HostAMDSEVESSupported bool `json:"host_amd_sev_es_supported"`
+	HostAMDSEVSNPActive   bool `json:"host_amd_sev_snp_active"`
+	HostIntelTDXActive    bool `json:"host_intel_tdx_active"`
+
+	// GPUCanRunCC is set when the GPU firmware reports itself CC-capable.
+	GPUCanRunCC bool `json:"gpu_can_run_cc"`
+	// CPUCanRunCC is set when the GPU driver or the host kernel reports an
+	// active/available CPU TEE (SEV-SNP or TDX).
+	CPUCanRunCC bool `json:"cpu_can_run_cc"`
+	// Ready is set when both CPU and GPU support Confidential Computing,
+	// whether or not CC mode is currently switched on.
+	Ready bool `json:"ready"`
+
+	// Notes carries free-form remarks gathered during the check.
+	Notes []string `json:"notes,omitempty"`
+}
+
+// RunConfidentialComputingCheckPack runs a read-only check of whether this
+// server can run NVIDIA Confidential Computing. It queries the GPU driver
+// (`nvidia-smi conf-compute -q`), inspects dmesg for AMD SEV-SNP / Intel TDX
+// evidence, and reads the kvm_amd sev/sev_es/sev_snp module parameters. It
+// changes nothing on the system apart from writing its own output files.
+//
+// A nil ctx is replaced with context.Background() and an empty baseDir with
+// "/var/log/bee-sat". The per-command logs, summary.txt and
+// confidential-computing-report.txt are written to a new
+// "confidential-computing-<UTC timestamp>" directory under baseDir, and that
+// directory's path is returned. An error is returned only when the directory
+// or one of the two result files cannot be written; failing probes are
+// recorded in the status notes instead.
+func (s *System) RunConfidentialComputingCheckPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+	if ctx == nil {
+		ctx = context.Background()
+	}
+	if baseDir == "" {
+		baseDir = "/var/log/bee-sat"
+	}
+	// One timestamp feeds both the directory name and the report so the two
+	// can never disagree.
+	now := time.Now().UTC()
+	runDir := filepath.Join(baseDir, "confidential-computing-"+now.Format("20060102-150405"))
+	if err := os.MkdirAll(runDir, 0755); err != nil {
+		return "", err
+	}
+	verboseLog := filepath.Join(runDir, "verbose.log")
+
+	status := ConfidentialComputingStatus{CollectedAt: now}
+
+	// GPU firmware / driver state.
+	ccOut, ccErr := runSATCommandCtx(ctx, verboseLog, "nvidia-smi-conf-compute-q", []string{"nvidia-smi", "conf-compute", "-q"}, nil, logFunc)
+	// Per-command logs are best-effort: a failed write must not abort the check.
+	_ = os.WriteFile(filepath.Join(runDir, "01-nvidia-smi-conf-compute-q.log"), ccOut, 0644)
+	if ccErr == nil {
+		status.NvidiaSMIAvailable = true
+		fields := parseConfComputeFields(ccOut)
+		status.CCState = fields["CC State"]
+		status.MultiGPUMode = fields["Multi-GPU Mode"]
+		status.CPUCCCapability = fields["CPU CC Capabilities"]
+		status.GPUCCCapability = fields["GPU CC Capabilities"]
+		status.CCGPUsReadyState = fields["CC GPUs Ready State"]
+	} else {
+		status.Notes = append(status.Notes, "nvidia-smi conf-compute -q unavailable (no NVIDIA driver, or GPU not present): "+firstLine(string(ccOut)))
+	}
+
+	// Host kernel evidence, independent of the GPU driver.
+	dmesgOut, dmesgErr := runSATCommandCtx(ctx, verboseLog, "dmesg", []string{"dmesg"}, nil, nil)
+	if dmesgErr != nil {
+		// Without dmesg the SEV-SNP/TDX flags below can only come from sysfs;
+		// say so instead of silently reporting "false".
+		status.Notes = append(status.Notes, "dmesg unavailable — host SEV-SNP/TDX kernel evidence may be incomplete: "+dmesgErr.Error())
+	}
+	ccDmesgLines := filterConfComputeDmesgLines(dmesgOut)
+	_ = os.WriteFile(filepath.Join(runDir, "02-dmesg-cc-relevant.log"), []byte(strings.Join(ccDmesgLines, "\n")+"\n"), 0644)
+
+	lowerDmesg := strings.ToLower(strings.Join(ccDmesgLines, "\n"))
+	status.HostAMDSEVSNPActive = strings.Contains(lowerDmesg, "sev-snp enabled")
+	status.HostIntelTDXActive = (strings.Contains(lowerDmesg, "tdx module") && strings.Contains(lowerDmesg, "module initialized")) ||
+		strings.Contains(lowerDmesg, "virt/tdx: module initialized")
+
+	for i, path := range []string{
+		"/sys/module/kvm_amd/parameters/sev",
+		"/sys/module/kvm_amd/parameters/sev_es",
+		"/sys/module/kvm_amd/parameters/sev_snp",
+	} {
+		param := filepath.Base(path)
+		name := fmt.Sprintf("sysfs-%s", param)
+		out, err := runSATCommandCtx(ctx, verboseLog, name, []string{"cat", path}, nil, nil)
+		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("03-%02d-%s.log", i+1, name)), out, 0644)
+		if err != nil {
+			// Parameter not exposed (module not loaded, or unknown to this kernel).
+			continue
+		}
+		enabled := kvmModuleParamEnabled(string(out))
+		switch param {
+		case "sev":
+			status.HostAMDSEVSupported = enabled
+		case "sev_es":
+			status.HostAMDSEVESSupported = enabled
+		case "sev_snp":
+			// Only ever upgrades: dmesg may already have proven SNP active.
+			if enabled {
+				status.HostAMDSEVSNPActive = true
+			}
+		}
+	}
+
+	status.GPUCanRunCC = strings.EqualFold(strings.TrimSpace(status.GPUCCCapability), "CC Capable")
+	cpuCapReported := strings.TrimSpace(status.CPUCCCapability)
+	status.CPUCanRunCC = status.HostAMDSEVSNPActive || status.HostIntelTDXActive ||
+		(cpuCapReported != "" && !strings.EqualFold(cpuCapReported, "NONE"))
+	status.Ready = status.CPUCanRunCC && status.GPUCanRunCC
+
+	if !status.NvidiaSMIAvailable {
+		status.Notes = append(status.Notes, "GPU CC capability unknown — install the NVIDIA driver to query it with `nvidia-smi conf-compute -q`.")
+	}
+
+	summary := renderConfidentialComputingSummary(status)
+	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
+		return "", err
+	}
+	report := renderConfidentialComputingReport(status)
+	if err := os.WriteFile(filepath.Join(runDir, "confidential-computing-report.txt"), []byte(report), 0644); err != nil {
+		return "", err
+	}
+
+	return runDir, nil
+}
+
+// kvmModuleParamEnabled reports whether a kvm_amd module parameter value read
+// from sysfs means "enabled". Boolean parameters read back as "Y"/"N", while
+// parameters declared as integers read back as "1"/"0", so both spellings of
+// "on" are accepted. Surrounding whitespace is ignored.
+func kvmModuleParamEnabled(val string) bool {
+	val = strings.TrimSpace(val)
+	return strings.EqualFold(val, "Y") || val == "1"
+}
+
+// parseConfComputeFields turns the indented "Key : Value" listing printed by
+// `nvidia-smi conf-compute -q` into a map, e.g.:
+//
+//	CC State                : OFF
+//	Multi-GPU Mode          : Protected PCIe
+//	CPU CC Capabilities     : INTEL TDX
+//	GPU CC Capabilities     : CC Capable
+//	CC GPUs Ready State     : Not Ready
+//
+// Each line is split at its first colon and both halves are trimmed. Lines
+// without a colon, or with an empty key or value, are ignored. When a key
+// repeats, the last occurrence wins.
+func parseConfComputeFields(out []byte) map[string]string {
+	parsed := make(map[string]string)
+	for _, row := range strings.Split(string(out), "\n") {
+		rawKey, rawVal, found := strings.Cut(row, ":")
+		if !found {
+			continue
+		}
+		key, val := strings.TrimSpace(rawKey), strings.TrimSpace(rawVal)
+		if key != "" && val != "" {
+			parsed[key] = val
+		}
+	}
+	return parsed
+}
+
+// filterConfComputeDmesgLines returns the dmesg lines relevant to CPU
+// Confidential Computing support (AMD SEV/SEV-ES/SEV-SNP, Intel TDX).
+//
+// A line is kept when it contains "sev" or "tdx" (case-insensitive) as a
+// standalone token, i.e. not directly preceded or followed by another ASCII
+// letter. This keeps "SEV-SNP", "sev_es" and "virt/tdx:" but drops unrelated
+// words such as "severity" or "several". Lines are returned unmodified and in
+// their original order; the result is nil when nothing matches.
+func filterConfComputeDmesgLines(dmesgOut []byte) []string {
+	var lines []string
+	for _, raw := range bytes.Split(dmesgOut, []byte("\n")) {
+		lower := strings.ToLower(string(raw))
+		if containsConfComputeKeyword(lower, "sev") || containsConfComputeKeyword(lower, "tdx") {
+			lines = append(lines, string(raw))
+		}
+	}
+	return lines
+}
+
+// containsConfComputeKeyword reports whether keyword occurs in lower with no
+// ASCII lowercase letter immediately before or after it. Both arguments are
+// expected to be lowercased already.
+func containsConfComputeKeyword(lower, keyword string) bool {
+	isLetter := func(c byte) bool { return c >= 'a' && c <= 'z' }
+	for from := 0; from < len(lower); {
+		rel := strings.Index(lower[from:], keyword)
+		if rel < 0 {
+			return false
+		}
+		start := from + rel
+		end := start + len(keyword)
+		leftOK := start == 0 || !isLetter(lower[start-1])
+		rightOK := end == len(lower) || !isLetter(lower[end])
+		if leftOK && rightOK {
+			return true
+		}
+		// Embedded in a longer word; keep looking further along the line.
+		from = start + 1
+	}
+	return false
+}
+
+// renderConfidentialComputingSummary renders status as "key=value" lines, one
+// per field, ending with overall_status=OK when status.Ready is set and
+// overall_status=NOT_READY otherwise. run_at_utc is CollectedAt in RFC 3339.
+func renderConfidentialComputingSummary(status ConfidentialComputingStatus) string {
+	type entry struct {
+		key string
+		val any
+	}
+	entries := []entry{
+		{"run_at_utc", status.CollectedAt.Format(time.RFC3339)},
+		{"nvidia_smi_available", status.NvidiaSMIAvailable},
+		{"cc_state", status.CCState},
+		{"multi_gpu_mode", status.MultiGPUMode},
+		{"cpu_cc_capability", status.CPUCCCapability},
+		{"gpu_cc_capability", status.GPUCCCapability},
+		{"cc_gpus_ready_state", status.CCGPUsReadyState},
+		{"host_amd_sev_supported", status.HostAMDSEVSupported},
+		{"host_amd_sev_es_supported", status.HostAMDSEVESSupported},
+		{"host_amd_sev_snp_active", status.HostAMDSEVSNPActive},
+		{"host_intel_tdx_active", status.HostIntelTDXActive},
+		{"cpu_can_run_cc", status.CPUCanRunCC},
+		{"gpu_can_run_cc", status.GPUCanRunCC},
+		{"ready", status.Ready},
+	}
+
+	var b strings.Builder
+	for _, e := range entries {
+		// %v prints strings verbatim and bools as true/false.
+		fmt.Fprintf(&b, "%s=%v\n", e.key, e.val)
+	}
+	overall := "overall_status=NOT_READY"
+	if status.Ready {
+		overall = "overall_status=OK"
+	}
+	fmt.Fprintln(&b, overall)
+	return b.String()
+}
+
+// renderConfidentialComputingReport renders status as a human-readable,
+// 80-column text report: a banner, the READY / NOT READY verdict, a CPU
+// section, a GPU section, an optional Notes section (only when status.Notes
+// is non-empty) and the collection time in UTC.
+//
+// All "Label : value" rows share one colon column and all section rules are
+// padded to the banner width. Empty string fields are shown as "unknown".
+func renderConfidentialComputingReport(status ConfidentialComputingStatus) string {
+	const reportWidth = 80
+	var b strings.Builder
+
+	// section writes a "-- Title ---…" rule padded to reportWidth.
+	section := func(title string) {
+		head := "-- " + title + " "
+		b.WriteString(head + strings.Repeat("-", reportWidth-len(head)) + "\n")
+	}
+	// field writes one row with the label padded so every colon lines up.
+	// %v prints strings verbatim and bools as true/false.
+	field := func(label string, value any) {
+		fmt.Fprintf(&b, "  %-23s : %v\n", label, value)
+	}
+
+	rule := strings.Repeat("=", reportWidth)
+	b.WriteString(rule + "\n")
+	b.WriteString("Confidential Computing Readiness\n")
+	b.WriteString(rule + "\n\n")
+
+	verdict := "NOT READY"
+	if status.Ready {
+		verdict = "READY"
+	}
+	fmt.Fprintf(&b, "Verdict: %s\n\n", verdict)
+
+	section("CPU")
+	field("Reported by GPU driver", nonEmptyOr(status.CPUCCCapability, "unknown"))
+	field("AMD SEV supported", status.HostAMDSEVSupported)
+	field("AMD SEV-ES supported", status.HostAMDSEVESSupported)
+	field("AMD SEV-SNP active", status.HostAMDSEVSNPActive)
+	field("Intel TDX active", status.HostIntelTDXActive)
+	field("Can run CC", status.CPUCanRunCC)
+	b.WriteString("\n")
+
+	section("GPU")
+	field("nvidia-smi available", status.NvidiaSMIAvailable)
+	field("GPU CC Capabilities", nonEmptyOr(status.GPUCCCapability, "unknown"))
+	field("CC State (current)", nonEmptyOr(status.CCState, "unknown"))
+	field("Multi-GPU Mode", nonEmptyOr(status.MultiGPUMode, "unknown"))
+	field("CC GPUs Ready State", nonEmptyOr(status.CCGPUsReadyState, "unknown"))
+	field("Can run CC", status.GPUCanRunCC)
+	b.WriteString("\n")
+
+	if len(status.Notes) > 0 {
+		section("Notes")
+		for _, n := range status.Notes {
+			fmt.Fprintf(&b, "  - %s\n", n)
+		}
+		b.WriteString("\n")
+	}
+
+	// The layout prints a literal "UTC" suffix, so convert explicitly rather
+	// than trusting the caller to have stored a UTC time.
+	fmt.Fprintf(&b, "Collected : %s\n", status.CollectedAt.UTC().Format("2006-01-02 15:04:05 UTC"))
+	b.WriteString(rule + "\n")
+	return b.String()
+}
+
+// nonEmptyOr returns v unchanged unless it is empty or whitespace-only, in
+// which case it returns fallback.
+func nonEmptyOr(v, fallback string) string {
+	if strings.TrimSpace(v) != "" {
+		return v
+	}
+	return fallback
+}
@@ -1259,7 +1259,7 @@ func storageSATCommands(devPath string, extended bool) []satJob {
 		return jobs
 	}
 	jobs := []satJob{
-		{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
+		{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", "-i", devPath}},
 	}
 	if extended {
 		jobs = append(jobs, satJob{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}})
@@ -94,8 +94,8 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
 	nel := uint64(sl.NumErrLogEntries)

 	// data_units are in 1000 × 512-byte sectors = 512,000 bytes each
-	dataRead := float64(sl.DataUnitsRead) * 512000 / 1e9
-	dataWritten := float64(sl.DataUnitsWritten) * 512000 / 1e9
+	readBytes := uint64(sl.DataUnitsRead) * 512000
+	writtenBytes := uint64(sl.DataUnitsWritten) * 512000

 	writeSectionHeader(b, "Health")
 	writeField(b, "Temperature", fmt.Sprintf("%d °C", tempC))
@@ -107,8 +107,8 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
 	writeField(b, "Power On Hours", fmt.Sprintf("%s h", formatUint(poh)))
 	writeField(b, "Power Cycles", formatUint(pc))
 	writeField(b, "Unsafe Shutdowns", formatUint(us))
-	writeField(b, "Data Written", fmt.Sprintf("%.1f GB", dataWritten))
-	writeField(b, "Data Read", fmt.Sprintf("%.1f GB", dataRead))
+	writeField(b, "Data Written", formatBytesHuman(float64(writtenBytes)))
+	writeField(b, "Data Read", formatBytesHuman(float64(readBytes)))

 	writeSectionHeader(b, "Errors")
 	writeField(b, "Media Errors", formatUint(me))
@@ -118,18 +118,22 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
 	if capacityBytes == 0 {
 		capacityBytes = uint64(ctrl.NVMCapacity)
 	}
-	writeResourceSection(b, resourceInfo{
+	ri := resourceInfo{
 		powerOnHours:  poh,
-		writtenBytes:  uint64(sl.DataUnitsWritten) * 512000,
-		readBytes:     uint64(sl.DataUnitsRead) * 512000,
+		powerCycles:   pc,
+		writtenBytes:  writtenBytes,
+		readBytes:     readBytes,
 		capacityBytes: capacityBytes,
-	})
+	}
+	writeResourceSection(b, ri)

 	if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 {
 		writeSectionHeader(b, "Self-Test")
 		result := parseSelfTestResult(string(selfTest))
 		writeField(b, "Result", result)
 	}
+
+	writeConclusionSection(b, ri)
 }

 // ── SATA / SAS (smartctl) ────────────────────────────────────────────────────
@@ -202,13 +206,15 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
 		}
 	}

-	var poh, writtenLBAs, readLBAs uint64
+	var poh, pc, writtenLBAs, readLBAs uint64
 	var readValue int
 	hasReadValue := false
 	for _, a := range attrs {
 		switch a.ID {
 		case 9: // Power_On_Hours
 			poh = parseLeadingUint(a.Raw)
+		case 12: // Power_Cycle_Count
+			pc = parseLeadingUint(a.Raw)
 		case 241: // Total_LBAs_Written
 			writtenLBAs = parseLeadingUint(a.Raw)
 		case 242: // Total_LBAs_Read
@@ -218,14 +224,16 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
 		}
 	}
 	const sataSectorBytes = 512
-	writeResourceSection(b, resourceInfo{
+	ri := resourceInfo{
 		powerOnHours:   poh,
+		powerCycles:    pc,
 		writtenBytes:   writtenLBAs * sataSectorBytes,
 		readBytes:      readLBAs * sataSectorBytes,
 		capacityBytes:  capacityBytes,
 		readPercent:    100 - readValue,
 		hasReadPercent: hasReadValue,
-	})
+	}
+	writeResourceSection(b, ri)

 	selfTest := outputs["smartctl-self-test-status"]
 	if len(selfTest) == 0 {
@@ -236,6 +244,8 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
 		result := parseSelfTestResult(string(selfTest))
 		writeField(b, "Result", result)
 	}
+
+	writeConclusionSection(b, ri)
 }

 func parseSMARTAttrs(text string) []smartAttr {
@@ -331,6 +341,7 @@ const (

 type resourceInfo struct {
 	powerOnHours   uint64
+	powerCycles    uint64
 	writtenBytes   uint64
 	readBytes      uint64
 	capacityBytes  uint64
@@ -363,6 +374,70 @@ func writeResourceSection(b *strings.Builder, r resourceInfo) {
 	}
 }

+// ── Conclusion (new-vs-used verdict) ────────────────────────────────────────
+
+// Limits a drive must stay strictly below on every count to be reported as
+// "new"; reaching any single one is enough to call it used. They are
+// deliberately loose bounds, not a wear/endurance judgment (see
+// -- Resource -- for that).
+const (
+	// Written bytes as a fraction of capacity: one full drive-write, with 10%
+	// headroom for provisioning/overprovisioning rounding.
+	newDiskMaxWrittenFrac = 1.10
+	// Read bytes as a fraction of capacity: a bit over two full drive-reads.
+	newDiskMaxReadFrac = 2.10
+	// Power-on time in hours: one week.
+	newDiskMaxUptimeHours = 24 * 7
+	// Number of power cycles.
+	newDiskMaxPowerCycles = 30
+)
+
+// writeConclusionSection appends the "Conclusion" section with a NEW / USED
+// verdict. The drive is USED as soon as any new-disk threshold is reached
+// (written or read bytes relative to capacity, power-on hours, power cycles),
+// and each reached threshold is listed as a reason. When capacityBytes is
+// zero the write/read criteria are skipped and a note says so.
+func writeConclusionSection(b *strings.Builder, r resourceInfo) {
+	writeSectionHeader(b, "Conclusion")
+
+	// The drive counts as new exactly when no reason has been collected.
+	var reasons, notes []string
+
+	if r.capacityBytes == 0 {
+		notes = append(notes, "capacity unknown — write/read criteria not evaluated")
+	} else {
+		capacity := float64(r.capacityBytes)
+		if frac := float64(r.writtenBytes) / capacity; frac >= newDiskMaxWrittenFrac {
+			reasons = append(reasons, fmt.Sprintf(
+				"data written %s (%s of capacity)",
+				formatBytesHuman(float64(r.writtenBytes)), formatPercent(frac*100)))
+		}
+		if frac := float64(r.readBytes) / capacity; frac >= newDiskMaxReadFrac {
+			reasons = append(reasons, fmt.Sprintf(
+				"data read %s (%s of capacity)",
+				formatBytesHuman(float64(r.readBytes)), formatPercent(frac*100)))
+		}
+	}
+
+	if r.powerOnHours >= newDiskMaxUptimeHours {
+		reasons = append(reasons, fmt.Sprintf("uptime %s", formatHoursHuman(r.powerOnHours)))
+	}
+	if r.powerCycles >= newDiskMaxPowerCycles {
+		reasons = append(reasons, fmt.Sprintf("power cycles %s", formatUint(r.powerCycles)))
+	}
+
+	if len(reasons) == 0 {
+		writeField(b, "Disk Condition", "NEW")
+	} else {
+		writeField(b, "Disk Condition", "USED")
+		b.WriteString("  Reason:\n")
+		for _, why := range reasons {
+			fmt.Fprintf(b, "    - %s\n", why)
+		}
+	}
+	for _, n := range notes {
+		fmt.Fprintf(b, "  Note: %s\n", n)
+	}
+}
+
 // progressBar renders a fixed-width pseudographic bar, e.g. "[######------]".
 func progressBar(frac float64, width int) string {
 	if math.IsNaN(frac) || frac < 0 {
@@ -83,7 +83,36 @@ func TestGenerateDiskReportNVMe(t *testing.T) {
 	assertContains(t, report, "1,234 h")   // power_on_hours with separator
 	assertContains(t, report, "32")        // power_cycles
 	assertContains(t, report, "3")         // unsafe_shutdowns
-	assertContains(t, report, "378.0 GB")  // data_units_written * 512000 / 1e9
+	assertContains(t, report, "378.00 GB") // data_units_written * 512000, human-scaled
+}
+
+// TestGenerateDiskReportNVMeDataUnitsScaleToTB verifies that heavy write/read
+// counters render in the "-- Usage --" section as TB/PB, not raw GB, matching
+// the "-- Resource --" section which already used formatBytesHuman.
+func TestGenerateDiskReportNVMeDataUnitsScaleToTB(t *testing.T) {
+	t.Parallel()
+	heavy := []byte(`{
+  "critical_warning": 0,
+  "temperature": 307,
+  "avail_spare": 100,
+  "spare_thresh": 10,
+  "percent_used": 0,
+  "data_units_read": "252420478",
+  "data_units_written": "103834055",
+  "power_cycles": "45",
+  "power_on_hours": "45",
+  "unsafe_shutdowns": "35",
+  "media_errors": "0",
+  "num_err_log_entries": "0"
+}`)
+	outputs := map[string][]byte{
+		"nvme-id-ctrl":   testNVMeIdCtrl,
+		"nvme-smart-log": heavy,
+	}
+	report := GenerateDiskReportText(1, "/dev/nvme0n1", outputs, time.Unix(0, 0).UTC())
+
+	assertContains(t, report, "Data Written         : 53.16 TB")
+	assertContains(t, report, "Data Read            : 129.24 TB")
 }

 func TestGenerateDiskReportNVMeLoHi(t *testing.T) {
@@ -38,6 +38,7 @@ var techDumpNvidiaCommands = []struct {
 }{
 	{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
 	{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
+	{Name: "nvidia-smi", Args: []string{"conf-compute", "-q"}, File: "nvidia-smi-conf-compute-q.txt"},
 }

 type lsblkDumpRoot struct {
@@ -135,7 +135,7 @@ func defaultTaskPriority(target string, params taskParams) int {
 		return taskPriorityBurn
 	case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
 		"nvidia-interconnect", "nvidia-bandwidth", "memory", "storage", "cpu",
-		"amd", "amd-mem", "amd-bandwidth":
+		"amd", "amd-mem", "amd-bandwidth", "confidential-computing":
 		if params.StressMode {
 			return taskPriorityValidateStress
 		}
@@ -676,6 +676,12 @@ func renderCheck(opts HandlerOptions) string {
 			`<code>lsblk</code>; NVMe: <code>nvme id-ctrl</code>, <code>nvme smart-log</code>; SATA/SAS: <code>smartctl -H -A</code>`,
 			`Seconds — instantaneous device query, no wear counters incremented.`,
 		)) +
+		renderSATCard("confidential-computing", "Confidential Computing", "runSAT('confidential-computing')", "", renderValidateCardBody(
+			inv.NVIDIA,
+			`Checks whether this server can run NVIDIA Confidential Computing: CPU TEE support (Intel TDX / AMD SEV-SNP) and GPU firmware CC capability. Read-only — changes nothing.`,
+			`<code>nvidia-smi conf-compute -q</code>, <code>dmesg</code>, <code>/sys/module/kvm_amd/parameters/*</code>`,
+			`Seconds — read-only query only.`,
+		)) +
 		`</div>
 <div style="height:1px;background:var(--border);margin:16px 0"></div>
 <div class="card" style="margin-bottom:16px">
@@ -737,7 +743,7 @@ func renderCheck(opts HandlerOptions) string {
 <script>
 let satES = null;
 function satLabels() {
-  return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
+  return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth', 'confidential-computing':'Check Confidential Computing'};
 }
 let satNvidiaGPUsPromise = null;
 function loadSatNvidiaGPUs() {
@@ -873,7 +879,7 @@ function runAllCheckSAT() {
  status.textContent = 'Enqueuing...';
  const nvidiaIndices = satSelectedGPUIndices();
  const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
-  const baseTargets = ['cpu', 'memory', 'storage'];
+  const baseTargets = ['cpu', 'memory', 'storage', 'confidential-computing'];
  const amdTargets = selectedAMDValidateTargets();
  const expanded = [];
  baseTargets.forEach(t => expanded.push({target: t}));
@@ -264,6 +264,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
 	mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
 	mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
 	mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
+	mux.HandleFunc("POST /api/sat/confidential-computing/run", h.handleAPISATRun("confidential-computing"))
 	mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
 	mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
 	mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
@@ -272,6 +272,12 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
 			break
 		}
 		archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
+	case "confidential-computing":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = runConfidentialComputingCheckPackCtx(a, ctx, "", j.append)
 	case "cpu":
 		if a == nil {
 			err = fmt.Errorf("app not configured")
@@ -45,6 +45,7 @@ var taskNames = map[string]string{
 	"nvidia-stress":          "NVIDIA GPU Stress",
 	"memory":                 "Memory SAT",
 	"storage":                "Storage SAT",
+	"confidential-computing": "Confidential Computing Check",
 	"cpu":                    "CPU SAT",
 	"amd":                    "AMD GPU SAT",
 	"amd-mem":                "AMD GPU MEM Integrity",
@@ -312,6 +313,9 @@ var (
 	runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
 		return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
 	}
+	runConfidentialComputingCheckPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
+		return a.RunConfidentialComputingCheckPackCtx(ctx, baseDir, logFunc)
+	}
 	runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
 		return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
 	}
@@ -1025,6 +1029,12 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
 			break
 		}
 		archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
+	case "confidential-computing":
+		if a == nil {
+			err = fmt.Errorf("app not configured")
+			break
+		}
+		archive, err = runConfidentialComputingCheckPackCtx(a, ctx, "", j.append)
 	case "cpu":
 		if a == nil {
 			err = fmt.Errorf("app not configured")