Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9db651c75a | |||
| abaeaea13f | |||
| 5b98005d5d | |||
| 33bc275da2 |
@@ -134,6 +134,7 @@ type satRunner interface {
|
|||||||
ResetNvidiaGPU(index int) (string, error)
|
ResetNvidiaGPU(index int) (string, error)
|
||||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
||||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
||||||
|
RunConfidentialComputingCheckPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||||
DetectGPUVendor() string
|
DetectGPUVendor() string
|
||||||
|
|||||||
@@ -206,6 +206,22 @@ func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, erro
|
|||||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *App) RunConfidentialComputingCheckPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunConfidentialComputingCheckPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunConfidentialComputingCheckPack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunConfidentialComputingCheckPackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunConfidentialComputingCheckPackResult(baseDir string) (ActionResult, error) {
|
||||||
|
path, err := a.RunConfidentialComputingCheckPack(baseDir, nil)
|
||||||
|
return ActionResult{Title: "Confidential Computing Check", Body: satResultBody(path)}, err
|
||||||
|
}
|
||||||
|
|
||||||
func (a *App) DetectGPUVendor() string {
|
func (a *App) DetectGPUVendor() string {
|
||||||
return a.sat.DetectGPUVendor()
|
return a.sat.DetectGPUVendor()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -243,6 +243,10 @@ func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ b
|
|||||||
return f.runStorageFn(baseDir)
|
return f.runStorageFn(baseDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f fakeSAT) RunConfidentialComputingCheckPack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||||
|
return "", nil
|
||||||
|
}
|
||||||
|
|
||||||
func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
|
func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
|
||||||
if f.runCPUFn != nil {
|
if f.runCPUFn != nil {
|
||||||
return f.runCPUFn(baseDir, durationSec)
|
return f.runCPUFn(baseDir, durationSec)
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
|||||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
|
snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
|
||||||
|
snap.PCIeDevices = enrichGPUCrossNUMATopology(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ package collector
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"regexp"
|
"regexp"
|
||||||
@@ -119,26 +120,12 @@ func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
|
|||||||
return parseNVIDIATopologyMatrix(string(out)), nil
|
return parseNVIDIATopologyMatrix(string(out)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
|
// locateGPUTopologyColumns finds the header line of a nvidia-smi topo -m
|
||||||
// nvidia-smi topo -m matrix.
|
// matrix and the 0-based field indices (excluding the row label) that
|
||||||
//
|
// correspond to GPU columns. Returns headerIdx=-1 if fewer than 2 GPU columns
|
||||||
// Format (abbreviated):
|
// are found.
|
||||||
//
|
func locateGPUTopologyColumns(lines []string) (headerIdx int, gpuColIndices []int, gpuCount int) {
|
||||||
// GPU0 GPU1 ... NIC0 NIC1
|
headerIdx = -1
|
||||||
// GPU0 X NV18 ... NODE NODE
|
|
||||||
// GPU1 NV18 X ... NODE NODE
|
|
||||||
// NIC0 NODE NODE... X PIX
|
|
||||||
//
|
|
||||||
// The header row starts with "GPU0"; its columns may include non-GPU entries
|
|
||||||
// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
|
|
||||||
// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
|
|
||||||
func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
|
|
||||||
lines := strings.Split(raw, "\n")
|
|
||||||
|
|
||||||
// Locate the header line and record which column indices are GPU columns.
|
|
||||||
headerIdx := -1
|
|
||||||
var gpuColIndices []int // 0-based indices within fields (excluding the row label)
|
|
||||||
var gpuCount int
|
|
||||||
for i, line := range lines {
|
for i, line := range lines {
|
||||||
trimmed := strings.TrimSpace(line)
|
trimmed := strings.TrimSpace(line)
|
||||||
if strings.HasPrefix(trimmed, "GPU0") {
|
if strings.HasPrefix(trimmed, "GPU0") {
|
||||||
@@ -155,7 +142,30 @@ func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if headerIdx < 0 || gpuCount == 0 {
|
if headerIdx < 0 {
|
||||||
|
gpuColIndices = nil
|
||||||
|
gpuCount = 0
|
||||||
|
}
|
||||||
|
return headerIdx, gpuColIndices, gpuCount
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
|
||||||
|
// nvidia-smi topo -m matrix.
|
||||||
|
//
|
||||||
|
// Format (abbreviated):
|
||||||
|
//
|
||||||
|
// GPU0 GPU1 ... NIC0 NIC1
|
||||||
|
// GPU0 X NV18 ... NODE NODE
|
||||||
|
// GPU1 NV18 X ... NODE NODE
|
||||||
|
// NIC0 NODE NODE... X PIX
|
||||||
|
//
|
||||||
|
// The header row starts with "GPU0"; its columns may include non-GPU entries
|
||||||
|
// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
|
||||||
|
// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
|
||||||
|
func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
|
||||||
|
lines := strings.Split(raw, "\n")
|
||||||
|
headerIdx, gpuColIndices, gpuCount := locateGPUTopologyColumns(lines)
|
||||||
|
if headerIdx < 0 {
|
||||||
return nvlinkTopoResult{}
|
return nvlinkTopoResult{}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -204,3 +214,110 @@ func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
|
|||||||
MinNVLinks: minLinks,
|
MinNVLinks: minLinks,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseCrossNUMAPeers scans a nvidia-smi topo -m matrix for GPU pairs whose
|
||||||
|
// only path is "SYS" — traversing PCIe as well as the SMP interconnect
|
||||||
|
// between NUMA nodes (e.g. QPI/UPI). This is the slowest possible GPU-GPU
|
||||||
|
// path and, on servers where GPUs are only bridged pairwise via NVLink
|
||||||
|
// bridge (no switched NVLink fabric), it is exactly the hop that traffic
|
||||||
|
// between different bridge pairs has to cross. Returns a map from GPU index
|
||||||
|
// to the peer GPU indices reachable only via this cross-NUMA path.
|
||||||
|
func parseCrossNUMAPeers(raw string) map[int][]int {
|
||||||
|
lines := strings.Split(raw, "\n")
|
||||||
|
headerIdx, gpuColIndices, _ := locateGPUTopologyColumns(lines)
|
||||||
|
if headerIdx < 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// colIdx (0-based within header fields) -> GPU index, in header order.
|
||||||
|
colIdxToGPU := make(map[int]int, len(gpuColIndices))
|
||||||
|
for gpuIdx, colIdx := range gpuColIndices {
|
||||||
|
colIdxToGPU[colIdx] = gpuIdx
|
||||||
|
}
|
||||||
|
|
||||||
|
peers := make(map[int][]int)
|
||||||
|
rowGPU := -1
|
||||||
|
for _, line := range lines[headerIdx+1:] {
|
||||||
|
trimmed := strings.TrimSpace(line)
|
||||||
|
if !strings.HasPrefix(trimmed, "GPU") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
rowGPU++
|
||||||
|
cells := strings.Fields(trimmed)
|
||||||
|
for _, colIdx := range gpuColIndices {
|
||||||
|
dataIdx := colIdx + 1
|
||||||
|
if dataIdx >= len(cells) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
colGPU := colIdxToGPU[colIdx]
|
||||||
|
if colGPU == rowGPU {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if strings.EqualFold(cells[dataIdx], "SYS") {
|
||||||
|
peers[rowGPU] = append(peers[rowGPU], colGPU)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(peers) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return peers
|
||||||
|
}
|
||||||
|
|
||||||
|
// enrichGPUCrossNUMATopology flags GPUs that reach one or more peer GPUs only
|
||||||
|
// via a cross-NUMA-node PCIe hop ("SYS" in nvidia-smi topo -m). Unlike
|
||||||
|
// enrichNVLinkBridgesWithGPUTopo, this does not require an NVLink bridge PCIe
|
||||||
|
// device to be present: it applies to any multi-GPU box, since the weak point
|
||||||
|
// it detects is the path *between* NVLink-bridged pairs (or between GPUs with
|
||||||
|
// no NVLink at all), not the bridge itself.
|
||||||
|
func enrichGPUCrossNUMATopology(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||||
|
gpuByBDF, err := queryNVIDIAGPUs()
|
||||||
|
if err != nil || len(gpuByBDF) < 2 {
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
out, err := exec.Command("nvidia-smi", "topo", "-m").Output()
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("gpu-topology: nvidia-smi topo unavailable, skipping cross-NUMA check", "err", err)
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
peers := parseCrossNUMAPeers(string(out))
|
||||||
|
if len(peers) == 0 {
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
bdfToIndex := make(map[string]int, len(gpuByBDF))
|
||||||
|
for bdf, info := range gpuByBDF {
|
||||||
|
bdfToIndex[bdf] = info.Index
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range devs {
|
||||||
|
if devs[i].BDF == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
idx, ok := bdfToIndex[normalizePCIeBDF(*devs[i].BDF)]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
peerList, ok := peers[idx]
|
||||||
|
if !ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if devs[i].Telemetry == nil {
|
||||||
|
devs[i].Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
devs[i].Telemetry["nvlink_cross_numa_peers"] = peerList
|
||||||
|
if devs[i].Status == nil || *devs[i].Status == statusOK {
|
||||||
|
warn := statusWarning
|
||||||
|
devs[i].Status = &warn
|
||||||
|
}
|
||||||
|
if devs[i].ErrorDescription == nil {
|
||||||
|
devs[i].ErrorDescription = stringPtr(fmt.Sprintf(
|
||||||
|
"GPU %d reaches GPU(s) %v only via a cross-NUMA-node PCIe path (SYS) — expect reduced bandwidth/increased latency for tensor-parallel workloads spanning these GPUs",
|
||||||
|
idx, peerList))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Info("gpu-topology: cross-NUMA peers detected", "affected_gpus", len(peers))
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|||||||
@@ -80,6 +80,42 @@ func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseCrossNUMAPeersDetectsSYS(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// 4-GPU box, two NVLink-bridged pairs (GPU0-GPU1, GPU2-GPU3); the pairs
|
||||||
|
// themselves only reach each other via SYS (cross-NUMA PCIe hop) — the
|
||||||
|
// exact topology of a server using pairwise NVLink bridge cards instead
|
||||||
|
// of a switched NVLink fabric.
|
||||||
|
input := ` GPU0 GPU1 GPU2 GPU3
|
||||||
|
GPU0 X NV4 SYS SYS
|
||||||
|
GPU1 NV4 X SYS SYS
|
||||||
|
GPU2 SYS SYS X NV4
|
||||||
|
GPU3 SYS SYS NV4 X
|
||||||
|
`
|
||||||
|
peers := parseCrossNUMAPeers(input)
|
||||||
|
|
||||||
|
if len(peers[0]) != 2 || peers[0][0] != 2 || peers[0][1] != 3 {
|
||||||
|
t.Fatalf("peers[0]=%v want [2 3]", peers[0])
|
||||||
|
}
|
||||||
|
if len(peers[2]) != 2 {
|
||||||
|
t.Fatalf("peers[2]=%v want 2 entries", peers[2])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseCrossNUMAPeersNoSYS(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// Full NVSwitch fabric: every GPU pair connects via NVLink, no SYS hops.
|
||||||
|
input := ` GPU0 GPU1
|
||||||
|
GPU0 X NV18
|
||||||
|
GPU1 NV18 X
|
||||||
|
`
|
||||||
|
if peers := parseCrossNUMAPeers(input); peers != nil {
|
||||||
|
t.Fatalf("peers=%v want nil (no SYS pairs)", peers)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
|
func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -766,7 +766,7 @@ func parseMDAdmPlatformLicense(raw string) *string {
|
|||||||
|
|
||||||
func queryDeviceSerial(devPath string) string {
|
func queryDeviceSerial(devPath string) string {
|
||||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||||
var ctrl nvmeIDCtrl
|
var ctrl NVMeIDCtrl
|
||||||
if json.Unmarshal(out, &ctrl) == nil {
|
if json.Unmarshal(out, &ctrl) == nil {
|
||||||
if v := cleanDMIValue(strings.TrimSpace(ctrl.SerialNumber)); v != "" {
|
if v := cleanDMIValue(strings.TrimSpace(ctrl.SerialNumber)); v != "" {
|
||||||
return v
|
return v
|
||||||
|
|||||||
@@ -84,16 +84,19 @@ func collectStorage() []schema.HardwareStorage {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// jsonInt64 accepts both a bare JSON number and a JSON-quoted number string.
|
// JSONInt64 accepts a bare JSON number (512), a JSON-quoted number string
|
||||||
// lsblk -J emits LOG-SEC / PHY-SEC as integers on util-linux ≥ 2.37 (Debian 12)
|
// ("512" — lsblk -J on util-linux < 2.37, and nvme-cli for large 64-bit
|
||||||
// but older versions emit them as strings. This type handles both.
|
// counters that would lose precision as JS numbers), or a {"lo":n,"hi":n}
|
||||||
type jsonInt64 int64
|
// object (128-bit NVMe counters on some nvme-cli versions; hi is ignored as
|
||||||
|
// no real counter exceeds 64 bits). Shared by lsblk and nvme-cli JSON output
|
||||||
|
// across the collector and the human-readable disk report.
|
||||||
|
type JSONInt64 int64
|
||||||
|
|
||||||
func (j *jsonInt64) UnmarshalJSON(data []byte) error {
|
func (j *JSONInt64) UnmarshalJSON(data []byte) error {
|
||||||
// bare number: 512
|
// bare number: 512
|
||||||
var n int64
|
var n int64
|
||||||
if err := json.Unmarshal(data, &n); err == nil {
|
if err := json.Unmarshal(data, &n); err == nil {
|
||||||
*j = jsonInt64(n)
|
*j = JSONInt64(n)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
// quoted string: "512"
|
// quoted string: "512"
|
||||||
@@ -101,10 +104,18 @@ func (j *jsonInt64) UnmarshalJSON(data []byte) error {
|
|||||||
if err := json.Unmarshal(data, &s); err == nil {
|
if err := json.Unmarshal(data, &s); err == nil {
|
||||||
n, err := strconv.ParseInt(strings.TrimSpace(s), 10, 64)
|
n, err := strconv.ParseInt(strings.TrimSpace(s), 10, 64)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
*j = jsonInt64(n)
|
*j = JSONInt64(n)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
// {"lo":n,"hi":n} 128-bit counter object
|
||||||
|
var obj struct {
|
||||||
|
Lo int64 `json:"lo"`
|
||||||
|
}
|
||||||
|
if err := json.Unmarshal(data, &obj); err == nil {
|
||||||
|
*j = JSONInt64(obj.Lo)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
return nil // null or unexpected type — leave zero
|
return nil // null or unexpected type — leave zero
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -117,8 +128,8 @@ type lsblkDevice struct {
|
|||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
Tran string `json:"tran"`
|
Tran string `json:"tran"`
|
||||||
Hctl string `json:"hctl"`
|
Hctl string `json:"hctl"`
|
||||||
LogSec jsonInt64 `json:"log-sec"`
|
LogSec JSONInt64 `json:"log-sec"`
|
||||||
PhySec jsonInt64 `json:"phy-sec"`
|
PhySec JSONInt64 `json:"phy-sec"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type lsblkRoot struct {
|
type lsblkRoot struct {
|
||||||
@@ -423,32 +434,36 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
// NVMeSmartLog is the subset of `nvme smart-log -o json` output shared by the
|
||||||
// nvme-cli emits most counters as JSON strings (e.g. "power_on_hours":"49"),
|
// structured collector and the human-readable disk report. nvme-cli emits
|
||||||
// so all numeric fields use jsonInt64 which accepts both bare numbers and
|
// most counters as JSON strings (e.g. "power_on_hours":"49") or, on some
|
||||||
// quoted strings. Field names match nvme-cli JSON output, not NVMe spec prose.
|
// versions, as {"lo":n,"hi":n} objects — all numeric fields use JSONInt64,
|
||||||
type nvmeSmartLog struct {
|
// which accepts bare numbers, quoted strings, and lo/hi objects. Field names
|
||||||
CriticalWarning jsonInt64 `json:"critical_warning"`
|
// match nvme-cli JSON output, not NVMe spec prose.
|
||||||
PercentageUsed jsonInt64 `json:"percent_used"`
|
type NVMeSmartLog struct {
|
||||||
AvailableSpare jsonInt64 `json:"avail_spare"`
|
CriticalWarning JSONInt64 `json:"critical_warning"`
|
||||||
SpareThreshold jsonInt64 `json:"spare_thresh"`
|
PercentageUsed JSONInt64 `json:"percent_used"`
|
||||||
Temperature jsonInt64 `json:"temperature"`
|
AvailableSpare JSONInt64 `json:"avail_spare"`
|
||||||
PowerOnHours jsonInt64 `json:"power_on_hours"`
|
SpareThreshold JSONInt64 `json:"spare_thresh"`
|
||||||
PowerCycles jsonInt64 `json:"power_cycles"`
|
Temperature JSONInt64 `json:"temperature"`
|
||||||
UnsafeShutdowns jsonInt64 `json:"unsafe_shutdowns"`
|
PowerOnHours JSONInt64 `json:"power_on_hours"`
|
||||||
DataUnitsRead jsonInt64 `json:"data_units_read"`
|
PowerCycles JSONInt64 `json:"power_cycles"`
|
||||||
DataUnitsWritten jsonInt64 `json:"data_units_written"`
|
UnsafeShutdowns JSONInt64 `json:"unsafe_shutdowns"`
|
||||||
ControllerBusy jsonInt64 `json:"controller_busy_time"`
|
DataUnitsRead JSONInt64 `json:"data_units_read"`
|
||||||
MediaErrors jsonInt64 `json:"media_errors"`
|
DataUnitsWritten JSONInt64 `json:"data_units_written"`
|
||||||
NumErrLogEntries jsonInt64 `json:"num_err_log_entries"`
|
ControllerBusy JSONInt64 `json:"controller_busy_time"`
|
||||||
|
MediaErrors JSONInt64 `json:"media_errors"`
|
||||||
|
NumErrLogEntries JSONInt64 `json:"num_err_log_entries"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
// NVMeIDCtrl is the subset of `nvme id-ctrl -o json` output shared by the
|
||||||
type nvmeIDCtrl struct {
|
// structured collector and the human-readable disk report.
|
||||||
|
type NVMeIDCtrl struct {
|
||||||
ModelNumber string `json:"mn"`
|
ModelNumber string `json:"mn"`
|
||||||
SerialNumber string `json:"sn"`
|
SerialNumber string `json:"sn"`
|
||||||
FirmwareRev string `json:"fr"`
|
FirmwareRev string `json:"fr"`
|
||||||
TotalCapacity int64 `json:"tnvmcap"`
|
TotalCapacity JSONInt64 `json:"tnvmcap"`
|
||||||
|
NVMCapacity JSONInt64 `json:"nvmcap"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||||
@@ -481,7 +496,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
|
|
||||||
// id-ctrl: model, serial, firmware, capacity
|
// id-ctrl: model, serial, firmware, capacity
|
||||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||||
var ctrl nvmeIDCtrl
|
var ctrl NVMeIDCtrl
|
||||||
if json.Unmarshal(out, &ctrl) == nil {
|
if json.Unmarshal(out, &ctrl) == nil {
|
||||||
if v := cleanDMIValue(strings.TrimSpace(ctrl.ModelNumber)); v != "" {
|
if v := cleanDMIValue(strings.TrimSpace(ctrl.ModelNumber)); v != "" {
|
||||||
s.Model = &v
|
s.Model = &v
|
||||||
@@ -502,7 +517,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
|
|
||||||
// smart-log: wear telemetry
|
// smart-log: wear telemetry
|
||||||
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
||||||
var log nvmeSmartLog
|
var log NVMeSmartLog
|
||||||
if json.Unmarshal(out, &log) == nil {
|
if json.Unmarshal(out, &log) == nil {
|
||||||
if log.PowerOnHours > 0 {
|
if log.PowerOnHours > 0 {
|
||||||
v := int64(log.PowerOnHours)
|
v := int64(log.PowerOnHours)
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
|
|||||||
{`null`, 0},
|
{`null`, 0},
|
||||||
}
|
}
|
||||||
for _, tc := range cases {
|
for _, tc := range cases {
|
||||||
var v jsonInt64
|
var v JSONInt64
|
||||||
if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
|
if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
|
||||||
t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
|
t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ import (
|
|||||||
|
|
||||||
// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most
|
// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most
|
||||||
// counters are quoted strings and field names differ from NVMe spec prose)
|
// counters are quoted strings and field names differ from NVMe spec prose)
|
||||||
// is correctly parsed into nvmeSmartLog.
|
// is correctly parsed into NVMeSmartLog.
|
||||||
func TestNVMeSmartLogUnmarshal(t *testing.T) {
|
func TestNVMeSmartLogUnmarshal(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
@@ -30,7 +30,7 @@ func TestNVMeSmartLogUnmarshal(t *testing.T) {
|
|||||||
"media_errors": "0",
|
"media_errors": "0",
|
||||||
"num_err_log_entries": "0"
|
"num_err_log_entries": "0"
|
||||||
}`
|
}`
|
||||||
var log nvmeSmartLog
|
var log NVMeSmartLog
|
||||||
if err := json.Unmarshal([]byte(raw), &log); err != nil {
|
if err := json.Unmarshal([]byte(raw), &log); err != nil {
|
||||||
t.Fatalf("json.Unmarshal failed: %v", err)
|
t.Fatalf("json.Unmarshal failed: %v", err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,248 @@
|
|||||||
|
package platform
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ConfidentialComputingStatus is the outcome of the Confidential Computing
// readiness check: what the NVIDIA driver reports via
// `nvidia-smi conf-compute -q`, what the host kernel shows about its CPU TEE
// (Intel TDX / AMD SEV-SNP), and the verdicts derived from both.
type ConfidentialComputingStatus struct {
	// CollectedAt is the UTC time at which the check was taken.
	CollectedAt time.Time `json:"collected_at"`

	// Values below come from `nvidia-smi conf-compute -q`; they stay empty
	// when the command is unavailable.
	NvidiaSMIAvailable bool   `json:"nvidia_smi_available"`
	CCState            string `json:"cc_state,omitempty"`            // ON / OFF
	MultiGPUMode       string `json:"multi_gpu_mode,omitempty"`      // Protected PCIe / ...
	CPUCCCapability    string `json:"cpu_cc_capability,omitempty"`   // e.g. "INTEL TDX", "AMD SEV-SNP", "NONE"
	GPUCCCapability    string `json:"gpu_cc_capability,omitempty"`   // e.g. "CC Capable", "Not Capable"
	CCGPUsReadyState   string `json:"cc_gpus_ready_state,omitempty"` // Ready / Not Ready

	// Host-side evidence that the CPU TEE is active in the running kernel
	// (BIOS + kernel cmdline + firmware). It is gathered independently of the
	// GPU driver and serves as a fallback when the NVIDIA driver is not
	// loaded.
	HostAMDSEVSupported   bool `json:"host_amd_sev_supported"`
	HostAMDSEVESSupported bool `json:"host_amd_sev_es_supported"`
	HostAMDSEVSNPActive   bool `json:"host_amd_sev_snp_active"`
	HostIntelTDXActive    bool `json:"host_intel_tdx_active"`

	// GPUCanRunCC reports that the GPU firmware declares itself CC-capable.
	GPUCanRunCC bool `json:"gpu_can_run_cc"`
	// CPUCanRunCC reports that the GPU driver or the host kernel sees an
	// active/available CPU TEE (SEV-SNP or TDX).
	CPUCanRunCC bool `json:"cpu_can_run_cc"`
	// Ready reports that both CPU and GPU support Confidential Computing; it
	// does not depend on CC mode being switched on right now.
	Ready bool `json:"ready"`

	// Notes holds free-form remarks, e.g. why a probe was unavailable.
	Notes []string `json:"notes,omitempty"`
}
|
||||||
|
|
||||||
|
// RunConfidentialComputingCheckPack runs a read-only check of whether this
|
||||||
|
// server can run NVIDIA Confidential Computing: it queries the GPU driver
|
||||||
|
// (`nvidia-smi conf-compute -q`) and inspects host kernel/dmesg evidence of
|
||||||
|
// AMD SEV-SNP / Intel TDX support. It changes nothing on the system.
|
||||||
|
func (s *System) RunConfidentialComputingCheckPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if ctx == nil {
|
||||||
|
ctx = context.Background()
|
||||||
|
}
|
||||||
|
if baseDir == "" {
|
||||||
|
baseDir = "/var/log/bee-sat"
|
||||||
|
}
|
||||||
|
ts := time.Now().UTC().Format("20060102-150405")
|
||||||
|
runDir := filepath.Join(baseDir, "confidential-computing-"+ts)
|
||||||
|
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||||
|
|
||||||
|
status := ConfidentialComputingStatus{CollectedAt: time.Now().UTC()}
|
||||||
|
|
||||||
|
// GPU firmware / driver state.
|
||||||
|
ccOut, ccErr := runSATCommandCtx(ctx, verboseLog, "nvidia-smi-conf-compute-q", []string{"nvidia-smi", "conf-compute", "-q"}, nil, logFunc)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "01-nvidia-smi-conf-compute-q.log"), ccOut, 0644)
|
||||||
|
if ccErr == nil {
|
||||||
|
status.NvidiaSMIAvailable = true
|
||||||
|
fields := parseConfComputeFields(ccOut)
|
||||||
|
status.CCState = fields["CC State"]
|
||||||
|
status.MultiGPUMode = fields["Multi-GPU Mode"]
|
||||||
|
status.CPUCCCapability = fields["CPU CC Capabilities"]
|
||||||
|
status.GPUCCCapability = fields["GPU CC Capabilities"]
|
||||||
|
status.CCGPUsReadyState = fields["CC GPUs Ready State"]
|
||||||
|
} else {
|
||||||
|
status.Notes = append(status.Notes, "nvidia-smi conf-compute -q unavailable (no NVIDIA driver, or GPU not present): "+firstLine(string(ccOut)))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Host kernel evidence, independent of the GPU driver.
|
||||||
|
dmesgOut, _ := runSATCommandCtx(ctx, verboseLog, "dmesg", []string{"dmesg"}, nil, nil)
|
||||||
|
ccDmesgLines := filterConfComputeDmesgLines(dmesgOut)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, "02-dmesg-cc-relevant.log"), []byte(strings.Join(ccDmesgLines, "\n")+"\n"), 0644)
|
||||||
|
|
||||||
|
lowerDmesg := strings.ToLower(strings.Join(ccDmesgLines, "\n"))
|
||||||
|
status.HostAMDSEVSNPActive = strings.Contains(lowerDmesg, "sev-snp enabled")
|
||||||
|
status.HostIntelTDXActive = strings.Contains(lowerDmesg, "tdx module") && strings.Contains(lowerDmesg, "module initialized") ||
|
||||||
|
strings.Contains(lowerDmesg, "virt/tdx: module initialized")
|
||||||
|
|
||||||
|
for i, path := range []string{
|
||||||
|
"/sys/module/kvm_amd/parameters/sev",
|
||||||
|
"/sys/module/kvm_amd/parameters/sev_es",
|
||||||
|
"/sys/module/kvm_amd/parameters/sev_snp",
|
||||||
|
} {
|
||||||
|
name := fmt.Sprintf("sysfs-%s", filepath.Base(path))
|
||||||
|
out, err := runSATCommandCtx(ctx, verboseLog, name, []string{"cat", path}, nil, nil)
|
||||||
|
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("03-%02d-%s.log", i+1, name)), out, 0644)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
val := strings.TrimSpace(string(out))
|
||||||
|
switch filepath.Base(path) {
|
||||||
|
case "sev":
|
||||||
|
status.HostAMDSEVSupported = strings.EqualFold(val, "Y")
|
||||||
|
case "sev_es":
|
||||||
|
status.HostAMDSEVESSupported = strings.EqualFold(val, "Y")
|
||||||
|
case "sev_snp":
|
||||||
|
if strings.EqualFold(val, "Y") {
|
||||||
|
status.HostAMDSEVSNPActive = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
status.GPUCanRunCC = strings.EqualFold(strings.TrimSpace(status.GPUCCCapability), "CC Capable")
|
||||||
|
cpuCapReported := strings.TrimSpace(status.CPUCCCapability)
|
||||||
|
status.CPUCanRunCC = status.HostAMDSEVSNPActive || status.HostIntelTDXActive ||
|
||||||
|
(cpuCapReported != "" && !strings.EqualFold(cpuCapReported, "NONE"))
|
||||||
|
status.Ready = status.CPUCanRunCC && status.GPUCanRunCC
|
||||||
|
|
||||||
|
if !status.NvidiaSMIAvailable {
|
||||||
|
status.Notes = append(status.Notes, "GPU CC capability unknown — install the NVIDIA driver to query it with `nvidia-smi conf-compute -q`.")
|
||||||
|
}
|
||||||
|
|
||||||
|
summary := renderConfidentialComputingSummary(status)
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
report := renderConfidentialComputingReport(status)
|
||||||
|
if err := os.WriteFile(filepath.Join(runDir, "confidential-computing-report.txt"), []byte(report), 0644); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
return runDir, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseConfComputeFields parses the indented "Key : Value" block emitted by
// `nvidia-smi conf-compute -q`, e.g.:
//
//	CC State                   : OFF
//	Multi-GPU Mode             : Protected PCIe
//	CPU CC Capabilities        : INTEL TDX
//	GPU CC Capabilities        : CC Capable
//	CC GPUs Ready State        : Not Ready
//
// Each line is split at its first ':'; key and value are whitespace-trimmed.
// Lines without a colon, or with an empty key or value, are ignored. If a key
// occurs more than once the last occurrence wins. The returned map is never
// nil.
func parseConfComputeFields(out []byte) map[string]string {
	fields := map[string]string{}
	for _, line := range strings.Split(string(out), "\n") {
		rawKey, rawVal, found := strings.Cut(line, ":")
		if !found {
			continue
		}
		key, val := strings.TrimSpace(rawKey), strings.TrimSpace(rawVal)
		if key == "" || val == "" {
			continue
		}
		fields[key] = val
	}
	return fields
}
|
||||||
|
|
||||||
|
// filterConfComputeDmesgLines picks out the dmesg lines that mention CPU
// Confidential Computing features (AMD SEV/SEV-ES/SEV-SNP, Intel TDX).
//
// The input is split on "\n" and a line is kept, unmodified, when its
// lower-cased text contains the substring "sev" or "tdx". Input order is
// preserved; the result is nil when nothing matches.
func filterConfComputeDmesgLines(dmesgOut []byte) []string {
	keywords := [...]string{"sev", "tdx"}

	var matched []string
	for _, entry := range bytes.Split(dmesgOut, []byte{'\n'}) {
		text := string(entry)
		folded := strings.ToLower(text)
		for _, kw := range keywords {
			if strings.Contains(folded, kw) {
				matched = append(matched, text)
				break
			}
		}
	}
	return matched
}
|
||||||
|
|
||||||
|
func renderConfidentialComputingSummary(status ConfidentialComputingStatus) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, "run_at_utc=%s\n", status.CollectedAt.Format(time.RFC3339))
|
||||||
|
fmt.Fprintf(&b, "nvidia_smi_available=%t\n", status.NvidiaSMIAvailable)
|
||||||
|
fmt.Fprintf(&b, "cc_state=%s\n", status.CCState)
|
||||||
|
fmt.Fprintf(&b, "multi_gpu_mode=%s\n", status.MultiGPUMode)
|
||||||
|
fmt.Fprintf(&b, "cpu_cc_capability=%s\n", status.CPUCCCapability)
|
||||||
|
fmt.Fprintf(&b, "gpu_cc_capability=%s\n", status.GPUCCCapability)
|
||||||
|
fmt.Fprintf(&b, "cc_gpus_ready_state=%s\n", status.CCGPUsReadyState)
|
||||||
|
fmt.Fprintf(&b, "host_amd_sev_supported=%t\n", status.HostAMDSEVSupported)
|
||||||
|
fmt.Fprintf(&b, "host_amd_sev_es_supported=%t\n", status.HostAMDSEVESSupported)
|
||||||
|
fmt.Fprintf(&b, "host_amd_sev_snp_active=%t\n", status.HostAMDSEVSNPActive)
|
||||||
|
fmt.Fprintf(&b, "host_intel_tdx_active=%t\n", status.HostIntelTDXActive)
|
||||||
|
fmt.Fprintf(&b, "cpu_can_run_cc=%t\n", status.CPUCanRunCC)
|
||||||
|
fmt.Fprintf(&b, "gpu_can_run_cc=%t\n", status.GPUCanRunCC)
|
||||||
|
fmt.Fprintf(&b, "ready=%t\n", status.Ready)
|
||||||
|
if status.Ready {
|
||||||
|
fmt.Fprintln(&b, "overall_status=OK")
|
||||||
|
} else {
|
||||||
|
fmt.Fprintln(&b, "overall_status=NOT_READY")
|
||||||
|
}
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderConfidentialComputingReport(status ConfidentialComputingStatus) string {
|
||||||
|
var b strings.Builder
|
||||||
|
line := strings.Repeat("=", 80)
|
||||||
|
b.WriteString(line + "\n")
|
||||||
|
b.WriteString("Confidential Computing Readiness\n")
|
||||||
|
b.WriteString(line + "\n\n")
|
||||||
|
|
||||||
|
verdict := "NOT READY"
|
||||||
|
if status.Ready {
|
||||||
|
verdict = "READY"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, "Verdict: %s\n\n", verdict)
|
||||||
|
|
||||||
|
b.WriteString("-- CPU ----------------------------------------------------------------------\n")
|
||||||
|
fmt.Fprintf(&b, " Reported by GPU driver : %s\n", nonEmptyOr(status.CPUCCCapability, "unknown"))
|
||||||
|
fmt.Fprintf(&b, " AMD SEV supported : %t\n", status.HostAMDSEVSupported)
|
||||||
|
fmt.Fprintf(&b, " AMD SEV-ES supported : %t\n", status.HostAMDSEVESSupported)
|
||||||
|
fmt.Fprintf(&b, " AMD SEV-SNP active : %t\n", status.HostAMDSEVSNPActive)
|
||||||
|
fmt.Fprintf(&b, " Intel TDX active : %t\n", status.HostIntelTDXActive)
|
||||||
|
fmt.Fprintf(&b, " Can run CC : %t\n\n", status.CPUCanRunCC)
|
||||||
|
|
||||||
|
b.WriteString("-- GPU ----------------------------------------------------------------------\n")
|
||||||
|
fmt.Fprintf(&b, " nvidia-smi available : %t\n", status.NvidiaSMIAvailable)
|
||||||
|
fmt.Fprintf(&b, " GPU CC Capabilities : %s\n", nonEmptyOr(status.GPUCCCapability, "unknown"))
|
||||||
|
fmt.Fprintf(&b, " CC State (current) : %s\n", nonEmptyOr(status.CCState, "unknown"))
|
||||||
|
fmt.Fprintf(&b, " Multi-GPU Mode : %s\n", nonEmptyOr(status.MultiGPUMode, "unknown"))
|
||||||
|
fmt.Fprintf(&b, " CC GPUs Ready State : %s\n", nonEmptyOr(status.CCGPUsReadyState, "unknown"))
|
||||||
|
fmt.Fprintf(&b, " Can run CC : %t\n\n", status.GPUCanRunCC)
|
||||||
|
|
||||||
|
if len(status.Notes) > 0 {
|
||||||
|
b.WriteString("-- Notes ----------------------------------------------------------------------\n")
|
||||||
|
for _, n := range status.Notes {
|
||||||
|
fmt.Fprintf(&b, " - %s\n", n)
|
||||||
|
}
|
||||||
|
b.WriteString("\n")
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, "Collected : %s\n", status.CollectedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||||
|
b.WriteString(line + "\n")
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
// nonEmptyOr returns v unchanged unless it is empty or consists only of
// whitespace, in which case fallback is returned instead.
func nonEmptyOr(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}
|
||||||
@@ -1259,7 +1259,7 @@ func storageSATCommands(devPath string, extended bool) []satJob {
|
|||||||
return jobs
|
return jobs
|
||||||
}
|
}
|
||||||
jobs := []satJob{
|
jobs := []satJob{
|
||||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", "-i", devPath}},
|
||||||
}
|
}
|
||||||
if extended {
|
if extended {
|
||||||
jobs = append(jobs, satJob{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}})
|
jobs = append(jobs, satJob{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}})
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bee/audit/internal/collector"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
@@ -39,65 +40,22 @@ func GenerateDiskReportText(index int, devPath string, outputs map[string][]byte
|
|||||||
|
|
||||||
// ── NVMe ─────────────────────────────────────────────────────────────────────
|
// ── NVMe ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
type nvmeIdCtrl struct {
|
|
||||||
ModelNumber string `json:"mn"`
|
|
||||||
SerialNumber string `json:"sn"`
|
|
||||||
Firmware string `json:"fr"`
|
|
||||||
TotalCap uint64 `json:"tnvmcap"`
|
|
||||||
NVMCap uint64 `json:"nvmcap"`
|
|
||||||
}
|
|
||||||
|
|
||||||
// nvmeU64 decodes a counter from nvme-cli JSON output into a uint64.
//
// Three encodings are accepted:
//   - a plain JSON number, e.g. 1234;
//   - a quoted decimal string, e.g. "1234";
//   - a {"lo":n,"hi":n} object, as some nvme-cli versions emit for 128-bit
//     counters. Only the low 64 bits are returned; "hi" is ignored.
//
// Empty input and anything that cannot be decoded yield 0.
func nvmeU64(raw json.RawMessage) uint64 {
	if len(raw) == 0 {
		return 0
	}

	var n uint64
	if json.Unmarshal(raw, &n) == nil {
		return n
	}

	// Quoted decimal: unquote, then decode the inner text as a JSON number.
	// Without this step a string-encoded counter would silently become 0.
	var quoted string
	if json.Unmarshal(raw, &quoted) == nil {
		var q uint64
		if json.Unmarshal([]byte(quoted), &q) == nil {
			return q
		}
		return 0
	}

	var obj struct {
		Lo uint64 `json:"lo"`
		Hi uint64 `json:"hi"`
	}
	if json.Unmarshal(raw, &obj) == nil {
		return obj.Lo
	}
	return 0
}
|
|
||||||
|
|
||||||
type nvmeSmartLogRaw struct {
|
|
||||||
CriticalWarning uint64 `json:"critical_warning"`
|
|
||||||
Temperature json.RawMessage `json:"temperature"`
|
|
||||||
AvailSpare uint64 `json:"avail_spare"`
|
|
||||||
SpareThresh uint64 `json:"spare_thresh"`
|
|
||||||
PercentUsed uint64 `json:"percent_used"`
|
|
||||||
DataUnitsRead json.RawMessage `json:"data_units_read"`
|
|
||||||
DataUnitsWritten json.RawMessage `json:"data_units_written"`
|
|
||||||
PowerCycles json.RawMessage `json:"power_cycles"`
|
|
||||||
PowerOnHours json.RawMessage `json:"power_on_hours"`
|
|
||||||
UnsafeShutdowns json.RawMessage `json:"unsafe_shutdowns"`
|
|
||||||
MediaErrors json.RawMessage `json:"media_errors"`
|
|
||||||
NumErrLogEntries json.RawMessage `json:"num_err_log_entries"`
|
|
||||||
}
|
|
||||||
|
|
||||||
func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
|
func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
|
||||||
// id-ctrl
|
// id-ctrl
|
||||||
var ctrl nvmeIdCtrl
|
var ctrl collector.NVMeIDCtrl
|
||||||
if data := outputs["nvme-id-ctrl"]; len(data) > 0 {
|
if data := outputs["nvme-id-ctrl"]; len(data) > 0 {
|
||||||
_ = json.Unmarshal(data, &ctrl)
|
_ = json.Unmarshal(data, &ctrl)
|
||||||
}
|
}
|
||||||
|
|
||||||
model := strings.TrimSpace(ctrl.ModelNumber)
|
model := strings.TrimSpace(ctrl.ModelNumber)
|
||||||
serial := strings.TrimSpace(ctrl.SerialNumber)
|
serial := strings.TrimSpace(ctrl.SerialNumber)
|
||||||
firmware := strings.TrimSpace(ctrl.Firmware)
|
firmware := strings.TrimSpace(ctrl.FirmwareRev)
|
||||||
|
|
||||||
capacityGB := ""
|
capacityGB := ""
|
||||||
if ctrl.TotalCap > 0 {
|
if ctrl.TotalCapacity > 0 {
|
||||||
capacityGB = formatCapacityGB(ctrl.TotalCap)
|
capacityGB = formatCapacityGB(uint64(ctrl.TotalCapacity))
|
||||||
} else if ctrl.NVMCap > 0 {
|
} else if ctrl.NVMCapacity > 0 {
|
||||||
capacityGB = formatCapacityGB(ctrl.NVMCap)
|
capacityGB = formatCapacityGB(uint64(ctrl.NVMCapacity))
|
||||||
}
|
}
|
||||||
|
|
||||||
writeField(b, "Model", model)
|
writeField(b, "Model", model)
|
||||||
@@ -113,67 +71,69 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
|
|||||||
b.WriteString("\n(no SMART data)\n")
|
b.WriteString("\n(no SMART data)\n")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
var sl nvmeSmartLogRaw
|
var sl collector.NVMeSmartLog
|
||||||
if err := json.Unmarshal(data, &sl); err != nil {
|
if err := json.Unmarshal(data, &sl); err != nil {
|
||||||
fmt.Fprintf(b, "\n(SMART parse error: %v)\n", err)
|
fmt.Fprintf(b, "\n(SMART parse error: %v)\n", err)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
tempK := nvmeU64(sl.Temperature)
|
tempC := int(sl.Temperature) - 273
|
||||||
tempC := int(tempK) - 273
|
|
||||||
if tempC < 0 {
|
if tempC < 0 {
|
||||||
tempC = 0
|
tempC = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
critWarn := sl.CriticalWarning
|
|
||||||
critWarnStr := "OK"
|
critWarnStr := "OK"
|
||||||
if critWarn != 0 {
|
if sl.CriticalWarning != 0 {
|
||||||
critWarnStr = fmt.Sprintf("0x%02X", critWarn)
|
critWarnStr = fmt.Sprintf("0x%02X", sl.CriticalWarning)
|
||||||
}
|
}
|
||||||
|
|
||||||
poh := nvmeU64(sl.PowerOnHours)
|
poh := uint64(sl.PowerOnHours)
|
||||||
pc := nvmeU64(sl.PowerCycles)
|
pc := uint64(sl.PowerCycles)
|
||||||
us := nvmeU64(sl.UnsafeShutdowns)
|
us := uint64(sl.UnsafeShutdowns)
|
||||||
me := nvmeU64(sl.MediaErrors)
|
me := uint64(sl.MediaErrors)
|
||||||
nel := nvmeU64(sl.NumErrLogEntries)
|
nel := uint64(sl.NumErrLogEntries)
|
||||||
|
|
||||||
// data_units are in 1000 × 512-byte sectors = 512,000 bytes each
|
// data_units are in 1000 × 512-byte sectors = 512,000 bytes each
|
||||||
dataRead := float64(nvmeU64(sl.DataUnitsRead)) * 512000 / 1e9
|
readBytes := uint64(sl.DataUnitsRead) * 512000
|
||||||
dataWritten := float64(nvmeU64(sl.DataUnitsWritten)) * 512000 / 1e9
|
writtenBytes := uint64(sl.DataUnitsWritten) * 512000
|
||||||
|
|
||||||
writeSectionHeader(b, "Health")
|
writeSectionHeader(b, "Health")
|
||||||
writeField(b, "Temperature", fmt.Sprintf("%d °C", tempC))
|
writeField(b, "Temperature", fmt.Sprintf("%d °C", tempC))
|
||||||
writeField(b, "Critical Warning", critWarnStr)
|
writeField(b, "Critical Warning", critWarnStr)
|
||||||
writeField(b, "Percentage Used", fmt.Sprintf("%d %%", sl.PercentUsed))
|
writeField(b, "Percentage Used", fmt.Sprintf("%d %%", sl.PercentageUsed))
|
||||||
writeField(b, "Available Spare", fmt.Sprintf("%d %% (threshold: %d %%)", sl.AvailSpare, sl.SpareThresh))
|
writeField(b, "Available Spare", fmt.Sprintf("%d %% (threshold: %d %%)", sl.AvailableSpare, sl.SpareThreshold))
|
||||||
|
|
||||||
writeSectionHeader(b, "Usage")
|
writeSectionHeader(b, "Usage")
|
||||||
writeField(b, "Power On Hours", fmt.Sprintf("%s h", formatUint(poh)))
|
writeField(b, "Power On Hours", fmt.Sprintf("%s h", formatUint(poh)))
|
||||||
writeField(b, "Power Cycles", formatUint(pc))
|
writeField(b, "Power Cycles", formatUint(pc))
|
||||||
writeField(b, "Unsafe Shutdowns", formatUint(us))
|
writeField(b, "Unsafe Shutdowns", formatUint(us))
|
||||||
writeField(b, "Data Written", fmt.Sprintf("%.1f GB", dataWritten))
|
writeField(b, "Data Written", formatBytesHuman(float64(writtenBytes)))
|
||||||
writeField(b, "Data Read", fmt.Sprintf("%.1f GB", dataRead))
|
writeField(b, "Data Read", formatBytesHuman(float64(readBytes)))
|
||||||
|
|
||||||
writeSectionHeader(b, "Errors")
|
writeSectionHeader(b, "Errors")
|
||||||
writeField(b, "Media Errors", formatUint(me))
|
writeField(b, "Media Errors", formatUint(me))
|
||||||
writeField(b, "Error Log Entries", formatUint(nel))
|
writeField(b, "Error Log Entries", formatUint(nel))
|
||||||
|
|
||||||
capacityBytes := ctrl.TotalCap
|
capacityBytes := uint64(ctrl.TotalCapacity)
|
||||||
if capacityBytes == 0 {
|
if capacityBytes == 0 {
|
||||||
capacityBytes = ctrl.NVMCap
|
capacityBytes = uint64(ctrl.NVMCapacity)
|
||||||
}
|
}
|
||||||
writeResourceSection(b, resourceInfo{
|
ri := resourceInfo{
|
||||||
powerOnHours: poh,
|
powerOnHours: poh,
|
||||||
writtenBytes: uint64(nvmeU64(sl.DataUnitsWritten)) * 512000,
|
powerCycles: pc,
|
||||||
readBytes: uint64(nvmeU64(sl.DataUnitsRead)) * 512000,
|
writtenBytes: writtenBytes,
|
||||||
|
readBytes: readBytes,
|
||||||
capacityBytes: capacityBytes,
|
capacityBytes: capacityBytes,
|
||||||
})
|
}
|
||||||
|
writeResourceSection(b, ri)
|
||||||
|
|
||||||
if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 {
|
if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 {
|
||||||
writeSectionHeader(b, "Self-Test")
|
writeSectionHeader(b, "Self-Test")
|
||||||
result := parseSelfTestResult(string(selfTest))
|
result := parseSelfTestResult(string(selfTest))
|
||||||
writeField(b, "Result", result)
|
writeField(b, "Result", result)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
writeConclusionSection(b, ri)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── SATA / SAS (smartctl) ────────────────────────────────────────────────────
|
// ── SATA / SAS (smartctl) ────────────────────────────────────────────────────
|
||||||
@@ -246,13 +206,15 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var poh, writtenLBAs, readLBAs uint64
|
var poh, pc, writtenLBAs, readLBAs uint64
|
||||||
var readValue int
|
var readValue int
|
||||||
hasReadValue := false
|
hasReadValue := false
|
||||||
for _, a := range attrs {
|
for _, a := range attrs {
|
||||||
switch a.ID {
|
switch a.ID {
|
||||||
case 9: // Power_On_Hours
|
case 9: // Power_On_Hours
|
||||||
poh = parseLeadingUint(a.Raw)
|
poh = parseLeadingUint(a.Raw)
|
||||||
|
case 12: // Power_Cycle_Count
|
||||||
|
pc = parseLeadingUint(a.Raw)
|
||||||
case 241: // Total_LBAs_Written
|
case 241: // Total_LBAs_Written
|
||||||
writtenLBAs = parseLeadingUint(a.Raw)
|
writtenLBAs = parseLeadingUint(a.Raw)
|
||||||
case 242: // Total_LBAs_Read
|
case 242: // Total_LBAs_Read
|
||||||
@@ -262,14 +224,16 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
const sataSectorBytes = 512
|
const sataSectorBytes = 512
|
||||||
writeResourceSection(b, resourceInfo{
|
ri := resourceInfo{
|
||||||
powerOnHours: poh,
|
powerOnHours: poh,
|
||||||
|
powerCycles: pc,
|
||||||
writtenBytes: writtenLBAs * sataSectorBytes,
|
writtenBytes: writtenLBAs * sataSectorBytes,
|
||||||
readBytes: readLBAs * sataSectorBytes,
|
readBytes: readLBAs * sataSectorBytes,
|
||||||
capacityBytes: capacityBytes,
|
capacityBytes: capacityBytes,
|
||||||
readPercent: 100 - readValue,
|
readPercent: 100 - readValue,
|
||||||
hasReadPercent: hasReadValue,
|
hasReadPercent: hasReadValue,
|
||||||
})
|
}
|
||||||
|
writeResourceSection(b, ri)
|
||||||
|
|
||||||
selfTest := outputs["smartctl-self-test-status"]
|
selfTest := outputs["smartctl-self-test-status"]
|
||||||
if len(selfTest) == 0 {
|
if len(selfTest) == 0 {
|
||||||
@@ -280,6 +244,8 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
|
|||||||
result := parseSelfTestResult(string(selfTest))
|
result := parseSelfTestResult(string(selfTest))
|
||||||
writeField(b, "Result", result)
|
writeField(b, "Result", result)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
writeConclusionSection(b, ri)
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseSMARTAttrs(text string) []smartAttr {
|
func parseSMARTAttrs(text string) []smartAttr {
|
||||||
@@ -375,6 +341,7 @@ const (
|
|||||||
|
|
||||||
type resourceInfo struct {
|
type resourceInfo struct {
|
||||||
powerOnHours uint64
|
powerOnHours uint64
|
||||||
|
powerCycles uint64
|
||||||
writtenBytes uint64
|
writtenBytes uint64
|
||||||
readBytes uint64
|
readBytes uint64
|
||||||
capacityBytes uint64
|
capacityBytes uint64
|
||||||
@@ -407,6 +374,70 @@ func writeResourceSection(b *strings.Builder, r resourceInfo) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Conclusion (new-vs-used verdict) ────────────────────────────────────────
|
||||||
|
|
||||||
|
// Thresholds for treating a drive as "new": less than one full drive-write
|
||||||
|
// (110% of capacity, headroom for provisioning/overprovisioning rounding),
|
||||||
|
// less than a bit over two full drive-reads (210% of capacity), under a
|
||||||
|
// week of power-on time, and under 30 power cycles. Any one violation is
|
||||||
|
// enough to call the drive used — these are deliberately loose bounds, not
|
||||||
|
// a wear/endurance judgment (see -- Resource -- for that).
|
||||||
|
const (
|
||||||
|
newDiskMaxWrittenFrac = 1.10
|
||||||
|
newDiskMaxReadFrac = 2.10
|
||||||
|
newDiskMaxUptimeHours = 7 * 24
|
||||||
|
newDiskMaxPowerCycles = 30
|
||||||
|
)
|
||||||
|
|
||||||
|
func writeConclusionSection(b *strings.Builder, r resourceInfo) {
|
||||||
|
writeSectionHeader(b, "Conclusion")
|
||||||
|
|
||||||
|
var reasons, notes []string
|
||||||
|
isNew := true
|
||||||
|
|
||||||
|
if r.capacityBytes > 0 {
|
||||||
|
writtenFrac := float64(r.writtenBytes) / float64(r.capacityBytes)
|
||||||
|
readFrac := float64(r.readBytes) / float64(r.capacityBytes)
|
||||||
|
if writtenFrac >= newDiskMaxWrittenFrac {
|
||||||
|
isNew = false
|
||||||
|
reasons = append(reasons, fmt.Sprintf(
|
||||||
|
"data written %s (%s of capacity)",
|
||||||
|
formatBytesHuman(float64(r.writtenBytes)), formatPercent(writtenFrac*100)))
|
||||||
|
}
|
||||||
|
if readFrac >= newDiskMaxReadFrac {
|
||||||
|
isNew = false
|
||||||
|
reasons = append(reasons, fmt.Sprintf(
|
||||||
|
"data read %s (%s of capacity)",
|
||||||
|
formatBytesHuman(float64(r.readBytes)), formatPercent(readFrac*100)))
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
notes = append(notes, "capacity unknown — write/read criteria not evaluated")
|
||||||
|
}
|
||||||
|
|
||||||
|
if r.powerOnHours >= newDiskMaxUptimeHours {
|
||||||
|
isNew = false
|
||||||
|
reasons = append(reasons, fmt.Sprintf("uptime %s", formatHoursHuman(r.powerOnHours)))
|
||||||
|
}
|
||||||
|
|
||||||
|
if r.powerCycles >= newDiskMaxPowerCycles {
|
||||||
|
isNew = false
|
||||||
|
reasons = append(reasons, fmt.Sprintf("power cycles %s", formatUint(r.powerCycles)))
|
||||||
|
}
|
||||||
|
|
||||||
|
if isNew {
|
||||||
|
writeField(b, "Disk Condition", "NEW")
|
||||||
|
} else {
|
||||||
|
writeField(b, "Disk Condition", "USED")
|
||||||
|
b.WriteString(" Reason:\n")
|
||||||
|
for _, reason := range reasons {
|
||||||
|
fmt.Fprintf(b, " - %s\n", reason)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, note := range notes {
|
||||||
|
fmt.Fprintf(b, " Note: %s\n", note)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// progressBar renders a fixed-width pseudographic bar, e.g. "[######------]".
|
// progressBar renders a fixed-width pseudographic bar, e.g. "[######------]".
|
||||||
func progressBar(frac float64, width int) string {
|
func progressBar(frac float64, width int) string {
|
||||||
if math.IsNaN(frac) || frac < 0 {
|
if math.IsNaN(frac) || frac < 0 {
|
||||||
|
|||||||
@@ -83,7 +83,36 @@ func TestGenerateDiskReportNVMe(t *testing.T) {
|
|||||||
assertContains(t, report, "1,234 h") // power_on_hours with separator
|
assertContains(t, report, "1,234 h") // power_on_hours with separator
|
||||||
assertContains(t, report, "32") // power_cycles
|
assertContains(t, report, "32") // power_cycles
|
||||||
assertContains(t, report, "3") // unsafe_shutdowns
|
assertContains(t, report, "3") // unsafe_shutdowns
|
||||||
assertContains(t, report, "378.0 GB") // data_units_written * 512000 / 1e9
|
assertContains(t, report, "378.00 GB") // data_units_written * 512000, human-scaled
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGenerateDiskReportNVMeDataUnitsScaleToTB verifies that heavy write/read
|
||||||
|
// counters render in the "-- Usage --" section as TB/PB, not raw GB, matching
|
||||||
|
// the "-- Resource --" section which already used formatBytesHuman.
|
||||||
|
func TestGenerateDiskReportNVMeDataUnitsScaleToTB(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
heavy := []byte(`{
|
||||||
|
"critical_warning": 0,
|
||||||
|
"temperature": 307,
|
||||||
|
"avail_spare": 100,
|
||||||
|
"spare_thresh": 10,
|
||||||
|
"percent_used": 0,
|
||||||
|
"data_units_read": "252420478",
|
||||||
|
"data_units_written": "103834055",
|
||||||
|
"power_cycles": "45",
|
||||||
|
"power_on_hours": "45",
|
||||||
|
"unsafe_shutdowns": "35",
|
||||||
|
"media_errors": "0",
|
||||||
|
"num_err_log_entries": "0"
|
||||||
|
}`)
|
||||||
|
outputs := map[string][]byte{
|
||||||
|
"nvme-id-ctrl": testNVMeIdCtrl,
|
||||||
|
"nvme-smart-log": heavy,
|
||||||
|
}
|
||||||
|
report := GenerateDiskReportText(1, "/dev/nvme0n1", outputs, time.Unix(0, 0).UTC())
|
||||||
|
|
||||||
|
assertContains(t, report, "Data Written : 53.16 TB")
|
||||||
|
assertContains(t, report, "Data Read : 129.24 TB")
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestGenerateDiskReportNVMeLoHi(t *testing.T) {
|
func TestGenerateDiskReportNVMeLoHi(t *testing.T) {
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ var techDumpNvidiaCommands = []struct {
|
|||||||
}{
|
}{
|
||||||
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
|
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
|
||||||
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
|
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
|
||||||
|
{Name: "nvidia-smi", Args: []string{"conf-compute", "-q"}, File: "nvidia-smi-conf-compute-q.txt"},
|
||||||
}
|
}
|
||||||
|
|
||||||
type lsblkDumpRoot struct {
|
type lsblkDumpRoot struct {
|
||||||
|
|||||||
@@ -135,7 +135,7 @@ func defaultTaskPriority(target string, params taskParams) int {
|
|||||||
return taskPriorityBurn
|
return taskPriorityBurn
|
||||||
case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
|
case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
|
||||||
"nvidia-interconnect", "nvidia-bandwidth", "memory", "storage", "cpu",
|
"nvidia-interconnect", "nvidia-bandwidth", "memory", "storage", "cpu",
|
||||||
"amd", "amd-mem", "amd-bandwidth":
|
"amd", "amd-mem", "amd-bandwidth", "confidential-computing":
|
||||||
if params.StressMode {
|
if params.StressMode {
|
||||||
return taskPriorityValidateStress
|
return taskPriorityValidateStress
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -676,6 +676,12 @@ func renderCheck(opts HandlerOptions) string {
|
|||||||
`<code>lsblk</code>; NVMe: <code>nvme id-ctrl</code>, <code>nvme smart-log</code>; SATA/SAS: <code>smartctl -H -A</code>`,
|
`<code>lsblk</code>; NVMe: <code>nvme id-ctrl</code>, <code>nvme smart-log</code>; SATA/SAS: <code>smartctl -H -A</code>`,
|
||||||
`Seconds — instantaneous device query, no wear counters incremented.`,
|
`Seconds — instantaneous device query, no wear counters incremented.`,
|
||||||
)) +
|
)) +
|
||||||
|
renderSATCard("confidential-computing", "Confidential Computing", "runSAT('confidential-computing')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Checks whether this server can run NVIDIA Confidential Computing: CPU TEE support (Intel TDX / AMD SEV-SNP) and GPU firmware CC capability. Read-only — changes nothing.`,
|
||||||
|
`<code>nvidia-smi conf-compute -q</code>, <code>dmesg</code>, <code>/sys/module/kvm_amd/parameters/*</code>`,
|
||||||
|
`Seconds — read-only query only.`,
|
||||||
|
)) +
|
||||||
`</div>
|
`</div>
|
||||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
<div class="card" style="margin-bottom:16px">
|
<div class="card" style="margin-bottom:16px">
|
||||||
@@ -737,7 +743,7 @@ func renderCheck(opts HandlerOptions) string {
|
|||||||
<script>
|
<script>
|
||||||
let satES = null;
|
let satES = null;
|
||||||
function satLabels() {
|
function satLabels() {
|
||||||
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth', 'confidential-computing':'Check Confidential Computing'};
|
||||||
}
|
}
|
||||||
let satNvidiaGPUsPromise = null;
|
let satNvidiaGPUsPromise = null;
|
||||||
function loadSatNvidiaGPUs() {
|
function loadSatNvidiaGPUs() {
|
||||||
@@ -873,7 +879,7 @@ function runAllCheckSAT() {
|
|||||||
status.textContent = 'Enqueuing...';
|
status.textContent = 'Enqueuing...';
|
||||||
const nvidiaIndices = satSelectedGPUIndices();
|
const nvidiaIndices = satSelectedGPUIndices();
|
||||||
const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||||
const baseTargets = ['cpu', 'memory', 'storage'];
|
const baseTargets = ['cpu', 'memory', 'storage', 'confidential-computing'];
|
||||||
const amdTargets = selectedAMDValidateTargets();
|
const amdTargets = selectedAMDValidateTargets();
|
||||||
const expanded = [];
|
const expanded = [];
|
||||||
baseTargets.forEach(t => expanded.push({target: t}));
|
baseTargets.forEach(t => expanded.push({target: t}));
|
||||||
|
|||||||
@@ -264,6 +264,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
|
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
|
||||||
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
||||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||||
|
mux.HandleFunc("POST /api/sat/confidential-computing/run", h.handleAPISATRun("confidential-computing"))
|
||||||
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
||||||
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
|
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
|
||||||
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
|
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
|
||||||
|
|||||||
@@ -272,6 +272,12 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||||
|
case "confidential-computing":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runConfidentialComputingCheckPackCtx(a, ctx, "", j.append)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ var taskNames = map[string]string{
|
|||||||
"nvidia-stress": "NVIDIA GPU Stress",
|
"nvidia-stress": "NVIDIA GPU Stress",
|
||||||
"memory": "Memory SAT",
|
"memory": "Memory SAT",
|
||||||
"storage": "Storage SAT",
|
"storage": "Storage SAT",
|
||||||
|
"confidential-computing": "Confidential Computing Check",
|
||||||
"cpu": "CPU SAT",
|
"cpu": "CPU SAT",
|
||||||
"amd": "AMD GPU SAT",
|
"amd": "AMD GPU SAT",
|
||||||
"amd-mem": "AMD GPU MEM Integrity",
|
"amd-mem": "AMD GPU MEM Integrity",
|
||||||
@@ -312,6 +313,9 @@ var (
|
|||||||
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
|
return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
|
||||||
}
|
}
|
||||||
|
runConfidentialComputingCheckPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunConfidentialComputingCheckPackCtx(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
||||||
}
|
}
|
||||||
@@ -1025,6 +1029,12 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||||
|
case "confidential-computing":
|
||||||
|
if a == nil {
|
||||||
|
err = fmt.Errorf("app not configured")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
archive, err = runConfidentialComputingCheckPackCtx(a, ctx, "", j.append)
|
||||||
case "cpu":
|
case "cpu":
|
||||||
if a == nil {
|
if a == nil {
|
||||||
err = fmt.Errorf("app not configured")
|
err = fmt.Errorf("app not configured")
|
||||||
|
|||||||
Reference in New Issue
Block a user