Compare commits

..

5 Commits

Author SHA1 Message Date
Mikhail Chusavitin abaeaea13f add Confidential Computing readiness check + collect nvidia-smi conf-compute -q
New read-only "Check" step reports whether this server can run NVIDIA
Confidential Computing: CPU TEE support (Intel TDX / AMD SEV-SNP, via
dmesg and kvm_amd sysfs params) and GPU firmware CC capability (via
`nvidia-smi conf-compute -q`). Also collect that command's output into
the techdump export bundle.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
2026-07-02 19:18:41 +03:00
mchus 5b98005d5d storage report: add new-vs-used disk verdict, human-readable data units, collect smartctl -i
Disk report now ends with a Conclusion section judging a drive NEW/USED
against loose thresholds (<110% capacity written, <210% read, <7d
uptime, <30 power cycles), listing which ones tripped. Data
Written/Read in the Usage section now scale to TB/PB via
formatBytesHuman instead of always printing raw GB. storageSATCommands
now runs smartctl with -i so SATA/SAS reports get Model/Serial/
Firmware/Capacity, which the Conclusion needs to evaluate the
write/read criteria (previously only -H -A was collected).

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
2026-07-02 12:14:20 +03:00
mchus 33bc275da2 storage SAT: fix NVMe SMART counters showing 0 for power-on hours/read/write
nvme-cli emits large 64-bit counters as JSON-quoted strings on some
versions; the disk-report text generator only handled bare numbers and
{lo,hi} objects, so power_on_hours/data_units_read/data_units_written
etc. silently parsed as 0 while the structured collector path already
handled this correctly. Unify both paths on a single exported
JSONInt64/NVMeSmartLog/NVMeIDCtrl type in collector/storage.go instead
of keeping two independent nvme-cli JSON parsers in sync.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
2026-07-02 11:58:37 +03:00
Mikhail Chusavitin 11ea640626 power ramp: fix missing step-1 GPU telemetry, add GPU/server power breakdown
Ramp Sequence table's Run 1 row showed "—" for GPU power because the
step-1 fast path (reusing single-card calibration) never populated
PerGPUTelemetry like steps 2+ do. Also add GPU total W / Server itself W
columns and an idle baseline row so server-vs-GPU consumption is visible
per ramp step.
2026-07-01 17:39:58 +03:00
Mikhail Chusavitin 796acdfec1 ipmi fru: add Asset Tag and vendor Extra field write support (in-band)
Product Asset Tag (p 5) and the repeated custom "Extra" fields (Product
Extra p 7, Board Extra b 5/6/7, Chassis Extra c 2/3) from the Inspur FRU
field doc weren't writable — ipmitool prints identically-named lines for
each custom field with no index of its own, so a plain name lookup
couldn't tell them apart. parseFRUOutput now counts occurrences per area
to recover the real index, and the existing area/index round-trip in the
FRU editor write path picks it up automatically. Out-of-band (-H/-U/-P)
writing remains out of scope.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
2026-07-01 17:21:26 +03:00
20 changed files with 614 additions and 153 deletions
+1
View File
@@ -134,6 +134,7 @@ type satRunner interface {
ResetNvidiaGPU(index int) (string, error)
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
RunConfidentialComputingCheckPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
DetectGPUVendor() string
+16
View File
@@ -206,6 +206,22 @@ func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, erro
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
}
// RunConfidentialComputingCheckPackCtx runs the Confidential Computing
// readiness check through the configured SAT runner. A blank or
// whitespace-only baseDir is replaced with DefaultSATBaseDir; ctx and logFunc
// are passed through unchanged. It returns whatever the runner returns.
func (a *App) RunConfidentialComputingCheckPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
	dir := baseDir
	if strings.TrimSpace(dir) == "" {
		dir = DefaultSATBaseDir
	}
	return a.sat.RunConfidentialComputingCheckPack(ctx, dir, logFunc)
}
// RunConfidentialComputingCheckPack is the context-free variant of
// RunConfidentialComputingCheckPackCtx: it runs the same check with
// context.Background().
func (a *App) RunConfidentialComputingCheckPack(baseDir string, logFunc func(string)) (string, error) {
	ctx := context.Background()
	return a.RunConfidentialComputingCheckPackCtx(ctx, baseDir, logFunc)
}
// RunConfidentialComputingCheckPackResult runs the Confidential Computing
// check without a log callback and wraps the outcome in an ActionResult
// titled "Confidential Computing Check". The body is built from the returned
// path via satResultBody; the check's error is returned alongside the result
// rather than replacing it.
func (a *App) RunConfidentialComputingCheckPackResult(baseDir string) (ActionResult, error) {
	path, err := a.RunConfidentialComputingCheckPack(baseDir, nil)
	result := ActionResult{
		Title: "Confidential Computing Check",
		Body:  satResultBody(path),
	}
	return result, err
}
func (a *App) DetectGPUVendor() string {
return a.sat.DetectGPUVendor()
}
+4
View File
@@ -243,6 +243,10 @@ func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ b
return f.runStorageFn(baseDir)
}
// RunConfidentialComputingCheckPack is a no-op stub: it ignores every
// argument and always returns an empty path with a nil error.
func (f fakeSAT) RunConfidentialComputingCheckPack(_ context.Context, baseDir string, _ func(string)) (string, error) {
return "", nil
}
func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
if f.runCPUFn != nil {
return f.runCPUFn(baseDir, durationSec)
+1 -1
View File
@@ -766,7 +766,7 @@ func parseMDAdmPlatformLicense(raw string) *string {
func queryDeviceSerial(devPath string) string {
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
var ctrl nvmeIDCtrl
var ctrl NVMeIDCtrl
if json.Unmarshal(out, &ctrl) == nil {
if v := cleanDMIValue(strings.TrimSpace(ctrl.SerialNumber)); v != "" {
return v
+57 -42
View File
@@ -84,16 +84,19 @@ func collectStorage() []schema.HardwareStorage {
return result
}
// jsonInt64 accepts both a bare JSON number and a JSON-quoted number string.
// lsblk -J emits LOG-SEC / PHY-SEC as integers on util-linux 2.37 (Debian 12)
// but older versions emit them as strings. This type handles both.
type jsonInt64 int64
// JSONInt64 accepts a bare JSON number (512), a JSON-quoted number string
// ("512" — lsblk -J on util-linux < 2.37, and nvme-cli for large 64-bit
// counters that would lose precision as JS numbers), or a {"lo":n,"hi":n}
// object (128-bit NVMe counters on some nvme-cli versions; hi is ignored as
// no real counter exceeds 64 bits). Shared by lsblk and nvme-cli JSON output
// across the collector and the human-readable disk report.
type JSONInt64 int64
func (j *jsonInt64) UnmarshalJSON(data []byte) error {
func (j *JSONInt64) UnmarshalJSON(data []byte) error {
// bare number: 512
var n int64
if err := json.Unmarshal(data, &n); err == nil {
*j = jsonInt64(n)
*j = JSONInt64(n)
return nil
}
// quoted string: "512"
@@ -101,24 +104,32 @@ func (j *jsonInt64) UnmarshalJSON(data []byte) error {
if err := json.Unmarshal(data, &s); err == nil {
n, err := strconv.ParseInt(strings.TrimSpace(s), 10, 64)
if err == nil {
*j = jsonInt64(n)
*j = JSONInt64(n)
}
return nil
}
// {"lo":n,"hi":n} 128-bit counter object
var obj struct {
Lo int64 `json:"lo"`
}
if err := json.Unmarshal(data, &obj); err == nil {
*j = JSONInt64(obj.Lo)
return nil
}
return nil // null or unexpected type — leave zero
}
// lsblkDevice is a minimal lsblk JSON record.
type lsblkDevice struct {
Name string `json:"name"`
Type string `json:"type"`
Size string `json:"size"`
Serial string `json:"serial"`
Model string `json:"model"`
Tran string `json:"tran"`
Hctl string `json:"hctl"`
LogSec jsonInt64 `json:"log-sec"`
PhySec jsonInt64 `json:"phy-sec"`
Name string `json:"name"`
Type string `json:"type"`
Size string `json:"size"`
Serial string `json:"serial"`
Model string `json:"model"`
Tran string `json:"tran"`
Hctl string `json:"hctl"`
LogSec JSONInt64 `json:"log-sec"`
PhySec JSONInt64 `json:"phy-sec"`
}
type lsblkRoot struct {
@@ -423,32 +434,36 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
return s
}
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
// nvme-cli emits most counters as JSON strings (e.g. "power_on_hours":"49"),
// so all numeric fields use jsonInt64 which accepts both bare numbers and
// quoted strings. Field names match nvme-cli JSON output, not NVMe spec prose.
type nvmeSmartLog struct {
CriticalWarning jsonInt64 `json:"critical_warning"`
PercentageUsed jsonInt64 `json:"percent_used"`
AvailableSpare jsonInt64 `json:"avail_spare"`
SpareThreshold jsonInt64 `json:"spare_thresh"`
Temperature jsonInt64 `json:"temperature"`
PowerOnHours jsonInt64 `json:"power_on_hours"`
PowerCycles jsonInt64 `json:"power_cycles"`
UnsafeShutdowns jsonInt64 `json:"unsafe_shutdowns"`
DataUnitsRead jsonInt64 `json:"data_units_read"`
DataUnitsWritten jsonInt64 `json:"data_units_written"`
ControllerBusy jsonInt64 `json:"controller_busy_time"`
MediaErrors jsonInt64 `json:"media_errors"`
NumErrLogEntries jsonInt64 `json:"num_err_log_entries"`
// NVMeSmartLog is the subset of `nvme smart-log -o json` output shared by the
// structured collector and the human-readable disk report. nvme-cli emits
// most counters as JSON strings (e.g. "power_on_hours":"49") or, on some
// versions, as {"lo":n,"hi":n} objects — all numeric fields use JSONInt64,
// which accepts bare numbers, quoted strings, and lo/hi objects. Field names
// match nvme-cli JSON output, not NVMe spec prose.
type NVMeSmartLog struct {
CriticalWarning JSONInt64 `json:"critical_warning"`
PercentageUsed JSONInt64 `json:"percent_used"`
AvailableSpare JSONInt64 `json:"avail_spare"`
SpareThreshold JSONInt64 `json:"spare_thresh"`
Temperature JSONInt64 `json:"temperature"`
PowerOnHours JSONInt64 `json:"power_on_hours"`
PowerCycles JSONInt64 `json:"power_cycles"`
UnsafeShutdowns JSONInt64 `json:"unsafe_shutdowns"`
DataUnitsRead JSONInt64 `json:"data_units_read"`
DataUnitsWritten JSONInt64 `json:"data_units_written"`
ControllerBusy JSONInt64 `json:"controller_busy_time"`
MediaErrors JSONInt64 `json:"media_errors"`
NumErrLogEntries JSONInt64 `json:"num_err_log_entries"`
}
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
type nvmeIDCtrl struct {
ModelNumber string `json:"mn"`
SerialNumber string `json:"sn"`
FirmwareRev string `json:"fr"`
TotalCapacity int64 `json:"tnvmcap"`
// NVMeIDCtrl is the subset of `nvme id-ctrl -o json` output shared by the
// structured collector and the human-readable disk report.
type NVMeIDCtrl struct {
ModelNumber string `json:"mn"`
SerialNumber string `json:"sn"`
FirmwareRev string `json:"fr"`
TotalCapacity JSONInt64 `json:"tnvmcap"`
NVMCapacity JSONInt64 `json:"nvmcap"`
}
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
@@ -481,7 +496,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
// id-ctrl: model, serial, firmware, capacity
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
var ctrl nvmeIDCtrl
var ctrl NVMeIDCtrl
if json.Unmarshal(out, &ctrl) == nil {
if v := cleanDMIValue(strings.TrimSpace(ctrl.ModelNumber)); v != "" {
s.Model = &v
@@ -502,7 +517,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
// smart-log: wear telemetry
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
var log nvmeSmartLog
var log NVMeSmartLog
if json.Unmarshal(out, &log) == nil {
if log.PowerOnHours > 0 {
v := int64(log.PowerOnHours)
@@ -56,7 +56,7 @@ func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
{`null`, 0},
}
for _, tc := range cases {
var v jsonInt64
var v JSONInt64
if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
}
@@ -9,7 +9,7 @@ import (
// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most
// counters are quoted strings and field names differ from NVMe spec prose)
// is correctly parsed into nvmeSmartLog.
// is correctly parsed into NVMeSmartLog.
func TestNVMeSmartLogUnmarshal(t *testing.T) {
t.Parallel()
@@ -30,7 +30,7 @@ func TestNVMeSmartLogUnmarshal(t *testing.T) {
"media_errors": "0",
"num_err_log_entries": "0"
}`
var log nvmeSmartLog
var log NVMeSmartLog
if err := json.Unmarshal([]byte(raw), &log); err != nil {
t.Fatalf("json.Unmarshal failed: %v", err)
}
+24 -3
View File
@@ -4008,14 +4008,23 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
idleW = result.ServerPower.IdleW
}
// Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff.
// Build header: Run | GPU 0 | GPU 1 | ... | GPU total W | Server itself W | Server wall W | Per GPU wall W | Platform eff.
headers := []string{"Run"}
for _, idx := range allGPUIndices {
headers = append(headers, fmt.Sprintf("GPU %d W", idx))
}
headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.")
headers = append(headers, "GPU total W", "Server itself W", "Server wall W", "Per GPU wall W", "Platform eff.")
var rampRows [][]string
if idleW > 0 {
idleRow := []string{"0 (idle)"}
for range allGPUIndices {
idleRow = append(idleRow, "—")
}
// No load: GPU total is negligible, all draw is the server's own baseline.
idleRow = append(idleRow, "—", fmt.Sprintf("%.0f", idleW), fmt.Sprintf("%.0f", idleW), "—", "—")
rampRows = append(rampRows, idleRow)
}
for _, step := range result.RampSteps {
row := []string{fmt.Sprintf("%d", step.StepIndex)}
for _, idx := range allGPUIndices {
@@ -4036,6 +4045,16 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
row = append(row, gpuPwr)
}
// GPU total W = sum of observed GPU power (nvidia-smi)
gpuTotal := "—"
if step.TotalObservedPowerW > 0 {
gpuTotal = fmt.Sprintf("%.0f", step.TotalObservedPowerW)
}
// Server itself W = server wall power minus GPU total (non-GPU baseline draw)
serverItself := "—"
if step.ServerLoadedW > 0 && step.TotalObservedPowerW > 0 {
serverItself = fmt.Sprintf("%.0f", step.ServerLoadedW-step.TotalObservedPowerW)
}
// Server wall W
serverWall := "—"
if step.ServerLoadedW > 0 {
@@ -4055,7 +4074,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
}
platEff = fmt.Sprintf("%.2f", eff)
}
row = append(row, serverWall, perGPUWall, platEff)
row = append(row, gpuTotal, serverItself, serverWall, perGPUWall, platEff)
rampRows = append(rampRows, row)
}
b.WriteString(fmtMDTable(headers, rampRows))
@@ -4617,6 +4636,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
ramp.AvgFanRPM = singleRun.AvgFanRPM
ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
}
firstSummary := firstCalib.Summary
ramp.PerGPUTelemetry = map[int]*BenchmarkTelemetrySummary{firstIdx: &firstSummary}
if !firstCalib.Completed {
ramp.Status = "FAILED"
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
@@ -0,0 +1,248 @@
package platform
import (
"bytes"
"context"
"fmt"
"os"
"path/filepath"
"strings"
"time"
)
// ConfidentialComputingStatus summarizes whether this server can run NVIDIA
// Confidential Computing: CPU-side TEE support (Intel TDX / AMD SEV-SNP) and
// GPU firmware CC capability, as reported by `nvidia-smi conf-compute -q`.
// It is populated by RunConfidentialComputingCheckPack and rendered into that
// run's summary.txt and confidential-computing-report.txt.
type ConfidentialComputingStatus struct {
// CollectedAt is when the check ran; RunConfidentialComputingCheckPack
// stores it in UTC.
CollectedAt time.Time `json:"collected_at"`
// GPU-reported fields, parsed from `nvidia-smi conf-compute -q`.
// NvidiaSMIAvailable is true only when that command exited without error;
// the string fields below stay empty when it did not, or when the
// corresponding "Key : Value" line was missing from its output.
NvidiaSMIAvailable bool `json:"nvidia_smi_available"`
CCState string `json:"cc_state,omitempty"` // ON / OFF
MultiGPUMode string `json:"multi_gpu_mode,omitempty"` // Protected PCIe / ...
CPUCCCapability string `json:"cpu_cc_capability,omitempty"` // e.g. "INTEL TDX", "AMD SEV-SNP", "NONE"
GPUCCCapability string `json:"gpu_cc_capability,omitempty"` // e.g. "CC Capable", "Not Capable"
CCGPUsReadyState string `json:"cc_gpus_ready_state,omitempty"` // Ready / Not Ready
// Host-side evidence that the CPU's TEE is actually active in the running
// kernel (BIOS + kernel cmdline + firmware), independent of what the GPU
// driver reports. It is gathered from dmesg and the kvm_amd module
// parameters in sysfs, and is combined with (not only a fallback for) the
// driver-reported CPU capability when computing CPUCanRunCC.
HostAMDSEVSupported bool `json:"host_amd_sev_supported"`
HostAMDSEVESSupported bool `json:"host_amd_sev_es_supported"`
HostAMDSEVSNPActive bool `json:"host_amd_sev_snp_active"`
HostIntelTDXActive bool `json:"host_intel_tdx_active"`
// GPUCanRunCC is true when the GPU firmware reports CC-capable, i.e.
// GPUCCCapability equals "CC Capable" (compared case-insensitively).
GPUCanRunCC bool `json:"gpu_can_run_cc"`
// CPUCanRunCC is true when either the GPU driver or the host kernel
// reports an active/available CPU TEE (SEV-SNP or TDX): HostAMDSEVSNPActive,
// HostIntelTDXActive, or a non-empty CPUCCCapability other than "NONE".
CPUCanRunCC bool `json:"cpu_can_run_cc"`
// Ready is true when both the CPU and the GPU support Confidential
// Computing (CPUCanRunCC && GPUCanRunCC), regardless of whether CC mode is
// currently enabled.
Ready bool `json:"ready"`
// Notes carries human-readable remarks about data that could not be
// collected; it is omitted from JSON when empty.
Notes []string `json:"notes,omitempty"`
}
// RunConfidentialComputingCheckPack runs a read-only check of whether this
// server can run NVIDIA Confidential Computing: it queries the GPU driver
// (`nvidia-smi conf-compute -q`) and inspects host kernel/dmesg evidence of
// AMD SEV-SNP / Intel TDX support. It changes nothing on the system apart
// from writing its own result files.
//
// A nil ctx is replaced with context.Background() and an empty baseDir with
// "/var/log/bee-sat". Results go to a new directory
// "<baseDir>/confidential-computing-<UTC timestamp>" containing the raw
// command logs, summary.txt and confidential-computing-report.txt. logFunc is
// only forwarded to the nvidia-smi command.
//
// It returns the run directory. An error is returned only when the run
// directory, summary.txt or the report cannot be written; failing probe
// commands are recorded in the status (and its Notes) instead.
func (s *System) RunConfidentialComputingCheckPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
	if ctx == nil {
		ctx = context.Background()
	}
	if baseDir == "" {
		baseDir = "/var/log/bee-sat"
	}
	// One timestamp for both the directory name and CollectedAt so the two
	// can never disagree.
	now := time.Now().UTC()
	runDir := filepath.Join(baseDir, "confidential-computing-"+now.Format("20060102-150405"))
	if err := os.MkdirAll(runDir, 0755); err != nil {
		return "", err
	}
	verboseLog := filepath.Join(runDir, "verbose.log")

	status := ConfidentialComputingStatus{CollectedAt: now}

	// GPU firmware / driver state.
	ccOut, ccErr := runSATCommandCtx(ctx, verboseLog, "nvidia-smi-conf-compute-q", []string{"nvidia-smi", "conf-compute", "-q"}, nil, logFunc)
	// Raw command logs are best-effort evidence: a failed write must not
	// abort the check, so the error is deliberately ignored here and below.
	_ = os.WriteFile(filepath.Join(runDir, "01-nvidia-smi-conf-compute-q.log"), ccOut, 0644)
	if ccErr == nil {
		status.NvidiaSMIAvailable = true
		fields := parseConfComputeFields(ccOut)
		status.CCState = fields["CC State"]
		status.MultiGPUMode = fields["Multi-GPU Mode"]
		status.CPUCCCapability = fields["CPU CC Capabilities"]
		status.GPUCCCapability = fields["GPU CC Capabilities"]
		status.CCGPUsReadyState = fields["CC GPUs Ready State"]
	} else {
		status.Notes = append(status.Notes, "nvidia-smi conf-compute -q unavailable (no NVIDIA driver, or GPU not present): "+firstLine(string(ccOut)))
	}

	// Host kernel evidence, independent of the GPU driver.
	dmesgOut, dmesgErr := runSATCommandCtx(ctx, verboseLog, "dmesg", []string{"dmesg"}, nil, nil)
	if dmesgErr != nil {
		// Without the kernel log a "false" below means "unknown", not
		// "absent" — say so instead of silently reporting no TEE.
		status.Notes = append(status.Notes, fmt.Sprintf("dmesg unavailable (%v) — host SEV-SNP / TDX detection from the kernel log may be incomplete.", dmesgErr))
	}
	ccDmesgLines := filterConfComputeDmesgLines(dmesgOut)
	_ = os.WriteFile(filepath.Join(runDir, "02-dmesg-cc-relevant.log"), []byte(strings.Join(ccDmesgLines, "\n")+"\n"), 0644)
	status.HostAMDSEVSNPActive, status.HostIntelTDXActive = detectHostTEEFromDmesg(ccDmesgLines)

	for i, path := range []string{
		"/sys/module/kvm_amd/parameters/sev",
		"/sys/module/kvm_amd/parameters/sev_es",
		"/sys/module/kvm_amd/parameters/sev_snp",
	} {
		param := filepath.Base(path)
		name := fmt.Sprintf("sysfs-%s", param)
		out, err := runSATCommandCtx(ctx, verboseLog, name, []string{"cat", path}, nil, nil)
		_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("03-%02d-%s.log", i+1, name)), out, 0644)
		if err != nil {
			// Parameter file absent (e.g. kvm_amd not loaded): leave the flag as is.
			continue
		}
		enabled := kvmParamEnabled(string(out))
		switch param {
		case "sev":
			status.HostAMDSEVSupported = enabled
		case "sev_es":
			status.HostAMDSEVESSupported = enabled
		case "sev_snp":
			// Only ever raise the flag: dmesg may already have proven SNP active.
			if enabled {
				status.HostAMDSEVSNPActive = true
			}
		}
	}

	status.GPUCanRunCC = strings.EqualFold(strings.TrimSpace(status.GPUCCCapability), "CC Capable")
	cpuCapReported := strings.TrimSpace(status.CPUCCCapability)
	status.CPUCanRunCC = status.HostAMDSEVSNPActive || status.HostIntelTDXActive ||
		(cpuCapReported != "" && !strings.EqualFold(cpuCapReported, "NONE"))
	status.Ready = status.CPUCanRunCC && status.GPUCanRunCC

	if !status.NvidiaSMIAvailable {
		status.Notes = append(status.Notes, "GPU CC capability unknown — install the NVIDIA driver to query it with `nvidia-smi conf-compute -q`.")
	}

	summary := renderConfidentialComputingSummary(status)
	if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
		return "", err
	}
	report := renderConfidentialComputingReport(status)
	if err := os.WriteFile(filepath.Join(runDir, "confidential-computing-report.txt"), []byte(report), 0644); err != nil {
		return "", err
	}
	return runDir, nil
}

// detectHostTEEFromDmesg reports whether any single dmesg line announces an
// enabled AMD SEV-SNP or an initialized Intel TDX module. Matching is done
// per line so that fragments from unrelated messages cannot combine into a
// false positive.
func detectHostTEEFromDmesg(lines []string) (sevSNPActive, tdxActive bool) {
	for _, line := range lines {
		lower := strings.ToLower(line)
		if strings.Contains(lower, "sev-snp enabled") {
			sevSNPActive = true
		}
		if strings.Contains(lower, "virt/tdx: module initialized") ||
			(strings.Contains(lower, "tdx module") && strings.Contains(lower, "module initialized")) {
			tdxActive = true
		}
	}
	return sevSNPActive, tdxActive
}

// kvmParamEnabled interprets the content of a kvm_amd module parameter file.
// Boolean parameters read back as Y/N; "1" is accepted as well for kernels
// that expose the parameter as an integer.
func kvmParamEnabled(raw string) bool {
	val := strings.TrimSpace(raw)
	return strings.EqualFold(val, "Y") || val == "1"
}
// parseConfComputeFields turns the "Key : Value" lines printed by
// `nvidia-smi conf-compute -q` into a map, e.g.:
//
//	CC State : OFF
//	Multi-GPU Mode : Protected PCIe
//	CPU CC Capabilities : INTEL TDX
//	GPU CC Capabilities : CC Capable
//	CC GPUs Ready State : Not Ready
//
// Each line is split at its first colon and both halves are trimmed. Lines
// without a colon, or with an empty key or empty value, are skipped. When a
// key repeats, the last occurrence wins. The returned map is never nil.
func parseConfComputeFields(out []byte) map[string]string {
	parsed := make(map[string]string)
	for _, row := range strings.Split(string(out), "\n") {
		parts := strings.SplitN(row, ":", 2)
		if len(parts) != 2 {
			continue
		}
		name, value := strings.TrimSpace(parts[0]), strings.TrimSpace(parts[1])
		if name != "" && value != "" {
			parsed[name] = value
		}
	}
	return parsed
}
// filterConfComputeDmesgLines returns the dmesg lines relevant to CPU
// Confidential Computing support (AMD SEV/SEV-ES/SEV-SNP, Intel TDX).
//
// A line is kept when it contains "sev" or "tdx" (case-insensitive) as a
// delimited token, i.e. not directly preceded or followed by an ASCII letter.
// That keeps "SEV-SNP", "sev_es" and "virt/tdx:" but drops unrelated words
// such as "severity" or "several". Lines are returned unmodified and in their
// original order; the result is nil when nothing matches.
func filterConfComputeDmesgLines(dmesgOut []byte) []string {
	var lines []string
	for _, raw := range bytes.Split(dmesgOut, []byte("\n")) {
		line := string(raw)
		lower := strings.ToLower(line)
		if dmesgHasToken(lower, "sev") || dmesgHasToken(lower, "tdx") {
			lines = append(lines, line)
		}
	}
	return lines
}

// dmesgHasToken reports whether s contains token at a position where it is
// neither preceded nor followed by an ASCII letter.
func dmesgHasToken(s, token string) bool {
	isLetter := func(c byte) bool {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
	}
	for from := 0; from < len(s); {
		i := strings.Index(s[from:], token)
		if i < 0 {
			return false
		}
		start := from + i
		end := start + len(token)
		letterBefore := start > 0 && isLetter(s[start-1])
		letterAfter := end < len(s) && isLetter(s[end])
		if !letterBefore && !letterAfter {
			return true
		}
		// Embedded in a longer word: keep scanning after this occurrence.
		from = start + 1
	}
	return false
}
// renderConfidentialComputingSummary renders status as newline-terminated
// key=value lines (the content of summary.txt). The last line is
// overall_status=OK when status.Ready is true and overall_status=NOT_READY
// otherwise.
func renderConfidentialComputingSummary(status ConfidentialComputingStatus) string {
	var b strings.Builder
	// Convert explicitly so the value always matches the run_at_utc key, even
	// if a caller hands in a CollectedAt carrying a local time zone.
	fmt.Fprintf(&b, "run_at_utc=%s\n", status.CollectedAt.UTC().Format(time.RFC3339))
	fmt.Fprintf(&b, "nvidia_smi_available=%t\n", status.NvidiaSMIAvailable)
	fmt.Fprintf(&b, "cc_state=%s\n", status.CCState)
	fmt.Fprintf(&b, "multi_gpu_mode=%s\n", status.MultiGPUMode)
	fmt.Fprintf(&b, "cpu_cc_capability=%s\n", status.CPUCCCapability)
	fmt.Fprintf(&b, "gpu_cc_capability=%s\n", status.GPUCCCapability)
	fmt.Fprintf(&b, "cc_gpus_ready_state=%s\n", status.CCGPUsReadyState)
	fmt.Fprintf(&b, "host_amd_sev_supported=%t\n", status.HostAMDSEVSupported)
	fmt.Fprintf(&b, "host_amd_sev_es_supported=%t\n", status.HostAMDSEVESSupported)
	fmt.Fprintf(&b, "host_amd_sev_snp_active=%t\n", status.HostAMDSEVSNPActive)
	fmt.Fprintf(&b, "host_intel_tdx_active=%t\n", status.HostIntelTDXActive)
	fmt.Fprintf(&b, "cpu_can_run_cc=%t\n", status.CPUCanRunCC)
	fmt.Fprintf(&b, "gpu_can_run_cc=%t\n", status.GPUCanRunCC)
	fmt.Fprintf(&b, "ready=%t\n", status.Ready)

	overall := "NOT_READY"
	if status.Ready {
		overall = "OK"
	}
	fmt.Fprintf(&b, "overall_status=%s\n", overall)
	return b.String()
}
// renderConfidentialComputingReport renders status as the human-readable
// readiness report: a banner, the READY / NOT READY verdict, a CPU section, a
// GPU section, an optional Notes section (only when status.Notes is
// non-empty) and the collection time. Empty string fields are shown as
// "unknown".
func renderConfidentialComputingReport(status ConfidentialComputingStatus) string {
	var b strings.Builder
	line := strings.Repeat("=", 80)
	b.WriteString(line + "\n")
	b.WriteString("Confidential Computing Readiness\n")
	b.WriteString(line + "\n\n")

	verdict := "NOT READY"
	if status.Ready {
		verdict = "READY"
	}
	fmt.Fprintf(&b, "Verdict: %s\n\n", verdict)

	b.WriteString("-- CPU ----------------------------------------------------------------------\n")
	fmt.Fprintf(&b, " Reported by GPU driver : %s\n", nonEmptyOr(status.CPUCCCapability, "unknown"))
	fmt.Fprintf(&b, " AMD SEV supported : %t\n", status.HostAMDSEVSupported)
	fmt.Fprintf(&b, " AMD SEV-ES supported : %t\n", status.HostAMDSEVESSupported)
	fmt.Fprintf(&b, " AMD SEV-SNP active : %t\n", status.HostAMDSEVSNPActive)
	fmt.Fprintf(&b, " Intel TDX active : %t\n", status.HostIntelTDXActive)
	fmt.Fprintf(&b, " Can run CC : %t\n\n", status.CPUCanRunCC)

	b.WriteString("-- GPU ----------------------------------------------------------------------\n")
	fmt.Fprintf(&b, " nvidia-smi available : %t\n", status.NvidiaSMIAvailable)
	fmt.Fprintf(&b, " GPU CC Capabilities : %s\n", nonEmptyOr(status.GPUCCCapability, "unknown"))
	fmt.Fprintf(&b, " CC State (current) : %s\n", nonEmptyOr(status.CCState, "unknown"))
	fmt.Fprintf(&b, " Multi-GPU Mode : %s\n", nonEmptyOr(status.MultiGPUMode, "unknown"))
	fmt.Fprintf(&b, " CC GPUs Ready State : %s\n", nonEmptyOr(status.CCGPUsReadyState, "unknown"))
	fmt.Fprintf(&b, " Can run CC : %t\n\n", status.GPUCanRunCC)

	if len(status.Notes) > 0 {
		b.WriteString("-- Notes ----------------------------------------------------------------------\n")
		for _, n := range status.Notes {
			fmt.Fprintf(&b, " - %s\n", n)
		}
		b.WriteString("\n")
	}

	// "UTC" in the layout is literal text, not a zone verb, so the time must
	// be converted to UTC explicitly or the label could be wrong.
	fmt.Fprintf(&b, "Collected : %s\n", status.CollectedAt.UTC().Format("2006-01-02 15:04:05 UTC"))
	b.WriteString(line + "\n")
	return b.String()
}
// nonEmptyOr returns v unchanged unless it is empty or whitespace-only, in
// which case it returns fallback.
func nonEmptyOr(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}
+1 -1
View File
@@ -1259,7 +1259,7 @@ func storageSATCommands(devPath string, extended bool) []satJob {
return jobs
}
jobs := []satJob{
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", "-i", devPath}},
}
if extended {
jobs = append(jobs, satJob{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}})
+106 -75
View File
@@ -1,6 +1,7 @@
package platform
import (
"bee/audit/internal/collector"
"encoding/json"
"fmt"
"math"
@@ -39,65 +40,22 @@ func GenerateDiskReportText(index int, devPath string, outputs map[string][]byte
// ── NVMe ─────────────────────────────────────────────────────────────────────
type nvmeIdCtrl struct {
ModelNumber string `json:"mn"`
SerialNumber string `json:"sn"`
Firmware string `json:"fr"`
TotalCap uint64 `json:"tnvmcap"`
NVMCap uint64 `json:"nvmcap"`
}
// nvmeU64 handles both plain JSON numbers and {"lo":n,"hi":n} objects that
// some nvme-cli versions emit for 128-bit counters.
func nvmeU64(raw json.RawMessage) uint64 {
if len(raw) == 0 {
return 0
}
var n uint64
if json.Unmarshal(raw, &n) == nil {
return n
}
var obj struct {
Lo uint64 `json:"lo"`
Hi uint64 `json:"hi"`
}
if json.Unmarshal(raw, &obj) == nil {
return obj.Lo
}
return 0
}
type nvmeSmartLogRaw struct {
CriticalWarning uint64 `json:"critical_warning"`
Temperature json.RawMessage `json:"temperature"`
AvailSpare uint64 `json:"avail_spare"`
SpareThresh uint64 `json:"spare_thresh"`
PercentUsed uint64 `json:"percent_used"`
DataUnitsRead json.RawMessage `json:"data_units_read"`
DataUnitsWritten json.RawMessage `json:"data_units_written"`
PowerCycles json.RawMessage `json:"power_cycles"`
PowerOnHours json.RawMessage `json:"power_on_hours"`
UnsafeShutdowns json.RawMessage `json:"unsafe_shutdowns"`
MediaErrors json.RawMessage `json:"media_errors"`
NumErrLogEntries json.RawMessage `json:"num_err_log_entries"`
}
func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
// id-ctrl
var ctrl nvmeIdCtrl
var ctrl collector.NVMeIDCtrl
if data := outputs["nvme-id-ctrl"]; len(data) > 0 {
_ = json.Unmarshal(data, &ctrl)
}
model := strings.TrimSpace(ctrl.ModelNumber)
serial := strings.TrimSpace(ctrl.SerialNumber)
firmware := strings.TrimSpace(ctrl.Firmware)
firmware := strings.TrimSpace(ctrl.FirmwareRev)
capacityGB := ""
if ctrl.TotalCap > 0 {
capacityGB = formatCapacityGB(ctrl.TotalCap)
} else if ctrl.NVMCap > 0 {
capacityGB = formatCapacityGB(ctrl.NVMCap)
if ctrl.TotalCapacity > 0 {
capacityGB = formatCapacityGB(uint64(ctrl.TotalCapacity))
} else if ctrl.NVMCapacity > 0 {
capacityGB = formatCapacityGB(uint64(ctrl.NVMCapacity))
}
writeField(b, "Model", model)
@@ -113,67 +71,69 @@ func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
b.WriteString("\n(no SMART data)\n")
return
}
var sl nvmeSmartLogRaw
var sl collector.NVMeSmartLog
if err := json.Unmarshal(data, &sl); err != nil {
fmt.Fprintf(b, "\n(SMART parse error: %v)\n", err)
return
}
tempK := nvmeU64(sl.Temperature)
tempC := int(tempK) - 273
tempC := int(sl.Temperature) - 273
if tempC < 0 {
tempC = 0
}
critWarn := sl.CriticalWarning
critWarnStr := "OK"
if critWarn != 0 {
critWarnStr = fmt.Sprintf("0x%02X", critWarn)
if sl.CriticalWarning != 0 {
critWarnStr = fmt.Sprintf("0x%02X", sl.CriticalWarning)
}
poh := nvmeU64(sl.PowerOnHours)
pc := nvmeU64(sl.PowerCycles)
us := nvmeU64(sl.UnsafeShutdowns)
me := nvmeU64(sl.MediaErrors)
nel := nvmeU64(sl.NumErrLogEntries)
poh := uint64(sl.PowerOnHours)
pc := uint64(sl.PowerCycles)
us := uint64(sl.UnsafeShutdowns)
me := uint64(sl.MediaErrors)
nel := uint64(sl.NumErrLogEntries)
// data_units are in 1000 × 512-byte sectors = 512,000 bytes each
dataRead := float64(nvmeU64(sl.DataUnitsRead)) * 512000 / 1e9
dataWritten := float64(nvmeU64(sl.DataUnitsWritten)) * 512000 / 1e9
readBytes := uint64(sl.DataUnitsRead) * 512000
writtenBytes := uint64(sl.DataUnitsWritten) * 512000
writeSectionHeader(b, "Health")
writeField(b, "Temperature", fmt.Sprintf("%d °C", tempC))
writeField(b, "Critical Warning", critWarnStr)
writeField(b, "Percentage Used", fmt.Sprintf("%d %%", sl.PercentUsed))
writeField(b, "Available Spare", fmt.Sprintf("%d %% (threshold: %d %%)", sl.AvailSpare, sl.SpareThresh))
writeField(b, "Percentage Used", fmt.Sprintf("%d %%", sl.PercentageUsed))
writeField(b, "Available Spare", fmt.Sprintf("%d %% (threshold: %d %%)", sl.AvailableSpare, sl.SpareThreshold))
writeSectionHeader(b, "Usage")
writeField(b, "Power On Hours", fmt.Sprintf("%s h", formatUint(poh)))
writeField(b, "Power Cycles", formatUint(pc))
writeField(b, "Unsafe Shutdowns", formatUint(us))
writeField(b, "Data Written", fmt.Sprintf("%.1f GB", dataWritten))
writeField(b, "Data Read", fmt.Sprintf("%.1f GB", dataRead))
writeField(b, "Data Written", formatBytesHuman(float64(writtenBytes)))
writeField(b, "Data Read", formatBytesHuman(float64(readBytes)))
writeSectionHeader(b, "Errors")
writeField(b, "Media Errors", formatUint(me))
writeField(b, "Error Log Entries", formatUint(nel))
capacityBytes := ctrl.TotalCap
capacityBytes := uint64(ctrl.TotalCapacity)
if capacityBytes == 0 {
capacityBytes = ctrl.NVMCap
capacityBytes = uint64(ctrl.NVMCapacity)
}
writeResourceSection(b, resourceInfo{
ri := resourceInfo{
powerOnHours: poh,
writtenBytes: uint64(nvmeU64(sl.DataUnitsWritten)) * 512000,
readBytes: uint64(nvmeU64(sl.DataUnitsRead)) * 512000,
powerCycles: pc,
writtenBytes: writtenBytes,
readBytes: readBytes,
capacityBytes: capacityBytes,
})
}
writeResourceSection(b, ri)
if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 {
writeSectionHeader(b, "Self-Test")
result := parseSelfTestResult(string(selfTest))
writeField(b, "Result", result)
}
writeConclusionSection(b, ri)
}
// ── SATA / SAS (smartctl) ────────────────────────────────────────────────────
@@ -246,13 +206,15 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
}
}
var poh, writtenLBAs, readLBAs uint64
var poh, pc, writtenLBAs, readLBAs uint64
var readValue int
hasReadValue := false
for _, a := range attrs {
switch a.ID {
case 9: // Power_On_Hours
poh = parseLeadingUint(a.Raw)
case 12: // Power_Cycle_Count
pc = parseLeadingUint(a.Raw)
case 241: // Total_LBAs_Written
writtenLBAs = parseLeadingUint(a.Raw)
case 242: // Total_LBAs_Read
@@ -262,14 +224,16 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
}
}
const sataSectorBytes = 512
writeResourceSection(b, resourceInfo{
ri := resourceInfo{
powerOnHours: poh,
powerCycles: pc,
writtenBytes: writtenLBAs * sataSectorBytes,
readBytes: readLBAs * sataSectorBytes,
capacityBytes: capacityBytes,
readPercent: 100 - readValue,
hasReadPercent: hasReadValue,
})
}
writeResourceSection(b, ri)
selfTest := outputs["smartctl-self-test-status"]
if len(selfTest) == 0 {
@@ -280,6 +244,8 @@ func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
result := parseSelfTestResult(string(selfTest))
writeField(b, "Result", result)
}
writeConclusionSection(b, ri)
}
func parseSMARTAttrs(text string) []smartAttr {
@@ -375,6 +341,7 @@ const (
type resourceInfo struct {
powerOnHours uint64
powerCycles uint64
writtenBytes uint64
readBytes uint64
capacityBytes uint64
@@ -407,6 +374,70 @@ func writeResourceSection(b *strings.Builder, r resourceInfo) {
}
}
// ── Conclusion (new-vs-used verdict) ────────────────────────────────────────
// Thresholds for treating a drive as "new": less than one full drive-write
// (110% of capacity, headroom for provisioning/overprovisioning rounding),
// less than a bit over two full drive-reads (210% of capacity), under a
// week of power-on time, and under 30 power cycles. Any one violation is
// enough to call the drive used — these are deliberately loose bounds, not
// a wear/endurance judgment (see -- Resource -- for that).
//
// Every bound is exclusive: a drive that reaches a threshold exactly is
// already reported as used.
const (
// Data written, as a fraction of drive capacity.
newDiskMaxWrittenFrac = 1.10
// Data read, as a fraction of drive capacity.
newDiskMaxReadFrac = 2.10
// Power-on time, in hours (one week).
newDiskMaxUptimeHours = 7 * 24
// Number of power cycles.
newDiskMaxPowerCycles = 30
)
// writeConclusionSection appends the "Conclusion" section to b: a NEW/USED
// verdict for the drive described by r, followed by the criteria that
// tripped (when USED) and a note for criteria that could not be evaluated.
//
// The drive is reported USED as soon as one threshold is met or exceeded:
// data written or read relative to capacity, power-on hours, or power
// cycles. When r.capacityBytes is zero the written/read criteria are skipped
// and a note says so; the verdict then rests on uptime and power cycles only.
func writeConclusionSection(b *strings.Builder, r resourceInfo) {
	writeSectionHeader(b, "Conclusion")

	// Every tripped criterion is recorded here; an empty list means "new".
	var tripped []string
	var capacityNote string

	if r.capacityBytes == 0 {
		capacityNote = "capacity unknown — write/read criteria not evaluated"
	} else {
		capacity := float64(r.capacityBytes)
		writtenShare := float64(r.writtenBytes) / capacity
		readShare := float64(r.readBytes) / capacity
		if writtenShare >= newDiskMaxWrittenFrac {
			tripped = append(tripped, fmt.Sprintf(
				"data written %s (%s of capacity)",
				formatBytesHuman(float64(r.writtenBytes)), formatPercent(writtenShare*100)))
		}
		if readShare >= newDiskMaxReadFrac {
			tripped = append(tripped, fmt.Sprintf(
				"data read %s (%s of capacity)",
				formatBytesHuman(float64(r.readBytes)), formatPercent(readShare*100)))
		}
	}

	if r.powerOnHours >= newDiskMaxUptimeHours {
		tripped = append(tripped, fmt.Sprintf("uptime %s", formatHoursHuman(r.powerOnHours)))
	}
	if r.powerCycles >= newDiskMaxPowerCycles {
		tripped = append(tripped, fmt.Sprintf("power cycles %s", formatUint(r.powerCycles)))
	}

	if len(tripped) == 0 {
		writeField(b, "Disk Condition", "NEW")
	} else {
		writeField(b, "Disk Condition", "USED")
		b.WriteString(" Reason:\n")
		for _, why := range tripped {
			fmt.Fprintf(b, " - %s\n", why)
		}
	}

	if capacityNote != "" {
		fmt.Fprintf(b, " Note: %s\n", capacityNote)
	}
}
// progressBar renders a fixed-width pseudographic bar, e.g. "[######------]".
func progressBar(frac float64, width int) string {
if math.IsNaN(frac) || frac < 0 {
+30 -1
View File
@@ -83,7 +83,36 @@ func TestGenerateDiskReportNVMe(t *testing.T) {
assertContains(t, report, "1,234 h") // power_on_hours with separator
assertContains(t, report, "32") // power_cycles
assertContains(t, report, "3") // unsafe_shutdowns
assertContains(t, report, "378.0 GB") // data_units_written * 512000 / 1e9
assertContains(t, report, "378.00 GB") // data_units_written * 512000, human-scaled
}
// TestGenerateDiskReportNVMeDataUnitsScaleToTB verifies that heavy write/read
// counters render in the "-- Usage --" section as TB/PB, not raw GB, matching
// the "-- Resource --" section which already used formatBytesHuman.
func TestGenerateDiskReportNVMeDataUnitsScaleToTB(t *testing.T) {
t.Parallel()
heavy := []byte(`{
"critical_warning": 0,
"temperature": 307,
"avail_spare": 100,
"spare_thresh": 10,
"percent_used": 0,
"data_units_read": "252420478",
"data_units_written": "103834055",
"power_cycles": "45",
"power_on_hours": "45",
"unsafe_shutdowns": "35",
"media_errors": "0",
"num_err_log_entries": "0"
}`)
outputs := map[string][]byte{
"nvme-id-ctrl": testNVMeIdCtrl,
"nvme-smart-log": heavy,
}
report := GenerateDiskReportText(1, "/dev/nvme0n1", outputs, time.Unix(0, 0).UTC())
assertContains(t, report, "Data Written : 53.16 TB")
assertContains(t, report, "Data Read : 129.24 TB")
}
func TestGenerateDiskReportNVMeLoHi(t *testing.T) {
+1
View File
@@ -38,6 +38,7 @@ var techDumpNvidiaCommands = []struct {
}{
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
{Name: "nvidia-smi", Args: []string{"conf-compute", "-q"}, File: "nvidia-smi-conf-compute-q.txt"},
}
type lsblkDumpRoot struct {
+1 -1
View File
@@ -135,7 +135,7 @@ func defaultTaskPriority(target string, params taskParams) int {
return taskPriorityBurn
case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
"nvidia-interconnect", "nvidia-bandwidth", "memory", "storage", "cpu",
"amd", "amd-mem", "amd-bandwidth":
"amd", "amd-mem", "amd-bandwidth", "confidential-computing":
if params.StressMode {
return taskPriorityValidateStress
}
+37 -24
View File
@@ -37,26 +37,41 @@ var fruEditableFields = map[string]struct {
"Chassis Part Number": {"c", 0},
"Chassis Serial Number": {"c", 1},
"Chassis Serial": {"c", 1},
"Chassis Extra": {"c", 2},
// Board — vendor doc names and ipmitool abbreviated names
"Board Manufacturer": {"b", 0},
"Board Mfg": {"b", 0},
"Board Product Name": {"b", 1},
"Board Product": {"b", 1},
"Board Manufacturer": {"b", 0},
"Board Mfg": {"b", 0},
"Board Product Name": {"b", 1},
"Board Product": {"b", 1},
"Board Serial Number": {"b", 2},
"Board Serial": {"b", 2},
"Board Part Number": {"b", 3},
"Board Serial": {"b", 2},
"Board Part Number": {"b", 3},
// Product — vendor doc names and ipmitool abbreviated names
"Product Manufacturer": {"p", 0},
"Product Name": {"p", 1},
"Product Part Number": {"p", 2},
"Product Version": {"p", 3},
"Product Manufacturer": {"p", 0},
"Product Name": {"p", 1},
"Product Part Number": {"p", 2},
"Product Version": {"p", 3},
"Product Serial Number": {"p", 4},
"Product Serial": {"p", 4},
"Product Serial": {"p", 4},
"Product Asset Tag": {"p", 5},
}
// fruExtraBaseIndex maps each "<Area> Extra" field name to its FRU area
// letter and to the ipmitool field index of the first custom field in that
// area. Per the vendor FRU field doc, custom fields begin at index 2 for
// Chassis, 5 for Board and 7 for Product.
//
// ipmitool fru print emits one line with the same name for every custom
// field, so parseFRUOutput counts how many it has already seen and adds that
// count to Base to recover each field's real index.
var fruExtraBaseIndex = map[string]struct {
	Area string
	Base int
}{
	"Chassis Extra": {Area: "c", Base: 2},
	"Board Extra":   {Area: "b", Base: 5},
	"Product Extra": {Area: "p", Base: 7},
}
func parseFRUOutput(output string) []fruField {
var fields []fruField
extraSeen := map[string]int{}
for _, line := range strings.Split(output, "\n") {
// Lines look like: " Field Name : value"
trimmed := strings.TrimLeft(line, " \t")
@@ -64,33 +79,32 @@ func parseFRUOutput(output string) []fruField {
continue
}
colon := strings.Index(trimmed, " : ")
valueOffset := 3
if colon < 0 {
// try ": " with no leading space before colon
colon = strings.Index(trimmed, ": ")
valueOffset = 2
if colon < 0 {
continue
}
name := strings.TrimSpace(trimmed[:colon])
value := strings.TrimSpace(trimmed[colon+2:])
if name == "" {
continue
}
editable, area, idx := fruFieldMeta(name)
fields = append(fields, fruField{Name: name, Value: value, Editable: editable, Area: area, Index: idx})
continue
}
name := strings.TrimSpace(trimmed[:colon])
value := strings.TrimSpace(trimmed[colon+3:])
value := strings.TrimSpace(trimmed[colon+valueOffset:])
if name == "" {
continue
}
editable, area, idx := fruFieldMeta(name)
editable, area, idx := fruFieldMeta(name, extraSeen)
fields = append(fields, fruField{Name: name, Value: value, Editable: editable, Area: area, Index: idx})
}
return fields
}
func fruFieldMeta(name string) (editable bool, area string, index int) {
func fruFieldMeta(name string, extraSeen map[string]int) (editable bool, area string, index int) {
if e, ok := fruExtraBaseIndex[name]; ok {
idx := e.Base + extraSeen[name]
extraSeen[name]++
return true, e.Area, idx
}
if e, ok := fruEditableFields[name]; ok {
return true, e.Area, e.Index
}
@@ -201,4 +215,3 @@ func runIPMIFRUWriteTask(ctx context.Context, j *jobState, exportDir string, p t
}
return nil
}
+59
View File
@@ -0,0 +1,59 @@
package webui
import "testing"
func TestParseFRUOutputExtraFields(t *testing.T) {
// Realistic ipmitool fru print output: repeated "<Area> Extra" lines
// (one per custom field) must resolve to sequential indices per the
// vendor FRU doc (Chassis Extra starts at 2, Board Extra at 5, Product
// Extra at 7), not all collapse onto the same index.
out := `
Product Manufacturer : Inspur
Product Name : NF5280M6
Product Part Number : PN123
Product Version : 1.0
Product Serial : SN123
Product Asset Tag : ASSET01
Product Extra : custom-p1
Board Mfg : Inspur
Board Product : BoardX
Board Serial : BSN1
Board Part Number : BPN1
Board Extra : custom-b1
Board Extra : custom-b2
Board Extra : custom-b3
Chassis Part Number : CPN1
Chassis Serial : CSN1
Chassis Extra : front-half
Chassis Extra : back-half
`
fields := parseFRUOutput(out)
byName := map[string][]fruField{}
for _, f := range fields {
byName[f.Name] = append(byName[f.Name], f)
}
assertMeta := func(name string, occurrence int, wantArea string, wantIndex int) {
t.Helper()
list := byName[name]
if occurrence >= len(list) {
t.Fatalf("expected occurrence %d of %q, got %d entries", occurrence, name, len(list))
}
f := list[occurrence]
if f.Area != wantArea || f.Index != wantIndex {
t.Errorf("%s[%d] = area:%q index:%d, want area:%q index:%d", name, occurrence, f.Area, f.Index, wantArea, wantIndex)
}
if !f.Editable {
t.Errorf("%s[%d] expected editable", name, occurrence)
}
}
assertMeta("Product Asset Tag", 0, "p", 5)
assertMeta("Product Extra", 0, "p", 7)
assertMeta("Board Extra", 0, "b", 5)
assertMeta("Board Extra", 1, "b", 6)
assertMeta("Board Extra", 2, "b", 7)
assertMeta("Chassis Extra", 0, "c", 2)
assertMeta("Chassis Extra", 1, "c", 3)
}
+8 -2
View File
@@ -676,6 +676,12 @@ func renderCheck(opts HandlerOptions) string {
`<code>lsblk</code>; NVMe: <code>nvme id-ctrl</code>, <code>nvme smart-log</code>; SATA/SAS: <code>smartctl -H -A</code>`,
`Seconds — instantaneous device query, no wear counters incremented.`,
)) +
renderSATCard("confidential-computing", "Confidential Computing", "runSAT('confidential-computing')", "", renderValidateCardBody(
inv.NVIDIA,
`Checks whether this server can run NVIDIA Confidential Computing: CPU TEE support (Intel TDX / AMD SEV-SNP) and GPU firmware CC capability. Read-only — changes nothing.`,
`<code>nvidia-smi conf-compute -q</code>, <code>dmesg</code>, <code>/sys/module/kvm_amd/parameters/*</code>`,
`Seconds — read-only query only.`,
)) +
`</div>
<div style="height:1px;background:var(--border);margin:16px 0"></div>
<div class="card" style="margin-bottom:16px">
@@ -737,7 +743,7 @@ func renderCheck(opts HandlerOptions) string {
<script>
let satES = null;
function satLabels() {
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth', 'confidential-computing':'Check Confidential Computing'};
}
let satNvidiaGPUsPromise = null;
function loadSatNvidiaGPUs() {
@@ -873,7 +879,7 @@ function runAllCheckSAT() {
status.textContent = 'Enqueuing...';
const nvidiaIndices = satSelectedGPUIndices();
const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
const baseTargets = ['cpu', 'memory', 'storage'];
const baseTargets = ['cpu', 'memory', 'storage', 'confidential-computing'];
const amdTargets = selectedAMDValidateTargets();
const expanded = [];
baseTargets.forEach(t => expanded.push({target: t}));
+1
View File
@@ -264,6 +264,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
mux.HandleFunc("POST /api/sat/confidential-computing/run", h.handleAPISATRun("confidential-computing"))
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
+6
View File
@@ -272,6 +272,12 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
break
}
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
case "confidential-computing":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runConfidentialComputingCheckPackCtx(a, ctx, "", j.append)
case "cpu":
if a == nil {
err = fmt.Errorf("app not configured")
+10
View File
@@ -45,6 +45,7 @@ var taskNames = map[string]string{
"nvidia-stress": "NVIDIA GPU Stress",
"memory": "Memory SAT",
"storage": "Storage SAT",
"confidential-computing": "Confidential Computing Check",
"cpu": "CPU SAT",
"amd": "AMD GPU SAT",
"amd-mem": "AMD GPU MEM Integrity",
@@ -312,6 +313,9 @@ var (
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
}
runConfidentialComputingCheckPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
return a.RunConfidentialComputingCheckPackCtx(ctx, baseDir, logFunc)
}
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
}
@@ -1025,6 +1029,12 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
break
}
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
case "confidential-computing":
if a == nil {
err = fmt.Errorf("app not configured")
break
}
archive, err = runConfidentialComputingCheckPackCtx(a, ctx, "", j.append)
case "cpu":
if a == nil {
err = fmt.Errorf("app not configured")