Compare commits
59 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
11ea640626 | ||
|
|
796acdfec1 | ||
|
|
2a7d366e50 | ||
|
|
5bfaecd417 | ||
|
|
8575cf06f8 | ||
|
|
d1d5f63257 | ||
| fc9b446d2e | |||
|
|
ea68318744 | ||
|
|
518082c2e2 | ||
|
|
056dce0b98 | ||
|
|
24f2e65b6e | ||
|
|
7f27b9aa38 | ||
|
|
cf29131116 | ||
|
|
13e6324853 | ||
|
|
892ef6fb7d | ||
|
|
ce46a97975 | ||
|
|
258ecb3453 | ||
|
|
cbb0d1e522 | ||
|
|
bab941ccf1 | ||
|
|
b49c71a980 | ||
|
|
85d1acdaa3 | ||
|
|
a2d7513153 | ||
|
|
5b5d8609d3 | ||
|
|
e7442972d1 | ||
|
|
4c6daa1c5e | ||
|
|
e420888d71 | ||
|
|
8149360410 | ||
|
|
4262c5b798 | ||
|
|
b2e177af31 | ||
|
|
271dadda03 | ||
|
|
20766ccc76 | ||
|
|
966944d6d8 | ||
| ce6b1e0eb7 | |||
| 4066e842a9 | |||
| 7d2e904d14 | |||
| 2320925433 | |||
| e169a7722c | |||
| 74a3c65f64 | |||
| 884988cb2a | |||
| 963bc960ca | |||
| 4f6579e040 | |||
| dc07580adc | |||
|
|
87e78e230e | ||
|
|
805a3b277d | ||
|
|
5bc9bd7fb3 | ||
|
|
0939a647ea | ||
|
|
7640f20714 | ||
|
|
1593bf3e76 | ||
|
|
ae80d7711e | ||
|
|
ca78b9df65 | ||
|
|
5cafe63f33 | ||
|
|
b75e65bcb1 | ||
|
|
8d173175eb | ||
|
|
5cbde0448e | ||
|
|
49a09fde05 | ||
|
|
f3962422c8 | ||
|
|
ee36e3c711 | ||
|
|
cca3b21d35 | ||
|
|
75c33e073e |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,6 +1,5 @@
|
||||
.env
|
||||
.DS_Store
|
||||
dist/
|
||||
iso/out/
|
||||
build-cache/
|
||||
audit/bee
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
405
audit/internal/app/app_format.go
Normal file
405
audit/internal/app/app_format.go
Normal file
@@ -0,0 +1,405 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// hostnameOr reports the name returned by os.Hostname. When the lookup fails
// or the name consists only of whitespace, fallback is returned instead.
func hostnameOr(fallback string) string {
	name, err := os.Hostname()
	if err == nil && strings.TrimSpace(name) != "" {
		return name
	}
	return fallback
}
|
||||
|
||||
// sanitizeFilename keeps ASCII letters, digits, '-', '_' and '.' and replaces
// every other rune with '-'. An empty input yields "unknown".
func sanitizeFilename(v string) string {
	cleaned := strings.Map(func(r rune) rune {
		switch {
		case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
			return r
		case r == '-' || r == '_' || r == '.':
			return r
		}
		return '-'
	}, v)
	if cleaned == "" {
		return "unknown"
	}
	return cleaned
}
|
||||
|
||||
// bodyOr returns body with surrounding whitespace removed, or fallback when
// nothing remains after trimming.
func bodyOr(body, fallback string) string {
	if trimmed := strings.TrimSpace(body); trimmed != "" {
		return trimmed
	}
	return fallback
}
|
||||
|
||||
// trimPtr dereferences value and trims surrounding whitespace; a nil pointer
// yields the empty string.
func trimPtr(value *string) string {
	var text string
	if value != nil {
		text = strings.TrimSpace(*value)
	}
	return text
}
|
||||
|
||||
// joinSortedKeys returns the keys of values in ascending order joined by "/".
// An empty or nil set produces the empty string.
func joinSortedKeys(values map[string]struct{}) string {
	names := make([]string, 0, len(values))
	for name := range values {
		names = append(names, name)
	}
	sort.Strings(names)
	return strings.Join(names, "/")
}
|
||||
|
||||
// humanizeMB renders a size given in MB (1024-based) as a short string.
// Non-positive input gives "". Sizes of 1024 GB and above are shown in TB with
// one decimal; whole-GB sizes are shown without decimals, everything else in
// GB with one decimal.
func humanizeMB(totalMB int) string {
	if totalMB <= 0 {
		return ""
	}
	gb := float64(totalMB) / 1024.0
	switch {
	case gb >= 1024.0:
		return fmt.Sprintf("%.1f TB", gb/1024.0)
	case totalMB%1024 == 0:
		// Exactly divisible means the GB figure is a whole number.
		return fmt.Sprintf("%d GB", totalMB/1024)
	default:
		return fmt.Sprintf("%.1f GB", gb)
	}
}
|
||||
|
||||
// humanizeGB renders a size given in GB (1024-based). Non-positive input gives
// "", sizes below 1024 GB are printed as whole GB, larger ones as TB with one
// decimal.
func humanizeGB(totalGB int) string {
	switch {
	case totalGB <= 0:
		return ""
	case totalGB < 1024:
		return fmt.Sprintf("%d GB", totalGB)
	default:
		return fmt.Sprintf("%.1f TB", float64(totalGB)/1024.0)
	}
}
|
||||
|
||||
// parseKeyValueSummary parses newline-separated "key=value" text into a map.
// Each line is split at its first '='; key and value are whitespace-trimmed.
// Lines without '=' are ignored, and a later duplicate key overwrites an
// earlier one.
func parseKeyValueSummary(raw string) map[string]string {
	pairs := make(map[string]string)
	for _, entry := range strings.Split(raw, "\n") {
		sep := strings.Index(entry, "=")
		if sep < 0 {
			continue
		}
		name := strings.TrimSpace(entry[:sep])
		pairs[name] = strings.TrimSpace(entry[sep+1:])
	}
	return pairs
}
|
||||
|
||||
// firstNonEmpty returns the first argument that is non-empty after trimming
// whitespace (in its trimmed form), or "" when there is none.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if candidate := strings.TrimSpace(values[i]); candidate != "" {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
// cleanSummaryKey strips a leading "<digits>-" prefix from key, e.g.
// "01-foo" becomes "foo". Keys without such a prefix are returned unchanged.
func cleanSummaryKey(key string) string {
	prefix, rest, found := strings.Cut(key, "-")
	if !found || prefix == "" {
		return key
	}
	for i := 0; i < len(prefix); i++ {
		// Any non-ASCII-digit byte means this is not a numeric prefix.
		if prefix[i] < '0' || prefix[i] > '9' {
			return key
		}
	}
	return rest
}
|
||||
|
||||
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
// Exclude Aspeed BMC VGA adapters (not compute GPUs).
|
||||
if dev.VendorID != nil && *dev.VendorID == collector.AspeedVendorID {
|
||||
return false
|
||||
}
|
||||
class := trimPtr(dev.DeviceClass)
|
||||
// AMD Instinct / Radeon compute GPUs always carry ProcessingAccelerator or DisplayController.
|
||||
// Do NOT match AMD vendor alone — CPU chipset PCIe devices share that vendor ID.
|
||||
if class == "VideoController" || class == "DisplayController" || class == "ProcessingAccelerator" {
|
||||
return true
|
||||
}
|
||||
// NVIDIA devices sometimes expose class values outside the standard GPU set.
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||
}
|
||||
|
||||
func formatSystemLine(board schema.HardwareBoard) string {
|
||||
model := strings.TrimSpace(strings.Join([]string{
|
||||
trimPtr(board.Manufacturer),
|
||||
trimPtr(board.ProductName),
|
||||
}, " "))
|
||||
serial := strings.TrimSpace(board.SerialNumber)
|
||||
switch {
|
||||
case model != "" && serial != "":
|
||||
return fmt.Sprintf("System: %s | S/N %s", model, serial)
|
||||
case model != "":
|
||||
return "System: " + model
|
||||
case serial != "":
|
||||
return "System S/N: " + serial
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
func formatCPULine(cpus []schema.HardwareCPU) string {
|
||||
if len(cpus) == 0 {
|
||||
return ""
|
||||
}
|
||||
modelCounts := map[string]int{}
|
||||
unknown := 0
|
||||
for _, cpu := range cpus {
|
||||
model := trimPtr(cpu.Model)
|
||||
if model == "" {
|
||||
unknown++
|
||||
continue
|
||||
}
|
||||
modelCounts[model]++
|
||||
}
|
||||
if len(modelCounts) == 1 && unknown == 0 {
|
||||
for model, count := range modelCounts {
|
||||
return fmt.Sprintf("CPU: %d x %s", count, model)
|
||||
}
|
||||
}
|
||||
parts := make([]string, 0, len(modelCounts)+1)
|
||||
if len(modelCounts) > 0 {
|
||||
keys := make([]string, 0, len(modelCounts))
|
||||
for key := range modelCounts {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key))
|
||||
}
|
||||
}
|
||||
if unknown > 0 {
|
||||
parts = append(parts, fmt.Sprintf("%d x unknown", unknown))
|
||||
}
|
||||
return "CPU: " + strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
func formatMemoryLine(dimms []schema.HardwareMemory) string {
|
||||
totalMB := 0
|
||||
present := 0
|
||||
types := map[string]struct{}{}
|
||||
for _, dimm := range dimms {
|
||||
if dimm.Present != nil && !*dimm.Present {
|
||||
continue
|
||||
}
|
||||
if dimm.SizeMB == nil || *dimm.SizeMB <= 0 {
|
||||
continue
|
||||
}
|
||||
present++
|
||||
totalMB += *dimm.SizeMB
|
||||
if value := trimPtr(dimm.Type); value != "" {
|
||||
types[value] = struct{}{}
|
||||
}
|
||||
}
|
||||
if totalMB == 0 {
|
||||
return ""
|
||||
}
|
||||
typeText := joinSortedKeys(types)
|
||||
line := fmt.Sprintf("Memory: %s", humanizeMB(totalMB))
|
||||
if typeText != "" {
|
||||
line += " " + typeText
|
||||
}
|
||||
if present > 0 {
|
||||
line += fmt.Sprintf(" (%d DIMMs)", present)
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
func formatStorageLine(disks []schema.HardwareStorage) string {
|
||||
count := 0
|
||||
totalGB := 0
|
||||
for _, disk := range disks {
|
||||
if disk.Present != nil && !*disk.Present {
|
||||
continue
|
||||
}
|
||||
count++
|
||||
if disk.SizeGB != nil && *disk.SizeGB > 0 {
|
||||
totalGB += *disk.SizeGB
|
||||
}
|
||||
}
|
||||
if count == 0 {
|
||||
return ""
|
||||
}
|
||||
line := fmt.Sprintf("Storage: %d drives", count)
|
||||
if totalGB > 0 {
|
||||
line += fmt.Sprintf(" / %s", humanizeGB(totalGB))
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
func formatGPULine(devices []schema.HardwarePCIeDevice) string {
|
||||
gpus := map[string]int{}
|
||||
for _, dev := range devices {
|
||||
if !isGPUDevice(dev) {
|
||||
continue
|
||||
}
|
||||
name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown")
|
||||
gpus[name]++
|
||||
}
|
||||
if len(gpus) == 0 {
|
||||
return ""
|
||||
}
|
||||
keys := make([]string, 0, len(gpus))
|
||||
for key := range gpus {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
parts := make([]string, 0, len(keys))
|
||||
for _, key := range keys {
|
||||
parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key))
|
||||
}
|
||||
return "GPU: " + strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
|
||||
if list == nil {
|
||||
return ""
|
||||
}
|
||||
ifaces, err := list()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
seen := map[string]struct{}{}
|
||||
var ips []string
|
||||
for _, iface := range ifaces {
|
||||
for _, ip := range iface.IPv4 {
|
||||
ip = strings.TrimSpace(ip)
|
||||
if ip == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[ip]; ok {
|
||||
continue
|
||||
}
|
||||
seen[ip] = struct{}{}
|
||||
ips = append(ips, ip)
|
||||
}
|
||||
}
|
||||
if len(ips) == 0 {
|
||||
return ""
|
||||
}
|
||||
sort.Strings(ips)
|
||||
return "IP: " + strings.Join(ips, ", ")
|
||||
}
|
||||
|
||||
func formatSATDetail(raw string) string {
|
||||
var b strings.Builder
|
||||
kv := parseKeyValueSummary(raw)
|
||||
|
||||
if t, ok := kv["run_at_utc"]; ok {
|
||||
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
||||
}
|
||||
|
||||
lines := strings.Split(raw, "\n")
|
||||
var stepKeys []string
|
||||
seenStep := map[string]bool{}
|
||||
for _, line := range lines {
|
||||
if idx := strings.Index(line, "_status="); idx >= 0 {
|
||||
key := line[:idx]
|
||||
if !seenStep[key] && key != "overall" {
|
||||
seenStep[key] = true
|
||||
stepKeys = append(stepKeys, key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, key := range stepKeys {
|
||||
status := kv[key+"_status"]
|
||||
display := cleanSummaryKey(key)
|
||||
switch status {
|
||||
case "OK":
|
||||
fmt.Fprintf(&b, "PASS %s\n", display)
|
||||
case "FAILED":
|
||||
fmt.Fprintf(&b, "FAIL %s\n", display)
|
||||
case "UNSUPPORTED":
|
||||
fmt.Fprintf(&b, "SKIP %s\n", display)
|
||||
default:
|
||||
fmt.Fprintf(&b, "? %s\n", display)
|
||||
}
|
||||
}
|
||||
|
||||
if overall, ok := kv["overall_status"]; ok {
|
||||
ok2 := kv["job_ok"]
|
||||
failed := kv["job_failed"]
|
||||
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
||||
}
|
||||
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
func formatSATSummary(label, raw string) string {
|
||||
values := parseKeyValueSummary(raw)
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s:", label)
|
||||
if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" {
|
||||
fmt.Fprintf(&body, " %s", overall)
|
||||
}
|
||||
if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" {
|
||||
fmt.Fprintf(&body, " ok=%s", ok)
|
||||
}
|
||||
if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" {
|
||||
fmt.Fprintf(&body, " failed=%s", failed)
|
||||
}
|
||||
if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" {
|
||||
fmt.Fprintf(&body, " unsupported=%s", unsupported)
|
||||
}
|
||||
if devices := strings.TrimSpace(values["devices"]); devices != "" {
|
||||
fmt.Fprintf(&body, "\nDevices: %s", devices)
|
||||
}
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func latestSATSummaries() []string {
|
||||
patterns := []struct {
|
||||
label string
|
||||
prefix string
|
||||
}{
|
||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||
{label: "Memory SAT", prefix: "memory-"},
|
||||
{label: "Storage SAT", prefix: "storage-"},
|
||||
{label: "CPU SAT", prefix: "cpu-"},
|
||||
}
|
||||
var out []string
|
||||
for _, item := range patterns {
|
||||
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out = append(out, formatSATSummary(item.label, string(raw)))
|
||||
}
|
||||
return out
|
||||
}
|
||||
76
audit/internal/app/app_install.go
Normal file
76
audit/internal/app/app_install.go
Normal file
@@ -0,0 +1,76 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||
return a.exports.ListRemovableTargets()
|
||||
}
|
||||
|
||||
func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error) {
|
||||
if _, err := os.Stat(DefaultAuditJSONPath); err != nil {
|
||||
return "", err
|
||||
}
|
||||
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||
data = normalized
|
||||
}
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(tmpPath)
|
||||
return a.exports.ExportFileToTarget(tmpPath, target)
|
||||
}
|
||||
|
||||
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportLatestAudit(target)
|
||||
body := "Audit export failed."
|
||||
if err == nil {
|
||||
body = "Audit exported."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Audit exported to " + path
|
||||
}
|
||||
return ActionResult{Title: "Export audit", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, error) {
|
||||
archive, err := BuildSupportBundle(DefaultExportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(archive)
|
||||
return a.exports.ExportFileToTarget(archive, target)
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportSupportBundle(target)
|
||||
body := "Support bundle export failed."
|
||||
if err == nil {
|
||||
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||
}
|
||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
|
||||
return a.installer.ListInstallDisks()
|
||||
}
|
||||
|
||||
func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||
return a.installer.InstallToDisk(ctx, device, logFile)
|
||||
}
|
||||
106
audit/internal/app/app_network.go
Normal file
106
audit/internal/app/app_network.go
Normal file
@@ -0,0 +1,106 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) {
|
||||
return a.network.ListInterfaces()
|
||||
}
|
||||
|
||||
func (a *App) DefaultRoute() string {
|
||||
return a.network.DefaultRoute()
|
||||
}
|
||||
|
||||
func (a *App) DHCPOne(iface string) (string, error) {
|
||||
return a.network.DHCPOne(iface)
|
||||
}
|
||||
|
||||
func (a *App) DHCPOneResult(iface string) (ActionResult, error) {
|
||||
body, err := a.network.DHCPOne(iface)
|
||||
return ActionResult{Title: "DHCP: " + iface, Body: bodyOr(body, "DHCP completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) DHCPAll() (string, error) {
|
||||
return a.network.DHCPAll()
|
||||
}
|
||||
|
||||
func (a *App) DHCPAllResult() (ActionResult, error) {
|
||||
body, err := a.network.DHCPAll()
|
||||
return ActionResult{Title: "DHCP: all interfaces", Body: bodyOr(body, "DHCP completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
||||
return a.network.SetStaticIPv4(cfg)
|
||||
}
|
||||
|
||||
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||
return a.network.SetInterfaceState(iface, up)
|
||||
}
|
||||
|
||||
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||
return a.network.GetInterfaceState(iface)
|
||||
}
|
||||
|
||||
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||
return a.network.CaptureNetworkSnapshot()
|
||||
}
|
||||
|
||||
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||
body, err := a.network.SetStaticIPv4(cfg)
|
||||
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||
}
|
||||
|
||||
func (a *App) NetworkStatus() (ActionResult, error) {
|
||||
ifaces, err := a.network.ListInterfaces()
|
||||
if err != nil {
|
||||
return ActionResult{Title: "Network status"}, err
|
||||
}
|
||||
if len(ifaces) == 0 {
|
||||
return ActionResult{Title: "Network status", Body: "No physical interfaces found."}, nil
|
||||
}
|
||||
var body strings.Builder
|
||||
for _, iface := range ifaces {
|
||||
ipv4 := "(no IPv4)"
|
||||
if len(iface.IPv4) > 0 {
|
||||
ipv4 = strings.Join(iface.IPv4, ", ")
|
||||
}
|
||||
fmt.Fprintf(&body, "- %s: state=%s ip=%s\n", iface.Name, iface.State, ipv4)
|
||||
}
|
||||
if gw := a.network.DefaultRoute(); gw != "" {
|
||||
fmt.Fprintf(&body, "\nDefault route: %s\n", gw)
|
||||
}
|
||||
return ActionResult{Title: "Network status", Body: strings.TrimSpace(body.String())}, nil
|
||||
}
|
||||
|
||||
func (a *App) DefaultStaticIPv4FormFields(iface string) []string {
|
||||
return []string{
|
||||
"",
|
||||
"24",
|
||||
strings.TrimSpace(a.network.DefaultRoute()),
|
||||
"77.88.8.8 77.88.8.1 1.1.1.1 8.8.8.8",
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) ParseStaticIPv4Config(iface string, fields []string) platform.StaticIPv4Config {
|
||||
get := func(index int) string {
|
||||
if index >= 0 && index < len(fields) {
|
||||
return strings.TrimSpace(fields[index])
|
||||
}
|
||||
return ""
|
||||
}
|
||||
return platform.StaticIPv4Config{
|
||||
Interface: iface,
|
||||
Address: get(0),
|
||||
Prefix: get(1),
|
||||
Gateway: get(2),
|
||||
DNS: strings.Fields(get(3)),
|
||||
}
|
||||
}
|
||||
370
audit/internal/app/app_packs.go
Normal file
370
audit/internal/app/app_packs.go
Normal file
@@ -0,0 +1,370 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
return a.sat.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||
out, err := a.sat.ResetNvidiaGPU(index)
|
||||
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPerfDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPowerDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchAutotuneDir
|
||||
}
|
||||
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||
}
|
||||
|
||||
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
if logFunc != nil {
|
||||
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||
}
|
||||
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||
if err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) DetectGPUVendor() string {
|
||||
return a.sat.DetectGPUVendor()
|
||||
}
|
||||
|
||||
func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
return a.sat.ListAMDGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||
body := formatFanStressResult(path)
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||
}
|
||||
|
||||
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||
func formatFanStressResult(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
content := strings.TrimSpace(string(raw))
|
||||
kv := parseKeyValueSummary(content)
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(formatSATDetail(content))
|
||||
|
||||
// Append analysis section.
|
||||
var analysis []string
|
||||
if v, ok := kv["throttling_detected"]; ok {
|
||||
label := "NO"
|
||||
if v == "true" {
|
||||
label = "YES ← throttling detected during load"
|
||||
}
|
||||
analysis = append(analysis, "Throttling: "+label)
|
||||
}
|
||||
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||
analysis = append(analysis, "Fan response: "+v+"s")
|
||||
}
|
||||
|
||||
if len(analysis) > 0 {
|
||||
b.WriteString("\n\n=== Analysis ===\n")
|
||||
for _, line := range analysis {
|
||||
b.WriteString(line + "\n")
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||
func satResultBody(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
return formatSATDetail(strings.TrimSpace(string(raw)))
|
||||
}
|
||||
67
audit/internal/app/app_services.go
Normal file
67
audit/internal/app/app_services.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListBeeServices() ([]string, error) {
|
||||
return a.services.ListBeeServices()
|
||||
}
|
||||
|
||||
func (a *App) ServiceState(name string) string {
|
||||
return a.services.ServiceState(name)
|
||||
}
|
||||
|
||||
func (a *App) ServiceStatus(name string) (string, error) {
|
||||
return a.services.ServiceStatus(name)
|
||||
}
|
||||
|
||||
func (a *App) ServiceStatusResult(name string) (ActionResult, error) {
|
||||
body, err := a.services.ServiceStatus(name)
|
||||
return ActionResult{Title: "service status: " + name, Body: bodyOr(body, "No status output.")}, err
|
||||
}
|
||||
|
||||
func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, error) {
|
||||
return a.services.ServiceDo(name, action)
|
||||
}
|
||||
|
||||
func (a *App) ServiceActionResult(name string, action platform.ServiceAction) (ActionResult, error) {
|
||||
body, err := a.services.ServiceDo(name, action)
|
||||
return ActionResult{Title: "service " + string(action) + ": " + name, Body: bodyOr(body, "Action completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) TailFile(path string, lines int) string {
|
||||
return a.tools.TailFile(path, lines)
|
||||
}
|
||||
|
||||
func (a *App) CheckTools(names []string) []platform.ToolStatus {
|
||||
return a.tools.CheckTools(names)
|
||||
}
|
||||
|
||||
func (a *App) ToolCheckResult(names []string) ActionResult {
|
||||
if len(names) == 0 {
|
||||
return ActionResult{Title: "Required tools", Body: "No tools checked."}
|
||||
}
|
||||
var body strings.Builder
|
||||
for _, tool := range a.tools.CheckTools(names) {
|
||||
status := "MISSING"
|
||||
if tool.OK {
|
||||
status = "OK (" + tool.Path + ")"
|
||||
}
|
||||
fmt.Fprintf(&body, "- %s: %s\n", tool.Name, status)
|
||||
}
|
||||
return ActionResult{Title: "Required tools", Body: strings.TrimSpace(body.String())}
|
||||
}
|
||||
|
||||
func (a *App) AuditLogTailResult() ActionResult {
|
||||
logTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditLogPath, 40))
|
||||
jsonTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditJSONPath, 20))
|
||||
body := strings.TrimSpace(logTail + "\n\n" + jsonTail)
|
||||
if body == "" {
|
||||
body = "No audit logs found."
|
||||
}
|
||||
return ActionResult{Title: "Audit log tail", Body: body}
|
||||
}
|
||||
@@ -365,7 +365,6 @@ func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
||||
|
||||
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.lastDuration = duration
|
||||
if err != nil {
|
||||
w.status = "degraded"
|
||||
@@ -383,6 +382,10 @@ func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||
}
|
||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
// persistState must be called without w.mu held: it acquires rt.mu then
|
||||
// each worker.mu inside persistStateLocked, so holding w.mu here would
|
||||
// cause a deadlock (w.mu → rt.mu → w.mu).
|
||||
w.runtime.persistState()
|
||||
}
|
||||
|
||||
|
||||
@@ -3,10 +3,11 @@ package app
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
@@ -313,17 +314,20 @@ func statusSeverity(status string) int {
|
||||
}
|
||||
|
||||
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
||||
return false
|
||||
}
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
class := strings.TrimSpace(*dev.DeviceClass)
|
||||
isGPUClass := strings.Contains(class, "Controller") || strings.Contains(class, "Accelerator") ||
|
||||
strings.Contains(class, "Display") || strings.Contains(class, "Video")
|
||||
if !isGPUClass {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
||||
switch vendor {
|
||||
case "amd":
|
||||
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.AMDVendorID
|
||||
case "nvidia":
|
||||
return strings.Contains(manufacturer, "nvidia")
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||
default:
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
@@ -46,10 +47,12 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
|
||||
class := "DisplayController"
|
||||
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
||||
amdVendorID := collector.AMDVendorID
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
VendorID: &amdVendorID,
|
||||
}},
|
||||
}
|
||||
|
||||
|
||||
@@ -84,11 +84,10 @@ func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
}
|
||||
|
||||
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
||||
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
return dev.VendorID != nil && *dev.VendorID == AMDVendorID && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
}
|
||||
|
||||
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
||||
|
||||
@@ -174,15 +174,19 @@ func cleanDMIValue(v string) string {
|
||||
upper := strings.ToUpper(v)
|
||||
placeholders := []string{
|
||||
"TO BE FILLED BY O.E.M.",
|
||||
"TO BE FILLED BY O.E.M",
|
||||
"NOT SPECIFIED",
|
||||
"NOT SETTABLE",
|
||||
"NOT PRESENT",
|
||||
"NOT AVAILABLE",
|
||||
"UNKNOWN",
|
||||
"N/A",
|
||||
"NONE",
|
||||
"NULL",
|
||||
"DEFAULT STRING",
|
||||
"0",
|
||||
"0123456789",
|
||||
"1234567890",
|
||||
}
|
||||
for _, p := range placeholders {
|
||||
if upper == p {
|
||||
|
||||
@@ -84,6 +84,10 @@ func TestCleanDMIValue(t *testing.T) {
|
||||
{" Inspur ", "Inspur"},
|
||||
{"", ""},
|
||||
{"0", ""},
|
||||
{"0123456789", ""},
|
||||
{"1234567890", ""},
|
||||
{"Not Available", ""},
|
||||
{"To Be Filled By O.E.M", ""},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
got := cleanDMIValue(tt.input)
|
||||
@@ -109,6 +113,80 @@ func TestParseDMIFields(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBoard_Dell(t *testing.T) {
|
||||
type1 := mustReadFile(t, "testdata/dmidecode_type1_dell.txt")
|
||||
type2 := mustReadFile(t, "testdata/dmidecode_type2_dell.txt")
|
||||
|
||||
board := parseBoard(type1, type2)
|
||||
|
||||
if board.SerialNumber != "7SG9F63" {
|
||||
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "7SG9F63")
|
||||
}
|
||||
if board.Manufacturer == nil || *board.Manufacturer != "Dell Inc." {
|
||||
t.Errorf("manufacturer: got %v, want Dell Inc.", board.Manufacturer)
|
||||
}
|
||||
if board.ProductName == nil || *board.ProductName != "PowerEdge R740xd" {
|
||||
t.Errorf("product_name: got %v, want PowerEdge R740xd", board.ProductName)
|
||||
}
|
||||
// part number comes from type2 Product Name
|
||||
if board.PartNumber == nil || *board.PartNumber != "0F9N89" {
|
||||
t.Errorf("part_number: got %v, want 0F9N89", board.PartNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBoard_HPE(t *testing.T) {
|
||||
type1 := mustReadFile(t, "testdata/dmidecode_type1_hpe.txt")
|
||||
type2 := mustReadFile(t, "testdata/dmidecode_type2_hpe.txt")
|
||||
|
||||
board := parseBoard(type1, type2)
|
||||
|
||||
if board.SerialNumber != "CZJ9320CXN" {
|
||||
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "CZJ9320CXN")
|
||||
}
|
||||
if board.Manufacturer == nil || *board.Manufacturer != "HPE" {
|
||||
t.Errorf("manufacturer: got %v, want HPE", board.Manufacturer)
|
||||
}
|
||||
if board.ProductName == nil || *board.ProductName != "ProLiant DL380 Gen10" {
|
||||
t.Errorf("product_name: got %v, want ProLiant DL380 Gen10", board.ProductName)
|
||||
}
|
||||
if board.PartNumber == nil || *board.PartNumber != "ProLiant DL380 Gen10" {
|
||||
t.Errorf("part_number: got %v, want ProLiant DL380 Gen10", board.PartNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBoard_Supermicro_Placeholders(t *testing.T) {
|
||||
type1 := mustReadFile(t, "testdata/dmidecode_type1_supermicro.txt")
|
||||
type2 := mustReadFile(t, "testdata/dmidecode_type2_supermicro.txt")
|
||||
|
||||
board := parseBoard(type1, type2)
|
||||
|
||||
if board.SerialNumber != "S214726X2A36789" {
|
||||
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "S214726X2A36789")
|
||||
}
|
||||
if board.Manufacturer == nil || *board.Manufacturer != "Supermicro" {
|
||||
t.Errorf("manufacturer: got %v, want Supermicro", board.Manufacturer)
|
||||
}
|
||||
if board.ProductName == nil || *board.ProductName != "SYS-6028R-WTR" {
|
||||
t.Errorf("product_name: got %v, want SYS-6028R-WTR", board.ProductName)
|
||||
}
|
||||
// "X10DRW-i" is the real part number from type 2
|
||||
if board.PartNumber == nil || *board.PartNumber != "X10DRW-i" {
|
||||
t.Errorf("part_number: got %v, want X10DRW-i", board.PartNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBIOSFirmware_Dell(t *testing.T) {
|
||||
type0 := mustReadFile(t, "testdata/dmidecode_type0_dell.txt")
|
||||
fw := parseBIOSFirmware(type0)
|
||||
|
||||
if len(fw) != 1 {
|
||||
t.Fatalf("expected 1 firmware record, got %d", len(fw))
|
||||
}
|
||||
if fw[0].Version != "2.5.4" {
|
||||
t.Errorf("version: got %q, want 2.5.4", fw[0].Version)
|
||||
}
|
||||
}
|
||||
|
||||
func mustReadFile(t *testing.T, path string) string {
|
||||
t.Helper()
|
||||
b, err := os.ReadFile(path)
|
||||
|
||||
@@ -40,6 +40,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||
@@ -48,7 +49,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
|
||||
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
|
||||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||
snap.Sensors = mergeIPMISensors(buildSensorsFromDoc(sensorDoc), collectIPMISensors())
|
||||
snap.EventLogs = append(collectIPMISEL(), collectDmesgErrors()...)
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
// remaining collectors added in steps 1.8 – 1.10
|
||||
|
||||
129
audit/internal/collector/dmesg_events.go
Normal file
129
audit/internal/collector/dmesg_events.go
Normal file
@@ -0,0 +1,129 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var (
	// dmesgTimestampRE splits a dmesg line into its bracketed prefix (group 1)
	// and the rest of the line (group 2). Both prefix styles are accepted:
	//
	//	dmesg -T:  [Thu Jun 18 14:23:45 2026] message
	//	dmesg:     [  123.456789] message
	dmesgTimestampRE = regexp.MustCompile(`^\[([^\]]+)\]\s*(.*)$`)

	// dmesgErrorPatterns lists the case-insensitive keywords that mark a
	// dmesg message as an error or hardware problem worth capturing.
	dmesgErrorPatterns = []*regexp.Regexp{
		regexp.MustCompile(`(?i)\berr(or)?\b`),
		regexp.MustCompile(`(?i)\bfail(ed|ure)?\b`),
		regexp.MustCompile(`(?i)\bfault\b`),
		regexp.MustCompile(`(?i)\bwarn(ing)?\b`),
		regexp.MustCompile(`(?i)\bAER\b`),
		regexp.MustCompile(`(?i)\bXid\b`),
		regexp.MustCompile(`(?i)\bNVRM\b`),
		regexp.MustCompile(`(?i)\bpanic\b`),
		regexp.MustCompile(`(?i)\bcorrected\b`),
		regexp.MustCompile(`(?i)\buncorrect`),
		regexp.MustCompile(`(?i)\bECC\b`),
		regexp.MustCompile(`(?i)\btimeout\b`),
		regexp.MustCompile(`(?i)\breset\b`),
		regexp.MustCompile(`(?i)\bdead\b`),
		regexp.MustCompile(`(?i)\bhang\b`),
		regexp.MustCompile(`(?i)\bstall\b`),
		regexp.MustCompile(`(?i)\bdisabled\b`),
	}
)
|
||||
|
||||
// collectDmesgErrors runs `dmesg -T` (or `dmesg` without -T on failure) and
|
||||
// returns only lines that match known error/warning patterns.
|
||||
func collectDmesgErrors() []schema.HardwareEventLog {
|
||||
out, err := exec.Command("dmesg", "-T").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
// Fallback: dmesg without human-readable timestamps
|
||||
out, err = exec.Command("dmesg").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
entries := parseDmesgErrors(string(out))
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Info("dmesg: collected error entries", "count", len(entries))
|
||||
return entries
|
||||
}
|
||||
|
||||
func parseDmesgErrors(output string) []schema.HardwareEventLog {
|
||||
var entries []schema.HardwareEventLog
|
||||
collectedAt := time.Now().UTC().Format(time.RFC3339)
|
||||
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var timestamp, message string
|
||||
if m := dmesgTimestampRE.FindStringSubmatch(line); m != nil {
|
||||
timestamp = strings.TrimSpace(m[1])
|
||||
message = strings.TrimSpace(m[2])
|
||||
} else {
|
||||
message = line
|
||||
}
|
||||
|
||||
if message == "" {
|
||||
continue
|
||||
}
|
||||
if !matchesAny(message, dmesgErrorPatterns) {
|
||||
continue
|
||||
}
|
||||
|
||||
severity := dmesgSeverity(message)
|
||||
source := "dmesg"
|
||||
|
||||
var eventTime *string
|
||||
if timestamp != "" {
|
||||
t := timestamp
|
||||
eventTime = &t
|
||||
} else {
|
||||
eventTime = &collectedAt
|
||||
}
|
||||
|
||||
entries = append(entries, schema.HardwareEventLog{
|
||||
Source: source,
|
||||
EventTime: eventTime,
|
||||
Severity: &severity,
|
||||
Message: message,
|
||||
})
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
// matchesAny reports whether s matches at least one of patterns. It returns
// false for a nil or empty pattern list.
func matchesAny(s string, patterns []*regexp.Regexp) bool {
	for i := range patterns {
		if patterns[i].MatchString(s) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func dmesgSeverity(msg string) string {
|
||||
lower := strings.ToLower(msg)
|
||||
switch {
|
||||
case strings.Contains(lower, "panic") ||
|
||||
strings.Contains(lower, "aer") ||
|
||||
strings.Contains(lower, "uncorrect") ||
|
||||
strings.Contains(lower, "xid") ||
|
||||
strings.Contains(lower, "nvrm"):
|
||||
return statusCritical
|
||||
case strings.Contains(lower, "error") ||
|
||||
strings.Contains(lower, "fault") ||
|
||||
strings.Contains(lower, "fail") ||
|
||||
strings.Contains(lower, "dead") ||
|
||||
strings.Contains(lower, "hang"):
|
||||
return statusCritical
|
||||
default:
|
||||
return statusWarning
|
||||
}
|
||||
}
|
||||
90
audit/internal/collector/ipmi_sel.go
Normal file
90
audit/internal/collector/ipmi_sel.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// collectIPMISEL runs `ipmitool sel list` and returns parsed event log entries.
|
||||
// Returns nil if ipmitool is unavailable or the SEL is empty.
|
||||
func collectIPMISEL() []schema.HardwareEventLog {
|
||||
out, err := exec.Command("ipmitool", "sel", "list").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
entries := parseIPMISELOutput(string(out))
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Info("ipmi sel: collected", "entries", len(entries))
|
||||
return entries
|
||||
}
|
||||
|
||||
// parseIPMISELOutput parses `ipmitool sel list` output.
|
||||
// Line format: ID | date | time | sensor | event description | direction
|
||||
// Example: 1 | 06/18/2026 | 14:23:45 | Temperature #0x30 | Upper Critical going high | Asserted
|
||||
func parseIPMISELOutput(output string) []schema.HardwareEventLog {
|
||||
var entries []schema.HardwareEventLog
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(line, "|", 6)
|
||||
if len(parts) < 5 {
|
||||
continue
|
||||
}
|
||||
id := strings.TrimSpace(parts[0])
|
||||
date := strings.TrimSpace(parts[1])
|
||||
timeStr := strings.TrimSpace(parts[2])
|
||||
sensor := strings.TrimSpace(parts[3])
|
||||
event := strings.TrimSpace(parts[4])
|
||||
direction := ""
|
||||
if len(parts) == 6 {
|
||||
direction = strings.TrimSpace(parts[5])
|
||||
}
|
||||
|
||||
var eventTime *string
|
||||
if date != "" && timeStr != "" {
|
||||
t := fmt.Sprintf("%s %s", date, timeStr)
|
||||
eventTime = &t
|
||||
}
|
||||
|
||||
message := event
|
||||
if direction != "" && strings.EqualFold(direction, "Deasserted") {
|
||||
message = event + " (Deasserted)"
|
||||
}
|
||||
|
||||
severity := ipmiSELSeverity(event)
|
||||
isActive := !strings.EqualFold(direction, "Deasserted")
|
||||
|
||||
entry := schema.HardwareEventLog{
|
||||
Source: "ipmi-sel",
|
||||
EventTime: eventTime,
|
||||
Severity: &severity,
|
||||
MessageID: &id,
|
||||
Message: message,
|
||||
IsActive: &isActive,
|
||||
}
|
||||
if sensor != "" {
|
||||
entry.ComponentRef = &sensor
|
||||
}
|
||||
entries = append(entries, entry)
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func ipmiSELSeverity(event string) string {
|
||||
lower := strings.ToLower(event)
|
||||
switch {
|
||||
case strings.Contains(lower, "critical") || strings.Contains(lower, "non-recoverable"):
|
||||
return statusCritical
|
||||
case strings.Contains(lower, "non-critical") || strings.Contains(lower, "warning") || strings.Contains(lower, "degraded"):
|
||||
return statusWarning
|
||||
default:
|
||||
return "info"
|
||||
}
|
||||
}
|
||||
216
audit/internal/collector/ipmi_sensors.go
Normal file
216
audit/internal/collector/ipmi_sensors.go
Normal file
@@ -0,0 +1,216 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// collectIPMISensors runs `ipmitool sensor` and returns parsed sensor readings.
|
||||
// Returns nil if ipmitool is unavailable or produces no output.
|
||||
func collectIPMISensors() *schema.HardwareSensors {
|
||||
out, err := exec.Command("ipmitool", "sensor").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
result := parseIPMISensorOutput(string(out))
|
||||
if result == nil {
|
||||
return nil
|
||||
}
|
||||
slog.Info("ipmi sensors: collected",
|
||||
"fans", len(result.Fans),
|
||||
"temperatures", len(result.Temperatures),
|
||||
"power", len(result.Power),
|
||||
"other", len(result.Other),
|
||||
)
|
||||
return result
|
||||
}
|
||||
|
||||
// parseIPMISensorOutput parses `ipmitool sensor` text output.
|
||||
// Each line: name | value | unit | status | lnr | lcr | lnc | unc | ucr | unr
|
||||
func parseIPMISensorOutput(output string) *schema.HardwareSensors {
|
||||
result := &schema.HardwareSensors{}
|
||||
seen := map[string]struct{}{}
|
||||
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 4 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(parts[0])
|
||||
rawVal := strings.TrimSpace(parts[1])
|
||||
unit := strings.TrimSpace(parts[2])
|
||||
status := strings.TrimSpace(parts[3])
|
||||
|
||||
if name == "" || rawVal == "na" || rawVal == "N/A" || rawVal == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
value, err := strconv.ParseFloat(rawVal, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
statusStr := normalizeIPMISensorStatus(status)
|
||||
|
||||
switch {
|
||||
case strings.EqualFold(unit, "RPM"):
|
||||
if duplicateSensor(seen, "fan", name) {
|
||||
continue
|
||||
}
|
||||
rpm := int(value)
|
||||
item := schema.HardwareFanSensor{Name: name, RPM: &rpm}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Fans = append(result.Fans, item)
|
||||
|
||||
case strings.EqualFold(unit, "degrees C") || strings.EqualFold(unit, "C"):
|
||||
if duplicateSensor(seen, "temp", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwareTemperatureSensor{Name: name, Celsius: &value}
|
||||
if len(parts) >= 9 {
|
||||
if unc := parseIPMIThreshold(parts[7]); unc != nil {
|
||||
item.ThresholdWarningCelsius = unc
|
||||
}
|
||||
if ucr := parseIPMIThreshold(parts[8]); ucr != nil {
|
||||
item.ThresholdCriticalCelsius = ucr
|
||||
}
|
||||
}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
} else {
|
||||
item.Status = deriveTemperatureStatus(item.Celsius, item.ThresholdWarningCelsius, item.ThresholdCriticalCelsius)
|
||||
}
|
||||
result.Temperatures = append(result.Temperatures, item)
|
||||
|
||||
case strings.EqualFold(unit, "Volts") || strings.EqualFold(unit, "V"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, VoltageV: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
case strings.EqualFold(unit, "Watts") || strings.EqualFold(unit, "W"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, PowerW: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
case strings.EqualFold(unit, "Amps") || strings.EqualFold(unit, "A"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, CurrentA: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
default:
|
||||
if duplicateSensor(seen, "other", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwareOtherSensor{Name: name, Value: &value}
|
||||
if unit != "" {
|
||||
item.Unit = &unit
|
||||
}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Other = append(result.Other, item)
|
||||
}
|
||||
}
|
||||
|
||||
if len(result.Fans) == 0 && len(result.Temperatures) == 0 && len(result.Power) == 0 && len(result.Other) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// parseIPMIThreshold parses one threshold column of `ipmitool sensor` output.
// It returns nil for a blank column, the placeholders "na" / "N/A", or text
// that strconv.ParseFloat rejects; otherwise a pointer to the parsed value.
func parseIPMIThreshold(raw string) *float64 {
	switch trimmed := strings.TrimSpace(raw); trimmed {
	case "", "na", "N/A":
		return nil
	default:
		parsed, err := strconv.ParseFloat(trimmed, 64)
		if err != nil {
			return nil
		}
		return &parsed
	}
}
|
||||
|
||||
func normalizeIPMISensorStatus(s string) string {
|
||||
switch strings.ToLower(s) {
|
||||
case "ok":
|
||||
return statusOK
|
||||
case "cr", "ucr", "lcr":
|
||||
return statusCritical
|
||||
case "nc", "unc", "lnc", "nr", "unr", "lnr":
|
||||
return statusWarning
|
||||
case "ns", "na":
|
||||
return ""
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// mergeIPMISensors appends IPMI sensor entries into existing, skipping names already present.
|
||||
func mergeIPMISensors(existing, ipmi *schema.HardwareSensors) *schema.HardwareSensors {
|
||||
if ipmi == nil {
|
||||
return existing
|
||||
}
|
||||
if existing == nil {
|
||||
return ipmi
|
||||
}
|
||||
|
||||
existingNames := map[string]struct{}{}
|
||||
for _, s := range existing.Fans {
|
||||
existingNames["fan\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Temperatures {
|
||||
existingNames["temp\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Power {
|
||||
existingNames["power\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Other {
|
||||
existingNames["other\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
|
||||
for _, s := range ipmi.Fans {
|
||||
if _, ok := existingNames["fan\x00"+s.Name]; !ok {
|
||||
existing.Fans = append(existing.Fans, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Temperatures {
|
||||
if _, ok := existingNames["temp\x00"+s.Name]; !ok {
|
||||
existing.Temperatures = append(existing.Temperatures, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Power {
|
||||
if _, ok := existingNames["power\x00"+s.Name]; !ok {
|
||||
existing.Power = append(existing.Power, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Other {
|
||||
if _, ok := existingNames["other\x00"+s.Name]; !ok {
|
||||
existing.Other = append(existing.Other, s)
|
||||
}
|
||||
}
|
||||
return existing
|
||||
}
|
||||
87
audit/internal/collector/memory_test.go
Normal file
87
audit/internal/collector/memory_test.go
Normal file
@@ -0,0 +1,87 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseMemory_Mixed(t *testing.T) {
|
||||
out := mustReadFile(t, "testdata/dmidecode_type17_mixed.txt")
|
||||
dimms := parseMemory(out)
|
||||
|
||||
if len(dimms) != 3 {
|
||||
t.Fatalf("expected 3 DIMMs, got %d", len(dimms))
|
||||
}
|
||||
|
||||
// slot 0: populated, 16 GB Supermicro-style
|
||||
d0 := dimms[0]
|
||||
if d0.Present == nil || !*d0.Present {
|
||||
t.Errorf("dimm0: expected present=true")
|
||||
}
|
||||
if d0.SizeMB == nil || *d0.SizeMB != 16384 {
|
||||
t.Errorf("dimm0: size_mb=%v, want 16384", d0.SizeMB)
|
||||
}
|
||||
if d0.Slot == nil || *d0.Slot != "P1-DIMMA1" {
|
||||
t.Errorf("dimm0: slot=%v, want P1-DIMMA1", d0.Slot)
|
||||
}
|
||||
if d0.Location == nil || *d0.Location != "P0_Node0_Channel0_Dimm0" {
|
||||
t.Errorf("dimm0: location=%v, want P0_Node0_Channel0_Dimm0", d0.Location)
|
||||
}
|
||||
if d0.Manufacturer == nil || *d0.Manufacturer != "Micron" {
|
||||
t.Errorf("dimm0: manufacturer=%v, want Micron", d0.Manufacturer)
|
||||
}
|
||||
if d0.PartNumber == nil || *d0.PartNumber != "36ASF2G72PZ-2G1A2" {
|
||||
t.Errorf("dimm0: part_number=%v, want 36ASF2G72PZ-2G1A2", d0.PartNumber)
|
||||
}
|
||||
if d0.MaxSpeedMHz == nil || *d0.MaxSpeedMHz != 2133 {
|
||||
t.Errorf("dimm0: max_speed_mhz=%v, want 2133", d0.MaxSpeedMHz)
|
||||
}
|
||||
|
||||
// slot 1: empty
|
||||
d1 := dimms[1]
|
||||
if d1.Present == nil || *d1.Present {
|
||||
t.Errorf("dimm1: expected present=false")
|
||||
}
|
||||
if d1.Status == nil || *d1.Status != statusEmpty {
|
||||
t.Errorf("dimm1: status=%v, want %s", d1.Status, statusEmpty)
|
||||
}
|
||||
if d1.SizeMB != nil {
|
||||
t.Errorf("dimm1: size_mb should be nil for empty slot, got %v", d1.SizeMB)
|
||||
}
|
||||
|
||||
// slot 2: populated, 32768 MB Dell-style size
|
||||
d2 := dimms[2]
|
||||
if d2.Present == nil || !*d2.Present {
|
||||
t.Errorf("dimm2: expected present=true")
|
||||
}
|
||||
if d2.SizeMB == nil || *d2.SizeMB != 32768 {
|
||||
t.Errorf("dimm2: size_mb=%v, want 32768", d2.SizeMB)
|
||||
}
|
||||
if d2.Manufacturer == nil || *d2.Manufacturer != "Samsung" {
|
||||
t.Errorf("dimm2: manufacturer=%v, want Samsung", d2.Manufacturer)
|
||||
}
|
||||
if d2.CurrentSpeedMHz == nil || *d2.CurrentSpeedMHz != 2400 {
|
||||
t.Errorf("dimm2: current_speed_mhz=%v, want 2400", d2.CurrentSpeedMHz)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMemorySizeMB(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
want int
|
||||
}{
|
||||
{"16 GB", 16384},
|
||||
{"32 GB", 32768},
|
||||
{"8 GB", 8192},
|
||||
{"16384 MB", 16384},
|
||||
{"32768 MB", 32768},
|
||||
{"No Module Installed", 0},
|
||||
{"0", 0},
|
||||
{"", 0},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
got := parseMemorySizeMB(tt.input)
|
||||
if got != tt.want {
|
||||
t.Errorf("parseMemorySizeMB(%q) = %d, want %d", tt.input, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
const mellanoxVendorID = 0x15b3
|
||||
const nicProbeTimeout = 2 * time.Second
|
||||
|
||||
var (
|
||||
@@ -80,16 +79,7 @@ func enrichPCIeWithMellanox(devs []schema.HardwarePCIeDevice) []schema.HardwareP
|
||||
}
|
||||
|
||||
func isMellanoxDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.VendorID != nil && *dev.VendorID == mellanoxVendorID {
|
||||
return true
|
||||
}
|
||||
if dev.Manufacturer != nil {
|
||||
m := strings.ToLower(*dev.Manufacturer)
|
||||
if strings.Contains(m, "mellanox") || strings.Contains(m, "nvidia networking") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
return dev.VendorID != nil && *dev.VendorID == MellanoxVendorID
|
||||
}
|
||||
|
||||
func queryMellanoxFromMstflint(bdf string) (firmware, serial string) {
|
||||
|
||||
@@ -55,7 +55,7 @@ func TestEnrichPCIeWithMellanox_mstflint(t *testing.T) {
|
||||
}
|
||||
netIfacesByBDF = func(string) []string { return nil }
|
||||
|
||||
vendorID := mellanoxVendorID
|
||||
vendorID := MellanoxVendorID
|
||||
bdf := "0000:18:00.0"
|
||||
manufacturer := "Mellanox Technologies"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
@@ -99,7 +99,7 @@ func TestEnrichPCIeWithMellanox_fallbackEthtool(t *testing.T) {
|
||||
return "driver: mlx5_core\nfirmware-version: 28.40.1000\n", nil
|
||||
}
|
||||
|
||||
vendorID := mellanoxVendorID
|
||||
vendorID := MellanoxVendorID
|
||||
bdf := "0000:18:00.0"
|
||||
manufacturer := "NVIDIA Networking"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
|
||||
@@ -10,8 +10,6 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
Index int
|
||||
BDF string
|
||||
@@ -240,13 +238,7 @@ func normalizePCIeBDF(bdf string) string {
|
||||
}
|
||||
|
||||
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.VendorID != nil && *dev.VendorID == nvidiaVendorID {
|
||||
return true
|
||||
}
|
||||
if dev.Manufacturer != nil && strings.Contains(strings.ToLower(*dev.Manufacturer), "nvidia") {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
return dev.VendorID != nil && *dev.VendorID == NvidiaVendorID
|
||||
}
|
||||
|
||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||
|
||||
@@ -57,7 +57,7 @@ func TestNormalizePCIeBDF(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
vendorID := nvidiaVendorID
|
||||
vendorID := NvidiaVendorID
|
||||
bdf := "0000:65:00.0"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
status := "OK"
|
||||
@@ -104,7 +104,7 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
||||
vendorID := nvidiaVendorID
|
||||
vendorID := NvidiaVendorID
|
||||
bdf := "0000:17:00.0"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
devices := []schema.HardwarePCIeDevice{
|
||||
|
||||
11
audit/internal/collector/pci_vendors.go
Normal file
11
audit/internal/collector/pci_vendors.go
Normal file
@@ -0,0 +1,11 @@
|
||||
package collector
|
||||
|
||||
// PCI vendor IDs for hardware classification.
|
||||
// Source: https://pcisig.com / https://pci-ids.ucw.cz/
|
||||
const (
|
||||
NvidiaVendorID = 0x10de
|
||||
AMDVendorID = 0x1002
|
||||
AspeedVendorID = 0x1a03
|
||||
MellanoxVendorID = 0x15b3
|
||||
IntelVendorID = 0x8086
|
||||
)
|
||||
@@ -126,38 +126,39 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
dev.Status = &status
|
||||
|
||||
// Slot is the BDF: "0000:00:02.0"
|
||||
if bdf := fields["Slot"]; bdf != "" {
|
||||
dev.Slot = &bdf
|
||||
dev.BDF = &bdf
|
||||
bdfStr := fields["Slot"]
|
||||
if bdfStr != "" {
|
||||
dev.Slot = &bdfStr
|
||||
dev.BDF = &bdfStr
|
||||
// parse vendor_id and device_id from sysfs
|
||||
vendorID, deviceID := readPCIIDs(bdf)
|
||||
vendorID, deviceID := readPCIIDs(bdfStr)
|
||||
if vendorID != 0 {
|
||||
dev.VendorID = &vendorID
|
||||
}
|
||||
if deviceID != 0 {
|
||||
dev.DeviceID = &deviceID
|
||||
}
|
||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
||||
if numaNode, ok := readPCINumaNode(bdfStr); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
}
|
||||
if group, ok := readPCIIOMMUGroup(bdf); ok {
|
||||
if group, ok := readPCIIOMMUGroup(bdfStr); ok {
|
||||
dev.IOMMUGroup = &group
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||
if width, ok := readPCIIntAttribute(bdfStr, "current_link_width"); ok {
|
||||
dev.LinkWidth = &width
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
|
||||
if width, ok := readPCIIntAttribute(bdfStr, "max_link_width"); ok {
|
||||
dev.MaxLinkWidth = &width
|
||||
}
|
||||
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
|
||||
if speed, ok := readPCIStringAttribute(bdfStr, "current_link_speed"); ok {
|
||||
linkSpeed := normalizePCILinkSpeed(speed)
|
||||
if linkSpeed != "" {
|
||||
dev.LinkSpeed = &linkSpeed
|
||||
}
|
||||
}
|
||||
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
|
||||
if speed, ok := readPCIStringAttribute(bdfStr, "max_link_speed"); ok {
|
||||
linkSpeed := normalizePCILinkSpeed(speed)
|
||||
if linkSpeed != "" {
|
||||
dev.MaxLinkSpeed = &linkSpeed
|
||||
@@ -178,7 +179,15 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
|
||||
// SVendor/SDevice available but not in schema — skip
|
||||
|
||||
// Warn if PCIe link is running below its maximum negotiated speed.
|
||||
// Detect NVLink bridge mezzanine cards (CPU→HGX internal link).
|
||||
// These are Mellanox x2 devices with no host net interfaces and a DeviceName
|
||||
// containing "NVLINK". The targeted lspci call is only executed for the small
|
||||
// number of narrow-link Mellanox cards that pass the cheap pre-filter.
|
||||
if bdfStr != "" && isNVLinkBridgeCandidate(bdfStr, dev) && confirmNVLinkBridgeDeviceName(bdfStr) {
|
||||
markNVLinkBridge(&dev)
|
||||
}
|
||||
|
||||
// Warn (or Critical for NVLink bridges) if PCIe link is running below max.
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
return dev
|
||||
@@ -265,17 +274,37 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
||||
// speed is below the maximum negotiated speed supported by both ends.
|
||||
// applyPCIeLinkSpeedWarning sets device status when the current PCIe link speed is
|
||||
// below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards
|
||||
// get Critical because they are fixed internal connectors that must always train
|
||||
// to max speed — any downgrade signals a hardware fault.
|
||||
//
|
||||
// Disabled devices (sysfs enable==0) are skipped: they carry no data traffic and
|
||||
// their link state has no operational impact. This covers management endpoints
|
||||
// (e.g. PCIe switch fabric controllers on HGX baseboards) that the kernel never
|
||||
// activates but that lspci still reports with link stats.
|
||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||
return
|
||||
}
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
return
|
||||
}
|
||||
if dev.BDF != nil {
|
||||
if enabled, ok := readPCIIntAttribute(*dev.BDF, "enable"); ok && enabled == 0 {
|
||||
return
|
||||
}
|
||||
}
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
|
||||
isNVLinkBridge := dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge"
|
||||
if isNVLinkBridge {
|
||||
crit := statusCritical
|
||||
dev.Status = &crit
|
||||
} else {
|
||||
warn := statusWarning
|
||||
dev.Status = &warn
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
206
audit/internal/collector/pcie_nvlink_bridge.go
Normal file
206
audit/internal/collector/pcie_nvlink_bridge.go
Normal file
@@ -0,0 +1,206 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`)
|
||||
|
||||
// isNVLinkBridgeCandidate returns true for Mellanox PCIe devices that look like
|
||||
// NVLink bridge mezzanine cards: narrow link (x2), no host net interfaces.
|
||||
// These are the CPU-side PCIe control plane of the NVSwitch fabric on HGX/DGX systems.
|
||||
func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool {
|
||||
if !isMellanoxDevice(dev) {
|
||||
return false
|
||||
}
|
||||
if dev.LinkWidth == nil || *dev.LinkWidth > 2 {
|
||||
return false
|
||||
}
|
||||
if len(netIfacesByBDF(bdf)) > 0 {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// confirmNVLinkBridgeDeviceName runs `lspci -s <bdf> -v` for a single device
// and reports whether its output mentions "NVLINK" (case-insensitive) anywhere.
// It returns false when lspci cannot be run or exits with an error. Intended
// for devices already accepted by isNVLinkBridgeCandidate, which keeps the
// number of lspci invocations small.
func confirmNVLinkBridgeDeviceName(bdf string) bool {
	verbose, err := exec.Command("lspci", "-s", bdf, "-v").Output()
	if err != nil {
		return false
	}
	// The needle contains no whitespace or newline, so searching the whole
	// output is the same as searching it line by line.
	return strings.Contains(strings.ToUpper(string(verbose)), "NVLINK")
}
|
||||
|
||||
// markNVLinkBridge overwrites device_class and adds telemetry flags on a detected
|
||||
// NVLink bridge card. Must be called before applyPCIeLinkSpeedWarning so that the
|
||||
// correct severity (Critical) is applied.
|
||||
func markNVLinkBridge(dev *schema.HardwarePCIeDevice) {
|
||||
class := "NVLinkBridge"
|
||||
dev.DeviceClass = &class
|
||||
if dev.Telemetry == nil {
|
||||
dev.Telemetry = map[string]any{}
|
||||
}
|
||||
dev.Telemetry["nvlink_bridge"] = true
|
||||
}
|
||||
|
||||
// enrichNVLinkBridgesWithGPUTopo cross-references NVLink bridge PCIe status with
|
||||
// the GPU-side NVLink topology reported by nvidia-smi. For each bridge device it
|
||||
// adds nvlink_topo_all_active and nvlink_topo_min_links to the telemetry, and
|
||||
// upgrades a degraded-link Warning to Critical when the fabric is also affected.
|
||||
func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
hasBridge := false
|
||||
for _, d := range devs {
|
||||
if d.DeviceClass != nil && *d.DeviceClass == "NVLinkBridge" {
|
||||
hasBridge = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasBridge {
|
||||
return devs
|
||||
}
|
||||
|
||||
topo, err := queryNVIDIANVLinkTopo()
|
||||
if err != nil {
|
||||
slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err)
|
||||
return devs
|
||||
}
|
||||
|
||||
for i := range devs {
|
||||
if devs[i].DeviceClass == nil || *devs[i].DeviceClass != "NVLinkBridge" {
|
||||
continue
|
||||
}
|
||||
if devs[i].Telemetry == nil {
|
||||
devs[i].Telemetry = map[string]any{}
|
||||
}
|
||||
devs[i].Telemetry["nvlink_topo_all_active"] = topo.AllActive
|
||||
devs[i].Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks
|
||||
devs[i].Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount
|
||||
|
||||
// If the bridge PCIe is already degraded AND the fabric is also degraded
|
||||
// (missing NVLink connections), escalate to Critical.
|
||||
if devs[i].Status != nil && *devs[i].Status == statusCritical && !topo.AllActive {
|
||||
devs[i].Telemetry["nvlink_fabric_affected"] = true
|
||||
}
|
||||
}
|
||||
|
||||
slog.Info("nvlink-bridge: topo cross-reference applied",
|
||||
"gpu_count", topo.GPUCount,
|
||||
"all_active", topo.AllActive,
|
||||
"min_links", topo.MinNVLinks,
|
||||
)
|
||||
return devs
|
||||
}
|
||||
|
||||
// nvlinkTopoResult summarises the GPU×GPU NVLink cells of the matrix printed by
// `nvidia-smi topo -m`.
type nvlinkTopoResult struct {
	GPUCount   int  // number of GPU<n> columns found in the matrix header
	AllActive  bool // true when at least one NV# cell was seen and none of them was NV0
	MinNVLinks int  // smallest NV# value across GPU×GPU cells; 0 when NV0 was seen or no NV# cell exists
}

// queryNVIDIANVLinkTopo runs `nvidia-smi topo -m` and parses the NVLink matrix.
// The command error is returned unchanged when nvidia-smi cannot be started or
// exits with a failure.
func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
	out, err := exec.Command("nvidia-smi", "topo", "-m").Output()
	if err != nil {
		return nvlinkTopoResult{}, err
	}
	return parseNVIDIATopologyMatrix(string(out)), nil
}

// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
// nvidia-smi topo -m matrix.
//
// Format (abbreviated):
//
//	        GPU0  GPU1  ...  NIC0  CPU Affinity  NUMA Affinity  GPU NUMA ID
//	GPU0     X    NV18  ...  NODE  0-55          0              N/A
//	GPU1    NV18   X    ...  NODE  0-55          0              N/A
//	NIC0    NODE  NODE  ...   X
//
// The header is the first line starting with "GPU0". Only header fields of the
// exact form GPU<digits> count as GPU columns; a prefix match would also pick
// up the "GPU" word of the trailing "GPU NUMA ID" column and inflate GPUCount.
// A header with fewer than two GPU columns yields the zero result.
//
// Only rows labelled GPU<digits> are inspected, and within them only the GPU
// columns. Cells of the form NV<n> contribute n; every other token (X for
// self, NODE, SYS, PHB, PIX, ...) is skipped, so a GPU pair that only reports
// a PCIe path does not lower MinNVLinks or clear AllActive.
func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
	lines := strings.Split(raw, "\n")

	// Locate the header and remember which of its fields are GPU columns.
	headerIdx := -1
	var gpuCols []int // 0-based positions within the header fields
	for i, line := range lines {
		trimmed := strings.TrimSpace(line)
		if !strings.HasPrefix(trimmed, "GPU0") {
			continue
		}
		for j, col := range strings.Fields(trimmed) {
			if isGPUColumnLabel(col) {
				gpuCols = append(gpuCols, j)
			}
		}
		if len(gpuCols) >= 2 {
			headerIdx = i
		}
		break // only the first GPU0 line can be the header
	}
	if headerIdx < 0 {
		return nvlinkTopoResult{}
	}

	minLinks := -1 // -1 = no NV# cell seen yet
	for _, line := range lines[headerIdx+1:] {
		cells := strings.Fields(line)
		if len(cells) == 0 || !isGPUColumnLabel(cells[0]) {
			continue
		}
		// cells[0] is the row label, so header column j lives at cells[j+1].
		for _, col := range gpuCols {
			idx := col + 1
			if idx >= len(cells) {
				continue
			}
			n, ok := nvLinkBondCount(cells[idx])
			if !ok {
				continue
			}
			if minLinks < 0 || n < minLinks {
				minLinks = n
			}
		}
	}
	if minLinks < 0 {
		minLinks = 0
	}

	return nvlinkTopoResult{
		GPUCount:   len(gpuCols),
		AllActive:  minLinks > 0, // an NV0 cell forces minLinks to 0
		MinNVLinks: minLinks,
	}
}

// isGPUColumnLabel reports whether s is a matrix label of the form GPU<digits>.
func isGPUColumnLabel(s string) bool {
	digits, ok := strings.CutPrefix(s, "GPU")
	return ok && isASCIIDigits(digits)
}

// nvLinkBondCount parses a matrix cell of the form NV<digits> (prefix matched
// case-insensitively) and returns the bond count.
func nvLinkBondCount(cell string) (int, bool) {
	if len(cell) < 3 || !strings.EqualFold(cell[:2], "NV") || !isASCIIDigits(cell[2:]) {
		return 0, false
	}
	n, err := strconv.Atoi(cell[2:])
	if err != nil { // value does not fit in an int
		return 0, false
	}
	return n, true
}

// isASCIIDigits reports whether s is non-empty and consists only of 0-9.
func isASCIIDigits(s string) bool {
	if s == "" {
		return false
	}
	for i := 0; i < len(s); i++ {
		if s[i] < '0' || s[i] > '9' {
			return false
		}
	}
	return true
}
|
||||
124
audit/internal/collector/pcie_nvlink_bridge_test.go
Normal file
124
audit/internal/collector/pcie_nvlink_bridge_test.go
Normal file
@@ -0,0 +1,124 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseNVIDIATopologyMatrix(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// Real-world B200 HGX output: 8 GPUs, all pairs connected via NV18.
|
||||
input := ` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1
|
||||
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 SYS SYS
|
||||
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 SYS SYS
|
||||
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 SYS SYS
|
||||
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X SYS SYS
|
||||
NIC0 NODE NODE NODE NODE SYS SYS SYS SYS X PIX
|
||||
`
|
||||
got := parseNVIDIATopologyMatrix(input)
|
||||
|
||||
if got.GPUCount != 8 {
|
||||
t.Fatalf("GPUCount=%d want 8", got.GPUCount)
|
||||
}
|
||||
if !got.AllActive {
|
||||
t.Fatalf("AllActive=false want true")
|
||||
}
|
||||
if got.MinNVLinks != 18 {
|
||||
t.Fatalf("MinNVLinks=%d want 18", got.MinNVLinks)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVIDIATopologyMatrixPartialDegradation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// GPU1-GPU3 pair shows NV12 (reduced) instead of NV18.
|
||||
input := ` GPU0 GPU1 GPU2 GPU3
|
||||
GPU0 X NV18 NV18 NV18
|
||||
GPU1 NV18 X NV18 NV12
|
||||
GPU2 NV18 NV18 X NV18
|
||||
GPU3 NV18 NV12 NV18 X
|
||||
`
|
||||
got := parseNVIDIATopologyMatrix(input)
|
||||
|
||||
if got.MinNVLinks != 12 {
|
||||
t.Fatalf("MinNVLinks=%d want 12", got.MinNVLinks)
|
||||
}
|
||||
if !got.AllActive {
|
||||
t.Fatalf("AllActive=false want true (12 links is still active)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVIDIATopologyMatrixDisconnected(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// GPU0-GPU1 pair fully disconnected (NV0).
|
||||
input := ` GPU0 GPU1
|
||||
GPU0 X NV0
|
||||
GPU1 NV0 X
|
||||
`
|
||||
got := parseNVIDIATopologyMatrix(input)
|
||||
|
||||
if got.AllActive {
|
||||
t.Fatalf("AllActive=true want false (NV0 means no links)")
|
||||
}
|
||||
if got.MinNVLinks != 0 {
|
||||
t.Fatalf("MinNVLinks=%d want 0", got.MinNVLinks)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got := parseNVIDIATopologyMatrix("no gpus here")
|
||||
if got.GPUCount != 0 {
|
||||
t.Fatalf("GPUCount=%d want 0", got.GPUCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
bridgeClass := "NVLinkBridge"
|
||||
linkSpeed := "Gen3"
|
||||
maxLinkSpeed := "Gen4"
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
dev.DeviceClass = &bridgeClass
|
||||
dev.LinkSpeed = &linkSpeed
|
||||
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||
s := statusOK
|
||||
dev.Status = &s
|
||||
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
if dev.Status == nil || *dev.Status != statusCritical {
|
||||
t.Fatalf("status=%v want Critical for NVLink bridge degradation", dev.Status)
|
||||
}
|
||||
if dev.ErrorDescription == nil {
|
||||
t.Fatal("ErrorDescription nil, want degradation message")
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyPCIeLinkSpeedWarningRegularCardIsWarning(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
regularClass := "NetworkController"
|
||||
linkSpeed := "Gen3"
|
||||
maxLinkSpeed := "Gen4"
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
dev.DeviceClass = ®ularClass
|
||||
dev.LinkSpeed = &linkSpeed
|
||||
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||
s := statusOK
|
||||
dev.Status = &s
|
||||
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
if dev.Status == nil || *dev.Status != statusWarning {
|
||||
t.Fatalf("status=%v want Warning for regular card degradation", dev.Status)
|
||||
}
|
||||
}
|
||||
@@ -58,7 +58,6 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
||||
|
||||
for _, chip := range chips {
|
||||
features := doc[chip]
|
||||
location := sensorLocation(chip)
|
||||
|
||||
keys := make([]string, 0, len(features))
|
||||
for key := range features {
|
||||
@@ -80,25 +79,25 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
||||
}
|
||||
switch classifySensorFeature(feature) {
|
||||
case "fan":
|
||||
item := buildFanSensor(name, location, feature)
|
||||
item := buildFanSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "fan", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Fans = append(result.Fans, *item)
|
||||
case "temp":
|
||||
item := buildTempSensor(name, location, feature)
|
||||
item := buildTempSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "temp", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Temperatures = append(result.Temperatures, *item)
|
||||
case "power":
|
||||
item := buildPowerSensor(name, location, feature)
|
||||
item := buildPowerSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "power", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Power = append(result.Power, *item)
|
||||
default:
|
||||
item := buildOtherSensor(name, location, feature)
|
||||
item := buildOtherSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "other", item.Name) {
|
||||
continue
|
||||
}
|
||||
@@ -128,14 +127,6 @@ func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func sensorLocation(chip string) *string {
|
||||
chip = strings.TrimSpace(chip)
|
||||
if chip == "" {
|
||||
return nil
|
||||
}
|
||||
return &chip
|
||||
}
|
||||
|
||||
func classifySensorFeature(feature map[string]any) string {
|
||||
for key := range feature {
|
||||
switch {
|
||||
@@ -154,24 +145,24 @@ func classifySensorFeature(feature map[string]any) string {
|
||||
return "other"
|
||||
}
|
||||
|
||||
func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
|
||||
func buildFanSensor(name string, feature map[string]any) *schema.HardwareFanSensor {
|
||||
rpm, ok := firstFeatureInt(feature, "_input")
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &rpm}
|
||||
item := &schema.HardwareFanSensor{Name: name, RPM: &rpm}
|
||||
if status := sensorStatusFromFeature(feature); status != nil {
|
||||
item.Status = status
|
||||
}
|
||||
return item
|
||||
}
|
||||
|
||||
func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||
func buildTempSensor(name string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||
celsius, ok := firstFeatureFloat(feature, "_input")
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &celsius}
|
||||
item := &schema.HardwareTemperatureSensor{Name: name, Celsius: &celsius}
|
||||
if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
|
||||
item.ThresholdWarningCelsius = &warning
|
||||
}
|
||||
@@ -186,8 +177,8 @@ func buildTempSensor(name string, location *string, feature map[string]any) *sch
|
||||
return item
|
||||
}
|
||||
|
||||
func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||
item := &schema.HardwarePowerSensor{Name: name, Location: location}
|
||||
func buildPowerSensor(name string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||
item := &schema.HardwarePowerSensor{Name: name}
|
||||
if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
|
||||
item.PowerW = &v
|
||||
}
|
||||
@@ -206,12 +197,12 @@ func buildPowerSensor(name string, location *string, feature map[string]any) *sc
|
||||
return item
|
||||
}
|
||||
|
||||
func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||
func buildOtherSensor(name string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||
value, unit, ok := firstGenericSensorValue(feature)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &value}
|
||||
item := &schema.HardwareOtherSensor{Name: name, Value: &value}
|
||||
if unit != "" {
|
||||
item.Unit = &unit
|
||||
}
|
||||
|
||||
@@ -36,6 +36,24 @@ func bestEffortRescanHotplugStorage() {
|
||||
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
|
||||
} else {
|
||||
for _, path := range hostPaths {
|
||||
// SAS HBAs (e.g. smartpqi) block indefinitely in sas_user_scan when
|
||||
// written to — SAS topology is discovered by the driver itself.
|
||||
// Detect via two methods: (1) sas_host class registration, and
|
||||
// (2) driver proc_name — smartpqi uses scsi_transport_sas but does
|
||||
// not register a sas_host object, so (1) alone misses it.
|
||||
host := filepath.Base(filepath.Dir(path))
|
||||
if _, err := os.Stat("/sys/class/sas_host/" + host); err == nil {
|
||||
slog.Info("storage: scsi host scan skipped (SAS host)", "path", path)
|
||||
continue
|
||||
}
|
||||
if procName, err := os.ReadFile("/sys/class/scsi_host/" + host + "/proc_name"); err == nil {
|
||||
switch strings.TrimSpace(string(procName)) {
|
||||
case "smartpqi", "hpsa":
|
||||
slog.Info("storage: scsi host scan skipped (SAS transport driver)",
|
||||
"path", path, "driver", strings.TrimSpace(string(procName)))
|
||||
continue
|
||||
}
|
||||
}
|
||||
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
|
||||
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
|
||||
continue
|
||||
@@ -66,17 +84,41 @@ func collectStorage() []schema.HardwareStorage {
|
||||
return result
|
||||
}
|
||||
|
||||
// jsonInt64 is an int64 that decodes from either a bare JSON integer (512) or
// a JSON string holding a decimal integer ("512"). lsblk -J emits LOG-SEC /
// PHY-SEC as integers on util-linux ≥ 2.37 (Debian 12) while older versions
// emit strings; this type accepts both.
type jsonInt64 int64

// UnmarshalJSON never returns an error. A bare integer (or null, which decodes
// as 0) is stored directly; a quoted string is trimmed and parsed as base-10.
// Anything else — a non-numeric string, a fractional number, another JSON
// type — leaves the receiver untouched.
func (j *jsonInt64) UnmarshalJSON(data []byte) error {
	// bare number: 512
	var num int64
	if json.Unmarshal(data, &num) == nil {
		*j = jsonInt64(num)
		return nil
	}

	// quoted string: "512"
	var quoted string
	if json.Unmarshal(data, &quoted) != nil {
		return nil // unexpected type — keep the current value
	}
	if parsed, err := strconv.ParseInt(strings.TrimSpace(quoted), 10, 64); err == nil {
		*j = jsonInt64(parsed)
	}
	return nil
}
|
||||
|
||||
// lsblkDevice is a minimal lsblk JSON record.
|
||||
type lsblkDevice struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Size string `json:"size"`
|
||||
Serial string `json:"serial"`
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
LogSec string `json:"log-sec"`
|
||||
PhySec string `json:"phy-sec"`
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Size string `json:"size"`
|
||||
Serial string `json:"serial"`
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
LogSec jsonInt64 `json:"log-sec"`
|
||||
PhySec jsonInt64 `json:"phy-sec"`
|
||||
}
|
||||
|
||||
type lsblkRoot struct {
|
||||
@@ -382,20 +424,23 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
|
||||
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
||||
// nvme-cli emits most counters as JSON strings (e.g. "power_on_hours":"49"),
|
||||
// so all numeric fields use jsonInt64 which accepts both bare numbers and
|
||||
// quoted strings. Field names match nvme-cli JSON output, not NVMe spec prose.
|
||||
type nvmeSmartLog struct {
|
||||
CriticalWarning int `json:"critical_warning"`
|
||||
PercentageUsed int `json:"percentage_used"`
|
||||
AvailableSpare int `json:"available_spare"`
|
||||
SpareThreshold int `json:"spare_thresh"`
|
||||
Temperature int64 `json:"temperature"`
|
||||
PowerOnHours int64 `json:"power_on_hours"`
|
||||
PowerCycles int64 `json:"power_cycles"`
|
||||
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
|
||||
DataUnitsRead int64 `json:"data_units_read"`
|
||||
DataUnitsWritten int64 `json:"data_units_written"`
|
||||
ControllerBusy int64 `json:"controller_busy_time"`
|
||||
MediaErrors int64 `json:"media_errors"`
|
||||
NumErrLogEntries int64 `json:"num_err_log_entries"`
|
||||
CriticalWarning jsonInt64 `json:"critical_warning"`
|
||||
PercentageUsed jsonInt64 `json:"percent_used"`
|
||||
AvailableSpare jsonInt64 `json:"avail_spare"`
|
||||
SpareThreshold jsonInt64 `json:"spare_thresh"`
|
||||
Temperature jsonInt64 `json:"temperature"`
|
||||
PowerOnHours jsonInt64 `json:"power_on_hours"`
|
||||
PowerCycles jsonInt64 `json:"power_cycles"`
|
||||
UnsafeShutdowns jsonInt64 `json:"unsafe_shutdowns"`
|
||||
DataUnitsRead jsonInt64 `json:"data_units_read"`
|
||||
DataUnitsWritten jsonInt64 `json:"data_units_written"`
|
||||
ControllerBusy jsonInt64 `json:"controller_busy_time"`
|
||||
MediaErrors jsonInt64 `json:"media_errors"`
|
||||
NumErrLogEntries jsonInt64 `json:"num_err_log_entries"`
|
||||
}
|
||||
|
||||
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
||||
@@ -460,13 +505,16 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
var log nvmeSmartLog
|
||||
if json.Unmarshal(out, &log) == nil {
|
||||
if log.PowerOnHours > 0 {
|
||||
s.PowerOnHours = &log.PowerOnHours
|
||||
v := int64(log.PowerOnHours)
|
||||
s.PowerOnHours = &v
|
||||
}
|
||||
if log.PowerCycles > 0 {
|
||||
s.PowerCycles = &log.PowerCycles
|
||||
v := int64(log.PowerCycles)
|
||||
s.PowerCycles = &v
|
||||
}
|
||||
if log.UnsafeShutdowns > 0 {
|
||||
s.UnsafeShutdowns = &log.UnsafeShutdowns
|
||||
v := int64(log.UnsafeShutdowns)
|
||||
s.UnsafeShutdowns = &v
|
||||
}
|
||||
if log.PercentageUsed > 0 {
|
||||
v := float64(log.PercentageUsed)
|
||||
@@ -475,11 +523,11 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
s.LifeRemainingPct = &remaining
|
||||
}
|
||||
if log.DataUnitsWritten > 0 {
|
||||
v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
|
||||
v := nvmeDataUnitsToBytes(int64(log.DataUnitsWritten))
|
||||
s.WrittenBytes = &v
|
||||
}
|
||||
if log.DataUnitsRead > 0 {
|
||||
v := nvmeDataUnitsToBytes(log.DataUnitsRead)
|
||||
v := nvmeDataUnitsToBytes(int64(log.DataUnitsRead))
|
||||
s.ReadBytes = &v
|
||||
}
|
||||
if log.AvailableSpare > 0 {
|
||||
@@ -487,23 +535,25 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
s.AvailableSparePct = &v
|
||||
}
|
||||
if log.MediaErrors > 0 {
|
||||
s.MediaErrors = &log.MediaErrors
|
||||
v := int64(log.MediaErrors)
|
||||
s.MediaErrors = &v
|
||||
}
|
||||
if log.NumErrLogEntries > 0 {
|
||||
s.ErrorLogEntries = &log.NumErrLogEntries
|
||||
v := int64(log.NumErrLogEntries)
|
||||
s.ErrorLogEntries = &v
|
||||
}
|
||||
if log.Temperature > 0 {
|
||||
v := float64(log.Temperature - 273)
|
||||
s.TemperatureC = &v
|
||||
}
|
||||
setStorageHealthStatus(&s, storageHealthStatus{
|
||||
criticalWarning: log.CriticalWarning,
|
||||
criticalWarning: int(log.CriticalWarning),
|
||||
percentageUsed: int64(log.PercentageUsed),
|
||||
availableSpare: int64(log.AvailableSpare),
|
||||
spareThreshold: int64(log.SpareThreshold),
|
||||
unsafeShutdowns: log.UnsafeShutdowns,
|
||||
mediaErrors: log.MediaErrors,
|
||||
errorLogEntries: log.NumErrLogEntries,
|
||||
unsafeShutdowns: int64(log.UnsafeShutdowns),
|
||||
mediaErrors: int64(log.MediaErrors),
|
||||
errorLogEntries: int64(log.NumErrLogEntries),
|
||||
})
|
||||
return s
|
||||
}
|
||||
@@ -620,8 +670,8 @@ func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
logical := parseStorageBytes(dev.LogSec)
|
||||
physical := parseStorageBytes(dev.PhySec)
|
||||
logical := int64(dev.LogSec)
|
||||
physical := int64(dev.PhySec)
|
||||
if logical <= 0 && physical <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
@@ -38,6 +39,54 @@ func TestParseStorageBytes(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// util-linux ≥ 2.37 emits LOG-SEC / PHY-SEC as bare JSON numbers.
|
||||
// Older versions emit quoted strings. Both must parse without error
|
||||
// so that the entire lsblkDevices() call does not return nil on Debian 12.
|
||||
cases := []struct {
|
||||
json string
|
||||
want int64
|
||||
}{
|
||||
{`512`, 512},
|
||||
{`4096`, 4096},
|
||||
{`"512"`, 512},
|
||||
{`"4096"`, 4096},
|
||||
{`null`, 0},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
var v jsonInt64
|
||||
if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
|
||||
t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
|
||||
}
|
||||
if int64(v) != tc.want {
|
||||
t.Fatalf("UnmarshalJSON(%s)=%d want %d", tc.json, int64(v), tc.want)
|
||||
}
|
||||
}
|
||||
|
||||
// Simulate the exact JSON shape that triggered the bug on Debian 12.
|
||||
input := []byte(`{
|
||||
"blockdevices": [
|
||||
{"name":"sda","type":"disk","size":"3.6T","serial":"S1234","model":"SEAGATE","tran":"sata","hctl":"0:0:0:0","log-sec":512,"phy-sec":4096},
|
||||
{"name":"sdb","type":"disk","size":"3.6T","serial":"S5678","model":"SEAGATE","tran":"sata","hctl":"0:0:1:0","log-sec":512,"phy-sec":4096}
|
||||
]
|
||||
}`)
|
||||
var root lsblkRoot
|
||||
if err := json.Unmarshal(input, &root); err != nil {
|
||||
t.Fatalf("lsblkRoot unmarshal with integer log-sec/phy-sec: %v", err)
|
||||
}
|
||||
if len(root.Blockdevices) != 2 {
|
||||
t.Fatalf("got %d blockdevices want 2", len(root.Blockdevices))
|
||||
}
|
||||
if int64(root.Blockdevices[0].LogSec) != 512 {
|
||||
t.Fatalf("LogSec=%d want 512", root.Blockdevices[0].LogSec)
|
||||
}
|
||||
if int64(root.Blockdevices[0].PhySec) != 4096 {
|
||||
t.Fatalf("PhySec=%d want 4096", root.Blockdevices[0].PhySec)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -1,11 +1,65 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most
|
||||
// counters are quoted strings and field names differ from NVMe spec prose)
|
||||
// is correctly parsed into nvmeSmartLog.
|
||||
func TestNVMeSmartLogUnmarshal(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// Real nvme-cli output: counters are JSON strings, spare is "avail_spare",
|
||||
// percentage used is "percent_used".
|
||||
raw := `{
|
||||
"critical_warning": 0,
|
||||
"temperature": 310,
|
||||
"avail_spare": 100,
|
||||
"spare_thresh": 5,
|
||||
"percent_used": 0,
|
||||
"data_units_read": "10925415",
|
||||
"data_units_written": "8497672",
|
||||
"controller_busy_time": "305",
|
||||
"power_cycles": "53",
|
||||
"power_on_hours": "49",
|
||||
"unsafe_shutdowns": "22",
|
||||
"media_errors": "0",
|
||||
"num_err_log_entries": "0"
|
||||
}`
|
||||
var log nvmeSmartLog
|
||||
if err := json.Unmarshal([]byte(raw), &log); err != nil {
|
||||
t.Fatalf("json.Unmarshal failed: %v", err)
|
||||
}
|
||||
if log.PowerOnHours != 49 {
|
||||
t.Errorf("PowerOnHours=%d want 49", log.PowerOnHours)
|
||||
}
|
||||
if log.PowerCycles != 53 {
|
||||
t.Errorf("PowerCycles=%d want 53", log.PowerCycles)
|
||||
}
|
||||
if log.AvailableSpare != 100 {
|
||||
t.Errorf("AvailableSpare=%d want 100", log.AvailableSpare)
|
||||
}
|
||||
if log.SpareThreshold != 5 {
|
||||
t.Errorf("SpareThreshold=%d want 5", log.SpareThreshold)
|
||||
}
|
||||
if log.PercentageUsed != 0 {
|
||||
t.Errorf("PercentageUsed=%d want 0", log.PercentageUsed)
|
||||
}
|
||||
if log.Temperature != 310 {
|
||||
t.Errorf("Temperature=%d want 310", log.Temperature)
|
||||
}
|
||||
if log.MediaErrors != 0 {
|
||||
t.Errorf("MediaErrors=%d want 0", log.MediaErrors)
|
||||
}
|
||||
if log.UnsafeShutdowns != 22 {
|
||||
t.Errorf("UnsafeShutdowns=%d want 22", log.UnsafeShutdowns)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSetStorageHealthStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
27
audit/internal/collector/testdata/dmidecode_type0_dell.txt
vendored
Normal file
27
audit/internal/collector/testdata/dmidecode_type0_dell.txt
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
# dmidecode 3.2
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x0000, DMI type 0, 26 bytes
|
||||
BIOS Information
|
||||
Vendor: Dell Inc.
|
||||
Version: 2.5.4
|
||||
Release Date: 01/13/2020
|
||||
Address: 0xF0000
|
||||
Runtime Size: 64 kB
|
||||
ROM Size: 32 MB
|
||||
Characteristics:
|
||||
ISA is supported
|
||||
PCI is supported
|
||||
PNP is supported
|
||||
BIOS is upgradeable
|
||||
BIOS shadowing is allowed
|
||||
Boot from CD is supported
|
||||
Selectable boot is supported
|
||||
EDD is supported
|
||||
ACPI is supported
|
||||
USB legacy is supported
|
||||
BIOS boot specification is supported
|
||||
Targeted content distribution is supported
|
||||
UEFI is supported
|
||||
BIOS Revision: 2.5
|
||||
59
audit/internal/collector/testdata/dmidecode_type17_mixed.txt
vendored
Normal file
59
audit/internal/collector/testdata/dmidecode_type17_mixed.txt
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
# dmidecode 3.1
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 2.8 present.
|
||||
|
||||
Handle 0x0026, DMI type 17, 40 bytes
|
||||
Memory Device
|
||||
Array Handle: 0x0025
|
||||
Error Information Handle: Not Provided
|
||||
Total Width: 72 bits
|
||||
Data Width: 64 bits
|
||||
Size: 16 GB
|
||||
Form Factor: DIMM
|
||||
Set: None
|
||||
Locator: P1-DIMMA1
|
||||
Bank Locator: P0_Node0_Channel0_Dimm0
|
||||
Type: DDR4
|
||||
Type Detail: Synchronous
|
||||
Speed: 2133 MT/s
|
||||
Manufacturer: Micron
|
||||
Serial Number: 1A2B3C4D
|
||||
Asset Tag: Not Specified
|
||||
Part Number: 36ASF2G72PZ-2G1A2
|
||||
Rank: 2
|
||||
Configured Memory Speed: 2133 MT/s
|
||||
|
||||
Handle 0x0027, DMI type 17, 40 bytes
|
||||
Memory Device
|
||||
Array Handle: 0x0025
|
||||
Error Information Handle: Not Provided
|
||||
Total Width: Unknown
|
||||
Data Width: Unknown
|
||||
Size: No Module Installed
|
||||
Form Factor: DIMM
|
||||
Set: None
|
||||
Locator: P1-DIMMA2
|
||||
Bank Locator: P0_Node0_Channel0_Dimm1
|
||||
Type: DDR4
|
||||
Type Detail: Synchronous
|
||||
|
||||
Handle 0x0028, DMI type 17, 84 bytes
|
||||
Memory Device
|
||||
Array Handle: 0x0025
|
||||
Error Information Handle: Not Provided
|
||||
Total Width: 72 bits
|
||||
Data Width: 64 bits
|
||||
Size: 32768 MB
|
||||
Form Factor: DIMM
|
||||
Set: 1
|
||||
Locator: A1
|
||||
Bank Locator: Not Specified
|
||||
Type: DDR4
|
||||
Type Detail: Synchronous Registered (Buffered)
|
||||
Speed: 2933 MT/s
|
||||
Manufacturer: Samsung
|
||||
Serial Number: 5E6F7A8B
|
||||
Asset Tag: Not Specified
|
||||
Part Number: M393A4K40CB2-CVF
|
||||
Rank: 2
|
||||
Configured Memory Speed: 2400 MT/s
|
||||
14
audit/internal/collector/testdata/dmidecode_type1_dell.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_dell.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
# dmidecode 3.2
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x0100, DMI type 1, 27 bytes
|
||||
System Information
|
||||
Manufacturer: Dell Inc.
|
||||
Product Name: PowerEdge R740xd
|
||||
Version: Not Specified
|
||||
Serial Number: 7SG9F63
|
||||
UUID: b1c2d3e4-f5a6-7890-bcde-f12345678901
|
||||
Wake-up Type: Power Switch
|
||||
SKU Number: SKU=NotProvided;ModelName=PowerEdge R740xd
|
||||
Family: PowerEdge
|
||||
14
audit/internal/collector/testdata/dmidecode_type1_hpe.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_hpe.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
# dmidecode 3.3
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x008E, DMI type 1, 27 bytes
|
||||
System Information
|
||||
Manufacturer: HPE
|
||||
Product Name: ProLiant DL380 Gen10
|
||||
Version: Not Specified
|
||||
Serial Number: CZJ9320CXN
|
||||
UUID: c2d3e4f5-a6b7-8901-cdef-012345678902
|
||||
Wake-up Type: Power Switch
|
||||
SKU Number: 868703-B21
|
||||
Family: ProLiant
|
||||
14
audit/internal/collector/testdata/dmidecode_type1_supermicro.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_supermicro.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
# dmidecode 3.1
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 2.8 present.
|
||||
|
||||
Handle 0x0001, DMI type 1, 27 bytes
|
||||
System Information
|
||||
Manufacturer: Supermicro
|
||||
Product Name: SYS-6028R-WTR
|
||||
Version: 0123456789
|
||||
Serial Number: S214726X2A36789
|
||||
UUID: d3e4f5a6-b7c8-9012-def0-123456789003
|
||||
Wake-up Type: Power Switch
|
||||
SKU Number: Default string
|
||||
Family: Default string
|
||||
10
audit/internal/collector/testdata/dmidecode_type2_dell.txt
vendored
Normal file
10
audit/internal/collector/testdata/dmidecode_type2_dell.txt
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
# dmidecode 3.2
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x0200, DMI type 2, 8 bytes
|
||||
Base Board Information
|
||||
Manufacturer: Dell Inc.
|
||||
Product Name: 0F9N89
|
||||
Version: A00
|
||||
Serial Number: 7SG9F63
|
||||
19
audit/internal/collector/testdata/dmidecode_type2_hpe.txt
vendored
Normal file
19
audit/internal/collector/testdata/dmidecode_type2_hpe.txt
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
# dmidecode 3.3
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x00A4, DMI type 2, 15 bytes
|
||||
Base Board Information
|
||||
Manufacturer: HPE
|
||||
Product Name: ProLiant DL380 Gen10
|
||||
Version: Not Specified
|
||||
Serial Number: CZJ9320CXN
|
||||
Asset Tag: CZJ9320CXN
|
||||
Features:
|
||||
Board is a hosting board
|
||||
Board is removable
|
||||
Board is replaceable
|
||||
Location In Chassis: Not Specified
|
||||
Chassis Handle: 0x0000
|
||||
Type: Motherboard
|
||||
Contained Object Handles: 0
|
||||
18
audit/internal/collector/testdata/dmidecode_type2_supermicro.txt
vendored
Normal file
18
audit/internal/collector/testdata/dmidecode_type2_supermicro.txt
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
# dmidecode 3.1
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 2.8 present.
|
||||
|
||||
Handle 0x0002, DMI type 2, 15 bytes
|
||||
Base Board Information
|
||||
Manufacturer: Supermicro
|
||||
Product Name: X10DRW-i
|
||||
Version: 1.02
|
||||
Serial Number: S214726X2A36789
|
||||
Asset Tag: Default string
|
||||
Features:
|
||||
Board is a hosting board
|
||||
Board is replaceable
|
||||
Location In Chassis: Default string
|
||||
Chassis Handle: 0x0003
|
||||
Type: Motherboard
|
||||
Contained Object Handles: 0
|
||||
@@ -4008,14 +4008,23 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
idleW = result.ServerPower.IdleW
|
||||
}
|
||||
|
||||
// Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff.
|
||||
// Build header: Run | GPU 0 | GPU 1 | ... | GPU total W | Server itself W | Server wall W | Per GPU wall W | Platform eff.
|
||||
headers := []string{"Run"}
|
||||
for _, idx := range allGPUIndices {
|
||||
headers = append(headers, fmt.Sprintf("GPU %d W", idx))
|
||||
}
|
||||
headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.")
|
||||
headers = append(headers, "GPU total W", "Server itself W", "Server wall W", "Per GPU wall W", "Platform eff.")
|
||||
|
||||
var rampRows [][]string
|
||||
if idleW > 0 {
|
||||
idleRow := []string{"0 (idle)"}
|
||||
for range allGPUIndices {
|
||||
idleRow = append(idleRow, "—")
|
||||
}
|
||||
// No load: GPU total is negligible, all draw is the server's own baseline.
|
||||
idleRow = append(idleRow, "—", fmt.Sprintf("%.0f", idleW), fmt.Sprintf("%.0f", idleW), "—", "—")
|
||||
rampRows = append(rampRows, idleRow)
|
||||
}
|
||||
for _, step := range result.RampSteps {
|
||||
row := []string{fmt.Sprintf("%d", step.StepIndex)}
|
||||
for _, idx := range allGPUIndices {
|
||||
@@ -4036,6 +4045,16 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
row = append(row, gpuPwr)
|
||||
}
|
||||
// GPU total W = sum of observed GPU power (nvidia-smi)
|
||||
gpuTotal := "—"
|
||||
if step.TotalObservedPowerW > 0 {
|
||||
gpuTotal = fmt.Sprintf("%.0f", step.TotalObservedPowerW)
|
||||
}
|
||||
// Server itself W = server wall power minus GPU total (non-GPU baseline draw)
|
||||
serverItself := "—"
|
||||
if step.ServerLoadedW > 0 && step.TotalObservedPowerW > 0 {
|
||||
serverItself = fmt.Sprintf("%.0f", step.ServerLoadedW-step.TotalObservedPowerW)
|
||||
}
|
||||
// Server wall W
|
||||
serverWall := "—"
|
||||
if step.ServerLoadedW > 0 {
|
||||
@@ -4055,7 +4074,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
platEff = fmt.Sprintf("%.2f", eff)
|
||||
}
|
||||
row = append(row, serverWall, perGPUWall, platEff)
|
||||
row = append(row, gpuTotal, serverItself, serverWall, perGPUWall, platEff)
|
||||
rampRows = append(rampRows, row)
|
||||
}
|
||||
b.WriteString(fmtMDTable(headers, rampRows))
|
||||
@@ -4617,6 +4636,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
ramp.AvgFanRPM = singleRun.AvgFanRPM
|
||||
ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
|
||||
}
|
||||
firstSummary := firstCalib.Summary
|
||||
ramp.PerGPUTelemetry = map[int]*BenchmarkTelemetrySummary{firstIdx: &firstSummary}
|
||||
if !firstCalib.Completed {
|
||||
ramp.Status = "FAILED"
|
||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||
|
||||
@@ -38,6 +38,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
},
|
||||
// PCIe AER correctable from the NVIDIA driver — "bus correctable error" in SEL.
|
||||
// Severity is warning (not critical): correctable errors are hardware-recovered.
|
||||
{
|
||||
Name: "nvidia-aer-correctable",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER.*[Cc]orrect`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-aer",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
@@ -54,6 +63,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
||||
},
|
||||
|
||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||
// PCIe AER correctable from the root port — captures the reported device BDF
|
||||
// (second BDF in "pcieport X: AER: Correctable error received: Y").
|
||||
{
|
||||
Name: "pcie-aer-correctable",
|
||||
Re: mustPat(`(?i)pcieport.*AER:.*[Cc]orrect.*:\s*([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-aer",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
|
||||
@@ -258,7 +258,7 @@ func (s *System) GetInterfaceState(iface string) (bool, error) {
|
||||
func interfaceAdminState(iface string) (bool, error) {
|
||||
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||
if err != nil {
|
||||
return false, err
|
||||
return false, fmt.Errorf("ip link show dev %s: %w", iface, err)
|
||||
}
|
||||
return parseInterfaceAdminState(string(raw))
|
||||
}
|
||||
@@ -288,7 +288,7 @@ func interfaceIPv4Addrs(iface string) ([]string, error) {
|
||||
if errors.As(err, &exitErr) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("ip addr show dev %s: %w", iface, err)
|
||||
}
|
||||
var ipv4 []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||
|
||||
@@ -55,7 +55,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
if err == nil {
|
||||
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
||||
hasIPv4 := false
|
||||
missingIPv4 := false
|
||||
for _, iface := range interfaces {
|
||||
outcome := "no_offer"
|
||||
if len(iface.IPv4) > 0 {
|
||||
@@ -63,8 +62,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
hasIPv4 = true
|
||||
} else if strings.EqualFold(iface.State, "DOWN") {
|
||||
outcome = "link_down"
|
||||
} else {
|
||||
missingIPv4 = true
|
||||
}
|
||||
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
||||
Name: iface.Name,
|
||||
@@ -73,17 +70,9 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
Outcome: outcome,
|
||||
})
|
||||
}
|
||||
switch {
|
||||
case hasIPv4 && !missingIPv4:
|
||||
if hasIPv4 {
|
||||
health.NetworkStatus = "OK"
|
||||
case hasIPv4:
|
||||
health.NetworkStatus = "PARTIAL"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "dhcp_partial",
|
||||
Severity: "warning",
|
||||
Description: "At least one interface did not obtain IPv4 connectivity.",
|
||||
})
|
||||
default:
|
||||
} else {
|
||||
health.NetworkStatus = "FAILED"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "dhcp_failed",
|
||||
|
||||
@@ -182,9 +182,16 @@ func (s *System) DetectGPUVendor() string {
|
||||
return "amd"
|
||||
}
|
||||
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||||
text := strings.ToLower(string(raw))
|
||||
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
||||
return "amd"
|
||||
// Only match AMD GPU device classes [0300]=VGA, [0302]=3D controller, [0380]=Display.
|
||||
// AMD CPUs also appear in lspci as "Advanced Micro Devices" (Root Complex, IOMMU, etc.)
|
||||
// so matching vendor alone causes false positives on AMD CPU servers without GPUs.
|
||||
for _, line := range strings.Split(strings.ToLower(string(raw)), "\n") {
|
||||
if !strings.Contains(line, "advanced micro devices") && !strings.Contains(line, "amd/ati") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(line, "[0300]") || strings.Contains(line, "[0302]") || strings.Contains(line, "[0380]") {
|
||||
return "amd"
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
@@ -723,12 +730,14 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
|
||||
}
|
||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||
commands := storageSATCommands(devPath, extended)
|
||||
deviceOutputs := make(map[string][]byte, len(commands))
|
||||
for cmdIndex, job := range commands {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
|
||||
deviceOutputs[job.name] = out
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
@@ -737,7 +746,28 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
|
||||
key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||
|
||||
// smartctl -t short only launches the self-test on the drive firmware and
|
||||
// returns immediately ("Testing has begun"); unlike `nvme device-self-test
|
||||
// --wait`, smartctl has no blocking mode, so we must poll the drive
|
||||
// ourselves until the self-test actually finishes.
|
||||
if job.name == "smartctl-self-test-short" && err == nil {
|
||||
statusName := "smartctl-self-test-status"
|
||||
statusOut := waitForSmartctlSelfTest(ctx, verboseLog, devPath, logFunc)
|
||||
deviceOutputs[statusName] = statusOut
|
||||
statusFile := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+2, statusName)
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, statusFile), statusOut, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
sStatus, sRC := classifySATResult(statusName, statusOut, nil)
|
||||
stats.Add(sStatus)
|
||||
sKey := filepath.Base(devPath) + "_" + strings.ReplaceAll(statusName, "-", "_")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", sKey, sRC)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", sKey, sStatus)
|
||||
}
|
||||
}
|
||||
reportText := GenerateDiskReportText(index+1, devPath, deviceOutputs, time.Now().UTC())
|
||||
_ = os.WriteFile(filepath.Join(runDir, "disk-"+prefix+"-report.txt"), []byte(reportText), 0644)
|
||||
}
|
||||
|
||||
writeSATStats(&summary, stats)
|
||||
@@ -1170,6 +1200,42 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
return out, err
|
||||
}
|
||||
|
||||
// smartctlSelfTestPollInterval and smartctlSelfTestTimeout control how often
|
||||
// and for how long we poll the drive after launching `smartctl -t short`; SMART/ATA specs put a short self-test at ~2 minutes.
|
||||
const (
|
||||
smartctlSelfTestPollInterval = 5 * time.Second
|
||||
smartctlSelfTestTimeout = 4 * time.Minute
|
||||
)
|
||||
|
||||
// waitForSmartctlSelfTest polls `smartctl -a` until the short self-test
|
||||
// started on devPath finishes (or the timeout/context elapses) and returns
|
||||
// the final output, which reflects the actual test result rather than the
|
||||
// "Testing has begun" launch acknowledgement.
|
||||
func waitForSmartctlSelfTest(ctx context.Context, verboseLog, devPath string, logFunc func(string)) []byte {
|
||||
deadline := time.Now().Add(smartctlSelfTestTimeout)
|
||||
var last []byte
|
||||
for {
|
||||
out, _ := runSATCommandCtx(ctx, verboseLog, "smartctl-self-test-status", []string{"smartctl", "-a", devPath}, nil, nil)
|
||||
last = out
|
||||
if ctx.Err() != nil {
|
||||
return last
|
||||
}
|
||||
lower := bytes.ToLower(out)
|
||||
if !bytes.Contains(lower, []byte("self-test routine in progress")) &&
|
||||
!bytes.Contains(lower, []byte("% of test remaining")) {
|
||||
return last
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
return last
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return last
|
||||
case <-time.After(smartctlSelfTestPollInterval):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func listStorageDevices() ([]string, error) {
|
||||
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
@@ -1178,26 +1244,27 @@ func listStorageDevices() ([]string, error) {
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
// storageSATCommands returns the commands to run for a single storage device.
|
||||
// extended=false (Check): read-only SMART/NVMe data collection, no self-test.
|
||||
// extended=true (Load): data collection + short self-test.
|
||||
func storageSATCommands(devPath string, extended bool) []satJob {
|
||||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||
selfTestLevel := "1"
|
||||
if extended {
|
||||
selfTestLevel = "2"
|
||||
}
|
||||
return []satJob{
|
||||
jobs := []satJob{
|
||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||||
}
|
||||
if extended {
|
||||
jobs = append(jobs, satJob{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}})
|
||||
}
|
||||
return jobs
|
||||
}
|
||||
smartTestType := "short"
|
||||
if extended {
|
||||
smartTestType = "long"
|
||||
}
|
||||
return []satJob{
|
||||
jobs := []satJob{
|
||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||||
}
|
||||
if extended {
|
||||
jobs = append(jobs, satJob{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}})
|
||||
}
|
||||
return jobs
|
||||
}
|
||||
|
||||
func (s *satStats) Add(status string) {
|
||||
|
||||
@@ -14,14 +14,42 @@ import (
|
||||
func TestStorageSATCommands(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||
// Check mode (extended=false): read-only collection, no self-test.
|
||||
nvmeCheck := storageSATCommands("/dev/nvme0n1", false)
|
||||
if len(nvmeCheck) != 2 {
|
||||
t.Fatalf("check nvme: want 2 commands, got %d: %#v", len(nvmeCheck), nvmeCheck)
|
||||
}
|
||||
if nvmeCheck[0].name != "nvme-id-ctrl" || nvmeCheck[1].name != "nvme-smart-log" {
|
||||
t.Fatalf("check nvme: unexpected command names: %#v", nvmeCheck)
|
||||
}
|
||||
|
||||
sata := storageSATCommands("/dev/sda", false)
|
||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||
sataCheck := storageSATCommands("/dev/sda", false)
|
||||
if len(sataCheck) != 1 || sataCheck[0].cmd[0] != "smartctl" {
|
||||
t.Fatalf("check sata: want 1 smartctl command, got %#v", sataCheck)
|
||||
}
|
||||
|
||||
// Load mode (extended=true): collection + short self-test.
|
||||
nvmeLoad := storageSATCommands("/dev/nvme0n1", true)
|
||||
if len(nvmeLoad) != 3 || nvmeLoad[2].name != "nvme-device-self-test" {
|
||||
t.Fatalf("load nvme: want 3 commands with self-test last, got %#v", nvmeLoad)
|
||||
}
|
||||
if got := nvmeLoad[2].cmd[len(nvmeLoad[2].cmd)-3]; got != "-s" {
|
||||
t.Fatalf("load nvme: want -s flag, got %q", got)
|
||||
}
|
||||
if got := nvmeLoad[2].cmd[len(nvmeLoad[2].cmd)-2]; got != "1" {
|
||||
t.Fatalf("load nvme: want self-test level 1, got %q", got)
|
||||
}
|
||||
|
||||
sataLoad := storageSATCommands("/dev/sda", true)
|
||||
if len(sataLoad) != 2 || sataLoad[1].name != "smartctl-self-test-short" {
|
||||
t.Fatalf("load sata: want 2 commands with short self-test last, got %#v", sataLoad)
|
||||
}
|
||||
// cmd is: smartctl -t short /dev/sda
|
||||
if got := sataLoad[1].cmd[1]; got != "-t" {
|
||||
t.Fatalf("load sata: want -t flag at index 1, got %q", got)
|
||||
}
|
||||
if got := sataLoad[1].cmd[2]; got != "short" {
|
||||
t.Fatalf("load sata: want short at index 2, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
517
audit/internal/platform/storage_report.go
Normal file
517
audit/internal/platform/storage_report.go
Normal file
@@ -0,0 +1,517 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// GenerateDiskReportText builds a human-readable text report for one storage
|
||||
// device from the raw command outputs collected during storage SAT.
|
||||
//
|
||||
// outputs keys match satJob.name: "nvme-id-ctrl", "nvme-smart-log",
|
||||
// "smartctl-health", "smartctl-self-test-short".
|
||||
func GenerateDiskReportText(index int, devPath string, outputs map[string][]byte, ts time.Time) string {
|
||||
var b strings.Builder
|
||||
devName := filepath.Base(devPath)
|
||||
line := strings.Repeat("=", 80)
|
||||
b.WriteString(line + "\n")
|
||||
fmt.Fprintf(&b, "Disk %-3d %s\n", index, devPath)
|
||||
b.WriteString(line + "\n")
|
||||
|
||||
isNVMe := strings.Contains(devName, "nvme")
|
||||
if isNVMe {
|
||||
writeNVMeReport(&b, outputs)
|
||||
} else {
|
||||
writeSATAReport(&b, outputs)
|
||||
}
|
||||
|
||||
b.WriteString("\n")
|
||||
fmt.Fprintf(&b, "Collected : %s\n", ts.UTC().Format("2006-01-02 15:04:05 UTC"))
|
||||
b.WriteString(line + "\n")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// ── NVMe ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// nvmeIdCtrl is the subset of `nvme id-ctrl -o json` output used by the disk
// report. Fields are matched by nvme-cli's short JSON keys.
type nvmeIdCtrl struct {
	// Identity strings; callers trim surrounding whitespace before printing.
	ModelNumber  string `json:"mn"`
	SerialNumber string `json:"sn"`
	Firmware     string `json:"fr"`

	// Capacity values. The report prefers TotalCap and falls back to NVMCap
	// when TotalCap is zero.
	TotalCap uint64 `json:"tnvmcap"`
	NVMCap   uint64 `json:"nvmcap"`
}
|
||||
|
||||
// nvmeU64 decodes one counter from `nvme ... -o json` output into a uint64.
//
// nvme-cli is not consistent about how it encodes counters across versions,
// so three shapes are accepted:
//   - a bare JSON number:        49
//   - a quoted decimal string:   "49"
//   - a {"lo":n,"hi":n} object emitted for 128-bit counters; only the low
//     64 bits are returned and "hi" is discarded.
//
// Missing, null, or otherwise unparseable input yields 0.
func nvmeU64(raw json.RawMessage) uint64 {
	if len(raw) == 0 {
		return 0
	}

	// Plain number (also covers JSON null, which leaves n at 0).
	var n uint64
	if json.Unmarshal(raw, &n) == nil {
		return n
	}

	// Quoted counter such as "10925415". Without this branch every quoted
	// value would silently be reported as 0.
	var s string
	if json.Unmarshal(raw, &s) == nil {
		v, err := strconv.ParseUint(strings.TrimSpace(s), 10, 64)
		if err != nil {
			return 0
		}
		return v
	}

	// 128-bit counter split into two 64-bit halves.
	var obj struct {
		Lo uint64 `json:"lo"`
		Hi uint64 `json:"hi"`
	}
	if json.Unmarshal(raw, &obj) == nil {
		return obj.Lo
	}
	return 0
}
|
||||
|
||||
// nvmeSmartLogRaw is the subset of `nvme smart-log -o json` output used by the
// disk report.
//
// Fields typed uint64 are decoded directly by encoding/json. Fields typed
// json.RawMessage are kept undecoded so the caller can interpret them with
// nvmeU64.
type nvmeSmartLogRaw struct {
	// Decoded eagerly as plain numbers.
	CriticalWarning uint64 `json:"critical_warning"`
	AvailSpare      uint64 `json:"avail_spare"`
	SpareThresh     uint64 `json:"spare_thresh"`
	PercentUsed     uint64 `json:"percent_used"`

	// Decoding deferred to nvmeU64.
	Temperature      json.RawMessage `json:"temperature"`
	DataUnitsRead    json.RawMessage `json:"data_units_read"`
	DataUnitsWritten json.RawMessage `json:"data_units_written"`
	PowerCycles      json.RawMessage `json:"power_cycles"`
	PowerOnHours     json.RawMessage `json:"power_on_hours"`
	UnsafeShutdowns  json.RawMessage `json:"unsafe_shutdowns"`
	MediaErrors      json.RawMessage `json:"media_errors"`
	NumErrLogEntries json.RawMessage `json:"num_err_log_entries"`
}
|
||||
|
||||
func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
|
||||
// id-ctrl
|
||||
var ctrl nvmeIdCtrl
|
||||
if data := outputs["nvme-id-ctrl"]; len(data) > 0 {
|
||||
_ = json.Unmarshal(data, &ctrl)
|
||||
}
|
||||
|
||||
model := strings.TrimSpace(ctrl.ModelNumber)
|
||||
serial := strings.TrimSpace(ctrl.SerialNumber)
|
||||
firmware := strings.TrimSpace(ctrl.Firmware)
|
||||
|
||||
capacityGB := ""
|
||||
if ctrl.TotalCap > 0 {
|
||||
capacityGB = formatCapacityGB(ctrl.TotalCap)
|
||||
} else if ctrl.NVMCap > 0 {
|
||||
capacityGB = formatCapacityGB(ctrl.NVMCap)
|
||||
}
|
||||
|
||||
writeField(b, "Model", model)
|
||||
writeField(b, "Serial", serial)
|
||||
writeField(b, "Firmware", firmware)
|
||||
if capacityGB != "" {
|
||||
writeField(b, "Capacity", capacityGB)
|
||||
}
|
||||
|
||||
// smart-log
|
||||
data := outputs["nvme-smart-log"]
|
||||
if len(data) == 0 {
|
||||
b.WriteString("\n(no SMART data)\n")
|
||||
return
|
||||
}
|
||||
var sl nvmeSmartLogRaw
|
||||
if err := json.Unmarshal(data, &sl); err != nil {
|
||||
fmt.Fprintf(b, "\n(SMART parse error: %v)\n", err)
|
||||
return
|
||||
}
|
||||
|
||||
tempK := nvmeU64(sl.Temperature)
|
||||
tempC := int(tempK) - 273
|
||||
if tempC < 0 {
|
||||
tempC = 0
|
||||
}
|
||||
|
||||
critWarn := sl.CriticalWarning
|
||||
critWarnStr := "OK"
|
||||
if critWarn != 0 {
|
||||
critWarnStr = fmt.Sprintf("0x%02X", critWarn)
|
||||
}
|
||||
|
||||
poh := nvmeU64(sl.PowerOnHours)
|
||||
pc := nvmeU64(sl.PowerCycles)
|
||||
us := nvmeU64(sl.UnsafeShutdowns)
|
||||
me := nvmeU64(sl.MediaErrors)
|
||||
nel := nvmeU64(sl.NumErrLogEntries)
|
||||
|
||||
// data_units are in 1000 × 512-byte sectors = 512,000 bytes each
|
||||
dataRead := float64(nvmeU64(sl.DataUnitsRead)) * 512000 / 1e9
|
||||
dataWritten := float64(nvmeU64(sl.DataUnitsWritten)) * 512000 / 1e9
|
||||
|
||||
writeSectionHeader(b, "Health")
|
||||
writeField(b, "Temperature", fmt.Sprintf("%d °C", tempC))
|
||||
writeField(b, "Critical Warning", critWarnStr)
|
||||
writeField(b, "Percentage Used", fmt.Sprintf("%d %%", sl.PercentUsed))
|
||||
writeField(b, "Available Spare", fmt.Sprintf("%d %% (threshold: %d %%)", sl.AvailSpare, sl.SpareThresh))
|
||||
|
||||
writeSectionHeader(b, "Usage")
|
||||
writeField(b, "Power On Hours", fmt.Sprintf("%s h", formatUint(poh)))
|
||||
writeField(b, "Power Cycles", formatUint(pc))
|
||||
writeField(b, "Unsafe Shutdowns", formatUint(us))
|
||||
writeField(b, "Data Written", fmt.Sprintf("%.1f GB", dataWritten))
|
||||
writeField(b, "Data Read", fmt.Sprintf("%.1f GB", dataRead))
|
||||
|
||||
writeSectionHeader(b, "Errors")
|
||||
writeField(b, "Media Errors", formatUint(me))
|
||||
writeField(b, "Error Log Entries", formatUint(nel))
|
||||
|
||||
capacityBytes := ctrl.TotalCap
|
||||
if capacityBytes == 0 {
|
||||
capacityBytes = ctrl.NVMCap
|
||||
}
|
||||
writeResourceSection(b, resourceInfo{
|
||||
powerOnHours: poh,
|
||||
writtenBytes: uint64(nvmeU64(sl.DataUnitsWritten)) * 512000,
|
||||
readBytes: uint64(nvmeU64(sl.DataUnitsRead)) * 512000,
|
||||
capacityBytes: capacityBytes,
|
||||
})
|
||||
|
||||
if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 {
|
||||
writeSectionHeader(b, "Self-Test")
|
||||
result := parseSelfTestResult(string(selfTest))
|
||||
writeField(b, "Result", result)
|
||||
}
|
||||
}
|
||||
|
||||
// ── SATA / SAS (smartctl) ────────────────────────────────────────────────────
|
||||
|
||||
var (
|
||||
smartHealthRE = regexp.MustCompile(`(?i)SMART overall-health self-assessment test result:\s*(\S+)`)
|
||||
smartAttrLineRE = regexp.MustCompile(
|
||||
`^\s*(\d{1,3})\s+(\S+)\s+0x[0-9a-fA-F]+\s+(\d{1,3})\s+(\d{1,3})\s+(\d{1,3})\s+\S+\s+\S+\s+\S+\s+(.+?)\s*$`,
|
||||
)
|
||||
smartModelRE = regexp.MustCompile(`(?im)^Device Model:\s*(.+)$`)
|
||||
smartSerialRE = regexp.MustCompile(`(?im)^Serial Number:\s*(.+)$`)
|
||||
smartFirmwareRE = regexp.MustCompile(`(?im)^Firmware Version:\s*(.+)$`)
|
||||
smartCapacityRE = regexp.MustCompile(`(?im)^User Capacity:\s*(.+)$`)
|
||||
)
|
||||
|
||||
type smartAttr struct {
|
||||
ID int
|
||||
Name string
|
||||
Value int
|
||||
Worst int
|
||||
Threshold int
|
||||
Raw string
|
||||
}
|
||||
|
||||
func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
|
||||
data := outputs["smartctl-health"]
|
||||
if len(data) == 0 {
|
||||
b.WriteString("\n(no SMART data)\n")
|
||||
return
|
||||
}
|
||||
text := string(data)
|
||||
|
||||
// Identity
|
||||
if m := smartModelRE.FindStringSubmatch(text); m != nil {
|
||||
writeField(b, "Model", strings.TrimSpace(m[1]))
|
||||
}
|
||||
if m := smartSerialRE.FindStringSubmatch(text); m != nil {
|
||||
writeField(b, "Serial", strings.TrimSpace(m[1]))
|
||||
}
|
||||
if m := smartFirmwareRE.FindStringSubmatch(text); m != nil {
|
||||
writeField(b, "Firmware", strings.TrimSpace(m[1]))
|
||||
}
|
||||
var capacityBytes uint64
|
||||
if m := smartCapacityRE.FindStringSubmatch(text); m != nil {
|
||||
cap := strings.TrimSpace(m[1])
|
||||
capacityBytes = parseLeadingUint(cap)
|
||||
// trim everything after "[" if present (e.g. "500,107,862,016 bytes [500 GB]")
|
||||
if idx := strings.Index(cap, "["); idx > 0 {
|
||||
cap = strings.TrimSpace(cap[idx+1:])
|
||||
cap = strings.TrimSuffix(cap, "]")
|
||||
}
|
||||
writeField(b, "Capacity", cap)
|
||||
}
|
||||
|
||||
writeSectionHeader(b, "Health")
|
||||
health := "unknown"
|
||||
if m := smartHealthRE.FindStringSubmatch(text); m != nil {
|
||||
health = strings.TrimSpace(m[1])
|
||||
}
|
||||
writeField(b, "SMART Overall Health", health)
|
||||
|
||||
attrs := parseSMARTAttrs(text)
|
||||
if len(attrs) > 0 {
|
||||
writeSectionHeader(b, "SMART Attributes")
|
||||
fmt.Fprintf(b, " %-4s %-32s %5s %5s %5s %s\n", "ID", "Attribute", "Value", "Worst", "Thresh", "Raw")
|
||||
b.WriteString(" " + strings.Repeat("-", 72) + "\n")
|
||||
for _, a := range attrs {
|
||||
fmt.Fprintf(b, " %-4d %-32s %5d %5d %5d %s\n",
|
||||
a.ID, a.Name, a.Value, a.Worst, a.Threshold, a.Raw)
|
||||
}
|
||||
}
|
||||
|
||||
var poh, writtenLBAs, readLBAs uint64
|
||||
var readValue int
|
||||
hasReadValue := false
|
||||
for _, a := range attrs {
|
||||
switch a.ID {
|
||||
case 9: // Power_On_Hours
|
||||
poh = parseLeadingUint(a.Raw)
|
||||
case 241: // Total_LBAs_Written
|
||||
writtenLBAs = parseLeadingUint(a.Raw)
|
||||
case 242: // Total_LBAs_Read
|
||||
readLBAs = parseLeadingUint(a.Raw)
|
||||
readValue = a.Value
|
||||
hasReadValue = true
|
||||
}
|
||||
}
|
||||
const sataSectorBytes = 512
|
||||
writeResourceSection(b, resourceInfo{
|
||||
powerOnHours: poh,
|
||||
writtenBytes: writtenLBAs * sataSectorBytes,
|
||||
readBytes: readLBAs * sataSectorBytes,
|
||||
capacityBytes: capacityBytes,
|
||||
readPercent: 100 - readValue,
|
||||
hasReadPercent: hasReadValue,
|
||||
})
|
||||
|
||||
selfTest := outputs["smartctl-self-test-status"]
|
||||
if len(selfTest) == 0 {
|
||||
selfTest = outputs["smartctl-self-test-short"]
|
||||
}
|
||||
if len(selfTest) > 0 {
|
||||
writeSectionHeader(b, "Self-Test")
|
||||
result := parseSelfTestResult(string(selfTest))
|
||||
writeField(b, "Result", result)
|
||||
}
|
||||
}
|
||||
|
||||
func parseSMARTAttrs(text string) []smartAttr {
|
||||
var attrs []smartAttr
|
||||
inTable := false
|
||||
for _, line := range strings.Split(text, "\n") {
|
||||
if strings.Contains(line, "ATTRIBUTE_NAME") {
|
||||
inTable = true
|
||||
continue
|
||||
}
|
||||
if !inTable {
|
||||
continue
|
||||
}
|
||||
m := smartAttrLineRE.FindStringSubmatch(line)
|
||||
if m == nil {
|
||||
if strings.TrimSpace(line) == "" {
|
||||
inTable = false
|
||||
}
|
||||
continue
|
||||
}
|
||||
id, _ := strconv.Atoi(m[1])
|
||||
val, _ := strconv.Atoi(m[3])
|
||||
worst, _ := strconv.Atoi(m[4])
|
||||
thresh, _ := strconv.Atoi(m[5])
|
||||
attrs = append(attrs, smartAttr{
|
||||
ID: id,
|
||||
Name: m[2],
|
||||
Value: val,
|
||||
Worst: worst,
|
||||
Threshold: thresh,
|
||||
Raw: strings.TrimSpace(m[6]),
|
||||
})
|
||||
}
|
||||
return attrs
|
||||
}
|
||||
|
||||
// parseSelfTestResult extracts a one-line summary from nvme device-self-test,
// smartctl -a (post-completion status), or smartctl -t short (launch ack) output.
//
// The first of these that matches wins:
//
//  1. a "Self-test execution status" line, joined with its wrapped description;
//  2. the first line containing "self-test status" or "self test status";
//  3. the first line containing "testing has begun", "started" or "complete";
//  4. the last non-empty line.
//
// All matching is case-insensitive. Empty or whitespace-only input yields
// "no output".
func parseSelfTestResult(text string) string {
	text = strings.TrimSpace(text)
	if text == "" {
		return "no output"
	}
	lines := strings.Split(text, "\n")

	// smartctl -a: "Self-test execution status: ( 0)\n\tThe previous
	// self-test routine completed\n\twithout error ..." — the description
	// wraps onto following indented, colon-free continuation lines.
	for i, line := range lines {
		if !strings.Contains(strings.ToLower(line), "self-test execution status") {
			continue
		}
		parts := []string{strings.TrimSpace(line)}
		for j := i + 1; j < len(lines) && j < i+4; j++ {
			next := lines[j]
			// Only indented lines continue the description. An unindented
			// line (e.g. "Total time to complete Offline") starts the next
			// field and must not be folded into the summary.
			if !strings.HasPrefix(next, " ") && !strings.HasPrefix(next, "\t") {
				break
			}
			cont := strings.TrimSpace(next)
			if cont == "" || strings.Contains(cont, ":") {
				break
			}
			parts = append(parts, cont)
		}
		return strings.Join(parts, " ")
	}

	// nvme device-self-test: look for "Short Device Self-Test Status : 0x0" or similar
	for _, line := range lines {
		l := strings.ToLower(line)
		if strings.Contains(l, "self-test status") || strings.Contains(l, "self test status") {
			return strings.TrimSpace(line)
		}
	}

	// smartctl -t short: "Testing has begun" or "Short BGST started"
	for _, line := range lines {
		l := strings.ToLower(line)
		if strings.Contains(l, "testing has begun") || strings.Contains(l, "started") || strings.Contains(l, "complete") {
			return strings.TrimSpace(line)
		}
	}

	// fallback: last non-empty line
	for i := len(lines) - 1; i >= 0; i-- {
		if s := strings.TrimSpace(lines[i]); s != "" {
			return s
		}
	}

	// Not reachable for non-empty text; kept as a defensive default.
	return "done"
}
|
||||
|
||||
// ── Resource (pseudographic usage bars) ────────────────────────────────────────
|
||||
|
||||
// designLifeYears/dwpd model the drive's rated endurance: 1 drive-write-per-day
|
||||
// for 5 years, the baseline enterprise endurance spec used when the vendor's
|
||||
// own TBW/DWPD rating isn't available from SMART/NVMe data.
|
||||
const (
|
||||
designLifeYears = 5
|
||||
dwpd = 1.0
|
||||
)
|
||||
|
||||
type resourceInfo struct {
|
||||
powerOnHours uint64
|
||||
writtenBytes uint64
|
||||
readBytes uint64
|
||||
capacityBytes uint64
|
||||
readPercent int // only meaningful when hasReadPercent
|
||||
hasReadPercent bool // true when the source SMART attribute exposes a normalized read-wear value
|
||||
}
|
||||
|
||||
func writeResourceSection(b *strings.Builder, r resourceInfo) {
|
||||
writeSectionHeader(b, "Resource")
|
||||
|
||||
const maxLifeHours = designLifeYears * 365 * 24
|
||||
upFrac := float64(r.powerOnHours) / float64(maxLifeHours)
|
||||
fmt.Fprintf(b, " %-9s %s %s / %s (%s)\n",
|
||||
"Uptime", progressBar(upFrac, 24), formatHoursHuman(r.powerOnHours), formatHoursHuman(maxLifeHours), formatPercent(upFrac*100))
|
||||
|
||||
if r.capacityBytes > 0 {
|
||||
maxWritten := float64(r.capacityBytes) * dwpd * designLifeYears * 365
|
||||
wFrac := float64(r.writtenBytes) / maxWritten
|
||||
fmt.Fprintf(b, " %-9s %s %s / %s (%s, %g DWPD×%dy)\n",
|
||||
"Written", progressBar(wFrac, 24), formatBytesHuman(float64(r.writtenBytes)), formatBytesHuman(maxWritten), formatPercent(wFrac*100), dwpd, designLifeYears)
|
||||
} else {
|
||||
fmt.Fprintf(b, " %-9s %s\n", "Written", formatBytesHuman(float64(r.writtenBytes)))
|
||||
}
|
||||
|
||||
if r.hasReadPercent {
|
||||
fmt.Fprintf(b, " %-9s %s %s (%d%%)\n",
|
||||
"Read", progressBar(float64(r.readPercent)/100, 24), formatBytesHuman(float64(r.readBytes)), r.readPercent)
|
||||
} else {
|
||||
fmt.Fprintf(b, " %-9s %s\n", "Read", formatBytesHuman(float64(r.readBytes)))
|
||||
}
|
||||
}
|
||||
|
||||
// progressBar renders a fixed-width pseudographic bar, e.g. "[######------]".
//
// frac is the filled fraction: NaN and negative values are treated as 0 and
// values above 1 as 1. width is the number of cells between the brackets; a
// negative width is treated as 0 so that strings.Repeat is never called with a
// negative count, which would panic.
func progressBar(frac float64, width int) string {
	if width < 0 {
		width = 0
	}
	if math.IsNaN(frac) || frac < 0 {
		frac = 0
	}
	if frac > 1 {
		frac = 1
	}
	// frac is within [0, 1] here, so filled is within [0, width].
	filled := int(math.Round(frac * float64(width)))
	return "[" + strings.Repeat("#", filled) + strings.Repeat("-", width-filled) + "]"
}
|
||||
|
||||
// formatBytesHuman renders a decimal (SI) human-readable byte size, e.g. "1.23 TB".
// Values below 1000 are printed as whole bytes; larger values use two decimals
// and the largest unit available is PB.
func formatBytesHuman(n float64) string {
	suffixes := [...]string{"B", "KB", "MB", "GB", "TB", "PB"}

	// Scale down by 1000 until the value fits or the units run out.
	unit := 0
	for unit < len(suffixes)-1 && n >= 1000 {
		n /= 1000
		unit++
	}

	switch unit {
	case 0:
		return fmt.Sprintf("%.0f %s", n, suffixes[unit])
	default:
		return fmt.Sprintf("%.2f %s", n, suffixes[unit])
	}
}
|
||||
|
||||
// formatHoursHuman renders an hour count as a human-scaled duration so that
// uptimes do not show as raw four- or five-digit hour counts: below 48 hours
// as "N h", below 365 days as "N d", otherwise as years, with one decimal
// unless the value is a whole number of years.
func formatHoursHuman(hours uint64) string {
	const (
		hoursPerDay = 24
		daysPerYear = 365
	)

	if hours < 2*hoursPerDay {
		return fmt.Sprintf("%d h", hours)
	}

	days := float64(hours) / hoursPerDay
	if days < daysPerYear {
		return fmt.Sprintf("%.0f d", days)
	}

	years := days / daysPerYear
	decimals := 1
	if _, frac := math.Modf(years); frac == 0 {
		decimals = 0 // whole years read better without ".0"
	}
	return fmt.Sprintf("%.*f y", decimals, years)
}
|
||||
|
||||
// formatPercent renders a percentage as a whole number, except strictly
// between 0 and 1, where two decimals are kept (e.g. "0.03%") so that small
// but non-zero usage is not hidden behind a rounded "0%".
func formatPercent(pct float64) string {
	switch {
	case pct > 0 && pct < 1:
		return fmt.Sprintf("%.2f%%", pct)
	default:
		return fmt.Sprintf("%.0f%%", pct)
	}
}
|
||||
|
||||
// parseLeadingUint parses the leading run of digits/commas in s (e.g. from a
// SMART raw value or "500,107,862,016 bytes") into a uint64, ignoring the rest.
// Surrounding whitespace is trimmed first; input that does not start with a
// digit yields 0.
func parseLeadingUint(s string) uint64 {
	trimmed := strings.TrimSpace(s)

	// Collect digits up to the first byte that is neither a digit nor a
	// comma; commas are thousands separators and are dropped.
	var digits strings.Builder
	for i := 0; i < len(trimmed); i++ {
		c := trimmed[i]
		if c == ',' {
			continue
		}
		if c < '0' || c > '9' {
			break
		}
		digits.WriteByte(c)
	}

	// The error is ignored on purpose: ParseUint's returned value is used
	// as-is (0 when there are no digits).
	value, _ := strconv.ParseUint(digits.String(), 10, 64)
	return value
}
|
||||
|
||||
// ── Formatting helpers ────────────────────────────────────────────────────────
|
||||
|
||||
// writeSectionHeader appends a blank line followed by a rule of the form
// "-- title ----…" to b. The rule is padded with dashes to 76 bytes; a longer
// title is written unpadded.
func writeSectionHeader(b *strings.Builder, title string) {
	const ruleWidth = 76

	lead := "-- " + title + " "
	b.WriteString("\n")
	b.WriteString(lead)
	if fill := ruleWidth - len(lead); fill > 0 {
		b.WriteString(strings.Repeat("-", fill))
	}
	b.WriteString("\n")
}
|
||||
|
||||
func writeField(b *strings.Builder, label, value string) {
|
||||
fmt.Fprintf(b, " %-20s : %s\n", label, value)
|
||||
}
|
||||
|
||||
// formatCapacityGB renders a capacity in bytes using decimal (SI) units: whole
// gigabytes below 1000 GB ("960 GB"), otherwise terabytes with two decimals
// ("1.92 TB").
func formatCapacityGB(bytes uint64) string {
	gb := float64(bytes) / 1e9
	if gb >= 1000 {
		// %.2f rather than %.2g: %g's precision counts significant digits,
		// which turned 1.92 TB into "1.9 TB" and anything from 100 TB up into
		// exponent notation such as "1.2e+02 TB".
		return fmt.Sprintf("%.2f TB", gb/1000)
	}
	return fmt.Sprintf("%.0f GB", math.Round(gb))
}
|
||||
|
||||
// formatUint renders n in decimal with a comma between each group of three
// digits, e.g. 1234567 becomes "1,234,567".
func formatUint(n uint64) string {
	digits := strconv.FormatUint(n, 10)
	if len(digits) <= 3 {
		return digits
	}

	// The leading group holds the 1-3 digits left over after splitting the
	// rest into full groups of three.
	head := len(digits) % 3
	if head == 0 {
		head = 3
	}

	out := make([]byte, 0, len(digits)+len(digits)/3)
	out = append(out, digits[:head]...)
	for i := head; i < len(digits); i += 3 {
		out = append(out, ',')
		out = append(out, digits[i:i+3]...)
	}
	return string(out)
}
|
||||
|
||||
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|
||||
122
audit/internal/platform/storage_report_test.go
Normal file
122
audit/internal/platform/storage_report_test.go
Normal file
@@ -0,0 +1,122 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
var testNVMeIdCtrl = []byte(`{
|
||||
"mn": "SAMSUNG MZ1L2960HCJR-00A07 ",
|
||||
"sn": "S665NN0X415495",
|
||||
"fr": "GDC7602Q",
|
||||
"tnvmcap": 960197124096
|
||||
}`)
|
||||
|
||||
var testNVMeSmartLog = []byte(`{
|
||||
"critical_warning": 0,
|
||||
"temperature": 311,
|
||||
"avail_spare": 100,
|
||||
"spare_thresh": 10,
|
||||
"percent_used": 0,
|
||||
"data_units_read": 1023456,
|
||||
"data_units_written": 738281,
|
||||
"power_cycles": 32,
|
||||
"power_on_hours": 1234,
|
||||
"unsafe_shutdowns": 3,
|
||||
"media_errors": 0,
|
||||
"num_err_log_entries": 0
|
||||
}`)
|
||||
|
||||
// lo/hi variant emitted by some nvme-cli versions
|
||||
var testNVMeSmartLogLoHi = []byte(`{
|
||||
"critical_warning": 0,
|
||||
"temperature": {"lo": 311, "hi": 0},
|
||||
"avail_spare": 100,
|
||||
"spare_thresh": 10,
|
||||
"percent_used": 0,
|
||||
"data_units_read": {"lo": 1023456, "hi": 0},
|
||||
"data_units_written": {"lo": 738281, "hi": 0},
|
||||
"power_cycles": {"lo": 32, "hi": 0},
|
||||
"power_on_hours": {"lo": 1234, "hi": 0},
|
||||
"unsafe_shutdowns": {"lo": 3, "hi": 0},
|
||||
"media_errors": {"lo": 0, "hi": 0},
|
||||
"num_err_log_entries": {"lo": 0, "hi": 0}
|
||||
}`)
|
||||
|
||||
var testSmartCtlHealth = []byte(`
|
||||
smartctl 7.3 2022-02-28 r5338 [x86_64-linux-5.15.0] (local build)
|
||||
Copyright (C) 2002-22, Bruce Allen, Christian Franke, www.smartmontools.org
|
||||
|
||||
=== START OF INFORMATION SECTION ===
|
||||
Device Model: SAMSUNG MZ1L2960HCJR-00A07
|
||||
Serial Number: S665NN0X415495
|
||||
Firmware Version: GDC7602Q
|
||||
User Capacity: 960,197,124,096 bytes [960 GB]
|
||||
|
||||
=== START OF READ SMART DATA SECTION ===
|
||||
SMART overall-health self-assessment test result: PASSED
|
||||
|
||||
SMART Attributes Data Structure revision number: 1
|
||||
Vendor Specific SMART Attributes with Thresholds:
|
||||
ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
|
||||
5 Reallocated_Sector_Ct 0x0032 100 100 000 Old_age Always - 0
|
||||
9 Power_On_Hours 0x0032 100 100 000 Old_age Always - 1234
|
||||
12 Power_Cycle_Count 0x0032 100 100 000 Old_age Always - 45
|
||||
177 Wear_Leveling_Count 0x0013 097 097 000 Pre-fail Always - 30
|
||||
190 Airflow_Temperature_Cel 0x0032 063 045 000 Old_age Always - 37
|
||||
`)
|
||||
|
||||
func TestGenerateDiskReportNVMe(t *testing.T) {
|
||||
t.Parallel()
|
||||
outputs := map[string][]byte{
|
||||
"nvme-id-ctrl": testNVMeIdCtrl,
|
||||
"nvme-smart-log": testNVMeSmartLog,
|
||||
}
|
||||
report := GenerateDiskReportText(1, "/dev/nvme0n1", outputs, time.Unix(0, 0).UTC())
|
||||
|
||||
assertContains(t, report, "Disk 1", "/dev/nvme0n1")
|
||||
assertContains(t, report, "SAMSUNG MZ1L2960HCJR-00A07")
|
||||
assertContains(t, report, "S665NN0X415495")
|
||||
assertContains(t, report, "GDC7602Q")
|
||||
assertContains(t, report, "38 °C") // 311 K - 273
|
||||
assertContains(t, report, "1,234 h") // power_on_hours with separator
|
||||
assertContains(t, report, "32") // power_cycles
|
||||
assertContains(t, report, "3") // unsafe_shutdowns
|
||||
assertContains(t, report, "378.0 GB") // data_units_written * 512000 / 1e9
|
||||
}
|
||||
|
||||
func TestGenerateDiskReportNVMeLoHi(t *testing.T) {
|
||||
t.Parallel()
|
||||
outputs := map[string][]byte{
|
||||
"nvme-id-ctrl": testNVMeIdCtrl,
|
||||
"nvme-smart-log": testNVMeSmartLogLoHi,
|
||||
}
|
||||
report := GenerateDiskReportText(1, "/dev/nvme0n1", outputs, time.Unix(0, 0).UTC())
|
||||
assertContains(t, report, "38 °C")
|
||||
assertContains(t, report, "1,234 h")
|
||||
}
|
||||
|
||||
func TestGenerateDiskReportSATA(t *testing.T) {
|
||||
t.Parallel()
|
||||
outputs := map[string][]byte{
|
||||
"smartctl-health": testSmartCtlHealth,
|
||||
}
|
||||
report := GenerateDiskReportText(2, "/dev/sda", outputs, time.Unix(0, 0).UTC())
|
||||
|
||||
assertContains(t, report, "Disk 2", "/dev/sda")
|
||||
assertContains(t, report, "SAMSUNG MZ1L2960HCJR-00A07")
|
||||
assertContains(t, report, "S665NN0X415495")
|
||||
assertContains(t, report, "PASSED")
|
||||
assertContains(t, report, "Reallocated_Sector_Ct")
|
||||
assertContains(t, report, "Power_On_Hours")
|
||||
}
|
||||
|
||||
// assertContains reports a test error (without stopping the test) for every
// needle that does not occur in text, printing the full text alongside.
func assertContains(t *testing.T, text string, needles ...string) {
	t.Helper()
	for i := range needles {
		if strings.Contains(text, needles[i]) {
			continue
		}
		t.Errorf("report missing %q\nreport:\n%s", needles[i], text)
	}
}
|
||||
@@ -25,6 +25,9 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sensor"}, File: "ipmitool-sensor.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sel", "list"}, File: "ipmitool-sel.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sel", "time", "get"}, File: "ipmitool-sel-time.txt"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
// core/internal/ingest/parser_hardware.go. No import dependency on core.
|
||||
package schema
|
||||
|
||||
import "encoding/json"
|
||||
|
||||
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
||||
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
||||
type HardwareIngestRequest struct {
|
||||
@@ -64,9 +66,10 @@ type HardwareSnapshot struct {
|
||||
Storage []HardwareStorage `json:"storage,omitempty"`
|
||||
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
PlatformConfig *json.RawMessage `json:"platform_config,omitempty"`
|
||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareHealthSummary struct {
|
||||
@@ -123,7 +126,7 @@ type HardwareCPU struct {
|
||||
type HardwareMemory struct {
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Location *string `json:"-"` // internal: used for DIMM telemetry matching only
|
||||
Present *bool `json:"present,omitempty"`
|
||||
SizeMB *int `json:"size_mb,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
@@ -261,15 +264,13 @@ type HardwareSensors struct {
|
||||
}
|
||||
|
||||
type HardwareFanSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
RPM *int `json:"rpm,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
Name string `json:"name"`
|
||||
RPM *int `json:"rpm,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
type HardwarePowerSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
VoltageV *float64 `json:"voltage_v,omitempty"`
|
||||
CurrentA *float64 `json:"current_a,omitempty"`
|
||||
PowerW *float64 `json:"power_w,omitempty"`
|
||||
@@ -278,7 +279,6 @@ type HardwarePowerSensor struct {
|
||||
|
||||
type HardwareTemperatureSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Celsius *float64 `json:"celsius,omitempty"`
|
||||
ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"`
|
||||
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
|
||||
@@ -286,11 +286,10 @@ type HardwareTemperatureSensor struct {
|
||||
}
|
||||
|
||||
type HardwareOtherSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Value *float64 `json:"value,omitempty"`
|
||||
Unit *string `json:"unit,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
Name string `json:"name"`
|
||||
Value *float64 `json:"value,omitempty"`
|
||||
Unit *string `json:"unit,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareEventLog struct {
|
||||
|
||||
@@ -1292,12 +1292,28 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
||||
_ = json.NewEncoder(w).Encode(map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISystemReboot(w http.ResponseWriter, r *http.Request) {
|
||||
if err := exec.Command("systemctl", "reboot").Start(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "reboot failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "rebooting"})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISystemShutdown(w http.ResponseWriter, r *http.Request) {
|
||||
if err := exec.Command("systemctl", "poweroff").Start(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "shutdown failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "shutting down"})
|
||||
}
|
||||
|
||||
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
var standardTools = []string{
|
||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
||||
"mstflint",
|
||||
"mstflint", "saa",
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -1679,6 +1695,56 @@ func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Reque
|
||||
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
||||
}
|
||||
|
||||
// ── Hardware summary / component detail ──────────────────────────────────────
|
||||
|
||||
// handleAPIHardwareSummary returns the hardware summary card HTML fragment for
|
||||
// htmx polling (hx-get="/api/hardware-summary" hx-swap="outerHTML").
|
||||
func (h *handler) handleAPIHardwareSummary(w http.ResponseWriter, _ *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
fmt.Fprint(w, renderHardwareSummaryCard(h.opts))
|
||||
}
|
||||
|
||||
// handleAPIComponentDetail returns an HTML fragment describing the current and
|
||||
// historical status for one component type (cpu, memory, storage, gpu, psu).
|
||||
func (h *handler) handleAPIComponentDetail(w http.ResponseWriter, r *http.Request) {
|
||||
compType := r.PathValue("type")
|
||||
var exact, prefixes []string
|
||||
var title string
|
||||
switch compType {
|
||||
case "cpu":
|
||||
title = "CPU"
|
||||
exact = []string{"cpu:all"}
|
||||
case "memory":
|
||||
title = "Memory"
|
||||
exact = []string{"memory:all"}
|
||||
prefixes = []string{"memory:"}
|
||||
case "storage":
|
||||
title = "Storage"
|
||||
exact = []string{"storage:all"}
|
||||
prefixes = []string{"storage:"}
|
||||
case "gpu":
|
||||
title = "GPU"
|
||||
prefixes = []string{"pcie:gpu:"}
|
||||
case "psu":
|
||||
title = "PSU"
|
||||
prefixes = []string{"psu:"}
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
|
||||
var records []app.ComponentStatusRecord
|
||||
if h.opts.App != nil && h.opts.App.StatusDB != nil {
|
||||
all := h.opts.App.StatusDB.All()
|
||||
records = matchedRecords(all, exact, prefixes)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
fmt.Fprint(w, renderComponentDetail(title, records))
|
||||
}
|
||||
|
||||
func (h *handler) rollbackPendingNetworkChange() error {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
|
||||
76
audit/internal/webui/health_poller.go
Normal file
76
audit/internal/webui/health_poller.go
Normal file
@@ -0,0 +1,76 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/collector"
|
||||
)
|
||||
|
||||
const healthPollInterval = 60 * time.Second
|
||||
const psuIPMITimeout = 15 * time.Second
|
||||
|
||||
// healthPoller runs periodic health checks for hardware components that do not
|
||||
// emit kernel log events (e.g. PSU). Results are written to ComponentStatusDB.
|
||||
type healthPoller struct {
|
||||
statusDB *app.ComponentStatusDB
|
||||
}
|
||||
|
||||
func newHealthPoller(statusDB *app.ComponentStatusDB) *healthPoller {
|
||||
return &healthPoller{statusDB: statusDB}
|
||||
}
|
||||
|
||||
func (p *healthPoller) start() {
|
||||
goRecoverLoop("health poller", 5*time.Second, p.run)
|
||||
}
|
||||
|
||||
func (p *healthPoller) run() {
|
||||
ticker := time.NewTicker(healthPollInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
p.pollPSU()
|
||||
}
|
||||
}
|
||||
|
||||
func (p *healthPoller) pollPSU() {
|
||||
if p.statusDB == nil {
|
||||
return
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), psuIPMITimeout)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "sdr")
|
||||
var out bytes.Buffer
|
||||
cmd.Stdout = &out
|
||||
if err := cmd.Run(); err != nil {
|
||||
// IPMI not available or not a server — skip silently.
|
||||
slog.Debug("health poller: ipmitool sdr unavailable", "err", err)
|
||||
return
|
||||
}
|
||||
|
||||
slots := collector.PSUSlotsFromSDR(out.String())
|
||||
if len(slots) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
const source = "watchdog:psu"
|
||||
for slot, psu := range slots {
|
||||
key := "psu:" + slot
|
||||
status := psu.Status
|
||||
if status == "" {
|
||||
status = "Unknown"
|
||||
}
|
||||
detail := ""
|
||||
switch status {
|
||||
case "Critical":
|
||||
detail = "PSU sensor reported non-OK state"
|
||||
case "Warning":
|
||||
detail = "PSU sensor in warning state"
|
||||
}
|
||||
p.statusDB.Record(key, source, status, detail)
|
||||
}
|
||||
}
|
||||
280
audit/internal/webui/huawei_elabel.go
Normal file
280
audit/internal/webui/huawei_elabel.go
Normal file
@@ -0,0 +1,280 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type huaweiField struct {
|
||||
Name string `json:"name"`
|
||||
Key string `json:"key"`
|
||||
Value string `json:"value"`
|
||||
ReadOnly bool `json:"read_only,omitempty"`
|
||||
}
|
||||
|
||||
type huaweiChange struct {
|
||||
Key string `json:"key"`
|
||||
Value string `json:"value"`
|
||||
}
|
||||
|
||||
type huaweiFieldDef struct {
|
||||
Name string
|
||||
Key string
|
||||
FruID byte
|
||||
TypeID byte
|
||||
FieldID byte
|
||||
Special string // "chassis-type" | "guid"
|
||||
}
|
||||
|
||||
var huaweiElabelDefs = []huaweiFieldDef{
|
||||
{"Device Name", "DeviceName", 0x00, 0x06, 0x01, ""},
|
||||
{"Device Serial Number", "DeviceSerialNumber", 0x00, 0x06, 0x03, ""},
|
||||
{"Product Name", "ProductName", 0x00, 0x03, 0x01, ""},
|
||||
{"Product Serial Number", "ProductSerialNumber", 0x00, 0x03, 0x04, ""},
|
||||
{"Product Asset Tag", "ProductAssetTag", 0x00, 0x03, 0x05, ""},
|
||||
{"Product Manufacturer", "ProductManufacturer", 0x00, 0x03, 0x00, ""},
|
||||
{"Mainboard Manufacturer", "MainboardManufacturer", 0x00, 0x02, 0x01, ""},
|
||||
{"Board Product Name", "BoardProductName", 0x00, 0x02, 0x02, ""},
|
||||
{"Chassis Part Number", "ChassisPartnumber", 0x00, 0x01, 0x01, ""},
|
||||
{"Chassis Type", "ChassisType", 0x00, 0x01, 0x00, "chassis-type"},
|
||||
{"IO Chassis Serial", "IOChassisSerialNumber", 0x01, 0x03, 0x04, ""},
|
||||
{"IO Chassis Asset Tag", "IOChassisAssetTag", 0x01, 0x03, 0x05, ""},
|
||||
{"GUID", "GUID", 0x00, 0x00, 0x00, "guid"},
|
||||
}
|
||||
|
||||
// huaweiGetRaw reads a string elabel field via OEM IPMI raw command.
|
||||
// Protocol: ipmitool raw 0x30 0x90 0x05 <fru_id> <type_id> <field_id> 0x00 0x30
|
||||
// Response: <length_byte> <ascii_byte1> ... (null-terminated)
|
||||
func huaweiGetRaw(ctx context.Context, def huaweiFieldDef) (string, error) {
|
||||
if def.Special == "guid" {
|
||||
return huaweiGetGUID(ctx)
|
||||
}
|
||||
args := []string{
|
||||
"0x30", "0x90", "0x05",
|
||||
fmt.Sprintf("0x%02x", def.FruID),
|
||||
fmt.Sprintf("0x%02x", def.TypeID),
|
||||
fmt.Sprintf("0x%02x", def.FieldID),
|
||||
"0x00", "0x30",
|
||||
}
|
||||
out, err := exec.CommandContext(ctx, "ipmitool", append([]string{"raw"}, args...)...).CombinedOutput()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return huaweiParseStringResponse(strings.TrimSpace(string(out)), def.Special), nil
|
||||
}
|
||||
|
||||
// huaweiParseStringResponse decodes the OEM IPMI response bytes to a string.
// Format: <length_byte> <byte1> <byte2> ...
//
// hexOut is a whitespace-separated list of hex bytes. The first byte is
// skipped and the remaining bytes are decoded up to the first zero byte or
// unparsable token. For special == "chassis-type" the second byte is returned
// formatted as "0xNN" instead. Fewer than two tokens yields "".
func huaweiParseStringResponse(hexOut, special string) string {
	tokens := strings.Fields(hexOut)
	if len(tokens) < 2 {
		return ""
	}
	payload := tokens[1:] // tokens[0] is the length byte; it is not used

	if special == "chassis-type" {
		// Response: <length=1> <type_byte>
		kind, err := strconv.ParseUint(payload[0], 16, 8)
		if err != nil {
			return ""
		}
		return fmt.Sprintf("0x%02x", kind)
	}

	decoded := make([]byte, 0, len(payload))
	for _, tok := range payload {
		v, err := strconv.ParseUint(tok, 16, 8)
		if err != nil || v == 0 {
			break
		}
		decoded = append(decoded, byte(v))
	}
	return string(decoded)
}
|
||||
|
||||
// huaweiGetGUID reads the system GUID via the standard IPMI Get System GUID
// command (ipmitool raw 0x06 0x08) and formats it as a UUID string.
//
// The 16 response tokens are reversed (iBMC returns the bytes in reversed
// order) and then grouped 4-2-2-2-6. The tokens are used as printed by
// ipmitool; they are not re-parsed or re-cased.
//
// Returns the exec error if ipmitool fails, and ("", nil) when the output does
// not consist of exactly 16 tokens.
func huaweiGetGUID(ctx context.Context) (string, error) {
	out, err := exec.CommandContext(ctx, "ipmitool", "raw", "0x06", "0x08").CombinedOutput()
	if err != nil {
		return "", err
	}

	tokens := strings.Fields(strings.TrimSpace(string(out)))
	if len(tokens) != 16 {
		return "", nil
	}

	// Reverse into canonical order.
	octets := make([]string, len(tokens))
	for i, tok := range tokens {
		octets[len(tokens)-1-i] = tok
	}

	groups := []string{
		strings.Join(octets[0:4], ""),
		strings.Join(octets[4:6], ""),
		strings.Join(octets[6:8], ""),
		strings.Join(octets[8:10], ""),
		strings.Join(octets[10:16], ""),
	}
	return strings.Join(groups, "-"), nil
}
|
||||
|
||||
// huaweiChunks builds the per-chunk argument lists for the OEM IPMI SET
// protocol. Each returned slice is: key byte, chunk length, then the chunk's
// data bytes, all formatted as "0xNN".
//
// Key byte: bit7=1 means more chunks follow; bits 0-6 = offset into string.
//
// Values longer than 63 bytes are truncated to 63; data is split into chunks
// of at most 19 bytes. An empty value produces the single chunk
// {"0x00", "0x01", "0x00"}.
func huaweiChunks(value string) [][]string {
	const (
		maxLen    = 63
		chunkSize = 19
	)

	if value == "" {
		return [][]string{{"0x00", "0x01", "0x00"}}
	}
	if len(value) > maxLen {
		value = value[:maxLen]
	}

	var chunks [][]string
	for start := 0; start < len(value); start += chunkSize {
		stop := start + chunkSize
		if stop > len(value) {
			stop = len(value)
		}
		part := value[start:stop]

		key := byte(start)
		if stop < len(value) {
			key |= 0x80 // more chunks follow
		}

		args := make([]string, 0, 2+len(part))
		args = append(args, fmt.Sprintf("0x%02x", key), fmt.Sprintf("0x%02x", len(part)))
		for i := 0; i < len(part); i++ {
			args = append(args, fmt.Sprintf("0x%02x", part[i]))
		}
		chunks = append(chunks, args)
	}
	return chunks
}
|
||||
|
||||
func (h *handler) handleAPIHuaweiElabelRead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var fields []huaweiField
|
||||
for _, def := range huaweiElabelDefs {
|
||||
val, err := huaweiGetRaw(ctx, def)
|
||||
if err != nil {
|
||||
// First field failure likely means no Huawei BMC — abort with error.
|
||||
if len(fields) == 0 {
|
||||
msg := strings.TrimSpace(err.Error())
|
||||
writeError(w, http.StatusInternalServerError, "huawei elabel not available: "+msg)
|
||||
return
|
||||
}
|
||||
val = ""
|
||||
}
|
||||
fields = append(fields, huaweiField{
|
||||
Name: def.Name,
|
||||
Key: def.Key,
|
||||
Value: val,
|
||||
ReadOnly: def.Special == "guid" || def.Special == "chassis-type",
|
||||
})
|
||||
}
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIHuaweiElabelWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []huaweiChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
|
||||
defByKey := make(map[string]huaweiFieldDef, len(huaweiElabelDefs))
|
||||
for _, d := range huaweiElabelDefs {
|
||||
defByKey[d.Key] = d
|
||||
}
|
||||
|
||||
for _, c := range req.Changes {
|
||||
def, ok := defByKey[c.Key]
|
||||
if !ok {
|
||||
writeError(w, http.StatusUnprocessableEntity, "unknown field key: "+c.Key)
|
||||
return
|
||||
}
|
||||
if def.Special == "guid" || def.Special == "chassis-type" {
|
||||
writeError(w, http.StatusUnprocessableEntity, "field is read-only: "+c.Key)
|
||||
return
|
||||
}
|
||||
if len(c.Value) > 63 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value too long (max 63 chars): "+c.Key)
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch < 0x20 || ch > 0x7E {
|
||||
writeError(w, http.StatusUnprocessableEntity, "non-printable character in value for: "+c.Key)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("huawei-elabel-write"),
|
||||
Name: fmt.Sprintf("Huawei Elabel Write (%d field(s))", len(req.Changes)),
|
||||
Target: "huawei-elabel-write",
|
||||
Priority: defaultTaskPriority("huawei-elabel-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{HuaweiElabelChanges: req.Changes},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runHuaweiElabelWriteTask(ctx context.Context, j *jobState, p taskParams) error {
|
||||
defByKey := make(map[string]huaweiFieldDef, len(huaweiElabelDefs))
|
||||
for _, d := range huaweiElabelDefs {
|
||||
defByKey[d.Key] = d
|
||||
}
|
||||
|
||||
// Enable device name effective flag before writing.
|
||||
enableCmd := exec.CommandContext(ctx, "ipmitool", "raw", "0x30", "0x90", "0x21", "0x04", "0x01")
|
||||
if out, err := enableCmd.CombinedOutput(); err != nil {
|
||||
j.append("Warning: enable flag: " + strings.TrimSpace(string(out)))
|
||||
}
|
||||
|
||||
for _, c := range p.HuaweiElabelChanges {
|
||||
def := defByKey[c.Key]
|
||||
setPrefix := []string{
|
||||
"0x30", "0x90", "0x04",
|
||||
fmt.Sprintf("0x%02x", def.FruID),
|
||||
fmt.Sprintf("0x%02x", def.TypeID),
|
||||
fmt.Sprintf("0x%02x", def.FieldID),
|
||||
}
|
||||
|
||||
chunks := huaweiChunks(c.Value)
|
||||
j.append(fmt.Sprintf("Setting %s = %q (%d chunk(s))", c.Key, c.Value, len(chunks)))
|
||||
|
||||
for _, chunk := range chunks {
|
||||
args := append([]string{"raw"}, setPrefix...)
|
||||
args = append(args, chunk...)
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", args...)
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("set %s: %w", c.Key, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Commit after each field.
|
||||
commitCmd := exec.CommandContext(ctx, "ipmitool", "raw", "0x30", "0x90", "0x06", "0x00", "0xAA")
|
||||
if out, err := commitCmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("commit after %s: %w (output: %s)", c.Key, err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
j.append("Committed " + c.Key)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
217
audit/internal/webui/ipmi_fru.go
Normal file
217
audit/internal/webui/ipmi_fru.go
Normal file
@@ -0,0 +1,217 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// fruField is one "name : value" line from ipmitool fru print, as returned to
// the web UI by the FRU read endpoint.
type fruField struct {
	// Name is the field label exactly as printed by ipmitool.
	Name string `json:"name"`
	// Value is the text after the separator, trimmed.
	Value string `json:"value"`
	// Editable tells the UI whether to offer an edit control.
	Editable bool `json:"editable"`
	// Area is the ipmitool fru edit area letter; omitted from JSON when empty.
	Area string `json:"area,omitempty"`
	// Index is the field index within Area; omitted from JSON when zero.
	Index int `json:"index,omitempty"`
}
|
||||
|
||||
// fruChange is a single requested FRU edit as posted by the web UI and later
// executed as: ipmitool fru edit 0 field <Area> <Index> <Value>.
type fruChange struct {
	// Area is the ipmitool area letter; the write handler accepts "c", "b", "p".
	Area string `json:"area"`
	// Index is the field index within Area.
	Index int `json:"index"`
	// Name is the display name; used to look up Area/Index when Area is empty.
	Name string `json:"name"`
	// Value is the new field content.
	Value string `json:"value"`
}
|
||||
|
||||
// fruEditableFields resolves a FRU field's display name to the area letter and
// field index expected by ipmitool fru edit. Where a field is known under two
// spellings (the vendor-doc name and ipmitool's abbreviated label), both keys
// are present and map to the same slot.
var fruEditableFields = map[string]struct {
	Area  string
	Index int
}{
	// Chassis area ("c").
	"Chassis Part Number":   {Area: "c", Index: 0},
	"Chassis Serial Number": {Area: "c", Index: 1},
	"Chassis Serial":        {Area: "c", Index: 1},

	// Board area ("b").
	"Board Manufacturer":  {Area: "b", Index: 0},
	"Board Mfg":           {Area: "b", Index: 0},
	"Board Product Name":  {Area: "b", Index: 1},
	"Board Product":       {Area: "b", Index: 1},
	"Board Serial Number": {Area: "b", Index: 2},
	"Board Serial":        {Area: "b", Index: 2},
	"Board Part Number":   {Area: "b", Index: 3},

	// Product area ("p").
	"Product Manufacturer":  {Area: "p", Index: 0},
	"Product Name":          {Area: "p", Index: 1},
	"Product Part Number":   {Area: "p", Index: 2},
	"Product Version":       {Area: "p", Index: 3},
	"Product Serial Number": {Area: "p", Index: 4},
	"Product Serial":        {Area: "p", Index: 4},
	"Product Asset Tag":     {Area: "p", Index: 5},
}
|
||||
|
||||
// fruExtraBaseIndex records, for each area's repeated "<Area> Extra" custom
// field label, the area letter and the ipmitool field index of the first such
// field (per the vendor FRU field doc: Chassis extras start at 2, Board at 5,
// Product at 7).
//
// ipmitool fru print emits one identically named line per custom field, so the
// parser adds the number of earlier occurrences of the label to Base to get
// each line's real index.
var fruExtraBaseIndex = map[string]struct {
	Area string
	Base int
}{
	"Chassis Extra": {Area: "c", Base: 2},
	"Board Extra":   {Area: "b", Base: 5},
	"Product Extra": {Area: "p", Base: 7},
}
|
||||
|
||||
func parseFRUOutput(output string) []fruField {
|
||||
var fields []fruField
|
||||
extraSeen := map[string]int{}
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
// Lines look like: " Field Name : value"
|
||||
trimmed := strings.TrimLeft(line, " \t")
|
||||
if trimmed == "" {
|
||||
continue
|
||||
}
|
||||
colon := strings.Index(trimmed, " : ")
|
||||
valueOffset := 3
|
||||
if colon < 0 {
|
||||
// try ": " with no leading space before colon
|
||||
colon = strings.Index(trimmed, ": ")
|
||||
valueOffset = 2
|
||||
if colon < 0 {
|
||||
continue
|
||||
}
|
||||
}
|
||||
name := strings.TrimSpace(trimmed[:colon])
|
||||
value := strings.TrimSpace(trimmed[colon+valueOffset:])
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
editable, area, idx := fruFieldMeta(name, extraSeen)
|
||||
fields = append(fields, fruField{Name: name, Value: value, Editable: editable, Area: area, Index: idx})
|
||||
}
|
||||
return fields
|
||||
}
|
||||
|
||||
func fruFieldMeta(name string, extraSeen map[string]int) (editable bool, area string, index int) {
|
||||
if e, ok := fruExtraBaseIndex[name]; ok {
|
||||
idx := e.Base + extraSeen[name]
|
||||
extraSeen[name]++
|
||||
return true, e.Area, idx
|
||||
}
|
||||
if e, ok := fruEditableFields[name]; ok {
|
||||
return true, e.Area, e.Index
|
||||
}
|
||||
// All fields are shown as editable; server will reject unknown fields.
|
||||
return true, "", 0
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIIPMIFRURead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
out, err := exec.CommandContext(ctx, "ipmitool", "fru", "print", "0").CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, "ipmitool fru print: "+msg)
|
||||
return
|
||||
}
|
||||
|
||||
fields := parseFRUOutput(string(out))
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIIPMIFRUWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []fruChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
validAreas := map[string]bool{"c": true, "b": true, "p": true}
|
||||
for i, c := range req.Changes {
|
||||
if c.Area == "" {
|
||||
e, ok := fruEditableFields[c.Name]
|
||||
if !ok {
|
||||
writeError(w, http.StatusUnprocessableEntity, "field not writable via ipmitool: "+c.Name)
|
||||
return
|
||||
}
|
||||
req.Changes[i].Area = e.Area
|
||||
req.Changes[i].Index = e.Index
|
||||
c = req.Changes[i]
|
||||
}
|
||||
if !validAreas[c.Area] {
|
||||
writeError(w, http.StatusUnprocessableEntity, "invalid area: "+c.Area)
|
||||
return
|
||||
}
|
||||
if c.Index < 0 || c.Index > 9 {
|
||||
writeError(w, http.StatusUnprocessableEntity, fmt.Sprintf("invalid index %d", c.Index))
|
||||
return
|
||||
}
|
||||
if len(c.Value) > 64 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value too long (max 64 chars)")
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch > unicode.MaxASCII || (ch < 0x20 && ch != 0) {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value contains non-printable characters")
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("ipmi-fru-write"),
|
||||
Name: fmt.Sprintf("IPMI FRU Write (%d field(s))", len(req.Changes)),
|
||||
Target: "ipmi-fru-write",
|
||||
Priority: defaultTaskPriority("ipmi-fru-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{FRUChanges: req.Changes},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runIPMIFRUWriteTask(ctx context.Context, j *jobState, exportDir string, p taskParams) error {
|
||||
// Backup current FRU state
|
||||
backupDir := filepath.Join(exportDir, "fru-backups")
|
||||
if err := os.MkdirAll(backupDir, 0755); err != nil {
|
||||
return fmt.Errorf("mkdir fru-backups: %w", err)
|
||||
}
|
||||
stamp := time.Now().Format("20060102150405")
|
||||
backupPath := filepath.Join(backupDir, "fru-"+stamp+".txt")
|
||||
|
||||
backupOut, err := exec.CommandContext(ctx, "ipmitool", "fru", "print", "0").CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("backup fru print: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(backupPath, backupOut, 0644); err != nil {
|
||||
return fmt.Errorf("write backup: %w", err)
|
||||
}
|
||||
j.append("Backup saved to " + backupPath)
|
||||
|
||||
// Apply changes
|
||||
for _, c := range p.FRUChanges {
|
||||
j.append(fmt.Sprintf("Setting %s (%s %d) = %q", c.Name, c.Area, c.Index, c.Value))
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "edit", "0", "field", c.Area, fmt.Sprintf("%d", c.Index), c.Value)
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("fru edit %s %d: %w", c.Area, c.Index, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
59
audit/internal/webui/ipmi_fru_test.go
Normal file
59
audit/internal/webui/ipmi_fru_test.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package webui
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseFRUOutputExtraFields(t *testing.T) {
|
||||
// Realistic ipmitool fru print output: repeated "<Area> Extra" lines
|
||||
// (one per custom field) must resolve to sequential indices per the
|
||||
// vendor FRU doc (Chassis Extra starts at 2, Board Extra at 5, Product
|
||||
// Extra at 7), not all collapse onto the same index.
|
||||
out := `
|
||||
Product Manufacturer : Inspur
|
||||
Product Name : NF5280M6
|
||||
Product Part Number : PN123
|
||||
Product Version : 1.0
|
||||
Product Serial : SN123
|
||||
Product Asset Tag : ASSET01
|
||||
Product Extra : custom-p1
|
||||
Board Mfg : Inspur
|
||||
Board Product : BoardX
|
||||
Board Serial : BSN1
|
||||
Board Part Number : BPN1
|
||||
Board Extra : custom-b1
|
||||
Board Extra : custom-b2
|
||||
Board Extra : custom-b3
|
||||
Chassis Part Number : CPN1
|
||||
Chassis Serial : CSN1
|
||||
Chassis Extra : front-half
|
||||
Chassis Extra : back-half
|
||||
`
|
||||
fields := parseFRUOutput(out)
|
||||
|
||||
byName := map[string][]fruField{}
|
||||
for _, f := range fields {
|
||||
byName[f.Name] = append(byName[f.Name], f)
|
||||
}
|
||||
|
||||
assertMeta := func(name string, occurrence int, wantArea string, wantIndex int) {
|
||||
t.Helper()
|
||||
list := byName[name]
|
||||
if occurrence >= len(list) {
|
||||
t.Fatalf("expected occurrence %d of %q, got %d entries", occurrence, name, len(list))
|
||||
}
|
||||
f := list[occurrence]
|
||||
if f.Area != wantArea || f.Index != wantIndex {
|
||||
t.Errorf("%s[%d] = area:%q index:%d, want area:%q index:%d", name, occurrence, f.Area, f.Index, wantArea, wantIndex)
|
||||
}
|
||||
if !f.Editable {
|
||||
t.Errorf("%s[%d] expected editable", name, occurrence)
|
||||
}
|
||||
}
|
||||
|
||||
assertMeta("Product Asset Tag", 0, "p", 5)
|
||||
assertMeta("Product Extra", 0, "p", 7)
|
||||
assertMeta("Board Extra", 0, "b", 5)
|
||||
assertMeta("Board Extra", 1, "b", 6)
|
||||
assertMeta("Board Extra", 2, "b", 7)
|
||||
assertMeta("Chassis Extra", 0, "c", 2)
|
||||
assertMeta("Chassis Extra", 1, "c", 3)
|
||||
}
|
||||
@@ -73,6 +73,9 @@ func (w *kmsgWatcher) run() {
|
||||
w.mu.Lock()
|
||||
if w.window != nil {
|
||||
w.recordEvent(evt)
|
||||
} else {
|
||||
evtCopy := evt
|
||||
goRecoverOnce("kmsg flush immediate", func() { w.flushImmediate(evtCopy) })
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
@@ -162,7 +165,9 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
case "gpu":
|
||||
key = "pcie:gpu:" + normalizeBDF(id)
|
||||
case "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
@@ -180,6 +185,54 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
}
|
||||
}
|
||||
|
||||
// flushImmediate writes a single kmsg event directly to the status DB without a SAT window.
|
||||
// Called when an error is detected outside of any SAT task (always-on watching).
|
||||
func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
const source = "watchdog:kmsg"
|
||||
detail := "kernel: " + truncate(evt.raw, 120)
|
||||
|
||||
var severity string
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
if p.Re.MatchString(evt.raw) {
|
||||
if p.Severity == "critical" {
|
||||
severity = "Critical"
|
||||
} else {
|
||||
severity = "Warning"
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if severity == "" {
|
||||
severity = "Warning"
|
||||
}
|
||||
|
||||
if len(evt.ids) == 0 {
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
w.statusDB.Record(key, source, severity, detail)
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu":
|
||||
key = "pcie:gpu:" + normalizeBDF(id)
|
||||
case "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
w.statusDB.Record(key, source, severity, detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
|
||||
@@ -17,6 +17,7 @@ func layoutHead(title string) string {
|
||||
<style>
|
||||
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
|
||||
*{box-sizing:border-box;margin:0;padding:0}
|
||||
dialog{margin:auto}
|
||||
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
|
||||
a{color:var(--accent);text-decoration:none}
|
||||
/* Sidebar */
|
||||
@@ -67,6 +68,10 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||
/* Nav separator and tasks count badge */
|
||||
.nav-sep{height:1px;background:rgba(255,255,255,.12);margin:6px 0}
|
||||
.tasks-nav-count{background:var(--accent);color:#fff;border-radius:10px;padding:1px 7px;font-size:11px;font-weight:700;display:none;margin-left:auto}
|
||||
.tasks-nav-count.active{display:inline}
|
||||
/* Output terminal */
|
||||
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||
@@ -92,14 +97,21 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
}
|
||||
|
||||
func layoutNav(active string, buildLabel string) string {
|
||||
items := []struct{ id, label, href, onclick string }{
|
||||
{"dashboard", "Dashboard", "/", ""},
|
||||
{"audit", "Audit", "/audit", ""},
|
||||
{"validate", "Validate", "/validate", ""},
|
||||
{"burn", "Burn", "/burn", ""},
|
||||
{"benchmark", "Benchmark", "/benchmark", ""},
|
||||
{"tasks", "Tasks", "/tasks", ""},
|
||||
{"tools", "Tools", "/tools", ""},
|
||||
type navItem struct {
|
||||
id, label, href string
|
||||
sep bool
|
||||
}
|
||||
items := []navItem{
|
||||
{id: "dashboard", label: "Dashboard", href: "/"},
|
||||
{id: "audit", label: "1. Audit", href: "/audit"},
|
||||
{id: "check", label: "2. Check", href: "/check"},
|
||||
{id: "load", label: "3. Load", href: "/load"},
|
||||
{id: "burn", label: "4. Burn", href: "/burn"},
|
||||
{id: "benchmark", label: "5. Benchmark", href: "/benchmark"},
|
||||
{sep: true},
|
||||
{id: "tasks", label: "Tasks", href: "/tasks"},
|
||||
{id: "tools", label: "Tools", href: "/tools"},
|
||||
{id: "settings", label: "Settings", href: "/settings"},
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<aside class="sidebar">`)
|
||||
@@ -119,19 +131,24 @@ func layoutNav(active string, buildLabel string) string {
|
||||
}
|
||||
b.WriteString(`<nav class="nav">`)
|
||||
for _, item := range items {
|
||||
if item.sep {
|
||||
b.WriteString(`<div class="nav-sep"></div>`)
|
||||
continue
|
||||
}
|
||||
cls := "nav-item"
|
||||
if item.id == active {
|
||||
cls += " active"
|
||||
}
|
||||
if item.onclick != "" {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" onclick="%s">%s</a>`,
|
||||
cls, item.href, item.onclick, item.label))
|
||||
if item.id == "tasks" {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" id="tasks-nav-item">%s<span class="tasks-nav-count" id="tasks-nav-count"></span></a>`, cls, item.href, item.label))
|
||||
} else {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`,
|
||||
cls, item.href, item.label))
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`, cls, item.href, item.label))
|
||||
}
|
||||
}
|
||||
b.WriteString(`</nav>`)
|
||||
b.WriteString(`<script>`)
|
||||
b.WriteString(`(function(){function u(){fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(d){var n=Array.isArray(d)?d.filter(function(t){return t.status==='pending'||t.status==='running';}).length:0;var c=document.getElementById('tasks-nav-count');var el=document.getElementById('tasks-nav-item');if(c){c.textContent=n>0?String(n):'';c.className='tasks-nav-count'+(n>0?' active':'');}if(el){el.style.color=n>0?'#f6c90e':'';}}).catch(function(){});}u();setInterval(u,5000);})();`)
|
||||
b.WriteString(`</script>`)
|
||||
b.WriteString(`</aside>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
@@ -611,3 +611,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// renderSpeed and renderEndurance are legacy wrappers; canonical page is 5. Benchmark at /benchmark.
|
||||
func renderSpeed(opts HandlerOptions) string { return renderBenchmark(opts) }
|
||||
// renderEndurance is a legacy wrapper that renders the same content as
// renderBenchmark; the canonical page is "5. Benchmark" at /benchmark.
func renderEndurance(opts HandlerOptions) string {
	return renderBenchmark(opts)
}
|
||||
|
||||
@@ -2,7 +2,7 @@ package webui
|
||||
|
||||
func renderBurn() string {
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn runs sustained GPU compute and CPU/memory stress recipes. DCGM targeted diagnostics (<code>targeted_stress</code>, <code>targeted_power</code>, <code>pulse_test</code>) and NCCL/NVBandwidth are on the <a href="/load">3. Load</a> page. For performance benchmarks, see <a href="/benchmark">5. Benchmark</a>.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
|
||||
@@ -402,93 +402,226 @@ loadNvidiaSelfHeal();
|
||||
}
|
||||
|
||||
func renderTools() string {
|
||||
return `<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">System Install</div>
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||
renderInstallInline() + `
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let kind = d.kind || 'unknown';
|
||||
let label = source;
|
||||
if (kind === 'ram') label = 'RAM';
|
||||
else if (kind === 'usb') label = 'USB (' + source + ')';
|
||||
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
|
||||
else if (kind === 'disk') label = 'disk (' + source + ')';
|
||||
else label = source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
txt.textContent = d.blocked_reason || d.message || 'Checking...';
|
||||
if (d.status === 'ok' || d.in_ram) {
|
||||
txt.style.color = 'var(--ok, green)';
|
||||
} else if (d.status === 'failed') {
|
||||
txt.style.color = 'var(--err, #b91c1c)';
|
||||
} else {
|
||||
txt.style.color = 'var(--muted)';
|
||||
}
|
||||
if (d.can_start_task) {
|
||||
btn.style.display = '';
|
||||
btn.disabled = false;
|
||||
} else {
|
||||
btn.style.display = 'none';
|
||||
}
|
||||
});
|
||||
function installToRAM() {
|
||||
document.getElementById('ram-install-btn').disabled = true;
|
||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||
window.location.href = '/tasks#' + d.task_id;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
return renderNVMeFormatCard() + `
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
` + renderFRUEditorCard() + `
|
||||
|
||||
` + renderRAIDMgmtCard()
|
||||
}
|
||||
|
||||
func renderFRUEditorCard() string {
|
||||
return `<div class="card"><div class="card-head card-head-actions">FRU / Elabel<div class="card-head-buttons"><button class="btn btn-sm btn-secondary" onclick="fruAllRead()">Read All</button></div></div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits hardware identity fields from all available sources. Each field shows its source method.</p>
|
||||
<div id="fru-all-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
|
||||
<div id="fru-src-status" style="display:none;margin-bottom:10px"></div>
|
||||
<div id="fru-all-table"></div>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
|
||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||
renderNvidiaSelfHealInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
` + renderNVMeFormatCard() + `
|
||||
|
||||
<style>
|
||||
.fru-chip{display:inline-block;font-size:10px;font-weight:600;letter-spacing:.02em;padding:1px 6px;border-radius:3px;vertical-align:middle;white-space:nowrap;margin-right:8px;flex-shrink:0}
|
||||
.fru-chip-ipmi{background:#e8e8e8;color:#555}
|
||||
.fru-chip-huawei{background:#fff0e6;color:#b83}
|
||||
.fru-chip-saa{background:#e6f0ff;color:#557}
|
||||
.fru-inp-wrap{display:flex;align-items:center;gap:0}
|
||||
</style>
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||
const rows = tools.map(t =>
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
|
||||
).join('');
|
||||
document.getElementById('tools-table').innerHTML =
|
||||
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||
(function(){
|
||||
var _actBtn='width:22px;height:22px;padding:0;font-size:13px;line-height:1;border:1px solid var(--line);border-radius:3px;background:var(--surface);cursor:pointer;vertical-align:middle;';
|
||||
var _inp='width:100%;padding:3px 6px;border:1.5px solid #888;border-radius:3px;font-size:13px;font-family:monospace;background:var(--surface);color:var(--ink);';
|
||||
|
||||
// SOURCES lists the backends whose fields are shown in the unified FRU editor.
// Each descriptor carries:
//   id / label / chipClass - identity and the chip shown next to each row
//   url / writeUrl         - read endpoint and its "/write" counterpart
//   rowAttrs(f)            - data-* attributes stamped on the row's <input>
//   writeBody(inp)         - JSON request body built from those data-* attributes
//   fieldName / fieldValue - display name and current value of a field
//   readOnly(f)            - true when the field must not be editable
var SOURCES = (function() {
  function nameOf(f) { return f.name; }
  function valueOf(f) { return f.value||''; }
  function alwaysEditable(f) { return false; }
  // Every write endpoint takes a single-element "changes" list.
  function oneChange(change) { return JSON.stringify({changes:[change]}); }

  function source(id, label, chipClass, url, rowAttrs, writeBody, readOnly) {
    return {
      id: id,
      label: label,
      chipClass: chipClass,
      url: url,
      writeUrl: url + '/write',
      rowAttrs: rowAttrs,
      writeBody: writeBody,
      fieldName: nameOf,
      fieldValue: valueOf,
      readOnly: readOnly || alwaysEditable,
    };
  }

  return [
    source('ipmi-fru', 'IPMI FRU', 'fru-chip-ipmi', '/api/tools/ipmi-fru',
      function(f) {
        return 'data-source="ipmi-fru" data-area="'+esc(f.area||'')+'" data-index="'+(f.index||0)+'" data-name="'+esc(f.name)+'"';
      },
      function(inp) {
        return oneChange({area:inp.dataset.area,index:parseInt(inp.dataset.index,10),name:inp.dataset.name,value:inp.value});
      }),
    source('huawei', 'Huawei iBMC', 'fru-chip-huawei', '/api/tools/huawei-elabel',
      function(f) {
        return 'data-source="huawei" data-key="'+esc(f.key)+'"';
      },
      function(inp) {
        return oneChange({key:inp.dataset.key,value:inp.value});
      },
      function(f) { return !!f.read_only; }),
    source('saa-dmi', 'SAA DMI', 'fru-chip-saa', '/api/tools/saa-dmi',
      function(f) {
        return 'data-source="saa-dmi" data-shn="'+esc(f.shn)+'"';
      },
      function(inp) {
        return oneChange({shn:inp.dataset.shn,value:inp.value});
      }),
  ];
})();
|
||||
|
||||
// esc HTML-escapes a value so it can be placed in element content or inside a
// double-quoted attribute value. null/undefined become the empty string.
// The ampersand is replaced first so the entities added afterwards are not
// escaped a second time.
function esc(s) {
  return String(s == null ? '' : s)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}
|
||||
|
||||
// renderSrcStatus fills the #fru-src-status bar with one line per source:
// the source chip followed by either the number of available fields, a
// Supermicro licensing hint (when the failure reason mentions activation or
// the SFT-* product keys), or the raw failure reason. An empty list hides
// and clears the bar.
function renderSrcStatus(perSource) {
  var container = document.getElementById('fru-src-status');
  if (!perSource.length) {
    container.style.display = 'none';
    container.innerHTML = '';
    return;
  }

  var licenseRe = /not activated|product key|SFT-DCMS|SFT-OOB/i;
  var lines = perSource.map(function(entry) {
    var text, tint;
    if (entry.ok) {
      text = entry.count + ' field(s) available';
      tint = 'var(--ok-fg,green)';
    } else if (licenseRe.test(entry.reason)) {
      text = 'requires Supermicro license (SFT-OOB-LIC / SFT-DCMS-SINGLE) — activate on BMC';
      tint = 'var(--crit-fg,#9f3a38)';
    } else {
      text = entry.reason || 'unavailable';
      tint = 'var(--muted)';
    }
    return '<div style="display:flex;align-items:center;gap:8px;font-size:12px;margin:3px 0">'
      + '<span class="fru-chip '+entry.src.chipClass+'">'+entry.src.label+'</span>'
      + '<span style="color:'+tint+'">'+esc(text)+'</span>'
      + '</div>';
  });

  container.innerHTML = lines.join('');
  container.style.display = '';
}
|
||||
checkTools();
|
||||
|
||||
// fruAllRead queries every entry of SOURCES in parallel and renders the
// combined result: a per-source status bar plus one table row per returned
// field. Editable fields get an <input> carrying the source's data-*
// attributes; read-only fields are shown as plain text. A source that fails,
// returns a non-array, or returns an empty list contributes no rows.
window.fruAllRead = function() {
  var statusEl = document.getElementById('fru-all-status');
  var tableEl = document.getElementById('fru-all-table');
  statusEl.textContent = 'Reading…';
  statusEl.style.color = 'var(--muted)';
  tableEl.innerHTML = '';

  // Resolves with the parsed JSON body; rejects with the server-provided
  // error text (or the HTTP status text) on a non-2xx reply.
  function load(src) {
    return fetch(src.url, {cache:'no-store'}).then(function(resp) {
      return resp.json().then(function(body) {
        if (!resp.ok) throw new Error(body.error||resp.statusText);
        return body;
      });
    });
  }

  // Builds the <tr> markup for one field of one source.
  function rowHtml(src, f) {
    var val = esc(src.fieldValue(f));
    var ro = src.readOnly(f);
    var attrs = ro ? '' : (' '+src.rowAttrs(f));
    var valueCell;
    if (ro) {
      valueCell = '<span style="font-family:monospace;font-size:13px;color:var(--muted)">'+val+'</span>';
    } else {
      valueCell = '<input class="fru-uni-inp" style="'+_inp+'" value="'+val+'" data-original="'+val+'"'+attrs+' oninput="fruUniChanged(this)">';
    }
    return [
      '<tr>',
      '<td style="white-space:nowrap;padding-right:4px;vertical-align:middle">',
      '<span class="fru-chip '+src.chipClass+'">'+src.label+'</span>',
      '</td>',
      '<td style="color:var(--muted);white-space:nowrap;padding-right:16px;vertical-align:middle;font-size:13px">'+esc(src.fieldName(f))+'</td>',
      '<td style="vertical-align:middle">',
      valueCell,
      '</td>',
      '<td class="fru-uni-act" style="display:none;white-space:nowrap;padding-left:6px;vertical-align:middle">',
      '<button style="'+_actBtn+'color:var(--ok-fg,green);margin-right:3px" title="Save" onclick="fruUniSave(this)">✓</button>',
      '<button style="'+_actBtn+'color:var(--crit-fg,#9f3a38)" title="Cancel" onclick="fruUniCancel(this)">✗</button>',
      '<span class="fru-uni-msg" style="font-size:11px;margin-left:5px;color:var(--muted)"></span>',
      '</td>',
      '</tr>'
    ].join('');
  }

  Promise.allSettled(SOURCES.map(function(src) { return load(src); })).then(function(results) {
    var htmlRows = [];
    var summary = [];

    for (var i = 0; i < results.length; i++) {
      var res = results[i];
      var src = SOURCES[i];
      var rejected = res.status === 'rejected';

      if (rejected || !Array.isArray(res.value) || res.value.length === 0) {
        summary.push({
          src: src,
          ok: false,
          count: 0,
          reason: (rejected && res.reason) ? res.reason.message : 'no editable fields returned'
        });
        continue;
      }

      summary.push({src:src, ok:true, count:res.value.length, reason:''});
      for (var j = 0; j < res.value.length; j++) {
        htmlRows.push(rowHtml(src, res.value[j]));
      }
    }

    renderSrcStatus(summary);

    if (htmlRows.length === 0) {
      statusEl.textContent = 'No editable fields available — see per-source status below.';
      statusEl.style.color = 'var(--crit-fg,#9f3a38)';
      tableEl.innerHTML = '';
      return;
    }

    tableEl.innerHTML = '<table style="width:100%;border-collapse:collapse">'+htmlRows.join('')+'</table>';
    statusEl.textContent = htmlRows.length + ' field(s) loaded';
    statusEl.style.color = 'var(--muted)';
  });
};
|
||||
|
||||
// fruUniChanged is the oninput handler of an editable field: it reveals the
// row's Save/Cancel cell while the value differs from data-original, hides
// it otherwise, and clears any previous status message.
window.fruUniChanged = function(inp) {
  var tr = inp.closest('tr');
  var actions = tr.querySelector('.fru-uni-act');
  if (inp.value !== inp.dataset.original) {
    actions.style.display = '';
  } else {
    actions.style.display = 'none';
  }
  tr.querySelector('.fru-uni-msg').textContent = '';
};
|
||||
|
||||
// fruUniCancel discards a pending edit: the input is reset to the value kept
// in data-original, the Save/Cancel cell is hidden and the message cleared.
window.fruUniCancel = function(btn) {
  var tr = btn.closest('tr');
  var field = tr.querySelector('.fru-uni-inp');
  field.value = field.dataset.original;

  var actions = tr.querySelector('.fru-uni-act');
  actions.style.display = 'none';

  var note = tr.querySelector('.fru-uni-msg');
  note.textContent = '';
};
|
||||
|
||||
// fruUniSave submits the edited value of one row to its source's write
// endpoint, then polls /api/tasks every 1.5 s until the returned task is
// done, failed or cancelled.
//
// On success the submitted value becomes the row's new data-original and the
// Save/Cancel buttons are re-enabled, so a later edit of the same row is
// usable again. On failure the task error (or the request error) is shown
// and the buttons are re-enabled so the user can retry or cancel.
window.fruUniSave = function(btn) {
  var row = btn.closest('tr');
  var inp = row.querySelector('.fru-uni-inp');
  var msg = row.querySelector('.fru-uni-msg');
  var actions = row.querySelector('.fru-uni-act');
  var cancelBtn = row.querySelectorAll('.fru-uni-act button')[1];
  var src = SOURCES.find(function(s){ return s.id === inp.dataset.source; });
  if (!src) { msg.textContent = 'Unknown source'; msg.style.color='var(--crit-fg)'; return; }

  function unlock() {
    btn.disabled = false;
    cancelBtn.disabled = false;
  }

  // Remember what is actually being written: the input stays editable while
  // the task runs, so inp.value may differ by the time the task completes.
  var submitted = inp.value;

  btn.disabled = true; cancelBtn.disabled = true;
  msg.textContent = '…'; msg.style.color = 'var(--muted)';

  fetch(src.writeUrl, {method:'POST', headers:{'Content-Type':'application/json'}, body:src.writeBody(inp)})
    .then(function(r){ return r.json().then(function(d){ if(!r.ok) throw new Error(d.error||r.statusText); return d; }); })
    .then(function(d) {
      // Without a task id there is nothing to poll for; report it instead of
      // spinning forever.
      if (!d || !d.task_id) throw new Error('no task id returned');

      var poll = setInterval(function() {
        fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(tasks){
          var t = Array.isArray(tasks) ? tasks.find(function(x){return x.id===d.task_id;}) : null;
          if (!t) return;
          if (t.status==='done') {
            clearInterval(poll);
            inp.dataset.original = submitted;
            // Keep the actions visible if the user typed something new
            // while the write was in flight.
            actions.style.display = inp.value !== submitted ? '' : 'none';
            msg.textContent = ''; msg.style.color = '';
            unlock();
          } else if (t.status==='failed'||t.status==='cancelled') {
            clearInterval(poll);
            msg.textContent = t.error||t.status; msg.style.color = 'var(--crit-fg)';
            unlock();
          }
        }).catch(function() {
          // A single failed poll is treated as transient; the next tick retries.
        });
      }, 1500);
    })
    .catch(function(e) {
      msg.textContent = 'Error: '+e.message; msg.style.color = 'var(--crit-fg)';
      unlock();
    });
};
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
|
||||
|
||||
115
audit/internal/webui/page_settings.go
Normal file
115
audit/internal/webui/page_settings.go
Normal file
@@ -0,0 +1,115 @@
|
||||
package webui
|
||||
|
||||
import "html"
|
||||
|
||||
func renderSettings(opts HandlerOptions) string {
|
||||
version := opts.BuildLabel
|
||||
if version == "" {
|
||||
version = "dev"
|
||||
}
|
||||
return `<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">System Install</div>
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||
renderInstallInline() + `
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let kind = d.kind || 'unknown';
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let label = kind==='ram'?'RAM':kind==='usb'?'USB ('+source+')':kind==='cdrom'?'CD-ROM ('+source+')':kind==='disk'?'disk ('+source+')':source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
txt.textContent = d.blocked_reason || d.message || 'Checking...';
|
||||
txt.style.color = (d.status==='ok'||d.in_ram)?'var(--ok,green)':d.status==='failed'?'var(--err,#b91c1c)':'var(--muted)';
|
||||
if (d.can_start_task) { btn.style.display=''; btn.disabled=false; } else { btn.style.display='none'; }
|
||||
});
|
||||
function installToRAM() {
|
||||
document.getElementById('ram-install-btn').disabled = true;
|
||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||
window.location.href = '/tasks#' + d.task_id;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||
const rows = tools.map(t =>
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK?'badge-ok':'badge-err')+'">'+(t.OK?'✓ '+t.Path:'✗ missing')+'</span></td></tr>'
|
||||
).join('');
|
||||
document.getElementById('tools-table').innerHTML = '<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||
});
|
||||
}
|
||||
checkTools();
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||
renderNvidiaSelfHealInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Build Info</div>
|
||||
<div class="card-body">
|
||||
<table style="width:auto">
|
||||
<tbody>
|
||||
<tr><td style="color:var(--muted);padding-right:24px">Version</td><td>` + html.EscapeString(version) + `</td></tr>
|
||||
<tr><td style="color:var(--muted);padding-right:24px">Title</td><td>` + html.EscapeString(opts.Title) + `</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Power</div>
|
||||
<div class="card-body">
|
||||
<div style="display:flex;gap:8px;align-items:center">
|
||||
<button class="btn btn-secondary btn-sm" onclick="systemPower('reboot')">Reboot</button>
|
||||
<button class="btn btn-secondary btn-sm" onclick="systemPower('shutdown')">Shutdown</button>
|
||||
<span id="power-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
function systemPower(action) {
|
||||
var label = action === 'reboot' ? 'reboot' : 'shut down';
|
||||
if (!confirm('Are you sure you want to ' + label + ' the server?')) return;
|
||||
var el = document.getElementById('power-status');
|
||||
if (el) el.textContent = action === 'reboot' ? 'Rebooting...' : 'Shutting down...';
|
||||
fetch('/api/system/' + action, {method: 'POST'})
|
||||
.then(function(r) { return r.json(); })
|
||||
.catch(function(e) { if (el) el.textContent = 'Error: ' + e.message; });
|
||||
}
|
||||
</script>
|
||||
|
||||
`
|
||||
}
|
||||
@@ -11,6 +11,13 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// PCI vendor IDs used for GPU classification (source: pci-ids.ucw.cz).
|
||||
const (
|
||||
pciVendorNvidia = 0x10de
|
||||
pciVendorAMD = 0x1002
|
||||
pciVendorAspeed = 0x1a03
|
||||
)
|
||||
|
||||
type validateInventory struct {
|
||||
CPU string
|
||||
Memory string
|
||||
@@ -61,6 +68,14 @@ func validateTotalStressSec(n int) int {
|
||||
}
|
||||
|
||||
func renderValidate(opts HandlerOptions) string {
|
||||
return renderValidateMode(opts, false)
|
||||
}
|
||||
|
||||
func renderValidateStress(opts HandlerOptions) string {
|
||||
return renderValidateMode(opts, true)
|
||||
}
|
||||
|
||||
func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
@@ -69,26 +84,49 @@ func renderValidate(opts HandlerOptions) string {
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
estStr := validateTotalStr
|
||||
if stressDefault {
|
||||
estStr = stressTotalStr
|
||||
}
|
||||
alert := `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>`
|
||||
if stressDefault {
|
||||
alert = `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Stress mode:</strong> Runs extended load tests — CPU stress-ng, memory passes, DCGM targeted diagnostics. Higher wear than Validate.</div>`
|
||||
}
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Validate Profile</div>
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
<div style="margin-top:12px">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
stressOnlyCards := ""
|
||||
if stressDefault {
|
||||
stressOnlyCards = renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`,
|
||||
))
|
||||
}
|
||||
|
||||
satStressModeJS := "function satStressMode() { return false; }"
|
||||
if stressDefault {
|
||||
satStressModeJS = "function satStressMode() { return true; }"
|
||||
}
|
||||
|
||||
return alert + `
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Run All</button>
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">est. ` + estStr + gpuNote + `</span>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
@@ -105,9 +143,9 @@ func renderValidate(opts HandlerOptions) string {
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||
`Collects SMART data and runs a short self-test on each storage device.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme id-ctrl</code>, <code>nvme smart-log</code>, <code>nvme device-self-test -s 1</code>; SATA/SAS: <code>smartctl -H -A</code>, <code>smartctl -t short</code>`,
|
||||
`~2 min per device (NVMe short self-test; SATA/SAS short self-test — duration device-dependent).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
@@ -115,7 +153,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Run All.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||
@@ -136,46 +174,19 @@ func renderValidate(opts HandlerOptions) string {
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||
)) +
|
||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-targeted-power">` +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-pulse">` +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-interconnect">` +
|
||||
stressOnlyCards +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-bandwidth">` +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
@@ -190,36 +201,15 @@ func renderValidate(opts HandlerOptions) string {
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satStressMode() {
|
||||
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||
}
|
||||
function satModeChanged() {
|
||||
const stress = satStressMode();
|
||||
[
|
||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
card.style.opacity = stress ? '1' : '0.5';
|
||||
const hint = document.getElementById(item.hint);
|
||||
if (hint) hint.style.display = stress ? 'none' : '';
|
||||
}
|
||||
});
|
||||
}
|
||||
` + satStressModeJS + `
|
||||
function satLabels() {
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
@@ -634,25 +624,307 @@ func validateFirstNonEmpty(values ...string) string {
|
||||
}
|
||||
|
||||
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||||
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||||
if dev.VendorID != nil && *dev.VendorID == pciVendorAspeed {
|
||||
return false
|
||||
}
|
||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||
isGPUClass := class == "videocontroller" || class == "processingaccelerator" || class == "displaycontroller"
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||||
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorNvidia
|
||||
case "amd":
|
||||
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||||
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||||
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||||
return isGPUClass && (isAMDVendor || isAMDModel)
|
||||
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorAMD
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// renderCheck renders the non-destructive Check page (step 2).
|
||||
// Shows validate-mode tests only: CPU, Memory, Storage, NVIDIA L2, NCCL, NVBandwidth, AMD.
|
||||
// Stress-mode tests (targeted-stress, targeted-power, pulse) are on the Load page.
|
||||
func renderCheck(opts HandlerOptions) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
gpuNote := ""
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Check tests collect diagnostics only — no writes to disks, no sustained load, no hardware wear counters incremented. For stress testing, go to <a href="/burn">4. Burn</a>.</div>
|
||||
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||
<button type="button" class="btn btn-primary" onclick="runAllCheckSAT()">Run All Checks</button>
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">est. ` + validateTotalStr + gpuNote + `</span>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` (stress-ng 60 s).`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` (256 MB × 1 pass).`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Collects SMART health and attributes for each storage device. No self-test is triggered — read-only query only.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme id-ctrl</code>, <code>nvme smart-log</code>; SATA/SAS: <code>smartctl -H -A</code>`,
|
||||
`Seconds — instantaneous device query, no wear counters incremented.`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||
</div>
|
||||
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA check tasks.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks (DCGM Level 2).`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec)+` (Level 2, all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
inv.AMD,
|
||||
`Runs AMD GPU inventory, MEM integrity, and MEM bandwidth checks.`,
|
||||
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||
)) +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
// satLabels returns the display name used for each Check-page SAT target id.
function satLabels() {
  const labels = {};
  labels['nvidia'] = 'Check GPU (DCGM L2)';
  labels['nvidia-interconnect'] = 'NVIDIA Interconnect (NCCL)';
  labels['nvidia-bandwidth'] = 'NVIDIA Bandwidth (NVBandwidth)';
  labels['memory'] = 'Check Memory';
  labels['storage'] = 'Check Storage';
  labels['cpu'] = 'Check CPU';
  labels['amd'] = 'Check AMD GPU';
  labels['amd-mem'] = 'AMD GPU MEM Integrity';
  labels['amd-bandwidth'] = 'AMD GPU MEM Bandwidth';
  return labels;
}
|
||||
// Cached in-flight/settled request for the NVIDIA GPU list (null = not requested yet).
let satNvidiaGPUsPromise = null;

// loadSatNvidiaGPUs fetches /api/gpu/nvidia once and shares the resulting promise
// between callers. It resolves to an array (empty when the payload is not an array)
// and rejects when the request fails or returns a non-2xx status.
function loadSatNvidiaGPUs() {
  if (satNvidiaGPUsPromise) return satNvidiaGPUsPromise;
  satNvidiaGPUsPromise = fetch('/api/gpu/nvidia')
    .then(r => {
      if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
      return r.json();
    })
    .then(list => Array.isArray(list) ? list : [])
    .catch(err => {
      // Do not keep a rejected promise cached: a later call must be able to retry.
      satNvidiaGPUsPromise = null;
      throw err;
    });
  return satNvidiaGPUsPromise;
}
|
||||
// satSelectedGPUIndices returns the numeric values of all checked, enabled
// NVIDIA GPU checkboxes, sorted ascending. Non-numeric values are dropped.
function satSelectedGPUIndices() {
  const picked = [];
  document.querySelectorAll('.sat-nvidia-checkbox').forEach(box => {
    if (!box.checked || box.disabled) return;
    const idx = parseInt(box.value, 10);
    if (!Number.isNaN(idx)) picked.push(idx);
  });
  picked.sort((x, y) => x - y);
  return picked;
}
|
||||
// satUpdateGPUSelectionNote refreshes the hint under the GPU list so it reflects
// the current checkbox selection. Does nothing when the hint element is absent.
function satUpdateGPUSelectionNote() {
  const noteEl = document.getElementById('sat-gpu-selection-note');
  if (!noteEl) return;
  const chosen = satSelectedGPUIndices();
  if (chosen.length) {
    noteEl.textContent = 'Selected GPUs: ' + chosen.join(', ') + '. Multi-GPU tests will use all selected GPUs.';
  } else {
    noteEl.textContent = 'Select at least one NVIDIA GPU to enable NVIDIA check tasks.';
  }
}
|
||||
// satRenderGPUList fills #sat-gpu-list with one pre-checked checkbox row per GPU
// (or a "none detected" message) and then refreshes the selection hint.
// gpus: array of objects read as {index, name, memory_mb}.
function satRenderGPUList(gpus) {
  const root = document.getElementById('sat-gpu-list');
  if (!root) return;
  if (!gpus || !gpus.length) {
    root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
    satUpdateGPUSelectionNote();
    return;
  }
  // Values come from the API response and are placed into markup, so escape
  // them for both text and attribute context before concatenation.
  const esc = v => String(v)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
  root.innerHTML = gpus.map(gpu => {
    const mem = gpu.memory_mb > 0 ? ' · ' + esc(gpu.memory_mb) + ' MiB' : '';
    return '<label class="sat-gpu-row"><input class="sat-nvidia-checkbox" type="checkbox" value="' + esc(gpu.index) + '" checked onchange="satUpdateGPUSelectionNote()"><span><strong>GPU ' + esc(gpu.index) + '</strong> — ' + esc(gpu.name) + mem + '</span></label>';
  }).join('');
  satUpdateGPUSelectionNote();
}
|
||||
// satSelectAllGPUs checks every NVIDIA GPU checkbox and refreshes the selection hint.
function satSelectAllGPUs() {
  const boxes = document.querySelectorAll('.sat-nvidia-checkbox');
  boxes.forEach(box => {
    box.checked = true;
  });
  satUpdateGPUSelectionNote();
}
|
||||
// satSelectNoGPUs unchecks every NVIDIA GPU checkbox and refreshes the selection hint.
function satSelectNoGPUs() {
  const boxes = document.querySelectorAll('.sat-nvidia-checkbox');
  boxes.forEach(box => {
    box.checked = false;
  });
  satUpdateGPUSelectionNote();
}
|
||||
// satGPULoadInit loads the NVIDIA GPU list and renders it. On failure the list
// area shows the error text and the selection hint is refreshed.
function satGPULoadInit() {
  loadSatNvidiaGPUs().then(satRenderGPUList).catch(err => {
    const root = document.getElementById('sat-gpu-list');
    if (root) {
      // Build the message as a text node: the error text must never be parsed as HTML.
      const msg = document.createElement('p');
      msg.style.cssText = 'color:var(--crit-fg);font-size:13px';
      msg.textContent = 'Error: ' + (err && err.message ? err.message : String(err));
      root.innerHTML = '';
      root.appendChild(msg);
    }
    satUpdateGPUSelectionNote();
  });
}
|
||||
// satRequestBody builds the JSON payload for a Check-page run request.
// Defaults: display_name from satLabels() (or 'Check <target>'), stress_mode false,
// and duration 60 for the cpu target. Entries in overrides win over the defaults.
function satRequestBody(target, overrides) {
  const body = {
    display_name: satLabels()[target] || ('Check ' + target),
    stress_mode: false,
  };
  if (target === 'cpu') {
    body.duration = 60;
  }
  if (overrides) {
    for (const key of Object.keys(overrides)) {
      body[key] = overrides[key];
    }
  }
  return body;
}
|
||||
// enqueueSATTarget POSTs a run request for the given target and resolves to the
// parsed JSON response body.
function enqueueSATTarget(target, overrides) {
  const payload = JSON.stringify(satRequestBody(target, overrides));
  const request = {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: payload,
  };
  return fetch('/api/sat/' + target + '/run', request).then(resp => resp.json());
}
|
||||
// streamSATTask shows the output card and streams a task's log into the terminal
// via server-sent events. Any previously open stream is closed first.
// Resolves (never rejects) with {ok, error}: ok is true when the 'done' event
// carries no data; a stream error resolves with error 'stream disconnected'.
function streamSATTask(taskId, title, resetTerminal) {
  if (satES) {
    satES.close();
    satES = null;
  }
  document.getElementById('sat-output').style.display = 'block';
  document.getElementById('sat-title').textContent = '— ' + title;
  const term = document.getElementById('sat-terminal');
  if (resetTerminal) term.textContent = '';
  term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';

  // Append text and keep the newest output scrolled into view.
  const print = text => {
    term.textContent += text;
    term.scrollTop = term.scrollHeight;
  };

  return new Promise(resolve => {
    satES = new EventSource('/api/tasks/' + taskId + '/stream');
    satES.onmessage = ev => print(ev.data + '\n');
    satES.addEventListener('done', ev => {
      satES.close();
      satES = null;
      const failure = ev.data;
      print((failure ? '\nERROR: ' + failure : '\nCompleted.') + '\n');
      resolve({ok: !failure, error: failure || ''});
    });
    satES.onerror = () => {
      if (satES) {
        satES.close();
        satES = null;
      }
      print('\nERROR: stream disconnected.\n');
      resolve({ok: false, error: 'stream disconnected'});
    };
  });
}
|
||||
// selectedAMDValidateTargets returns the AMD target ids whose checkbox exists,
// is checked and is not disabled, in the fixed order amd, amd-mem, amd-bandwidth.
function selectedAMDValidateTargets() {
  const boxes = [
    ['sat-amd-target', 'amd'],
    ['sat-amd-mem-target', 'amd-mem'],
    ['sat-amd-bandwidth-target', 'amd-bandwidth'],
  ];
  const picked = [];
  for (const [elementId, target] of boxes) {
    const box = document.getElementById(elementId);
    if (box && box.checked && !box.disabled) picked.push(target);
  }
  return picked;
}
|
||||
// runSAT runs a single target with the default request body (no overrides).
function runSAT(target) {
  const noOverrides = null;
  return runSATWithOverrides(target, noOverrides);
}
|
||||
// runSATWithOverrides enqueues one target and streams its log into the terminal.
// The title is overrides.display_name when given, otherwise the target id.
// Resolves with {ok, error}. Enqueue failures (network error, unparsable body,
// response without task_id) are printed to the terminal and reported as ok:false
// instead of leaving the terminal stuck on "Enqueuing ..." or streaming a
// non-existent task.
function runSATWithOverrides(target, overrides) {
  const title = (overrides && overrides.display_name) || target;
  document.getElementById('sat-output').style.display = 'block';
  document.getElementById('sat-title').textContent = '— ' + title;
  const term = document.getElementById('sat-terminal');
  term.textContent = 'Enqueuing ' + title + ' test...\n';
  return enqueueSATTarget(target, overrides)
    .then(d => {
      if (!d || !d.task_id) throw new Error('server did not return a task id');
      return streamSATTask(d.task_id, title, false);
    })
    .catch(err => {
      const message = err && err.message ? err.message : String(err);
      term.textContent += '\nERROR: ' + message + '\n';
      term.scrollTop = term.scrollHeight;
      return {ok: false, error: message};
    });
}
|
||||
// runNvidiaFabricValidate runs a multi-GPU fabric test (interconnect or bandwidth)
// on the currently selected NVIDIA GPUs. Alerts and returns undefined when no GPU
// is selected; otherwise returns the run promise, like runNvidiaValidateSet, so
// callers can chain on completion.
function runNvidiaFabricValidate(target) {
  const indices = satSelectedGPUIndices();
  if (!indices.length) {
    alert('No NVIDIA GPUs available.');
    return;
  }
  return runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
}
|
||||
// runNvidiaValidateSet runs an NVIDIA check on the selected GPUs. Alerts and
// returns undefined when nothing is selected; otherwise returns the run promise.
function runNvidiaValidateSet(target) {
  const gpuIndices = satSelectedGPUIndices();
  if (gpuIndices.length === 0) {
    alert('Select at least one NVIDIA GPU.');
    return;
  }
  const overrides = {
    gpu_indices: gpuIndices,
    display_name: satLabels()[target] || target,
  };
  return runSATWithOverrides(target, overrides);
}
|
||||
// runAMDValidateSet runs the ticked AMD checks. Nothing ticked: returns undefined.
// One ticked: delegates to runSAT. Several: runs them strictly one after another,
// enqueuing the next only after the previous task's stream has finished.
function runAMDValidateSet() {
  const queue = selectedAMDValidateTargets();
  if (!queue.length) return;
  if (queue.length === 1) return runSAT(queue[0]);

  const term = document.getElementById('sat-terminal');
  document.getElementById('sat-output').style.display = 'block';
  document.getElementById('sat-title').textContent = '— amd';
  term.textContent = 'Running AMD check set...\n';

  const names = satLabels();
  function step(pos) {
    if (pos >= queue.length) return Promise.resolve();
    const target = queue[pos];
    term.textContent += '\n[' + (pos + 1) + '/' + queue.length + '] ' + names[target] + '\n';
    return enqueueSATTarget(target)
      .then(task => streamSATTask(task.task_id, names[target], false))
      .then(() => step(pos + 1));
  }
  return step(0);
}
|
||||
// runAllCheckSAT enqueues every applicable check in order: cpu, memory, storage,
// then the NVIDIA targets (only when GPUs are selected and the card's Run button
// is not disabled), then the ticked AMD targets. Requests are sent one at a time;
// progress and errors are shown in #sat-all-status.
function runAllCheckSAT() {
  const status = document.getElementById('sat-all-status');
  status.textContent = 'Enqueuing...';

  const gpuIndices = satSelectedGPUIndices();
  const queue = ['cpu', 'memory', 'storage'].map(t => ({target: t}));
  if (gpuIndices.length) {
    const names = satLabels();
    for (const t of ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth']) {
      const btn = document.getElementById('sat-btn-' + t);
      if (btn && btn.disabled) continue;
      queue.push({target: t, overrides: {gpu_indices: gpuIndices, display_name: names[t] || t}});
    }
  }
  for (const t of selectedAMDValidateTargets()) {
    queue.push({target: t});
  }
  if (!queue.length) {
    status.textContent = 'No tasks selected.';
    return;
  }

  const total = queue.length;
  function step(pos) {
    if (pos >= total) {
      status.textContent = 'Completed ' + total + ' task(s).';
      return Promise.resolve();
    }
    const job = queue[pos];
    status.textContent = 'Running ' + (pos + 1) + '/' + total + '...';
    return enqueueSATTarget(job.target, job.overrides).then(() => step(pos + 1));
  }
  step(0).catch(err => {
    status.textContent = 'Error: ' + err.message;
  });
}
|
||||
// disableSATCard greys out the Run button of card `id` and shows `reason` in a
// '.sat-unavail' note at the top of the card body (created on first use).
// Does nothing when the button does not exist.
function disableSATCard(id, reason) {
  const btn = document.getElementById('sat-btn-' + id);
  if (!btn) return;
  btn.disabled = true;
  btn.title = reason;
  btn.style.opacity = '0.4';

  const card = btn.closest('.card');
  if (!card) return;

  let note = card.querySelector('.sat-unavail');
  if (!note) {
    note = document.createElement('p');
    note.className = 'sat-unavail';
    note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
    const body = card.querySelector('.card-body');
    if (body) {
      body.insertBefore(note, body.firstChild);
    }
  }
  note.textContent = reason;
}
|
||||
// Page init: disable the cards (and AMD sub-check boxes) for GPU vendors that
// /api/gpu/presence reports as absent, then load the NVIDIA GPU list.
fetch('/api/gpu/presence')
  .then(resp => resp.json())
  .then(presence => {
    if (!presence.nvidia) {
      for (const id of ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth']) {
        disableSATCard(id, 'No NVIDIA GPU detected');
      }
    }
    if (presence.amd) return;
    disableSATCard('amd', 'No AMD GPU detected');
    for (const boxId of ['sat-amd-target', 'sat-amd-mem-target', 'sat-amd-bandwidth-target']) {
      const box = document.getElementById(boxId);
      if (box) {
        box.disabled = true;
        box.checked = false;
      }
    }
  });
satGPULoadInit();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||
actions := `<button id="sat-btn-` + id + `" class="btn btn-primary btn-sm" onclick="` + runAction + `">Run</button>`
|
||||
if strings.TrimSpace(headerActions) != "" {
|
||||
|
||||
@@ -5,7 +5,9 @@ import (
|
||||
"fmt"
|
||||
"html"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
@@ -22,41 +24,54 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
body = renderDashboard(opts)
|
||||
case "audit":
|
||||
pageID = "audit"
|
||||
title = "Audit"
|
||||
title = "1. Audit"
|
||||
body = renderAudit()
|
||||
case "validate":
|
||||
pageID = "validate"
|
||||
title = "Validate"
|
||||
body = renderValidate(opts)
|
||||
case "check":
|
||||
pageID = "check"
|
||||
title = "2. Check"
|
||||
body = renderCheck(opts)
|
||||
case "load":
|
||||
pageID = "load"
|
||||
title = "3. Load"
|
||||
body = renderValidateStress(opts)
|
||||
case "burn":
|
||||
pageID = "burn"
|
||||
title = "Burn"
|
||||
title = "4. Burn"
|
||||
body = renderBurn()
|
||||
case "benchmark":
|
||||
pageID = "benchmark"
|
||||
title = "Benchmark"
|
||||
title = "5. Benchmark"
|
||||
body = renderBenchmark(opts)
|
||||
case "tools":
|
||||
pageID = "tools"
|
||||
title = "Tools"
|
||||
body = renderTools()
|
||||
case "settings":
|
||||
pageID = "settings"
|
||||
title = "Settings"
|
||||
body = renderSettings(opts)
|
||||
// Legacy routes (redirected at HTTP level in handlePage; these are fallbacks)
|
||||
case "validate", "tests":
|
||||
pageID = "load"
|
||||
title = "3. Load"
|
||||
body = renderValidate(opts)
|
||||
case "burn-in":
|
||||
pageID = "burn"
|
||||
title = "4. Burn"
|
||||
body = renderBurn()
|
||||
case "speed", "endurance":
|
||||
pageID = "benchmark"
|
||||
title = "5. Benchmark"
|
||||
body = renderBenchmark(opts)
|
||||
case "tasks":
|
||||
pageID = "tasks"
|
||||
title = "Tasks"
|
||||
body = renderTasks()
|
||||
case "tools":
|
||||
pageID = "tools"
|
||||
title = "Tools"
|
||||
body = renderTools()
|
||||
// Legacy routes kept accessible but not in nav
|
||||
// Hidden pages (not in nav, accessible by direct URL)
|
||||
case "metrics":
|
||||
pageID = "metrics"
|
||||
title = "Live Metrics"
|
||||
body = renderMetrics()
|
||||
case "tests":
|
||||
pageID = "validate"
|
||||
title = "Acceptance Tests"
|
||||
body = renderValidate(opts)
|
||||
case "burn-in":
|
||||
pageID = "burn"
|
||||
title = "Burn-in Tests"
|
||||
body = renderBurn()
|
||||
case "network":
|
||||
pageID = "network"
|
||||
title = "Network"
|
||||
@@ -85,6 +100,7 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
body +
|
||||
`</div></div>` +
|
||||
renderAuditModal() +
|
||||
`<dialog id="component-detail-dialog" style="min-width:600px;max-width:900px;width:90vw;padding:0;border:1px solid var(--border);border-radius:8px;background:var(--surface)"><div id="component-detail-body" style="padding-bottom:20px"></div></dialog>` +
|
||||
`<script>
|
||||
// Add copy button to every .terminal on the page
|
||||
document.querySelectorAll('.terminal').forEach(function(t){
|
||||
@@ -94,6 +110,17 @@ document.querySelectorAll('.terminal').forEach(function(t){
|
||||
btn.onclick=function(){navigator.clipboard.writeText(t.textContent).then(function(){btn.textContent='Copied!';setTimeout(function(){btn.textContent='Copy';},1500);});};
|
||||
w.appendChild(btn);
|
||||
});
|
||||
// openComponentDetail opens the component detail dialog and fills it with the
// HTML fragment served at /api/components/<type>. A failed request or non-2xx
// response shows a generic error instead of the response body.
function openComponentDetail(type) {
  var dlg = document.getElementById('component-detail-dialog');
  var body = document.getElementById('component-detail-body');
  body.innerHTML = '<div style="padding:20px;color:var(--muted)">Loading…</div>';
  // showModal() throws InvalidStateError when the dialog is already open.
  if (!dlg.open) { dlg.showModal(); }
  fetch('/api/components/' + encodeURIComponent(type)).then(function(r){
    if (!r.ok) { throw new Error('HTTP ' + r.status); }
    return r.text();
  }).then(function(html){
    body.innerHTML = html;
  }).catch(function(){
    body.innerHTML = '<div style="padding:20px;color:var(--crit-fg)">Error loading details.</div>';
  });
}
|
||||
</script>` +
|
||||
`</body></html>`
|
||||
}
|
||||
@@ -106,6 +133,14 @@ func renderDashboard(opts HandlerOptions) string {
|
||||
b.WriteString(renderHardwareSummaryCard(opts))
|
||||
b.WriteString(renderHealthCard(opts))
|
||||
b.WriteString(renderMetrics())
|
||||
b.WriteString(`<script>
|
||||
// Refresh the hardware summary card every 30 s. Only a successful response may
// replace the card: swapping in an error body would drop the #hw-summary-card
// element and silently stop all later refreshes.
setInterval(function(){
  fetch('/api/hardware-summary').then(function(r){
    if(!r.ok){throw new Error('HTTP '+r.status);}
    return r.text();
  }).then(function(html){
    var el=document.getElementById('hw-summary-card');
    if(el){el.outerHTML=html;}
  }).catch(function(){});
},30000);
|
||||
</script>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
@@ -184,13 +219,14 @@ func renderAudit() string {
|
||||
}
|
||||
|
||||
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
const cardID = ` id="hw-summary-card"`
|
||||
data, err := loadSnapshot(opts.AuditPath)
|
||||
if err != nil {
|
||||
return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
||||
return `<div class="card"` + cardID + `><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
||||
}
|
||||
var ingest schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(data, &ingest); err != nil {
|
||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||
return `<div class="card"` + cardID + `><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||
}
|
||||
hw := ingest.Hardware
|
||||
|
||||
@@ -200,7 +236,7 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
||||
b.WriteString(`<div class="card"` + cardID + `><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
||||
|
||||
// Server identity block above the component table.
|
||||
{
|
||||
@@ -229,22 +265,32 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
}
|
||||
|
||||
b.WriteString(`<table style="width:auto">`)
|
||||
writeRow := func(label, value, badgeHTML string) {
|
||||
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
||||
// writeRow renders one component row. compType is the URL path segment for the detail
|
||||
// endpoint (e.g. "cpu"). Pass "" for rows that have no detail view.
|
||||
writeRow := func(label, value, badgeHTML, compType string) {
|
||||
var labelHTML string
|
||||
if compType != "" {
|
||||
labelHTML = fmt.Sprintf(
|
||||
`<span style="cursor:pointer;text-decoration:underline dotted;text-underline-offset:3px" onclick="openComponentDetail('%s')">%s</span>`,
|
||||
compType, html.EscapeString(label))
|
||||
} else {
|
||||
labelHTML = html.EscapeString(label)
|
||||
}
|
||||
fmt.Fprintf(&b, `<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||
labelHTML, html.EscapeString(value), badgeHTML)
|
||||
}
|
||||
|
||||
writeRow("CPU", hwDescribeCPU(hw),
|
||||
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)))
|
||||
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)), "cpu")
|
||||
|
||||
writeRow("Memory", hwDescribeMemory(hw),
|
||||
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})))
|
||||
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})), "memory")
|
||||
|
||||
writeRow("Storage", hwDescribeStorage(hw),
|
||||
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})))
|
||||
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})), "storage")
|
||||
|
||||
writeRow("GPU", hwDescribeGPU(hw),
|
||||
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})))
|
||||
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})), "gpu")
|
||||
|
||||
psuMatched := matchedRecords(records, nil, []string{"psu:"})
|
||||
if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
|
||||
@@ -252,10 +298,10 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
psuStatus := hwPSUStatus(hw.PowerSupplies)
|
||||
psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
|
||||
}
|
||||
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched))
|
||||
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched), "psu")
|
||||
|
||||
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
||||
writeRow("Network", nicDesc, "")
|
||||
writeRow("Network", nicDesc, "", "")
|
||||
}
|
||||
|
||||
b.WriteString(`</table>`)
|
||||
@@ -614,7 +660,7 @@ func buildRuntimeNetworkRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
if status == "" {
|
||||
status = "UNKNOWN"
|
||||
}
|
||||
issue := runtimeIssueDescriptions(health.Issues, "dhcp_partial", "dhcp_failed")
|
||||
issue := runtimeIssueDescriptions(health.Issues, "dhcp_failed")
|
||||
return runtimeHealthRow{Title: "Network", Status: status, Source: "ListInterfaces / DHCP", Issue: issue}
|
||||
}
|
||||
|
||||
@@ -672,12 +718,12 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
nonActive := make([]string, 0)
|
||||
for _, svc := range health.Services {
|
||||
state := strings.TrimSpace(strings.ToLower(svc.Status))
|
||||
// "activating" and "deactivating" are transient states for oneshot services
|
||||
// (RemainAfterExit=yes) — the service is running normally, not failed.
|
||||
// Only "failed" and "inactive" (after services should be running) are problems.
|
||||
// "inactive" is OK for oneshot services that have completed successfully
|
||||
// (bee-sshsetup, bee-preflight, bee-audit, bee-network, etc.).
|
||||
// Only "failed" is a genuine problem.
|
||||
switch state {
|
||||
case "active", "activating", "deactivating", "reloading":
|
||||
// OK — service is running or transitioning normally
|
||||
case "active", "activating", "deactivating", "reloading", "inactive":
|
||||
// OK — service is running, transitioning normally, or completed successfully
|
||||
default:
|
||||
nonActive = append(nonActive, svc.Name+"="+svc.Status)
|
||||
}
|
||||
@@ -999,3 +1045,200 @@ func rowIssueHTML(issue string) string {
|
||||
}
|
||||
return html.EscapeString(issue)
|
||||
}
|
||||
|
||||
// aerStatusRe captures the hex digits of an "aer_status: 0x…" field.
var aerStatusRe = regexp.MustCompile(`aer_status:\s*0x([0-9a-fA-F]{1,8})`)

// decodeAERStatus parses the first AER status hex value found in a kernel error
// detail string and returns a comma-separated list of the set bit names followed
// by " (correctable)" or " (uncorrectable)".
//
// The detail string does not say which register the value came from, so the
// value is decoded against both the Correctable and the Uncorrectable Error
// Status layouts and the interpretation that names more bits is reported
// (correctable wins ties). It returns "" when no AER status is present and
// "unknown bits: 0x…" when no known bit is set.
func decodeAERStatus(detail string) string {
	m := aerStatusRe.FindStringSubmatch(detail)
	if m == nil {
		return ""
	}
	v64, err := strconv.ParseUint(m[1], 16, 32)
	if err != nil {
		return ""
	}
	val := uint32(v64)

	type bitDef struct {
		bit  uint32
		name string
	}
	// Bit positions follow the PCIe Correctable Error Status Register layout.
	corrBits := []bitDef{
		{0, "Receiver Error"}, {6, "Bad TLP"}, {7, "Bad DLLP"},
		{8, "Replay Num Rollover"}, {12, "Replay Timer Timeout"},
		{13, "Advisory Non-Fatal"}, {14, "Corrected Internal Error"}, {15, "Header Log Overflow"},
	}
	// Bit positions follow the PCIe Uncorrectable Error Status Register layout.
	uncorrBits := []bitDef{
		{4, "Data Link Protocol Error"}, {5, "Surprise Down Error"},
		{12, "Poisoned TLP Received"}, {13, "Flow Control Protocol Error"},
		{14, "Completion Timeout"}, {15, "Completer Abort"}, {16, "Unexpected Completion"},
		{17, "Receiver Overflow"}, {18, "Malformed TLP"}, {19, "ECRC Error"},
		{20, "Unsupported Request Error"}, {21, "ACS Violation"}, {22, "Uncorrectable Internal Error"},
	}

	// setNames lists the names of all bits from defs that are set in val.
	setNames := func(defs []bitDef) []string {
		var names []string
		for _, d := range defs {
			if val&(1<<d.bit) != 0 {
				names = append(names, d.name)
			}
		}
		return names
	}
	corrNames := setNames(corrBits)
	uncorrNames := setNames(uncorrBits)

	switch {
	case len(corrNames) > 0 && len(corrNames) >= len(uncorrNames):
		return strings.Join(corrNames, ", ") + " (correctable)"
	case len(uncorrNames) > 0:
		return strings.Join(uncorrNames, ", ") + " (uncorrectable)"
	default:
		return fmt.Sprintf("unknown bits: 0x%08x", val)
	}
}
|
||||
|
||||
// renderSparkline returns a small inline SVG showing non-OK events over time.
|
||||
// Events are positioned proportionally along the time axis; if all share the same
|
||||
// timestamp they are spaced evenly. Width is always 100px.
|
||||
func renderSparkline(history []app.ComponentStatusEntry) string {
|
||||
const (
|
||||
svgW = 100
|
||||
svgH = 20
|
||||
barW = 3
|
||||
barH = 14
|
||||
)
|
||||
var events []app.ComponentStatusEntry
|
||||
for _, e := range history {
|
||||
if e.Status != "OK" {
|
||||
events = append(events, e)
|
||||
}
|
||||
}
|
||||
if len(events) == 0 {
|
||||
return ""
|
||||
}
|
||||
n := len(events)
|
||||
barColor := func(status string) string {
|
||||
if status == "Critical" {
|
||||
return "#c0392b"
|
||||
}
|
||||
return "#d97706"
|
||||
}
|
||||
yTop := (svgH - barH) / 2
|
||||
|
||||
var bars strings.Builder
|
||||
if n == 1 {
|
||||
x := (svgW - barW) / 2
|
||||
fmt.Fprintf(&bars, `<rect x="%d" y="%d" width="%d" height="%d" fill="%s" rx="1"/>`,
|
||||
x, yTop, barW, barH, barColor(events[0].Status))
|
||||
} else {
|
||||
minT := events[0].At
|
||||
maxT := events[n-1].At
|
||||
dur := maxT.Sub(minT).Seconds()
|
||||
for i, e := range events {
|
||||
var x int
|
||||
if dur <= 0 {
|
||||
step := svgW / n
|
||||
x = i*step + (step-barW)/2
|
||||
} else {
|
||||
frac := e.At.Sub(minT).Seconds() / dur
|
||||
x = int(frac * float64(svgW-barW))
|
||||
}
|
||||
fmt.Fprintf(&bars, `<rect x="%d" y="%d" width="%d" height="%d" fill="%s" rx="1"/>`,
|
||||
x, yTop, barW, barH, barColor(e.Status))
|
||||
}
|
||||
}
|
||||
return fmt.Sprintf(
|
||||
`<svg width="%d" height="%d" style="display:inline-block;vertical-align:middle;margin-left:6px;flex-shrink:0" xmlns="http://www.w3.org/2000/svg">`+
|
||||
`<rect x="0" y="0" width="%d" height="%d" fill="var(--surface-alt,#ebebeb)" rx="3"/>%s</svg>`,
|
||||
svgW, svgH, svgW, svgH, bars.String())
|
||||
}
|
||||
|
||||
// renderComponentDetail renders a modal content fragment for one component type.
|
||||
// Called by handleAPIComponentDetail and displayed inside #component-detail-dialog.
|
||||
func renderComponentDetail(title string, records []app.ComponentStatusRecord) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, `<div style="padding:20px 24px 0">`)
|
||||
fmt.Fprintf(&b, `<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:16px">`)
|
||||
fmt.Fprintf(&b, `<span style="font-size:16px;font-weight:700">%s — Status Detail</span>`, html.EscapeString(title))
|
||||
b.WriteString(`<button class="btn btn-sm btn-secondary" onclick="document.getElementById('component-detail-dialog').close()">Close</button>`)
|
||||
b.WriteString(`</div>`)
|
||||
|
||||
if len(records) == 0 {
|
||||
b.WriteString(`<p style="color:var(--muted)">No status data recorded yet for this component type.</p>`)
|
||||
b.WriteString(`</div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
sort.Slice(records, func(i, j int) bool {
|
||||
return records[i].ComponentKey < records[j].ComponentKey
|
||||
})
|
||||
|
||||
for _, rec := range records {
|
||||
letter, cls := chipLetterClass(rec.Status)
|
||||
|
||||
// Count non-OK events across the full history for the badge + sparkline.
|
||||
warnCount := 0
|
||||
for _, e := range rec.History {
|
||||
if e.Status != "OK" {
|
||||
warnCount++
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, `<div style="margin-bottom:20px">`)
|
||||
fmt.Fprintf(&b, `<div style="display:flex;align-items:center;gap:8px;margin-bottom:8px;flex-wrap:wrap">`)
|
||||
fmt.Fprintf(&b, `<span class="chip %s">%s</span>`, cls, letter)
|
||||
fmt.Fprintf(&b, `<span style="font-weight:700;font-size:13px">%s</span>`, html.EscapeString(rec.ComponentKey))
|
||||
if !rec.LastCheckedAt.IsZero() {
|
||||
fmt.Fprintf(&b, `<span style="color:var(--muted);font-size:12px">checked %s</span>`, rec.LastCheckedAt.Format("2006-01-02 15:04:05"))
|
||||
}
|
||||
if warnCount > 0 {
|
||||
noun := "events"
|
||||
if warnCount == 1 {
|
||||
noun = "event"
|
||||
}
|
||||
fmt.Fprintf(&b,
|
||||
`<span style="font-size:11px;background:var(--warn-bg,#fffbeb);color:var(--warn-fg,#92400e);border:1px solid var(--warn-border,#fde68a);border-radius:10px;padding:1px 7px;white-space:nowrap">%d %s</span>`,
|
||||
warnCount, noun)
|
||||
b.WriteString(renderSparkline(rec.History))
|
||||
}
|
||||
b.WriteString(`</div>`)
|
||||
|
||||
if rec.ErrorSummary != "" {
|
||||
fmt.Fprintf(&b, `<div style="font-size:12px;margin-bottom:4px;color:var(--muted)">%s</div>`, html.EscapeString(rec.ErrorSummary))
|
||||
if decoded := decodeAERStatus(rec.ErrorSummary); decoded != "" {
|
||||
fmt.Fprintf(&b,
|
||||
`<div style="font-size:12px;margin-bottom:8px;color:var(--muted)"><span style="background:var(--surface-alt,#f5f5f5);border-radius:4px;padding:1px 6px;font-family:monospace">AER: %s</span></div>`,
|
||||
html.EscapeString(decoded))
|
||||
}
|
||||
}
|
||||
|
||||
// History table — newest first, cap at 20 entries.
|
||||
history := rec.History
|
||||
if len(history) > 20 {
|
||||
history = history[len(history)-20:]
|
||||
}
|
||||
b.WriteString(`<table style="width:100%;font-size:12px;border-collapse:collapse">`)
|
||||
b.WriteString(`<tr style="color:var(--muted)"><th style="text-align:left;padding:2px 10px 2px 0;white-space:nowrap">Time</th><th style="text-align:left;padding:2px 10px 2px 0">Status</th><th style="text-align:left;padding:2px 10px 2px 0">Source</th><th style="text-align:left;padding:2px 0">Detail</th></tr>`)
|
||||
for i := len(history) - 1; i >= 0; i-- {
|
||||
e := history[i]
|
||||
eLetter, eCls := chipLetterClass(e.Status)
|
||||
detail := e.Detail
|
||||
if detail == "" {
|
||||
detail = "—"
|
||||
}
|
||||
fmt.Fprintf(&b,
|
||||
`<tr><td style="padding:3px 10px 3px 0;white-space:nowrap;color:var(--muted)">%s</td><td style="padding:3px 10px 3px 0"><span class="chip %s" style="font-size:10px;width:16px;height:16px">%s</span></td><td style="padding:3px 10px 3px 0;white-space:nowrap">%s</td><td style="padding:3px 0;color:var(--muted)">%s</td></tr>`,
|
||||
html.EscapeString(e.At.Format("2006-01-02 15:04:05")),
|
||||
eCls, eLetter,
|
||||
html.EscapeString(e.Source),
|
||||
html.EscapeString(detail),
|
||||
)
|
||||
}
|
||||
b.WriteString(`</table>`)
|
||||
b.WriteString(`</div>`)
|
||||
}
|
||||
|
||||
b.WriteString(`</div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
857
audit/internal/webui/raid_mgmt.go
Normal file
857
audit/internal/webui/raid_mgmt.go
Normal file
@@ -0,0 +1,857 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// --- Response types ---
|
||||
|
||||
// JSON payload types returned by the RAID status endpoint.
type (
	// raidDriveInfo describes one physical drive. The LSI detector fills
	// Slot (storcli "EID:Slt") and SizeGB; the VROC detector fills Device
	// ("/dev/<name>"). Empty fields are omitted from the JSON.
	raidDriveInfo struct {
		Slot   string  `json:"slot,omitempty"`
		Device string  `json:"device,omitempty"`
		Model  string  `json:"model,omitempty"`
		SizeGB float64 `json:"size_gb,omitempty"`
		Serial string  `json:"serial,omitempty"`
		State  string  `json:"state,omitempty"`
	}

	// raidArrayInfo describes one md array as parsed from /proc/mdstat.
	raidArrayInfo struct {
		Name     string   `json:"name"`
		Level    string   `json:"level,omitempty"`
		Members  []string `json:"members"`
		Degraded bool     `json:"degraded"`
	}

	// raidControllerInfo is one detected controller. ID has the form
	// "lsi-<n>" or "vroc-0" and Type is "lsi" or "vroc". The three drive
	// lists are always serialised (the detectors initialise them to empty
	// slices); Arrays is omitted when empty.
	raidControllerInfo struct {
		ID            string          `json:"id"`
		Type          string          `json:"type"`
		Index         int             `json:"index"`
		Model         string          `json:"model"`
		ForeignDrives []raidDriveInfo `json:"foreign_drives"`
		FreeDrives    []raidDriveInfo `json:"free_drives"`
		AllDrives     []raidDriveInfo `json:"all_drives"`
		Arrays        []raidArrayInfo `json:"arrays,omitempty"`
	}

	// raidStatusResp is the top-level response body of the status endpoint.
	raidStatusResp struct {
		Controllers []raidControllerInfo `json:"controllers"`
	}
)
|
||||
|
||||
// --- LSI/storcli detection ---
|
||||
|
||||
func detectLSIControllers() []raidControllerInfo {
|
||||
ctrlOut, err := exec.Command("storcli64", "/call", "show", "J").Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var ctrlDoc struct {
|
||||
Controllers []struct {
|
||||
ResponseData struct {
|
||||
Basics struct {
|
||||
Controller int `json:"Controller"`
|
||||
Model string `json:"Model"`
|
||||
} `json:"Basics"`
|
||||
} `json:"Response Data"`
|
||||
} `json:"Controllers"`
|
||||
}
|
||||
if err := json.Unmarshal(ctrlOut, &ctrlDoc); err != nil || len(ctrlDoc.Controllers) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
driveOut, _ := exec.Command("storcli64", "/call/eall/sall", "show", "all", "J").Output()
|
||||
|
||||
var driveDoc struct {
|
||||
Controllers []struct {
|
||||
ResponseData struct {
|
||||
DriveInformation []struct {
|
||||
EIDSlt string `json:"EID:Slt"`
|
||||
State string `json:"State"`
|
||||
Size string `json:"Size"`
|
||||
Intf string `json:"Intf"`
|
||||
Med string `json:"Med"`
|
||||
Model string `json:"Model"`
|
||||
SN string `json:"SN"`
|
||||
} `json:"Drive Information"`
|
||||
} `json:"Response Data"`
|
||||
} `json:"Controllers"`
|
||||
}
|
||||
if len(driveOut) > 0 {
|
||||
json.Unmarshal(driveOut, &driveDoc) //nolint:errcheck
|
||||
}
|
||||
|
||||
var controllers []raidControllerInfo
|
||||
for i, c := range ctrlDoc.Controllers {
|
||||
ctrl := raidControllerInfo{
|
||||
ID: fmt.Sprintf("lsi-%d", c.ResponseData.Basics.Controller),
|
||||
Type: "lsi",
|
||||
Index: c.ResponseData.Basics.Controller,
|
||||
Model: c.ResponseData.Basics.Model,
|
||||
ForeignDrives: []raidDriveInfo{},
|
||||
FreeDrives: []raidDriveInfo{},
|
||||
AllDrives: []raidDriveInfo{},
|
||||
}
|
||||
if ctrl.Model == "" {
|
||||
ctrl.Model = fmt.Sprintf("LSI Controller %d", ctrl.Index)
|
||||
}
|
||||
|
||||
if i < len(driveDoc.Controllers) {
|
||||
for _, d := range driveDoc.Controllers[i].ResponseData.DriveInformation {
|
||||
info := raidDriveInfo{
|
||||
Slot: strings.TrimSpace(d.EIDSlt),
|
||||
Model: strings.TrimSpace(d.Model),
|
||||
State: strings.TrimSpace(d.State),
|
||||
SizeGB: raidParseHumanSizeGB(d.Size),
|
||||
Serial: strings.TrimSpace(d.SN),
|
||||
}
|
||||
ctrl.AllDrives = append(ctrl.AllDrives, info)
|
||||
switch strings.TrimSpace(d.State) {
|
||||
case "Frgn":
|
||||
ctrl.ForeignDrives = append(ctrl.ForeignDrives, info)
|
||||
case "UGood", "JBOD":
|
||||
ctrl.FreeDrives = append(ctrl.FreeDrives, info)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
controllers = append(controllers, ctrl)
|
||||
}
|
||||
return controllers
|
||||
}
|
||||
|
||||
// --- VROC/mdadm detection ---
|
||||
|
||||
// raidMDStatDegradedRx matches the per-member health map on an array's
// status line, e.g. "[UU]" or "[U_]". An underscore marks a missing member.
var raidMDStatDegradedRx = regexp.MustCompile(`\[[U_]+\]`)

// raidMDStatMemberRx matches one member token on an array's header line and
// captures the device name. The role index in brackets may be followed by
// one or more single-letter flags, e.g. "sda1[0]", "nvme0n1[1](S)",
// "sdb[2](W)(F)".
var raidMDStatMemberRx = regexp.MustCompile(`^([^\[\]\s]+)\[\d+\](?:\([A-Za-z]\))*$`)

// mdStatEntry is one array parsed from /proc/mdstat.
type mdStatEntry struct {
	Name     string   // array name, e.g. "md127"
	Level    string   // token starting with "raid" or "linear"; empty if none (e.g. inactive arrays)
	Members  []string // member device names without the "[n]" role index or flags
	Degraded bool     // true when the status map contains "_"
}

// parseRAIDMDStat parses the text of /proc/mdstat into one entry per array.
//
// A line containing " : " starts a new array (the "Personalities" and
// "unused devices" lines are skipped). Lines that follow belong to that array
// and are scanned for the "[UU_]" health map.
//
// Members carrying flag suffixes such as "(S)" (spare) or "(F)" (faulty) are
// included in Members: they are still attached to the array and must not be
// treated as free disks. This matters for IMSM containers, whose members are
// all listed with "(S)".
func parseRAIDMDStat(raw string) []mdStatEntry {
	var entries []mdStatEntry
	for _, line := range strings.Split(raw, "\n") {
		if strings.HasPrefix(line, "Personalities") || strings.HasPrefix(line, "unused devices") {
			continue
		}

		if idx := strings.Index(line, " : "); idx > 0 {
			entry := mdStatEntry{Name: strings.TrimSpace(line[:idx])}
			for _, tok := range strings.Fields(line[idx+3:]) {
				if m := raidMDStatMemberRx.FindStringSubmatch(tok); m != nil {
					entry.Members = append(entry.Members, m[1])
					continue
				}
				if strings.HasPrefix(tok, "raid") || strings.HasPrefix(tok, "linear") {
					entry.Level = tok
				}
			}
			entries = append(entries, entry)
			continue
		}

		// Continuation line: attribute it to the most recent array. Index
		// access avoids holding a pointer into a slice that is still growing.
		if len(entries) == 0 {
			continue
		}
		if strings.Contains(raidMDStatDegradedRx.FindString(line), "_") {
			entries[len(entries)-1].Degraded = true
		}
	}
	return entries
}
|
||||
|
||||
// raidVROCPortRx picks the first non-space token after "Port<n> :" on a line
// of `mdadm --detail-platform` output, for example
//
//	Port2 : /dev/sda (SERIAL123)
//	Port3 : - no device attached -
var raidVROCPortRx = regexp.MustCompile(`^\s*Port\d+\s*:\s*(\S+)`)

// parseVROCPorts extracts, from `mdadm --detail-platform` output, the block
// device basenames (e.g. "sda") listed on the controller's "Port<n>" lines.
// Port lines whose value is not a /dev/ path (such as "- no device attached
// -") are ignored, so disks that are not reported on a port never appear in
// the result. The returned map is never nil.
func parseVROCPorts(raw string) map[string]bool {
	const devDir = "/dev/"

	wired := make(map[string]bool)
	lines := strings.Split(raw, "\n")
	for i := range lines {
		groups := raidVROCPortRx.FindStringSubmatch(lines[i])
		if len(groups) < 2 {
			continue // not a port line
		}
		if path := groups[1]; strings.HasPrefix(path, devDir) {
			wired[path[len(devDir):]] = true
		}
	}
	return wired
}
|
||||
|
||||
func detectVROCController() *raidControllerInfo {
|
||||
out, err := exec.Command("mdadm", "--detail-platform").CombinedOutput()
|
||||
if err != nil && len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
hasVROC := false
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
lower := strings.ToLower(line)
|
||||
if strings.Contains(lower, "license") || strings.Contains(lower, "intel") || strings.Contains(lower, "platform") {
|
||||
hasVROC = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasVROC {
|
||||
return nil
|
||||
}
|
||||
|
||||
ctrl := &raidControllerInfo{
|
||||
ID: "vroc-0",
|
||||
Type: "vroc",
|
||||
Model: "Intel VROC",
|
||||
ForeignDrives: []raidDriveInfo{},
|
||||
FreeDrives: []raidDriveInfo{},
|
||||
AllDrives: []raidDriveInfo{},
|
||||
}
|
||||
|
||||
ports := parseVROCPorts(string(out))
|
||||
// Some mdadm builds omit the "Port" lines from --detail-platform. When
|
||||
// we can't determine which drives are actually wired to this
|
||||
// controller, fall back to showing every disk not already in an array
|
||||
// rather than hiding everything.
|
||||
portsKnown := len(ports) > 0
|
||||
|
||||
inArray := map[string]bool{}
|
||||
raw, err := os.ReadFile("/proc/mdstat")
|
||||
if err == nil {
|
||||
for _, arr := range parseRAIDMDStat(string(raw)) {
|
||||
ctrl.Arrays = append(ctrl.Arrays, raidArrayInfo{
|
||||
Name: arr.Name,
|
||||
Level: arr.Level,
|
||||
Members: arr.Members,
|
||||
Degraded: arr.Degraded,
|
||||
})
|
||||
for _, m := range arr.Members {
|
||||
inArray[m] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lsblkOut, err := exec.Command("lsblk", "-J", "-d", "-o", "NAME,SIZE,TYPE,MODEL,SERIAL").Output()
|
||||
if err == nil {
|
||||
var lsblkDoc struct {
|
||||
BlockDevices []struct {
|
||||
Name string `json:"name"`
|
||||
Size string `json:"size"`
|
||||
Type string `json:"type"`
|
||||
Model string `json:"model"`
|
||||
Serial string `json:"serial"`
|
||||
} `json:"blockdevices"`
|
||||
}
|
||||
if json.Unmarshal(lsblkOut, &lsblkDoc) == nil {
|
||||
for _, d := range lsblkDoc.BlockDevices {
|
||||
// Only consider disks wired to this controller's ports -
|
||||
// drives attached directly to the CPU (or another
|
||||
// controller) never show up as VROC ports and are skipped.
|
||||
if d.Type != "disk" || (portsKnown && !ports[d.Name]) {
|
||||
continue
|
||||
}
|
||||
info := raidDriveInfo{
|
||||
Device: "/dev/" + d.Name,
|
||||
Model: strings.TrimSpace(d.Model),
|
||||
Serial: strings.TrimSpace(d.Serial),
|
||||
State: "available",
|
||||
}
|
||||
if inArray[d.Name] {
|
||||
info.State = "member"
|
||||
}
|
||||
ctrl.AllDrives = append(ctrl.AllDrives, info)
|
||||
if info.State == "available" {
|
||||
ctrl.FreeDrives = append(ctrl.FreeDrives, info)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ctrl
|
||||
}
|
||||
|
||||
// --- API handlers ---
|
||||
|
||||
func (h *handler) handleAPIRAIDStatus(w http.ResponseWriter, r *http.Request) {
|
||||
resp := raidStatusResp{Controllers: []raidControllerInfo{}}
|
||||
|
||||
if lsi := detectLSIControllers(); len(lsi) > 0 {
|
||||
resp.Controllers = append(resp.Controllers, lsi...)
|
||||
}
|
||||
if vroc := detectVROCController(); vroc != nil {
|
||||
resp.Controllers = append(resp.Controllers, *vroc)
|
||||
}
|
||||
|
||||
writeJSON(w, resp)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIRAIDForeignAction(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
ControllerID string `json:"controller_id"`
|
||||
Action string `json:"action"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if req.Action != "import" && req.Action != "clear" {
|
||||
writeError(w, http.StatusBadRequest, "action must be 'import' or 'clear'")
|
||||
return
|
||||
}
|
||||
ctrlIdx, ok := parseLSIControllerIndex(req.ControllerID)
|
||||
if !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid controller_id")
|
||||
return
|
||||
}
|
||||
|
||||
target := "raid-foreign-clear"
|
||||
name := fmt.Sprintf("RAID Foreign Clear (ctrl %d)", ctrlIdx)
|
||||
if req.Action == "import" {
|
||||
target = "raid-foreign-import"
|
||||
name = fmt.Sprintf("RAID Foreign Import (ctrl %d)", ctrlIdx)
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID(target),
|
||||
Name: name,
|
||||
Target: target,
|
||||
Priority: defaultTaskPriority(target, taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{RAIDController: ctrlIdx},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIRAIDCreateMirror(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
ControllerID string `json:"controller_id"`
|
||||
Devices []string `json:"devices"`
|
||||
ArrayName string `json:"array_name"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Devices) < 2 {
|
||||
writeError(w, http.StatusBadRequest, "at least 2 devices required")
|
||||
return
|
||||
}
|
||||
|
||||
var target, name string
|
||||
var params taskParams
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(req.ControllerID, "lsi-"):
|
||||
ctrlIdx, ok := parseLSIControllerIndex(req.ControllerID)
|
||||
if !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid controller_id")
|
||||
return
|
||||
}
|
||||
target = "raid-lsi-create-mirror"
|
||||
name = fmt.Sprintf("Create RAID 1 Mirror (LSI ctrl %d)", ctrlIdx)
|
||||
params = taskParams{RAIDController: ctrlIdx, RAIDDevices: req.Devices}
|
||||
|
||||
case req.ControllerID == "vroc-0":
|
||||
arrayName := strings.TrimSpace(req.ArrayName)
|
||||
if arrayName == "" {
|
||||
arrayName = "bee-mirror0"
|
||||
}
|
||||
target = "raid-vroc-create-mirror"
|
||||
name = fmt.Sprintf("Create VROC RAID 1 (%s)", arrayName)
|
||||
params = taskParams{RAIDDevices: req.Devices, RAIDArrayName: arrayName}
|
||||
|
||||
default:
|
||||
writeError(w, http.StatusBadRequest, "unknown controller_id")
|
||||
return
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID(target),
|
||||
Name: name,
|
||||
Target: target,
|
||||
Priority: defaultTaskPriority(target, taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: params,
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIRAIDPrepareDrive(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
ControllerID string `json:"controller_id"`
|
||||
Slot string `json:"slot"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
ctrlIdx, ok := parseLSIControllerIndex(req.ControllerID)
|
||||
if !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid controller_id")
|
||||
return
|
||||
}
|
||||
if _, _, ok := parseRAIDSlot(req.Slot); !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid slot")
|
||||
return
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("raid-lsi-prepare-drive"),
|
||||
Name: fmt.Sprintf("Prepare drive %s (LSI ctrl %d)", req.Slot, ctrlIdx),
|
||||
Target: "raid-lsi-prepare-drive",
|
||||
Priority: defaultTaskPriority("raid-lsi-prepare-drive", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{RAIDController: ctrlIdx, RAIDSlot: req.Slot},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
// parseLSIControllerIndex extracts n from a controller id of the form
// "lsi-<n>". It reports false when the prefix is missing, the remainder is
// not an integer, or the integer is negative.
func parseLSIControllerIndex(id string) (int, bool) {
	const prefix = "lsi-"

	rest := strings.TrimPrefix(id, prefix)
	if len(rest) == len(id) {
		// Nothing was trimmed, so the id does not start with the prefix.
		return 0, false
	}
	if idx, err := strconv.Atoi(rest); err == nil && idx >= 0 {
		return idx, true
	}
	return 0, false
}
|
||||
|
||||
// --- Task runner functions ---
|
||||
|
||||
func runRAIDForeignClearTask(ctx context.Context, j *jobState, ctrl int) error {
|
||||
j.append(fmt.Sprintf("Clearing foreign configuration on controller %d...", ctrl))
|
||||
cmd := exec.CommandContext(ctx, "storcli64", fmt.Sprintf("/c%d/fall", ctrl), "del", "noprompt")
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func runRAIDForeignImportTask(ctx context.Context, j *jobState, ctrl int) error {
|
||||
j.append(fmt.Sprintf("Importing foreign configuration on controller %d...", ctrl))
|
||||
cmd := exec.CommandContext(ctx, "storcli64", fmt.Sprintf("/c%d/fall", ctrl), "import", "noprompt")
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func runRAIDLSICreateMirrorTask(ctx context.Context, j *jobState, ctrl int, drives []string) error {
|
||||
driveList := strings.Join(drives, ",")
|
||||
j.append(fmt.Sprintf("Creating RAID 1 on controller %d with drives: %s", ctrl, driveList))
|
||||
cmd := exec.CommandContext(ctx, "storcli64",
|
||||
fmt.Sprintf("/c%d", ctrl),
|
||||
"add", "vd", "type=raid1",
|
||||
fmt.Sprintf("drives=%s", driveList),
|
||||
"pdperarray=2",
|
||||
)
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
// parseRAIDSlot splits a storcli "EID:Slt" identifier (e.g. "252:0") into
// enclosure and slot numbers. Whitespace around the identifier and around
// each number is ignored.
//
// ok is false, and both numbers are 0, when the colon is missing, either
// part is not an integer, or either number is negative. Negative values can
// never name a real enclosure or slot and would otherwise be formatted into
// a storcli path such as "/c0/e-1/s0".
func parseRAIDSlot(slot string) (eid int, slt int, ok bool) {
	parts := strings.SplitN(strings.TrimSpace(slot), ":", 2)
	if len(parts) != 2 {
		return 0, 0, false
	}
	eid, errEID := strconv.Atoi(strings.TrimSpace(parts[0]))
	slt, errSlt := strconv.Atoi(strings.TrimSpace(parts[1]))
	if errEID != nil || errSlt != nil || eid < 0 || slt < 0 {
		return 0, 0, false
	}
	return eid, slt, true
}
|
||||
|
||||
func runRAIDPrepareDriveTask(ctx context.Context, j *jobState, ctrl int, slot string) error {
|
||||
eid, slt, ok := parseRAIDSlot(slot)
|
||||
if !ok {
|
||||
return fmt.Errorf("invalid slot %q", slot)
|
||||
}
|
||||
j.append(fmt.Sprintf("Preparing drive %s on controller %d (set good, force)...", slot, ctrl))
|
||||
cmd := exec.CommandContext(ctx, "storcli64",
|
||||
fmt.Sprintf("/c%d/e%d/s%d", ctrl, eid, slt),
|
||||
"set", "good", "force",
|
||||
)
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func runRAIDVROCCreateMirrorTask(ctx context.Context, j *jobState, devices []string, arrayName string) error {
|
||||
if arrayName == "" {
|
||||
arrayName = "bee-mirror0"
|
||||
}
|
||||
devPath := "/dev/md/" + arrayName
|
||||
args := []string{
|
||||
"--create", devPath,
|
||||
"--level=1",
|
||||
fmt.Sprintf("--raid-devices=%d", len(devices)),
|
||||
"--run",
|
||||
}
|
||||
args = append(args, devices...)
|
||||
j.append(fmt.Sprintf("Creating VROC RAID 1 array %s with: %s", devPath, strings.Join(devices, " ")))
|
||||
cmd := exec.CommandContext(ctx, "mdadm", args...)
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
// raidParseHumanSizeGB converts a storcli size string such as "1.818 TB" or
// "745.211 GB" into gigabytes. Matching is case-insensitive and requires a
// space before the unit. TB is scaled by 1024, MB by 1/1024. Empty input,
// an unrecognised unit or an unparsable number all yield 0.
func raidParseHumanSizeGB(s string) float64 {
	text := strings.ToUpper(strings.TrimSpace(s))
	if text == "" {
		return 0
	}

	// Units are tried in this order. The number is whatever precedes the
	// first " <letter>" of the matching unit.
	units := [...]struct {
		suffix string  // must occur somewhere in the text
		cut    string  // the text before its first occurrence is the number
		perGB  float64 // multiplier that converts to GB
	}{
		{" TB", " T", 1024},
		{" GB", " G", 1},
		{" MB", " M", 1.0 / 1024},
	}
	for _, u := range units {
		if !strings.Contains(text, u.suffix) {
			continue
		}
		digits := strings.TrimSpace(text[:strings.Index(text, u.cut)])
		value, err := strconv.ParseFloat(digits, 64)
		if err != nil {
			return 0
		}
		return value * u.perGB
	}
	return 0
}
|
||||
|
||||
// --- UI card ---
|
||||
|
||||
func renderRAIDMgmtCard() string {
|
||||
return `<div class="card"><div class="card-head card-head-actions">RAID Controller Management<div class="card-head-buttons"><button class="btn btn-sm btn-secondary" onclick="raidLoad()">↻ Refresh</button></div></div><div class="card-body">
|
||||
<div id="raid-status" style="font-size:13px;color:var(--muted);margin-bottom:8px">Loading...</div>
|
||||
<div id="raid-content"></div>
|
||||
<div id="raid-out-wrap" style="display:none;margin-top:14px">
|
||||
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||
<span id="raid-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||
<span id="raid-out-status" style="font-size:12px"></span>
|
||||
</div>
|
||||
<div id="raid-terminal" class="terminal" style="max-height:260px;width:100%;box-sizing:border-box"></div>
|
||||
</div>
|
||||
</div></div>
|
||||
<script>
|
||||
(function(){
|
||||
function escHtml(s) {
|
||||
return String(s||'').replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>').replace(/"/g,'"');
|
||||
}
|
||||
|
||||
var _raidControllers = [];
|
||||
|
||||
function raidLoad() {
|
||||
var status = document.getElementById('raid-status');
|
||||
var content = document.getElementById('raid-content');
|
||||
status.textContent = 'Detecting RAID controllers...';
|
||||
status.style.color = 'var(--muted)';
|
||||
content.innerHTML = '';
|
||||
fetch('/api/tools/raid/status', {cache:'no-store'})
|
||||
.then(function(r) {
|
||||
if (!r.ok) return r.json().then(function(e) { throw new Error(e.error || r.statusText); });
|
||||
return r.json();
|
||||
})
|
||||
.then(function(data) {
|
||||
_raidControllers = data.controllers || [];
|
||||
if (_raidControllers.length === 0) {
|
||||
status.textContent = 'No RAID controllers detected.';
|
||||
return;
|
||||
}
|
||||
status.textContent = _raidControllers.length + ' controller(s) detected.';
|
||||
content.innerHTML = _raidControllers.map(function(c, i) {
|
||||
return raidRenderController(c, i);
|
||||
}).join('<hr style="margin:16px 0;border:none;border-top:1px solid var(--border)">');
|
||||
})
|
||||
.catch(function(e) {
|
||||
status.textContent = 'Error: ' + e.message;
|
||||
status.style.color = 'var(--crit-fg)';
|
||||
});
|
||||
}
|
||||
|
||||
function raidRenderController(c, idx) {
|
||||
var html = '';
|
||||
var typeLabel = c.type === 'lsi' ? 'LSI / Broadcom' : 'Intel VROC';
|
||||
html += '<div style="font-weight:600;font-size:13px;margin-bottom:10px">' + typeLabel + ' — ' + escHtml(c.model) + '</div>';
|
||||
|
||||
if (c.type === 'lsi') {
|
||||
var foreign = c.foreign_drives || [];
|
||||
if (foreign.length > 0) {
|
||||
html += '<div style="background:var(--warn-bg,rgba(240,192,0,0.1));border:1px solid var(--warn-border,#c8a800);border-radius:4px;padding:10px 12px;margin-bottom:12px">';
|
||||
html += '<div style="font-weight:600;font-size:13px;margin-bottom:6px">⚠︎ Foreign Configuration Detected (' + foreign.length + ' drive(s))</div>';
|
||||
html += '<table style="margin-bottom:10px"><tr><th>Slot</th><th>Model</th><th>Size</th><th>State</th></tr>';
|
||||
foreign.forEach(function(d) {
|
||||
html += '<tr>'
|
||||
+ '<td style="font-family:monospace">' + escHtml(d.slot) + '</td>'
|
||||
+ '<td>' + escHtml(d.model||'—') + '</td>'
|
||||
+ '<td>' + (d.size_gb > 0 ? Math.round(d.size_gb) + ' GB' : '—') + '</td>'
|
||||
+ '<td><span class="badge badge-warn">' + escHtml(d.state) + '</span></td>'
|
||||
+ '</tr>';
|
||||
});
|
||||
html += '</table>';
|
||||
html += '<div style="display:flex;gap:8px;flex-wrap:wrap">';
|
||||
html += '<button class="btn btn-sm btn-primary" onclick="raidForeignAction(\'' + escHtml(c.id) + '\',\'import\',this)">Import Foreign Config</button>';
|
||||
html += '<button class="btn btn-sm btn-secondary" style="color:var(--crit-fg)" onclick="raidForeignAction(\'' + escHtml(c.id) + '\',\'clear\',this)">Clear Foreign Config</button>';
|
||||
html += '</div></div>';
|
||||
}
|
||||
|
||||
html += raidRenderAllDrives(c, idx);
|
||||
html += raidRenderMirrorSection(c, idx, 'lsi');
|
||||
}
|
||||
|
||||
if (c.type === 'vroc') {
|
||||
var arrays = c.arrays || [];
|
||||
if (arrays.length > 0) {
|
||||
html += '<div style="font-size:12px;font-weight:600;color:var(--muted);margin-bottom:6px;text-transform:uppercase;letter-spacing:.04em">Active Arrays</div>';
|
||||
html += '<table style="margin-bottom:14px"><tr><th>Name</th><th>Level</th><th>Members</th><th>Status</th></tr>';
|
||||
arrays.forEach(function(a) {
|
||||
var badge = a.degraded
|
||||
? '<span class="badge badge-err">Degraded</span>'
|
||||
: '<span class="badge badge-ok">OK</span>';
|
||||
html += '<tr>'
|
||||
+ '<td style="font-family:monospace">' + escHtml(a.name) + '</td>'
|
||||
+ '<td>' + escHtml(a.level||'—') + '</td>'
|
||||
+ '<td style="font-family:monospace;font-size:12px">' + (a.members||[]).map(escHtml).join(', ') + '</td>'
|
||||
+ '<td>' + badge + '</td>'
|
||||
+ '</tr>';
|
||||
});
|
||||
html += '</table>';
|
||||
}
|
||||
|
||||
html += raidRenderAllDrives(c, idx);
|
||||
html += raidRenderMirrorSection(c, idx, 'vroc');
|
||||
}
|
||||
|
||||
return html;
|
||||
}
|
||||
|
||||
var RAID_READY_STATES = {'UGood': true, 'JBOD': true, 'available': true};
|
||||
var RAID_NO_PREPARE_STATES = {'UGood': true, 'JBOD': true, 'Frgn': true, 'Onln': true, 'Msng': true};
|
||||
|
||||
function raidRenderAllDrives(c, idx) {
|
||||
var drives = c.all_drives || [];
|
||||
var isLSI = c.type === 'lsi';
|
||||
if (drives.length === 0) {
|
||||
return '<p style="font-size:13px;color:var(--muted);margin-bottom:12px">No drives detected on this controller.</p>';
|
||||
}
|
||||
var html = '<div style="font-size:12px;font-weight:600;color:var(--muted);margin-bottom:6px;text-transform:uppercase;letter-spacing:.04em">All Drives on This Controller</div>';
|
||||
html += '<table style="margin-bottom:14px"><tr><th>' + (isLSI ? 'Slot' : 'Device') + '</th><th>Model</th><th>Size</th><th>State</th>' + (isLSI ? '<th></th>' : '') + '</tr>';
|
||||
drives.forEach(function(d) {
|
||||
var ready = !!RAID_READY_STATES[d.state];
|
||||
var badgeClass = ready ? 'badge-ok' : 'badge-warn';
|
||||
var actionCell = '';
|
||||
if (isLSI && !RAID_NO_PREPARE_STATES[d.state]) {
|
||||
actionCell = '<td><button class="btn btn-sm btn-secondary" onclick="raidPrepareDrive(\'' + escHtml(c.id) + '\',\'' + escHtml(d.slot) + '\',this)">Prepare</button></td>';
|
||||
} else if (isLSI) {
|
||||
actionCell = '<td></td>';
|
||||
}
|
||||
html += '<tr>'
|
||||
+ '<td style="font-family:monospace">' + escHtml(isLSI ? d.slot : d.device) + '</td>'
|
||||
+ '<td>' + escHtml(d.model||'—') + (d.serial ? ' [' + escHtml(d.serial) + ']' : '') + '</td>'
|
||||
+ '<td>' + (d.size_gb > 0 ? Math.round(d.size_gb) + ' GB' : '—') + '</td>'
|
||||
+ '<td><span class="badge ' + badgeClass + '">' + escHtml(d.state||'—') + '</span></td>'
|
||||
+ actionCell
|
||||
+ '</tr>';
|
||||
});
|
||||
html += '</table>';
|
||||
return html;
|
||||
}
|
||||
|
||||
function raidPrepareDrive(ctrlID, slot, btn) {
|
||||
if (!confirm('Prepare drive ' + slot + ' on ' + ctrlID + ' for array creation?\n\nThis forces the drive into Unconfigured Good state. If it currently belongs to a virtual drive or holds data, that data will become inaccessible.')) {
|
||||
return;
|
||||
}
|
||||
var original = btn ? btn.textContent : '';
|
||||
if (btn) { btn.disabled = true; btn.textContent = 'Preparing...'; }
|
||||
raidShowOutput('Prepare drive ' + slot, '', '');
|
||||
fetch('/api/tools/raid/prepare-drive', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type': 'application/json'},
|
||||
body: JSON.stringify({controller_id: ctrlID, slot: slot})
|
||||
})
|
||||
.then(function(r) { return r.json(); })
|
||||
.then(function(d) {
|
||||
if (d.error) throw new Error(d.error);
|
||||
raidStreamTask(d.task_id, 'Prepare drive ' + slot, function() {
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
raidLoad();
|
||||
});
|
||||
})
|
||||
.catch(function(e) {
|
||||
raidShowOutput('Error', 'failed', e.message);
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
});
|
||||
}
|
||||
|
||||
// raidRenderMirrorSection returns the HTML for the "Create RAID 1 Mirror"
// panel of controller c.
//   c    - controller object; c.free_drives and c.id are read here
//   idx  - index used to build per-controller class names and element ids
//   kind - 'lsi' uses drive slots as checkbox values, anything else uses the
//          device field; 'vroc' additionally renders an array-name input
function raidRenderMirrorSection(c, idx, kind) {
  var drives = c.free_drives || [];
  var isLSI = kind === 'lsi';
  var out = [
    '<div style="font-size:12px;font-weight:600;color:var(--muted);margin-bottom:6px;text-transform:uppercase;letter-spacing:.04em">Create RAID 1 Mirror</div>'
  ];

  // A mirror needs two members; with fewer free drives only a notice is shown.
  if (drives.length < 2) {
    out.push('<p style="font-size:13px;color:var(--muted)">No unconfigured drives available (need at least 2).</p>');
    return out.join('');
  }

  out.push('<p style="font-size:13px;color:var(--muted);margin-bottom:8px">Select exactly 2 drives:</p>');
  out.push('<div>');
  for (var i = 0; i < drives.length; i++) {
    var drive = drives[i];
    var value = isLSI ? drive.slot : drive.device;
    var caption;
    if (isLSI) {
      caption = escHtml(drive.slot)
        + (drive.model ? ' — ' + escHtml(drive.model) : '')
        + (drive.size_gb > 0 ? ' (' + Math.round(drive.size_gb) + ' GB)' : '');
    } else {
      caption = escHtml(drive.device)
        + (drive.model ? ' — ' + escHtml(drive.model) : '')
        + (drive.serial ? ' [' + escHtml(drive.serial) + ']' : '');
    }
    out.push(
      '<label style="display:block;margin-bottom:4px;font-size:13px;cursor:pointer">'
      + '<input type="checkbox" class="raid-mirror-check-' + idx + '" value="' + escHtml(value) + '"> '
      + caption + '</label>'
    );
  }
  out.push('</div>');

  // Action row: the vroc variant carries the array-name input before the button.
  if (kind === 'vroc') {
    out.push(
      '<div style="margin-top:10px;display:flex;align-items:center;gap:8px;flex-wrap:wrap">'
      + '<label style="font-size:13px">Array name: <input type="text" id="vroc-arrayname-' + idx + '" value="bee-mirror0" style="font-family:monospace;padding:2px 6px;width:140px"></label>'
    );
  } else {
    out.push('<div style="margin-top:10px;display:flex;gap:8px">');
  }
  out.push('<button class="btn btn-sm btn-primary raid-mirror-btn-' + idx + '" onclick="raidCreateMirror(\'' + escHtml(c.id) + '\',' + idx + ',\'' + kind + '\',this)">Create Mirror</button>');
  out.push('</div>');

  return out.join('');
}
|
||||
|
||||
// raidForeignAction POSTs a foreign-configuration action for controller ctrlID
// to /api/tools/raid/foreign and streams the returned task. The 'clear' action
// requires confirmation first. Any action other than 'import' is labelled as
// a clear in the UI. btn is optional and is disabled while the work runs.
function raidForeignAction(ctrlID, action, btn) {
  if (action === 'clear') {
    var warning = 'Clear foreign configuration on ' + ctrlID + '?\n\nThis will DELETE the foreign RAID metadata. Data on those drives may become inaccessible.';
    if (!confirm(warning)) {
      return;
    }
  }

  var isImport = action === 'import';
  var savedLabel = btn ? btn.textContent : '';

  // Puts the button back into its pre-click state.
  function restoreButton() {
    if (!btn) return;
    btn.disabled = false;
    btn.textContent = savedLabel;
  }

  if (btn) {
    btn.disabled = true;
    btn.textContent = isImport ? 'Importing...' : 'Clearing...';
  }
  raidShowOutput('RAID foreign ' + action, '', '');

  var request = {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({controller_id: ctrlID, action: action})
  };

  fetch('/api/tools/raid/foreign', request)
    .then(function(resp) { return resp.json(); })
    .then(function(payload) {
      // An "error" field in the JSON reply is routed to the catch handler.
      if (payload.error) throw new Error(payload.error);
      var taskLabel = isImport ? 'Import foreign config' : 'Clear foreign config';
      raidStreamTask(payload.task_id, taskLabel, function() {
        restoreButton();
        raidLoad();
      });
    })
    .catch(function(err) {
      raidShowOutput('Error', 'failed', err.message);
      restoreButton();
    });
}
|
||||
|
||||
// raidCreateMirror collects the two checked drives of controller panel idx and
// POSTs them to /api/tools/raid/create-mirror, then streams the returned task.
// For kind 'vroc' the array name comes from the panel's text input and falls
// back to 'bee-mirror0' when the input is missing or blank; for other kinds an
// empty array name is sent. btn is optional and is disabled while the work runs.
function raidCreateMirror(ctrlID, idx, kind, btn) {
  var selected = document.querySelectorAll('.raid-mirror-check-' + idx + ':checked');
  if (selected.length !== 2) {
    alert('Select exactly 2 drives.');
    return;
  }

  var devices = [];
  for (var i = 0; i < selected.length; i++) {
    devices.push(selected[i].value);
  }

  var arrayName = '';
  if (kind === 'vroc') {
    var nameInput = document.getElementById('vroc-arrayname-' + idx);
    arrayName = (nameInput && nameInput.value.trim()) || 'bee-mirror0';
  }

  var savedLabel = btn ? btn.textContent : '';

  // Puts the button back into its pre-click state.
  function restoreButton() {
    if (!btn) return;
    btn.disabled = false;
    btn.textContent = savedLabel;
  }

  if (btn) {
    btn.disabled = true;
    btn.textContent = 'Creating...';
  }
  raidShowOutput('Create RAID 1', '', '');

  var request = {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({controller_id: ctrlID, devices: devices, array_name: arrayName})
  };

  fetch('/api/tools/raid/create-mirror', request)
    .then(function(resp) { return resp.json(); })
    .then(function(payload) {
      // An "error" field in the JSON reply is routed to the catch handler.
      if (payload.error) throw new Error(payload.error);
      raidStreamTask(payload.task_id, 'Create RAID 1 mirror', function() {
        restoreButton();
        raidLoad();
      });
    })
    .catch(function(err) {
      raidShowOutput('Error', 'failed', err.message);
      restoreButton();
    });
}
|
||||
|
||||
// raidShowOutput reveals the RAID output panel and updates its header.
//   label  - text for the panel label
//   status - 'ok' and 'failed' get fixed captions and colours; any other
//            value is shown verbatim in the muted colour
//   text   - when not undefined, replaces the terminal contents and scrolls
//            to the bottom; pass undefined to leave the terminal untouched
function raidShowOutput(label, status, text) {
  document.getElementById('raid-out-wrap').style.display = 'block';
  document.getElementById('raid-out-label').textContent = label;

  var badge = document.getElementById('raid-out-status');
  switch (status) {
    case 'ok':
      badge.textContent = '✓ done';
      badge.style.color = 'var(--ok-fg)';
      break;
    case 'failed':
      badge.textContent = '✗ failed';
      badge.style.color = 'var(--crit-fg)';
      break;
    default:
      badge.textContent = status;
      badge.style.color = 'var(--muted)';
  }

  if (text === undefined) {
    return;
  }
  var terminal = document.getElementById('raid-terminal');
  terminal.textContent = text;
  terminal.scrollTop = terminal.scrollHeight;
}
|
||||
|
||||
// raidStreamTask clears the RAID terminal and follows the server-sent event
// stream at /api/tasks/<taskID>/stream. Each message is appended as a line.
// A 'done' event with empty data marks success; non-empty data is shown as
// the failure reason. A stream error is reported as a failure. onDone
// (optional) is called after either outcome.
function raidStreamTask(taskID, taskName, onDone) {
  var terminal = document.getElementById('raid-terminal');
  terminal.textContent = '';
  raidShowOutput(taskName || 'Running…', 'running…', undefined);

  var source = new EventSource('/api/tasks/' + taskID + '/stream');

  function scrollToEnd() {
    terminal.scrollTop = terminal.scrollHeight;
  }

  // Closes the stream and puts the final status in the panel header.
  function settle(status) {
    source.close();
    raidShowOutput(taskName, status, undefined);
  }

  source.onmessage = function(ev) {
    terminal.textContent += ev.data + '\n';
    scrollToEnd();
  };

  source.addEventListener('done', function(ev) {
    if (ev.data) {
      settle('failed');
      terminal.textContent += '\nFailed: ' + ev.data;
      scrollToEnd();
    } else {
      settle('ok');
    }
    if (onDone) onDone();
  });

  source.onerror = function() {
    settle('failed');
    if (onDone) onDone();
  };
}
|
||||
|
||||
window.raidLoad = raidLoad;
|
||||
window.raidForeignAction = raidForeignAction;
|
||||
window.raidCreateMirror = raidCreateMirror;
|
||||
window.raidPrepareDrive = raidPrepareDrive;
|
||||
raidLoad();
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
214
audit/internal/webui/saa_dmi.go
Normal file
214
audit/internal/webui/saa_dmi.go
Normal file
@@ -0,0 +1,214 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// dmiField is one item parsed from DMI.txt, as returned to the web UI.
type dmiField struct {
	Name  string `json:"name"`  // "<section> / <item name>", or just the item name outside any section
	Shn   string `json:"shn"`   // token found between braces on the item line, e.g. SYVS
	Value string `json:"value"` // value with trailing comment and surrounding quotes removed
}

// saaChange is a single requested edit: the field identified by Shn is set to Value.
type saaChange struct {
	Shn   string `json:"shn"`
	Value string `json:"value"`
}

var (
	// shnRE validates SHNs submitted for writing.
	// NOTE(review): this is narrower than the SHN class accepted by dmiItemRE
	// (no parentheses or hyphen), so a field read as e.g. {PS(4)LC} is rejected
	// on write — confirm this is intended.
	shnRE        = regexp.MustCompile(`^[A-Za-z0-9_]{1,16}$`)
	dmiSectionRE = regexp.MustCompile(`^\[(.+?)\]$`)
	// Item Name {SHN} = value // comment
	// SHN may contain parentheses, e.g. {PS(4)LC} for power supply fields
	dmiItemRE    = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9_()\-]{1,24})\}\s*=\s*(.*)$`)
	dmiVersionRE = regexp.MustCompile(`(?i)^version\s*=`)
)

// parseDMIFile parses the DMI.txt produced by "saa GetDmiInfo".
// Real format (from SAA User Guide 4.8.1):
//
//	[System]
//	Version {SYVS} = "A Version" // string value
//	Serial Number {SYSN} = $DEFAULT$ // string value
//	UUID {SYUU} = 00112233-... // hex value
//
// Blank lines, lines starting with "//" or "#", and the file-level
// "Version = ..." line are skipped, as is any line that is neither a
// [section] header nor an item. Each item becomes one dmiField whose Name is
// prefixed with the current section. The result is nil when no item matches.
func parseDMIFile(content string) []dmiField {
	var fields []dmiField
	section := ""
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "//") || strings.HasPrefix(line, "#") {
			continue
		}
		if dmiVersionRE.MatchString(line) {
			continue
		}
		if m := dmiSectionRE.FindStringSubmatch(line); m != nil {
			section = strings.TrimSpace(m[1])
			continue
		}
		m := dmiItemRE.FindStringSubmatch(line)
		if m == nil {
			continue
		}
		name := strings.TrimSpace(m[1])
		if section != "" {
			name = section + " / " + name
		}
		fields = append(fields, dmiField{Name: name, Shn: m[2], Value: dmiCleanValue(m[3])})
	}
	return fields
}

// dmiCleanValue strips the trailing "// comment" and surrounding double quotes
// from the right-hand side of a DMI item line.
//
// Quoted values are handled first: the closing quote is the first '"' that is
// followed only by optional whitespace and either end of line or a "//"
// comment. Looking for the quote before looking for the comment keeps a " //"
// inside the quoted text intact, and also copes with comments that contain
// another " //" or that follow the quote without a space.
//
// Unquoted values (and quoted values with no recognisable closing quote) are
// cut at the last " //".
func dmiCleanValue(raw string) string {
	raw = strings.TrimSpace(raw)
	if len(raw) > 0 && raw[0] == '"' {
		for i := 1; i < len(raw); i++ {
			if raw[i] != '"' {
				continue
			}
			rest := strings.TrimSpace(raw[i+1:])
			if rest == "" || strings.HasPrefix(rest, "//") {
				return raw[1:i]
			}
		}
	}
	if idx := strings.LastIndex(raw, " //"); idx >= 0 {
		raw = strings.TrimSpace(raw[:idx])
	}
	return raw
}
|
||||
|
||||
func (h *handler) handleAPISAADMIRead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
tmpDir, err := os.MkdirTemp("", "bee-saa-*")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "create temp dir: "+err.Error())
|
||||
return
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
dmiFile := filepath.Join(tmpDir, "DMI.txt")
|
||||
cmd := exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite")
|
||||
cmd.Dir = "/usr/local/bin"
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, "saa GetDmiInfo: "+msg)
|
||||
return
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(dmiFile)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "read DMI file: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
fields := parseDMIFile(string(raw))
|
||||
if len(fields) == 0 {
|
||||
writeError(w, http.StatusInternalServerError, "no DMI fields found (file may be empty — reboot the server and try again)")
|
||||
return
|
||||
}
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISAADMIWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []saaChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
for _, c := range req.Changes {
|
||||
if !shnRE.MatchString(c.Shn) {
|
||||
writeError(w, http.StatusUnprocessableEntity, "invalid shn: "+c.Shn)
|
||||
return
|
||||
}
|
||||
if len(c.Value) == 0 || len(c.Value) > 64 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value length out of range for shn: "+c.Shn)
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch < 0x20 || ch > 0x7E {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value contains non-printable character for shn: "+c.Shn)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("saa-dmi-write"),
|
||||
Name: fmt.Sprintf("SAA DMI Write (%d field(s))", len(req.Changes)),
|
||||
Target: "saa-dmi-write",
|
||||
Priority: defaultTaskPriority("saa-dmi-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
SAADmiChanges: req.Changes,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runSAADMIWriteTask(ctx context.Context, j *jobState, exportDir string, p taskParams) error {
|
||||
tmpDir, err := os.MkdirTemp("", "bee-saa-*")
|
||||
if err != nil {
|
||||
return fmt.Errorf("create temp dir: %w", err)
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
dmiFile := filepath.Join(tmpDir, "DMI.txt")
|
||||
|
||||
j.append("Reading current DMI configuration...")
|
||||
getCmd := exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite")
|
||||
getCmd.Dir = "/usr/local/bin"
|
||||
if err := streamCmdJob(j, getCmd); err != nil {
|
||||
return fmt.Errorf("GetDmiInfo: %w", err)
|
||||
}
|
||||
|
||||
backupDir := filepath.Join(exportDir, "dmi-backups")
|
||||
if err := os.MkdirAll(backupDir, 0o755); err != nil {
|
||||
return fmt.Errorf("create backup dir: %w", err)
|
||||
}
|
||||
backupName := "dmi-" + time.Now().UTC().Format("20060102-150405") + ".txt"
|
||||
backupPath := filepath.Join(backupDir, backupName)
|
||||
raw, err := os.ReadFile(dmiFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read DMI file: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(backupPath, raw, 0o644); err != nil {
|
||||
return fmt.Errorf("write backup: %w", err)
|
||||
}
|
||||
j.append("Backup saved: dmi-backups/" + backupName)
|
||||
|
||||
for _, c := range p.SAADmiChanges {
|
||||
j.append("Setting " + c.Shn + " = " + c.Value)
|
||||
cmd := exec.CommandContext(ctx, "saa", "-c", "EditDmiInfo", "--file", dmiFile, "--shn", c.Shn, "--value", c.Value)
|
||||
cmd.Dir = "/usr/local/bin"
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("EditDmiInfo %s: %w", c.Shn, err)
|
||||
}
|
||||
}
|
||||
|
||||
j.append("Applying changes to hardware...")
|
||||
changeCmd := exec.CommandContext(ctx, "saa", "-c", "ChangeDmiInfo", "--file", dmiFile)
|
||||
changeCmd.Dir = "/usr/local/bin"
|
||||
if err := streamCmdJob(j, changeCmd); err != nil {
|
||||
return fmt.Errorf("ChangeDmiInfo: %w", err)
|
||||
}
|
||||
|
||||
j.append("Done. Reboot the server for changes to take effect.")
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -221,6 +221,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
||||
h.kmsg.start()
|
||||
globalQueue.kmsgWatcher = h.kmsg
|
||||
|
||||
// Start periodic health poller for components that don't emit kernel log events (e.g. PSU).
|
||||
if opts.App.StatusDB != nil {
|
||||
newHealthPoller(opts.App.StatusDB).start()
|
||||
}
|
||||
}
|
||||
|
||||
globalQueue.startWorker(&opts)
|
||||
@@ -309,6 +314,16 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||
mux.HandleFunc("GET /api/tools/nvme-formats", h.handleAPINVMeFormats)
|
||||
mux.HandleFunc("POST /api/tools/nvme-format/run", h.handleAPINVMeFormatRun)
|
||||
mux.HandleFunc("GET /api/tools/saa-dmi", h.handleAPISAADMIRead)
|
||||
mux.HandleFunc("POST /api/tools/saa-dmi/write", h.handleAPISAADMIWrite)
|
||||
mux.HandleFunc("GET /api/tools/ipmi-fru", h.handleAPIIPMIFRURead)
|
||||
mux.HandleFunc("POST /api/tools/ipmi-fru/write", h.handleAPIIPMIFRUWrite)
|
||||
mux.HandleFunc("GET /api/tools/huawei-elabel", h.handleAPIHuaweiElabelRead)
|
||||
mux.HandleFunc("POST /api/tools/huawei-elabel/write", h.handleAPIHuaweiElabelWrite)
|
||||
mux.HandleFunc("GET /api/tools/raid/status", h.handleAPIRAIDStatus)
|
||||
mux.HandleFunc("POST /api/tools/raid/foreign", h.handleAPIRAIDForeignAction)
|
||||
mux.HandleFunc("POST /api/tools/raid/create-mirror", h.handleAPIRAIDCreateMirror)
|
||||
mux.HandleFunc("POST /api/tools/raid/prepare-drive", h.handleAPIRAIDPrepareDrive)
|
||||
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
@@ -320,6 +335,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// System
|
||||
mux.HandleFunc("GET /api/system/ram-status", h.handleAPIRAMStatus)
|
||||
mux.HandleFunc("POST /api/system/install-to-ram", h.handleAPIInstallToRAM)
|
||||
mux.HandleFunc("POST /api/system/reboot", h.handleAPISystemReboot)
|
||||
mux.HandleFunc("POST /api/system/shutdown", h.handleAPISystemShutdown)
|
||||
|
||||
// Preflight
|
||||
mux.HandleFunc("GET /api/preflight", h.handleAPIPreflight)
|
||||
@@ -328,6 +345,10 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("GET /api/install/disks", h.handleAPIInstallDisks)
|
||||
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
|
||||
|
||||
// Hardware component detail (fragment for modal in Hardware Summary card)
|
||||
mux.HandleFunc("GET /api/hardware-summary", h.handleAPIHardwareSummary)
|
||||
mux.HandleFunc("GET /api/components/{type}", h.handleAPIComponentDetail)
|
||||
|
||||
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
|
||||
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
||||
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
|
||||
@@ -1410,14 +1431,17 @@ func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
|
||||
if page == "" {
|
||||
page = "dashboard"
|
||||
}
|
||||
// Redirect old routes to new names
|
||||
// Redirect legacy routes to new named pages
|
||||
switch page {
|
||||
case "tests":
|
||||
http.Redirect(w, r, "/validate", http.StatusMovedPermanently)
|
||||
case "validate", "tests":
|
||||
http.Redirect(w, r, "/load", http.StatusMovedPermanently)
|
||||
return
|
||||
case "burn-in":
|
||||
http.Redirect(w, r, "/burn", http.StatusMovedPermanently)
|
||||
return
|
||||
case "speed", "endurance":
|
||||
http.Redirect(w, r, "/benchmark", http.StatusMovedPermanently)
|
||||
return
|
||||
}
|
||||
body := renderPage(page, h.opts)
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
|
||||
@@ -666,41 +666,51 @@ func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
||||
|
||||
func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
|
||||
// /tools: only NVMe Block Format and Supermicro DMI remain
|
||||
recTools := httptest.NewRecorder()
|
||||
handler.ServeHTTP(recTools, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
if recTools.Code != http.StatusOK {
|
||||
t.Fatalf("tools status=%d", recTools.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `NVIDIA Self Heal`) {
|
||||
t.Fatalf("tools page missing nvidia self heal section: %s", body)
|
||||
toolsBody := recTools.Body.String()
|
||||
if !strings.Contains(toolsBody, `NVMe Block Format`) {
|
||||
t.Fatalf("tools page missing nvme block format section: %s", toolsBody)
|
||||
}
|
||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||
if !strings.Contains(toolsBody, `/api/tools/nvme-formats`) || !strings.Contains(toolsBody, `/api/tools/nvme-format/run`) {
|
||||
t.Fatalf("tools page missing nvme format api usage: %s", toolsBody)
|
||||
}
|
||||
if !strings.Contains(body, `nvidiaRestartDrivers()`) {
|
||||
t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
|
||||
|
||||
// /settings: system install, support bundle, tool check, nvidia self heal, network, services
|
||||
recSettings := httptest.NewRecorder()
|
||||
handler.ServeHTTP(recSettings, httptest.NewRequest(http.MethodGet, "/settings", nil))
|
||||
if recSettings.Code != http.StatusOK {
|
||||
t.Fatalf("settings status=%d", recSettings.Code)
|
||||
}
|
||||
if !strings.Contains(body, `/api/gpu/nvidia-status`) {
|
||||
t.Fatalf("tools page missing nvidia status api usage: %s", body)
|
||||
settingsBody := recSettings.Body.String()
|
||||
if !strings.Contains(settingsBody, `NVIDIA Self Heal`) {
|
||||
t.Fatalf("settings page missing nvidia self heal section: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `nvidiaResetGPU(`) {
|
||||
t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
|
||||
if !strings.Contains(settingsBody, `Restart GPU Drivers`) {
|
||||
t.Fatalf("settings page missing restart gpu drivers button: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
if !strings.Contains(settingsBody, `nvidiaRestartDrivers()`) {
|
||||
t.Fatalf("settings page missing nvidiaRestartDrivers action: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `USB Black-Box`) {
|
||||
t.Fatalf("tools page missing usb black-box section: %s", body)
|
||||
if !strings.Contains(settingsBody, `/api/gpu/nvidia-status`) {
|
||||
t.Fatalf("settings page missing nvidia status api usage: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `/api/blackbox/status`) {
|
||||
t.Fatalf("tools page missing black-box status api usage: %s", body)
|
||||
if !strings.Contains(settingsBody, `nvidiaResetGPU(`) {
|
||||
t.Fatalf("settings page missing nvidiaResetGPU action: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `NVMe Block Format`) {
|
||||
t.Fatalf("tools page missing nvme block format section: %s", body)
|
||||
if !strings.Contains(settingsBody, `id="boot-source-text"`) {
|
||||
t.Fatalf("settings page missing boot source field: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `/api/tools/nvme-formats`) || !strings.Contains(body, `/api/tools/nvme-format/run`) {
|
||||
t.Fatalf("tools page missing nvme format api usage: %s", body)
|
||||
if !strings.Contains(settingsBody, `USB Black-Box`) {
|
||||
t.Fatalf("settings page missing usb black-box section: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(settingsBody, `/api/blackbox/status`) {
|
||||
t.Fatalf("settings page missing black-box status api usage: %s", settingsBody)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -791,46 +801,45 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
func TestCheckPageRendersGPUSelectionAndNvidiaCards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA GPU Targeted Stress`,
|
||||
`nvidia-targeted-stress`,
|
||||
`controlled NVIDIA DCGM load`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
`NVIDIA GPU Selection`,
|
||||
`All NVIDIA validate tasks use only the GPUs selected here.`,
|
||||
`Select All`,
|
||||
`id="sat-gpu-list"`,
|
||||
`Select All`,
|
||||
`id="sat-btn-nvidia"`,
|
||||
`NVIDIA Interconnect (NCCL)`,
|
||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||
`Non-destructive`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
t.Fatalf("check page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
||||
func TestCheckPageRendersNvidiaFabricCards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA Interconnect (NCCL)`,
|
||||
`Validate and Stress:`,
|
||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||
`nvbandwidth runs all built-in tests without a time limit`,
|
||||
`nvbandwidth`,
|
||||
`all_reduce_perf`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
t.Fatalf("check page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -847,7 +856,6 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
`NVIDIA Max Compute Load`,
|
||||
`dcgmproftester`,
|
||||
`NCCL`,
|
||||
`Validate → Stress mode`,
|
||||
`id="burn-gpu-list"`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
@@ -1219,7 +1227,8 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
],
|
||||
"services":[
|
||||
{"name":"bee-web","status":"active"},
|
||||
{"name":"bee-nvidia","status":"inactive"}
|
||||
{"name":"bee-audit","status":"inactive"},
|
||||
{"name":"bee-nvidia","status":"failed"}
|
||||
]
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
|
||||
@@ -1273,7 +1282,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
`Bee Services`,
|
||||
`CUDA runtime is not ready for GPU SAT.`,
|
||||
`Missing: nvidia-smi`,
|
||||
`bee-nvidia=inactive`,
|
||||
`bee-nvidia=failed`,
|
||||
// Hardware Summary card — component health badges
|
||||
`Hardware Summary`,
|
||||
`>CPU<`,
|
||||
|
||||
@@ -232,6 +232,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
||||
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
|
||||
b.WriteString(powerCard)
|
||||
}
|
||||
if report.Target == "storage" {
|
||||
b.WriteString(renderStorageDiskReportCards(logText))
|
||||
}
|
||||
|
||||
if len(report.Charts) > 0 {
|
||||
for _, chart := range report.Charts {
|
||||
@@ -369,3 +372,60 @@ func formatTaskDuration(sec int) string {
|
||||
}
|
||||
return fmt.Sprintf("%dh %02dm %02ds", sec/3600, (sec%3600)/60, sec%60)
|
||||
}
|
||||
|
||||
// renderStorageDiskReportCards reads disk-*-report.txt files from the storage
|
||||
// SAT run directory and renders one card per disk.
|
||||
func renderStorageDiskReportCards(logText string) string {
|
||||
runDir := taskStorageRunDirFromLog(logText)
|
||||
if runDir == "" {
|
||||
return ""
|
||||
}
|
||||
entries, err := os.ReadDir(runDir)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
var cards []string
|
||||
for _, entry := range entries {
|
||||
name := entry.Name()
|
||||
if !strings.HasPrefix(name, "disk-") || !strings.HasSuffix(name, "-report.txt") {
|
||||
continue
|
||||
}
|
||||
data, err := os.ReadFile(filepath.Join(runDir, name))
|
||||
if err != nil || len(data) == 0 {
|
||||
continue
|
||||
}
|
||||
// Extract disk label from filename: "disk-01-nvme0n1-report.txt" → "Disk 01 — nvme0n1"
|
||||
stem := strings.TrimPrefix(strings.TrimSuffix(name, "-report.txt"), "disk-")
|
||||
// stem is like "01-nvme0n1"
|
||||
parts := strings.SplitN(stem, "-", 2)
|
||||
title := "Disk " + stem
|
||||
if len(parts) == 2 {
|
||||
title = "Disk " + parts[0] + " — " + parts[1]
|
||||
}
|
||||
card := `<div class="card">` +
|
||||
`<div class="card-head">` + html.EscapeString(title) + `</div>` +
|
||||
`<div class="card-body" style="padding:0">` +
|
||||
`<pre style="margin:0;padding:16px;font-size:12px;line-height:1.6;overflow-x:auto;white-space:pre">` +
|
||||
html.EscapeString(string(data)) +
|
||||
`</pre></div></div>`
|
||||
cards = append(cards, card)
|
||||
}
|
||||
return strings.Join(cards, "\n")
|
||||
}
|
||||
|
||||
// taskStorageRunDirFromLog scans logText for lines of the form
// "Archive: /path/to/storage-YYYYMMDD-HHMMSS" and returns the first path whose
// base name contains "storage-" and which does not end in ".tar.gz".
// It returns "" when no line qualifies.
func taskStorageRunDirFromLog(logText string) string {
	const marker = "Archive:"
	for _, raw := range strings.Split(logText, "\n") {
		trimmed := strings.TrimSpace(raw)
		if !strings.HasPrefix(trimmed, marker) {
			continue
		}
		candidate := strings.TrimSpace(trimmed[len(marker):])
		if strings.HasSuffix(candidate, ".tar.gz") {
			// The packed archive is logged with the same marker; only the
			// unpacked directory is wanted.
			continue
		}
		if strings.Contains(filepath.Base(candidate), "storage-") {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
@@ -382,6 +382,46 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
|
||||
break
|
||||
}
|
||||
err = runNVMeFormatTask(ctx, j, t.params.Device, t.params.LBAF)
|
||||
case "saa-dmi-write":
|
||||
if len(t.params.SAADmiChanges) == 0 {
|
||||
err = fmt.Errorf("no changes provided")
|
||||
break
|
||||
}
|
||||
err = runSAADMIWriteTask(ctx, j, opts.ExportDir, t.params)
|
||||
case "ipmi-fru-write":
|
||||
if len(t.params.FRUChanges) == 0 {
|
||||
err = fmt.Errorf("no changes provided")
|
||||
break
|
||||
}
|
||||
err = runIPMIFRUWriteTask(ctx, j, opts.ExportDir, t.params)
|
||||
case "huawei-elabel-write":
|
||||
if len(t.params.HuaweiElabelChanges) == 0 {
|
||||
err = fmt.Errorf("no changes provided")
|
||||
break
|
||||
}
|
||||
err = runHuaweiElabelWriteTask(ctx, j, t.params)
|
||||
case "raid-foreign-clear":
|
||||
err = runRAIDForeignClearTask(ctx, j, t.params.RAIDController)
|
||||
case "raid-foreign-import":
|
||||
err = runRAIDForeignImportTask(ctx, j, t.params.RAIDController)
|
||||
case "raid-lsi-create-mirror":
|
||||
if len(t.params.RAIDDevices) < 2 {
|
||||
err = fmt.Errorf("at least 2 drives required")
|
||||
break
|
||||
}
|
||||
err = runRAIDLSICreateMirrorTask(ctx, j, t.params.RAIDController, t.params.RAIDDevices)
|
||||
case "raid-lsi-prepare-drive":
|
||||
if strings.TrimSpace(t.params.RAIDSlot) == "" {
|
||||
err = fmt.Errorf("no drive slot provided")
|
||||
break
|
||||
}
|
||||
err = runRAIDPrepareDriveTask(ctx, j, t.params.RAIDController, t.params.RAIDSlot)
|
||||
case "raid-vroc-create-mirror":
|
||||
if len(t.params.RAIDDevices) < 2 {
|
||||
err = fmt.Errorf("at least 2 devices required")
|
||||
break
|
||||
}
|
||||
err = runRAIDVROCCreateMirrorTask(ctx, j, t.params.RAIDDevices, t.params.RAIDArrayName)
|
||||
default:
|
||||
j.append("ERROR: unknown target: " + t.Target)
|
||||
j.finish("unknown target")
|
||||
|
||||
@@ -137,9 +137,16 @@ type taskParams struct {
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
LBAF int `json:"lbaf,omitempty"`
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
LBAF int `json:"lbaf,omitempty"`
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
SAADmiChanges []saaChange `json:"saa_dmi_changes,omitempty"`
|
||||
FRUChanges []fruChange `json:"fru_changes,omitempty"`
|
||||
HuaweiElabelChanges []huaweiChange `json:"huawei_elabel_changes,omitempty"`
|
||||
RAIDController int `json:"raid_controller,omitempty"`
|
||||
RAIDDevices []string `json:"raid_devices,omitempty"`
|
||||
RAIDArrayName string `json:"raid_array_name,omitempty"`
|
||||
RAIDSlot string `json:"raid_slot,omitempty"`
|
||||
}
|
||||
|
||||
type persistedTask struct {
|
||||
|
||||
2
bible
2
bible
Submodule bible updated: 1d89a4918e...d2600f1279
@@ -13,6 +13,7 @@ Generic engineering rules live in `bible/rules/patterns/`.
|
||||
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
||||
| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
|
||||
| `decisions/` | Architectural decision log, including read-only submodule policy |
|
||||
| `proposals/` | RFCs and contract change proposals for Reanimator Core |
|
||||
|
||||
## Validate Test Matrix
|
||||
|
||||
|
||||
185
bible-local/architecture/api-surface.md
Normal file
185
bible-local/architecture/api-surface.md
Normal file
@@ -0,0 +1,185 @@
|
||||
# API Surface
|
||||
|
||||
HTTP endpoints exposed by `bee web` (binds `0.0.0.0:80`).
|
||||
Handler registration: `audit/internal/webui/server.go` → `NewHandler()`.
|
||||
|
||||
---
|
||||
|
||||
## Health & readiness
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------|-----------------------------------------------------|
|
||||
| GET | `/healthz` | Always 200. Used by load balancers / boot scripts. |
|
||||
| GET | `/api/ready` | 200 when audit JSON exists and is readable. |
|
||||
| GET | `/loading` | HTML loading page shown before first audit. |
|
||||
|
||||
---
|
||||
|
||||
## Audit
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-----------------------|--------------------------------------------------------------|
|
||||
| GET | `/audit.json` | Latest audit JSON with SAT overlay applied. |
|
||||
| GET | `/runtime-health.json`| Latest runtime preflight JSON. |
|
||||
| POST | `/api/audit/run` | Enqueue a full `bee audit` run. Returns task ID. |
|
||||
| GET | `/api/audit/stream` | SSE: audit run log lines (`data:` + newline per line). |
|
||||
| GET | `/api/preflight` | Run runtime preflight check (synchronous, returns JSON). |
|
||||
| GET | `/api/hardware-summary` | Hardware health summary (status counts + failures). |
|
||||
| GET | `/api/components/{type}` | HTML fragment for component detail dialog (e.g. `cpu`, `memory`, `storage`, `pcie`). |
|
||||
|
||||
---
|
||||
|
||||
## SAT (System Acceptance Testing)
|
||||
|
||||
All SAT run endpoints enqueue an async task. Response: `{"task_id": "..."}`.
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|--------------------------------------------|-----------------------------------|
|
||||
| POST | `/api/sat/nvidia/run` | NVIDIA DCGM SAT |
|
||||
| POST | `/api/sat/nvidia-targeted-stress/run` | NVIDIA targeted stress validate |
|
||||
| POST | `/api/sat/nvidia-compute/run` | NVIDIA max compute load |
|
||||
| POST | `/api/sat/nvidia-targeted-power/run` | NVIDIA targeted power |
|
||||
| POST | `/api/sat/nvidia-pulse/run` | NVIDIA pulse test |
|
||||
| POST | `/api/sat/nvidia-interconnect/run` | NCCL all_reduce_perf |
|
||||
| POST | `/api/sat/nvidia-bandwidth/run` | NVBandwidth test |
|
||||
| POST | `/api/sat/nvidia-stress/run` | NVIDIA stress pack |
|
||||
| POST | `/api/sat/memory/run` | Memory acceptance |
|
||||
| POST | `/api/sat/storage/run` | Storage acceptance (smartctl) |
|
||||
| POST | `/api/sat/cpu/run` | CPU acceptance (stress-ng) |
|
||||
| POST | `/api/sat/amd/run` | AMD GPU SAT (ROCm) |
|
||||
| POST | `/api/sat/amd-mem/run` | AMD memory integrity + bandwidth |
|
||||
| POST | `/api/sat/amd-bandwidth/run` | AMD memory bandwidth |
|
||||
| POST | `/api/sat/amd-stress/run` | AMD GPU stress |
|
||||
| POST | `/api/sat/memory-stress/run` | Memory stress |
|
||||
| POST | `/api/sat/sat-stress/run` | Combined storage+memory stress |
|
||||
| POST | `/api/sat/platform-stress/run` | Fan + thermal stress |
|
||||
| GET | `/api/sat/stream` | SSE: live SAT log stream |
|
||||
| POST | `/api/sat/abort` | Abort the running SAT task |
|
||||
|
||||
---
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-----------------------------------------|----------------------------------------------|
|
||||
| POST | `/api/bee-bench/nvidia/perf/run` | NVIDIA performance benchmark |
|
||||
| POST | `/api/bee-bench/nvidia/power/run` | NVIDIA power benchmark |
|
||||
| POST | `/api/bee-bench/nvidia/autotune/run` | Power source autotune (prerequisite for benchmarks) |
|
||||
| GET | `/api/bee-bench/nvidia/autotune/status` | Current autotune result / status |
|
||||
| GET | `/api/benchmark/results` | List completed benchmark result archives |
|
||||
|
||||
---
|
||||
|
||||
## Tasks (async job queue)
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-----------------------------|----------------------------------------------------|
|
||||
| GET | `/api/tasks` | List all tasks with status |
|
||||
| POST | `/api/tasks/cancel-all` | Cancel all pending/running tasks |
|
||||
| POST | `/api/tasks/kill-workers` | Force-kill worker goroutines |
|
||||
| POST | `/api/tasks/{id}/cancel` | Cancel a specific task |
|
||||
| POST | `/api/tasks/{id}/priority` | Elevate task priority |
|
||||
| GET | `/api/tasks/{id}/stream` | SSE: live log stream for a task |
|
||||
| GET | `/api/tasks/{id}/charts` | List chart names for a task |
|
||||
| GET | `/api/tasks/{id}/chart/` | SVG chart for a task result |
|
||||
| GET | `/tasks/{id}` | HTML task detail page |
|
||||
|
||||
---
|
||||
|
||||
## Services
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|---------------------------|--------------------------------------------------|
|
||||
| GET | `/api/services` | List bee-* systemd services and their states |
|
||||
| POST | `/api/services/action` | start/stop/restart a service |
|
||||
|
||||
---
|
||||
|
||||
## Network
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------------------|-----------------------------------------------------|
|
||||
| GET | `/api/network` | List interfaces with state and IPv4 addresses |
|
||||
| POST | `/api/network/dhcp` | Run dhclient on one or all interfaces |
|
||||
| POST | `/api/network/static` | Set static IPv4 address |
|
||||
| POST | `/api/network/toggle` | Bring interface up or down |
|
||||
| POST | `/api/network/confirm` | Confirm pending network change (clears rollback) |
|
||||
| POST | `/api/network/rollback` | Restore pre-change network snapshot |
|
||||
|
||||
---
|
||||
|
||||
## Export
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-------------------------------|---------------------------------------------------|
|
||||
| GET | `/export/support.tar.gz` | Download support bundle (live-generated) |
|
||||
| GET | `/export/file` | Download a file from the export dir by path param |
|
||||
| GET | `/export/` | Browse export dir (HTML index) |
|
||||
| GET | `/api/export/list` | JSON list of files in export dir |
|
||||
| GET | `/api/export/usb` | List removable USB targets available for export |
|
||||
|
||||
---
|
||||
|
||||
## GPU
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------------------|----------------------------------------------------|
|
||||
| GET | `/api/gpu/presence` | `{"nvidia": bool, "amd": bool}` |
|
||||
| GET | `/api/gpu/nvidia` | List NVIDIA GPUs from nvidia-smi |
|
||||
| GET | `/api/gpu/nvidia-status` | Per-GPU status (ECC, power, throttle) |
|
||||
| POST | `/api/gpu/nvidia-reset` | GPU reset by index |
|
||||
| GET | `/api/gpu/tools` | nvidia-smi / rocm-smi tool availability |
|
||||
|
||||
---
|
||||
|
||||
## System
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|------------------------------|---------------------------------------------------|
|
||||
| GET | `/api/system/ram-status` | toram boot state and ISO copy status |
|
||||
| POST | `/api/system/install-to-ram` | Copy ISO to RAM (background task) |
|
||||
| GET | `/api/install/disks` | List block devices suitable for disk installation |
|
||||
| POST | `/api/install/run` | Install bee to disk (background task) |
|
||||
|
||||
---
|
||||
|
||||
## Tools & NVMe
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-------------------------------|--------------------------------------------------|
|
||||
| GET | `/api/tools/check` | Check availability of required CLI tools |
|
||||
| GET | `/api/tools/nvme-formats` | List NVMe format options for a device |
|
||||
| POST | `/api/tools/nvme-format/run` | Run nvme-format on a device |
|
||||
|
||||
---
|
||||
|
||||
## Live metrics
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|------------------------------|---------------------------------------------------|
|
||||
| GET | `/api/metrics/stream` | SSE: live metrics (GPU power, temp, utilization) |
|
||||
| GET | `/api/metrics/latest` | Latest metrics snapshot (JSON) |
|
||||
| GET | `/api/metrics/chart/` | SVG chart for a metric over time |
|
||||
| GET | `/api/metrics/export.csv` | Download metrics history as CSV |
|
||||
|
||||
---
|
||||
|
||||
## Blackbox logging
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------------------|-----------------------------------------------|
|
||||
| GET | `/api/blackbox/status` | Blackbox log state (enabled, size, path) |
|
||||
| POST | `/api/blackbox/enable` | Start recording blackbox log |
|
||||
| POST | `/api/blackbox/disable` | Stop recording, flush to disk |
|
||||
|
||||
---
|
||||
|
||||
## UI pages
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|------------|-----------------------------------------------|
|
||||
| GET | `/` | Main dashboard (serves all page routes) |
|
||||
| GET | `/viewer` | Standalone JSON viewer for uploaded audit files |
|
||||
|
||||
All pages are rendered server-side as HTML. The `/` route handles sub-paths such as
|
||||
`/network`, `/services`, `/sat`, `/benchmark`, `/install`, `/validate`, `/export`.
|
||||
137
bible-local/architecture/data-model.md
Normal file
137
bible-local/architecture/data-model.md
Normal file
@@ -0,0 +1,137 @@
|
||||
# Data Model
|
||||
|
||||
The canonical output of `bee audit` is a `HardwareIngestRequest` JSON document accepted
|
||||
by the Reanimator `/api/ingest/hardware` endpoint. The ingest endpoint uses a strict
|
||||
decoder — unknown fields cause `400 Bad Request`.
|
||||
|
||||
Source of truth: `audit/internal/schema/hardware.go`
|
||||
|
||||
---
|
||||
|
||||
## Top-level: HardwareIngestRequest
|
||||
|
||||
```
|
||||
HardwareIngestRequest
|
||||
├── collected_at string RFC3339 UTC timestamp of collection
|
||||
├── hardware HardwareSnapshot
|
||||
├── runtime RuntimeHealth? from bee-runtime-preflight service
|
||||
├── filename string?
|
||||
├── source_type string?
|
||||
├── protocol string?
|
||||
└── target_host string?
|
||||
```
|
||||
|
||||
`collected_at` is the primary sort key used by Reanimator to deduplicate ingests.
|
||||
|
||||
---
|
||||
|
||||
## HardwareSnapshot
|
||||
|
||||
All component arrays are `omitempty` — absent when the collector finds nothing.
|
||||
|
||||
| JSON key | Go type | Source |
|
||||
|-------------------|----------------------------|------------------------------|
|
||||
| `board` | HardwareBoard | dmidecode type 1/2 |
|
||||
| `firmware` | []HardwareFirmwareRecord | dmidecode type 0/13 |
|
||||
| `cpus` | []HardwareCPU | dmidecode type 4 |
|
||||
| `memory` | []HardwareMemory | dmidecode type 17 |
|
||||
| `storage` | []HardwareStorage | lsblk + nvme-cli + smartctl |
|
||||
| `pcie_devices` | []HardwarePCIeDevice | lspci |
|
||||
| `power_supplies` | []HardwarePowerSupply | ipmitool fru + sdr |
|
||||
| `sensors` | *HardwareSensors | sensors -j |
|
||||
| `event_logs` | []HardwareEventLog | ipmitool sel + journald |
|
||||
| `platform_config` | *json.RawMessage | reserved, nil until used |
|
||||
| `vroc_license` | *string | vroc-cli |
|
||||
|
||||
---
|
||||
|
||||
## Identity keys
|
||||
|
||||
Reanimator uses these fields to match components across successive audits:
|
||||
|
||||
| Component | Identity key |
|
||||
|----------------|------------------------------------------------|
|
||||
| Board | `board.serial_number` (required, never empty) |
|
||||
| CPU | `serial_number` if present; else generated key |
|
||||
| Memory DIMM | `serial_number` — absent DIMMs have `present: false` |
|
||||
| Storage | `serial_number` if present; else `linux_device` from Telemetry |
|
||||
| PCIe device | `bdf` (Bus:Device.Function address) |
|
||||
| PSU | `slot` |
|
||||
|
||||
Components without a stable identity are still emitted but may not be matched across runs.
|
||||
|
||||
---
|
||||
|
||||
## HardwareComponentStatus (embedded in all components)
|
||||
|
||||
```go
|
||||
type HardwareComponentStatus struct {
|
||||
Status *string `json:"status,omitempty"` // OK | Warning | Critical | Unknown
|
||||
ErrorDescription *string `json:"error_description,omitempty"`
|
||||
}
|
||||
```
|
||||
|
||||
Status is set by collectors and overwritten at render time by `ApplySATOverlay`
|
||||
(latest SAT run results are always merged on top before display).
|
||||
|
||||
---
|
||||
|
||||
## HardwarePCIeDevice
|
||||
|
||||
The most enriched component type. Key fields:
|
||||
|
||||
| JSON key | Meaning |
|
||||
|----------------------|------------------------------------------------|
|
||||
| `bdf` | PCI address (identity key), e.g. `0000:4b:00.0` |
|
||||
| `vendor_id` | Numeric PCI vendor ID (hex). Use this for classification — not `manufacturer`. |
|
||||
| `device_id` | Numeric PCI device ID (hex) |
|
||||
| `device_class` | Human-readable class, e.g. `VideoController` |
|
||||
| `manufacturer` | String label from lspci — for display only |
|
||||
| `model` | From nvidia-smi / rocm-smi — display name |
|
||||
| `link_speed` | Current PCIe link speed, e.g. `Gen4` |
|
||||
| `max_link_speed` | Maximum link speed the device supports (capability), e.g. `Gen4` |
|
||||
| `link_width` | Current lane count |
|
||||
| `max_link_width` | Max lane count |
|
||||
| `temperature_c` | From nvidia-smi / rocm-smi |
|
||||
| `power_w` | Current power draw |
|
||||
| `ecc_uncorrected_total` | Cumulative ECC uncorrected errors (NVIDIA) |
|
||||
| `ecc_corrected_total` | Cumulative ECC corrected errors (NVIDIA) |
|
||||
| `hw_slowdown` | HW throttle active (NVIDIA) |
|
||||
| `telemetry` | Free-form map for vendor-specific extras |
|
||||
|
||||
**Classification rule**: use `vendor_id` (numeric PCI ID), never `manufacturer` string.
|
||||
|
||||
| Vendor | vendor_id |
|
||||
|-----------|-----------|
|
||||
| NVIDIA | `0x10de` |
|
||||
| AMD | `0x1002` |
|
||||
| Mellanox | `0x15b3` |
|
||||
| Aspeed | `0x1a03` |
|
||||
| Intel | `0x8086` |
|
||||
|
||||
Constants live in `audit/internal/collector/pci_vendors.go`.
|
||||
|
||||
---
|
||||
|
||||
## HardwareMemory
|
||||
|
||||
`location` field exists in the Go struct with `json:"-"` — it is intentionally excluded
|
||||
from JSON output because the Reanimator schema does not include it. It is used internally
|
||||
for DIMM telemetry matching only (`collector/memory_telemetry.go`).
|
||||
|
||||
---
|
||||
|
||||
## HardwareSensors
|
||||
|
||||
Sensor structs (`HardwareFanSensor`, `HardwareTemperatureSensor`,
|
||||
`HardwarePowerSensor`, `HardwareOtherSensor`) do **not** have a `location` field.
|
||||
Location was removed in contract v2.8. The Go types mirror the schema exactly.
|
||||
|
||||
---
|
||||
|
||||
## JSON naming convention
|
||||
|
||||
All JSON keys are `snake_case`. Go field names are `CamelCase`. The mapping is
|
||||
maintained by struct tags in `audit/internal/schema/hardware.go`.
|
||||
|
||||
All pointer fields use `omitempty` — absent means not collected (not zero).
|
||||
@@ -1,5 +1,103 @@
|
||||
# Backlog
|
||||
|
||||
## Сбор SFP-модулей
|
||||
|
||||
**Статус:** не реализовано.
|
||||
|
||||
### Источник данных
|
||||
|
||||
`ethtool -m <iface>` / `ethtool --module-info <iface>` — читает EEPROM SFP/SFP+/QSFP28/QSFP-DD по стандарту MSA (SFF-8472 / SFF-8636).
|
||||
|
||||
Доступные поля из EEPROM:
|
||||
- Идентификатор модуля: `Identifier` (SFP, SFP+, QSFP28, …)
|
||||
- Тип коннектора: `Connector`
|
||||
- Вендор: `Vendor name`, `Vendor OUI`, `Vendor PN`, `Vendor SN`, `Vendor rev`
|
||||
- Оптика: `Wavelength`, `Transceiver type` (10GBase-SR, LR, DAC, …)
|
||||
- Телеметрия DOM (если модуль поддерживает): `Laser tx bias current`, `Transmit avg optical power`, `Receive avg optical power`, `Module temperature`, `Module voltage`
|
||||
- Статус: `Rx power high alarm`, `Tx power low warning`, …
|
||||
|
||||
Для QSFP28 данные повторяются на 4 канала (lane 0–3).
|
||||
|
||||
Инструмент требует root. На bee ISO — доступен (`ethtool` входит в образ).
|
||||
|
||||
### Scope для bee
|
||||
|
||||
1. Собирать список сетевых интерфейсов через `ip -j link show` (только `ether`, без `lo`/VLAN/bond).
|
||||
2. Для каждого интерфейса пробовать `ethtool -m <iface>`. Если модуль отсутствует или не поддерживает EEPROM read — тихо пропускать.
|
||||
3. Связывать интерфейс с PCIe-устройством через `ethtool -i <iface>` → поле `bus-info` (BDF) → сопоставление с `pcie_devices[].slot`.
|
||||
|
||||
### Gap в контракте
|
||||
|
||||
Текущий контракт v2.10 имеет в `pcie_devices[]` скалярные поля:
|
||||
- `sfp_temperature_c`, `sfp_tx_power_dbm`, `sfp_rx_power_dbm`, `sfp_voltage_v`, `sfp_bias_ma`
|
||||
|
||||
Этого **недостаточно**:
|
||||
- Одна NIC-карта может иметь несколько портов — нужен массив, а не скаляр.
|
||||
- Нет полей идентификации модуля (vendor, part_number, serial_number, wavelength, connector).
|
||||
- Нет разбивки по каналам для QSFP28.
|
||||
|
||||
### Предлагаемое расширение контракта
|
||||
|
||||
Добавить в `pcie_devices[]` массив `sfp_modules[]`:
|
||||
|
||||
```json
|
||||
"pcie_devices": [
|
||||
{
|
||||
"slot": "0000:3b:00.0",
|
||||
"device_class": "EthernetController",
|
||||
"model": "ConnectX-6 Dx",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09999",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 36.4,
|
||||
"voltage_v": 3.29,
|
||||
"tx_power_dbm": -1.8,
|
||||
"rx_power_dbm": -2.1,
|
||||
"bias_ma": 7.2
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Поля `sfp_modules[]`:
|
||||
|
||||
| Поле | Тип | Описание |
|
||||
|---|---|---|
|
||||
| `port` | int | Номер порта на NIC (0-based) |
|
||||
| `identifier` | string | `SFP`, `SFP+`, `QSFP28`, `QSFP-DD`, … |
|
||||
| `connector` | string | `LC`, `MPO`, `DAC`, … |
|
||||
| `vendor` | string | Производитель модуля |
|
||||
| `part_number` | string | Партномер |
|
||||
| `serial_number` | string | Серийный номер |
|
||||
| `revision` | string | Ревизия |
|
||||
| `wavelength_nm` | int | Длина волны, нм |
|
||||
| `transceiver_type` | string | `10GBase-SR`, `100GBase-SR4`, `DAC`, … |
|
||||
| `temperature_c` | float | Температура модуля, °C |
|
||||
| `voltage_v` | float | Напряжение, В |
|
||||
| `tx_power_dbm` | float | TX оптическая мощность, dBm |
|
||||
| `rx_power_dbm` | float | RX оптическая мощность, dBm |
|
||||
| `bias_ma` | float | Bias current, мА |
|
||||
|
||||
Старые скалярные поля `sfp_temperature_c` / `sfp_tx_power_dbm` / `sfp_rx_power_dbm` / `sfp_voltage_v` / `sfp_bias_ma` на уровне `pcie_devices[]` — **вывести из контракта** (deprecated), заменить на `sfp_modules[]`.
|
||||
|
||||
### Порядок реализации
|
||||
|
||||
1. Согласовать расширение контракта с Reanimator Core (bump до v2.11).
|
||||
2. Добавить `ethtool` parser в `audit/internal/collector/` — новый файл `sfp.go`.
|
||||
3. Дополнить schema в `audit/internal/schema/` типом `SFPModule`.
|
||||
4. Добавить `sfp_modules` в `PCIeDevice` в schema.
|
||||
5. Заполнять в NIC-коллекторе: связь интерфейс → BDF → `pcie_devices[].sfp_modules`.
|
||||
6. Показывать в TUI и web UI в разделе PCIe/NIC.
|
||||
|
||||
## BMC версия через IPMI
|
||||
|
||||
**Статус:** реализовано.
|
||||
|
||||
@@ -0,0 +1,41 @@
|
||||
# Decision: Skip PCIe link-speed warnings for disabled devices
|
||||
|
||||
**Date:** 2026-06-12
|
||||
**Status:** active
|
||||
|
||||
## Context
|
||||
|
||||
On HGX H100 SXM5 baseboards, the Microchip Switchtec PM41028 PSX PCIe switch
|
||||
(vendor 11F8, device 4128, NVIDIA subsystem 10DE:1643) appears in `lspci` as a
|
||||
"Memory controller". Its upstream link trains at Gen3 x2 while the device is
|
||||
capable of Gen4 x16. The device is permanently in a disabled state: memory access
|
||||
and bus-mastering are both off (Mem-, BusMaster-); `/sys/bus/pci/devices/<bdf>/enable`
|
||||
reads `0`.
|
||||
|
||||
This chip is the PCIe fabric management endpoint for the NVSwitch interconnect — it
|
||||
carries only management traffic at low bandwidth and is intentionally not activated
|
||||
by any Linux driver. The bee audit was reporting a `statusWarning` with message
|
||||
"PCIe link speed degraded" for this device, which is misleading because the device
|
||||
is not in the data path.
|
||||
|
||||
## Decision
|
||||
|
||||
`applyPCIeLinkSpeedWarning` reads `/sys/bus/pci/devices/<bdf>/enable` via the
|
||||
existing `readPCIIntAttribute` helper. If the value is `0` the function returns
|
||||
early without setting any warning status.
|
||||
|
||||
The check is vendor-agnostic: it applies to any PCIe device that Linux has not
|
||||
activated, regardless of make or model. This is consistent with the
|
||||
`no-hardcoded-vendors` contract — no vendor ID, device ID, or name string is
|
||||
used as a condition.
|
||||
|
||||
## Consequences
|
||||
|
||||
- PCIe fabric management endpoints, IPMI virtual devices, and other permanently
|
||||
disabled PCIe functions no longer produce spurious link-degradation warnings.
|
||||
- Real link degradation on active devices (GPUs, NICs, NVMe, NVLink bridges)
|
||||
continues to be detected and reported as before.
|
||||
- NVLink bridge cards retain their existing `statusCritical` path (they are always
|
||||
enabled, so the early return is never taken for them).
|
||||
- The Switchtec device on HGX H100 boards shows `statusOK` with no
|
||||
`error_description` in the audit JSON.
|
||||
@@ -7,3 +7,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||
| 2026-04-29 | Treat embedded submodules as read-only | active |
|
||||
| 2026-06-12 | Skip PCIe link-speed warnings for disabled devices | active |
|
||||
|
||||
312
bible-local/docs/grub-bitmap-error-history.md
Normal file
312
bible-local/docs/grub-bitmap-error-history.md
Normal file
@@ -0,0 +1,312 @@
|
||||
# GRUB Bitmap Error History
|
||||
|
||||
## Symptom
|
||||
|
||||
On some servers GRUB prints:
|
||||
|
||||
```text
|
||||
error: null src bitmap in grub_video_bitmap_create_scaled.
|
||||
Press any key to continue...
|
||||
```
|
||||
|
||||
The important new observation as of `v10.7` is:
|
||||
|
||||
- the error still appears even when the logo image block is removed from
|
||||
`iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt`
|
||||
- therefore the current error can no longer be explained only by
|
||||
`bee-logo.png` / `bee-logo.tga`
|
||||
|
||||
That does not prove the theme system is healthy. It proves only that the
|
||||
currently remaining failure is deeper than "bad logo file".
|
||||
|
||||
## Current State
|
||||
|
||||
Current source files:
|
||||
|
||||
- [iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt](../../iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt)
|
||||
has no `image` block anymore
|
||||
- [iso/builder/config/bootloaders/grub-efi/config.cfg](../../iso/builder/config/bootloaders/grub-efi/config.cfg)
|
||||
still does `insmod tga` and then `source /boot/grub/theme.cfg`
|
||||
|
||||
Implication:
|
||||
|
||||
- if the error still fires, the trigger is likely elsewhere in GRUB theme
|
||||
rendering or in the assets/config GRUB resolves while sourcing `theme.cfg`
|
||||
- the old "PNG parser fragility" story is no longer a sufficient explanation
|
||||
for the current failure mode
|
||||
|
||||
Current artifact facts:
|
||||
|
||||
- the provided `easy-bee-nvidia-v10.7-amd64.logs` build logs reference
|
||||
`linux-image-6.1.0-45`
|
||||
- the provided `easy-bee-nvidia-v10.7-amd64.iso` contains
|
||||
`live/initrd.img-6.1.0-45-amd64` and `live/vmlinuz-6.1.0-45-amd64`
|
||||
- a later `BOOT FAILED!` screenshot showed `live/initrd.img-6.1.0-44-amd64`
|
||||
and `live/vmlinuz-6.1.0-44-amd64`
|
||||
|
||||
Implication:
|
||||
|
||||
- the `BOOT FAILED!` screenshot is not from the same artifact as the provided
|
||||
`v10.7` ISO/log set
|
||||
- until the exact ISO filename and checksum are tied to that failure, the
|
||||
GRUB bitmap issue and the live-boot failure must be treated as separate
|
||||
problems
|
||||
|
||||
## Chronology
|
||||
|
||||
### 1. Initial bee GRUB theme introduction
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `d52ec67` `Stability hardening, build script fixes, GRUB bee logo`
|
||||
|
||||
What changed:
|
||||
|
||||
- bee-branded GRUB theme introduced
|
||||
- image block with explicit `width` / `height`
|
||||
|
||||
Observed result:
|
||||
|
||||
- bitmap error appeared
|
||||
|
||||
### 2. Remove explicit scaling dimensions
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `aa284ae` `fix(iso): avoid grub logo scaling error`
|
||||
|
||||
What changed:
|
||||
|
||||
- removed `width = 400`
|
||||
- removed `height = 400`
|
||||
|
||||
Reason stated by the change:
|
||||
|
||||
- try to avoid the scaling path
|
||||
|
||||
Observed result:
|
||||
|
||||
- error persisted
|
||||
|
||||
Conclusion:
|
||||
|
||||
- explicit width/height were not the sole trigger
|
||||
|
||||
### 3. Rework PNG handling and menu rendering
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `6112094` `fix(grub): fix bitmap error and menu rendering`
|
||||
|
||||
Commit message says the change was intended to:
|
||||
|
||||
- convert `bee-logo.png` to RGBA and strip metadata
|
||||
- move `terminal_output gfxterm` before `insmod png` / theme load
|
||||
- remove ASCII banner from GRUB menu area
|
||||
- fix theme typography/layout fields
|
||||
|
||||
Observed result:
|
||||
|
||||
- error persisted
|
||||
|
||||
Notes:
|
||||
|
||||
- this was still operating under the assumption that the issue was the PNG
|
||||
payload or the order of gfxterm/theme init
|
||||
|
||||
### 4. Convert logo PNG back to RGB
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `333c44f` `Fix GRUB splash: convert bee-logo.png from RGBA to RGB`
|
||||
|
||||
Intended reason:
|
||||
|
||||
- GRUB might dislike RGBA PNG and want RGB PNG
|
||||
|
||||
Observed result:
|
||||
|
||||
- error still persisted according to later project notes
|
||||
|
||||
### 5. Add post-build canonical GRUB/isolinux sync
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `0cdfbc5` `fix(iso): restore boot UX and boot logs`
|
||||
|
||||
What this introduced:
|
||||
|
||||
- post-`lb build` rewriting of `binary/boot/grub/grub.cfg`
|
||||
- post-`lb build` rewriting of `binary/isolinux/live.cfg`
|
||||
- forced rebuild of `binary_checksums`, `binary_iso`, `binary_zsync`
|
||||
|
||||
Why it was added:
|
||||
|
||||
- restore canonical EASY-BEE boot UX after live-build wrote its own bootloader
|
||||
outputs
|
||||
- restore expected boot menu and logs
|
||||
|
||||
Important note:
|
||||
|
||||
- this commit did not directly solve the bitmap issue
|
||||
- it added a second layer of bootloader mutation after live-build
|
||||
|
||||
### 6. Switch from PNG to TGA
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `626763e` `Fix GRUB bitmap error: switch from PNG to TGA for splash logo`
|
||||
|
||||
Commit message says:
|
||||
|
||||
- GRUB PNG reader was considered fragile
|
||||
- switch to uncompressed 24-bit TGA
|
||||
- `config.cfg`: `insmod png` -> `insmod tga`
|
||||
- `theme.txt`: `bee-logo.png` -> `bee-logo.tga`
|
||||
|
||||
Observed result:
|
||||
|
||||
- this did not eliminate the problem in the current lineage
|
||||
- today the system still errors even after the entire image block was removed
|
||||
|
||||
Conclusion:
|
||||
|
||||
- switching PNG -> TGA was not a durable root-cause fix
|
||||
|
||||
### 7. Patch EFI image after build
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `4f20c92` `Make UEFI boot safe and remove GRUB logo`
|
||||
|
||||
What this introduced:
|
||||
|
||||
- `sync_efi_grub_theme_assets`
|
||||
- direct `mtools` patching of `efi.img`
|
||||
- copying `config.cfg`, `theme.cfg`, and `live-theme/*` into the EFI FAT image
|
||||
- removal of the logo image block from `theme.txt`
|
||||
|
||||
Why it was added:
|
||||
|
||||
- make UEFI path "safe"
|
||||
- keep EFI GRUB image aligned with canonical bootloader assets
|
||||
|
||||
Observed result:
|
||||
|
||||
- later this became the direct cause of `Disk full` during build once
|
||||
`bee-logo.tga` was large enough
|
||||
- and even with the logo removed from `theme.txt`, the bitmap error still
|
||||
remained
|
||||
|
||||
Conclusion:
|
||||
|
||||
- EFI post-build patching increased build complexity
|
||||
- removing the logo alone did not remove the runtime GRUB error
|
||||
|
||||
### 8. Remove ASCII logo banners
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `14505ef` `Remove easy bee ASCII logo banners`
|
||||
|
||||
What changed:
|
||||
|
||||
- web loading page ASCII cleanup only
|
||||
|
||||
Relevance here:
|
||||
|
||||
- none for GRUB bitmap error
|
||||
- included here only to avoid confusion with other "logo removal" work
|
||||
|
||||
### 9. Remove EFI post-build patching
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `5dc022d` `Drop post-build EFI bootloader patching`
|
||||
|
||||
Why it was done:
|
||||
|
||||
- stop mutating `efi.img` post-build
|
||||
- remove dependence on `mtools` for EFI patching
|
||||
- remove the `Disk full` failure mode
|
||||
|
||||
Impact:
|
||||
|
||||
- this did not target the GRUB bitmap error directly
|
||||
- it targeted build-system complexity and EFI image overflow
|
||||
|
||||
### 10. Restore only GRUB/isolinux post-build sync
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `42774d4` `Restore post-build GRUB and isolinux sync`
|
||||
|
||||
Why it was needed:
|
||||
|
||||
- removing all post-build sync caused final ISO validation to fail with
|
||||
missing canonical EASY-BEE boot entries
|
||||
- memtest was still fine, but final GRUB menu was no longer canonical
|
||||
|
||||
What it restored:
|
||||
|
||||
- only `binary/boot/grub/grub.cfg`
|
||||
- only `binary/isolinux/live.cfg`
|
||||
|
||||
What it did not restore:
|
||||
|
||||
- no EFI FAT image patching
|
||||
- no `mtools` path
|
||||
|
||||
## What Is Proven False
|
||||
|
||||
The current evidence rules out several simplistic explanations:
|
||||
|
||||
- "the error is only caused by explicit image scaling"
|
||||
- "the error is only caused by PNG vs TGA"
|
||||
- "the error is only caused by the logo file itself"
|
||||
|
||||
Why:
|
||||
|
||||
- scaling dimensions were removed and error persisted
|
||||
- PNG was replaced with TGA and error still survived in the lineage
|
||||
- the image block itself is now absent, and the error still occurs
|
||||
|
||||
## Working Hypotheses Left
|
||||
|
||||
The remaining plausible layers are:
|
||||
|
||||
- GRUB theme engine still tries to render some bitmap-related element even
|
||||
without the logo image block
|
||||
- GRUB is resolving stale theme assets from the built EFI/ISO path rather than
|
||||
what we think the source tree says
|
||||
- `theme.cfg` / `theme.txt` / GRUB module loading order still triggers a bitmap
|
||||
code path elsewhere
|
||||
- live-build may still package a stale `theme.txt` or stale `live-theme`
|
||||
directory into the final image
|
||||
- the GRUB environment on the failing hardware may behave differently from the
|
||||
assumptions in our source tree
|
||||
|
||||
## Decision Boundary
|
||||
|
||||
Before making another change, the next step should be evidence gathering from
|
||||
the real built artifact, not another speculative edit.
|
||||
|
||||
That means checking on the actual built ISO or EFI image:
|
||||
|
||||
- exact `boot/grub/theme.cfg`
|
||||
- exact `boot/grub/live-theme/theme.txt`
|
||||
- exact contents of `boot/grub/live-theme/`
|
||||
- whether the final image still contains a stale logo reference
|
||||
- whether the EFI path and non-EFI path differ
|
||||
|
||||
## Relevant Commits
|
||||
|
||||
- `d52ec67` `Stability hardening, build script fixes, GRUB bee logo`
|
||||
- `aa284ae` `fix(iso): avoid grub logo scaling error`
|
||||
- `6112094` `fix(grub): fix bitmap error and menu rendering`
|
||||
- `333c44f` `Fix GRUB splash: convert bee-logo.png from RGBA to RGB`
|
||||
- `0cdfbc5` `fix(iso): restore boot UX and boot logs`
|
||||
- `626763e` `Fix GRUB bitmap error: switch from PNG to TGA for splash logo`
|
||||
- `4f20c92` `Make UEFI boot safe and remove GRUB logo`
|
||||
- `5dc022d` `Drop post-build EFI bootloader patching`
|
||||
- `42774d4` `Restore post-build GRUB and isolinux sync`
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
title: Hardware Ingest JSON Contract
|
||||
version: "2.10"
|
||||
updated: "2026-04-29"
|
||||
version: "2.11"
|
||||
updated: "2026-06-19"
|
||||
maintainer: Reanimator Core
|
||||
audience: external-integrators, ai-agents
|
||||
language: ru
|
||||
@@ -9,7 +9,7 @@ language: ru
|
||||
|
||||
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
|
||||
|
||||
Версия: **2.10** · Дата: **2026-04-29**
|
||||
Версия: **2.11** · Дата: **2026-06-19**
|
||||
|
||||
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
|
||||
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
|
||||
@@ -22,6 +22,7 @@ language: ru
|
||||
|
||||
| Версия | Дата | Изменения |
|
||||
|--------|------|-----------|
|
||||
| 2.11 | 2026-06-19 | В `pcie_devices[]` добавлен необязательный массив `sfp_modules[]` с идентификацией и DOM telemetry SFP/QSFP-модулей. Скалярные поля `sfp_temperature_c` / `sfp_tx_power_dbm` / `sfp_rx_power_dbm` / `sfp_voltage_v` / `sfp_bias_ma` помечены как deprecated (принимаются, но `sfp_modules[]` имеет приоритет) |
|
||||
| 2.10 | 2026-04-29 | Для `hardware.storage[]` добавлены необязательные числовые поля `logical_block_size_bytes`, `physical_block_size_bytes`, `metadata_bytes_per_block` для нормализованного описания формата блока накопителя |
|
||||
| 2.9 | 2026-03-19 | Добавлена необязательная секция `hardware.platform_config` — произвольный объект с настройками платформы (BIOS/Redfish); хранится как latest-snapshot per machine |
|
||||
| 2.8 | 2026-03-15 | Поле `location` удалено из всех `sensors.*`; сенсоры передаются только по `name` и измеренным значениям |
|
||||
@@ -422,11 +423,12 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
| `battery_temperature_c` | float | нет | Температура батареи / supercap, °C |
|
||||
| `battery_voltage_v` | float | нет | Напряжение батареи / supercap, В |
|
||||
| `battery_replace_required` | bool | нет | Требуется замена батареи / supercap |
|
||||
| `sfp_temperature_c` | float | нет | Температура SFP/optic, °C |
|
||||
| `sfp_tx_power_dbm` | float | нет | TX optical power, dBm |
|
||||
| `sfp_rx_power_dbm` | float | нет | RX optical power, dBm |
|
||||
| `sfp_voltage_v` | float | нет | Напряжение SFP, В |
|
||||
| `sfp_bias_ma` | float | нет | Bias current SFP, мА |
|
||||
| `sfp_temperature_c` | float | нет | Температура SFP/optic, °C *(deprecated since 2.11)* |
|
||||
| `sfp_tx_power_dbm` | float | нет | TX optical power, dBm *(deprecated since 2.11)* |
|
||||
| `sfp_rx_power_dbm` | float | нет | RX optical power, dBm *(deprecated since 2.11)* |
|
||||
| `sfp_voltage_v` | float | нет | Напряжение SFP, В *(deprecated since 2.11)* |
|
||||
| `sfp_bias_ma` | float | нет | Bias current SFP, мА *(deprecated since 2.11)* |
|
||||
| `sfp_modules` | array | нет | Установленные SFP/QSFP-модули по портам (см. sfp_modules[]) |
|
||||
| `bdf` | string | нет | Deprecated alias для `slot`; при наличии ingest нормализует его в `slot` |
|
||||
| `device_class` | string | нет | Класс устройства (см. список ниже) |
|
||||
| `manufacturer` | string | нет | Производитель |
|
||||
@@ -444,10 +446,43 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
`numa_node` передавайте для NIC / InfiniBand / RAID / GPU, когда источник знает CPU/NUMA affinity. Поле сохраняется в snapshot-атрибутах PCIe-компонента и дублируется в telemetry для topology use cases.
|
||||
Поля `temperature_c` и `power_w` используйте для device-level telemetry GPU / accelerator / smart PCIe devices. Они не влияют на идентификацию компонента.
|
||||
|
||||
**Deprecated поля sfp_\*:** Скалярные поля `sfp_temperature_c`, `sfp_tx_power_dbm`, `sfp_rx_power_dbm`, `sfp_voltage_v`, `sfp_bias_ma` продолжают приниматься, но помечены как deprecated since 2.11. Если в payload одновременно присутствуют `sfp_modules[]` и deprecated sfp_-скаляры — приоритет у `sfp_modules[]`, скаляры игнорируются. Deprecated поля будут удалены в версии 3.0.
|
||||
|
||||
**Генерация serial_number при отсутствии или `"N/A"`:** `{board_serial}-PCIE-{slot}`, где `slot` для PCIe равен BDF.
|
||||
|
||||
`slot` — единственный канонический адрес компонента. Для PCIe в `slot` передавайте BDF. Поле `bdf` сохраняется только как переходный alias на входе и не должно использоваться как отдельная координата рядом со `slot`.
|
||||
|
||||
#### pcie_devices[].sfp_modules[]
|
||||
|
||||
Необязательный массив установленных SFP/QSFP-модулей для данного PCIe-устройства. Один элемент — один порт. Используйте для многопортовых NIC (ConnectX-6 Dx, Intel X710, Mellanox HDR и др.).
|
||||
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `port` | int | **да** | Номер порта на NIC (0-based). Ключ дедупликации внутри устройства |
|
||||
| `identifier` | string | нет | Тип модуля: `SFP`, `SFP+`, `SFP28`, `QSFP+`, `QSFP28`, `QSFP-DD`, `DAC` |
|
||||
| `connector` | string | нет | Тип разъёма: `LC`, `MPO`, `RJ45`, `DAC`, `AOC`, `No separable connector` |
|
||||
| `vendor` | string | нет | Производитель модуля из EEPROM |
|
||||
| `part_number` | string | нет | Партномер из EEPROM |
|
||||
| `serial_number` | string | нет | Серийный номер из EEPROM |
|
||||
| `revision` | string | нет | Ревизия из EEPROM |
|
||||
| `wavelength_nm` | int | нет | Длина волны, нм (0 для DAC/медных кабелей) |
|
||||
| `transceiver_type` | string | нет | `10GBase-SR`, `10GBase-LR`, `25GBase-SR`, `100GBase-SR4`, `DAC`, … |
|
||||
| `temperature_c` | float | нет | Температура модуля, °C (DOM telemetry) |
|
||||
| `voltage_v` | float | нет | Напряжение питания, В (DOM telemetry) |
|
||||
| `tx_power_dbm` | float | нет | TX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `rx_power_dbm` | float | нет | RX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `bias_ma` | float | нет | Bias current, мА (DOM telemetry) |
|
||||
|
||||
**Ключ дедупликации:** `(pcie_devices[].slot, sfp_modules[].port)`.
|
||||
|
||||
**Правила ingest:**
|
||||
- При каждом импорте — полная замена `sfp_modules[]` для данного `pcie_devices[].slot` (upsert всего массива целиком).
|
||||
- Если `sfp_modules` отсутствует или `null` — существующие данные SFP не трогать.
|
||||
- Если `sfp_modules: []` (пустой массив) — трактовать как «модули не обнаружены», очистить сохранённые данные.
|
||||
- Дубли по `port` внутри одного `pcie_devices[]` — невалидны, endpoint возвращает `400` с описанием поля.
|
||||
- Модули без `serial_number` допустимы (многие DAC-кабели не имеют SN); сохраняются по ключу `(slot, port)`.
|
||||
- Изменение `serial_number` или `part_number` модуля на порту создаёт событие `COMPONENT_CHANGED` для PCIe-устройства с описанием «SFP module replaced on port N».
|
||||
|
||||
**Значения `device_class`:**
|
||||
|
||||
| Значение | Назначение |
|
||||
@@ -472,16 +507,47 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
"numa_node": 0,
|
||||
"temperature_c": 48.5,
|
||||
"power_w": 18.2,
|
||||
"sfp_temperature_c": 36.2,
|
||||
"sfp_tx_power_dbm": -1.8,
|
||||
"sfp_rx_power_dbm": -2.1,
|
||||
"device_class": "EthernetController",
|
||||
"manufacturer": "Intel",
|
||||
"model": "X710 10GbE",
|
||||
"serial_number": "K65472-003",
|
||||
"firmware": "9.20 0x8000d4ae",
|
||||
"manufacturer": "Mellanox",
|
||||
"model": "ConnectX-6 Dx",
|
||||
"serial_number": "MT2012X12345",
|
||||
"firmware": "22.35.2010",
|
||||
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
|
||||
"status": "OK"
|
||||
"status": "OK",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09999",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 36.4,
|
||||
"voltage_v": 3.29,
|
||||
"tx_power_dbm": -1.8,
|
||||
"rx_power_dbm": -2.1,
|
||||
"bias_ma": 7.2
|
||||
},
|
||||
{
|
||||
"port": 1,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09998",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 35.9,
|
||||
"voltage_v": 3.28,
|
||||
"tx_power_dbm": -1.9,
|
||||
"rx_power_dbm": -2.3,
|
||||
"bias_ma": 7.1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
@@ -793,7 +859,24 @@ PSU без `serial_number` игнорируется.
|
||||
"model": "X710 10GbE",
|
||||
"serial_number": "K65472-003",
|
||||
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
|
||||
"status": "OK"
|
||||
"status": "OK",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "SFP+",
|
||||
"connector": "LC",
|
||||
"vendor": "Intel",
|
||||
"part_number": "FTLX8574D3BCV-IT",
|
||||
"serial_number": "FNS123456789",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "10GBase-SR",
|
||||
"temperature_c": 34.1,
|
||||
"voltage_v": 3.30,
|
||||
"tx_power_dbm": -2.5,
|
||||
"rx_power_dbm": -3.0,
|
||||
"bias_ma": 6.8
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"power_supplies": [
|
||||
|
||||
201
bible-local/proposals/2026-06-19-sfp-modules-contract-rfc.md
Normal file
201
bible-local/proposals/2026-06-19-sfp-modules-contract-rfc.md
Normal file
@@ -0,0 +1,201 @@
|
||||
# RFC: SFP Module Identity and Telemetry in Hardware Ingest Contract
|
||||
|
||||
**Date:** 2026-06-19
|
||||
**Status:** proposal
|
||||
**Target:** Reanimator Core — ingest schema, validation, and storage
|
||||
**Requested by:** bee (hardware audit LiveCD)
|
||||
**Contract version affected:** 2.10 → 2.11
|
||||
|
||||
---
|
||||
|
||||
## Задача для агента
|
||||
|
||||
Внести изменения в контракт ingest и реализовать их в Reanimator Core:
|
||||
|
||||
1. Добавить массив `sfp_modules[]` внутри `pcie_devices[]`.
|
||||
2. Обновить JSON-схему и валидатор ingest endpoint.
|
||||
3. Реализовать хранение/upsert данных SFP-модулей.
|
||||
4. Пометить скалярные поля `sfp_*` на уровне `pcie_devices[]` как deprecated.
|
||||
5. Обновить документ контракта (`hardware-ingest-contract.md`) до версии 2.11.
|
||||
|
||||
Ниже — всё необходимое для принятия решений без дополнительных вопросов.
|
||||
|
||||
---
|
||||
|
||||
## Контекст: откуда берутся данные
|
||||
|
||||
**Источник в bee:** `ethtool -m <iface>` (читает EEPROM SFP/SFP+/QSFP28/QSFP-DD по стандартам MSA SFF-8472 / SFF-8636; для QSFP-DD — CMIS).
|
||||
|
||||
**Связь с PCIe:** `ethtool -i <iface>` возвращает `bus-info` = BDF (`0000:3b:00.0`), который совпадает с `pcie_devices[].slot`. Так bee связывает SFP-данные конкретного интерфейса с PCIe-устройством.
|
||||
|
||||
**Один NIC — несколько модулей:** карта ConnectX-6 Dx (2 порта), Intel X710 (4 порта), Mellanox HDR (2 порта). Каждый порт — отдельный `ethtool -m`, отдельный SFP-модуль. Одного скаляра на устройство недостаточно.
|
||||
|
||||
**QSFP28/QSFP-DD:** 4-канальные модули возвращают telemetry отдельно по каждому каналу (lane). В предложенной схеме lane-уровень не включён в первую версию — только агрегированные значения модуля в целом. Расширение до lane-уровня — отдельный RFC, если понадобится.
|
||||
|
||||
---
|
||||
|
||||
## Проблема с текущим контрактом v2.10
|
||||
|
||||
В `pcie_devices[]` есть пять скалярных полей:
|
||||
|
||||
```
|
||||
sfp_temperature_c float
|
||||
sfp_tx_power_dbm float
|
||||
sfp_rx_power_dbm float
|
||||
sfp_voltage_v float
|
||||
sfp_bias_ma float
|
||||
```
|
||||
|
||||
Ограничения:
|
||||
- **Нет идентификации модуля** — vendor, part_number, serial_number, wavelength отсутствуют; модуль нельзя инвентаризировать как самостоятельный компонент.
|
||||
- **Только один набор значений на устройство** — невозможно описать 4-портовый NIC.
|
||||
- **Нет типа модуля** — SFP, QSFP28, DAC-кабель не различаются.
|
||||
- **Нет connector/transceiver_type** — невозможно понять, оптика это или медь.
|
||||
|
||||
---
|
||||
|
||||
## Предлагаемое изменение схемы
|
||||
|
||||
### Новая структура `sfp_modules[]`
|
||||
|
||||
Добавляется как необязательное поле внутри каждого объекта `pcie_devices[]`.
|
||||
|
||||
```json
|
||||
"pcie_devices": [
|
||||
{
|
||||
"slot": "0000:3b:00.0",
|
||||
"device_class": "EthernetController",
|
||||
"model": "ConnectX-6 Dx",
|
||||
"manufacturer": "Mellanox",
|
||||
"serial_number": "MT2012X12345",
|
||||
"status": "OK",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09999",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 36.4,
|
||||
"voltage_v": 3.29,
|
||||
"tx_power_dbm": -1.8,
|
||||
"rx_power_dbm": -2.1,
|
||||
"bias_ma": 7.2
|
||||
},
|
||||
{
|
||||
"port": 1,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09998",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 35.9,
|
||||
"voltage_v": 3.28,
|
||||
"tx_power_dbm": -1.9,
|
||||
"rx_power_dbm": -2.3,
|
||||
"bias_ma": 7.1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### Поля `sfp_modules[]`
|
||||
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|---|---|---|---|
|
||||
| `port` | int | **да** | Номер порта на NIC (0-based). Ключ дедупликации внутри устройства. |
|
||||
| `identifier` | string | нет | Тип модуля: `SFP`, `SFP+`, `SFP28`, `QSFP+`, `QSFP28`, `QSFP-DD`, `DAC` |
|
||||
| `connector` | string | нет | Тип разъёма: `LC`, `MPO`, `RJ45`, `DAC`, `AOC`, `No separable connector` |
|
||||
| `vendor` | string | нет | Производитель модуля из EEPROM |
|
||||
| `part_number` | string | нет | Партномер из EEPROM |
|
||||
| `serial_number` | string | нет | Серийный номер из EEPROM |
|
||||
| `revision` | string | нет | Ревизия из EEPROM |
|
||||
| `wavelength_nm` | int | нет | Длина волны, нм (0 для DAC/медных кабелей) |
|
||||
| `transceiver_type` | string | нет | `10GBase-SR`, `10GBase-LR`, `25GBase-SR`, `100GBase-SR4`, `DAC`, … |
|
||||
| `temperature_c` | float | нет | Температура модуля, °C (DOM telemetry) |
|
||||
| `voltage_v` | float | нет | Напряжение питания, В (DOM telemetry) |
|
||||
| `tx_power_dbm` | float | нет | TX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `rx_power_dbm` | float | нет | RX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `bias_ma` | float | нет | Bias current, мА (DOM telemetry) |
|
||||
|
||||
**Ключ дедупликации:** `(pcie_devices[].slot, sfp_modules[].port)`.
|
||||
|
||||
**Модули без серийного номера** — допустимы; многие DAC-кабели не имеют SN. Не игнорировать, сохранять по ключу `(slot, port)`.
|
||||
|
||||
---
|
||||
|
||||
## Deprecated поля
|
||||
|
||||
Следующие поля на уровне `pcie_devices[]` помечаются как **deprecated** начиная с v2.11:
|
||||
|
||||
```
|
||||
sfp_temperature_c
|
||||
sfp_tx_power_dbm
|
||||
sfp_rx_power_dbm
|
||||
sfp_voltage_v
|
||||
sfp_bias_ma
|
||||
```
|
||||
|
||||
**Поведение при получении deprecated полей:**
|
||||
- Продолжать принимать и сохранять (не ломать существующих интеграторов).
|
||||
- Если одновременно присутствуют `sfp_modules[]` и deprecated скаляры — приоритет у `sfp_modules[]`; скаляры игнорируются.
|
||||
- В документации пометить как `deprecated since 2.11, will be removed in 3.0`.
|
||||
|
||||
**Не удалять** deprecated поля из валидации в этом PR — только пометить в документации и changelog.
|
||||
|
||||
---
|
||||
|
||||
## Правила ingest для `sfp_modules[]`
|
||||
|
||||
- `sfp_modules[]` хранится как snapshot-атрибут PCIe-компонента (аналогично `mac_addresses`).
|
||||
- При каждом импорте — полная замена `sfp_modules[]` для данного `pcie_devices[].slot` (upsert всего массива целиком, не merge по портам).
|
||||
- Если `sfp_modules` отсутствует или `null` — существующие данные SFP не трогать (не затирать).
|
||||
- Если `sfp_modules: []` (пустой массив) — трактовать как «модули не обнаружены», очистить сохранённые данные.
|
||||
- Изменение `serial_number` или `part_number` модуля на порту — создавать событие `COMPONENT_CHANGED` для PCIe-устройства с описанием «SFP module replaced on port N».
|
||||
|
||||
---
|
||||
|
||||
## Изменения в документе контракта
|
||||
|
||||
Файл: `bible-local/docs/hardware-ingest-contract.md`
|
||||
|
||||
1. Заголовок версии: `2.10` → `2.11`, дата → `2026-06-19`.
|
||||
2. Добавить в changelog:
|
||||
```
|
||||
| 2.11 | 2026-06-19 | В `pcie_devices[]` добавлен необязательный массив `sfp_modules[]`
|
||||
с идентификацией и DOM telemetry SFP/QSFP-модулей. Скалярные поля
|
||||
sfp_temperature_c / sfp_tx_power_dbm / sfp_rx_power_dbm / sfp_voltage_v /
|
||||
sfp_bias_ma помечены как deprecated (принимаются, но sfp_modules[] имеет приоритет). |
|
||||
```
|
||||
3. В секции `pcie_devices` добавить строку в таблицу полей:
|
||||
```
|
||||
| `sfp_modules` | array | нет | Установленные SFP/QSFP-модули по портам (см. sfp_modules[]) |
|
||||
```
|
||||
4. Добавить подсекцию `#### pcie_devices[].sfp_modules[]` с таблицей полей и примером JSON (из раздела выше).
|
||||
5. Пометить deprecated поля в таблице: добавить суффикс `*(deprecated since 2.11)*`.
|
||||
6. Обновить полный пример JSON — добавить `sfp_modules` к NIC-записи в `pcie_devices`.
|
||||
|
||||
---
|
||||
|
||||
## Что не нужно делать в этом PR
|
||||
|
||||
- Не добавлять lane-level данные QSFP (tx_power_dbm_lane_0 и т.п.) — отдельный RFC.
|
||||
- Не удалять deprecated поля — только пометить.
|
||||
- Не создавать отдельную top-level секцию `network_ports` — данные остаются вложенными в `pcie_devices`.
|
||||
- Не менять логику идентификации PCIe-компонента — `serial_number` SFP-модуля не является ключом для самостоятельного компонента.
|
||||
|
||||
---
|
||||
|
||||
## Валидация
|
||||
|
||||
Единственное обязательное поле в `sfp_modules[]` — `port` (int, >= 0).
|
||||
Все остальные поля опциональны.
|
||||
Дубли по `port` внутри одного `pcie_devices[]` — невалидны, возвращать `400` с описанием поля.
|
||||
Submodule internal/chart updated: ac8120c8ab...2a15bc87f1
@@ -9,7 +9,7 @@ NCCL_TESTS_VERSION=2.13.10
|
||||
NVCC_VERSION=12.8
|
||||
CUBLAS_VERSION=13.1.1.3-1
|
||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||
DCGM_VERSION=4.5.3-1
|
||||
DCGM_VERSION=4.6.0-1
|
||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||
ROCM_VERSION=6.3.4
|
||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||
|
||||
@@ -38,7 +38,7 @@ lb config noauto \
|
||||
--memtest memtest86+ \
|
||||
--iso-volume "${LB_ISO_VOLUME}" \
|
||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||
--bootappend-live "boot=live live-media-label=${LB_ISO_VOLUME} components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--bootappend-live "boot=live live-media=/dev/disk/by-label/${LB_ISO_VOLUME} live-media-label=${LB_ISO_VOLUME} components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||
--debootstrap-options "--include=ca-certificates" \
|
||||
--apt-recommends false \
|
||||
--chroot-squashfs-compression-type zstd \
|
||||
|
||||
@@ -8,7 +8,7 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
||||
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
|
||||
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
||||
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/cache}"
|
||||
AUTH_KEYS=""
|
||||
CLEAN_CACHE=0
|
||||
VARIANT="all"
|
||||
@@ -54,14 +54,14 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
||||
"${CACHE_DIR:?}/bee" \
|
||||
"${CACHE_DIR:?}/lb-packages"
|
||||
echo "=== cleaning live-build work dirs ==="
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
|
||||
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nvidia-legacy"
|
||||
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nogpu"
|
||||
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nvidia"
|
||||
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nvidia-legacy"
|
||||
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-amd"
|
||||
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nogpu"
|
||||
echo "=== caches cleared, proceeding with build ==="
|
||||
fi
|
||||
|
||||
|
||||
@@ -51,8 +51,8 @@ case "$BUILD_VARIANT" in
|
||||
;;
|
||||
esac
|
||||
|
||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
||||
BUILD_WORK_DIR="${DIST_DIR}/cache/live-build-work-${BUILD_VARIANT}"
|
||||
OVERLAY_STAGE_DIR="${DIST_DIR}/cache/overlay-stage-${BUILD_VARIANT}"
|
||||
|
||||
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||
|
||||
@@ -63,7 +63,7 @@ export PATH="$PATH:/usr/local/go/bin"
|
||||
|
||||
# Allow git to read the bind-mounted repo (different UID inside container).
|
||||
git config --global safe.directory "${REPO_ROOT}"
|
||||
mkdir -p "${DIST_DIR}"
|
||||
mkdir -p "${DIST_DIR}/cache" "${DIST_DIR}/release"
|
||||
mkdir -p "${CACHE_ROOT}"
|
||||
: "${GOCACHE:=${CACHE_ROOT}/go-build}"
|
||||
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
|
||||
@@ -894,13 +894,11 @@ FULL_BUILD_MARKER="${BUILD_WORK_DIR}/.bee-full-build-marker"
|
||||
# hooks, archives, Dockerfile, auto/config) require a full lb build.
|
||||
needs_full_build() {
|
||||
[ -f "${FULL_BUILD_MARKER}" ] || return 0
|
||||
[ -f "${BUILD_WORK_DIR}/binary/live/filesystem.squashfs" ] || return 0
|
||||
[ -f "${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso" ] || return 0
|
||||
_extra_sq=$(find "${BUILD_WORK_DIR}/binary/live" -maxdepth 1 -type f -name '*.squashfs' ! -name 'filesystem.squashfs' 2>/dev/null | head -1)
|
||||
if [ -n "$_extra_sq" ]; then
|
||||
echo "=== full build required: multi-squashfs live image present ==="
|
||||
return 0
|
||||
fi
|
||||
# Accept any versioned squashfs (filesystem-v*.squashfs or legacy filesystem.squashfs)
|
||||
_any_sq=$(find "${BUILD_WORK_DIR}/binary/live" -maxdepth 1 \
|
||||
-name 'filesystem*.squashfs' 2>/dev/null | head -1)
|
||||
[ -n "$_any_sq" ] || return 0
|
||||
|
||||
_heavy=$(find \
|
||||
"${BUILDER_DIR}/VERSIONS" \
|
||||
@@ -923,34 +921,46 @@ needs_full_build() {
|
||||
# Fast-path: unsquash existing filesystem, rsync overlay on top, repack.
|
||||
# Requires ~10 GB free in BEE_CACHE_DIR for the unpacked squashfs.
|
||||
fast_path_repack_squashfs() {
|
||||
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
||||
_old_sq=$(find "${BUILD_WORK_DIR}/binary/live" -maxdepth 1 \
|
||||
-name 'filesystem*.squashfs' | sort | head -1)
|
||||
_sq="${BUILD_WORK_DIR}/binary/live/${SQUASHFS_FILENAME}"
|
||||
_tmp="${BEE_CACHE_DIR}/fast-unsquash-${BUILD_VARIANT}"
|
||||
echo "=== fast-path: unsquash ($(du -sh "$_sq" | cut -f1) compressed) ==="
|
||||
echo "=== fast-path: unsquash $(basename "$_old_sq") ($(du -sh "$_old_sq" | cut -f1) compressed) ==="
|
||||
rm -rf "$_tmp"
|
||||
unsquashfs -d "$_tmp" "$_sq"
|
||||
unsquashfs -d "$_tmp" "$_old_sq"
|
||||
echo "=== fast-path: syncing overlay stage ==="
|
||||
rsync -a --checksum "${OVERLAY_STAGE_DIR}/" "$_tmp/"
|
||||
echo "=== fast-path: repacking squashfs ==="
|
||||
echo "=== fast-path: repacking as ${SQUASHFS_FILENAME} ==="
|
||||
_sq_new="${_sq}.new"
|
||||
rm -f "$_sq_new"
|
||||
mksquashfs "$_tmp" "$_sq_new" -comp zstd -b 1048576 -noappend -no-progress
|
||||
mksquashfs "$_tmp" "$_sq_new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs
|
||||
mv "$_sq_new" "$_sq"
|
||||
rm -rf "$_tmp"
|
||||
[ "$_old_sq" != "$_sq" ] && rm -f "$_old_sq"
|
||||
echo "=== fast-path: squashfs repacked ($(du -sh "$_sq" | cut -f1)) ==="
|
||||
}
|
||||
|
||||
# Fast-path: rebuild ISO by replacing only live/filesystem.squashfs via xorriso.
|
||||
# Fast-path: rebuild ISO replacing the squashfs via xorriso.
|
||||
# Boot structure (El Torito, EFI, MBR hybrid) is replayed from the prior ISO.
|
||||
fast_path_rebuild_iso() {
|
||||
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
||||
_sq="${BUILD_WORK_DIR}/binary/live/${SQUASHFS_FILENAME}"
|
||||
_prior="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso"
|
||||
_new="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso.new"
|
||||
echo "=== fast-path: rebuilding ISO with xorriso ==="
|
||||
rm -f "$_new"
|
||||
# Remove any old squashfs entries from the prior ISO before adding the new one
|
||||
_old_entries=$(xorriso -indev "$_prior" -find /live -name 'filesystem*.squashfs' -- 2>/dev/null \
|
||||
| grep -E '^/live/filesystem.*\.squashfs$' || true)
|
||||
_rm_args=""
|
||||
for _e in $_old_entries; do
|
||||
_rm_args="$_rm_args -rm $_e --"
|
||||
done
|
||||
# shellcheck disable=SC2086
|
||||
xorriso \
|
||||
-indev "$_prior" \
|
||||
-outdev "$_new" \
|
||||
-map "$_sq" /live/filesystem.squashfs \
|
||||
${_rm_args} \
|
||||
-map "$_sq" /live/${SQUASHFS_FILENAME} \
|
||||
-boot_image any replay \
|
||||
-commit
|
||||
mv "$_new" "$_prior"
|
||||
@@ -986,7 +996,6 @@ split_live_squashfs_layers() {
|
||||
tmp_root="$(mktemp -d)"
|
||||
tmp_usr="$(mktemp -d)"
|
||||
tmp_fw="$(mktemp -d)"
|
||||
trap 'rm -rf "$tmp_root" "$tmp_usr" "$tmp_fw"' RETURN
|
||||
|
||||
echo "=== splitting live squashfs into smaller layers ==="
|
||||
unsquashfs -d "$tmp_root/root" "$base_sq" >/dev/null
|
||||
@@ -998,22 +1007,21 @@ split_live_squashfs_layers() {
|
||||
move_tree_to_layer "$tmp_root/root" "boot/firmware" "$tmp_fw/root"
|
||||
|
||||
rm -f "$usr_sq" "$fw_sq"
|
||||
mksquashfs "$tmp_root/root" "${base_sq}.new" -comp zstd -b 1048576 -noappend -no-progress >/dev/null
|
||||
mksquashfs "$tmp_root/root" "${base_sq}.new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs >/dev/null
|
||||
mv "${base_sq}.new" "$base_sq"
|
||||
|
||||
if dir_has_entries "$tmp_usr/root"; then
|
||||
mksquashfs "$tmp_usr/root" "${usr_sq}.new" -comp zstd -b 1048576 -noappend -no-progress >/dev/null
|
||||
mksquashfs "$tmp_usr/root" "${usr_sq}.new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs >/dev/null
|
||||
mv "${usr_sq}.new" "$usr_sq"
|
||||
fi
|
||||
if dir_has_entries "$tmp_fw/root"; then
|
||||
mksquashfs "$tmp_fw/root" "${fw_sq}.new" -comp zstd -b 1048576 -noappend -no-progress >/dev/null
|
||||
mksquashfs "$tmp_fw/root" "${fw_sq}.new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs >/dev/null
|
||||
mv "${fw_sq}.new" "$fw_sq"
|
||||
fi
|
||||
|
||||
echo "=== live squashfs layers ==="
|
||||
find "$live_dir" -maxdepth 1 -type f -name '*.squashfs' -exec du -sh {} \; | sort
|
||||
rm -rf "$tmp_root" "$tmp_usr" "$tmp_fw"
|
||||
trap - RETURN
|
||||
}
|
||||
|
||||
recover_iso_memtest() {
|
||||
@@ -1094,9 +1102,10 @@ recover_iso_memtest() {
|
||||
}
|
||||
|
||||
PROJECT_VERSION_EFFECTIVE="$(resolve_project_version)"
|
||||
SQUASHFS_FILENAME="filesystem-v${PROJECT_VERSION_EFFECTIVE}.squashfs"
|
||||
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${PROJECT_VERSION_EFFECTIVE}-amd64"
|
||||
# Versioned output directory: dist/release/easy-bee-v4.1/ — all final artefacts live here.
|
||||
OUT_DIR="${DIST_DIR}/easy-bee-v${PROJECT_VERSION_EFFECTIVE}"
|
||||
OUT_DIR="${DIST_DIR}/release/easy-bee-v${PROJECT_VERSION_EFFECTIVE}"
|
||||
ISO_VERSION_LABEL_TOKEN="$(printf '%s' "${PROJECT_VERSION_EFFECTIVE}" | tr '[:lower:].-' '[:upper:]__')"
|
||||
mkdir -p "${OUT_DIR}"
|
||||
LOG_DIR="${OUT_DIR}/${ISO_BASENAME}.logs"
|
||||
@@ -1281,7 +1290,7 @@ run_step "sync git submodules" "05-git-submodules" \
|
||||
|
||||
# --- compile bee binary (static, Linux amd64) ---
|
||||
# Shared between variants — built once, reused on second pass.
|
||||
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
||||
BEE_BIN="${DIST_DIR}/cache/bee-linux-amd64"
|
||||
NEED_BUILD=1
|
||||
if [ -f "$BEE_BIN" ]; then
|
||||
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
||||
@@ -1312,16 +1321,16 @@ else
|
||||
fi
|
||||
|
||||
# --- NVIDIA-only build steps ---
|
||||
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
|
||||
GPU_BURN_WORKER_BIN="${DIST_DIR}/cache/bee-gpu-burn-worker-linux-amd64"
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
run_step "download cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace" "20-cublas" \
|
||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||
"${CUBLAS_VERSION}" \
|
||||
"${CUDA_USERSPACE_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}"
|
||||
"${DIST_DIR}/cache"
|
||||
|
||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
CUBLAS_CACHE="${DIST_DIR}/cache/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
echo "=== bee-gpu-burn FP4 header probe ==="
|
||||
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||
@@ -1410,6 +1419,13 @@ rm -rf \
|
||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-recover"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-dcgmproftester-staggered"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-check-nvswitch"
|
||||
rm -rf "${OVERLAY_STAGE_DIR}/etc/systemd/system/nvidia-fabricmanager.service.d"
|
||||
fi
|
||||
|
||||
# --- inject authorized_keys for SSH access ---
|
||||
@@ -1447,7 +1463,7 @@ fi
|
||||
|
||||
# --- copy bee binary into overlay ---
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
cp "$BEE_BIN" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||
@@ -1464,7 +1480,7 @@ cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smokete
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||
|
||||
# --- vendor utilities (optional pre-fetched binaries) ---
|
||||
for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
||||
for tool in storcli64 sas2ircu sas3ircu arcconf ssacli saa; do
|
||||
if [ -f "${VENDOR_DIR}/${tool}" ]; then
|
||||
cp "${VENDOR_DIR}/${tool}" "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}" || true
|
||||
@@ -1474,13 +1490,24 @@ for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
||||
fi
|
||||
done
|
||||
|
||||
# saa companion directories — saa searches for these relative to CWD (/usr/local/bin)
|
||||
for saa_subdir in acpica_bin ExternalData tool stunnel GO_SNMP; do
|
||||
if [ -d "${VENDOR_DIR}/${saa_subdir}" ]; then
|
||||
cp -r "${VENDOR_DIR}/${saa_subdir}" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
||||
find "${OVERLAY_STAGE_DIR}/usr/local/bin/${saa_subdir}" -type f -exec chmod +x {} \; 2>/dev/null || true
|
||||
echo "vendor saa: ${saa_subdir}/ (included)"
|
||||
else
|
||||
echo "vendor saa: ${saa_subdir}/ (not found, skipped)"
|
||||
fi
|
||||
done
|
||||
|
||||
# --- NVIDIA kernel modules and userspace libs ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}/cache" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
||||
|
||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
NVIDIA_CACHE="${DIST_DIR}/cache/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||
|
||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||
@@ -1506,9 +1533,9 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
|
||||
# --- build / download NCCL ---
|
||||
run_step "download NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" "50-nccl" \
|
||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}/cache" "${NCCL_SHA256:-}"
|
||||
|
||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
NCCL_CACHE="${DIST_DIR}/cache/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||
|
||||
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||
@@ -1524,19 +1551,19 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
"${NCCL_TESTS_VERSION}" \
|
||||
"${NCCL_VERSION}" \
|
||||
"${NCCL_CUDA_VERSION}" \
|
||||
"${DIST_DIR}" \
|
||||
"${DIST_DIR}/cache" \
|
||||
"${NVCC_VERSION}" \
|
||||
"${DEBIAN_VERSION}"
|
||||
|
||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
NCCL_TESTS_CACHE="${DIST_DIR}/cache/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||
echo "=== all_reduce_perf injected ==="
|
||||
|
||||
run_step "build john jumbo ${JOHN_JUMBO_COMMIT}" "70-john" \
|
||||
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
||||
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
|
||||
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}/cache"
|
||||
JOHN_CACHE="${DIST_DIR}/cache/john-${JOHN_JUMBO_COMMIT}"
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
|
||||
ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"
|
||||
@@ -1679,17 +1706,30 @@ echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
||||
|
||||
# Export for auto/config
|
||||
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||
BEE_ISO_VOLUME="EASY_BEE_${BEE_GPU_VENDOR_UPPER}_V${ISO_VERSION_LABEL_TOKEN}"
|
||||
# ISO 9660 volume ID is limited to 32 characters; truncate the version token to fit.
|
||||
_vol_prefix="EASY_BEE_${BEE_GPU_VENDOR_UPPER}_V"
|
||||
_max_token=$(( 32 - ${#_vol_prefix} ))
|
||||
_vol_token="$(printf '%s' "${ISO_VERSION_LABEL_TOKEN}" | cut -c1-${_max_token})"
|
||||
BEE_ISO_VOLUME="${_vol_prefix}${_vol_token}"
|
||||
unset _vol_prefix _max_token _vol_token
|
||||
export BEE_GPU_VENDOR_UPPER BEE_ISO_VOLUME
|
||||
|
||||
cd "${LB_DIR}"
|
||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||
export MKSQUASHFS_OPTIONS="-no-xattrs"
|
||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||
split_live_squashfs_layers "${LB_DIR}"
|
||||
echo "=== enforcing canonical bootloader assets ==="
|
||||
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||
# Rename lb's default filesystem.squashfs to the versioned filename so the
|
||||
# ISO contains a version-stamped squashfs (e.g. filesystem-v10.15.squashfs).
|
||||
_std_sq="${LB_DIR}/binary/live/filesystem.squashfs"
|
||||
_ver_sq="${LB_DIR}/binary/live/${SQUASHFS_FILENAME}"
|
||||
if [ -f "${_std_sq}" ] && [ "${_std_sq}" != "${_ver_sq}" ]; then
|
||||
mv "${_std_sq}" "${_ver_sq}"
|
||||
echo "=== squashfs renamed: filesystem.squashfs → ${SQUASHFS_FILENAME} ==="
|
||||
fi
|
||||
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
||||
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
||||
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
||||
|
||||
@@ -16,6 +16,11 @@ menuentry "EASY-BEE v@VERSION@ -- no GUI / no X11" {
|
||||
}
|
||||
|
||||
|
||||
menuentry "*** WIPE ALL DISKS (irreversible!) ***" {
|
||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.gui=off bee.wipe=all net.ifnames=0 biosdevname=0
|
||||
initrd @INITRD_LIVE@
|
||||
}
|
||||
|
||||
if [ "${grub_platform}" = "efi" ]; then
|
||||
menuentry "Memory Test (memtest86+)" {
|
||||
chainloader /boot/memtest86+x64.efi
|
||||
|
||||
@@ -41,6 +41,12 @@ label live-@FLAVOUR@-failsafe
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||
|
||||
label wipe-disks
|
||||
menu label *** WIPE ALL DISKS (irreversible!) ***
|
||||
linux @LINUX@
|
||||
initrd @INITRD@
|
||||
append @APPEND_LIVE@ toram nomodeset bee.gui=off bee.wipe=all net.ifnames=0 biosdevname=0
|
||||
|
||||
label memtest
|
||||
menu label ^Memory Test (memtest86+)
|
||||
linux /boot/memtest86+x64.bin
|
||||
|
||||
@@ -69,6 +69,7 @@ chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-gui-gate 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-check-nvswitch 2>/dev/null || true
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||
|
||||
57
iso/builder/config/hooks/normal/9012-wipe.hook.chroot
Executable file
57
iso/builder/config/hooks/normal/9012-wipe.hook.chroot
Executable file
@@ -0,0 +1,57 @@
|
||||
#!/bin/sh
# 9012-wipe.hook.chroot
#
# Adds bee-initramfs-wipe to the initramfs so that selecting the
# "WIPE ALL DISKS" boot menu entry runs the wipe tool before squashfs
# is mounted — i.e. it works even when live boot fails.
#
# Two files are installed inside the chroot:
#   /etc/initramfs-tools/hooks/bee-wipe                  — copies binaries into initrd
#   /etc/initramfs-tools/scripts/local-premount/bee-wipe — runs at boot
#
# NOTE(review): the boot-time script is installed under scripts/local-premount;
# confirm that the boot=live code path of this image actually executes that
# directory, otherwise the menu entry silently does nothing.

set -e

HOOK_DIR="/etc/initramfs-tools/hooks"
SCRIPT_DIR="/etc/initramfs-tools/scripts/local-premount"

mkdir -p "${HOOK_DIR}" "${SCRIPT_DIR}"

# ── initramfs hook: copy binaries ────────────────────────────────────────────
# Quoted heredoc delimiter: nothing below is expanded at build time.
cat > "${HOOK_DIR}/bee-wipe" << 'EOF'
#!/bin/sh
PREREQ=""
prereqs() { echo "$PREREQ"; }
case "$1" in prereqs) prereqs; exit 0 ;; esac

. /usr/share/initramfs-tools/hook-functions

# Optional helpers: copied only when present in the chroot.
for bin in lsblk blkid blkdiscard blockdev; do
    b=$(command -v "$bin" 2>/dev/null) && copy_exec "$b" /bin
done

[ -x /usr/sbin/nvme ] && copy_exec /usr/sbin/nvme /sbin

copy_exec /usr/local/bin/bee-initramfs-wipe /bin/bee-wipe
EOF

chmod +x "${HOOK_DIR}/bee-wipe"

# ── initramfs premount script: trigger on bee.wipe=all ───────────────────────
cat > "${SCRIPT_DIR}/bee-wipe" << 'EOF'
#!/bin/sh
PREREQ=""
prereqs() { echo "$PREREQ"; }
case "$1" in prereqs) prereqs; exit 0 ;; esac

# -F: match the parameter literally ('.' must not act as a regex wildcard).
grep -qwF 'bee.wipe=all' /proc/cmdline 2>/dev/null || exit 0
exec /bin/bee-wipe
EOF

chmod +x "${SCRIPT_DIR}/bee-wipe"

echo "9012-wipe: installed initramfs hook and premount script"

# Rebuild the initramfs of the newest installed kernel so it picks up the hook.
KVER=$(ls /lib/modules 2>/dev/null | sort -V | tail -1)
if [ -z "${KVER}" ]; then
    # Without this guard update-initramfs would be called with -k "".
    echo "9012-wipe: no kernel found under /lib/modules" >&2
    exit 1
fi
echo "9012-wipe: rebuilding initramfs for kernel ${KVER}"
update-initramfs -u -k "${KVER}"
echo "9012-wipe: done"
|
||||
37
iso/builder/config/hooks/normal/9998-strip-xattrs.hook.chroot
Executable file
37
iso/builder/config/hooks/normal/9998-strip-xattrs.hook.chroot
Executable file
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env python3
|
||||
# 9998-strip-xattrs.hook.chroot
|
||||
#
|
||||
# mksquashfs 4.5.1 (Debian bookworm) writes a non-INVALID xattr_id_table_start
|
||||
# even with -no-xattrs when the source tree contains POSIX ACL xattrs set by
|
||||
# dpkg/install-time. Linux 6.1 squashfs driver then fails with
|
||||
# "unable to read xattr id index table" and aborts the mount.
|
||||
#
|
||||
# Strip all xattrs from the live chroot before mksquashfs sees the tree so the
|
||||
# resulting squashfs has SQUASHFS_INVALID_BLK in xattr_id_table_start.
|
||||
|
||||
import os

# Top-level directories that are not descended into. The directory entries
# themselves are still stripped. Presumably these are pseudo-filesystems
# mounted into the chroot during the build and never part of the squashfs
# payload — verify against the build environment.
SKIP_TOP_LEVEL = ("proc", "sys")


def strip(path):
    """Remove every extended attribute from ``path`` without following symlinks.

    Best effort: any OSError (unsupported filesystem, vanished file, missing
    permission) is swallowed so one bad entry cannot abort the build hook.

    Returns the number of attributes that were actually removed.
    """
    try:
        names = os.listxattr(path, follow_symlinks=False)
    except OSError:
        return 0

    removed = 0
    for attr in names:
        try:
            os.removexattr(path, attr, follow_symlinks=False)
        except OSError:
            continue
        removed += 1
    return removed


def main(top="/"):
    """Strip xattrs from ``top`` and everything below it.

    Prints a summary line and returns the number of entries that had at least
    one attribute removed.
    """
    entries = 0
    attrs = 0

    def visit(path):
        nonlocal entries, attrs
        count = strip(path)
        if count:
            entries += 1
            attrs += count

    # os.walk never yields the starting directory as a child, so do it first.
    visit(top)
    for root, dirs, files in os.walk(top, topdown=True, followlinks=False):
        # Symlinks to directories show up in ``dirs``; strip() does not follow them.
        for name in dirs + files:
            visit(os.path.join(root, name))
        if root == top:
            # Prune after visiting so the mount-point directories themselves
            # are still cleaned, but their contents are not walked.
            dirs[:] = [d for d in dirs if d not in SKIP_TOP_LEVEL]

    print(f"9998-strip-xattrs: removed {attrs} xattrs from {entries} entries")
    return entries


if __name__ == "__main__":
    main()
|
||||
@@ -1,5 +1,6 @@
|
||||
# AMD GPU firmware
|
||||
firmware-amd-graphics
|
||||
nvtop
|
||||
|
||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||
# explicitly.
|
||||
nvtop
|
||||
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||
|
||||
@@ -38,6 +38,7 @@ exfat-fuse
|
||||
ntfs-3g
|
||||
|
||||
# Utilities
|
||||
infiniband-diags
|
||||
bash
|
||||
procps
|
||||
lsof
|
||||
@@ -46,7 +47,6 @@ less
|
||||
vim-tiny
|
||||
mc
|
||||
htop
|
||||
nvtop
|
||||
sudo
|
||||
zstd
|
||||
mstflint
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
Description=Bee: load NVIDIA kernel modules and create device nodes
|
||||
After=local-fs.target udev.service bee-blackbox.service
|
||||
Before=bee-audit.service
|
||||
# Skip silently if bee-nvidia-load is absent (non-nvidia builds).
|
||||
ConditionPathExists=/usr/local/bin/bee-nvidia-load
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
[Unit]
|
||||
# bee-nvidia.service loads the NVIDIA kernel modules; fabricmanager must wait
|
||||
# for them to be fully initialized before attempting to open /dev/nvidiactl.
|
||||
After=bee-nvidia.service
|
||||
|
||||
[Service]
|
||||
# Skip fabricmanager on systems without NVSwitch hardware.
|
||||
# ExecCondition exits 1-254 → unit is silently skipped (inactive, not failed).
|
||||
ExecCondition=/usr/local/bin/bee-check-nvswitch
|
||||
@@ -3,8 +3,14 @@
|
||||
# Shows live service status until all bee services are done or failed,
|
||||
# then exits so getty can show the login prompt.
|
||||
|
||||
CRITICAL="bee-preflight bee-nvidia bee-audit"
|
||||
ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
|
||||
GPU_VENDOR="$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)"
|
||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||
CRITICAL="bee-preflight bee-nvidia bee-audit"
|
||||
ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
|
||||
else
|
||||
CRITICAL="bee-preflight bee-audit"
|
||||
ALL="bee-sshsetup ssh bee-network bee-preflight bee-audit bee-web"
|
||||
fi
|
||||
|
||||
svc_state() { systemctl is-active "$1.service" 2>/dev/null || echo "inactive"; }
|
||||
|
||||
|
||||
4
iso/overlay/usr/local/bin/bee-check-nvswitch
Normal file
4
iso/overlay/usr/local/bin/bee-check-nvswitch
Normal file
@@ -0,0 +1,4 @@
|
||||
#!/bin/sh
# bee-check-nvswitch — report whether NVSwitch hardware is present.
#
# Exit status: 0 when at least one matching PCI function is listed,
#              1 otherwise (used to skip fabricmanager on non-NVSwitch systems).
# A device matches when lspci -Dn shows class 0680 (Bridge, Other) with
# vendor ID 10de.
lspci -Dn 2>/dev/null | awk '
    $2 == "0680:" && $3 ~ /^10de:/ { hit = 1; exit }
    END { exit(hit ? 0 : 1) }
'
|
||||
166
iso/overlay/usr/local/bin/bee-initramfs-wipe
Executable file
166
iso/overlay/usr/local/bin/bee-initramfs-wipe
Executable file
@@ -0,0 +1,166 @@
|
||||
#!/bin/sh
|
||||
# bee-initramfs-wipe — interactive disk wipe running entirely in the initramfs.
|
||||
# Triggered by bee.wipe=all on the kernel cmdline (via local-premount hook).
|
||||
# Works before squashfs is mounted, so it runs even when live boot fails.
|
||||
|
||||
# ANSI colour sequences. They are kept as literal backslash escapes and only
# turned into control characters by the %b conversions in p/pp below.
RED='\033[1;31m'
YEL='\033[1;33m'
GRN='\033[1;32m'
CYN='\033[1;36m'
NC='\033[0m'

# p:  print all arguments (escape sequences expanded) followed by a newline.
# pp: same as p but without the trailing newline — used for prompts.
p()  { printf '%b\n' "$*"; }
pp() { printf '%b' "$*"; }

# banner: print the red title box, framed by one blank line above and below.
banner() {
    printf '%b\n' \
        "" \
        "${RED}╔══════════════════════════════════════════════════════════╗${NC}" \
        "${RED}║ BEE DRIVE WIPE — initramfs stage ║${NC}" \
        "${RED}╚══════════════════════════════════════════════════════════╝${NC}" \
        ""
}
|
||||
|
||||
# ── find boot device ─────────────────────────────────────────────────────────
# boot_dev: print the whole-disk device that carries the live medium, or
# nothing when it cannot be determined.
#
# The medium is located through the live-media-label=<LABEL> kernel parameter
# and blkid. The parent disk is then looked up with lsblk (PKNAME) instead of
# trimming trailing digits from the name: digit trimming turns a whole-disk
# node such as /dev/nvme0n1 into /dev/nvme0n, which matches no disk and would
# leave the boot medium in the wipe list.
boot_dev() {
    local label token dev parent

    label=""
    for token in $(cat /proc/cmdline 2>/dev/null); do
        case "$token" in
            live-media-label=*) label="${token#*=}" ;;
        esac
    done
    [ -z "$label" ] && return

    dev=$(blkid -L "$label" 2>/dev/null) || return
    [ -n "$dev" ] || return

    # -d: report only the device itself; PKNAME is empty for a whole disk.
    parent=$(lsblk -d -n -o PKNAME "$dev" 2>/dev/null)
    if [ -n "$parent" ]; then
        echo "/dev/$parent"
    else
        echo "$dev"
    fi
}
|
||||
|
||||
# ── enumerate candidate disks ─────────────────────────────────────────────────
# list_disks: print one line per wipe candidate as "<device>\t<size>\t<model>".
# Only lsblk entries of type "disk" are kept; zero-sized devices and the
# device reported by boot_dev are left out.
list_disks() {
    local boot
    boot=$(boot_dev)

    lsblk -d -n -o NAME,TYPE,SIZE,MODEL 2>/dev/null |
        while read -r name type size model; do
            if [ "$type" != "disk" ] || [ "$size" = "0B" ] || [ "/dev/$name" = "$boot" ]; then
                continue
            fi
            printf '%s\t%s\t%s\n' "/dev/$name" "$size" "${model:-}"
        done
}
|
||||
|
||||
# ── wipe one disk ─────────────────────────────────────────────────────────────
# wipe_one DEV: erase a single whole-disk device.
#
# Strategy, first success wins:
#   1. NVMe devices: nvme format --ses=1
#   2. blkdiscard -f
#   3. dd zeros over the first and the last 32 MiB (partition tables only)
#
# Returns 0 on success and 1 when the dd fallback failed, so the caller can
# report the disk as not wiped instead of printing a false success.
wipe_one() {
    local dev="$1"
    local size_bytes mb32 tail_start rc

    p ""
    p "=== ${YEL}${dev}${NC} ==="

    case "$dev" in
        /dev/nvme*)
            if nvme format --ses=1 "$dev" 2>&1; then
                p " ${GRN}nvme format OK${NC}"
                blockdev --flushbufs "$dev" 2>/dev/null || true
                return 0
            fi
            p " nvme format failed — falling back to blkdiscard"
            ;;
    esac

    if blkdiscard -f "$dev" 2>&1; then
        p " ${GRN}blkdiscard OK${NC}"
        blockdev --flushbufs "$dev" 2>/dev/null || true
        return 0
    fi

    p " blkdiscard not supported — zeroing partition tables (HDD fallback)"
    size_bytes=$(blockdev --getsize64 "$dev" 2>/dev/null || echo 0)
    mb32=$(( 32 * 1024 * 1024 ))
    rc=0

    # Head: MBR, primary GPT, first filesystem superblocks.
    # No status=progress here: it is a GNU coreutils extension and the dd
    # available inside an initramfs may reject it (TODO confirm which dd ships).
    dd if=/dev/zero of="$dev" bs=4M count=8 conv=fsync 2>&1 || rc=1

    # Tail: the backup GPT header sits in the very last sector, so the write
    # is positioned in 512-byte units to end exactly at the end of the device.
    # A 4 MiB-aligned seek can stop short of it on disks whose size is not a
    # multiple of 4 MiB.
    if [ "$size_bytes" -gt $(( mb32 * 2 )) ]; then
        tail_start=$(( (size_bytes - mb32) / 512 ))
        dd if=/dev/zero of="$dev" bs=512 seek="$tail_start" count=$(( mb32 / 512 )) conv=fsync 2>&1 || rc=1
    fi

    blockdev --flushbufs "$dev" 2>/dev/null || true

    if [ "$rc" -ne 0 ]; then
        p " ${RED}dd failed — ${dev} may NOT be wiped${NC}"
        return 1
    fi
    p " ${GRN}done (partition tables zeroed)${NC}"
}
|
||||
|
||||
# ── main ──────────────────────────────────────────────────────────────────────
# Flow: show candidates → read selection → confirm with YES → wipe → reboot.
# Every abort path drops to an interactive shell instead of touching a disk.
banner

BOOT=$(boot_dev)
[ -n "$BOOT" ] && p "Boot device (excluded): ${CYN}${BOOT}${NC}\n"

# Build the indexed list. The enumeration runs with the default IFS (the
# helpers split lsblk and /proc/cmdline on blanks) and its output is then read
# line by line; the global IFS is never modified, so the selection parsing
# further down still splits on spaces.
DISK_LIST=$(list_disks)
i=0
DEVS=""
while IFS= read -r line; do
    [ -n "$line" ] || continue
    i=$(( i + 1 ))
    dev=$(printf '%s\n' "$line" | cut -f1)
    size=$(printf '%s\n' "$line" | cut -f2)
    model=$(printf '%s\n' "$line" | cut -f3)
    # One record per line: index:device:size:model
    DEVS="${DEVS}${i}:${dev}:${size}:${model}
"
    printf " ${CYN}[%d]${NC} %-16s %8s %s\n" "$i" "$dev" "$size" "$model"
done <<EOF
$DISK_LIST
EOF

if [ "$i" -eq 0 ]; then
    p "\nNo physical disks found (boot device excluded)."
    p "Dropping to shell — type 'exit' to continue boot."
    exec /bin/sh
fi

p ""
pp "Enter numbers to wipe (space-separated), ${YEL}all${NC} for all, ${YEL}q${NC} to abort: "
read -r SELECTION

case "$SELECTION" in
    q|Q|'') p "\nAborted."; exec /bin/sh ;;
esac

# resolve selection → list of devs
SELECTED=""
case "$SELECTION" in
    all|ALL)
        SELECTED=$(printf '%s' "$DEVS" | grep -v '^$' | cut -d: -f2 | tr '\n' ' ')
        ;;
    *)
        for num in $SELECTION; do
            # Accept plain decimal indexes only; anything else would be
            # interpreted as a regular expression by grep below.
            case "$num" in
                *[!0-9]*) match="" ;;
                *) match=$(printf '%s' "$DEVS" | grep "^${num}:" | cut -d: -f2) ;;
            esac
            if [ -z "$match" ]; then
                p "${RED}Unknown index: ${num}${NC}"; exec /bin/sh
            fi
            SELECTED="${SELECTED}${match} "
        done
        ;;
esac

SELECTED=$(echo "$SELECTED" | tr -s ' ' | sed 's/ $//')

p ""
p "Selected for wipe: ${YEL}${SELECTED}${NC}"
p "${RED}WARNING: This is IRREVERSIBLE. All data on the selected disks will be lost.${NC}"
p ""
pp "Type YES to confirm, anything else to abort: "
read -r CONFIRM

if [ "$CONFIRM" != "YES" ]; then
    p "\nAborted — no disks were touched."
    exec /bin/sh
fi

p "\nStarting wipe..."
FAILED=""
for dev in $SELECTED; do
    wipe_one "$dev" || FAILED="${FAILED}${dev} "
done

sync
p ""
if [ -n "$FAILED" ]; then
    p "${RED}=== Wipe FAILED for: ${FAILED}===${NC}"
else
    p "${GRN}=== All selected disks wiped and flushed. ===${NC}"
fi
p ""
pp "Press Enter to reboot..."
read -r _
# No init daemon is running at this stage to act on a reboot request, so ask
# the kernel directly; buffers were flushed by the sync above.
# NOTE(review): confirm the initramfs reboot applet accepts -f.
reboot -f || reboot
|
||||
@@ -67,7 +67,8 @@ if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
|
||||
fi
|
||||
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
|
||||
|
||||
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
|
||||
GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo "")
|
||||
if [ "$GPU_VENDOR" = "nvidia" ] && have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
|
||||
log_event "NVIDIA GPU detected but /dev/nvidia0 is missing"
|
||||
restart_service bee-nvidia.service || true
|
||||
fi
|
||||
|
||||
132
iso/overlay/usr/local/bin/bee-wipe-disks
Executable file
132
iso/overlay/usr/local/bin/bee-wipe-disks
Executable file
@@ -0,0 +1,132 @@
|
||||
#!/bin/bash
|
||||
# bee-wipe-disks — erase all physical disks (interactive, confirmation required)
|
||||
#
|
||||
# Triggered automatically when the kernel cmdline contains bee.wipe=all.
|
||||
# Can also be run manually from a root shell.
|
||||
#
|
||||
# Wipe strategy:
|
||||
# NVMe — nvme format --ses=1 (NVMe user-data erase, fast)
|
||||
# Other — blkdiscard -f (TRIM/UNMAP, fast on SSDs)
|
||||
# dd if=/dev/zero (fallback for HDDs, zeros first+last 32 MB)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# ANSI colour sequences, stored as real escape characters ($'…' quoting).
RED=$'\033[1;31m'
YEL=$'\033[1;33m'
GRN=$'\033[1;32m'
NC=$'\033[0m'

# banner: print the red title box with a blank line before and after it.
banner() {
    printf '%s\n' \
        "" \
        "${RED}╔══════════════════════════════════════════════════════════╗${NC}" \
        "${RED}║ BEE DISK WIPE — ALL DATA WILL BE DESTROYED ║${NC}" \
        "${RED}╚══════════════════════════════════════════════════════════╝${NC}" \
        ""
}
|
||||
|
||||
# ── find boot device to skip ──────────────────────────────────────────────────
# live_dev: print the whole-disk device backing /run/live/medium, or nothing
# when that path is not a mount point.
#
# The parent disk is resolved with lsblk (PKNAME) rather than by trimming
# trailing digits: digit trimming turns a whole-disk source such as
# /dev/nvme0n1 into /dev/nvme0n, which matches no disk and would leave the
# boot medium in the wipe list.
live_dev() {
    local src parent

    src=$(findmnt -n -o SOURCE /run/live/medium 2>/dev/null || true)
    if [ -z "$src" ]; then
        return 0
    fi

    # -d: report only the device itself; PKNAME is empty for a whole disk
    # (and lsblk fails for non-block sources, which also leaves it empty).
    parent=$(lsblk -d -n -o PKNAME "$src" 2>/dev/null || true)
    if [ -n "$parent" ]; then
        echo "/dev/$parent"
    else
        echo "$src"
    fi
}
|
||||
|
||||
# ── enumerate target disks ────────────────────────────────────────────────────
# find_disks: print one line per wipe target as "<device>\t<size>\t<model>".
# Keeps lsblk entries of type "disk" only, dropping zero-sized devices (empty
# virtual media) and the device reported by live_dev (the boot medium).
find_disks() {
    local boot_dev
    boot_dev=$(live_dev)

    lsblk -d -n -o NAME,TYPE,SIZE,MODEL |
        while read -r name type size model; do
            if [ "$type" != "disk" ] || [ "$size" = "0B" ] || [ "/dev/$name" = "$boot_dev" ]; then
                continue
            fi
            printf '%s\t%s\t%s\n' "/dev/$name" "$size" "$model"
        done
}
|
||||
|
||||
# ── wipe one disk ─────────────────────────────────────────────────────────────
# wipe_disk DEV: erase a single whole-disk device.
#
# Strategy, first success wins:
#   1. NVMe devices: nvme format --ses=1 (user data erase)
#   2. blkdiscard -f
#   3. dd zeros over the first and the last 32 MiB (partition tables only)
#
# Returns 0 on success and 1 when the dd fallback failed. Under the script's
# `set -e` a bare call that returns 1 terminates the script, so a failed wipe
# is never followed by a success message.
wipe_disk() {
    local dev="$1"
    local size_bytes mb32 tail_start rc=0

    echo ""
    echo "=== ${YEL}${dev}${NC} ==="

    if [[ "$dev" == /dev/nvme* ]]; then
        if nvme format --ses=1 "$dev" 2>&1; then
            echo " ${GRN}nvme format OK${NC}"
            blockdev --flushbufs "$dev" 2>/dev/null || true
            return 0
        fi
        echo " nvme format failed, falling back to blkdiscard"
    fi

    if blkdiscard -f "$dev" 2>&1; then
        echo " ${GRN}blkdiscard OK${NC}"
        blockdev --flushbufs "$dev" 2>/dev/null || true
        return 0
    fi

    echo " blkdiscard not supported — zeroing partition tables (HDD fallback)"
    # A size query failure must not abort the script halfway through a disk:
    # fall back to 0, which simply skips the tail pass below.
    size_bytes=$(blockdev --getsize64 "$dev" 2>/dev/null || echo 0)
    mb32=$(( 32 * 1024 * 1024 ))

    # Zero first 32 MB (MBR, GPT, filesystem superblocks)
    dd if=/dev/zero of="$dev" bs=4M count=8 conv=fsync status=progress 2>&1 || rc=1

    # Zero last 32 MB (backup GPT). The backup header sits in the very last
    # sector, so the write is positioned in 512-byte units to end exactly at
    # the end of the device; a 4 MiB-aligned seek can stop short of it.
    if [ "$size_bytes" -gt $(( mb32 * 2 )) ]; then
        tail_start=$(( (size_bytes - mb32) / 512 ))
        dd if=/dev/zero of="$dev" bs=512 seek="$tail_start" count=$(( mb32 / 512 )) conv=fsync status=progress 2>&1 || rc=1
    fi

    blockdev --flushbufs "$dev" 2>/dev/null || true

    if [ "$rc" -ne 0 ]; then
        echo " ${RED}dd failed — ${dev} may NOT be wiped${NC}"
        return 1
    fi
    echo " ${GRN}done (partition tables zeroed)${NC}"
}
|
||||
|
||||
# ── main ──────────────────────────────────────────────────────────────────────
# Flow: enumerate → show targets → confirm with YES → wipe → offer reboot.
banner

# Enumerate exactly once so the list shown for confirmation is the list that
# gets wiped (a second enumeration could differ if a device appears or
# disappears in between).
DISK_TABLE=$(find_disks || true)
mapfile -t DISKS < <(printf '%s\n' "$DISK_TABLE" | awk 'NF {print $1}')

if [ ${#DISKS[@]} -eq 0 ]; then
    echo "No physical disks found (boot device excluded)."
    echo "Nothing to wipe."
    exit 0
fi

echo "Disks to be ${RED}COMPLETELY ERASED${NC}:"
echo ""
while IFS=$'\t' read -r dev size model; do
    printf " ${YEL}%-16s${NC} %8s %s\n" "$dev" "$size" "$model"
done <<< "$DISK_TABLE"
echo ""
echo "${RED}WARNING: This is IRREVERSIBLE. All data on the listed disks will be lost.${NC}"
echo ""
printf "Type YES to confirm wipe, anything else to abort: "
read -r CONFIRM

if [ "$CONFIRM" != "YES" ]; then
    echo ""
    echo "Aborted — no disks were touched."
    exit 0
fi

echo ""
echo "Starting wipe..."

# Keep going after a failed disk and report the failures at the end.
FAILED=()
for dev in "${DISKS[@]}"; do
    wipe_disk "$dev" || FAILED+=("$dev")
done

echo ""
if [ ${#FAILED[@]} -gt 0 ]; then
    echo "${RED}=== Wipe FAILED for: ${FAILED[*]} ===${NC}"
else
    echo "${GRN}=== All disks wiped. ===${NC}"
fi
echo ""
printf "Reboot now to return to the boot menu? [Y/n] "
read -r REBOOT
case "${REBOOT:-Y}" in
    [Nn]*) echo "You can reboot manually when ready." ;;
    *) echo "Rebooting..."; sleep 2; reboot ;;
esac
|
||||
1131
iso/vendor/ExternalData/SMCIPID.txt
vendored
Normal file
1131
iso/vendor/ExternalData/SMCIPID.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user