Compare commits
58 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b49c71a980 | ||
|
|
85d1acdaa3 | ||
|
|
a2d7513153 | ||
|
|
5b5d8609d3 | ||
|
|
e7442972d1 | ||
|
|
4c6daa1c5e | ||
|
|
e420888d71 | ||
|
|
8149360410 | ||
|
|
4262c5b798 | ||
|
|
b2e177af31 | ||
|
|
271dadda03 | ||
|
|
20766ccc76 | ||
|
|
966944d6d8 | ||
| ce6b1e0eb7 | |||
| 4066e842a9 | |||
| 7d2e904d14 | |||
| 2320925433 | |||
| e169a7722c | |||
| 74a3c65f64 | |||
| 884988cb2a | |||
| 963bc960ca | |||
| 4f6579e040 | |||
| dc07580adc | |||
|
|
87e78e230e | ||
|
|
805a3b277d | ||
|
|
5bc9bd7fb3 | ||
|
|
0939a647ea | ||
|
|
7640f20714 | ||
|
|
1593bf3e76 | ||
|
|
ae80d7711e | ||
|
|
ca78b9df65 | ||
|
|
5cafe63f33 | ||
|
|
b75e65bcb1 | ||
|
|
8d173175eb | ||
|
|
5cbde0448e | ||
|
|
49a09fde05 | ||
|
|
f3962422c8 | ||
|
|
ee36e3c711 | ||
|
|
cca3b21d35 | ||
|
|
75c33e073e | ||
| 7b4bcc745a | |||
| 42774d44a6 | |||
| 5dc022ddf8 | |||
| 6623e159f5 | |||
| bbd6d009f8 | |||
| 6c2b188ec9 | |||
| 14505ef24a | |||
| 4f20c9246d | |||
| eed157c2db | |||
| a2c8aea0df | |||
| b21f03cd26 | |||
| cac5b9c86e | |||
| b5d04ef045 | |||
| fcd64438ea | |||
| 0e39e7d960 | |||
|
|
58d6da0e4f | ||
|
|
7ce73e34a4 | ||
|
|
8a21809ade |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -1,6 +1,5 @@
|
|||||||
.env
|
.env
|
||||||
.DS_Store
|
.DS_Store
|
||||||
dist/
|
dist/
|
||||||
iso/out/
|
|
||||||
build-cache/
|
build-cache/
|
||||||
audit/bee
|
audit/bee
|
||||||
|
|||||||
@@ -64,6 +64,8 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
|||||||
return runExport(args[1:], stdout, stderr)
|
return runExport(args[1:], stdout, stderr)
|
||||||
case "preflight":
|
case "preflight":
|
||||||
return runPreflight(args[1:], stdout, stderr)
|
return runPreflight(args[1:], stdout, stderr)
|
||||||
|
case "install-to-ram":
|
||||||
|
return runInstallToRAM(args[1:], stdout, stderr)
|
||||||
case "support-bundle":
|
case "support-bundle":
|
||||||
return runSupportBundle(args[1:], stdout, stderr)
|
return runSupportBundle(args[1:], stdout, stderr)
|
||||||
case "web":
|
case "web":
|
||||||
@@ -90,6 +92,7 @@ func printRootUsage(w io.Writer) {
|
|||||||
fmt.Fprintln(w, `bee commands:
|
fmt.Fprintln(w, `bee commands:
|
||||||
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
||||||
bee preflight --output stdout|file:<path>
|
bee preflight --output stdout|file:<path>
|
||||||
|
bee install-to-ram
|
||||||
bee export --target <device>
|
bee export --target <device>
|
||||||
bee support-bundle --output stdout|file:<path>
|
bee support-bundle --output stdout|file:<path>
|
||||||
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||||
@@ -109,6 +112,8 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
|||||||
return runExport([]string{"--help"}, stdout, stdout)
|
return runExport([]string{"--help"}, stdout, stdout)
|
||||||
case "preflight":
|
case "preflight":
|
||||||
return runPreflight([]string{"--help"}, stdout, stdout)
|
return runPreflight([]string{"--help"}, stdout, stdout)
|
||||||
|
case "install-to-ram":
|
||||||
|
return runInstallToRAM([]string{"--help"}, stdout, stdout)
|
||||||
case "support-bundle":
|
case "support-bundle":
|
||||||
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||||
case "web":
|
case "web":
|
||||||
@@ -252,6 +257,32 @@ func runPreflight(args []string, stdout, stderr io.Writer) int {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func runInstallToRAM(args []string, stdout, stderr io.Writer) int {
|
||||||
|
fs := flag.NewFlagSet("install-to-ram", flag.ContinueOnError)
|
||||||
|
fs.SetOutput(stderr)
|
||||||
|
fs.Usage = func() {
|
||||||
|
fmt.Fprintln(stderr, "usage: bee install-to-ram")
|
||||||
|
}
|
||||||
|
if err := fs.Parse(args); err != nil {
|
||||||
|
if err == flag.ErrHelp {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
if fs.NArg() != 0 {
|
||||||
|
fs.Usage()
|
||||||
|
return 2
|
||||||
|
}
|
||||||
|
|
||||||
|
application := app.New(platform.New())
|
||||||
|
logLine := func(s string) { fmt.Fprintln(stdout, s) }
|
||||||
|
if err := application.RunInstallToRAM(context.Background(), logLine); err != nil {
|
||||||
|
slog.Error("run install-to-ram", "err", err)
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func runSupportBundle(args []string, stdout, stderr io.Writer) int {
|
func runSupportBundle(args []string, stdout, stderr io.Writer) int {
|
||||||
fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
|
fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
|
||||||
fs.SetOutput(stderr)
|
fs.SetOutput(stderr)
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
405
audit/internal/app/app_format.go
Normal file
405
audit/internal/app/app_format.go
Normal file
@@ -0,0 +1,405 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/collector"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
)
|
||||||
|
|
||||||
|
// hostnameOr returns the host name reported by os.Hostname, or fallback when
// the lookup fails or the reported name is blank.
func hostnameOr(fallback string) string {
	name, err := os.Hostname()
	if err == nil && strings.TrimSpace(name) != "" {
		return name
	}
	return fallback
}
|
||||||
|
|
||||||
|
// sanitizeFilename replaces every rune of v that is not an ASCII letter, an
// ASCII digit, '-', '_' or '.' with '-'. An empty input yields "unknown".
func sanitizeFilename(v string) string {
	if v == "" {
		return "unknown"
	}
	return strings.Map(func(r rune) rune {
		switch {
		case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z', '0' <= r && r <= '9':
			return r
		case strings.ContainsRune("-_.", r):
			return r
		}
		return '-'
	}, v)
}
|
||||||
|
|
||||||
|
// bodyOr returns body with surrounding whitespace removed, or fallback when
// nothing is left after trimming.
func bodyOr(body, fallback string) string {
	if trimmed := strings.TrimSpace(body); trimmed != "" {
		return trimmed
	}
	return fallback
}
|
||||||
|
|
||||||
|
// trimPtr dereferences value and trims surrounding whitespace. A nil pointer
// yields the empty string.
func trimPtr(value *string) string {
	var text string
	if value != nil {
		text = strings.TrimSpace(*value)
	}
	return text
}
|
||||||
|
|
||||||
|
// joinSortedKeys returns the keys of values in ascending order, joined with
// "/". An empty or nil set yields the empty string.
func joinSortedKeys(values map[string]struct{}) string {
	names := make([]string, 0, len(values))
	for name := range values {
		names = append(names, name)
	}
	sort.Strings(names)
	// Joining an empty slice yields "", which covers the empty-set case.
	return strings.Join(names, "/")
}
|
||||||
|
|
||||||
|
// humanizeMB formats a size given in mebibytes for display.
//
// Non-positive sizes yield "". Sizes that round to at least 1024.0 GB are
// shown in TB with one decimal; whole-gigabyte sizes are shown without a
// decimal ("64 GB"); everything else is shown in GB with one decimal.
func humanizeMB(totalMB int) string {
	if totalMB <= 0 {
		return ""
	}
	gb := float64(totalMB) / 1024.0
	// Anything from 1023.95 GB upward would be rendered as "1024.0 GB" by
	// %.1f, so switch to TB at the point where the GB text would roll over
	// instead of at exactly 1024.
	if gb >= 1023.95 {
		return fmt.Sprintf("%.1f TB", gb/1024.0)
	}
	if gb == float64(int64(gb)) {
		return fmt.Sprintf("%.0f GB", gb)
	}
	return fmt.Sprintf("%.1f GB", gb)
}
|
||||||
|
|
||||||
|
// humanizeGB formats a size given in gigabytes for display: "" for
// non-positive sizes, whole GB below 1024, and TB with one decimal from 1024
// upward.
func humanizeGB(totalGB int) string {
	switch {
	case totalGB <= 0:
		return ""
	case totalGB < 1024:
		return fmt.Sprintf("%d GB", totalGB)
	default:
		return fmt.Sprintf("%.1f TB", float64(totalGB)/1024.0)
	}
}
|
||||||
|
|
||||||
|
// parseKeyValueSummary parses newline-separated "key=value" text into a map.
//
// Each line is split at its first "="; key and value are whitespace-trimmed.
// Blank lines and lines without "=" are ignored. When a key repeats, the last
// occurrence wins. The returned map is never nil.
func parseKeyValueSummary(raw string) map[string]string {
	pairs := make(map[string]string)
	for _, entry := range strings.Split(raw, "\n") {
		// A blank entry has no "=", so it is skipped by the same check.
		name, val, found := strings.Cut(strings.TrimSpace(entry), "=")
		if !found {
			continue
		}
		pairs[strings.TrimSpace(name)] = strings.TrimSpace(val)
	}
	return pairs
}
|
||||||
|
|
||||||
|
// firstNonEmpty returns the first argument that is non-blank after trimming
// whitespace, in its trimmed form. It returns "" when every argument is blank.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if candidate := strings.TrimSpace(values[i]); candidate != "" {
			return candidate
		}
	}
	return ""
}
|
||||||
|
|
||||||
|
// cleanSummaryKey strips a leading all-digit prefix and its "-" separator
// from key ("01-gpu" becomes "gpu"). Keys without "-", keys starting with "-",
// and keys whose prefix contains a non-digit are returned unchanged.
func cleanSummaryKey(key string) string {
	head, tail, found := strings.Cut(key, "-")
	if !found || head == "" {
		return key
	}
	notDigit := func(r rune) bool { return r < '0' || r > '9' }
	if strings.IndexFunc(head, notDigit) >= 0 {
		return key
	}
	return tail
}
|
||||||
|
|
||||||
|
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||||
|
// Exclude Aspeed BMC VGA adapters (not compute GPUs).
|
||||||
|
if dev.VendorID != nil && *dev.VendorID == collector.AspeedVendorID {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
class := trimPtr(dev.DeviceClass)
|
||||||
|
// AMD Instinct / Radeon compute GPUs always carry ProcessingAccelerator or DisplayController.
|
||||||
|
// Do NOT match AMD vendor alone — CPU chipset PCIe devices share that vendor ID.
|
||||||
|
if class == "VideoController" || class == "DisplayController" || class == "ProcessingAccelerator" {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
// NVIDIA devices sometimes expose class values outside the standard GPU set.
|
||||||
|
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatSystemLine(board schema.HardwareBoard) string {
|
||||||
|
model := strings.TrimSpace(strings.Join([]string{
|
||||||
|
trimPtr(board.Manufacturer),
|
||||||
|
trimPtr(board.ProductName),
|
||||||
|
}, " "))
|
||||||
|
serial := strings.TrimSpace(board.SerialNumber)
|
||||||
|
switch {
|
||||||
|
case model != "" && serial != "":
|
||||||
|
return fmt.Sprintf("System: %s | S/N %s", model, serial)
|
||||||
|
case model != "":
|
||||||
|
return "System: " + model
|
||||||
|
case serial != "":
|
||||||
|
return "System S/N: " + serial
|
||||||
|
default:
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatCPULine(cpus []schema.HardwareCPU) string {
|
||||||
|
if len(cpus) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
modelCounts := map[string]int{}
|
||||||
|
unknown := 0
|
||||||
|
for _, cpu := range cpus {
|
||||||
|
model := trimPtr(cpu.Model)
|
||||||
|
if model == "" {
|
||||||
|
unknown++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
modelCounts[model]++
|
||||||
|
}
|
||||||
|
if len(modelCounts) == 1 && unknown == 0 {
|
||||||
|
for model, count := range modelCounts {
|
||||||
|
return fmt.Sprintf("CPU: %d x %s", count, model)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
parts := make([]string, 0, len(modelCounts)+1)
|
||||||
|
if len(modelCounts) > 0 {
|
||||||
|
keys := make([]string, 0, len(modelCounts))
|
||||||
|
for key := range modelCounts {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
for _, key := range keys {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if unknown > 0 {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d x unknown", unknown))
|
||||||
|
}
|
||||||
|
return "CPU: " + strings.Join(parts, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatMemoryLine(dimms []schema.HardwareMemory) string {
|
||||||
|
totalMB := 0
|
||||||
|
present := 0
|
||||||
|
types := map[string]struct{}{}
|
||||||
|
for _, dimm := range dimms {
|
||||||
|
if dimm.Present != nil && !*dimm.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if dimm.SizeMB == nil || *dimm.SizeMB <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
present++
|
||||||
|
totalMB += *dimm.SizeMB
|
||||||
|
if value := trimPtr(dimm.Type); value != "" {
|
||||||
|
types[value] = struct{}{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if totalMB == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
typeText := joinSortedKeys(types)
|
||||||
|
line := fmt.Sprintf("Memory: %s", humanizeMB(totalMB))
|
||||||
|
if typeText != "" {
|
||||||
|
line += " " + typeText
|
||||||
|
}
|
||||||
|
if present > 0 {
|
||||||
|
line += fmt.Sprintf(" (%d DIMMs)", present)
|
||||||
|
}
|
||||||
|
return line
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatStorageLine(disks []schema.HardwareStorage) string {
|
||||||
|
count := 0
|
||||||
|
totalGB := 0
|
||||||
|
for _, disk := range disks {
|
||||||
|
if disk.Present != nil && !*disk.Present {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
count++
|
||||||
|
if disk.SizeGB != nil && *disk.SizeGB > 0 {
|
||||||
|
totalGB += *disk.SizeGB
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if count == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
line := fmt.Sprintf("Storage: %d drives", count)
|
||||||
|
if totalGB > 0 {
|
||||||
|
line += fmt.Sprintf(" / %s", humanizeGB(totalGB))
|
||||||
|
}
|
||||||
|
return line
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatGPULine(devices []schema.HardwarePCIeDevice) string {
|
||||||
|
gpus := map[string]int{}
|
||||||
|
for _, dev := range devices {
|
||||||
|
if !isGPUDevice(dev) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown")
|
||||||
|
gpus[name]++
|
||||||
|
}
|
||||||
|
if len(gpus) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
keys := make([]string, 0, len(gpus))
|
||||||
|
for key := range gpus {
|
||||||
|
keys = append(keys, key)
|
||||||
|
}
|
||||||
|
sort.Strings(keys)
|
||||||
|
parts := make([]string, 0, len(keys))
|
||||||
|
for _, key := range keys {
|
||||||
|
parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key))
|
||||||
|
}
|
||||||
|
return "GPU: " + strings.Join(parts, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
|
||||||
|
if list == nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
ifaces, err := list()
|
||||||
|
if err != nil {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
var ips []string
|
||||||
|
for _, iface := range ifaces {
|
||||||
|
for _, ip := range iface.IPv4 {
|
||||||
|
ip = strings.TrimSpace(ip)
|
||||||
|
if ip == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := seen[ip]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[ip] = struct{}{}
|
||||||
|
ips = append(ips, ip)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(ips) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
sort.Strings(ips)
|
||||||
|
return "IP: " + strings.Join(ips, ", ")
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatSATDetail(raw string) string {
|
||||||
|
var b strings.Builder
|
||||||
|
kv := parseKeyValueSummary(raw)
|
||||||
|
|
||||||
|
if t, ok := kv["run_at_utc"]; ok {
|
||||||
|
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
||||||
|
}
|
||||||
|
|
||||||
|
lines := strings.Split(raw, "\n")
|
||||||
|
var stepKeys []string
|
||||||
|
seenStep := map[string]bool{}
|
||||||
|
for _, line := range lines {
|
||||||
|
if idx := strings.Index(line, "_status="); idx >= 0 {
|
||||||
|
key := line[:idx]
|
||||||
|
if !seenStep[key] && key != "overall" {
|
||||||
|
seenStep[key] = true
|
||||||
|
stepKeys = append(stepKeys, key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, key := range stepKeys {
|
||||||
|
status := kv[key+"_status"]
|
||||||
|
display := cleanSummaryKey(key)
|
||||||
|
switch status {
|
||||||
|
case "OK":
|
||||||
|
fmt.Fprintf(&b, "PASS %s\n", display)
|
||||||
|
case "FAILED":
|
||||||
|
fmt.Fprintf(&b, "FAIL %s\n", display)
|
||||||
|
case "UNSUPPORTED":
|
||||||
|
fmt.Fprintf(&b, "SKIP %s\n", display)
|
||||||
|
default:
|
||||||
|
fmt.Fprintf(&b, "? %s\n", display)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if overall, ok := kv["overall_status"]; ok {
|
||||||
|
ok2 := kv["job_ok"]
|
||||||
|
failed := kv["job_failed"]
|
||||||
|
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
||||||
|
}
|
||||||
|
|
||||||
|
return strings.TrimSpace(b.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
func formatSATSummary(label, raw string) string {
|
||||||
|
values := parseKeyValueSummary(raw)
|
||||||
|
var body strings.Builder
|
||||||
|
fmt.Fprintf(&body, "%s:", label)
|
||||||
|
if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" {
|
||||||
|
fmt.Fprintf(&body, " %s", overall)
|
||||||
|
}
|
||||||
|
if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" {
|
||||||
|
fmt.Fprintf(&body, " ok=%s", ok)
|
||||||
|
}
|
||||||
|
if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" {
|
||||||
|
fmt.Fprintf(&body, " failed=%s", failed)
|
||||||
|
}
|
||||||
|
if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" {
|
||||||
|
fmt.Fprintf(&body, " unsupported=%s", unsupported)
|
||||||
|
}
|
||||||
|
if devices := strings.TrimSpace(values["devices"]); devices != "" {
|
||||||
|
fmt.Fprintf(&body, "\nDevices: %s", devices)
|
||||||
|
}
|
||||||
|
return body.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func latestSATSummaries() []string {
|
||||||
|
patterns := []struct {
|
||||||
|
label string
|
||||||
|
prefix string
|
||||||
|
}{
|
||||||
|
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||||
|
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||||
|
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||||
|
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||||
|
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||||
|
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||||
|
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||||
|
{label: "Memory SAT", prefix: "memory-"},
|
||||||
|
{label: "Storage SAT", prefix: "storage-"},
|
||||||
|
{label: "CPU SAT", prefix: "cpu-"},
|
||||||
|
}
|
||||||
|
var out []string
|
||||||
|
for _, item := range patterns {
|
||||||
|
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
|
||||||
|
if err != nil || len(matches) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
sort.Strings(matches)
|
||||||
|
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
out = append(out, formatSATSummary(item.label, string(raw)))
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
76
audit/internal/app/app_install.go
Normal file
76
audit/internal/app/app_install.go
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (a *App) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||||
|
return a.exports.ListRemovableTargets()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error) {
|
||||||
|
if _, err := os.Stat(DefaultAuditJSONPath); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||||
|
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||||
|
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||||
|
data = normalized
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer os.Remove(tmpPath)
|
||||||
|
return a.exports.ExportFileToTarget(tmpPath, target)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||||
|
path, err := a.ExportLatestAudit(target)
|
||||||
|
body := "Audit export failed."
|
||||||
|
if err == nil {
|
||||||
|
body = "Audit exported."
|
||||||
|
}
|
||||||
|
if err == nil && path != "" {
|
||||||
|
body = "Audit exported to " + path
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "Export audit", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, error) {
|
||||||
|
archive, err := BuildSupportBundle(DefaultExportDir)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer os.Remove(archive)
|
||||||
|
return a.exports.ExportFileToTarget(archive, target)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||||
|
path, err := a.ExportSupportBundle(target)
|
||||||
|
body := "Support bundle export failed."
|
||||||
|
if err == nil {
|
||||||
|
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||||
|
}
|
||||||
|
if err == nil && path != "" {
|
||||||
|
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
|
||||||
|
return a.installer.ListInstallDisks()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||||
|
return a.installer.InstallToDisk(ctx, device, logFile)
|
||||||
|
}
|
||||||
106
audit/internal/app/app_network.go
Normal file
106
audit/internal/app/app_network.go
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) {
|
||||||
|
return a.network.ListInterfaces()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DefaultRoute() string {
|
||||||
|
return a.network.DefaultRoute()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DHCPOne(iface string) (string, error) {
|
||||||
|
return a.network.DHCPOne(iface)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DHCPOneResult(iface string) (ActionResult, error) {
|
||||||
|
body, err := a.network.DHCPOne(iface)
|
||||||
|
return ActionResult{Title: "DHCP: " + iface, Body: bodyOr(body, "DHCP completed.")}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DHCPAll() (string, error) {
|
||||||
|
return a.network.DHCPAll()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DHCPAllResult() (ActionResult, error) {
|
||||||
|
body, err := a.network.DHCPAll()
|
||||||
|
return ActionResult{Title: "DHCP: all interfaces", Body: bodyOr(body, "DHCP completed.")}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
||||||
|
return a.network.SetStaticIPv4(cfg)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||||
|
return a.network.SetInterfaceState(iface, up)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||||
|
return a.network.GetInterfaceState(iface)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||||
|
return a.network.CaptureNetworkSnapshot()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||||
|
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||||
|
body, err := a.network.SetStaticIPv4(cfg)
|
||||||
|
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) NetworkStatus() (ActionResult, error) {
|
||||||
|
ifaces, err := a.network.ListInterfaces()
|
||||||
|
if err != nil {
|
||||||
|
return ActionResult{Title: "Network status"}, err
|
||||||
|
}
|
||||||
|
if len(ifaces) == 0 {
|
||||||
|
return ActionResult{Title: "Network status", Body: "No physical interfaces found."}, nil
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
for _, iface := range ifaces {
|
||||||
|
ipv4 := "(no IPv4)"
|
||||||
|
if len(iface.IPv4) > 0 {
|
||||||
|
ipv4 = strings.Join(iface.IPv4, ", ")
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&body, "- %s: state=%s ip=%s\n", iface.Name, iface.State, ipv4)
|
||||||
|
}
|
||||||
|
if gw := a.network.DefaultRoute(); gw != "" {
|
||||||
|
fmt.Fprintf(&body, "\nDefault route: %s\n", gw)
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "Network status", Body: strings.TrimSpace(body.String())}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DefaultStaticIPv4FormFields(iface string) []string {
|
||||||
|
return []string{
|
||||||
|
"",
|
||||||
|
"24",
|
||||||
|
strings.TrimSpace(a.network.DefaultRoute()),
|
||||||
|
"77.88.8.8 77.88.8.1 1.1.1.1 8.8.8.8",
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ParseStaticIPv4Config(iface string, fields []string) platform.StaticIPv4Config {
|
||||||
|
get := func(index int) string {
|
||||||
|
if index >= 0 && index < len(fields) {
|
||||||
|
return strings.TrimSpace(fields[index])
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
return platform.StaticIPv4Config{
|
||||||
|
Interface: iface,
|
||||||
|
Address: get(0),
|
||||||
|
Prefix: get(1),
|
||||||
|
Gateway: get(2),
|
||||||
|
DNS: strings.Fields(get(3)),
|
||||||
|
}
|
||||||
|
}
|
||||||
370
audit/internal/app/app_packs.go
Normal file
370
audit/internal/app/app_packs.go
Normal file
@@ -0,0 +1,370 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
|
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||||
|
body := "Archive written."
|
||||||
|
if path != "" {
|
||||||
|
body = "Archive written to " + path
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||||
|
return a.sat.ListNvidiaGPUs()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||||
|
return a.sat.ListNvidiaGPUStatuses()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||||
|
out, err := a.sat.ResetNvidiaGPU(index)
|
||||||
|
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||||
|
body := "Archive written."
|
||||||
|
if path != "" {
|
||||||
|
body = "Archive written to " + path
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchPerfDir
|
||||||
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
|
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchPowerDir
|
||||||
|
}
|
||||||
|
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
opts.ServerPowerSource = resolved.SelectedSource
|
||||||
|
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultBeeBenchAutotuneDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||||
|
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||||
|
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
if logFunc != nil {
|
||||||
|
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||||
|
}
|
||||||
|
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||||
|
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||||
|
if err != nil {
|
||||||
|
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||||
|
}
|
||||||
|
return *cfg, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
|
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||||
|
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||||
|
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||||
|
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
|
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||||
|
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) DetectGPUVendor() string {
|
||||||
|
return a.sat.DetectGPUVendor()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||||
|
return a.sat.ListAMDGPUs()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||||
|
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||||
|
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||||
|
if strings.TrimSpace(baseDir) == "" {
|
||||||
|
baseDir = DefaultSATBaseDir
|
||||||
|
}
|
||||||
|
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||||
|
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||||
|
body := "Results: " + path
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
body += "\nERROR: " + err.Error()
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||||
|
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||||
|
body := formatFanStressResult(path)
|
||||||
|
if err != nil && err != context.Canceled {
|
||||||
|
body += "\nERROR: " + err.Error()
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||||
|
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||||
|
func formatFanStressResult(archivePath string) string {
|
||||||
|
if archivePath == "" {
|
||||||
|
return "No output produced."
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return "Archive written to " + archivePath
|
||||||
|
}
|
||||||
|
content := strings.TrimSpace(string(raw))
|
||||||
|
kv := parseKeyValueSummary(content)
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
b.WriteString(formatSATDetail(content))
|
||||||
|
|
||||||
|
// Append analysis section.
|
||||||
|
var analysis []string
|
||||||
|
if v, ok := kv["throttling_detected"]; ok {
|
||||||
|
label := "NO"
|
||||||
|
if v == "true" {
|
||||||
|
label = "YES ← throttling detected during load"
|
||||||
|
}
|
||||||
|
analysis = append(analysis, "Throttling: "+label)
|
||||||
|
}
|
||||||
|
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||||
|
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||||
|
}
|
||||||
|
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||||
|
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||||
|
}
|
||||||
|
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||||
|
analysis = append(analysis, "Fan response: "+v+"s")
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(analysis) > 0 {
|
||||||
|
b.WriteString("\n\n=== Analysis ===\n")
|
||||||
|
for _, line := range analysis {
|
||||||
|
b.WriteString(line + "\n")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return strings.TrimSpace(b.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||||
|
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||||
|
func satResultBody(archivePath string) string {
|
||||||
|
if archivePath == "" {
|
||||||
|
return "No output produced."
|
||||||
|
}
|
||||||
|
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||||
|
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||||
|
if err != nil {
|
||||||
|
return "Archive written to " + archivePath
|
||||||
|
}
|
||||||
|
return formatSATDetail(strings.TrimSpace(string(raw)))
|
||||||
|
}
|
||||||
67
audit/internal/app/app_services.go
Normal file
67
audit/internal/app/app_services.go
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
package app
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (a *App) ListBeeServices() ([]string, error) {
|
||||||
|
return a.services.ListBeeServices()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ServiceState(name string) string {
|
||||||
|
return a.services.ServiceState(name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ServiceStatus(name string) (string, error) {
|
||||||
|
return a.services.ServiceStatus(name)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ServiceStatusResult(name string) (ActionResult, error) {
|
||||||
|
body, err := a.services.ServiceStatus(name)
|
||||||
|
return ActionResult{Title: "service status: " + name, Body: bodyOr(body, "No status output.")}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, error) {
|
||||||
|
return a.services.ServiceDo(name, action)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ServiceActionResult(name string, action platform.ServiceAction) (ActionResult, error) {
|
||||||
|
body, err := a.services.ServiceDo(name, action)
|
||||||
|
return ActionResult{Title: "service " + string(action) + ": " + name, Body: bodyOr(body, "Action completed.")}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) TailFile(path string, lines int) string {
|
||||||
|
return a.tools.TailFile(path, lines)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) CheckTools(names []string) []platform.ToolStatus {
|
||||||
|
return a.tools.CheckTools(names)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) ToolCheckResult(names []string) ActionResult {
|
||||||
|
if len(names) == 0 {
|
||||||
|
return ActionResult{Title: "Required tools", Body: "No tools checked."}
|
||||||
|
}
|
||||||
|
var body strings.Builder
|
||||||
|
for _, tool := range a.tools.CheckTools(names) {
|
||||||
|
status := "MISSING"
|
||||||
|
if tool.OK {
|
||||||
|
status = "OK (" + tool.Path + ")"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&body, "- %s: %s\n", tool.Name, status)
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "Required tools", Body: strings.TrimSpace(body.String())}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (a *App) AuditLogTailResult() ActionResult {
|
||||||
|
logTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditLogPath, 40))
|
||||||
|
jsonTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditJSONPath, 20))
|
||||||
|
body := strings.TrimSpace(logTail + "\n\n" + jsonTail)
|
||||||
|
if body == "" {
|
||||||
|
body = "No audit logs found."
|
||||||
|
}
|
||||||
|
return ActionResult{Title: "Audit log tail", Body: body}
|
||||||
|
}
|
||||||
@@ -3,10 +3,11 @@ package app
|
|||||||
import (
|
import (
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strconv"
|
|
||||||
"sort"
|
"sort"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"bee/audit/internal/collector"
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -313,17 +314,20 @@ func statusSeverity(status string) int {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
if dev.DeviceClass == nil {
|
||||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
return false
|
||||||
return false
|
}
|
||||||
}
|
class := strings.TrimSpace(*dev.DeviceClass)
|
||||||
|
isGPUClass := strings.Contains(class, "Controller") || strings.Contains(class, "Accelerator") ||
|
||||||
|
strings.Contains(class, "Display") || strings.Contains(class, "Video")
|
||||||
|
if !isGPUClass {
|
||||||
|
return false
|
||||||
}
|
}
|
||||||
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
|
||||||
switch vendor {
|
switch vendor {
|
||||||
case "amd":
|
case "amd":
|
||||||
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
return dev.VendorID != nil && *dev.VendorID == collector.AMDVendorID
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
return strings.Contains(manufacturer, "nvidia")
|
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||||
default:
|
default:
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"bee/audit/internal/collector"
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -46,10 +47,12 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
|||||||
|
|
||||||
class := "DisplayController"
|
class := "DisplayController"
|
||||||
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
||||||
|
amdVendorID := collector.AMDVendorID
|
||||||
snap := schema.HardwareSnapshot{
|
snap := schema.HardwareSnapshot{
|
||||||
PCIeDevices: []schema.HardwarePCIeDevice{{
|
PCIeDevices: []schema.HardwarePCIeDevice{{
|
||||||
DeviceClass: &class,
|
DeviceClass: &class,
|
||||||
Manufacturer: &manufacturer,
|
Manufacturer: &manufacturer,
|
||||||
|
VendorID: &amdVendorID,
|
||||||
}},
|
}},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ var supportBundleServices = []string{
|
|||||||
"bee-selfheal.service",
|
"bee-selfheal.service",
|
||||||
"bee-selfheal.timer",
|
"bee-selfheal.timer",
|
||||||
"bee-sshsetup.service",
|
"bee-sshsetup.service",
|
||||||
|
"display-manager.service",
|
||||||
|
"lightdm.service",
|
||||||
"nvidia-dcgm.service",
|
"nvidia-dcgm.service",
|
||||||
"nvidia-fabricmanager.service",
|
"nvidia-fabricmanager.service",
|
||||||
}
|
}
|
||||||
@@ -44,12 +46,128 @@ var supportBundleCommands = []struct {
|
|||||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||||
|
{name: "system/dmesg-gui-video-input.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v dmesg >/dev/null 2>&1; then
|
||||||
|
dmesg | grep -iE 'nvidia|drm|fb|framebuffer|vesa|efi|lightdm|Xorg|input|hid|usb|keyboard|mouse|virtual keyboard|virtual mouse|ami|aspeed|ast' || echo "no GUI/video/input kernel messages found"
|
||||||
|
else
|
||||||
|
echo "dmesg not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||||
if command -v dmesg >/dev/null 2>&1; then
|
if command -v dmesg >/dev/null 2>&1; then
|
||||||
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||||
else
|
else
|
||||||
echo "dmesg not found"
|
echo "dmesg not found"
|
||||||
fi
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/loginctl-sessions.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v loginctl >/dev/null 2>&1; then
|
||||||
|
loginctl list-sessions 2>&1 || true
|
||||||
|
else
|
||||||
|
echo "loginctl not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/loginctl-seats.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v loginctl >/dev/null 2>&1; then
|
||||||
|
loginctl list-seats 2>&1 || true
|
||||||
|
echo
|
||||||
|
for seat in $(loginctl list-seats --no-legend 2>/dev/null | awk '{print $1}'); do
|
||||||
|
echo "=== $seat ==="
|
||||||
|
loginctl seat-status "$seat" 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
else
|
||||||
|
echo "loginctl not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/ps-gui.txt", cmd: []string{"sh", "-c", `
|
||||||
|
ps -ef | grep -iE 'lightdm|Xorg|X$|openbox|chromium|chrome|xinit|xsession' | grep -v grep || echo "no GUI processes found"
|
||||||
|
`}},
|
||||||
|
{name: "system/lspci-video-vv.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v lspci >/dev/null 2>&1; then
|
||||||
|
echo "lspci not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for dev in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ {print $1}'); do
|
||||||
|
found=1
|
||||||
|
echo "=== $dev ==="
|
||||||
|
lspci -s "$dev" -vv 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no display-class PCI devices found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/proc-fb.txt", cmd: []string{"cat", "/proc/fb"}},
|
||||||
|
{name: "system/drm-cards.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if [ -d /sys/class/drm ]; then
|
||||||
|
for path in /sys/class/drm/card*; do
|
||||||
|
[ -e "$path" ] || continue
|
||||||
|
card=$(basename "$path")
|
||||||
|
echo "=== $card ==="
|
||||||
|
for f in status enabled dpms modes; do
|
||||||
|
[ -r "$path/$f" ] && printf " %-8s %s\n" "$f" "$(cat "$path/$f" 2>/dev/null)"
|
||||||
|
done
|
||||||
|
device=$(readlink -f "$path/device" 2>/dev/null || true)
|
||||||
|
[ -n "$device" ] && echo " device ${device##*/}"
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
else
|
||||||
|
echo "/sys/class/drm not present"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/input-devices.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if [ -r /proc/bus/input/devices ]; then
|
||||||
|
cat /proc/bus/input/devices
|
||||||
|
else
|
||||||
|
echo "/proc/bus/input/devices not readable"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/udevadm-input.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v udevadm >/dev/null 2>&1; then
|
||||||
|
echo "udevadm not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
found=0
|
||||||
|
for dev in /dev/input/event*; do
|
||||||
|
[ -e "$dev" ] || continue
|
||||||
|
found=1
|
||||||
|
echo "=== $dev ==="
|
||||||
|
udevadm info --query=all --name="$dev" 2>&1 || true
|
||||||
|
echo
|
||||||
|
done
|
||||||
|
if [ "$found" -eq 0 ]; then
|
||||||
|
echo "no /dev/input/event* devices found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/xinput-list.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v xinput >/dev/null 2>&1; then
|
||||||
|
DISPLAY=:0 xinput --list 2>&1 || true
|
||||||
|
else
|
||||||
|
echo "xinput not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/libinput-list-devices.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if command -v libinput >/dev/null 2>&1; then
|
||||||
|
libinput list-devices 2>&1 || true
|
||||||
|
else
|
||||||
|
echo "libinput not found"
|
||||||
|
fi
|
||||||
|
`}},
|
||||||
|
{name: "system/systemctl-gui-units.txt", cmd: []string{"sh", "-c", `
|
||||||
|
if ! command -v systemctl >/dev/null 2>&1; then
|
||||||
|
echo "systemctl not found"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
echo "=== unit files ==="
|
||||||
|
systemctl list-unit-files --no-pager --all 'lightdm*' 'display-manager*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== active units ==="
|
||||||
|
systemctl list-units --no-pager --all 'lightdm*' 'display-manager*' 2>&1 || true
|
||||||
|
echo
|
||||||
|
echo "=== failed units ==="
|
||||||
|
systemctl --failed --no-pager 2>&1 | grep -iE 'lightdm|display-manager|Xorg' || echo "no failed GUI units"
|
||||||
`}},
|
`}},
|
||||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||||
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||||
@@ -236,6 +354,13 @@ var supportBundleOptionalFiles = []struct {
|
|||||||
}{
|
}{
|
||||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||||
|
{name: "system/Xorg.0.log", src: "/var/log/Xorg.0.log"},
|
||||||
|
{name: "system/Xorg.0.log.old", src: "/var/log/Xorg.0.log.old"},
|
||||||
|
{name: "system/lightdm/lightdm.log", src: "/var/log/lightdm/lightdm.log"},
|
||||||
|
{name: "system/lightdm/x-0.log", src: "/var/log/lightdm/x-0.log"},
|
||||||
|
{name: "system/lightdm/x-0-greeter.log", src: "/var/log/lightdm/x-0-greeter.log"},
|
||||||
|
{name: "system/home-bee-xsession-errors.log", src: "/home/bee/.xsession-errors"},
|
||||||
|
{name: "system/home-bee-chromium-debug.log", src: "/tmp/bee-chrome/chrome_debug.log"},
|
||||||
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||||
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||||
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||||
|
|||||||
@@ -84,11 +84,10 @@ func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||||
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
if dev.DeviceClass == nil {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
return dev.VendorID != nil && *dev.VendorID == AMDVendorID && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||||
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
||||||
|
|||||||
@@ -174,15 +174,19 @@ func cleanDMIValue(v string) string {
|
|||||||
upper := strings.ToUpper(v)
|
upper := strings.ToUpper(v)
|
||||||
placeholders := []string{
|
placeholders := []string{
|
||||||
"TO BE FILLED BY O.E.M.",
|
"TO BE FILLED BY O.E.M.",
|
||||||
|
"TO BE FILLED BY O.E.M",
|
||||||
"NOT SPECIFIED",
|
"NOT SPECIFIED",
|
||||||
"NOT SETTABLE",
|
"NOT SETTABLE",
|
||||||
"NOT PRESENT",
|
"NOT PRESENT",
|
||||||
|
"NOT AVAILABLE",
|
||||||
"UNKNOWN",
|
"UNKNOWN",
|
||||||
"N/A",
|
"N/A",
|
||||||
"NONE",
|
"NONE",
|
||||||
"NULL",
|
"NULL",
|
||||||
"DEFAULT STRING",
|
"DEFAULT STRING",
|
||||||
"0",
|
"0",
|
||||||
|
"0123456789",
|
||||||
|
"1234567890",
|
||||||
}
|
}
|
||||||
for _, p := range placeholders {
|
for _, p := range placeholders {
|
||||||
if upper == p {
|
if upper == p {
|
||||||
|
|||||||
@@ -84,6 +84,10 @@ func TestCleanDMIValue(t *testing.T) {
|
|||||||
{" Inspur ", "Inspur"},
|
{" Inspur ", "Inspur"},
|
||||||
{"", ""},
|
{"", ""},
|
||||||
{"0", ""},
|
{"0", ""},
|
||||||
|
{"0123456789", ""},
|
||||||
|
{"1234567890", ""},
|
||||||
|
{"Not Available", ""},
|
||||||
|
{"To Be Filled By O.E.M", ""},
|
||||||
}
|
}
|
||||||
for _, tt := range tests {
|
for _, tt := range tests {
|
||||||
got := cleanDMIValue(tt.input)
|
got := cleanDMIValue(tt.input)
|
||||||
@@ -109,6 +113,80 @@ func TestParseDMIFields(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseBoard_Dell(t *testing.T) {
|
||||||
|
type1 := mustReadFile(t, "testdata/dmidecode_type1_dell.txt")
|
||||||
|
type2 := mustReadFile(t, "testdata/dmidecode_type2_dell.txt")
|
||||||
|
|
||||||
|
board := parseBoard(type1, type2)
|
||||||
|
|
||||||
|
if board.SerialNumber != "7SG9F63" {
|
||||||
|
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "7SG9F63")
|
||||||
|
}
|
||||||
|
if board.Manufacturer == nil || *board.Manufacturer != "Dell Inc." {
|
||||||
|
t.Errorf("manufacturer: got %v, want Dell Inc.", board.Manufacturer)
|
||||||
|
}
|
||||||
|
if board.ProductName == nil || *board.ProductName != "PowerEdge R740xd" {
|
||||||
|
t.Errorf("product_name: got %v, want PowerEdge R740xd", board.ProductName)
|
||||||
|
}
|
||||||
|
// part number comes from type2 Product Name
|
||||||
|
if board.PartNumber == nil || *board.PartNumber != "0F9N89" {
|
||||||
|
t.Errorf("part_number: got %v, want 0F9N89", board.PartNumber)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseBoard_HPE(t *testing.T) {
|
||||||
|
type1 := mustReadFile(t, "testdata/dmidecode_type1_hpe.txt")
|
||||||
|
type2 := mustReadFile(t, "testdata/dmidecode_type2_hpe.txt")
|
||||||
|
|
||||||
|
board := parseBoard(type1, type2)
|
||||||
|
|
||||||
|
if board.SerialNumber != "CZJ9320CXN" {
|
||||||
|
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "CZJ9320CXN")
|
||||||
|
}
|
||||||
|
if board.Manufacturer == nil || *board.Manufacturer != "HPE" {
|
||||||
|
t.Errorf("manufacturer: got %v, want HPE", board.Manufacturer)
|
||||||
|
}
|
||||||
|
if board.ProductName == nil || *board.ProductName != "ProLiant DL380 Gen10" {
|
||||||
|
t.Errorf("product_name: got %v, want ProLiant DL380 Gen10", board.ProductName)
|
||||||
|
}
|
||||||
|
if board.PartNumber == nil || *board.PartNumber != "ProLiant DL380 Gen10" {
|
||||||
|
t.Errorf("part_number: got %v, want ProLiant DL380 Gen10", board.PartNumber)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseBoard_Supermicro_Placeholders(t *testing.T) {
|
||||||
|
type1 := mustReadFile(t, "testdata/dmidecode_type1_supermicro.txt")
|
||||||
|
type2 := mustReadFile(t, "testdata/dmidecode_type2_supermicro.txt")
|
||||||
|
|
||||||
|
board := parseBoard(type1, type2)
|
||||||
|
|
||||||
|
if board.SerialNumber != "S214726X2A36789" {
|
||||||
|
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "S214726X2A36789")
|
||||||
|
}
|
||||||
|
if board.Manufacturer == nil || *board.Manufacturer != "Supermicro" {
|
||||||
|
t.Errorf("manufacturer: got %v, want Supermicro", board.Manufacturer)
|
||||||
|
}
|
||||||
|
if board.ProductName == nil || *board.ProductName != "SYS-6028R-WTR" {
|
||||||
|
t.Errorf("product_name: got %v, want SYS-6028R-WTR", board.ProductName)
|
||||||
|
}
|
||||||
|
// "X10DRW-i" is the real part number from type 2
|
||||||
|
if board.PartNumber == nil || *board.PartNumber != "X10DRW-i" {
|
||||||
|
t.Errorf("part_number: got %v, want X10DRW-i", board.PartNumber)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseBIOSFirmware_Dell(t *testing.T) {
|
||||||
|
type0 := mustReadFile(t, "testdata/dmidecode_type0_dell.txt")
|
||||||
|
fw := parseBIOSFirmware(type0)
|
||||||
|
|
||||||
|
if len(fw) != 1 {
|
||||||
|
t.Fatalf("expected 1 firmware record, got %d", len(fw))
|
||||||
|
}
|
||||||
|
if fw[0].Version != "2.5.4" {
|
||||||
|
t.Errorf("version: got %q, want 2.5.4", fw[0].Version)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func mustReadFile(t *testing.T, path string) string {
|
func mustReadFile(t *testing.T, path string) string {
|
||||||
t.Helper()
|
t.Helper()
|
||||||
b, err := os.ReadFile(path)
|
b, err := os.ReadFile(path)
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
|||||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||||
|
snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||||
|
|||||||
87
audit/internal/collector/memory_test.go
Normal file
87
audit/internal/collector/memory_test.go
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseMemory_Mixed(t *testing.T) {
|
||||||
|
out := mustReadFile(t, "testdata/dmidecode_type17_mixed.txt")
|
||||||
|
dimms := parseMemory(out)
|
||||||
|
|
||||||
|
if len(dimms) != 3 {
|
||||||
|
t.Fatalf("expected 3 DIMMs, got %d", len(dimms))
|
||||||
|
}
|
||||||
|
|
||||||
|
// slot 0: populated, 16 GB Supermicro-style
|
||||||
|
d0 := dimms[0]
|
||||||
|
if d0.Present == nil || !*d0.Present {
|
||||||
|
t.Errorf("dimm0: expected present=true")
|
||||||
|
}
|
||||||
|
if d0.SizeMB == nil || *d0.SizeMB != 16384 {
|
||||||
|
t.Errorf("dimm0: size_mb=%v, want 16384", d0.SizeMB)
|
||||||
|
}
|
||||||
|
if d0.Slot == nil || *d0.Slot != "P1-DIMMA1" {
|
||||||
|
t.Errorf("dimm0: slot=%v, want P1-DIMMA1", d0.Slot)
|
||||||
|
}
|
||||||
|
if d0.Location == nil || *d0.Location != "P0_Node0_Channel0_Dimm0" {
|
||||||
|
t.Errorf("dimm0: location=%v, want P0_Node0_Channel0_Dimm0", d0.Location)
|
||||||
|
}
|
||||||
|
if d0.Manufacturer == nil || *d0.Manufacturer != "Micron" {
|
||||||
|
t.Errorf("dimm0: manufacturer=%v, want Micron", d0.Manufacturer)
|
||||||
|
}
|
||||||
|
if d0.PartNumber == nil || *d0.PartNumber != "36ASF2G72PZ-2G1A2" {
|
||||||
|
t.Errorf("dimm0: part_number=%v, want 36ASF2G72PZ-2G1A2", d0.PartNumber)
|
||||||
|
}
|
||||||
|
if d0.MaxSpeedMHz == nil || *d0.MaxSpeedMHz != 2133 {
|
||||||
|
t.Errorf("dimm0: max_speed_mhz=%v, want 2133", d0.MaxSpeedMHz)
|
||||||
|
}
|
||||||
|
|
||||||
|
// slot 1: empty
|
||||||
|
d1 := dimms[1]
|
||||||
|
if d1.Present == nil || *d1.Present {
|
||||||
|
t.Errorf("dimm1: expected present=false")
|
||||||
|
}
|
||||||
|
if d1.Status == nil || *d1.Status != statusEmpty {
|
||||||
|
t.Errorf("dimm1: status=%v, want %s", d1.Status, statusEmpty)
|
||||||
|
}
|
||||||
|
if d1.SizeMB != nil {
|
||||||
|
t.Errorf("dimm1: size_mb should be nil for empty slot, got %v", d1.SizeMB)
|
||||||
|
}
|
||||||
|
|
||||||
|
// slot 2: populated, 32768 MB Dell-style size
|
||||||
|
d2 := dimms[2]
|
||||||
|
if d2.Present == nil || !*d2.Present {
|
||||||
|
t.Errorf("dimm2: expected present=true")
|
||||||
|
}
|
||||||
|
if d2.SizeMB == nil || *d2.SizeMB != 32768 {
|
||||||
|
t.Errorf("dimm2: size_mb=%v, want 32768", d2.SizeMB)
|
||||||
|
}
|
||||||
|
if d2.Manufacturer == nil || *d2.Manufacturer != "Samsung" {
|
||||||
|
t.Errorf("dimm2: manufacturer=%v, want Samsung", d2.Manufacturer)
|
||||||
|
}
|
||||||
|
if d2.CurrentSpeedMHz == nil || *d2.CurrentSpeedMHz != 2400 {
|
||||||
|
t.Errorf("dimm2: current_speed_mhz=%v, want 2400", d2.CurrentSpeedMHz)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseMemorySizeMB(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
input string
|
||||||
|
want int
|
||||||
|
}{
|
||||||
|
{"16 GB", 16384},
|
||||||
|
{"32 GB", 32768},
|
||||||
|
{"8 GB", 8192},
|
||||||
|
{"16384 MB", 16384},
|
||||||
|
{"32768 MB", 32768},
|
||||||
|
{"No Module Installed", 0},
|
||||||
|
{"0", 0},
|
||||||
|
{"", 0},
|
||||||
|
}
|
||||||
|
for _, tt := range tests {
|
||||||
|
got := parseMemorySizeMB(tt.input)
|
||||||
|
if got != tt.want {
|
||||||
|
t.Errorf("parseMemorySizeMB(%q) = %d, want %d", tt.input, got, tt.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -11,7 +11,6 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
const mellanoxVendorID = 0x15b3
|
|
||||||
const nicProbeTimeout = 2 * time.Second
|
const nicProbeTimeout = 2 * time.Second
|
||||||
|
|
||||||
var (
|
var (
|
||||||
@@ -80,16 +79,7 @@ func enrichPCIeWithMellanox(devs []schema.HardwarePCIeDevice) []schema.HardwareP
|
|||||||
}
|
}
|
||||||
|
|
||||||
func isMellanoxDevice(dev schema.HardwarePCIeDevice) bool {
|
func isMellanoxDevice(dev schema.HardwarePCIeDevice) bool {
|
||||||
if dev.VendorID != nil && *dev.VendorID == mellanoxVendorID {
|
return dev.VendorID != nil && *dev.VendorID == MellanoxVendorID
|
||||||
return true
|
|
||||||
}
|
|
||||||
if dev.Manufacturer != nil {
|
|
||||||
m := strings.ToLower(*dev.Manufacturer)
|
|
||||||
if strings.Contains(m, "mellanox") || strings.Contains(m, "nvidia networking") {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func queryMellanoxFromMstflint(bdf string) (firmware, serial string) {
|
func queryMellanoxFromMstflint(bdf string) (firmware, serial string) {
|
||||||
|
|||||||
@@ -55,7 +55,7 @@ func TestEnrichPCIeWithMellanox_mstflint(t *testing.T) {
|
|||||||
}
|
}
|
||||||
netIfacesByBDF = func(string) []string { return nil }
|
netIfacesByBDF = func(string) []string { return nil }
|
||||||
|
|
||||||
vendorID := mellanoxVendorID
|
vendorID := MellanoxVendorID
|
||||||
bdf := "0000:18:00.0"
|
bdf := "0000:18:00.0"
|
||||||
manufacturer := "Mellanox Technologies"
|
manufacturer := "Mellanox Technologies"
|
||||||
devs := []schema.HardwarePCIeDevice{{
|
devs := []schema.HardwarePCIeDevice{{
|
||||||
@@ -99,7 +99,7 @@ func TestEnrichPCIeWithMellanox_fallbackEthtool(t *testing.T) {
|
|||||||
return "driver: mlx5_core\nfirmware-version: 28.40.1000\n", nil
|
return "driver: mlx5_core\nfirmware-version: 28.40.1000\n", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
vendorID := mellanoxVendorID
|
vendorID := MellanoxVendorID
|
||||||
bdf := "0000:18:00.0"
|
bdf := "0000:18:00.0"
|
||||||
manufacturer := "NVIDIA Networking"
|
manufacturer := "NVIDIA Networking"
|
||||||
devs := []schema.HardwarePCIeDevice{{
|
devs := []schema.HardwarePCIeDevice{{
|
||||||
|
|||||||
@@ -10,8 +10,6 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
const nvidiaVendorID = 0x10de
|
|
||||||
|
|
||||||
type nvidiaGPUInfo struct {
|
type nvidiaGPUInfo struct {
|
||||||
Index int
|
Index int
|
||||||
BDF string
|
BDF string
|
||||||
@@ -240,13 +238,7 @@ func normalizePCIeBDF(bdf string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
||||||
if dev.VendorID != nil && *dev.VendorID == nvidiaVendorID {
|
return dev.VendorID != nil && *dev.VendorID == NvidiaVendorID
|
||||||
return true
|
|
||||||
}
|
|
||||||
if dev.Manufacturer != nil && strings.Contains(strings.ToLower(*dev.Manufacturer), "nvidia") {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ func TestNormalizePCIeBDF(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||||
vendorID := nvidiaVendorID
|
vendorID := NvidiaVendorID
|
||||||
bdf := "0000:65:00.0"
|
bdf := "0000:65:00.0"
|
||||||
manufacturer := "NVIDIA Corporation"
|
manufacturer := "NVIDIA Corporation"
|
||||||
status := "OK"
|
status := "OK"
|
||||||
@@ -104,7 +104,7 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
||||||
vendorID := nvidiaVendorID
|
vendorID := NvidiaVendorID
|
||||||
bdf := "0000:17:00.0"
|
bdf := "0000:17:00.0"
|
||||||
manufacturer := "NVIDIA Corporation"
|
manufacturer := "NVIDIA Corporation"
|
||||||
devices := []schema.HardwarePCIeDevice{
|
devices := []schema.HardwarePCIeDevice{
|
||||||
|
|||||||
11
audit/internal/collector/pci_vendors.go
Normal file
11
audit/internal/collector/pci_vendors.go
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
// PCI vendor IDs used to classify hardware by manufacturer, listed in
// ascending numeric order.
// Source: https://pcisig.com / https://pci-ids.ucw.cz/
const (
	// AMDVendorID is the PCI vendor ID matched for AMD devices.
	AMDVendorID = 0x1002
	// NvidiaVendorID is the PCI vendor ID matched for NVIDIA devices.
	NvidiaVendorID = 0x10de
	// MellanoxVendorID is the PCI vendor ID matched for Mellanox devices.
	MellanoxVendorID = 0x15b3
	// AspeedVendorID is the PCI vendor ID matched for ASPEED devices.
	AspeedVendorID = 0x1a03
	// IntelVendorID is the PCI vendor ID matched for Intel devices.
	IntelVendorID = 0x8086
)
|
||||||
@@ -126,38 +126,39 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
dev.Status = &status
|
dev.Status = &status
|
||||||
|
|
||||||
// Slot is the BDF: "0000:00:02.0"
|
// Slot is the BDF: "0000:00:02.0"
|
||||||
if bdf := fields["Slot"]; bdf != "" {
|
bdfStr := fields["Slot"]
|
||||||
dev.Slot = &bdf
|
if bdfStr != "" {
|
||||||
dev.BDF = &bdf
|
dev.Slot = &bdfStr
|
||||||
|
dev.BDF = &bdfStr
|
||||||
// parse vendor_id and device_id from sysfs
|
// parse vendor_id and device_id from sysfs
|
||||||
vendorID, deviceID := readPCIIDs(bdf)
|
vendorID, deviceID := readPCIIDs(bdfStr)
|
||||||
if vendorID != 0 {
|
if vendorID != 0 {
|
||||||
dev.VendorID = &vendorID
|
dev.VendorID = &vendorID
|
||||||
}
|
}
|
||||||
if deviceID != 0 {
|
if deviceID != 0 {
|
||||||
dev.DeviceID = &deviceID
|
dev.DeviceID = &deviceID
|
||||||
}
|
}
|
||||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
if numaNode, ok := readPCINumaNode(bdfStr); ok {
|
||||||
dev.NUMANode = &numaNode
|
dev.NUMANode = &numaNode
|
||||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||||
dev.NUMANode = &numaNode
|
dev.NUMANode = &numaNode
|
||||||
}
|
}
|
||||||
if group, ok := readPCIIOMMUGroup(bdf); ok {
|
if group, ok := readPCIIOMMUGroup(bdfStr); ok {
|
||||||
dev.IOMMUGroup = &group
|
dev.IOMMUGroup = &group
|
||||||
}
|
}
|
||||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
if width, ok := readPCIIntAttribute(bdfStr, "current_link_width"); ok {
|
||||||
dev.LinkWidth = &width
|
dev.LinkWidth = &width
|
||||||
}
|
}
|
||||||
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
|
if width, ok := readPCIIntAttribute(bdfStr, "max_link_width"); ok {
|
||||||
dev.MaxLinkWidth = &width
|
dev.MaxLinkWidth = &width
|
||||||
}
|
}
|
||||||
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
|
if speed, ok := readPCIStringAttribute(bdfStr, "current_link_speed"); ok {
|
||||||
linkSpeed := normalizePCILinkSpeed(speed)
|
linkSpeed := normalizePCILinkSpeed(speed)
|
||||||
if linkSpeed != "" {
|
if linkSpeed != "" {
|
||||||
dev.LinkSpeed = &linkSpeed
|
dev.LinkSpeed = &linkSpeed
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
|
if speed, ok := readPCIStringAttribute(bdfStr, "max_link_speed"); ok {
|
||||||
linkSpeed := normalizePCILinkSpeed(speed)
|
linkSpeed := normalizePCILinkSpeed(speed)
|
||||||
if linkSpeed != "" {
|
if linkSpeed != "" {
|
||||||
dev.MaxLinkSpeed = &linkSpeed
|
dev.MaxLinkSpeed = &linkSpeed
|
||||||
@@ -178,7 +179,15 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
|||||||
|
|
||||||
// SVendor/SDevice available but not in schema — skip
|
// SVendor/SDevice available but not in schema — skip
|
||||||
|
|
||||||
// Warn if PCIe link is running below its maximum negotiated speed.
|
// Detect NVLink bridge mezzanine cards (CPU→HGX internal link).
|
||||||
|
// These are Mellanox x2 devices with no host net interfaces and a DeviceName
|
||||||
|
// containing "NVLINK". The targeted lspci call is only executed for the small
|
||||||
|
// number of narrow-link Mellanox cards that pass the cheap pre-filter.
|
||||||
|
if bdfStr != "" && isNVLinkBridgeCandidate(bdfStr, dev) && confirmNVLinkBridgeDeviceName(bdfStr) {
|
||||||
|
markNVLinkBridge(&dev)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warn (or Critical for NVLink bridges) if PCIe link is running below max.
|
||||||
applyPCIeLinkSpeedWarning(&dev)
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
return dev
|
return dev
|
||||||
@@ -265,17 +274,37 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
|||||||
return value, true
|
return value, true
|
||||||
}
|
}
|
||||||
|
|
||||||
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
// applyPCIeLinkSpeedWarning sets device status when the current PCIe link speed is
|
||||||
// speed is below the maximum negotiated speed supported by both ends.
|
// below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards
|
||||||
|
// get Critical because they are fixed internal connectors that must always train
|
||||||
|
// to max speed — any downgrade signals a hardware fault.
|
||||||
|
//
|
||||||
|
// Disabled devices (sysfs enable==0) are skipped: they carry no data traffic and
|
||||||
|
// their link state has no operational impact. This covers management endpoints
|
||||||
|
// (e.g. PCIe switch fabric controllers on HGX baseboards) that the kernel never
|
||||||
|
// activates but that lspci still reports with link stats.
|
||||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if dev.BDF != nil {
|
||||||
|
if enabled, ok := readPCIIntAttribute(*dev.BDF, "enable"); ok && enabled == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||||
|
dev.ErrorDescription = &desc
|
||||||
|
|
||||||
|
isNVLinkBridge := dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge"
|
||||||
|
if isNVLinkBridge {
|
||||||
|
crit := statusCritical
|
||||||
|
dev.Status = &crit
|
||||||
|
} else {
|
||||||
warn := statusWarning
|
warn := statusWarning
|
||||||
dev.Status = &warn
|
dev.Status = &warn
|
||||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
|
||||||
dev.ErrorDescription = &desc
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
206
audit/internal/collector/pcie_nvlink_bridge.go
Normal file
206
audit/internal/collector/pcie_nvlink_bridge.go
Normal file
@@ -0,0 +1,206 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"log/slog"
|
||||||
|
"os/exec"
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`)
|
||||||
|
|
||||||
|
// isNVLinkBridgeCandidate returns true for Mellanox PCIe devices that look like
|
||||||
|
// NVLink bridge mezzanine cards: narrow link (x2), no host net interfaces.
|
||||||
|
// These are the CPU-side PCIe control plane of the NVSwitch fabric on HGX/DGX systems.
|
||||||
|
func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool {
|
||||||
|
if !isMellanoxDevice(dev) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if dev.LinkWidth == nil || *dev.LinkWidth > 2 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
if len(netIfacesByBDF(bdf)) > 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
// confirmNVLinkBridgeDeviceName runs `lspci -s <bdf> -v` for a single device and
// reports whether "NVLINK" appears (case-insensitively) anywhere in its output.
// It returns false when the lspci invocation fails. The call is intended only
// for devices already pre-filtered by isNVLinkBridgeCandidate.
func confirmNVLinkBridgeDeviceName(bdf string) bool {
	verbose, err := exec.Command("lspci", "-s", bdf, "-v").Output()
	if err != nil {
		return false
	}
	// The marker contains no line break, so searching the whole output is
	// equivalent to searching it line by line.
	return strings.Contains(strings.ToUpper(string(verbose)), "NVLINK")
}
|
||||||
|
|
||||||
|
// markNVLinkBridge overwrites device_class and adds telemetry flags on a detected
|
||||||
|
// NVLink bridge card. Must be called before applyPCIeLinkSpeedWarning so that the
|
||||||
|
// correct severity (Critical) is applied.
|
||||||
|
func markNVLinkBridge(dev *schema.HardwarePCIeDevice) {
|
||||||
|
class := "NVLinkBridge"
|
||||||
|
dev.DeviceClass = &class
|
||||||
|
if dev.Telemetry == nil {
|
||||||
|
dev.Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
dev.Telemetry["nvlink_bridge"] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
// enrichNVLinkBridgesWithGPUTopo cross-references NVLink bridge PCIe status with
|
||||||
|
// the GPU-side NVLink topology reported by nvidia-smi. For each bridge device it
|
||||||
|
// adds nvlink_topo_all_active and nvlink_topo_min_links to the telemetry, and
|
||||||
|
// upgrades a degraded-link Warning to Critical when the fabric is also affected.
|
||||||
|
func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||||
|
hasBridge := false
|
||||||
|
for _, d := range devs {
|
||||||
|
if d.DeviceClass != nil && *d.DeviceClass == "NVLinkBridge" {
|
||||||
|
hasBridge = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !hasBridge {
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
topo, err := queryNVIDIANVLinkTopo()
|
||||||
|
if err != nil {
|
||||||
|
slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err)
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range devs {
|
||||||
|
if devs[i].DeviceClass == nil || *devs[i].DeviceClass != "NVLinkBridge" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if devs[i].Telemetry == nil {
|
||||||
|
devs[i].Telemetry = map[string]any{}
|
||||||
|
}
|
||||||
|
devs[i].Telemetry["nvlink_topo_all_active"] = topo.AllActive
|
||||||
|
devs[i].Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks
|
||||||
|
devs[i].Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount
|
||||||
|
|
||||||
|
// If the bridge PCIe is already degraded AND the fabric is also degraded
|
||||||
|
// (missing NVLink connections), escalate to Critical.
|
||||||
|
if devs[i].Status != nil && *devs[i].Status == statusCritical && !topo.AllActive {
|
||||||
|
devs[i].Telemetry["nvlink_fabric_affected"] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
slog.Info("nvlink-bridge: topo cross-reference applied",
|
||||||
|
"gpu_count", topo.GPUCount,
|
||||||
|
"all_active", topo.AllActive,
|
||||||
|
"min_links", topo.MinNVLinks,
|
||||||
|
)
|
||||||
|
return devs
|
||||||
|
}
|
||||||
|
|
||||||
|
// nvlinkTopoResult summarises the GPU NVLink connectivity matrix as produced by
// parseNVIDIATopologyMatrix.
type nvlinkTopoResult struct {
	// GPUCount is the number of GPU columns found in the matrix header.
	GPUCount int
	// AllActive is true when at least one NV# cell was parsed and none of the
	// parsed NV# cells reported zero links.
	AllActive bool
	// MinNVLinks is the smallest NVLink bond count among the parsed NV# cells;
	// it is 0 when a cell reported NV0 or when no NV# cell was found.
	MinNVLinks int
}
|
||||||
|
|
||||||
|
// queryNVIDIANVLinkTopo runs nvidia-smi topo -m and parses the NVLink matrix.
|
||||||
|
func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
|
||||||
|
out, err := exec.Command("nvidia-smi", "topo", "-m").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nvlinkTopoResult{}, err
|
||||||
|
}
|
||||||
|
return parseNVIDIATopologyMatrix(string(out)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
|
||||||
|
// nvidia-smi topo -m matrix.
|
||||||
|
//
|
||||||
|
// Format (abbreviated):
|
||||||
|
//
|
||||||
|
// GPU0 GPU1 ... NIC0 NIC1
|
||||||
|
// GPU0 X NV18 ... NODE NODE
|
||||||
|
// GPU1 NV18 X ... NODE NODE
|
||||||
|
// NIC0 NODE NODE... X PIX
|
||||||
|
//
|
||||||
|
// The header row starts with "GPU0"; its columns may include non-GPU entries
|
||||||
|
// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
|
||||||
|
// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
|
||||||
|
func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
|
||||||
|
lines := strings.Split(raw, "\n")
|
||||||
|
|
||||||
|
// Locate the header line and record which column indices are GPU columns.
|
||||||
|
headerIdx := -1
|
||||||
|
var gpuColIndices []int // 0-based indices within fields (excluding the row label)
|
||||||
|
var gpuCount int
|
||||||
|
for i, line := range lines {
|
||||||
|
trimmed := strings.TrimSpace(line)
|
||||||
|
if strings.HasPrefix(trimmed, "GPU0") {
|
||||||
|
parts := strings.Fields(trimmed)
|
||||||
|
for j, col := range parts {
|
||||||
|
if strings.HasPrefix(col, "GPU") {
|
||||||
|
gpuColIndices = append(gpuColIndices, j)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
gpuCount = len(gpuColIndices)
|
||||||
|
if gpuCount >= 2 {
|
||||||
|
headerIdx = i
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if headerIdx < 0 || gpuCount == 0 {
|
||||||
|
return nvlinkTopoResult{}
|
||||||
|
}
|
||||||
|
|
||||||
|
minLinks := -1 // -1 = no NV pair seen yet
|
||||||
|
allActive := true
|
||||||
|
|
||||||
|
for _, line := range lines[headerIdx+1:] {
|
||||||
|
trimmed := strings.TrimSpace(line)
|
||||||
|
if !strings.HasPrefix(trimmed, "GPU") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cells := strings.Fields(trimmed)
|
||||||
|
// cells[0] is the row label (e.g. "GPU0"); cells[1..] are column values.
|
||||||
|
// gpuColIndices are 0-based within the header fields, so they map to
|
||||||
|
// cells[idx+1] in the data rows (shift by 1 for the row label).
|
||||||
|
for _, colIdx := range gpuColIndices {
|
||||||
|
dataIdx := colIdx + 1
|
||||||
|
if dataIdx >= len(cells) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cell := cells[dataIdx]
|
||||||
|
m := nv5re.FindStringSubmatch(cell)
|
||||||
|
if len(m) != 2 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
n, err := strconv.Atoi(m[1])
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if n == 0 {
|
||||||
|
allActive = false
|
||||||
|
}
|
||||||
|
if minLinks < 0 || n < minLinks {
|
||||||
|
minLinks = n
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if minLinks < 0 {
|
||||||
|
minLinks = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
return nvlinkTopoResult{
|
||||||
|
GPUCount: gpuCount,
|
||||||
|
AllActive: allActive && minLinks > 0,
|
||||||
|
MinNVLinks: minLinks,
|
||||||
|
}
|
||||||
|
}
|
||||||
124
audit/internal/collector/pcie_nvlink_bridge_test.go
Normal file
124
audit/internal/collector/pcie_nvlink_bridge_test.go
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
package collector
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bee/audit/internal/schema"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrix(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// Real-world B200 HGX output: 8 GPUs, all pairs connected via NV18.
|
||||||
|
input := ` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1
|
||||||
|
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 NODE NODE
|
||||||
|
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 SYS SYS
|
||||||
|
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 SYS SYS
|
||||||
|
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 SYS SYS
|
||||||
|
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X SYS SYS
|
||||||
|
NIC0 NODE NODE NODE NODE SYS SYS SYS SYS X PIX
|
||||||
|
`
|
||||||
|
got := parseNVIDIATopologyMatrix(input)
|
||||||
|
|
||||||
|
if got.GPUCount != 8 {
|
||||||
|
t.Fatalf("GPUCount=%d want 8", got.GPUCount)
|
||||||
|
}
|
||||||
|
if !got.AllActive {
|
||||||
|
t.Fatalf("AllActive=false want true")
|
||||||
|
}
|
||||||
|
if got.MinNVLinks != 18 {
|
||||||
|
t.Fatalf("MinNVLinks=%d want 18", got.MinNVLinks)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrixPartialDegradation(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// GPU1-GPU3 pair shows NV12 (reduced) instead of NV18.
|
||||||
|
input := ` GPU0 GPU1 GPU2 GPU3
|
||||||
|
GPU0 X NV18 NV18 NV18
|
||||||
|
GPU1 NV18 X NV18 NV12
|
||||||
|
GPU2 NV18 NV18 X NV18
|
||||||
|
GPU3 NV18 NV12 NV18 X
|
||||||
|
`
|
||||||
|
got := parseNVIDIATopologyMatrix(input)
|
||||||
|
|
||||||
|
if got.MinNVLinks != 12 {
|
||||||
|
t.Fatalf("MinNVLinks=%d want 12", got.MinNVLinks)
|
||||||
|
}
|
||||||
|
if !got.AllActive {
|
||||||
|
t.Fatalf("AllActive=false want true (12 links is still active)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrixDisconnected(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// GPU0-GPU1 pair fully disconnected (NV0).
|
||||||
|
input := ` GPU0 GPU1
|
||||||
|
GPU0 X NV0
|
||||||
|
GPU1 NV0 X
|
||||||
|
`
|
||||||
|
got := parseNVIDIATopologyMatrix(input)
|
||||||
|
|
||||||
|
if got.AllActive {
|
||||||
|
t.Fatalf("AllActive=true want false (NV0 means no links)")
|
||||||
|
}
|
||||||
|
if got.MinNVLinks != 0 {
|
||||||
|
t.Fatalf("MinNVLinks=%d want 0", got.MinNVLinks)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
got := parseNVIDIATopologyMatrix("no gpus here")
|
||||||
|
if got.GPUCount != 0 {
|
||||||
|
t.Fatalf("GPUCount=%d want 0", got.GPUCount)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
bridgeClass := "NVLinkBridge"
|
||||||
|
linkSpeed := "Gen3"
|
||||||
|
maxLinkSpeed := "Gen4"
|
||||||
|
dev := schema.HardwarePCIeDevice{}
|
||||||
|
dev.DeviceClass = &bridgeClass
|
||||||
|
dev.LinkSpeed = &linkSpeed
|
||||||
|
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||||
|
s := statusOK
|
||||||
|
dev.Status = &s
|
||||||
|
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
|
if dev.Status == nil || *dev.Status != statusCritical {
|
||||||
|
t.Fatalf("status=%v want Critical for NVLink bridge degradation", dev.Status)
|
||||||
|
}
|
||||||
|
if dev.ErrorDescription == nil {
|
||||||
|
t.Fatal("ErrorDescription nil, want degradation message")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestApplyPCIeLinkSpeedWarningRegularCardIsWarning(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
regularClass := "NetworkController"
|
||||||
|
linkSpeed := "Gen3"
|
||||||
|
maxLinkSpeed := "Gen4"
|
||||||
|
dev := schema.HardwarePCIeDevice{}
|
||||||
|
dev.DeviceClass = ®ularClass
|
||||||
|
dev.LinkSpeed = &linkSpeed
|
||||||
|
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||||
|
s := statusOK
|
||||||
|
dev.Status = &s
|
||||||
|
|
||||||
|
applyPCIeLinkSpeedWarning(&dev)
|
||||||
|
|
||||||
|
if dev.Status == nil || *dev.Status != statusWarning {
|
||||||
|
t.Fatalf("status=%v want Warning for regular card degradation", dev.Status)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -58,7 +58,6 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
|||||||
|
|
||||||
for _, chip := range chips {
|
for _, chip := range chips {
|
||||||
features := doc[chip]
|
features := doc[chip]
|
||||||
location := sensorLocation(chip)
|
|
||||||
|
|
||||||
keys := make([]string, 0, len(features))
|
keys := make([]string, 0, len(features))
|
||||||
for key := range features {
|
for key := range features {
|
||||||
@@ -80,25 +79,25 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
|||||||
}
|
}
|
||||||
switch classifySensorFeature(feature) {
|
switch classifySensorFeature(feature) {
|
||||||
case "fan":
|
case "fan":
|
||||||
item := buildFanSensor(name, location, feature)
|
item := buildFanSensor(name, feature)
|
||||||
if item == nil || duplicateSensor(seen, "fan", item.Name) {
|
if item == nil || duplicateSensor(seen, "fan", item.Name) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
result.Fans = append(result.Fans, *item)
|
result.Fans = append(result.Fans, *item)
|
||||||
case "temp":
|
case "temp":
|
||||||
item := buildTempSensor(name, location, feature)
|
item := buildTempSensor(name, feature)
|
||||||
if item == nil || duplicateSensor(seen, "temp", item.Name) {
|
if item == nil || duplicateSensor(seen, "temp", item.Name) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
result.Temperatures = append(result.Temperatures, *item)
|
result.Temperatures = append(result.Temperatures, *item)
|
||||||
case "power":
|
case "power":
|
||||||
item := buildPowerSensor(name, location, feature)
|
item := buildPowerSensor(name, feature)
|
||||||
if item == nil || duplicateSensor(seen, "power", item.Name) {
|
if item == nil || duplicateSensor(seen, "power", item.Name) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
result.Power = append(result.Power, *item)
|
result.Power = append(result.Power, *item)
|
||||||
default:
|
default:
|
||||||
item := buildOtherSensor(name, location, feature)
|
item := buildOtherSensor(name, feature)
|
||||||
if item == nil || duplicateSensor(seen, "other", item.Name) {
|
if item == nil || duplicateSensor(seen, "other", item.Name) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -128,14 +127,6 @@ func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func sensorLocation(chip string) *string {
|
|
||||||
chip = strings.TrimSpace(chip)
|
|
||||||
if chip == "" {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
return &chip
|
|
||||||
}
|
|
||||||
|
|
||||||
func classifySensorFeature(feature map[string]any) string {
|
func classifySensorFeature(feature map[string]any) string {
|
||||||
for key := range feature {
|
for key := range feature {
|
||||||
switch {
|
switch {
|
||||||
@@ -154,24 +145,24 @@ func classifySensorFeature(feature map[string]any) string {
|
|||||||
return "other"
|
return "other"
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
|
func buildFanSensor(name string, feature map[string]any) *schema.HardwareFanSensor {
|
||||||
rpm, ok := firstFeatureInt(feature, "_input")
|
rpm, ok := firstFeatureInt(feature, "_input")
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
item := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &rpm}
|
item := &schema.HardwareFanSensor{Name: name, RPM: &rpm}
|
||||||
if status := sensorStatusFromFeature(feature); status != nil {
|
if status := sensorStatusFromFeature(feature); status != nil {
|
||||||
item.Status = status
|
item.Status = status
|
||||||
}
|
}
|
||||||
return item
|
return item
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
func buildTempSensor(name string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||||
celsius, ok := firstFeatureFloat(feature, "_input")
|
celsius, ok := firstFeatureFloat(feature, "_input")
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
item := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &celsius}
|
item := &schema.HardwareTemperatureSensor{Name: name, Celsius: &celsius}
|
||||||
if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
|
if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
|
||||||
item.ThresholdWarningCelsius = &warning
|
item.ThresholdWarningCelsius = &warning
|
||||||
}
|
}
|
||||||
@@ -186,8 +177,8 @@ func buildTempSensor(name string, location *string, feature map[string]any) *sch
|
|||||||
return item
|
return item
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
|
func buildPowerSensor(name string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||||
item := &schema.HardwarePowerSensor{Name: name, Location: location}
|
item := &schema.HardwarePowerSensor{Name: name}
|
||||||
if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
|
if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
|
||||||
item.PowerW = &v
|
item.PowerW = &v
|
||||||
}
|
}
|
||||||
@@ -206,12 +197,12 @@ func buildPowerSensor(name string, location *string, feature map[string]any) *sc
|
|||||||
return item
|
return item
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
|
func buildOtherSensor(name string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||||
value, unit, ok := firstGenericSensorValue(feature)
|
value, unit, ok := firstGenericSensorValue(feature)
|
||||||
if !ok {
|
if !ok {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
item := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &value}
|
item := &schema.HardwareOtherSensor{Name: name, Value: &value}
|
||||||
if unit != "" {
|
if unit != "" {
|
||||||
item.Unit = &unit
|
item.Unit = &unit
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,6 +36,24 @@ func bestEffortRescanHotplugStorage() {
|
|||||||
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
|
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
|
||||||
} else {
|
} else {
|
||||||
for _, path := range hostPaths {
|
for _, path := range hostPaths {
|
||||||
|
// SAS HBAs (e.g. smartpqi) block indefinitely in sas_user_scan when
|
||||||
|
// written to — SAS topology is discovered by the driver itself.
|
||||||
|
// Detect via two methods: (1) sas_host class registration, and
|
||||||
|
// (2) driver proc_name — smartpqi uses scsi_transport_sas but does
|
||||||
|
// not register a sas_host object, so (1) alone misses it.
|
||||||
|
host := filepath.Base(filepath.Dir(path))
|
||||||
|
if _, err := os.Stat("/sys/class/sas_host/" + host); err == nil {
|
||||||
|
slog.Info("storage: scsi host scan skipped (SAS host)", "path", path)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if procName, err := os.ReadFile("/sys/class/scsi_host/" + host + "/proc_name"); err == nil {
|
||||||
|
switch strings.TrimSpace(string(procName)) {
|
||||||
|
case "smartpqi", "hpsa":
|
||||||
|
slog.Info("storage: scsi host scan skipped (SAS transport driver)",
|
||||||
|
"path", path, "driver", strings.TrimSpace(string(procName)))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
|
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
|
||||||
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
|
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
|
||||||
continue
|
continue
|
||||||
@@ -66,17 +84,41 @@ func collectStorage() []schema.HardwareStorage {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// jsonInt64 accepts both a bare JSON number and a JSON-quoted number string.
// lsblk -J emits LOG-SEC / PHY-SEC as integers on util-linux ≥ 2.37 (Debian 12)
// but older versions emit them as strings. This type handles both.
type jsonInt64 int64

// UnmarshalJSON implements json.Unmarshaler.
//
// Accepted inputs are a bare integer (512) or a quoted decimal integer
// ("512", surrounding whitespace inside the quotes is ignored). Anything else
// (null, booleans, objects, arrays, fractional numbers, non-numeric strings)
// decodes to zero. The method always returns nil, so a single odd field never
// fails decoding of the enclosing document.
func (j *jsonInt64) UnmarshalJSON(data []byte) error {
	// Reset first: without this, an unparseable value would leave whatever
	// the destination held before, while null would clear it.
	*j = 0

	// bare number: 512
	var n int64
	if err := json.Unmarshal(data, &n); err == nil {
		*j = jsonInt64(n)
		return nil
	}

	// quoted string: "512"
	var s string
	if err := json.Unmarshal(data, &s); err != nil {
		return nil // neither an integer nor a string: stay zero
	}
	if parsed, err := strconv.ParseInt(strings.TrimSpace(s), 10, 64); err == nil {
		*j = jsonInt64(parsed)
	}
	return nil
}
|
||||||
|
|
||||||
// lsblkDevice is a minimal lsblk JSON record.
|
// lsblkDevice is a minimal lsblk JSON record.
|
||||||
type lsblkDevice struct {
|
type lsblkDevice struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Type string `json:"type"`
|
Type string `json:"type"`
|
||||||
Size string `json:"size"`
|
Size string `json:"size"`
|
||||||
Serial string `json:"serial"`
|
Serial string `json:"serial"`
|
||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
Tran string `json:"tran"`
|
Tran string `json:"tran"`
|
||||||
Hctl string `json:"hctl"`
|
Hctl string `json:"hctl"`
|
||||||
LogSec string `json:"log-sec"`
|
LogSec jsonInt64 `json:"log-sec"`
|
||||||
PhySec string `json:"phy-sec"`
|
PhySec jsonInt64 `json:"phy-sec"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type lsblkRoot struct {
|
type lsblkRoot struct {
|
||||||
@@ -382,20 +424,23 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
||||||
|
// nvme-cli emits most counters as JSON strings (e.g. "power_on_hours":"49"),
|
||||||
|
// so all numeric fields use jsonInt64 which accepts both bare numbers and
|
||||||
|
// quoted strings. Field names match nvme-cli JSON output, not NVMe spec prose.
|
||||||
type nvmeSmartLog struct {
|
type nvmeSmartLog struct {
|
||||||
CriticalWarning int `json:"critical_warning"`
|
CriticalWarning jsonInt64 `json:"critical_warning"`
|
||||||
PercentageUsed int `json:"percentage_used"`
|
PercentageUsed jsonInt64 `json:"percent_used"`
|
||||||
AvailableSpare int `json:"available_spare"`
|
AvailableSpare jsonInt64 `json:"avail_spare"`
|
||||||
SpareThreshold int `json:"spare_thresh"`
|
SpareThreshold jsonInt64 `json:"spare_thresh"`
|
||||||
Temperature int64 `json:"temperature"`
|
Temperature jsonInt64 `json:"temperature"`
|
||||||
PowerOnHours int64 `json:"power_on_hours"`
|
PowerOnHours jsonInt64 `json:"power_on_hours"`
|
||||||
PowerCycles int64 `json:"power_cycles"`
|
PowerCycles jsonInt64 `json:"power_cycles"`
|
||||||
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
|
UnsafeShutdowns jsonInt64 `json:"unsafe_shutdowns"`
|
||||||
DataUnitsRead int64 `json:"data_units_read"`
|
DataUnitsRead jsonInt64 `json:"data_units_read"`
|
||||||
DataUnitsWritten int64 `json:"data_units_written"`
|
DataUnitsWritten jsonInt64 `json:"data_units_written"`
|
||||||
ControllerBusy int64 `json:"controller_busy_time"`
|
ControllerBusy jsonInt64 `json:"controller_busy_time"`
|
||||||
MediaErrors int64 `json:"media_errors"`
|
MediaErrors jsonInt64 `json:"media_errors"`
|
||||||
NumErrLogEntries int64 `json:"num_err_log_entries"`
|
NumErrLogEntries jsonInt64 `json:"num_err_log_entries"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
||||||
@@ -460,13 +505,16 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
var log nvmeSmartLog
|
var log nvmeSmartLog
|
||||||
if json.Unmarshal(out, &log) == nil {
|
if json.Unmarshal(out, &log) == nil {
|
||||||
if log.PowerOnHours > 0 {
|
if log.PowerOnHours > 0 {
|
||||||
s.PowerOnHours = &log.PowerOnHours
|
v := int64(log.PowerOnHours)
|
||||||
|
s.PowerOnHours = &v
|
||||||
}
|
}
|
||||||
if log.PowerCycles > 0 {
|
if log.PowerCycles > 0 {
|
||||||
s.PowerCycles = &log.PowerCycles
|
v := int64(log.PowerCycles)
|
||||||
|
s.PowerCycles = &v
|
||||||
}
|
}
|
||||||
if log.UnsafeShutdowns > 0 {
|
if log.UnsafeShutdowns > 0 {
|
||||||
s.UnsafeShutdowns = &log.UnsafeShutdowns
|
v := int64(log.UnsafeShutdowns)
|
||||||
|
s.UnsafeShutdowns = &v
|
||||||
}
|
}
|
||||||
if log.PercentageUsed > 0 {
|
if log.PercentageUsed > 0 {
|
||||||
v := float64(log.PercentageUsed)
|
v := float64(log.PercentageUsed)
|
||||||
@@ -475,11 +523,11 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
s.LifeRemainingPct = &remaining
|
s.LifeRemainingPct = &remaining
|
||||||
}
|
}
|
||||||
if log.DataUnitsWritten > 0 {
|
if log.DataUnitsWritten > 0 {
|
||||||
v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
|
v := nvmeDataUnitsToBytes(int64(log.DataUnitsWritten))
|
||||||
s.WrittenBytes = &v
|
s.WrittenBytes = &v
|
||||||
}
|
}
|
||||||
if log.DataUnitsRead > 0 {
|
if log.DataUnitsRead > 0 {
|
||||||
v := nvmeDataUnitsToBytes(log.DataUnitsRead)
|
v := nvmeDataUnitsToBytes(int64(log.DataUnitsRead))
|
||||||
s.ReadBytes = &v
|
s.ReadBytes = &v
|
||||||
}
|
}
|
||||||
if log.AvailableSpare > 0 {
|
if log.AvailableSpare > 0 {
|
||||||
@@ -487,23 +535,25 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
|||||||
s.AvailableSparePct = &v
|
s.AvailableSparePct = &v
|
||||||
}
|
}
|
||||||
if log.MediaErrors > 0 {
|
if log.MediaErrors > 0 {
|
||||||
s.MediaErrors = &log.MediaErrors
|
v := int64(log.MediaErrors)
|
||||||
|
s.MediaErrors = &v
|
||||||
}
|
}
|
||||||
if log.NumErrLogEntries > 0 {
|
if log.NumErrLogEntries > 0 {
|
||||||
s.ErrorLogEntries = &log.NumErrLogEntries
|
v := int64(log.NumErrLogEntries)
|
||||||
|
s.ErrorLogEntries = &v
|
||||||
}
|
}
|
||||||
if log.Temperature > 0 {
|
if log.Temperature > 0 {
|
||||||
v := float64(log.Temperature - 273)
|
v := float64(log.Temperature - 273)
|
||||||
s.TemperatureC = &v
|
s.TemperatureC = &v
|
||||||
}
|
}
|
||||||
setStorageHealthStatus(&s, storageHealthStatus{
|
setStorageHealthStatus(&s, storageHealthStatus{
|
||||||
criticalWarning: log.CriticalWarning,
|
criticalWarning: int(log.CriticalWarning),
|
||||||
percentageUsed: int64(log.PercentageUsed),
|
percentageUsed: int64(log.PercentageUsed),
|
||||||
availableSpare: int64(log.AvailableSpare),
|
availableSpare: int64(log.AvailableSpare),
|
||||||
spareThreshold: int64(log.SpareThreshold),
|
spareThreshold: int64(log.SpareThreshold),
|
||||||
unsafeShutdowns: log.UnsafeShutdowns,
|
unsafeShutdowns: int64(log.UnsafeShutdowns),
|
||||||
mediaErrors: log.MediaErrors,
|
mediaErrors: int64(log.MediaErrors),
|
||||||
errorLogEntries: log.NumErrLogEntries,
|
errorLogEntries: int64(log.NumErrLogEntries),
|
||||||
})
|
})
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
@@ -620,8 +670,8 @@ func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
|
|||||||
if s == nil {
|
if s == nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
logical := parseStorageBytes(dev.LogSec)
|
logical := int64(dev.LogSec)
|
||||||
physical := parseStorageBytes(dev.PhySec)
|
physical := int64(dev.PhySec)
|
||||||
if logical <= 0 && physical <= 0 {
|
if logical <= 0 && physical <= 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
@@ -38,6 +39,54 @@ func TestParseStorageBytes(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// util-linux ≥ 2.37 emits LOG-SEC / PHY-SEC as bare JSON numbers.
|
||||||
|
// Older versions emit quoted strings. Both must parse without error
|
||||||
|
// so that the entire lsblkDevices() call does not return nil on Debian 12.
|
||||||
|
cases := []struct {
|
||||||
|
json string
|
||||||
|
want int64
|
||||||
|
}{
|
||||||
|
{`512`, 512},
|
||||||
|
{`4096`, 4096},
|
||||||
|
{`"512"`, 512},
|
||||||
|
{`"4096"`, 4096},
|
||||||
|
{`null`, 0},
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
var v jsonInt64
|
||||||
|
if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
|
||||||
|
t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
|
||||||
|
}
|
||||||
|
if int64(v) != tc.want {
|
||||||
|
t.Fatalf("UnmarshalJSON(%s)=%d want %d", tc.json, int64(v), tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Simulate the exact JSON shape that triggered the bug on Debian 12.
|
||||||
|
input := []byte(`{
|
||||||
|
"blockdevices": [
|
||||||
|
{"name":"sda","type":"disk","size":"3.6T","serial":"S1234","model":"SEAGATE","tran":"sata","hctl":"0:0:0:0","log-sec":512,"phy-sec":4096},
|
||||||
|
{"name":"sdb","type":"disk","size":"3.6T","serial":"S5678","model":"SEAGATE","tran":"sata","hctl":"0:0:1:0","log-sec":512,"phy-sec":4096}
|
||||||
|
]
|
||||||
|
}`)
|
||||||
|
var root lsblkRoot
|
||||||
|
if err := json.Unmarshal(input, &root); err != nil {
|
||||||
|
t.Fatalf("lsblkRoot unmarshal with integer log-sec/phy-sec: %v", err)
|
||||||
|
}
|
||||||
|
if len(root.Blockdevices) != 2 {
|
||||||
|
t.Fatalf("got %d blockdevices want 2", len(root.Blockdevices))
|
||||||
|
}
|
||||||
|
if int64(root.Blockdevices[0].LogSec) != 512 {
|
||||||
|
t.Fatalf("LogSec=%d want 512", root.Blockdevices[0].LogSec)
|
||||||
|
}
|
||||||
|
if int64(root.Blockdevices[0].PhySec) != 4096 {
|
||||||
|
t.Fatalf("PhySec=%d want 4096", root.Blockdevices[0].PhySec)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
@@ -1,11 +1,65 @@
|
|||||||
package collector
|
package collector
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"encoding/json"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most
|
||||||
|
// counters are quoted strings and field names differ from NVMe spec prose)
|
||||||
|
// is correctly parsed into nvmeSmartLog.
|
||||||
|
func TestNVMeSmartLogUnmarshal(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
// Real nvme-cli output: counters are JSON strings, spare is "avail_spare",
|
||||||
|
// percentage used is "percent_used".
|
||||||
|
raw := `{
|
||||||
|
"critical_warning": 0,
|
||||||
|
"temperature": 310,
|
||||||
|
"avail_spare": 100,
|
||||||
|
"spare_thresh": 5,
|
||||||
|
"percent_used": 0,
|
||||||
|
"data_units_read": "10925415",
|
||||||
|
"data_units_written": "8497672",
|
||||||
|
"controller_busy_time": "305",
|
||||||
|
"power_cycles": "53",
|
||||||
|
"power_on_hours": "49",
|
||||||
|
"unsafe_shutdowns": "22",
|
||||||
|
"media_errors": "0",
|
||||||
|
"num_err_log_entries": "0"
|
||||||
|
}`
|
||||||
|
var log nvmeSmartLog
|
||||||
|
if err := json.Unmarshal([]byte(raw), &log); err != nil {
|
||||||
|
t.Fatalf("json.Unmarshal failed: %v", err)
|
||||||
|
}
|
||||||
|
if log.PowerOnHours != 49 {
|
||||||
|
t.Errorf("PowerOnHours=%d want 49", log.PowerOnHours)
|
||||||
|
}
|
||||||
|
if log.PowerCycles != 53 {
|
||||||
|
t.Errorf("PowerCycles=%d want 53", log.PowerCycles)
|
||||||
|
}
|
||||||
|
if log.AvailableSpare != 100 {
|
||||||
|
t.Errorf("AvailableSpare=%d want 100", log.AvailableSpare)
|
||||||
|
}
|
||||||
|
if log.SpareThreshold != 5 {
|
||||||
|
t.Errorf("SpareThreshold=%d want 5", log.SpareThreshold)
|
||||||
|
}
|
||||||
|
if log.PercentageUsed != 0 {
|
||||||
|
t.Errorf("PercentageUsed=%d want 0", log.PercentageUsed)
|
||||||
|
}
|
||||||
|
if log.Temperature != 310 {
|
||||||
|
t.Errorf("Temperature=%d want 310", log.Temperature)
|
||||||
|
}
|
||||||
|
if log.MediaErrors != 0 {
|
||||||
|
t.Errorf("MediaErrors=%d want 0", log.MediaErrors)
|
||||||
|
}
|
||||||
|
if log.UnsafeShutdowns != 22 {
|
||||||
|
t.Errorf("UnsafeShutdowns=%d want 22", log.UnsafeShutdowns)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestSetStorageHealthStatus(t *testing.T) {
|
func TestSetStorageHealthStatus(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
||||||
|
|||||||
27
audit/internal/collector/testdata/dmidecode_type0_dell.txt
vendored
Normal file
27
audit/internal/collector/testdata/dmidecode_type0_dell.txt
vendored
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
# dmidecode 3.2
|
||||||
|
Getting SMBIOS data from sysfs.
|
||||||
|
SMBIOS 3.1.0 present.
|
||||||
|
|
||||||
|
Handle 0x0000, DMI type 0, 26 bytes
|
||||||
|
BIOS Information
|
||||||
|
Vendor: Dell Inc.
|
||||||
|
Version: 2.5.4
|
||||||
|
Release Date: 01/13/2020
|
||||||
|
Address: 0xF0000
|
||||||
|
Runtime Size: 64 kB
|
||||||
|
ROM Size: 32 MB
|
||||||
|
Characteristics:
|
||||||
|
ISA is supported
|
||||||
|
PCI is supported
|
||||||
|
PNP is supported
|
||||||
|
BIOS is upgradeable
|
||||||
|
BIOS shadowing is allowed
|
||||||
|
Boot from CD is supported
|
||||||
|
Selectable boot is supported
|
||||||
|
EDD is supported
|
||||||
|
ACPI is supported
|
||||||
|
USB legacy is supported
|
||||||
|
BIOS boot specification is supported
|
||||||
|
Targeted content distribution is supported
|
||||||
|
UEFI is supported
|
||||||
|
BIOS Revision: 2.5
|
||||||
59
audit/internal/collector/testdata/dmidecode_type17_mixed.txt
vendored
Normal file
59
audit/internal/collector/testdata/dmidecode_type17_mixed.txt
vendored
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
# dmidecode 3.1
|
||||||
|
Getting SMBIOS data from sysfs.
|
||||||
|
SMBIOS 2.8 present.
|
||||||
|
|
||||||
|
Handle 0x0026, DMI type 17, 40 bytes
|
||||||
|
Memory Device
|
||||||
|
Array Handle: 0x0025
|
||||||
|
Error Information Handle: Not Provided
|
||||||
|
Total Width: 72 bits
|
||||||
|
Data Width: 64 bits
|
||||||
|
Size: 16 GB
|
||||||
|
Form Factor: DIMM
|
||||||
|
Set: None
|
||||||
|
Locator: P1-DIMMA1
|
||||||
|
Bank Locator: P0_Node0_Channel0_Dimm0
|
||||||
|
Type: DDR4
|
||||||
|
Type Detail: Synchronous
|
||||||
|
Speed: 2133 MT/s
|
||||||
|
Manufacturer: Micron
|
||||||
|
Serial Number: 1A2B3C4D
|
||||||
|
Asset Tag: Not Specified
|
||||||
|
Part Number: 36ASF2G72PZ-2G1A2
|
||||||
|
Rank: 2
|
||||||
|
Configured Memory Speed: 2133 MT/s
|
||||||
|
|
||||||
|
Handle 0x0027, DMI type 17, 40 bytes
|
||||||
|
Memory Device
|
||||||
|
Array Handle: 0x0025
|
||||||
|
Error Information Handle: Not Provided
|
||||||
|
Total Width: Unknown
|
||||||
|
Data Width: Unknown
|
||||||
|
Size: No Module Installed
|
||||||
|
Form Factor: DIMM
|
||||||
|
Set: None
|
||||||
|
Locator: P1-DIMMA2
|
||||||
|
Bank Locator: P0_Node0_Channel0_Dimm1
|
||||||
|
Type: DDR4
|
||||||
|
Type Detail: Synchronous
|
||||||
|
|
||||||
|
Handle 0x0028, DMI type 17, 84 bytes
|
||||||
|
Memory Device
|
||||||
|
Array Handle: 0x0025
|
||||||
|
Error Information Handle: Not Provided
|
||||||
|
Total Width: 72 bits
|
||||||
|
Data Width: 64 bits
|
||||||
|
Size: 32768 MB
|
||||||
|
Form Factor: DIMM
|
||||||
|
Set: 1
|
||||||
|
Locator: A1
|
||||||
|
Bank Locator: Not Specified
|
||||||
|
Type: DDR4
|
||||||
|
Type Detail: Synchronous Registered (Buffered)
|
||||||
|
Speed: 2933 MT/s
|
||||||
|
Manufacturer: Samsung
|
||||||
|
Serial Number: 5E6F7A8B
|
||||||
|
Asset Tag: Not Specified
|
||||||
|
Part Number: M393A4K40CB2-CVF
|
||||||
|
Rank: 2
|
||||||
|
Configured Memory Speed: 2400 MT/s
|
||||||
14
audit/internal/collector/testdata/dmidecode_type1_dell.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_dell.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# dmidecode 3.2
|
||||||
|
Getting SMBIOS data from sysfs.
|
||||||
|
SMBIOS 3.1.0 present.
|
||||||
|
|
||||||
|
Handle 0x0100, DMI type 1, 27 bytes
|
||||||
|
System Information
|
||||||
|
Manufacturer: Dell Inc.
|
||||||
|
Product Name: PowerEdge R740xd
|
||||||
|
Version: Not Specified
|
||||||
|
Serial Number: 7SG9F63
|
||||||
|
UUID: b1c2d3e4-f5a6-7890-bcde-f12345678901
|
||||||
|
Wake-up Type: Power Switch
|
||||||
|
SKU Number: SKU=NotProvided;ModelName=PowerEdge R740xd
|
||||||
|
Family: PowerEdge
|
||||||
14
audit/internal/collector/testdata/dmidecode_type1_hpe.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_hpe.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# dmidecode 3.3
|
||||||
|
Getting SMBIOS data from sysfs.
|
||||||
|
SMBIOS 3.1.0 present.
|
||||||
|
|
||||||
|
Handle 0x008E, DMI type 1, 27 bytes
|
||||||
|
System Information
|
||||||
|
Manufacturer: HPE
|
||||||
|
Product Name: ProLiant DL380 Gen10
|
||||||
|
Version: Not Specified
|
||||||
|
Serial Number: CZJ9320CXN
|
||||||
|
UUID: c2d3e4f5-a6b7-8901-cdef-012345678902
|
||||||
|
Wake-up Type: Power Switch
|
||||||
|
SKU Number: 868703-B21
|
||||||
|
Family: ProLiant
|
||||||
14
audit/internal/collector/testdata/dmidecode_type1_supermicro.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_supermicro.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
# dmidecode 3.1
|
||||||
|
Getting SMBIOS data from sysfs.
|
||||||
|
SMBIOS 2.8 present.
|
||||||
|
|
||||||
|
Handle 0x0001, DMI type 1, 27 bytes
|
||||||
|
System Information
|
||||||
|
Manufacturer: Supermicro
|
||||||
|
Product Name: SYS-6028R-WTR
|
||||||
|
Version: 0123456789
|
||||||
|
Serial Number: S214726X2A36789
|
||||||
|
UUID: d3e4f5a6-b7c8-9012-def0-123456789003
|
||||||
|
Wake-up Type: Power Switch
|
||||||
|
SKU Number: Default string
|
||||||
|
Family: Default string
|
||||||
10
audit/internal/collector/testdata/dmidecode_type2_dell.txt
vendored
Normal file
10
audit/internal/collector/testdata/dmidecode_type2_dell.txt
vendored
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# dmidecode 3.2
|
||||||
|
Getting SMBIOS data from sysfs.
|
||||||
|
SMBIOS 3.1.0 present.
|
||||||
|
|
||||||
|
Handle 0x0200, DMI type 2, 8 bytes
|
||||||
|
Base Board Information
|
||||||
|
Manufacturer: Dell Inc.
|
||||||
|
Product Name: 0F9N89
|
||||||
|
Version: A00
|
||||||
|
Serial Number: 7SG9F63
|
||||||
19
audit/internal/collector/testdata/dmidecode_type2_hpe.txt
vendored
Normal file
19
audit/internal/collector/testdata/dmidecode_type2_hpe.txt
vendored
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# dmidecode 3.3
|
||||||
|
Getting SMBIOS data from sysfs.
|
||||||
|
SMBIOS 3.1.0 present.
|
||||||
|
|
||||||
|
Handle 0x00A4, DMI type 2, 15 bytes
|
||||||
|
Base Board Information
|
||||||
|
Manufacturer: HPE
|
||||||
|
Product Name: ProLiant DL380 Gen10
|
||||||
|
Version: Not Specified
|
||||||
|
Serial Number: CZJ9320CXN
|
||||||
|
Asset Tag: CZJ9320CXN
|
||||||
|
Features:
|
||||||
|
Board is a hosting board
|
||||||
|
Board is removable
|
||||||
|
Board is replaceable
|
||||||
|
Location In Chassis: Not Specified
|
||||||
|
Chassis Handle: 0x0000
|
||||||
|
Type: Motherboard
|
||||||
|
Contained Object Handles: 0
|
||||||
18
audit/internal/collector/testdata/dmidecode_type2_supermicro.txt
vendored
Normal file
18
audit/internal/collector/testdata/dmidecode_type2_supermicro.txt
vendored
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# dmidecode 3.1
|
||||||
|
Getting SMBIOS data from sysfs.
|
||||||
|
SMBIOS 2.8 present.
|
||||||
|
|
||||||
|
Handle 0x0002, DMI type 2, 15 bytes
|
||||||
|
Base Board Information
|
||||||
|
Manufacturer: Supermicro
|
||||||
|
Product Name: X10DRW-i
|
||||||
|
Version: 1.02
|
||||||
|
Serial Number: S214726X2A36789
|
||||||
|
Asset Tag: Default string
|
||||||
|
Features:
|
||||||
|
Board is a hosting board
|
||||||
|
Board is replaceable
|
||||||
|
Location In Chassis: Default string
|
||||||
|
Chassis Handle: 0x0003
|
||||||
|
Type: Motherboard
|
||||||
|
Contained Object Handles: 0
|
||||||
@@ -38,6 +38,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
|||||||
Category: "gpu",
|
Category: "gpu",
|
||||||
Severity: "warning",
|
Severity: "warning",
|
||||||
},
|
},
|
||||||
|
// PCIe AER correctable from the NVIDIA driver — "bus correctable error" in SEL.
|
||||||
|
// Severity is warning (not critical): correctable errors are hardware-recovered.
|
||||||
|
{
|
||||||
|
Name: "nvidia-aer-correctable",
|
||||||
|
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER.*[Cc]orrect`),
|
||||||
|
Category: "gpu",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
Name: "nvidia-aer",
|
Name: "nvidia-aer",
|
||||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||||
@@ -54,6 +63,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
|||||||
},
|
},
|
||||||
|
|
||||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||||
|
// PCIe AER correctable from the root port — captures the reported device BDF
|
||||||
|
// (second BDF in "pcieport X: AER: Correctable error received: Y").
|
||||||
|
{
|
||||||
|
Name: "pcie-aer-correctable",
|
||||||
|
Re: mustPat(`(?i)pcieport.*AER:.*[Cc]orrect.*:\s*([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||||
|
Category: "pcie",
|
||||||
|
Severity: "warning",
|
||||||
|
BDFGroup: 1,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
Name: "pcie-aer",
|
Name: "pcie-aer",
|
||||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
@@ -18,7 +19,7 @@ type InstallDisk struct {
|
|||||||
MountedParts []string // partition mount points currently active
|
MountedParts []string // partition mount points currently active
|
||||||
}
|
}
|
||||||
|
|
||||||
const squashfsPath = "/run/live/medium/live/filesystem.squashfs"
|
const squashfsGlob = "/run/live/medium/live/*.squashfs"
|
||||||
|
|
||||||
// ListInstallDisks returns block devices suitable for installation.
|
// ListInstallDisks returns block devices suitable for installation.
|
||||||
// Excludes the current live boot medium but includes USB drives.
|
// Excludes the current live boot medium but includes USB drives.
|
||||||
@@ -176,11 +177,22 @@ func inferLiveBootKind(fsType, source, deviceType, transport string) string {
|
|||||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||||
// Returns 0 if the squashfs is not available (non-live environment).
|
// Returns 0 if the squashfs is not available (non-live environment).
|
||||||
func MinInstallBytes() int64 {
|
func MinInstallBytes() int64 {
|
||||||
fi, err := os.Stat(squashfsPath)
|
files, err := filepath.Glob(squashfsGlob)
|
||||||
if err != nil {
|
if err != nil || len(files) == 0 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
return fi.Size() * 3 / 2
|
var total int64
|
||||||
|
for _, path := range files {
|
||||||
|
fi, statErr := os.Stat(path)
|
||||||
|
if statErr != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
total += fi.Size()
|
||||||
|
}
|
||||||
|
if total == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return total * 3 / 2
|
||||||
}
|
}
|
||||||
|
|
||||||
// toramActive returns true when the live system was booted with toram.
|
// toramActive returns true when the live system was booted with toram.
|
||||||
@@ -222,12 +234,10 @@ func DiskWarnings(d InstallDisk) []string {
|
|||||||
humanBytes(min), humanBytes(d.SizeBytes)))
|
humanBytes(min), humanBytes(d.SizeBytes)))
|
||||||
}
|
}
|
||||||
if toramActive() {
|
if toramActive() {
|
||||||
sqFi, err := os.Stat(squashfsPath)
|
free := freeMemBytes()
|
||||||
if err == nil {
|
min := MinInstallBytes()
|
||||||
free := freeMemBytes()
|
if free > 0 && min > 0 && free < (min*4/3) {
|
||||||
if free > 0 && free < sqFi.Size()*2 {
|
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||||
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return w
|
return w
|
||||||
|
|||||||
@@ -14,6 +14,22 @@ import (
|
|||||||
const installToRAMDir = "/dev/shm/bee-live"
|
const installToRAMDir = "/dev/shm/bee-live"
|
||||||
const copyProgressLogStep int64 = 100 * 1024 * 1024
|
const copyProgressLogStep int64 = 100 * 1024 * 1024
|
||||||
|
|
||||||
|
// External-command and filesystem hooks used by the install-to-RAM flow.
// They are variables rather than functions, presumably so tests can swap in
// fakes (confirm against the test files).
var (
	// liveMediumSquashfsGlob lists the *.squashfs images under
	// /run/live/medium/live.
	liveMediumSquashfsGlob = func() ([]string, error) {
		return filepath.Glob("/run/live/medium/live/*.squashfs")
	}

	// runRemountMedium runs bee-remount-medium and returns its combined
	// stdout and stderr.
	runRemountMedium = func() ([]byte, error) {
		cmd := exec.Command("bee-remount-medium")
		return cmd.CombinedOutput()
	}

	// umountLiveMedium unmounts /run/live/medium.
	umountLiveMedium = func() error {
		cmd := exec.Command("umount", "/run/live/medium")
		return cmd.Run()
	}

	// ejectDevice runs eject on the given device.
	ejectDevice = func(device string) error {
		cmd := exec.Command("eject", device)
		return cmd.Run()
	}
)
|
||||||
|
|
||||||
func (s *System) IsLiveMediaInRAM() bool {
|
func (s *System) IsLiveMediaInRAM() bool {
|
||||||
return s.LiveMediaRAMState().InRAM
|
return s.LiveMediaRAMState().InRAM
|
||||||
}
|
}
|
||||||
@@ -140,8 +156,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
squashfsFiles, sourceAvailable := ensureLiveMediumAvailable(log)
|
||||||
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
|
||||||
|
|
||||||
dstDir := installToRAMDir
|
dstDir := installToRAMDir
|
||||||
|
|
||||||
@@ -171,7 +186,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
|||||||
}
|
}
|
||||||
goto bindMedium
|
goto bindMedium
|
||||||
}
|
}
|
||||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry (or run bee-remount-medium as root)", dstDir)
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -254,10 +269,83 @@ bindMedium:
|
|||||||
if status.InRAM {
|
if status.InRAM {
|
||||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||||
}
|
}
|
||||||
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
detachInstallMedium(status, log)
|
||||||
|
log("Done. Squashfs files are in RAM. Installation media has been detached when possible.")
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func tryRemountLiveMedium(log func(string)) error {
|
||||||
|
output, err := runRemountMedium()
|
||||||
|
trimmed := strings.TrimSpace(string(output))
|
||||||
|
if err != nil {
|
||||||
|
if trimmed != "" && log != nil {
|
||||||
|
for _, line := range strings.Split(trimmed, "\n") {
|
||||||
|
log("bee-remount-medium: " + line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if trimmed != "" && log != nil {
|
||||||
|
for _, line := range strings.Split(trimmed, "\n") {
|
||||||
|
log("bee-remount-medium: " + line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func ensureLiveMediumAvailable(log func(string)) ([]string, bool) {
|
||||||
|
squashfsFiles, err := liveMediumSquashfsGlob()
|
||||||
|
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||||
|
if sourceAvailable {
|
||||||
|
return squashfsFiles, true
|
||||||
|
}
|
||||||
|
|
||||||
|
if log != nil {
|
||||||
|
log("Live medium not mounted at /run/live/medium — attempting automatic remount scan...")
|
||||||
|
}
|
||||||
|
if remountErr := tryRemountLiveMedium(log); remountErr != nil {
|
||||||
|
if log != nil {
|
||||||
|
log(fmt.Sprintf("Automatic remount did not restore the live medium: %v", remountErr))
|
||||||
|
}
|
||||||
|
return squashfsFiles, false
|
||||||
|
}
|
||||||
|
|
||||||
|
squashfsFiles, err = liveMediumSquashfsGlob()
|
||||||
|
sourceAvailable = err == nil && len(squashfsFiles) > 0
|
||||||
|
if sourceAvailable && log != nil {
|
||||||
|
log("Live medium restored after remount scan.")
|
||||||
|
}
|
||||||
|
return squashfsFiles, sourceAvailable
|
||||||
|
}
|
||||||
|
|
||||||
|
func detachInstallMedium(status LiveBootSource, log func(string)) {
|
||||||
|
if log == nil {
|
||||||
|
log = func(string) {}
|
||||||
|
}
|
||||||
|
|
||||||
|
log("Detaching original installation medium...")
|
||||||
|
if err := umountLiveMedium(); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: could not unmount /run/live/medium: %v", err))
|
||||||
|
} else {
|
||||||
|
log("Unmounted /run/live/medium.")
|
||||||
|
}
|
||||||
|
|
||||||
|
device := strings.TrimSpace(status.Device)
|
||||||
|
if device == "" {
|
||||||
|
device = strings.TrimSpace(status.Source)
|
||||||
|
}
|
||||||
|
if device == "" || !strings.HasPrefix(device, "/dev/") {
|
||||||
|
log("No block device identified for eject; skipping media eject.")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := ejectDevice(device); err != nil {
|
||||||
|
log(fmt.Sprintf("Warning: could not eject %s: %v", device, err))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
log(fmt.Sprintf("Ejected %s.", device))
|
||||||
|
}
|
||||||
|
|
||||||
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||||
if status.InRAM {
|
if status.InRAM {
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
package platform
|
package platform
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
func TestInferLiveBootKind(t *testing.T) {
|
func TestInferLiveBootKind(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
@@ -124,3 +127,156 @@ func TestShouldLogCopyProgress(t *testing.T) {
|
|||||||
t.Fatal("expected final completion log")
|
t.Fatal("expected final completion log")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestTryRemountLiveMedium(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
orig := runRemountMedium
|
||||||
|
t.Cleanup(func() {
|
||||||
|
runRemountMedium = orig
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("success", func(t *testing.T) {
|
||||||
|
runRemountMedium = func() ([]byte, error) {
|
||||||
|
return []byte("[10:57:31] Mounted /dev/sr1 on /run/live/medium\n"), nil
|
||||||
|
}
|
||||||
|
var logs []string
|
||||||
|
if err := tryRemountLiveMedium(func(msg string) { logs = append(logs, msg) }); err != nil {
|
||||||
|
t.Fatalf("tryRemountLiveMedium() error = %v", err)
|
||||||
|
}
|
||||||
|
if len(logs) != 1 || logs[0] != "bee-remount-medium: [10:57:31] Mounted /dev/sr1 on /run/live/medium" {
|
||||||
|
t.Fatalf("logs=%v", logs)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("failure", func(t *testing.T) {
|
||||||
|
runRemountMedium = func() ([]byte, error) {
|
||||||
|
return []byte("must be run as root\n"), fmt.Errorf("exit status 1")
|
||||||
|
}
|
||||||
|
var logs []string
|
||||||
|
err := tryRemountLiveMedium(func(msg string) { logs = append(logs, msg) })
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("expected error")
|
||||||
|
}
|
||||||
|
if len(logs) != 1 || logs[0] != "bee-remount-medium: must be run as root" {
|
||||||
|
t.Fatalf("logs=%v", logs)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestEnsureLiveMediumAvailableRemountsSource(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
origGlob := liveMediumSquashfsGlob
|
||||||
|
origRemount := runRemountMedium
|
||||||
|
t.Cleanup(func() {
|
||||||
|
liveMediumSquashfsGlob = origGlob
|
||||||
|
runRemountMedium = origRemount
|
||||||
|
})
|
||||||
|
|
||||||
|
callCount := 0
|
||||||
|
liveMediumSquashfsGlob = func() ([]string, error) {
|
||||||
|
callCount++
|
||||||
|
if callCount == 1 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
return []string{"/run/live/medium/live/filesystem.squashfs"}, nil
|
||||||
|
}
|
||||||
|
runRemountMedium = func() ([]byte, error) {
|
||||||
|
return []byte("Mounted /dev/sr1 on /run/live/medium\n"), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var logs []string
|
||||||
|
files, ok := ensureLiveMediumAvailable(func(msg string) { logs = append(logs, msg) })
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("expected live medium to become available after remount")
|
||||||
|
}
|
||||||
|
if callCount < 2 {
|
||||||
|
t.Fatalf("liveMediumSquashfsGlob called %d times, want at least 2", callCount)
|
||||||
|
}
|
||||||
|
if len(files) != 1 || files[0] != "/run/live/medium/live/filesystem.squashfs" {
|
||||||
|
t.Fatalf("files=%v", files)
|
||||||
|
}
|
||||||
|
found := false
|
||||||
|
for _, msg := range logs {
|
||||||
|
if msg == "Live medium restored after remount scan." {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
t.Fatalf("expected remount success log, logs=%v", logs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDetachInstallMedium(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
origUmount := umountLiveMedium
|
||||||
|
origEject := ejectDevice
|
||||||
|
t.Cleanup(func() {
|
||||||
|
umountLiveMedium = origUmount
|
||||||
|
ejectDevice = origEject
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("success", func(t *testing.T) {
|
||||||
|
var umountCalled bool
|
||||||
|
var ejected string
|
||||||
|
umountLiveMedium = func() error {
|
||||||
|
umountCalled = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
ejectDevice = func(device string) error {
|
||||||
|
ejected = device
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var logs []string
|
||||||
|
detachInstallMedium(LiveBootSource{Kind: "cdrom", Device: "/dev/sr1"}, func(msg string) { logs = append(logs, msg) })
|
||||||
|
if !umountCalled {
|
||||||
|
t.Fatal("expected umountLiveMedium to be called")
|
||||||
|
}
|
||||||
|
if ejected != "/dev/sr1" {
|
||||||
|
t.Fatalf("ejected=%q want /dev/sr1", ejected)
|
||||||
|
}
|
||||||
|
if len(logs) < 3 {
|
||||||
|
t.Fatalf("logs=%v", logs)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("no device", func(t *testing.T) {
|
||||||
|
umountLiveMedium = func() error { return nil }
|
||||||
|
ejectDevice = func(device string) error {
|
||||||
|
t.Fatalf("unexpected eject for %q", device)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var logs []string
|
||||||
|
detachInstallMedium(LiveBootSource{Kind: "ram", Source: "tmpfs"}, func(msg string) { logs = append(logs, msg) })
|
||||||
|
found := false
|
||||||
|
for _, msg := range logs {
|
||||||
|
if msg == "No block device identified for eject; skipping media eject." {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
t.Fatalf("logs=%v", logs)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("eject failure is warning only", func(t *testing.T) {
|
||||||
|
umountLiveMedium = func() error { return nil }
|
||||||
|
ejectDevice = func(device string) error { return fmt.Errorf("exit status 1") }
|
||||||
|
var logs []string
|
||||||
|
detachInstallMedium(LiveBootSource{Kind: "usb", Device: "/dev/sdb1"}, func(msg string) { logs = append(logs, msg) })
|
||||||
|
found := false
|
||||||
|
for _, msg := range logs {
|
||||||
|
if msg == "Warning: could not eject /dev/sdb1: exit status 1" {
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
t.Fatalf("logs=%v", logs)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|||||||
@@ -258,7 +258,7 @@ func (s *System) GetInterfaceState(iface string) (bool, error) {
|
|||||||
func interfaceAdminState(iface string) (bool, error) {
|
func interfaceAdminState(iface string) (bool, error) {
|
||||||
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, err
|
return false, fmt.Errorf("ip link show dev %s: %w", iface, err)
|
||||||
}
|
}
|
||||||
return parseInterfaceAdminState(string(raw))
|
return parseInterfaceAdminState(string(raw))
|
||||||
}
|
}
|
||||||
@@ -288,7 +288,7 @@ func interfaceIPv4Addrs(iface string) ([]string, error) {
|
|||||||
if errors.As(err, &exitErr) {
|
if errors.As(err, &exitErr) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
return nil, err
|
return nil, fmt.Errorf("ip addr show dev %s: %w", iface, err)
|
||||||
}
|
}
|
||||||
var ipv4 []string
|
var ipv4 []string
|
||||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||||
|
|||||||
@@ -55,7 +55,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
if err == nil {
|
if err == nil {
|
||||||
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
||||||
hasIPv4 := false
|
hasIPv4 := false
|
||||||
missingIPv4 := false
|
|
||||||
for _, iface := range interfaces {
|
for _, iface := range interfaces {
|
||||||
outcome := "no_offer"
|
outcome := "no_offer"
|
||||||
if len(iface.IPv4) > 0 {
|
if len(iface.IPv4) > 0 {
|
||||||
@@ -63,8 +62,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
hasIPv4 = true
|
hasIPv4 = true
|
||||||
} else if strings.EqualFold(iface.State, "DOWN") {
|
} else if strings.EqualFold(iface.State, "DOWN") {
|
||||||
outcome = "link_down"
|
outcome = "link_down"
|
||||||
} else {
|
|
||||||
missingIPv4 = true
|
|
||||||
}
|
}
|
||||||
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
||||||
Name: iface.Name,
|
Name: iface.Name,
|
||||||
@@ -73,17 +70,9 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
|||||||
Outcome: outcome,
|
Outcome: outcome,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
switch {
|
if hasIPv4 {
|
||||||
case hasIPv4 && !missingIPv4:
|
|
||||||
health.NetworkStatus = "OK"
|
health.NetworkStatus = "OK"
|
||||||
case hasIPv4:
|
} else {
|
||||||
health.NetworkStatus = "PARTIAL"
|
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
|
||||||
Code: "dhcp_partial",
|
|
||||||
Severity: "warning",
|
|
||||||
Description: "At least one interface did not obtain IPv4 connectivity.",
|
|
||||||
})
|
|
||||||
default:
|
|
||||||
health.NetworkStatus = "FAILED"
|
health.NetworkStatus = "FAILED"
|
||||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||||
Code: "dhcp_failed",
|
Code: "dhcp_failed",
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
// core/internal/ingest/parser_hardware.go. No import dependency on core.
|
// core/internal/ingest/parser_hardware.go. No import dependency on core.
|
||||||
package schema
|
package schema
|
||||||
|
|
||||||
|
import "encoding/json"
|
||||||
|
|
||||||
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
||||||
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
||||||
type HardwareIngestRequest struct {
|
type HardwareIngestRequest struct {
|
||||||
@@ -64,9 +66,10 @@ type HardwareSnapshot struct {
|
|||||||
Storage []HardwareStorage `json:"storage,omitempty"`
|
Storage []HardwareStorage `json:"storage,omitempty"`
|
||||||
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
||||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
PlatformConfig *json.RawMessage `json:"platform_config,omitempty"`
|
||||||
|
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareHealthSummary struct {
|
type HardwareHealthSummary struct {
|
||||||
@@ -123,7 +126,7 @@ type HardwareCPU struct {
|
|||||||
type HardwareMemory struct {
|
type HardwareMemory struct {
|
||||||
HardwareComponentStatus
|
HardwareComponentStatus
|
||||||
Slot *string `json:"slot,omitempty"`
|
Slot *string `json:"slot,omitempty"`
|
||||||
Location *string `json:"location,omitempty"`
|
Location *string `json:"-"` // internal: used for DIMM telemetry matching only
|
||||||
Present *bool `json:"present,omitempty"`
|
Present *bool `json:"present,omitempty"`
|
||||||
SizeMB *int `json:"size_mb,omitempty"`
|
SizeMB *int `json:"size_mb,omitempty"`
|
||||||
Type *string `json:"type,omitempty"`
|
Type *string `json:"type,omitempty"`
|
||||||
@@ -261,15 +264,13 @@ type HardwareSensors struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type HardwareFanSensor struct {
|
type HardwareFanSensor struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Location *string `json:"location,omitempty"`
|
RPM *int `json:"rpm,omitempty"`
|
||||||
RPM *int `json:"rpm,omitempty"`
|
Status *string `json:"status,omitempty"`
|
||||||
Status *string `json:"status,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwarePowerSensor struct {
|
type HardwarePowerSensor struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Location *string `json:"location,omitempty"`
|
|
||||||
VoltageV *float64 `json:"voltage_v,omitempty"`
|
VoltageV *float64 `json:"voltage_v,omitempty"`
|
||||||
CurrentA *float64 `json:"current_a,omitempty"`
|
CurrentA *float64 `json:"current_a,omitempty"`
|
||||||
PowerW *float64 `json:"power_w,omitempty"`
|
PowerW *float64 `json:"power_w,omitempty"`
|
||||||
@@ -278,7 +279,6 @@ type HardwarePowerSensor struct {
|
|||||||
|
|
||||||
type HardwareTemperatureSensor struct {
|
type HardwareTemperatureSensor struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Location *string `json:"location,omitempty"`
|
|
||||||
Celsius *float64 `json:"celsius,omitempty"`
|
Celsius *float64 `json:"celsius,omitempty"`
|
||||||
ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"`
|
ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"`
|
||||||
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
|
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
|
||||||
@@ -286,11 +286,10 @@ type HardwareTemperatureSensor struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type HardwareOtherSensor struct {
|
type HardwareOtherSensor struct {
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Location *string `json:"location,omitempty"`
|
Value *float64 `json:"value,omitempty"`
|
||||||
Value *float64 `json:"value,omitempty"`
|
Unit *string `json:"unit,omitempty"`
|
||||||
Unit *string `json:"unit,omitempty"`
|
Status *string `json:"status,omitempty"`
|
||||||
Status *string `json:"status,omitempty"`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
type HardwareEventLog struct {
|
type HardwareEventLog struct {
|
||||||
|
|||||||
@@ -125,6 +125,8 @@ func defaultTaskPriority(target string, params taskParams) int {
|
|||||||
return taskPriorityInstall
|
return taskPriorityInstall
|
||||||
case "install-to-ram":
|
case "install-to-ram":
|
||||||
return taskPriorityInstallToRAM
|
return taskPriorityInstallToRAM
|
||||||
|
case "nvme-format":
|
||||||
|
return taskPriorityInstall
|
||||||
case "audit":
|
case "audit":
|
||||||
return taskPriorityAudit
|
return taskPriorityAudit
|
||||||
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
||||||
@@ -1295,7 +1297,7 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
|||||||
var standardTools = []string{
|
var standardTools = []string{
|
||||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||||
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
||||||
"mstflint",
|
"mstflint", "saa",
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||||
@@ -1677,6 +1679,56 @@ func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Reque
|
|||||||
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── Hardware summary / component detail ──────────────────────────────────────
|
||||||
|
|
||||||
|
// handleAPIHardwareSummary returns the hardware summary card HTML fragment for
|
||||||
|
// htmx polling (hx-get="/api/hardware-summary" hx-swap="outerHTML").
|
||||||
|
func (h *handler) handleAPIHardwareSummary(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
fmt.Fprint(w, renderHardwareSummaryCard(h.opts))
|
||||||
|
}
|
||||||
|
|
||||||
|
// handleAPIComponentDetail returns an HTML fragment describing the current and
|
||||||
|
// historical status for one component type (cpu, memory, storage, gpu, psu).
|
||||||
|
func (h *handler) handleAPIComponentDetail(w http.ResponseWriter, r *http.Request) {
|
||||||
|
compType := r.PathValue("type")
|
||||||
|
var exact, prefixes []string
|
||||||
|
var title string
|
||||||
|
switch compType {
|
||||||
|
case "cpu":
|
||||||
|
title = "CPU"
|
||||||
|
exact = []string{"cpu:all"}
|
||||||
|
case "memory":
|
||||||
|
title = "Memory"
|
||||||
|
exact = []string{"memory:all"}
|
||||||
|
prefixes = []string{"memory:"}
|
||||||
|
case "storage":
|
||||||
|
title = "Storage"
|
||||||
|
exact = []string{"storage:all"}
|
||||||
|
prefixes = []string{"storage:"}
|
||||||
|
case "gpu":
|
||||||
|
title = "GPU"
|
||||||
|
prefixes = []string{"pcie:gpu:"}
|
||||||
|
case "psu":
|
||||||
|
title = "PSU"
|
||||||
|
prefixes = []string{"psu:"}
|
||||||
|
default:
|
||||||
|
http.NotFound(w, r)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
var records []app.ComponentStatusRecord
|
||||||
|
if h.opts.App != nil && h.opts.App.StatusDB != nil {
|
||||||
|
all := h.opts.App.StatusDB.All()
|
||||||
|
records = matchedRecords(all, exact, prefixes)
|
||||||
|
}
|
||||||
|
|
||||||
|
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||||
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
fmt.Fprint(w, renderComponentDetail(title, records))
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) rollbackPendingNetworkChange() error {
|
func (h *handler) rollbackPendingNetworkChange() error {
|
||||||
h.pendingNetMu.Lock()
|
h.pendingNetMu.Lock()
|
||||||
pnc := h.pendingNet
|
pnc := h.pendingNet
|
||||||
|
|||||||
@@ -85,6 +85,27 @@ func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestParseNVMeFormatModes(t *testing.T) {
|
||||||
|
raw := `
|
||||||
|
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||||
|
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||||
|
lbaf 2 : ms:0 lbads:12 rp:0
|
||||||
|
`
|
||||||
|
modes := parseNVMeFormatModes(raw)
|
||||||
|
if len(modes) != 3 {
|
||||||
|
t.Fatalf("modes=%#v want 3 modes", modes)
|
||||||
|
}
|
||||||
|
if modes[0].Mode != 0 || modes[0].DataBytes != 512 || modes[0].MetadataBytes != 0 || !modes[0].InUse {
|
||||||
|
t.Fatalf("mode 0=%#v", modes[0])
|
||||||
|
}
|
||||||
|
if modes[1].Label != "MODE 1 (512+8)" {
|
||||||
|
t.Fatalf("mode 1 label=%q", modes[1].Label)
|
||||||
|
}
|
||||||
|
if modes[2].DataBytes != 4096 || modes[2].MetadataBytes != 0 {
|
||||||
|
t.Fatalf("mode 2=%#v", modes[2])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||||
globalQueue.mu.Lock()
|
globalQueue.mu.Lock()
|
||||||
originalTasks := globalQueue.tasks
|
originalTasks := globalQueue.tasks
|
||||||
|
|||||||
76
audit/internal/webui/health_poller.go
Normal file
76
audit/internal/webui/health_poller.go
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"log/slog"
|
||||||
|
"os/exec"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/collector"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// healthPollInterval is the period of the ticker that drives health polls.
	healthPollInterval = 60 * time.Second

	// psuIPMITimeout bounds a single `ipmitool sdr` invocation made by the
	// PSU poll.
	psuIPMITimeout = 15 * time.Second
)
|
||||||
|
|
||||||
|
// healthPoller runs periodic health checks for hardware components that do not
|
||||||
|
// emit kernel log events (e.g. PSU). Results are written to ComponentStatusDB.
|
||||||
|
type healthPoller struct {
|
||||||
|
statusDB *app.ComponentStatusDB
|
||||||
|
}
|
||||||
|
|
||||||
|
func newHealthPoller(statusDB *app.ComponentStatusDB) *healthPoller {
|
||||||
|
return &healthPoller{statusDB: statusDB}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *healthPoller) start() {
|
||||||
|
goRecoverLoop("health poller", 5*time.Second, p.run)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *healthPoller) run() {
|
||||||
|
ticker := time.NewTicker(healthPollInterval)
|
||||||
|
defer ticker.Stop()
|
||||||
|
for range ticker.C {
|
||||||
|
p.pollPSU()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *healthPoller) pollPSU() {
|
||||||
|
if p.statusDB == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), psuIPMITimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
cmd := exec.CommandContext(ctx, "ipmitool", "sdr")
|
||||||
|
var out bytes.Buffer
|
||||||
|
cmd.Stdout = &out
|
||||||
|
if err := cmd.Run(); err != nil {
|
||||||
|
// IPMI not available or not a server — skip silently.
|
||||||
|
slog.Debug("health poller: ipmitool sdr unavailable", "err", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
slots := collector.PSUSlotsFromSDR(out.String())
|
||||||
|
if len(slots) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
const source = "watchdog:psu"
|
||||||
|
for slot, psu := range slots {
|
||||||
|
key := "psu:" + slot
|
||||||
|
status := psu.Status
|
||||||
|
if status == "" {
|
||||||
|
status = "Unknown"
|
||||||
|
}
|
||||||
|
detail := ""
|
||||||
|
switch status {
|
||||||
|
case "Critical":
|
||||||
|
detail = "PSU sensor reported non-OK state"
|
||||||
|
case "Warning":
|
||||||
|
detail = "PSU sensor in warning state"
|
||||||
|
}
|
||||||
|
p.statusDB.Record(key, source, status, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
293
audit/internal/webui/ipmi_fru.go
Normal file
293
audit/internal/webui/ipmi_fru.go
Normal file
@@ -0,0 +1,293 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
"unicode"
|
||||||
|
)
|
||||||
|
|
||||||
|
// fruField is one "name : value" line of `ipmitool fru print` output as sent
// to the web UI. Area and Index are only meaningful when Editable is true.
type fruField struct {
	Name     string `json:"name"`
	Value    string `json:"value"`
	Editable bool   `json:"editable"`
	Area     string `json:"area,omitempty"`
	// Index is always emitted: 0 is a valid field index, and omitempty would
	// drop it from the JSON that the page script reads as f.index.
	Index int `json:"index"`
}

// fruChange is one requested FRU field edit: the area letter and field index
// passed to `ipmitool fru edit`, the display name, and the new value.
type fruChange struct {
	Area  string `json:"area"`
	Index int    `json:"index"`
	Name  string `json:"name"`
	Value string `json:"value"`
}

// fruEditableFields maps display name → area + index for ipmitool fru edit.
var fruEditableFields = map[string]struct {
	Area  string
	Index int
}{
	"Chassis Part Number":   {"c", 0},
	"Chassis Serial Number": {"c", 1},
	"Chassis Extra":         {"c", 2},
	"Board Manufacturer":    {"b", 0},
	"Board Product Name":    {"b", 1},
	"Board Serial Number":   {"b", 2},
	"Board Part Number":     {"b", 3},
	"Product Manufacturer":  {"p", 0},
	"Product Name":          {"p", 1},
	"Product Part Number":   {"p", 2},
	"Product Version":       {"p", 3},
	"Product Serial Number": {"p", 4},
}

// parseFRUOutput turns `ipmitool fru print` text into a list of fields.
//
// Each line is expected to look like " Field Name : value". The line is split
// at the first " : "; if that separator is absent, the first ": " is used
// instead. Lines with neither separator, blank lines, and lines with an empty
// name are skipped. Names and values are whitespace-trimmed, and each field is
// annotated with its edit metadata from fruFieldMeta.
func parseFRUOutput(output string) []fruField {
	var fields []fruField
	for _, line := range strings.Split(output, "\n") {
		trimmed := strings.TrimLeft(line, " \t")
		if trimmed == "" {
			continue
		}

		name, value, ok := strings.Cut(trimmed, " : ")
		if !ok {
			// Fall back to ": " with no space before the colon.
			name, value, ok = strings.Cut(trimmed, ": ")
			if !ok {
				continue
			}
		}

		name = strings.TrimSpace(name)
		if name == "" {
			continue
		}
		editable, area, idx := fruFieldMeta(name)
		fields = append(fields, fruField{
			Name:     name,
			Value:    strings.TrimSpace(value),
			Editable: editable,
			Area:     area,
			Index:    idx,
		})
	}
	return fields
}

// fruFieldMeta reports whether the named field is listed in fruEditableFields
// and, if so, its area letter and index. Unknown names yield (false, "", 0).
func fruFieldMeta(name string) (editable bool, area string, index int) {
	if e, ok := fruEditableFields[name]; ok {
		return true, e.Area, e.Index
	}
	return false, "", 0
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIIPMIFRURead(w http.ResponseWriter, r *http.Request) {
|
||||||
|
ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
out, err := exec.CommandContext(ctx, "ipmitool", "fru", "print", "0").CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
msg := strings.TrimSpace(string(out))
|
||||||
|
if msg == "" {
|
||||||
|
msg = err.Error()
|
||||||
|
}
|
||||||
|
writeError(w, http.StatusInternalServerError, "ipmitool fru print: "+msg)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
fields := parseFRUOutput(string(out))
|
||||||
|
writeJSON(w, fields)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPIIPMIFRUWrite(w http.ResponseWriter, r *http.Request) {
|
||||||
|
var req struct {
|
||||||
|
Changes []fruChange `json:"changes"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(req.Changes) == 0 {
|
||||||
|
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
validAreas := map[string]bool{"c": true, "b": true, "p": true}
|
||||||
|
for _, c := range req.Changes {
|
||||||
|
if !validAreas[c.Area] {
|
||||||
|
writeError(w, http.StatusUnprocessableEntity, "invalid area: "+c.Area)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if c.Index < 0 || c.Index > 9 {
|
||||||
|
writeError(w, http.StatusUnprocessableEntity, fmt.Sprintf("invalid index %d", c.Index))
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(c.Value) > 64 {
|
||||||
|
writeError(w, http.StatusUnprocessableEntity, "value too long (max 64 chars)")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, ch := range c.Value {
|
||||||
|
if ch > unicode.MaxASCII || (ch < 0x20 && ch != 0) {
|
||||||
|
writeError(w, http.StatusUnprocessableEntity, "value contains non-printable characters")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID("ipmi-fru-write"),
|
||||||
|
Name: fmt.Sprintf("IPMI FRU Write (%d field(s))", len(req.Changes)),
|
||||||
|
Target: "ipmi-fru-write",
|
||||||
|
Priority: defaultTaskPriority("ipmi-fru-write", taskParams{}),
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{FRUChanges: req.Changes},
|
||||||
|
}
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||||
|
}
|
||||||
|
|
||||||
|
func runIPMIFRUWriteTask(ctx context.Context, j *jobState, exportDir string, p taskParams) error {
|
||||||
|
// Backup current FRU state
|
||||||
|
backupDir := filepath.Join(exportDir, "fru-backups")
|
||||||
|
if err := os.MkdirAll(backupDir, 0755); err != nil {
|
||||||
|
return fmt.Errorf("mkdir fru-backups: %w", err)
|
||||||
|
}
|
||||||
|
stamp := time.Now().Format("20060102150405")
|
||||||
|
backupPath := filepath.Join(backupDir, "fru-"+stamp+".txt")
|
||||||
|
|
||||||
|
backupOut, err := exec.CommandContext(ctx, "ipmitool", "fru", "print", "0").CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("backup fru print: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(backupPath, backupOut, 0644); err != nil {
|
||||||
|
return fmt.Errorf("write backup: %w", err)
|
||||||
|
}
|
||||||
|
j.append("Backup saved to " + backupPath)
|
||||||
|
|
||||||
|
// Apply changes
|
||||||
|
for _, c := range p.FRUChanges {
|
||||||
|
j.append(fmt.Sprintf("Setting %s (%s %d) = %q", c.Name, c.Area, c.Index, c.Value))
|
||||||
|
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "edit", "0", "field", c.Area, fmt.Sprintf("%d", c.Index), c.Value)
|
||||||
|
if err := streamCmdJob(j, cmd); err != nil {
|
||||||
|
return fmt.Errorf("fru edit %s %d: %w", c.Area, c.Index, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderIPMIFRUCard returns the HTML and inline JavaScript for the
// "IPMI — FRU" card.
//
// The script loads fields from GET /api/tools/ipmi-fru, renders editable
// fields as text inputs, posts modified fields to /api/tools/ipmi-fru/write
// and then polls /api/tasks every 1.5 s until the queued task ends.
//
// FRU values are inserted through innerHTML and attribute values, so escHtml
// must emit real HTML entities (&amp; &lt; &gt; &quot;); replacing a character
// with itself leaves the markup injectable. The poll loop also stops on the
// "cancelled" status and ignores transient fetch errors, so the Save button
// cannot stay disabled forever.
func renderIPMIFRUCard() string {
	return `<div class="card"><div class="card-head card-head-actions">IPMI — FRU<div class="card-head-buttons"><button class="btn btn-sm btn-secondary" onclick="fruRead()">Read</button></div></div><div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits FRU fields via ipmitool (In-Band, device 0). Works on any server with IPMI support.</p>
<div id="fru-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
<div id="fru-table"></div>
<div id="fru-save-row" style="display:none;margin-top:12px">
<button class="btn btn-primary" id="fru-save-btn" onclick="fruSave()">Save</button>
<span id="fru-save-msg" style="font-size:13px;color:var(--muted);margin-left:10px"></span>
</div>
</div></div>
<script>
var fruOriginal = {};
function fruRead() {
  document.getElementById('fru-status').textContent = 'Reading...';
  document.getElementById('fru-status').style.color = 'var(--muted)';
  document.getElementById('fru-table').innerHTML = '';
  document.getElementById('fru-save-row').style.display = 'none';
  fetch('/api/tools/ipmi-fru', {cache:'no-store'})
    .then(function(r) {
      if (!r.ok) return r.json().then(function(e) { throw new Error(e.error || r.statusText); });
      return r.json();
    })
    .then(function(fields) {
      fruOriginal = {};
      if (!fields || !fields.length) {
        document.getElementById('fru-status').textContent = 'No FRU fields returned.';
        return;
      }
      document.getElementById('fru-status').textContent = '';
      var rows = fields.map(function(f) {
        var val = f.value || '';
        if (f.editable) {
          fruOriginal[f.area + '_' + f.index] = val;
          return '<tr><td style="color:var(--muted);white-space:nowrap;padding-right:16px">' + escHtml(f.name) + '</td>'
            + '<td><input class="fru-input" style="width:100%;padding:4px 6px;border:1px solid var(--border);border-radius:3px;font-size:13px;font-family:inherit;background:var(--surface);color:var(--ink)"'
            + ' data-area="' + escHtml(f.area) + '" data-index="' + f.index + '" data-name="' + escHtml(f.name) + '"'
            + ' data-original="' + escHtml(val) + '" value="' + escHtml(val) + '" oninput="fruDirtyCheck()"></td></tr>';
        }
        return '<tr><td style="color:var(--muted);white-space:nowrap;padding-right:16px">' + escHtml(f.name) + '</td>'
          + '<td style="color:var(--ink)">' + escHtml(val || '—') + '</td></tr>';
      }).join('');
      document.getElementById('fru-table').innerHTML = '<table style="width:100%">' + rows + '</table>';
      fruDirtyCheck();
    })
    .catch(function(e) {
      document.getElementById('fru-status').textContent = 'Error: ' + e.message;
      document.getElementById('fru-status').style.color = 'var(--crit-fg)';
    });
}
function escHtml(s) {
  return String(s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
}
function fruDirtyCheck() {
  var inputs = document.querySelectorAll('.fru-input');
  var changed = 0;
  inputs.forEach(function(el) { if (el.value !== el.dataset.original) changed++; });
  var row = document.getElementById('fru-save-row');
  var btn = document.getElementById('fru-save-btn');
  if (changed > 0) {
    row.style.display = '';
    btn.textContent = 'Save (' + changed + ' changed)';
  } else {
    row.style.display = 'none';
  }
}
function fruSave() {
  var inputs = document.querySelectorAll('.fru-input');
  var changes = [];
  inputs.forEach(function(el) {
    if (el.value !== el.dataset.original) {
      changes.push({area: el.dataset.area, index: parseInt(el.dataset.index, 10), name: el.dataset.name, value: el.value});
    }
  });
  if (!changes.length) return;
  document.getElementById('fru-save-btn').disabled = true;
  document.getElementById('fru-save-msg').textContent = 'Saving...';
  fetch('/api/tools/ipmi-fru/write', {method:'POST', headers:{'Content-Type':'application/json'}, body: JSON.stringify({changes: changes})})
    .then(function(r) {
      if (!r.ok) return r.json().then(function(e) { throw new Error(e.error || r.statusText); });
      return r.json();
    })
    .then(function(d) {
      var taskId = d.task_id;
      document.getElementById('fru-save-msg').textContent = 'Task ' + taskId + ' queued…';
      var poll = setInterval(function() {
        fetch('/api/tasks', {cache:'no-store'}).then(function(r) { return r.json(); }).then(function(tasks) {
          var t = Array.isArray(tasks) ? tasks.find(function(x) { return x.id === taskId; }) : null;
          if (!t) return;
          if (t.status === 'done') {
            clearInterval(poll);
            document.getElementById('fru-save-msg').textContent = 'Done — backup saved to fru-backups/.';
            document.getElementById('fru-save-btn').disabled = false;
            inputs.forEach(function(el) { el.dataset.original = el.value; });
            fruDirtyCheck();
          } else if (t.status === 'failed' || t.status === 'cancelled') {
            clearInterval(poll);
            document.getElementById('fru-save-msg').textContent = t.status === 'cancelled' ? 'Cancelled.' : 'Failed: ' + (t.error || 'unknown error');
            document.getElementById('fru-save-btn').disabled = false;
          }
        }).catch(function(){});
      }, 1500);
    })
    .catch(function(e) {
      document.getElementById('fru-save-msg').textContent = 'Error: ' + e.message;
      document.getElementById('fru-save-btn').disabled = false;
    });
}
</script>`
}
|
||||||
@@ -91,6 +91,7 @@ func (j *jobState) writeLogLineLocked(line string) {
|
|||||||
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||||
}
|
}
|
||||||
_, _ = j.logBuf.WriteString(line + "\n")
|
_, _ = j.logBuf.WriteString(line + "\n")
|
||||||
|
_ = j.logBuf.Flush()
|
||||||
}
|
}
|
||||||
|
|
||||||
// closeLog flushes and closes the log file. Called after all task output is done.
|
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||||
|
|||||||
@@ -73,6 +73,9 @@ func (w *kmsgWatcher) run() {
|
|||||||
w.mu.Lock()
|
w.mu.Lock()
|
||||||
if w.window != nil {
|
if w.window != nil {
|
||||||
w.recordEvent(evt)
|
w.recordEvent(evt)
|
||||||
|
} else {
|
||||||
|
evtCopy := evt
|
||||||
|
goRecoverOnce("kmsg flush immediate", func() { w.flushImmediate(evtCopy) })
|
||||||
}
|
}
|
||||||
w.mu.Unlock()
|
w.mu.Unlock()
|
||||||
}
|
}
|
||||||
@@ -162,7 +165,9 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
|||||||
for _, id := range evt.ids {
|
for _, id := range evt.ids {
|
||||||
var key string
|
var key string
|
||||||
switch evt.category {
|
switch evt.category {
|
||||||
case "gpu", "pcie":
|
case "gpu":
|
||||||
|
key = "pcie:gpu:" + normalizeBDF(id)
|
||||||
|
case "pcie":
|
||||||
key = "pcie:" + normalizeBDF(id)
|
key = "pcie:" + normalizeBDF(id)
|
||||||
case "storage":
|
case "storage":
|
||||||
key = "storage:" + id
|
key = "storage:" + id
|
||||||
@@ -180,6 +185,54 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// flushImmediate writes a single kmsg event directly to the status DB without a SAT window.
|
||||||
|
// Called when an error is detected outside of any SAT task (always-on watching).
|
||||||
|
func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
|
||||||
|
if w.statusDB == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
const source = "watchdog:kmsg"
|
||||||
|
detail := "kernel: " + truncate(evt.raw, 120)
|
||||||
|
|
||||||
|
var severity string
|
||||||
|
for _, p := range platform.HardwareErrorPatterns {
|
||||||
|
if p.Re.MatchString(evt.raw) {
|
||||||
|
if p.Severity == "critical" {
|
||||||
|
severity = "Critical"
|
||||||
|
} else {
|
||||||
|
severity = "Warning"
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if severity == "" {
|
||||||
|
severity = "Warning"
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(evt.ids) == 0 {
|
||||||
|
key := "cpu:all"
|
||||||
|
if evt.category == "memory" {
|
||||||
|
key = "memory:all"
|
||||||
|
}
|
||||||
|
w.statusDB.Record(key, source, severity, detail)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, id := range evt.ids {
|
||||||
|
var key string
|
||||||
|
switch evt.category {
|
||||||
|
case "gpu":
|
||||||
|
key = "pcie:gpu:" + normalizeBDF(id)
|
||||||
|
case "pcie":
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
case "storage":
|
||||||
|
key = "storage:" + id
|
||||||
|
default:
|
||||||
|
key = "pcie:" + normalizeBDF(id)
|
||||||
|
}
|
||||||
|
w.statusDB.Record(key, source, severity, detail)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||||
// any pattern in platform.HardwareErrorPatterns.
|
// any pattern in platform.HardwareErrorPatterns.
|
||||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ func layoutHead(title string) string {
|
|||||||
<style>
|
<style>
|
||||||
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
|
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
|
||||||
*{box-sizing:border-box;margin:0;padding:0}
|
*{box-sizing:border-box;margin:0;padding:0}
|
||||||
|
dialog{margin:auto}
|
||||||
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
|
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
|
||||||
a{color:var(--accent);text-decoration:none}
|
a{color:var(--accent);text-decoration:none}
|
||||||
/* Sidebar */
|
/* Sidebar */
|
||||||
@@ -67,6 +68,10 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
|||||||
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||||
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||||
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||||
|
/* Nav separator and tasks count badge */
|
||||||
|
.nav-sep{height:1px;background:rgba(255,255,255,.12);margin:6px 0}
|
||||||
|
.tasks-nav-count{background:var(--accent);color:#fff;border-radius:10px;padding:1px 7px;font-size:11px;font-weight:700;display:none;margin-left:auto}
|
||||||
|
.tasks-nav-count.active{display:inline}
|
||||||
/* Output terminal */
|
/* Output terminal */
|
||||||
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||||
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||||
@@ -92,14 +97,21 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
|||||||
}
|
}
|
||||||
|
|
||||||
func layoutNav(active string, buildLabel string) string {
|
func layoutNav(active string, buildLabel string) string {
|
||||||
items := []struct{ id, label, href, onclick string }{
|
type navItem struct {
|
||||||
{"dashboard", "Dashboard", "/", ""},
|
id, label, href string
|
||||||
{"audit", "Audit", "/audit", ""},
|
sep bool
|
||||||
{"validate", "Validate", "/validate", ""},
|
}
|
||||||
{"burn", "Burn", "/burn", ""},
|
items := []navItem{
|
||||||
{"benchmark", "Benchmark", "/benchmark", ""},
|
{id: "dashboard", label: "Dashboard", href: "/"},
|
||||||
{"tasks", "Tasks", "/tasks", ""},
|
{id: "audit", label: "1. Audit", href: "/audit"},
|
||||||
{"tools", "Tools", "/tools", ""},
|
{id: "check", label: "2. Check", href: "/check"},
|
||||||
|
{id: "load", label: "3. Load", href: "/load"},
|
||||||
|
{id: "burn", label: "4. Burn", href: "/burn"},
|
||||||
|
{id: "benchmark", label: "5. Benchmark", href: "/benchmark"},
|
||||||
|
{sep: true},
|
||||||
|
{id: "tasks", label: "Tasks", href: "/tasks"},
|
||||||
|
{id: "tools", label: "Tools", href: "/tools"},
|
||||||
|
{id: "settings", label: "Settings", href: "/settings"},
|
||||||
}
|
}
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
b.WriteString(`<aside class="sidebar">`)
|
b.WriteString(`<aside class="sidebar">`)
|
||||||
@@ -119,19 +131,24 @@ func layoutNav(active string, buildLabel string) string {
|
|||||||
}
|
}
|
||||||
b.WriteString(`<nav class="nav">`)
|
b.WriteString(`<nav class="nav">`)
|
||||||
for _, item := range items {
|
for _, item := range items {
|
||||||
|
if item.sep {
|
||||||
|
b.WriteString(`<div class="nav-sep"></div>`)
|
||||||
|
continue
|
||||||
|
}
|
||||||
cls := "nav-item"
|
cls := "nav-item"
|
||||||
if item.id == active {
|
if item.id == active {
|
||||||
cls += " active"
|
cls += " active"
|
||||||
}
|
}
|
||||||
if item.onclick != "" {
|
if item.id == "tasks" {
|
||||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" onclick="%s">%s</a>`,
|
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" id="tasks-nav-item">%s<span class="tasks-nav-count" id="tasks-nav-count"></span></a>`, cls, item.href, item.label))
|
||||||
cls, item.href, item.onclick, item.label))
|
|
||||||
} else {
|
} else {
|
||||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`,
|
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`, cls, item.href, item.label))
|
||||||
cls, item.href, item.label))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
b.WriteString(`</nav>`)
|
b.WriteString(`</nav>`)
|
||||||
|
b.WriteString(`<script>`)
|
||||||
|
b.WriteString(`(function(){function u(){fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(d){var n=Array.isArray(d)?d.filter(function(t){return t.status==='pending'||t.status==='running';}).length:0;var c=document.getElementById('tasks-nav-count');var el=document.getElementById('tasks-nav-item');if(c){c.textContent=n>0?String(n):'';c.className='tasks-nav-count'+(n>0?' active':'');}if(el){el.style.color=n>0?'#f6c90e':'';}}).catch(function(){});}u();setInterval(u,5000);})();`)
|
||||||
|
b.WriteString(`</script>`)
|
||||||
b.WriteString(`</aside>`)
|
b.WriteString(`</aside>`)
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|||||||
368
audit/internal/webui/nvme_format.go
Normal file
368
audit/internal/webui/nvme_format.go
Normal file
@@ -0,0 +1,368 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// nvmeFormatMode describes one LBA format (MODE) reported for a namespace.
type nvmeFormatMode struct {
	// Mode is the LBA format index as printed by nvme-cli.
	Mode int `json:"mode"`
	// DataBytes is the data block size in bytes.
	DataBytes int64 `json:"data_bytes"`
	// MetadataBytes is the per-block metadata size in bytes.
	MetadataBytes int64 `json:"metadata_bytes"`
	// InUse is true when the parsed line carried the "in use" marker.
	InUse bool `json:"in_use"`
	// Label is the display string, e.g. "MODE 0 (512+0)".
	Label string `json:"label"`
}
|
||||||
|
|
||||||
|
type nvmeFormatDisk struct {
|
||||||
|
Device string `json:"device"`
|
||||||
|
Model string `json:"model,omitempty"`
|
||||||
|
Serial string `json:"serial,omitempty"`
|
||||||
|
Size string `json:"size,omitempty"`
|
||||||
|
CurrentMode int `json:"current_mode"`
|
||||||
|
CurrentFormat string `json:"current_format"`
|
||||||
|
Modes []nvmeFormatMode `json:"modes"`
|
||||||
|
Error string `json:"error,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// nvmeListJSON is the subset of the `nvme list -o json` document that
// listNVMeFormatDisks decodes.
type nvmeListJSON struct {
	// Devices holds one element per entry of the top-level "Devices" array.
	Devices []struct {
		DevicePath   string `json:"DevicePath"`
		ModelNumber  string `json:"ModelNumber"`
		SerialNumber string `json:"SerialNumber"`
		// PhysicalSize is the reported size in bytes.
		PhysicalSize int64 `json:"PhysicalSize"`
	} `json:"Devices"`
}
|
||||||
|
|
||||||
|
var (
	// nvmeFormatDeviceRE accepts only namespace block device paths such as
	// /dev/nvme0n1; anything else is rejected before it reaches a command line.
	nvmeFormatDeviceRE = regexp.MustCompile(`^/dev/nvme[0-9]+n[0-9]+$`)

	// nvmeLBAFCompactLineRE matches compact "lbaf N : ms:M lbads:D ..." lines.
	// Groups: 1 = mode index, 2 = metadata bytes, 3 = log2 of the data size.
	nvmeLBAFCompactLineRE = regexp.MustCompile(`(?im)^\s*lbaf\s+(\d+)\s*:\s*ms:(\d+)\s+lbads:(\d+).*$`)

	// nvmeLBAFVerboseLineRE matches human-readable "LBA Format N : Metadata
	// Size: M bytes - Data Size: D bytes ..." lines.
	// Groups: 1 = mode index, 2 = metadata bytes, 3 = data bytes.
	nvmeLBAFVerboseLineRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+(\d+)\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*$`)

	// nvmeCommandContext builds the nvme-cli commands; it is a variable so the
	// constructor can be swapped out.
	nvmeCommandContext = exec.CommandContext

	// nvmeListFormatsTimeout bounds one complete listNVMeFormatDisks call.
	nvmeListFormatsTimeout = 20 * time.Second
)
|
||||||
|
|
||||||
|
func listNVMeFormatDisks(ctx context.Context) ([]nvmeFormatDisk, error) {
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, nvmeListFormatsTimeout)
|
||||||
|
defer cancel()
|
||||||
|
out, err := nvmeCommandContext(ctx, "nvme", "list", "-o", "json").Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
var root nvmeListJSON
|
||||||
|
if err := json.Unmarshal(out, &root); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
disks := make([]nvmeFormatDisk, 0, len(root.Devices))
|
||||||
|
seen := map[string]struct{}{}
|
||||||
|
for _, dev := range root.Devices {
|
||||||
|
path := strings.TrimSpace(dev.DevicePath)
|
||||||
|
if !nvmeFormatDeviceRE.MatchString(path) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, ok := seen[path]; ok {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[path] = struct{}{}
|
||||||
|
disk := nvmeFormatDisk{
|
||||||
|
Device: path,
|
||||||
|
Model: strings.TrimSpace(dev.ModelNumber),
|
||||||
|
Serial: strings.TrimSpace(dev.SerialNumber),
|
||||||
|
Size: formatNVMeBytes(dev.PhysicalSize),
|
||||||
|
CurrentMode: -1,
|
||||||
|
}
|
||||||
|
modes, parseErr := readNVMeFormatModes(ctx, path)
|
||||||
|
if parseErr != nil {
|
||||||
|
disk.Error = parseErr.Error()
|
||||||
|
}
|
||||||
|
disk.Modes = modes
|
||||||
|
for _, mode := range modes {
|
||||||
|
if mode.InUse {
|
||||||
|
disk.CurrentMode = mode.Mode
|
||||||
|
disk.CurrentFormat = formatNVMeBlock(mode.DataBytes, mode.MetadataBytes)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
disks = append(disks, disk)
|
||||||
|
}
|
||||||
|
sort.Slice(disks, func(i, j int) bool { return disks[i].Device < disks[j].Device })
|
||||||
|
return disks, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readNVMeFormatModes(ctx context.Context, device string) ([]nvmeFormatMode, error) {
|
||||||
|
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||||
|
return nil, fmt.Errorf("invalid NVMe device")
|
||||||
|
}
|
||||||
|
out, err := nvmeCommandContext(ctx, "nvme", "id-ns", device, "-H").CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
msg := strings.TrimSpace(string(out))
|
||||||
|
if msg == "" {
|
||||||
|
msg = err.Error()
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("%s", msg)
|
||||||
|
}
|
||||||
|
modes := parseNVMeFormatModes(string(out))
|
||||||
|
if len(modes) == 0 {
|
||||||
|
return nil, fmt.Errorf("no LBA format modes found")
|
||||||
|
}
|
||||||
|
return modes, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseNVMeFormatModes(raw string) []nvmeFormatMode {
|
||||||
|
byMode := map[int]nvmeFormatMode{}
|
||||||
|
for _, m := range nvmeLBAFCompactLineRE.FindAllStringSubmatch(raw, -1) {
|
||||||
|
mode, errMode := strconv.Atoi(m[1])
|
||||||
|
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||||
|
lbads, errLBADS := strconv.Atoi(m[3])
|
||||||
|
if errMode != nil || errMS != nil || errLBADS != nil || lbads < 0 || lbads >= 63 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
data := int64(1) << lbads
|
||||||
|
line := m[0]
|
||||||
|
byMode[mode] = nvmeFormatMode{
|
||||||
|
Mode: mode,
|
||||||
|
DataBytes: data,
|
||||||
|
MetadataBytes: metadata,
|
||||||
|
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||||
|
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, m := range nvmeLBAFVerboseLineRE.FindAllStringSubmatch(raw, -1) {
|
||||||
|
mode, errMode := strconv.Atoi(m[1])
|
||||||
|
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||||
|
data, errData := strconv.ParseInt(m[3], 10, 64)
|
||||||
|
if errMode != nil || errMS != nil || errData != nil || data <= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
line := m[0]
|
||||||
|
byMode[mode] = nvmeFormatMode{
|
||||||
|
Mode: mode,
|
||||||
|
DataBytes: data,
|
||||||
|
MetadataBytes: metadata,
|
||||||
|
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||||
|
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
modes := make([]nvmeFormatMode, 0, len(byMode))
|
||||||
|
for _, mode := range byMode {
|
||||||
|
modes = append(modes, mode)
|
||||||
|
}
|
||||||
|
sort.Slice(modes, func(i, j int) bool { return modes[i].Mode < modes[j].Mode })
|
||||||
|
return modes
|
||||||
|
}
|
||||||
|
|
||||||
|
func runNVMeFormatTask(ctx context.Context, j *jobState, device string, lbaf int) error {
|
||||||
|
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||||
|
return fmt.Errorf("invalid NVMe device")
|
||||||
|
}
|
||||||
|
modes, err := readNVMeFormatModes(ctx, device)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
var selected nvmeFormatMode
|
||||||
|
found := false
|
||||||
|
for _, mode := range modes {
|
||||||
|
if mode.Mode == lbaf {
|
||||||
|
selected = mode
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
return fmt.Errorf("MODE %d is not available on %s", lbaf, device)
|
||||||
|
}
|
||||||
|
ms := 0
|
||||||
|
if selected.MetadataBytes > 0 {
|
||||||
|
ms = 1
|
||||||
|
}
|
||||||
|
j.append(fmt.Sprintf("Formatting %s to %s with --lbaf=%d --ms=%d --force", device, formatNVMeBlock(selected.DataBytes, selected.MetadataBytes), selected.Mode, ms))
|
||||||
|
cmd := nvmeCommandContext(ctx, "nvme", "format", device, fmt.Sprintf("--lbaf=%d", selected.Mode), fmt.Sprintf("--ms=%d", ms), "--force")
|
||||||
|
return streamCmdJob(j, cmd)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPINVMeFormats(w http.ResponseWriter, r *http.Request) {
|
||||||
|
disks, err := listNVMeFormatDisks(r.Context())
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, disks)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPINVMeFormatRun(w http.ResponseWriter, r *http.Request) {
|
||||||
|
var req struct {
|
||||||
|
Device string `json:"device"`
|
||||||
|
LBAF int `json:"lbaf"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if !nvmeFormatDeviceRE.MatchString(req.Device) {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid NVMe device")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
disks, err := listNVMeFormatDisks(r.Context())
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
var label string
|
||||||
|
allowed := false
|
||||||
|
for _, disk := range disks {
|
||||||
|
if disk.Device != req.Device {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, mode := range disk.Modes {
|
||||||
|
if mode.Mode == req.LBAF {
|
||||||
|
allowed = true
|
||||||
|
label = mode.Label
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !allowed {
|
||||||
|
writeError(w, http.StatusBadRequest, "LBA format mode is not available for this device")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
name := fmt.Sprintf("NVMe Format %s to %s", filepath.Base(req.Device), label)
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID("nvme-format"),
|
||||||
|
Name: name,
|
||||||
|
Target: "nvme-format",
|
||||||
|
Priority: defaultTaskPriority("nvme-format", taskParams{}),
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{
|
||||||
|
Device: req.Device,
|
||||||
|
LBAF: req.LBAF,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||||
|
}
|
||||||
|
|
||||||
|
// formatNVMeBlock renders an LBA format as "<data>+<metadata>", both in bytes,
// for example "4096+8".
func formatNVMeBlock(dataBytes, metadataBytes int64) string {
	return fmt.Sprintf("%d+%d", dataBytes, metadataBytes)
}
|
||||||
|
|
||||||
|
// formatNVMeBytes renders a byte count with decimal (1000-based) units and one
// fractional digit, e.g. "1.5 KB". Values below 1000 are printed as whole
// bytes ("512 B"); zero and negative values yield an empty string. The largest
// unit used is PB.
func formatNVMeBytes(n int64) string {
	if n <= 0 {
		return ""
	}
	if n < 1000 {
		return fmt.Sprintf("%d B", n)
	}

	suffixes := [...]string{"KB", "MB", "GB", "TB", "PB"}
	scaled := float64(n) / 1000
	idx := 0
	for scaled >= 1000 && idx < len(suffixes)-1 {
		scaled /= 1000
		idx++
	}
	return fmt.Sprintf("%.1f %s", scaled, suffixes[idx])
}
|
||||||
|
|
||||||
|
// renderNVMeFormatInline returns the status line, table container and inline
// JavaScript of the NVMe block-format tool.
//
// The script loads namespaces from /api/tools/nvme-formats, renders one row per
// namespace with a mode selector, and on "Apply" (after a confirm dialog) posts
// {device, lbaf} to /api/tools/nvme-format/run, then polls /api/tasks every
// 1.5 s until the task is done, failed or cancelled and reloads the table.
//
// Device, model, serial and error strings are inserted through innerHTML, so
// nvmeFormatEsc must map & < > " ' to HTML entities. A map that returns the
// characters unchanged escapes nothing, and an unescaped single quote inside
// the single-quoted map literal is a syntax error that stops the whole script
// from loading.
func renderNVMeFormatInline() string {
	return `<div id="nvme-format-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVMe disks...</div>
<div id="nvme-format-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
<script>
function nvmeFormatEsc(s) {
  return String(s == null ? '' : s).replace(/[&<>"']/g, function(c) {
    return {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];
  });
}
function loadNVMeFormats() {
  var status = document.getElementById('nvme-format-status');
  var table = document.getElementById('nvme-format-table');
  status.textContent = 'Loading NVMe disks...';
  status.style.color = 'var(--muted)';
  table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
  fetch('/api/tools/nvme-formats').then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(disks) {
    window._nvmeFormatDisks = Array.isArray(disks) ? disks : [];
    if (!window._nvmeFormatDisks.length) {
      status.textContent = 'No NVMe disks found.';
      table.innerHTML = '';
      return;
    }
    status.textContent = window._nvmeFormatDisks.length + ' NVMe disk(s) found.';
    var rows = window._nvmeFormatDisks.map(function(d, idx) {
      var current = d.current_format ? (d.current_format + ' / MODE ' + d.current_mode) : 'unknown';
      var detail = [d.model || '', d.serial || '', d.size || ''].filter(Boolean).join(' | ');
      var options = (d.modes || []).map(function(m) {
        return '<option value="' + m.mode + '"' + (m.in_use ? ' selected' : '') + '>' + nvmeFormatEsc(m.label) + '</option>';
      }).join('');
      var disabled = options ? '' : ' disabled';
      var err = d.error ? '<div style="font-size:12px;color:var(--crit-fg,#9f3a38);margin-top:4px">' + nvmeFormatEsc(d.error) + '</div>' : '';
      return '<tr>'
        + '<td style="font-family:monospace;white-space:nowrap">' + nvmeFormatEsc(d.device) + (detail ? '<div style="font-family:inherit;font-size:12px;color:var(--muted)">' + nvmeFormatEsc(detail) + '</div>' : '') + '</td>'
        + '<td style="white-space:nowrap">' + nvmeFormatEsc(current) + err + '</td>'
        + '<td style="white-space:nowrap"><select id="nvme-format-select-' + idx + '"' + disabled + '>' + options + '</select></td>'
        + '<td style="white-space:nowrap"><button class="btn btn-sm btn-primary" onclick="nvmeFormatRun(' + idx + ', this)"' + disabled + '>Apply</button><div class="nvme-format-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div></td>'
        + '</tr>';
    }).join('');
    table.innerHTML = '<table><tr><th>Disk</th><th>Current block / mode</th><th>New mode</th><th>Action</th></tr>' + rows + '</table>';
  }).catch(function(e) {
    status.textContent = 'Error loading NVMe disks: ' + e.message;
    status.style.color = 'var(--crit-fg,#9f3a38)';
    table.innerHTML = '';
  });
}
function nvmeWaitTaskDone(taskID, rowMsg) {
  var timer = setInterval(function() {
    fetch('/api/tasks').then(function(r) { return r.json(); }).then(function(tasks) {
      var task = (tasks || []).find(function(t) { return t.id === taskID; });
      if (!task) return;
      if (task.status === 'done' || task.status === 'failed' || task.status === 'cancelled') {
        clearInterval(timer);
        rowMsg.textContent = 'Task ' + taskID + ': ' + task.status + (task.error ? ' - ' + task.error : '');
        rowMsg.style.color = task.status === 'done' ? 'var(--ok,green)' : 'var(--crit-fg,#9f3a38)';
        loadNVMeFormats();
      }
    }).catch(function(){});
  }, 1500);
}
function nvmeFormatRun(idx, btn) {
  var disk = (window._nvmeFormatDisks || [])[idx];
  var select = document.getElementById('nvme-format-select-' + idx);
  var row = btn.closest('td');
  var rowMsg = row.querySelector('.nvme-format-row-msg');
  if (!disk || !select) return;
  var lbaf = parseInt(select.value, 10);
  var mode = (disk.modes || []).find(function(m) { return m.mode === lbaf; });
  if (!mode) return;
  if (!window.confirm('Format ' + disk.device + ' to ' + mode.label + '? This erases data on the namespace.')) return;
  btn.disabled = true;
  rowMsg.style.color = 'var(--muted)';
  rowMsg.textContent = 'Queued...';
  fetch('/api/tools/nvme-format/run', {
    method:'POST',
    headers:{'Content-Type':'application/json'},
    body:JSON.stringify({device: disk.device, lbaf: lbaf})
  }).then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(d) {
    rowMsg.textContent = 'Task ' + d.task_id + ' queued.';
    nvmeWaitTaskDone(d.task_id, rowMsg);
  }).catch(function(e) {
    rowMsg.style.color = 'var(--crit-fg,#9f3a38)';
    rowMsg.textContent = 'Error: ' + e.message;
  }).finally(function() {
    btn.disabled = false;
  });
}
loadNVMeFormats();
</script>`
}
|
||||||
|
|
||||||
|
func renderNVMeFormatCard() string {
|
||||||
|
return `<div class="card"><div class="card-head">NVMe Block Format <button class="btn btn-sm btn-secondary" onclick="loadNVMeFormats()" style="margin-left:auto">↻ Refresh</button></div><div class="card-body">` +
|
||||||
|
`<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Lists NVMe namespaces and changes their LBA format through a queued task.</p>` +
|
||||||
|
renderNVMeFormatInline() + `</div></div>`
|
||||||
|
}
|
||||||
@@ -611,3 +611,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
|||||||
b.WriteString(`</div></div>`)
|
b.WriteString(`</div></div>`)
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// renderSpeed and renderEndurance are legacy wrappers; canonical page is 5. Benchmark at /benchmark.
|
||||||
|
func renderSpeed(opts HandlerOptions) string { return renderBenchmark(opts) }
|
||||||
|
func renderEndurance(opts HandlerOptions) string { return renderBenchmark(opts) }
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ package webui
|
|||||||
|
|
||||||
func renderBurn() string {
|
func renderBurn() string {
|
||||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn runs sustained GPU compute and CPU/memory stress recipes. DCGM targeted diagnostics (<code>targeted_stress</code>, <code>targeted_power</code>, <code>pulse_test</code>) and NCCL/NVBandwidth are on the <a href="/load">3. Load</a> page. For performance benchmarks, see <a href="/benchmark">5. Benchmark</a>.</div>
|
||||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
|
||||||
<div class="card" style="margin-bottom:16px">
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
|||||||
@@ -402,93 +402,11 @@ loadNvidiaSelfHeal();
|
|||||||
}
|
}
|
||||||
|
|
||||||
func renderTools() string {
|
func renderTools() string {
|
||||||
return `<div class="card" style="margin-bottom:16px">
|
return renderNVMeFormatCard() + `
|
||||||
<div class="card-head">System Install</div>
|
|
||||||
<div class="card-body">
|
|
||||||
<div style="margin-bottom:20px">
|
|
||||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
|
||||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
|
||||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
|
||||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
|
||||||
</div>
|
|
||||||
<div style="border-top:1px solid var(--line);padding-top:20px">
|
|
||||||
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
|
||||||
renderInstallInline() + `
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
<script>
|
|
||||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
|
||||||
const boot = document.getElementById('boot-source-text');
|
|
||||||
const txt = document.getElementById('ram-status-text');
|
|
||||||
const btn = document.getElementById('ram-install-btn');
|
|
||||||
let source = d.device || d.source || 'unknown source';
|
|
||||||
let kind = d.kind || 'unknown';
|
|
||||||
let label = source;
|
|
||||||
if (kind === 'ram') label = 'RAM';
|
|
||||||
else if (kind === 'usb') label = 'USB (' + source + ')';
|
|
||||||
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
|
|
||||||
else if (kind === 'disk') label = 'disk (' + source + ')';
|
|
||||||
else label = source;
|
|
||||||
boot.textContent = 'Current boot source: ' + label + '.';
|
|
||||||
txt.textContent = d.message || 'Checking...';
|
|
||||||
if (d.status === 'ok' || d.in_ram) {
|
|
||||||
txt.style.color = 'var(--ok, green)';
|
|
||||||
} else if (d.status === 'failed') {
|
|
||||||
txt.style.color = 'var(--err, #b91c1c)';
|
|
||||||
} else {
|
|
||||||
txt.style.color = 'var(--muted)';
|
|
||||||
}
|
|
||||||
if (d.can_start_task) {
|
|
||||||
btn.style.display = '';
|
|
||||||
btn.disabled = false;
|
|
||||||
} else {
|
|
||||||
btn.style.display = 'none';
|
|
||||||
}
|
|
||||||
});
|
|
||||||
function installToRAM() {
|
|
||||||
document.getElementById('ram-install-btn').disabled = true;
|
|
||||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
|
||||||
window.location.href = '/tasks#' + d.task_id;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
</script>
|
|
||||||
|
|
||||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
` + renderSAADMICard() + `
|
||||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
|
||||||
` + renderSupportBundleInline() + `
|
|
||||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
|
||||||
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
|
||||||
` + renderUSBExportInline() + `
|
|
||||||
</div>
|
|
||||||
</div></div>
|
|
||||||
|
|
||||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
` + renderIPMIFRUCard()
|
||||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
|
||||||
|
|
||||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
|
||||||
renderNvidiaSelfHealInline() + `</div></div>
|
|
||||||
|
|
||||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
|
||||||
renderNetworkInline() + `</div></div>
|
|
||||||
|
|
||||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
|
||||||
renderServicesInline() + `</div></div>
|
|
||||||
|
|
||||||
|
|
||||||
<script>
|
|
||||||
function checkTools() {
|
|
||||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
|
||||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
|
||||||
const rows = tools.map(t =>
|
|
||||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
|
|
||||||
).join('');
|
|
||||||
document.getElementById('tools-table').innerHTML =
|
|
||||||
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
|
||||||
});
|
|
||||||
}
|
|
||||||
checkTools();
|
|
||||||
</script>`
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func renderExportIndex(exportDir string) (string, error) {
|
func renderExportIndex(exportDir string) (string, error) {
|
||||||
|
|||||||
122
audit/internal/webui/page_settings.go
Normal file
122
audit/internal/webui/page_settings.go
Normal file
@@ -0,0 +1,122 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import "html"
|
||||||
|
|
||||||
|
func renderSettings(opts HandlerOptions) string {
|
||||||
|
version := opts.BuildLabel
|
||||||
|
if version == "" {
|
||||||
|
version = "dev"
|
||||||
|
}
|
||||||
|
return `<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">System Install</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<div style="margin-bottom:20px">
|
||||||
|
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||||
|
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||||
|
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||||
|
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||||
|
</div>
|
||||||
|
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||||
|
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||||
|
renderInstallInline() + `
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<script>
|
||||||
|
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||||
|
const boot = document.getElementById('boot-source-text');
|
||||||
|
const txt = document.getElementById('ram-status-text');
|
||||||
|
const btn = document.getElementById('ram-install-btn');
|
||||||
|
let kind = d.kind || 'unknown';
|
||||||
|
let source = d.device || d.source || 'unknown source';
|
||||||
|
let label = kind==='ram'?'RAM':kind==='usb'?'USB ('+source+')':kind==='cdrom'?'CD-ROM ('+source+')':kind==='disk'?'disk ('+source+')':source;
|
||||||
|
boot.textContent = 'Current boot source: ' + label + '.';
|
||||||
|
txt.textContent = d.blocked_reason || d.message || 'Checking...';
|
||||||
|
txt.style.color = (d.status==='ok'||d.in_ram)?'var(--ok,green)':d.status==='failed'?'var(--err,#b91c1c)':'var(--muted)';
|
||||||
|
if (d.can_start_task) { btn.style.display=''; btn.disabled=false; } else { btn.style.display='none'; }
|
||||||
|
});
|
||||||
|
function installToRAM() {
|
||||||
|
document.getElementById('ram-install-btn').disabled = true;
|
||||||
|
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||||
|
window.location.href = '/tasks#' + d.task_id;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||||
|
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||||
|
` + renderSupportBundleInline() + `
|
||||||
|
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||||
|
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
||||||
|
` + renderUSBExportInline() + `
|
||||||
|
</div>
|
||||||
|
</div></div>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||||
|
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||||
|
<script>
|
||||||
|
function checkTools() {
|
||||||
|
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||||
|
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||||
|
const rows = tools.map(t =>
|
||||||
|
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK?'badge-ok':'badge-err')+'">'+(t.OK?'✓ '+t.Path:'✗ missing')+'</span></td></tr>'
|
||||||
|
).join('');
|
||||||
|
document.getElementById('tools-table').innerHTML = '<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||||
|
});
|
||||||
|
}
|
||||||
|
checkTools();
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||||
|
renderNvidiaSelfHealInline() + `</div></div>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||||
|
renderNetworkInline() + `</div></div>
|
||||||
|
|
||||||
|
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||||
|
renderServicesInline() + `</div></div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">Blackbox Logging</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:13px;color:var(--muted);margin-bottom:14px">Continuous hardware monitoring that writes a rolling log of sensor readings to the export directory.</p>
|
||||||
|
<div style="display:flex;gap:8px;align-items:center">
|
||||||
|
<button class="btn btn-primary btn-sm" onclick="blackboxToggle('enable')">Enable</button>
|
||||||
|
<button class="btn btn-secondary btn-sm" onclick="blackboxToggle('disable')">Disable</button>
|
||||||
|
<span id="blackbox-status" style="font-size:12px;color:var(--muted)">Loading...</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<div class="card-head">Build Info</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<table style="width:auto">
|
||||||
|
<tbody>
|
||||||
|
<tr><td style="color:var(--muted);padding-right:24px">Version</td><td>` + html.EscapeString(version) + `</td></tr>
|
||||||
|
<tr><td style="color:var(--muted);padding-right:24px">Title</td><td>` + html.EscapeString(opts.Title) + `</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
(function() {
|
||||||
|
fetch('/api/blackbox/status', {cache:'no-store'}).then(r => r.json()).then(d => {
|
||||||
|
var el = document.getElementById('blackbox-status');
|
||||||
|
if (el) el.textContent = d.enabled ? 'Enabled' : 'Disabled';
|
||||||
|
}).catch(() => {
|
||||||
|
var el = document.getElementById('blackbox-status');
|
||||||
|
if (el) el.textContent = 'Status unavailable';
|
||||||
|
});
|
||||||
|
})();
|
||||||
|
function blackboxToggle(action) {
|
||||||
|
var el = document.getElementById('blackbox-status');
|
||||||
|
if (el) el.textContent = 'Updating...';
|
||||||
|
fetch('/api/blackbox/' + action, {method:'POST', cache:'no-store'})
|
||||||
|
.then(r => r.json())
|
||||||
|
.then(d => { if (el) el.textContent = d.enabled ? 'Enabled' : 'Disabled'; })
|
||||||
|
.catch(err => { if (el) el.textContent = 'Error: ' + err.message; });
|
||||||
|
}
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
@@ -11,6 +11,13 @@ import (
|
|||||||
"bee/audit/internal/schema"
|
"bee/audit/internal/schema"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// PCI vendor IDs used for GPU classification (source: pci-ids.ucw.cz).
|
||||||
|
const (
|
||||||
|
pciVendorNvidia = 0x10de
|
||||||
|
pciVendorAMD = 0x1002
|
||||||
|
pciVendorAspeed = 0x1a03
|
||||||
|
)
|
||||||
|
|
||||||
type validateInventory struct {
|
type validateInventory struct {
|
||||||
CPU string
|
CPU string
|
||||||
Memory string
|
Memory string
|
||||||
@@ -61,6 +68,14 @@ func validateTotalStressSec(n int) int {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func renderValidate(opts HandlerOptions) string {
|
func renderValidate(opts HandlerOptions) string {
|
||||||
|
return renderValidateMode(opts, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderValidateStress(opts HandlerOptions) string {
|
||||||
|
return renderValidateMode(opts, true)
|
||||||
|
}
|
||||||
|
|
||||||
|
func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
|
||||||
inv := loadValidateInventory(opts)
|
inv := loadValidateInventory(opts)
|
||||||
n := inv.NvidiaGPUCount
|
n := inv.NvidiaGPUCount
|
||||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||||
@@ -69,26 +84,49 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
if n > 0 {
|
if n > 0 {
|
||||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||||
}
|
}
|
||||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
estStr := validateTotalStr
|
||||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
if stressDefault {
|
||||||
|
estStr = stressTotalStr
|
||||||
|
}
|
||||||
|
alert := `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>`
|
||||||
|
if stressDefault {
|
||||||
|
alert = `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Stress mode:</strong> Runs extended load tests — CPU stress-ng, memory passes, DCGM targeted diagnostics. Higher wear than Validate.</div>`
|
||||||
|
}
|
||||||
|
|
||||||
<div class="card" style="margin-bottom:16px">
|
stressOnlyCards := ""
|
||||||
<div class="card-head">Validate Profile</div>
|
if stressDefault {
|
||||||
<div class="card-body validate-profile-body">
|
stressOnlyCards = renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||||
<div class="validate-profile-col">
|
inv.NVIDIA,
|
||||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
`<code>dcgmi diag targeted_stress</code>`,
|
||||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec)+` (all GPUs simultaneously).`,
|
||||||
</div>
|
)) +
|
||||||
<div class="validate-profile-col validate-profile-action">
|
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
inv.NVIDIA,
|
||||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||||
<div style="margin-top:12px">
|
`<code>dcgmi diag targeted_power</code>`,
|
||||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec)+` (all GPUs simultaneously).`,
|
||||||
</div>
|
)) +
|
||||||
</div>
|
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||||
</div>
|
inv.NVIDIA,
|
||||||
</div>
|
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||||
|
`<code>dcgmi diag pulse_test</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
satStressModeJS := "function satStressMode() { return false; }"
|
||||||
|
if stressDefault {
|
||||||
|
satStressModeJS = "function satStressMode() { return true; }"
|
||||||
|
}
|
||||||
|
|
||||||
|
return alert + `
|
||||||
|
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||||
|
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||||
|
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Run All</button>
|
||||||
|
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
|
<span style="font-size:12px;color:var(--muted)">est. ` + estStr + gpuNote + `</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="grid3">
|
<div class="grid3">
|
||||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||||
@@ -115,7 +153,7 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
<div class="card-head">NVIDIA GPU Selection</div>
|
<div class="card-head">NVIDIA GPU Selection</div>
|
||||||
<div class="card-body">
|
<div class="card-body">
|
||||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Run All.</p>
|
||||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||||
@@ -136,46 +174,19 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||||
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||||
)) +
|
)) +
|
||||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
stressOnlyCards +
|
||||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
|
||||||
inv.NVIDIA,
|
|
||||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
|
||||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
|
||||||
)) +
|
|
||||||
`</div>` +
|
|
||||||
`<div id="sat-card-nvidia-targeted-power">` +
|
|
||||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
|
||||||
inv.NVIDIA,
|
|
||||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
|
||||||
`<code>dcgmi diag targeted_power</code>`,
|
|
||||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
|
||||||
)) +
|
|
||||||
`</div>` +
|
|
||||||
`<div id="sat-card-nvidia-pulse">` +
|
|
||||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
|
||||||
inv.NVIDIA,
|
|
||||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
|
||||||
`<code>dcgmi diag pulse_test</code>`,
|
|
||||||
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
|
||||||
)) +
|
|
||||||
`</div>` +
|
|
||||||
`<div id="sat-card-nvidia-interconnect">` +
|
|
||||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
|
||||||
`<div id="sat-card-nvidia-bandwidth">` +
|
|
||||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||||
inv.NVIDIA,
|
inv.NVIDIA,
|
||||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||||
`<code>nvbandwidth</code>`,
|
`<code>nvbandwidth</code>`,
|
||||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||||
)) +
|
)) +
|
||||||
`</div>` +
|
|
||||||
`</div>
|
`</div>
|
||||||
<div class="grid3" style="margin-top:16px">
|
<div class="grid3" style="margin-top:16px">
|
||||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||||
@@ -190,36 +201,15 @@ func renderValidate(opts HandlerOptions) string {
|
|||||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||||
</div>
|
</div>
|
||||||
<style>
|
<style>
|
||||||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
|
||||||
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
|
||||||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
|
||||||
.validate-card-body { padding:0; }
|
.validate-card-body { padding:0; }
|
||||||
.validate-card-section { padding:12px 16px 0; }
|
.validate-card-section { padding:12px 16px 0; }
|
||||||
.validate-card-section:last-child { padding-bottom:16px; }
|
.validate-card-section:last-child { padding-bottom:16px; }
|
||||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
|
||||||
</style>
|
</style>
|
||||||
<script>
|
<script>
|
||||||
let satES = null;
|
let satES = null;
|
||||||
function satStressMode() {
|
` + satStressModeJS + `
|
||||||
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
|
||||||
}
|
|
||||||
function satModeChanged() {
|
|
||||||
const stress = satStressMode();
|
|
||||||
[
|
|
||||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
|
||||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
|
||||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
|
||||||
].forEach(function(item) {
|
|
||||||
const card = document.getElementById(item.card);
|
|
||||||
if (card) {
|
|
||||||
card.style.opacity = stress ? '1' : '0.5';
|
|
||||||
const hint = document.getElementById(item.hint);
|
|
||||||
if (hint) hint.style.display = stress ? 'none' : '';
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
function satLabels() {
|
function satLabels() {
|
||||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
}
|
}
|
||||||
@@ -634,25 +624,307 @@ func validateFirstNonEmpty(values ...string) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||||
model := strings.ToLower(validateTrimPtr(dev.Model))
|
if dev.VendorID != nil && *dev.VendorID == pciVendorAspeed {
|
||||||
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
|
||||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
|
||||||
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||||
|
isGPUClass := class == "videocontroller" || class == "processingaccelerator" || class == "displaycontroller"
|
||||||
switch vendor {
|
switch vendor {
|
||||||
case "nvidia":
|
case "nvidia":
|
||||||
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorNvidia
|
||||||
case "amd":
|
case "amd":
|
||||||
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorAMD
|
||||||
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
|
||||||
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
|
||||||
return isGPUClass && (isAMDVendor || isAMDModel)
|
|
||||||
default:
|
default:
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// renderCheck renders the non-destructive Check page (step 2).
|
||||||
|
// Shows validate-mode tests only: CPU, Memory, Storage, NVIDIA L2, NCCL, NVBandwidth, AMD.
|
||||||
|
// Stress-mode tests (targeted-stress, targeted-power, pulse) are on the Load page.
|
||||||
|
func renderCheck(opts HandlerOptions) string {
|
||||||
|
inv := loadValidateInventory(opts)
|
||||||
|
n := inv.NvidiaGPUCount
|
||||||
|
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||||
|
gpuNote := ""
|
||||||
|
if n > 0 {
|
||||||
|
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||||
|
}
|
||||||
|
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Check tests collect diagnostics only — no writes to disks, no sustained load, no hardware wear counters incremented. For stress testing, go to <a href="/burn">4. Burn</a>.</div>
|
||||||
|
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||||
|
<button type="button" class="btn btn-primary" onclick="runAllCheckSAT()">Run All Checks</button>
|
||||||
|
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||||
|
<span style="font-size:12px;color:var(--muted)">est. ` + validateTotalStr + gpuNote + `</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid3">
|
||||||
|
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||||
|
inv.CPU,
|
||||||
|
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||||
|
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` (stress-ng 60 s).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||||
|
inv.Memory,
|
||||||
|
`Runs a RAM validation pass and records memory state around the test.`,
|
||||||
|
`<code>free</code>, <code>memtester</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` (256 MB × 1 pass).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||||
|
inv.Storage,
|
||||||
|
`Scans all storage devices and runs the matching health or self-test path for each.`,
|
||||||
|
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||||
|
`Seconds (NVMe: instant device query; SATA/SAS: short self-test).`,
|
||||||
|
)) +
|
||||||
|
`</div>
|
||||||
|
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||||
|
<div class="card" style="margin-bottom:16px">
|
||||||
|
<div class="card-head">NVIDIA GPU Selection</div>
|
||||||
|
<div class="card-body">
|
||||||
|
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||||
|
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||||
|
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||||
|
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||||
|
</div>
|
||||||
|
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA check tasks.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="grid3">
|
||||||
|
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Runs NVIDIA diagnostics and board inventory checks (DCGM Level 2).`,
|
||||||
|
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec)+` (Level 2, all GPUs simultaneously).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs.`,
|
||||||
|
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||||
|
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||||
|
)) +
|
||||||
|
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||||
|
inv.NVIDIA,
|
||||||
|
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||||
|
`<code>nvbandwidth</code>`,
|
||||||
|
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously).`,
|
||||||
|
)) +
|
||||||
|
`</div>
|
||||||
|
<div class="grid3" style="margin-top:16px">
|
||||||
|
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||||
|
inv.AMD,
|
||||||
|
`Runs AMD GPU inventory, MEM integrity, and MEM bandwidth checks.`,
|
||||||
|
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||||
|
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||||
|
)) +
|
||||||
|
`</div>
|
||||||
|
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||||
|
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||||
|
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||||
|
</div>
|
||||||
|
<style>
|
||||||
|
.validate-card-body { padding:0; }
|
||||||
|
.validate-card-section { padding:12px 16px 0; }
|
||||||
|
.validate-card-section:last-child { padding-bottom:16px; }
|
||||||
|
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||||
|
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||||
|
</style>
|
||||||
|
<script>
|
||||||
|
let satES = null;
|
||||||
|
function satLabels() {
|
||||||
|
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||||
|
}
|
||||||
|
let satNvidiaGPUsPromise = null;
|
||||||
|
function loadSatNvidiaGPUs() {
|
||||||
|
if (!satNvidiaGPUsPromise) {
|
||||||
|
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia').then(r => {
|
||||||
|
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||||
|
return r.json();
|
||||||
|
}).then(list => Array.isArray(list) ? list : []);
|
||||||
|
}
|
||||||
|
return satNvidiaGPUsPromise;
|
||||||
|
}
|
||||||
|
function satSelectedGPUIndices() {
|
||||||
|
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||||||
|
.filter(el => el.checked && !el.disabled)
|
||||||
|
.map(el => parseInt(el.value, 10))
|
||||||
|
.filter(v => !Number.isNaN(v))
|
||||||
|
.sort((a, b) => a - b);
|
||||||
|
}
|
||||||
|
function satUpdateGPUSelectionNote() {
|
||||||
|
const note = document.getElementById('sat-gpu-selection-note');
|
||||||
|
if (!note) return;
|
||||||
|
const sel = satSelectedGPUIndices();
|
||||||
|
note.textContent = sel.length
|
||||||
|
? 'Selected GPUs: ' + sel.join(', ') + '. Multi-GPU tests will use all selected GPUs.'
|
||||||
|
: 'Select at least one NVIDIA GPU to enable NVIDIA check tasks.';
|
||||||
|
}
|
||||||
|
function satRenderGPUList(gpus) {
|
||||||
|
const root = document.getElementById('sat-gpu-list');
|
||||||
|
if (!root) return;
|
||||||
|
if (!gpus || !gpus.length) {
|
||||||
|
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||||
|
satUpdateGPUSelectionNote(); return;
|
||||||
|
}
|
||||||
|
root.innerHTML = gpus.map(gpu => {
|
||||||
|
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||||
|
return '<label class="sat-gpu-row"><input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()"><span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span></label>';
|
||||||
|
}).join('');
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
}
|
||||||
|
function satSelectAllGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = true; }); satUpdateGPUSelectionNote(); }
|
||||||
|
function satSelectNoGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = false; }); satUpdateGPUSelectionNote(); }
|
||||||
|
function satGPULoadInit() {
|
||||||
|
loadSatNvidiaGPUs().then(satRenderGPUList).catch(err => {
|
||||||
|
const root = document.getElementById('sat-gpu-list');
|
||||||
|
if (root) root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||||
|
satUpdateGPUSelectionNote();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function satRequestBody(target, overrides) {
|
||||||
|
const body = {};
|
||||||
|
const labels = satLabels();
|
||||||
|
body.display_name = labels[target] || ('Check ' + target);
|
||||||
|
body.stress_mode = false;
|
||||||
|
if (target === 'cpu') body.duration = 60;
|
||||||
|
if (overrides) Object.keys(overrides).forEach(k => { body[k] = overrides[k]; });
|
||||||
|
return body;
|
||||||
|
}
|
||||||
|
function enqueueSATTarget(target, overrides) {
|
||||||
|
return fetch('/api/sat/' + target + '/run', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify(satRequestBody(target, overrides))}).then(r => r.json());
|
||||||
|
}
|
||||||
|
function streamSATTask(taskId, title, resetTerminal) {
|
||||||
|
if (satES) { satES.close(); satES = null; }
|
||||||
|
document.getElementById('sat-output').style.display = 'block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + title;
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
if (resetTerminal) term.textContent = '';
|
||||||
|
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||||
|
return new Promise(resolve => {
|
||||||
|
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||||
|
satES.onmessage = e => { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||||
|
satES.addEventListener('done', e => {
|
||||||
|
satES.close(); satES = null;
|
||||||
|
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: !e.data, error: e.data || ''});
|
||||||
|
});
|
||||||
|
satES.onerror = () => {
|
||||||
|
if (satES) { satES.close(); satES = null; }
|
||||||
|
term.textContent += '\nERROR: stream disconnected.\n';
|
||||||
|
term.scrollTop = term.scrollHeight;
|
||||||
|
resolve({ok: false, error: 'stream disconnected'});
|
||||||
|
};
|
||||||
|
});
|
||||||
|
}
|
||||||
|
function selectedAMDValidateTargets() {
|
||||||
|
const targets = [];
|
||||||
|
const gpu = document.getElementById('sat-amd-target');
|
||||||
|
const mem = document.getElementById('sat-amd-mem-target');
|
||||||
|
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||||
|
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||||
|
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||||
|
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||||
|
return targets;
|
||||||
|
}
|
||||||
|
function runSAT(target) { return runSATWithOverrides(target, null); }
|
||||||
|
function runSATWithOverrides(target, overrides) {
|
||||||
|
const title = (overrides && overrides.display_name) || target;
|
||||||
|
document.getElementById('sat-output').style.display = 'block';
|
||||||
|
document.getElementById('sat-title').textContent = '— ' + title;
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||||
|
return enqueueSATTarget(target, overrides).then(d => streamSATTask(d.task_id, title, false));
|
||||||
|
}
|
||||||
|
function runNvidiaFabricValidate(target) {
|
||||||
|
const indices = satSelectedGPUIndices();
|
||||||
|
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||||
|
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||||
|
}
|
||||||
|
function runNvidiaValidateSet(target) {
|
||||||
|
const sel = satSelectedGPUIndices();
|
||||||
|
if (!sel.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||||
|
return runSATWithOverrides(target, {gpu_indices: sel, display_name: satLabels()[target] || target});
|
||||||
|
}
|
||||||
|
function runAMDValidateSet() {
|
||||||
|
const targets = selectedAMDValidateTargets();
|
||||||
|
if (!targets.length) return;
|
||||||
|
if (targets.length === 1) return runSAT(targets[0]);
|
||||||
|
const term = document.getElementById('sat-terminal');
|
||||||
|
document.getElementById('sat-output').style.display = 'block';
|
||||||
|
document.getElementById('sat-title').textContent = '— amd';
|
||||||
|
term.textContent = 'Running AMD check set...\n';
|
||||||
|
const labels = satLabels();
|
||||||
|
const runNext = idx => {
|
||||||
|
if (idx >= targets.length) return Promise.resolve();
|
||||||
|
const t = targets[idx];
|
||||||
|
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[t] + '\n';
|
||||||
|
return enqueueSATTarget(t).then(d => streamSATTask(d.task_id, labels[t], false)).then(() => runNext(idx + 1));
|
||||||
|
};
|
||||||
|
return runNext(0);
|
||||||
|
}
|
||||||
|
function runAllCheckSAT() {
|
||||||
|
const status = document.getElementById('sat-all-status');
|
||||||
|
status.textContent = 'Enqueuing...';
|
||||||
|
const nvidiaIndices = satSelectedGPUIndices();
|
||||||
|
const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||||
|
const baseTargets = ['cpu', 'memory', 'storage'];
|
||||||
|
const amdTargets = selectedAMDValidateTargets();
|
||||||
|
const expanded = [];
|
||||||
|
baseTargets.forEach(t => expanded.push({target: t}));
|
||||||
|
if (nvidiaIndices.length) {
|
||||||
|
nvidiaAllTargets.forEach(t => {
|
||||||
|
const btn = document.getElementById('sat-btn-' + t);
|
||||||
|
if (!(btn && btn.disabled)) expanded.push({target: t, overrides: {gpu_indices: nvidiaIndices, display_name: satLabels()[t] || t}});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
amdTargets.forEach(t => expanded.push({target: t}));
|
||||||
|
if (!expanded.length) { status.textContent = 'No tasks selected.'; return; }
|
||||||
|
const total = expanded.length;
|
||||||
|
const runNext = idx => {
|
||||||
|
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||||
|
const item = expanded[idx];
|
||||||
|
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||||
|
return enqueueSATTarget(item.target, item.overrides).then(() => runNext(idx + 1));
|
||||||
|
};
|
||||||
|
runNext(0).catch(err => { status.textContent = 'Error: ' + err.message; });
|
||||||
|
}
|
||||||
|
function disableSATCard(id, reason) {
|
||||||
|
const btn = document.getElementById('sat-btn-' + id);
|
||||||
|
if (!btn) return;
|
||||||
|
btn.disabled = true; btn.title = reason; btn.style.opacity = '0.4';
|
||||||
|
const card = btn.closest('.card');
|
||||||
|
if (card) {
|
||||||
|
let note = card.querySelector('.sat-unavail');
|
||||||
|
if (!note) {
|
||||||
|
note = document.createElement('p');
|
||||||
|
note.className = 'sat-unavail';
|
||||||
|
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||||
|
const body = card.querySelector('.card-body');
|
||||||
|
if (body) body.insertBefore(note, body.firstChild);
|
||||||
|
}
|
||||||
|
note.textContent = reason;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fetch('/api/gpu/presence').then(r => r.json()).then(gp => {
|
||||||
|
if (!gp.nvidia) ['nvidia','nvidia-interconnect','nvidia-bandwidth'].forEach(t => disableSATCard(t, 'No NVIDIA GPU detected'));
|
||||||
|
if (!gp.amd) {
|
||||||
|
disableSATCard('amd', 'No AMD GPU detected');
|
||||||
|
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(id => {
|
||||||
|
const cb = document.getElementById(id);
|
||||||
|
if (cb) { cb.disabled = true; cb.checked = false; }
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
satGPULoadInit();
|
||||||
|
</script>`
|
||||||
|
}
|
||||||
|
|
||||||
func renderSATCard(id, label, runAction, headerActions, body string) string {
|
func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||||
actions := `<button id="sat-btn-` + id + `" class="btn btn-primary btn-sm" onclick="` + runAction + `">Run</button>`
|
actions := `<button id="sat-btn-` + id + `" class="btn btn-primary btn-sm" onclick="` + runAction + `">Run</button>`
|
||||||
if strings.TrimSpace(headerActions) != "" {
|
if strings.TrimSpace(headerActions) != "" {
|
||||||
|
|||||||
@@ -5,7 +5,9 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"html"
|
"html"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
@@ -22,41 +24,54 @@ func renderPage(page string, opts HandlerOptions) string {
|
|||||||
body = renderDashboard(opts)
|
body = renderDashboard(opts)
|
||||||
case "audit":
|
case "audit":
|
||||||
pageID = "audit"
|
pageID = "audit"
|
||||||
title = "Audit"
|
title = "1. Audit"
|
||||||
body = renderAudit()
|
body = renderAudit()
|
||||||
case "validate":
|
case "check":
|
||||||
pageID = "validate"
|
pageID = "check"
|
||||||
title = "Validate"
|
title = "2. Check"
|
||||||
body = renderValidate(opts)
|
body = renderCheck(opts)
|
||||||
|
case "load":
|
||||||
|
pageID = "load"
|
||||||
|
title = "3. Load"
|
||||||
|
body = renderValidateStress(opts)
|
||||||
case "burn":
|
case "burn":
|
||||||
pageID = "burn"
|
pageID = "burn"
|
||||||
title = "Burn"
|
title = "4. Burn"
|
||||||
body = renderBurn()
|
body = renderBurn()
|
||||||
case "benchmark":
|
case "benchmark":
|
||||||
pageID = "benchmark"
|
pageID = "benchmark"
|
||||||
title = "Benchmark"
|
title = "5. Benchmark"
|
||||||
|
body = renderBenchmark(opts)
|
||||||
|
case "tools":
|
||||||
|
pageID = "tools"
|
||||||
|
title = "Tools"
|
||||||
|
body = renderTools()
|
||||||
|
case "settings":
|
||||||
|
pageID = "settings"
|
||||||
|
title = "Settings"
|
||||||
|
body = renderSettings(opts)
|
||||||
|
// Legacy routes (redirected at HTTP level in handlePage; these are fallbacks)
|
||||||
|
case "validate", "tests":
|
||||||
|
pageID = "load"
|
||||||
|
title = "3. Load"
|
||||||
|
body = renderValidate(opts)
|
||||||
|
case "burn-in":
|
||||||
|
pageID = "burn"
|
||||||
|
title = "4. Burn"
|
||||||
|
body = renderBurn()
|
||||||
|
case "speed", "endurance":
|
||||||
|
pageID = "benchmark"
|
||||||
|
title = "5. Benchmark"
|
||||||
body = renderBenchmark(opts)
|
body = renderBenchmark(opts)
|
||||||
case "tasks":
|
case "tasks":
|
||||||
pageID = "tasks"
|
pageID = "tasks"
|
||||||
title = "Tasks"
|
title = "Tasks"
|
||||||
body = renderTasks()
|
body = renderTasks()
|
||||||
case "tools":
|
// Hidden pages (not in nav, accessible by direct URL)
|
||||||
pageID = "tools"
|
|
||||||
title = "Tools"
|
|
||||||
body = renderTools()
|
|
||||||
// Legacy routes kept accessible but not in nav
|
|
||||||
case "metrics":
|
case "metrics":
|
||||||
pageID = "metrics"
|
pageID = "metrics"
|
||||||
title = "Live Metrics"
|
title = "Live Metrics"
|
||||||
body = renderMetrics()
|
body = renderMetrics()
|
||||||
case "tests":
|
|
||||||
pageID = "validate"
|
|
||||||
title = "Acceptance Tests"
|
|
||||||
body = renderValidate(opts)
|
|
||||||
case "burn-in":
|
|
||||||
pageID = "burn"
|
|
||||||
title = "Burn-in Tests"
|
|
||||||
body = renderBurn()
|
|
||||||
case "network":
|
case "network":
|
||||||
pageID = "network"
|
pageID = "network"
|
||||||
title = "Network"
|
title = "Network"
|
||||||
@@ -85,6 +100,7 @@ func renderPage(page string, opts HandlerOptions) string {
|
|||||||
body +
|
body +
|
||||||
`</div></div>` +
|
`</div></div>` +
|
||||||
renderAuditModal() +
|
renderAuditModal() +
|
||||||
|
`<dialog id="component-detail-dialog" style="min-width:600px;max-width:900px;width:90vw;padding:0;border:1px solid var(--border);border-radius:8px;background:var(--surface)"><div id="component-detail-body" style="padding-bottom:20px"></div></dialog>` +
|
||||||
`<script>
|
`<script>
|
||||||
// Add copy button to every .terminal on the page
|
// Add copy button to every .terminal on the page
|
||||||
document.querySelectorAll('.terminal').forEach(function(t){
|
document.querySelectorAll('.terminal').forEach(function(t){
|
||||||
@@ -94,6 +110,17 @@ document.querySelectorAll('.terminal').forEach(function(t){
|
|||||||
btn.onclick=function(){navigator.clipboard.writeText(t.textContent).then(function(){btn.textContent='Copied!';setTimeout(function(){btn.textContent='Copy';},1500);});};
|
btn.onclick=function(){navigator.clipboard.writeText(t.textContent).then(function(){btn.textContent='Copied!';setTimeout(function(){btn.textContent='Copy';},1500);});};
|
||||||
w.appendChild(btn);
|
w.appendChild(btn);
|
||||||
});
|
});
|
||||||
|
function openComponentDetail(type) {
|
||||||
|
var dlg = document.getElementById('component-detail-dialog');
|
||||||
|
var body = document.getElementById('component-detail-body');
|
||||||
|
body.innerHTML = '<div style="padding:20px;color:var(--muted)">Loading…</div>';
|
||||||
|
dlg.showModal();
|
||||||
|
fetch('/api/components/' + type).then(function(r){ return r.text(); }).then(function(html){
|
||||||
|
body.innerHTML = html;
|
||||||
|
}).catch(function(){
|
||||||
|
body.innerHTML = '<div style="padding:20px;color:var(--crit-fg)">Error loading details.</div>';
|
||||||
|
});
|
||||||
|
}
|
||||||
</script>` +
|
</script>` +
|
||||||
`</body></html>`
|
`</body></html>`
|
||||||
}
|
}
|
||||||
@@ -106,6 +133,14 @@ func renderDashboard(opts HandlerOptions) string {
|
|||||||
b.WriteString(renderHardwareSummaryCard(opts))
|
b.WriteString(renderHardwareSummaryCard(opts))
|
||||||
b.WriteString(renderHealthCard(opts))
|
b.WriteString(renderHealthCard(opts))
|
||||||
b.WriteString(renderMetrics())
|
b.WriteString(renderMetrics())
|
||||||
|
b.WriteString(`<script>
|
||||||
|
setInterval(function(){
|
||||||
|
fetch('/api/hardware-summary').then(function(r){return r.text();}).then(function(html){
|
||||||
|
var el=document.getElementById('hw-summary-card');
|
||||||
|
if(el){el.outerHTML=html;}
|
||||||
|
}).catch(function(){});
|
||||||
|
},30000);
|
||||||
|
</script>`)
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -184,13 +219,14 @@ func renderAudit() string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||||
|
const cardID = ` id="hw-summary-card"`
|
||||||
data, err := loadSnapshot(opts.AuditPath)
|
data, err := loadSnapshot(opts.AuditPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
return `<div class="card"` + cardID + `><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
||||||
}
|
}
|
||||||
var ingest schema.HardwareIngestRequest
|
var ingest schema.HardwareIngestRequest
|
||||||
if err := json.Unmarshal(data, &ingest); err != nil {
|
if err := json.Unmarshal(data, &ingest); err != nil {
|
||||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
return `<div class="card"` + cardID + `><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||||
}
|
}
|
||||||
hw := ingest.Hardware
|
hw := ingest.Hardware
|
||||||
|
|
||||||
@@ -200,7 +236,7 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
b.WriteString(`<div class="card"` + cardID + `><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
||||||
|
|
||||||
// Server identity block above the component table.
|
// Server identity block above the component table.
|
||||||
{
|
{
|
||||||
@@ -229,22 +265,32 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
b.WriteString(`<table style="width:auto">`)
|
b.WriteString(`<table style="width:auto">`)
|
||||||
writeRow := func(label, value, badgeHTML string) {
|
// writeRow renders one component row. compType is the URL path segment for the detail
|
||||||
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
// endpoint (e.g. "cpu"). Pass "" for rows that have no detail view.
|
||||||
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
writeRow := func(label, value, badgeHTML, compType string) {
|
||||||
|
var labelHTML string
|
||||||
|
if compType != "" {
|
||||||
|
labelHTML = fmt.Sprintf(
|
||||||
|
`<span style="cursor:pointer;text-decoration:underline dotted;text-underline-offset:3px" onclick="openComponentDetail('%s')">%s</span>`,
|
||||||
|
compType, html.EscapeString(label))
|
||||||
|
} else {
|
||||||
|
labelHTML = html.EscapeString(label)
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b, `<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||||
|
labelHTML, html.EscapeString(value), badgeHTML)
|
||||||
}
|
}
|
||||||
|
|
||||||
writeRow("CPU", hwDescribeCPU(hw),
|
writeRow("CPU", hwDescribeCPU(hw),
|
||||||
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)))
|
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)), "cpu")
|
||||||
|
|
||||||
writeRow("Memory", hwDescribeMemory(hw),
|
writeRow("Memory", hwDescribeMemory(hw),
|
||||||
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})))
|
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})), "memory")
|
||||||
|
|
||||||
writeRow("Storage", hwDescribeStorage(hw),
|
writeRow("Storage", hwDescribeStorage(hw),
|
||||||
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})))
|
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})), "storage")
|
||||||
|
|
||||||
writeRow("GPU", hwDescribeGPU(hw),
|
writeRow("GPU", hwDescribeGPU(hw),
|
||||||
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})))
|
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})), "gpu")
|
||||||
|
|
||||||
psuMatched := matchedRecords(records, nil, []string{"psu:"})
|
psuMatched := matchedRecords(records, nil, []string{"psu:"})
|
||||||
if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
|
if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
|
||||||
@@ -252,10 +298,10 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
|||||||
psuStatus := hwPSUStatus(hw.PowerSupplies)
|
psuStatus := hwPSUStatus(hw.PowerSupplies)
|
||||||
psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
|
psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
|
||||||
}
|
}
|
||||||
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched))
|
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched), "psu")
|
||||||
|
|
||||||
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
||||||
writeRow("Network", nicDesc, "")
|
writeRow("Network", nicDesc, "", "")
|
||||||
}
|
}
|
||||||
|
|
||||||
b.WriteString(`</table>`)
|
b.WriteString(`</table>`)
|
||||||
@@ -614,7 +660,7 @@ func buildRuntimeNetworkRow(health schema.RuntimeHealth) runtimeHealthRow {
|
|||||||
if status == "" {
|
if status == "" {
|
||||||
status = "UNKNOWN"
|
status = "UNKNOWN"
|
||||||
}
|
}
|
||||||
issue := runtimeIssueDescriptions(health.Issues, "dhcp_partial", "dhcp_failed")
|
issue := runtimeIssueDescriptions(health.Issues, "dhcp_failed")
|
||||||
return runtimeHealthRow{Title: "Network", Status: status, Source: "ListInterfaces / DHCP", Issue: issue}
|
return runtimeHealthRow{Title: "Network", Status: status, Source: "ListInterfaces / DHCP", Issue: issue}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -672,12 +718,12 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
|
|||||||
nonActive := make([]string, 0)
|
nonActive := make([]string, 0)
|
||||||
for _, svc := range health.Services {
|
for _, svc := range health.Services {
|
||||||
state := strings.TrimSpace(strings.ToLower(svc.Status))
|
state := strings.TrimSpace(strings.ToLower(svc.Status))
|
||||||
// "activating" and "deactivating" are transient states for oneshot services
|
// "inactive" is OK for oneshot services that have completed successfully
|
||||||
// (RemainAfterExit=yes) — the service is running normally, not failed.
|
// (bee-sshsetup, bee-preflight, bee-audit, bee-network, etc.).
|
||||||
// Only "failed" and "inactive" (after services should be running) are problems.
|
// Only "failed" is a genuine problem.
|
||||||
switch state {
|
switch state {
|
||||||
case "active", "activating", "deactivating", "reloading":
|
case "active", "activating", "deactivating", "reloading", "inactive":
|
||||||
// OK — service is running or transitioning normally
|
// OK — service is running, transitioning normally, or completed successfully
|
||||||
default:
|
default:
|
||||||
nonActive = append(nonActive, svc.Name+"="+svc.Status)
|
nonActive = append(nonActive, svc.Name+"="+svc.Status)
|
||||||
}
|
}
|
||||||
@@ -999,3 +1045,200 @@ func rowIssueHTML(issue string) string {
|
|||||||
}
|
}
|
||||||
return html.EscapeString(issue)
|
return html.EscapeString(issue)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// aerStatusRe captures the hexadecimal value (up to 8 digits) that follows
// "aer_status:" in a kernel error detail string.
var aerStatusRe = regexp.MustCompile(`aer_status:\s*0x([0-9a-fA-F]{1,8})`)

// decodeAERStatus parses an AER status hex value from a kernel error detail string
// and returns a human-readable, comma-separated list of the set bit names followed
// by a " (correctable)" or " (uncorrectable)" label.
//
// It returns "" when detail contains no "aer_status: 0x..." token or the value
// cannot be parsed. When a value is present but none of its bits are named in
// either table, it returns "unknown bits: 0x%08x".
//
// The detail string is decoded against both the correctable and the
// uncorrectable bit tables; the table that names more set bits wins, and a tie
// (with at least one match) is reported as correctable.
func decodeAERStatus(detail string) string {
	m := aerStatusRe.FindStringSubmatch(detail)
	if m == nil {
		return ""
	}
	v64, err := strconv.ParseUint(m[1], 16, 32)
	if err != nil {
		return ""
	}
	val := uint32(v64)

	type bitDef struct {
		bit  uint32
		name string
	}
	// Bit positions of the PCIe AER Correctable Error Status register
	// (they match the PCI_ERR_COR_* masks in Linux pci_regs.h).
	corrBits := []bitDef{
		{0, "Receiver Error"}, {6, "Bad TLP"}, {7, "Bad DLLP"},
		{8, "Replay Num Rollover"}, {12, "Replay Timer Timeout"},
		{13, "Advisory Non-Fatal"}, {14, "Corrected Internal Error"}, {15, "Header Log Overflow"},
	}
	// Bit positions of the PCIe AER Uncorrectable Error Status register
	// (PCI_ERR_UNC_* masks).
	uncorrBits := []bitDef{
		{4, "Data Link Protocol Error"}, {5, "Surprise Down Error"},
		{12, "Poisoned TLP Received"}, {13, "Flow Control Protocol Error"},
		{14, "Completion Timeout"}, {15, "Completer Abort"}, {16, "Unexpected Completion"},
		{17, "Receiver Overflow"}, {18, "Malformed TLP"}, {19, "ECRC Error"},
		{20, "Unsupported Request Error"}, {21, "ACS Violation"}, {22, "Uncorrectable Internal Error"},
	}

	var corrNames, uncorrNames []string
	for _, b := range corrBits {
		if val&(1<<b.bit) != 0 {
			corrNames = append(corrNames, b.name)
		}
	}
	for _, b := range uncorrBits {
		if val&(1<<b.bit) != 0 {
			uncorrNames = append(uncorrNames, b.name)
		}
	}

	if len(corrNames) >= len(uncorrNames) && len(corrNames) > 0 {
		return strings.Join(corrNames, ", ") + " (correctable)"
	}
	if len(uncorrNames) > 0 {
		return strings.Join(uncorrNames, ", ") + " (uncorrectable)"
	}
	return fmt.Sprintf("unknown bits: 0x%08x", val)
}
|
||||||
|
|
||||||
|
// renderSparkline returns a small inline SVG showing non-OK events over time.
|
||||||
|
// Events are positioned proportionally along the time axis; if all share the same
|
||||||
|
// timestamp they are spaced evenly. Width is always 100px.
|
||||||
|
func renderSparkline(history []app.ComponentStatusEntry) string {
|
||||||
|
const (
|
||||||
|
svgW = 100
|
||||||
|
svgH = 20
|
||||||
|
barW = 3
|
||||||
|
barH = 14
|
||||||
|
)
|
||||||
|
var events []app.ComponentStatusEntry
|
||||||
|
for _, e := range history {
|
||||||
|
if e.Status != "OK" {
|
||||||
|
events = append(events, e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(events) == 0 {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
n := len(events)
|
||||||
|
barColor := func(status string) string {
|
||||||
|
if status == "Critical" {
|
||||||
|
return "#c0392b"
|
||||||
|
}
|
||||||
|
return "#d97706"
|
||||||
|
}
|
||||||
|
yTop := (svgH - barH) / 2
|
||||||
|
|
||||||
|
var bars strings.Builder
|
||||||
|
if n == 1 {
|
||||||
|
x := (svgW - barW) / 2
|
||||||
|
fmt.Fprintf(&bars, `<rect x="%d" y="%d" width="%d" height="%d" fill="%s" rx="1"/>`,
|
||||||
|
x, yTop, barW, barH, barColor(events[0].Status))
|
||||||
|
} else {
|
||||||
|
minT := events[0].At
|
||||||
|
maxT := events[n-1].At
|
||||||
|
dur := maxT.Sub(minT).Seconds()
|
||||||
|
for i, e := range events {
|
||||||
|
var x int
|
||||||
|
if dur <= 0 {
|
||||||
|
step := svgW / n
|
||||||
|
x = i*step + (step-barW)/2
|
||||||
|
} else {
|
||||||
|
frac := e.At.Sub(minT).Seconds() / dur
|
||||||
|
x = int(frac * float64(svgW-barW))
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&bars, `<rect x="%d" y="%d" width="%d" height="%d" fill="%s" rx="1"/>`,
|
||||||
|
x, yTop, barW, barH, barColor(e.Status))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fmt.Sprintf(
|
||||||
|
`<svg width="%d" height="%d" style="display:inline-block;vertical-align:middle;margin-left:6px;flex-shrink:0" xmlns="http://www.w3.org/2000/svg">`+
|
||||||
|
`<rect x="0" y="0" width="%d" height="%d" fill="var(--surface-alt,#ebebeb)" rx="3"/>%s</svg>`,
|
||||||
|
svgW, svgH, svgW, svgH, bars.String())
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderComponentDetail renders a modal content fragment for one component type.
|
||||||
|
// Called by handleAPIComponentDetail and displayed inside #component-detail-dialog.
|
||||||
|
func renderComponentDetail(title string, records []app.ComponentStatusRecord) string {
|
||||||
|
var b strings.Builder
|
||||||
|
fmt.Fprintf(&b, `<div style="padding:20px 24px 0">`)
|
||||||
|
fmt.Fprintf(&b, `<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:16px">`)
|
||||||
|
fmt.Fprintf(&b, `<span style="font-size:16px;font-weight:700">%s — Status Detail</span>`, html.EscapeString(title))
|
||||||
|
b.WriteString(`<button class="btn btn-sm btn-secondary" onclick="document.getElementById('component-detail-dialog').close()">Close</button>`)
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
|
||||||
|
if len(records) == 0 {
|
||||||
|
b.WriteString(`<p style="color:var(--muted)">No status data recorded yet for this component type.</p>`)
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.Slice(records, func(i, j int) bool {
|
||||||
|
return records[i].ComponentKey < records[j].ComponentKey
|
||||||
|
})
|
||||||
|
|
||||||
|
for _, rec := range records {
|
||||||
|
letter, cls := chipLetterClass(rec.Status)
|
||||||
|
|
||||||
|
// Count non-OK events across the full history for the badge + sparkline.
|
||||||
|
warnCount := 0
|
||||||
|
for _, e := range rec.History {
|
||||||
|
if e.Status != "OK" {
|
||||||
|
warnCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Fprintf(&b, `<div style="margin-bottom:20px">`)
|
||||||
|
fmt.Fprintf(&b, `<div style="display:flex;align-items:center;gap:8px;margin-bottom:8px;flex-wrap:wrap">`)
|
||||||
|
fmt.Fprintf(&b, `<span class="chip %s">%s</span>`, cls, letter)
|
||||||
|
fmt.Fprintf(&b, `<span style="font-weight:700;font-size:13px">%s</span>`, html.EscapeString(rec.ComponentKey))
|
||||||
|
if !rec.LastCheckedAt.IsZero() {
|
||||||
|
fmt.Fprintf(&b, `<span style="color:var(--muted);font-size:12px">checked %s</span>`, rec.LastCheckedAt.Format("2006-01-02 15:04:05"))
|
||||||
|
}
|
||||||
|
if warnCount > 0 {
|
||||||
|
noun := "events"
|
||||||
|
if warnCount == 1 {
|
||||||
|
noun = "event"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b,
|
||||||
|
`<span style="font-size:11px;background:var(--warn-bg,#fffbeb);color:var(--warn-fg,#92400e);border:1px solid var(--warn-border,#fde68a);border-radius:10px;padding:1px 7px;white-space:nowrap">%d %s</span>`,
|
||||||
|
warnCount, noun)
|
||||||
|
b.WriteString(renderSparkline(rec.History))
|
||||||
|
}
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
|
||||||
|
if rec.ErrorSummary != "" {
|
||||||
|
fmt.Fprintf(&b, `<div style="font-size:12px;margin-bottom:4px;color:var(--muted)">%s</div>`, html.EscapeString(rec.ErrorSummary))
|
||||||
|
if decoded := decodeAERStatus(rec.ErrorSummary); decoded != "" {
|
||||||
|
fmt.Fprintf(&b,
|
||||||
|
`<div style="font-size:12px;margin-bottom:8px;color:var(--muted)"><span style="background:var(--surface-alt,#f5f5f5);border-radius:4px;padding:1px 6px;font-family:monospace">AER: %s</span></div>`,
|
||||||
|
html.EscapeString(decoded))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// History table — newest first, cap at 20 entries.
|
||||||
|
history := rec.History
|
||||||
|
if len(history) > 20 {
|
||||||
|
history = history[len(history)-20:]
|
||||||
|
}
|
||||||
|
b.WriteString(`<table style="width:100%;font-size:12px;border-collapse:collapse">`)
|
||||||
|
b.WriteString(`<tr style="color:var(--muted)"><th style="text-align:left;padding:2px 10px 2px 0;white-space:nowrap">Time</th><th style="text-align:left;padding:2px 10px 2px 0">Status</th><th style="text-align:left;padding:2px 10px 2px 0">Source</th><th style="text-align:left;padding:2px 0">Detail</th></tr>`)
|
||||||
|
for i := len(history) - 1; i >= 0; i-- {
|
||||||
|
e := history[i]
|
||||||
|
eLetter, eCls := chipLetterClass(e.Status)
|
||||||
|
detail := e.Detail
|
||||||
|
if detail == "" {
|
||||||
|
detail = "—"
|
||||||
|
}
|
||||||
|
fmt.Fprintf(&b,
|
||||||
|
`<tr><td style="padding:3px 10px 3px 0;white-space:nowrap;color:var(--muted)">%s</td><td style="padding:3px 10px 3px 0"><span class="chip %s" style="font-size:10px;width:16px;height:16px">%s</span></td><td style="padding:3px 10px 3px 0;white-space:nowrap">%s</td><td style="padding:3px 0;color:var(--muted)">%s</td></tr>`,
|
||||||
|
html.EscapeString(e.At.Format("2006-01-02 15:04:05")),
|
||||||
|
eCls, eLetter,
|
||||||
|
html.EscapeString(e.Source),
|
||||||
|
html.EscapeString(detail),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
b.WriteString(`</table>`)
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
}
|
||||||
|
|
||||||
|
b.WriteString(`</div>`)
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|||||||
302
audit/internal/webui/saa_dmi.go
Normal file
302
audit/internal/webui/saa_dmi.go
Normal file
@@ -0,0 +1,302 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"net/http"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// dmiField is one DMI item parsed from DMI.txt and returned to the web UI.
// Name is the display name ("Section / Item" when a section header was seen),
// Shn is the short name found in braces, and Value is the parsed value with
// any trailing comment and surrounding double quotes removed.
type dmiField struct {
	Name  string `json:"name"`
	Shn   string `json:"shn"`
	Value string `json:"value"`
}

// saaChange is a single requested edit: the new Value for the item
// identified by Shn.
type saaChange struct {
	Shn   string `json:"shn"`
	Value string `json:"value"`
}

var (
	// shnRE validates short names supplied in write requests.
	shnRE = regexp.MustCompile(`^[A-Za-z0-9_]{1,16}$`)
	// dmiSectionRE matches a "[Section]" header line.
	dmiSectionRE = regexp.MustCompile(`^\[(.+?)\]$`)
	// Item Name {SHN} = value // comment
	dmiItemRE = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9]{1,16})\}\s*=\s*(.*)$`)
	// dmiVersionRE matches a bare "Version = ..." line (no {SHN}), which is skipped.
	dmiVersionRE = regexp.MustCompile(`(?i)^version\s*=`)
)

// stripDMIComment removes a trailing "// ..." comment from the value part of
// an item line. A comment is recognised when "//" is preceded by a space or a
// tab, or when the value consists of nothing but a comment (an empty value);
// in the latter case "" is returned. A "//" that is glued to preceding text
// (for example inside "http://x") is left alone.
func stripDMIComment(value string) string {
	if strings.HasPrefix(value, "//") {
		return ""
	}
	cut := strings.LastIndex(value, " //")
	if tab := strings.LastIndex(value, "\t//"); tab > cut {
		cut = tab
	}
	if cut < 0 {
		return value
	}
	return strings.TrimSpace(value[:cut])
}

// parseDMIFile parses the DMI.txt produced by "saa GetDmiInfo".
// Real format (from SAA User Guide 4.8.1):
//
//	[System]
//	Version {SYVS} = "A Version" // string value
//	Serial Number {SYSN} = $DEFAULT$ // string value
//	UUID {SYUU} = 00112233-... // hex value
//
// Blank lines, lines starting with "//" or "#", bare "Version = ..." lines and
// lines that do not match the item pattern are ignored. The returned slice is
// nil when no item line is found.
func parseDMIFile(content string) []dmiField {
	var fields []dmiField
	currentSection := ""
	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "//") || strings.HasPrefix(line, "#") {
			continue
		}
		if dmiVersionRE.MatchString(line) {
			continue
		}
		if m := dmiSectionRE.FindStringSubmatch(line); m != nil {
			currentSection = strings.TrimSpace(m[1])
			continue
		}
		m := dmiItemRE.FindStringSubmatch(line)
		if m == nil {
			continue
		}
		itemName := strings.TrimSpace(m[1])
		shn := m[2]

		// Drop the trailing comment first so the quote check below sees
		// only the value itself.
		rawValue := stripDMIComment(strings.TrimSpace(m[3]))
		// strip surrounding double quotes from string values
		if len(rawValue) >= 2 && rawValue[0] == '"' && rawValue[len(rawValue)-1] == '"' {
			rawValue = rawValue[1 : len(rawValue)-1]
		}

		displayName := itemName
		if currentSection != "" {
			displayName = currentSection + " / " + itemName
		}
		fields = append(fields, dmiField{Name: displayName, Shn: shn, Value: rawValue})
	}
	return fields
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPISAADMIRead(w http.ResponseWriter, r *http.Request) {
|
||||||
|
ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
tmpDir, err := os.MkdirTemp("", "bee-saa-*")
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, "create temp dir: "+err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
defer os.RemoveAll(tmpDir)
|
||||||
|
|
||||||
|
dmiFile := filepath.Join(tmpDir, "DMI.txt")
|
||||||
|
out, err := exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite").CombinedOutput()
|
||||||
|
if err != nil {
|
||||||
|
msg := strings.TrimSpace(string(out))
|
||||||
|
if msg == "" {
|
||||||
|
msg = err.Error()
|
||||||
|
}
|
||||||
|
writeError(w, http.StatusInternalServerError, "saa GetDmiInfo: "+msg)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
raw, err := os.ReadFile(dmiFile)
|
||||||
|
if err != nil {
|
||||||
|
writeError(w, http.StatusInternalServerError, "read DMI file: "+err.Error())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
fields := parseDMIFile(string(raw))
|
||||||
|
if len(fields) == 0 {
|
||||||
|
writeError(w, http.StatusInternalServerError, "no DMI fields found (file may be empty — reboot the server and try again)")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
writeJSON(w, fields)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) handleAPISAADMIWrite(w http.ResponseWriter, r *http.Request) {
|
||||||
|
var req struct {
|
||||||
|
Changes []saaChange `json:"changes"`
|
||||||
|
}
|
||||||
|
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||||
|
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(req.Changes) == 0 {
|
||||||
|
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, c := range req.Changes {
|
||||||
|
if !shnRE.MatchString(c.Shn) {
|
||||||
|
writeError(w, http.StatusUnprocessableEntity, "invalid shn: "+c.Shn)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if len(c.Value) == 0 || len(c.Value) > 64 {
|
||||||
|
writeError(w, http.StatusUnprocessableEntity, "value length out of range for shn: "+c.Shn)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
for _, ch := range c.Value {
|
||||||
|
if ch < 0x20 || ch > 0x7E {
|
||||||
|
writeError(w, http.StatusUnprocessableEntity, "value contains non-printable character for shn: "+c.Shn)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
t := &Task{
|
||||||
|
ID: newJobID("saa-dmi-write"),
|
||||||
|
Name: fmt.Sprintf("SAA DMI Write (%d field(s))", len(req.Changes)),
|
||||||
|
Target: "saa-dmi-write",
|
||||||
|
Priority: defaultTaskPriority("saa-dmi-write", taskParams{}),
|
||||||
|
Status: TaskPending,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
params: taskParams{
|
||||||
|
SAADmiChanges: req.Changes,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
globalQueue.enqueue(t)
|
||||||
|
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||||
|
}
|
||||||
|
|
||||||
|
func runSAADMIWriteTask(ctx context.Context, j *jobState, exportDir string, p taskParams) error {
|
||||||
|
tmpDir, err := os.MkdirTemp("", "bee-saa-*")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("create temp dir: %w", err)
|
||||||
|
}
|
||||||
|
defer os.RemoveAll(tmpDir)
|
||||||
|
dmiFile := filepath.Join(tmpDir, "DMI.txt")
|
||||||
|
|
||||||
|
j.append("Reading current DMI configuration...")
|
||||||
|
if err := streamCmdJob(j, exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite")); err != nil {
|
||||||
|
return fmt.Errorf("GetDmiInfo: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
backupDir := filepath.Join(exportDir, "dmi-backups")
|
||||||
|
if err := os.MkdirAll(backupDir, 0o755); err != nil {
|
||||||
|
return fmt.Errorf("create backup dir: %w", err)
|
||||||
|
}
|
||||||
|
backupName := "dmi-" + time.Now().UTC().Format("20060102-150405") + ".txt"
|
||||||
|
backupPath := filepath.Join(backupDir, backupName)
|
||||||
|
raw, err := os.ReadFile(dmiFile)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("read DMI file: %w", err)
|
||||||
|
}
|
||||||
|
if err := os.WriteFile(backupPath, raw, 0o644); err != nil {
|
||||||
|
return fmt.Errorf("write backup: %w", err)
|
||||||
|
}
|
||||||
|
j.append("Backup saved: dmi-backups/" + backupName)
|
||||||
|
|
||||||
|
for _, c := range p.SAADmiChanges {
|
||||||
|
j.append("Setting " + c.Shn + " = " + c.Value)
|
||||||
|
cmd := exec.CommandContext(ctx, "saa", "-c", "EditDmiInfo", "--file", dmiFile, "--shn", c.Shn, "--value", c.Value)
|
||||||
|
if err := streamCmdJob(j, cmd); err != nil {
|
||||||
|
return fmt.Errorf("EditDmiInfo %s: %w", c.Shn, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
j.append("Applying changes to hardware...")
|
||||||
|
if err := streamCmdJob(j, exec.CommandContext(ctx, "saa", "-c", "ChangeDmiInfo", "--file", dmiFile)); err != nil {
|
||||||
|
return fmt.Errorf("ChangeDmiInfo: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
j.append("Done. Reboot the server for changes to take effect.")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// renderSAADMICard returns the static HTML card (markup plus inline script)
// for reading and editing Supermicro DMI fields.
//
// The script's Read button fetches GET /api/tools/saa-dmi and renders one
// text input per field; Save collects the inputs whose value differs from the
// loaded one, POSTs them to /api/tools/saa-dmi/write and then polls
// /api/tasks every 1.5s until the returned task is done, failed or cancelled.
//
// saaDMIEsc must map the five HTML-special characters to entities: field
// names and values come from the DMI dump and are concatenated into innerHTML
// and attribute values, so an identity mapping would let them break the markup.
func renderSAADMICard() string {
	return `<div class="card"><div class="card-head">Supermicro — DMI <button class="btn btn-sm btn-secondary" onclick="saaDMIRead()" style="margin-left:auto">Read</button></div><div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits DMI fields via SAA (In-Band).</p>
<div id="saa-dmi-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
<div id="saa-dmi-table"></div>
<div id="saa-dmi-save-row" style="display:none;margin-top:12px">
<button class="btn btn-primary" id="saa-dmi-save-btn" onclick="saaDMISave()">Save</button>
<span id="saa-dmi-save-msg" style="font-size:13px;color:var(--muted);margin-left:10px"></span>
</div>
<script>
function saaDMIEsc(s) {
return String(s==null?'':s).replace(/[&<>"']/g,function(c){return{'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c];});
}
function saaDMIUpdateSaveBtn() {
var inputs = document.querySelectorAll('#saa-dmi-table input[data-original]');
var dirty = [];
inputs.forEach(function(inp){if(inp.value!==inp.dataset.original)dirty.push(inp);});
var row = document.getElementById('saa-dmi-save-row');
var btn = document.getElementById('saa-dmi-save-btn');
if(dirty.length>0){row.style.display='';btn.textContent='Save ('+dirty.length+' changed)';}
else{row.style.display='none';}
}
function saaDMIRead() {
var status = document.getElementById('saa-dmi-status');
var table = document.getElementById('saa-dmi-table');
var saveRow = document.getElementById('saa-dmi-save-row');
status.textContent = 'Reading...';
status.style.color = 'var(--muted)';
table.innerHTML = '';
saveRow.style.display = 'none';
fetch('/api/tools/saa-dmi').then(function(r){return r.json().then(function(d){if(!r.ok)throw new Error(d.error||('HTTP '+r.status));return d;});}).then(function(fields){
status.textContent = fields.length+' field(s) loaded.';
var rows = fields.map(function(f){
return '<tr>'
+'<td style="font-size:13px;white-space:nowrap;padding-right:8px">'+saaDMIEsc(f.name)+'</td>'
+'<td style="font-family:monospace;font-size:13px;white-space:nowrap;padding-right:8px">'+saaDMIEsc(f.shn)+'</td>'
+'<td><input type="text" value="'+saaDMIEsc(f.value)+'" data-shn="'+saaDMIEsc(f.shn)+'" data-original="'+saaDMIEsc(f.value)+'" oninput="saaDMIMarkDirty(this)" style="width:100%;font-family:monospace;font-size:13px;border:1px solid var(--line);padding:3px 6px;border-radius:3px"></td>'
+'<td id="saa-dmi-dirty-'+saaDMIEsc(f.shn)+'" style="font-size:12px;color:var(--warn,#b45309);width:50px;padding-left:6px"></td>'
+'</tr>';
}).join('');
table.innerHTML = '<table style="width:100%;border-collapse:collapse"><tr><th style="text-align:left;font-size:13px;padding-bottom:6px">Field</th><th style="text-align:left;font-size:13px;padding-bottom:6px">Shn</th><th style="text-align:left;font-size:13px;padding-bottom:6px">Value</th><th></th></tr>'+rows+'</table>';
}).catch(function(e){
status.textContent = 'Error: '+e.message;
status.style.color = 'var(--crit-fg,#9f3a38)';
});
}
function saaDMIMarkDirty(inp) {
var shn = inp.dataset.shn;
var cell = document.getElementById('saa-dmi-dirty-'+shn);
if(cell)cell.textContent = inp.value!==inp.dataset.original?'changed':'';
saaDMIUpdateSaveBtn();
}
function saaDMIWaitTask(taskID) {
var msg = document.getElementById('saa-dmi-save-msg');
msg.textContent = 'Task '+taskID+' queued...';
msg.style.color = 'var(--muted)';
var timer = setInterval(function(){
fetch('/api/tasks').then(function(r){return r.json();}).then(function(tasks){
var task = (tasks||[]).find(function(t){return t.id===taskID;});
if(!task)return;
if(task.status==='done'||task.status==='failed'||task.status==='cancelled'){
clearInterval(timer);
msg.textContent = task.status==='done'?'Saved. Reboot to apply.':'Failed: '+(task.error||task.status);
msg.style.color = task.status==='done'?'var(--ok,green)':'var(--crit-fg,#9f3a38)';
document.getElementById('saa-dmi-save-btn').disabled = false;
}
}).catch(function(){});
}, 1500);
}
function saaDMISave() {
var inputs = document.querySelectorAll('#saa-dmi-table input[data-original]');
var changes = [];
inputs.forEach(function(inp){if(inp.value!==inp.dataset.original)changes.push({shn:inp.dataset.shn,value:inp.value});});
if(!changes.length)return;
var names = changes.map(function(c){return c.shn;}).join(', ');
if(!window.confirm('Apply DMI changes for: '+names+'?\n\nThe server will need to be rebooted for changes to take effect.'))return;
var btn = document.getElementById('saa-dmi-save-btn');
var msg = document.getElementById('saa-dmi-save-msg');
btn.disabled = true;
msg.textContent = 'Submitting...';
msg.style.color = 'var(--muted)';
fetch('/api/tools/saa-dmi/write',{
method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({changes:changes})
}).then(function(r){return r.json().then(function(d){if(!r.ok)throw new Error(d.error||('HTTP '+r.status));return d;});}).then(function(d){
saaDMIWaitTask(d.task_id);
}).catch(function(e){
msg.textContent = 'Error: '+e.message;
msg.style.color = 'var(--crit-fg,#9f3a38)';
btn.disabled = false;
});
}
</script>
</div></div>`
}
|
||||||
@@ -221,6 +221,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
||||||
h.kmsg.start()
|
h.kmsg.start()
|
||||||
globalQueue.kmsgWatcher = h.kmsg
|
globalQueue.kmsgWatcher = h.kmsg
|
||||||
|
|
||||||
|
// Start periodic health poller for components that don't emit kernel log events (e.g. PSU).
|
||||||
|
if opts.App.StatusDB != nil {
|
||||||
|
newHealthPoller(opts.App.StatusDB).start()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
globalQueue.startWorker(&opts)
|
globalQueue.startWorker(&opts)
|
||||||
@@ -307,6 +312,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
|
|
||||||
// Tools
|
// Tools
|
||||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||||
|
mux.HandleFunc("GET /api/tools/nvme-formats", h.handleAPINVMeFormats)
|
||||||
|
mux.HandleFunc("POST /api/tools/nvme-format/run", h.handleAPINVMeFormatRun)
|
||||||
|
mux.HandleFunc("GET /api/tools/saa-dmi", h.handleAPISAADMIRead)
|
||||||
|
mux.HandleFunc("POST /api/tools/saa-dmi/write", h.handleAPISAADMIWrite)
|
||||||
|
mux.HandleFunc("GET /api/tools/ipmi-fru", h.handleAPIIPMIFRURead)
|
||||||
|
mux.HandleFunc("POST /api/tools/ipmi-fru/write", h.handleAPIIPMIFRUWrite)
|
||||||
|
|
||||||
// GPU presence / tools
|
// GPU presence / tools
|
||||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||||
@@ -326,6 +337,10 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
mux.HandleFunc("GET /api/install/disks", h.handleAPIInstallDisks)
|
mux.HandleFunc("GET /api/install/disks", h.handleAPIInstallDisks)
|
||||||
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
|
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
|
||||||
|
|
||||||
|
// Hardware component detail (fragment for modal in Hardware Summary card)
|
||||||
|
mux.HandleFunc("GET /api/hardware-summary", h.handleAPIHardwareSummary)
|
||||||
|
mux.HandleFunc("GET /api/components/{type}", h.handleAPIComponentDetail)
|
||||||
|
|
||||||
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
|
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
|
||||||
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
||||||
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
|
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
|
||||||
@@ -1292,8 +1307,8 @@ const loadingPageHTML = `<!DOCTYPE html>
|
|||||||
*{margin:0;padding:0;box-sizing:border-box}
|
*{margin:0;padding:0;box-sizing:border-box}
|
||||||
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
||||||
.wrap{text-align:center;width:420px}
|
.wrap{text-align:center;width:420px}
|
||||||
.logo{font-size:11px;line-height:1.4;color:#f6c90e;margin-bottom:6px;white-space:pre;text-align:left}
|
.brand{font-size:22px;letter-spacing:.18em;color:#f6c90e;margin-bottom:6px;text-align:left}
|
||||||
.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px;padding-left:2px}
|
.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px}
|
||||||
.spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
|
.spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
|
||||||
.spinner.hidden{display:none}
|
.spinner.hidden{display:none}
|
||||||
@keyframes spin{to{transform:rotate(360deg)}}
|
@keyframes spin{to{transform:rotate(360deg)}}
|
||||||
@@ -1311,12 +1326,7 @@ td:first-child{color:#718096;width:55%}
|
|||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div class="wrap">
|
<div class="wrap">
|
||||||
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
<div class="brand">EASY BEE</div>
|
||||||
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
|
||||||
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
|
||||||
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
|
||||||
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
|
||||||
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
|
||||||
<div class="subtitle">Hardware Audit LiveCD</div>
|
<div class="subtitle">Hardware Audit LiveCD</div>
|
||||||
<div class="spinner" id="spin"></div>
|
<div class="spinner" id="spin"></div>
|
||||||
<div class="status" id="st">Connecting to bee-web...</div>
|
<div class="status" id="st">Connecting to bee-web...</div>
|
||||||
@@ -1326,8 +1336,20 @@ td:first-child{color:#718096;width:55%}
|
|||||||
<script>
|
<script>
|
||||||
(function(){
|
(function(){
|
||||||
var gone = false;
|
var gone = false;
|
||||||
|
var pollStarted = false;
|
||||||
|
var fallbackOpenTimer = null;
|
||||||
|
var AUTO_OPEN_DELAY_MS = 15000;
|
||||||
function go(){ if(!gone){gone=true;window.location.replace('/');} }
|
function go(){ if(!gone){gone=true;window.location.replace('/');} }
|
||||||
|
|
||||||
|
function scheduleFallbackOpen(){
|
||||||
|
if(fallbackOpenTimer!==null) return;
|
||||||
|
fallbackOpenTimer=setTimeout(function(){
|
||||||
|
document.getElementById('spin').className='spinner hidden';
|
||||||
|
document.getElementById('st').textContent='Startup checks are taking too long — opening app...';
|
||||||
|
go();
|
||||||
|
},AUTO_OPEN_DELAY_MS);
|
||||||
|
}
|
||||||
|
|
||||||
function icon(s){
|
function icon(s){
|
||||||
if(s==='active') return '<span class="ok">● active</span>';
|
if(s==='active') return '<span class="ok">● active</span>';
|
||||||
if(s==='failed') return '<span class="fail">✕ failed</span>';
|
if(s==='failed') return '<span class="fail">✕ failed</span>';
|
||||||
@@ -1359,6 +1381,7 @@ function pollServices(){
|
|||||||
tbl.innerHTML=html;
|
tbl.innerHTML=html;
|
||||||
if(allSettled(svcs)){
|
if(allSettled(svcs)){
|
||||||
clearInterval(pollTimer);
|
clearInterval(pollTimer);
|
||||||
|
if(fallbackOpenTimer!==null) clearTimeout(fallbackOpenTimer);
|
||||||
document.getElementById('spin').className='spinner hidden';
|
document.getElementById('spin').className='spinner hidden';
|
||||||
document.getElementById('st').textContent='Ready \u2014 opening...';
|
document.getElementById('st').textContent='Ready \u2014 opening...';
|
||||||
setTimeout(go,800);
|
setTimeout(go,800);
|
||||||
@@ -1373,8 +1396,12 @@ function probe(){
|
|||||||
if(r.ok){
|
if(r.ok){
|
||||||
document.getElementById('st').textContent='bee-web running \u2014 checking services...';
|
document.getElementById('st').textContent='bee-web running \u2014 checking services...';
|
||||||
document.getElementById('btn').style.display='';
|
document.getElementById('btn').style.display='';
|
||||||
pollServices();
|
scheduleFallbackOpen();
|
||||||
pollTimer=setInterval(pollServices,1500);
|
if(!pollStarted){
|
||||||
|
pollStarted=true;
|
||||||
|
pollServices();
|
||||||
|
pollTimer=setInterval(pollServices,1500);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
|
document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
|
||||||
setTimeout(probe,500);
|
setTimeout(probe,500);
|
||||||
@@ -1396,14 +1423,17 @@ func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
|
|||||||
if page == "" {
|
if page == "" {
|
||||||
page = "dashboard"
|
page = "dashboard"
|
||||||
}
|
}
|
||||||
// Redirect old routes to new names
|
// Redirect legacy routes to new named pages
|
||||||
switch page {
|
switch page {
|
||||||
case "tests":
|
case "validate", "tests":
|
||||||
http.Redirect(w, r, "/validate", http.StatusMovedPermanently)
|
http.Redirect(w, r, "/load", http.StatusMovedPermanently)
|
||||||
return
|
return
|
||||||
case "burn-in":
|
case "burn-in":
|
||||||
http.Redirect(w, r, "/burn", http.StatusMovedPermanently)
|
http.Redirect(w, r, "/burn", http.StatusMovedPermanently)
|
||||||
return
|
return
|
||||||
|
case "speed", "endurance":
|
||||||
|
http.Redirect(w, r, "/benchmark", http.StatusMovedPermanently)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
body := renderPage(page, h.opts)
|
body := renderPage(page, h.opts)
|
||||||
w.Header().Set("Cache-Control", "no-store")
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
|
|||||||
@@ -604,6 +604,25 @@ func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestLoadingPageHasFallbackAutoOpen(t *testing.T) {
|
||||||
|
handler := NewHandler(HandlerOptions{})
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/loading", nil))
|
||||||
|
if rec.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||||
|
}
|
||||||
|
body := rec.Body.String()
|
||||||
|
for _, needle := range []string{
|
||||||
|
`var AUTO_OPEN_DELAY_MS = 15000;`,
|
||||||
|
`function scheduleFallbackOpen(){`,
|
||||||
|
`Startup checks are taking too long — opening app...`,
|
||||||
|
} {
|
||||||
|
if !strings.Contains(body, needle) {
|
||||||
|
t.Fatalf("loading page missing %q: %s", needle, body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
@@ -647,35 +666,51 @@ func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
|||||||
|
|
||||||
func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
// /tools: only NVMe Block Format and Supermicro DMI remain
|
||||||
if rec.Code != http.StatusOK {
|
recTools := httptest.NewRecorder()
|
||||||
t.Fatalf("status=%d", rec.Code)
|
handler.ServeHTTP(recTools, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||||
|
if recTools.Code != http.StatusOK {
|
||||||
|
t.Fatalf("tools status=%d", recTools.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
toolsBody := recTools.Body.String()
|
||||||
if !strings.Contains(body, `NVIDIA Self Heal`) {
|
if !strings.Contains(toolsBody, `NVMe Block Format`) {
|
||||||
t.Fatalf("tools page missing nvidia self heal section: %s", body)
|
t.Fatalf("tools page missing nvme block format section: %s", toolsBody)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
if !strings.Contains(toolsBody, `/api/tools/nvme-formats`) || !strings.Contains(toolsBody, `/api/tools/nvme-format/run`) {
|
||||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
t.Fatalf("tools page missing nvme format api usage: %s", toolsBody)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `nvidiaRestartDrivers()`) {
|
|
||||||
t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
|
// /settings: system install, support bundle, tool check, nvidia self heal, network, services
|
||||||
|
recSettings := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(recSettings, httptest.NewRequest(http.MethodGet, "/settings", nil))
|
||||||
|
if recSettings.Code != http.StatusOK {
|
||||||
|
t.Fatalf("settings status=%d", recSettings.Code)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `/api/gpu/nvidia-status`) {
|
settingsBody := recSettings.Body.String()
|
||||||
t.Fatalf("tools page missing nvidia status api usage: %s", body)
|
if !strings.Contains(settingsBody, `NVIDIA Self Heal`) {
|
||||||
|
t.Fatalf("settings page missing nvidia self heal section: %s", settingsBody)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `nvidiaResetGPU(`) {
|
if !strings.Contains(settingsBody, `Restart GPU Drivers`) {
|
||||||
t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
|
t.Fatalf("settings page missing restart gpu drivers button: %s", settingsBody)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
if !strings.Contains(settingsBody, `nvidiaRestartDrivers()`) {
|
||||||
t.Fatalf("tools page missing boot source field: %s", body)
|
t.Fatalf("settings page missing nvidiaRestartDrivers action: %s", settingsBody)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `USB Black-Box`) {
|
if !strings.Contains(settingsBody, `/api/gpu/nvidia-status`) {
|
||||||
t.Fatalf("tools page missing usb black-box section: %s", body)
|
t.Fatalf("settings page missing nvidia status api usage: %s", settingsBody)
|
||||||
}
|
}
|
||||||
if !strings.Contains(body, `/api/blackbox/status`) {
|
if !strings.Contains(settingsBody, `nvidiaResetGPU(`) {
|
||||||
t.Fatalf("tools page missing black-box status api usage: %s", body)
|
t.Fatalf("settings page missing nvidiaResetGPU action: %s", settingsBody)
|
||||||
|
}
|
||||||
|
if !strings.Contains(settingsBody, `id="boot-source-text"`) {
|
||||||
|
t.Fatalf("settings page missing boot source field: %s", settingsBody)
|
||||||
|
}
|
||||||
|
if !strings.Contains(settingsBody, `USB Black-Box`) {
|
||||||
|
t.Fatalf("settings page missing usb black-box section: %s", settingsBody)
|
||||||
|
}
|
||||||
|
if !strings.Contains(settingsBody, `/api/blackbox/status`) {
|
||||||
|
t.Fatalf("settings page missing black-box status api usage: %s", settingsBody)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -766,46 +801,45 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
func TestCheckPageRendersGPUSelectionAndNvidiaCards(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil))
|
||||||
if rec.Code != http.StatusOK {
|
if rec.Code != http.StatusOK {
|
||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
`NVIDIA GPU Targeted Stress`,
|
|
||||||
`nvidia-targeted-stress`,
|
|
||||||
`controlled NVIDIA DCGM load`,
|
|
||||||
`<code>dcgmi diag targeted_stress</code>`,
|
|
||||||
`NVIDIA GPU Selection`,
|
`NVIDIA GPU Selection`,
|
||||||
`All NVIDIA validate tasks use only the GPUs selected here.`,
|
|
||||||
`Select All`,
|
|
||||||
`id="sat-gpu-list"`,
|
`id="sat-gpu-list"`,
|
||||||
|
`Select All`,
|
||||||
|
`id="sat-btn-nvidia"`,
|
||||||
|
`NVIDIA Interconnect (NCCL)`,
|
||||||
|
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||||
|
`Non-destructive`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
t.Fatalf("check page missing %q: %s", needle, body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
func TestCheckPageRendersNvidiaFabricCards(t *testing.T) {
|
||||||
handler := NewHandler(HandlerOptions{})
|
handler := NewHandler(HandlerOptions{})
|
||||||
rec := httptest.NewRecorder()
|
rec := httptest.NewRecorder()
|
||||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil))
|
||||||
if rec.Code != http.StatusOK {
|
if rec.Code != http.StatusOK {
|
||||||
t.Fatalf("status=%d", rec.Code)
|
t.Fatalf("status=%d", rec.Code)
|
||||||
}
|
}
|
||||||
body := rec.Body.String()
|
body := rec.Body.String()
|
||||||
for _, needle := range []string{
|
for _, needle := range []string{
|
||||||
`NVIDIA Interconnect (NCCL)`,
|
`NVIDIA Interconnect (NCCL)`,
|
||||||
`Validate and Stress:`,
|
|
||||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||||
`nvbandwidth runs all built-in tests without a time limit`,
|
`nvbandwidth`,
|
||||||
|
`all_reduce_perf`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
t.Fatalf("check page missing %q: %s", needle, body)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -822,7 +856,6 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
|||||||
`NVIDIA Max Compute Load`,
|
`NVIDIA Max Compute Load`,
|
||||||
`dcgmproftester`,
|
`dcgmproftester`,
|
||||||
`NCCL`,
|
`NCCL`,
|
||||||
`Validate → Stress mode`,
|
|
||||||
`id="burn-gpu-list"`,
|
`id="burn-gpu-list"`,
|
||||||
} {
|
} {
|
||||||
if !strings.Contains(body, needle) {
|
if !strings.Contains(body, needle) {
|
||||||
|
|||||||
@@ -376,6 +376,24 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
|
|||||||
break
|
break
|
||||||
}
|
}
|
||||||
err = a.RunInstallToRAM(ctx, j.append)
|
err = a.RunInstallToRAM(ctx, j.append)
|
||||||
|
case "nvme-format":
|
||||||
|
if strings.TrimSpace(t.params.Device) == "" {
|
||||||
|
err = fmt.Errorf("device is required")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
err = runNVMeFormatTask(ctx, j, t.params.Device, t.params.LBAF)
|
||||||
|
case "saa-dmi-write":
|
||||||
|
if len(t.params.SAADmiChanges) == 0 {
|
||||||
|
err = fmt.Errorf("no changes provided")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
err = runSAADMIWriteTask(ctx, j, opts.ExportDir, t.params)
|
||||||
|
case "ipmi-fru-write":
|
||||||
|
if len(t.params.FRUChanges) == 0 {
|
||||||
|
err = fmt.Errorf("no changes provided")
|
||||||
|
break
|
||||||
|
}
|
||||||
|
err = runIPMIFRUWriteTask(ctx, j, opts.ExportDir, t.params)
|
||||||
default:
|
default:
|
||||||
j.append("ERROR: unknown target: " + t.Target)
|
j.append("ERROR: unknown target: " + t.Target)
|
||||||
j.finish("unknown target")
|
j.finish("unknown target")
|
||||||
|
|||||||
@@ -57,6 +57,7 @@ var taskNames = map[string]string{
|
|||||||
"support-bundle": "Support Bundle",
|
"support-bundle": "Support Bundle",
|
||||||
"install": "Install to Disk",
|
"install": "Install to Disk",
|
||||||
"install-to-ram": "Install to RAM",
|
"install-to-ram": "Install to RAM",
|
||||||
|
"nvme-format": "NVMe Block Format Change",
|
||||||
}
|
}
|
||||||
|
|
||||||
// burnNames maps target → human-readable name when a burn profile is set.
|
// burnNames maps target → human-readable name when a burn profile is set.
|
||||||
@@ -136,8 +137,11 @@ type taskParams struct {
|
|||||||
RampTotal int `json:"ramp_total,omitempty"`
|
RampTotal int `json:"ramp_total,omitempty"`
|
||||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||||
DisplayName string `json:"display_name,omitempty"`
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
Device string `json:"device,omitempty"` // for install
|
Device string `json:"device,omitempty"` // for install
|
||||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
LBAF int `json:"lbaf,omitempty"`
|
||||||
|
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||||
|
SAADmiChanges []saaChange `json:"saa_dmi_changes,omitempty"`
|
||||||
|
FRUChanges []fruChange `json:"fru_changes,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type persistedTask struct {
|
type persistedTask struct {
|
||||||
@@ -598,6 +602,17 @@ func (q *taskQueue) startRecoveredTaskMonitorLocked(t *Task, j *jobState) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
|
func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
|
||||||
|
startedKmsgWatch := false
|
||||||
|
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||||
|
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||||
|
startedKmsgWatch = true
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if startedKmsgWatch && q.kmsgWatcher != nil {
|
||||||
|
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
stopTail := make(chan struct{})
|
stopTail := make(chan struct{})
|
||||||
doneTail := make(chan struct{})
|
doneTail := make(chan struct{})
|
||||||
defer func() {
|
defer func() {
|
||||||
|
|||||||
@@ -126,6 +126,23 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestJobAppendFlushesTaskLogImmediately(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
path := filepath.Join(dir, "task.log")
|
||||||
|
j := newTaskJobState(path)
|
||||||
|
|
||||||
|
j.append("live-line")
|
||||||
|
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if string(data) != "live-line\n" {
|
||||||
|
t.Fatalf("log=%q want live-line newline", string(data))
|
||||||
|
}
|
||||||
|
j.closeLog()
|
||||||
|
}
|
||||||
|
|
||||||
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||||
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
||||||
q := &taskQueue{
|
q := &taskQueue{
|
||||||
@@ -849,3 +866,82 @@ func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
|
|||||||
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRunTaskExternalOpensAndClosesKmsgWindow(t *testing.T) {
|
||||||
|
dir := t.TempDir()
|
||||||
|
releasePath := filepath.Join(dir, "release")
|
||||||
|
readyPath := filepath.Join(dir, "ready")
|
||||||
|
q := &taskQueue{
|
||||||
|
opts: &HandlerOptions{ExportDir: dir},
|
||||||
|
logsDir: filepath.Join(dir, "tasks"),
|
||||||
|
kmsgWatcher: newKmsgWatcher(nil),
|
||||||
|
trigger: make(chan struct{}, 1),
|
||||||
|
}
|
||||||
|
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
tk := &Task{
|
||||||
|
ID: "cpu-external-1",
|
||||||
|
Name: "CPU SAT",
|
||||||
|
Target: "cpu",
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: time.Now(),
|
||||||
|
}
|
||||||
|
q.assignTaskLogPathLocked(tk)
|
||||||
|
j := newTaskJobState(tk.LogPath)
|
||||||
|
|
||||||
|
orig := externalTaskRunnerCommand
|
||||||
|
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
|
||||||
|
script := "printf ready > \"$1\"; while [ ! -f \"$2\" ]; do sleep 0.05; done"
|
||||||
|
return exec.Command("sh", "-c", script, "sh", readyPath, releasePath), nil
|
||||||
|
}
|
||||||
|
defer func() { externalTaskRunnerCommand = orig }()
|
||||||
|
|
||||||
|
done := make(chan struct{})
|
||||||
|
go func() {
|
||||||
|
q.runTaskExternal(tk, j)
|
||||||
|
close(done)
|
||||||
|
}()
|
||||||
|
|
||||||
|
deadline := time.Now().Add(2 * time.Second)
|
||||||
|
for time.Now().Before(deadline) {
|
||||||
|
if _, err := os.Stat(readyPath); err == nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
time.Sleep(20 * time.Millisecond)
|
||||||
|
}
|
||||||
|
if _, err := os.Stat(readyPath); err != nil {
|
||||||
|
t.Fatalf("external runner did not start: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
q.kmsgWatcher.mu.Lock()
|
||||||
|
activeCount := q.kmsgWatcher.activeCount
|
||||||
|
window := q.kmsgWatcher.window
|
||||||
|
q.kmsgWatcher.mu.Unlock()
|
||||||
|
if activeCount != 1 {
|
||||||
|
t.Fatalf("activeCount while running=%d want 1", activeCount)
|
||||||
|
}
|
||||||
|
if window == nil || len(window.targets) != 1 || window.targets[0] != "cpu" {
|
||||||
|
t.Fatalf("window while running=%+v", window)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := os.WriteFile(releasePath, []byte("1\n"), 0644); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-done:
|
||||||
|
case <-time.After(2 * time.Second):
|
||||||
|
t.Fatal("runTaskExternal did not return")
|
||||||
|
}
|
||||||
|
|
||||||
|
q.kmsgWatcher.mu.Lock()
|
||||||
|
activeCount = q.kmsgWatcher.activeCount
|
||||||
|
window = q.kmsgWatcher.window
|
||||||
|
q.kmsgWatcher.mu.Unlock()
|
||||||
|
if activeCount != 0 {
|
||||||
|
t.Fatalf("activeCount after finish=%d want 0", activeCount)
|
||||||
|
}
|
||||||
|
if window != nil {
|
||||||
|
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
2
bible
2
bible
Submodule bible updated: d2600f1279...1977730d93
185
bible-local/architecture/api-surface.md
Normal file
185
bible-local/architecture/api-surface.md
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
# API Surface
|
||||||
|
|
||||||
|
HTTP endpoints exposed by `bee web` (binds `0.0.0.0:80`).
|
||||||
|
Handler registration: `audit/internal/webui/server.go` → `NewHandler()`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Health & readiness
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|----------------|-----------------------------------------------------|
|
||||||
|
| GET | `/healthz` | Always 200. Used by load balancers / boot scripts. |
|
||||||
|
| GET | `/api/ready` | 200 when audit JSON exists and is readable. |
|
||||||
|
| GET | `/loading` | HTML loading page shown before first audit. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Audit
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|-----------------------|--------------------------------------------------------------|
|
||||||
|
| GET | `/audit.json` | Latest audit JSON with SAT overlay applied. |
|
||||||
|
| GET | `/runtime-health.json`| Latest runtime preflight JSON. |
|
||||||
|
| POST | `/api/audit/run` | Enqueue a full `bee audit` run. Returns task ID. |
|
||||||
|
| GET | `/api/audit/stream` | SSE: audit run log lines (`data:` + newline per line). |
|
||||||
|
| GET | `/api/preflight` | Run runtime preflight check (synchronous, returns JSON). |
|
||||||
|
| GET | `/api/hardware-summary` | Hardware health summary (status counts + failures). |
|
||||||
|
| GET | `/api/components/{type}` | HTML fragment for component detail dialog (e.g. `cpu`, `memory`, `storage`, `pcie`). |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## SAT (System Acceptance Testing)
|
||||||
|
|
||||||
|
All SAT run endpoints enqueue an async task. Response: `{"task_id": "..."}`.
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|--------------------------------------------|-----------------------------------|
|
||||||
|
| POST | `/api/sat/nvidia/run` | NVIDIA DCGM SAT |
|
||||||
|
| POST | `/api/sat/nvidia-targeted-stress/run` | NVIDIA targeted stress validate |
|
||||||
|
| POST | `/api/sat/nvidia-compute/run` | NVIDIA max compute load |
|
||||||
|
| POST | `/api/sat/nvidia-targeted-power/run` | NVIDIA targeted power |
|
||||||
|
| POST | `/api/sat/nvidia-pulse/run` | NVIDIA pulse test |
|
||||||
|
| POST | `/api/sat/nvidia-interconnect/run` | NCCL all_reduce_perf |
|
||||||
|
| POST | `/api/sat/nvidia-bandwidth/run` | NVBandwidth test |
|
||||||
|
| POST | `/api/sat/nvidia-stress/run` | NVIDIA stress pack |
|
||||||
|
| POST | `/api/sat/memory/run` | Memory acceptance |
|
||||||
|
| POST | `/api/sat/storage/run` | Storage acceptance (smartctl) |
|
||||||
|
| POST | `/api/sat/cpu/run` | CPU acceptance (stress-ng) |
|
||||||
|
| POST | `/api/sat/amd/run` | AMD GPU SAT (ROCm) |
|
||||||
|
| POST | `/api/sat/amd-mem/run` | AMD memory integrity + bandwidth |
|
||||||
|
| POST | `/api/sat/amd-bandwidth/run` | AMD memory bandwidth |
|
||||||
|
| POST | `/api/sat/amd-stress/run` | AMD GPU stress |
|
||||||
|
| POST | `/api/sat/memory-stress/run` | Memory stress |
|
||||||
|
| POST | `/api/sat/sat-stress/run` | Combined storage+memory stress |
|
||||||
|
| POST | `/api/sat/platform-stress/run` | Fan + thermal stress |
|
||||||
|
| GET | `/api/sat/stream` | SSE: live SAT log stream |
|
||||||
|
| POST | `/api/sat/abort` | Abort the running SAT task |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Benchmarks
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|-----------------------------------------|----------------------------------------------|
|
||||||
|
| POST | `/api/bee-bench/nvidia/perf/run` | NVIDIA performance benchmark |
|
||||||
|
| POST | `/api/bee-bench/nvidia/power/run` | NVIDIA power benchmark |
|
||||||
|
| POST | `/api/bee-bench/nvidia/autotune/run` | Power source autotune (prerequisite for benchmarks) |
|
||||||
|
| GET | `/api/bee-bench/nvidia/autotune/status` | Current autotune result / status |
|
||||||
|
| GET | `/api/benchmark/results` | List completed benchmark result archives |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tasks (async job queue)
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|-----------------------------|----------------------------------------------------|
|
||||||
|
| GET | `/api/tasks` | List all tasks with status |
|
||||||
|
| POST | `/api/tasks/cancel-all` | Cancel all pending/running tasks |
|
||||||
|
| POST | `/api/tasks/kill-workers` | Force-kill worker goroutines |
|
||||||
|
| POST | `/api/tasks/{id}/cancel` | Cancel a specific task |
|
||||||
|
| POST | `/api/tasks/{id}/priority` | Elevate task priority |
|
||||||
|
| GET | `/api/tasks/{id}/stream` | SSE: live log stream for a task |
|
||||||
|
| GET | `/api/tasks/{id}/charts` | List chart names for a task |
|
||||||
|
| GET | `/api/tasks/{id}/chart/` | SVG chart for a task result |
|
||||||
|
| GET | `/tasks/{id}` | HTML task detail page |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Services
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|---------------------------|--------------------------------------------------|
|
||||||
|
| GET | `/api/services` | List bee-* systemd services and their states |
|
||||||
|
| POST | `/api/services/action` | start/stop/restart a service |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Network
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|----------------------------|-----------------------------------------------------|
|
||||||
|
| GET | `/api/network` | List interfaces with state and IPv4 addresses |
|
||||||
|
| POST | `/api/network/dhcp` | Run dhclient on one or all interfaces |
|
||||||
|
| POST | `/api/network/static` | Set static IPv4 address |
|
||||||
|
| POST | `/api/network/toggle` | Bring interface up or down |
|
||||||
|
| POST | `/api/network/confirm` | Confirm pending network change (clears rollback) |
|
||||||
|
| POST | `/api/network/rollback` | Restore pre-change network snapshot |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Export
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|-------------------------------|---------------------------------------------------|
|
||||||
|
| GET | `/export/support.tar.gz` | Download support bundle (live-generated) |
|
||||||
|
| GET | `/export/file` | Download a file from the export dir by path param |
|
||||||
|
| GET | `/export/` | Browse export dir (HTML index) |
|
||||||
|
| GET | `/api/export/list` | JSON list of files in export dir |
|
||||||
|
| GET | `/api/export/usb` | List removable USB targets available for export |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## GPU
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|----------------------------|----------------------------------------------------|
|
||||||
|
| GET | `/api/gpu/presence` | `{"nvidia": bool, "amd": bool}` |
|
||||||
|
| GET | `/api/gpu/nvidia` | List NVIDIA GPUs from nvidia-smi |
|
||||||
|
| GET | `/api/gpu/nvidia-status` | Per-GPU status (ECC, power, throttle) |
|
||||||
|
| POST | `/api/gpu/nvidia-reset` | GPU reset by index |
|
||||||
|
| GET | `/api/gpu/tools` | nvidia-smi / rocm-smi tool availability |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## System
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|------------------------------|---------------------------------------------------|
|
||||||
|
| GET | `/api/system/ram-status` | toram boot state and ISO copy status |
|
||||||
|
| POST | `/api/system/install-to-ram` | Copy ISO to RAM (background task) |
|
||||||
|
| GET | `/api/install/disks` | List block devices suitable for disk installation |
|
||||||
|
| POST | `/api/install/run` | Install bee to disk (background task) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tools & NVMe
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|-------------------------------|--------------------------------------------------|
|
||||||
|
| GET | `/api/tools/check` | Check availability of required CLI tools |
|
||||||
|
| GET | `/api/tools/nvme-formats` | List NVMe format options for a device |
|
||||||
|
| POST | `/api/tools/nvme-format/run` | Run nvme-format on a device |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Live metrics
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|------------------------------|---------------------------------------------------|
|
||||||
|
| GET | `/api/metrics/stream` | SSE: live metrics (GPU power, temp, utilization) |
|
||||||
|
| GET | `/api/metrics/latest` | Latest metrics snapshot (JSON) |
|
||||||
|
| GET | `/api/metrics/chart/` | SVG chart for a metric over time |
|
||||||
|
| GET | `/api/metrics/export.csv` | Download metrics history as CSV |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Blackbox logging
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|----------------------------|-----------------------------------------------|
|
||||||
|
| GET | `/api/blackbox/status` | Blackbox log state (enabled, size, path) |
|
||||||
|
| POST | `/api/blackbox/enable` | Start recording blackbox log |
|
||||||
|
| POST | `/api/blackbox/disable` | Stop recording, flush to disk |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## UI pages
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|------------|-----------------------------------------------|
|
||||||
|
| GET | `/` | Main dashboard (serves all page routes) |
|
||||||
|
| GET | `/viewer` | Standalone JSON viewer for uploaded audit files |
|
||||||
|
|
||||||
|
All pages are rendered server-side as HTML. The `/` route handles sub-paths such as
|
||||||
|
`/network`, `/services`, `/sat`, `/benchmark`, `/install`, `/validate`, `/export`.
|
||||||
137
bible-local/architecture/data-model.md
Normal file
137
bible-local/architecture/data-model.md
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
# Data Model
|
||||||
|
|
||||||
|
The canonical output of `bee audit` is a `HardwareIngestRequest` JSON document accepted
|
||||||
|
by the Reanimator `/api/ingest/hardware` endpoint. The ingest endpoint uses a strict
|
||||||
|
decoder — unknown fields cause `400 Bad Request`.
|
||||||
|
|
||||||
|
Source of truth: `audit/internal/schema/hardware.go`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Top-level: HardwareIngestRequest
|
||||||
|
|
||||||
|
```
|
||||||
|
HardwareIngestRequest
|
||||||
|
├── collected_at string RFC3339 UTC timestamp of collection
|
||||||
|
├── hardware HardwareSnapshot
|
||||||
|
├── runtime RuntimeHealth? from bee-runtime-preflight service
|
||||||
|
├── filename string?
|
||||||
|
├── source_type string?
|
||||||
|
├── protocol string?
|
||||||
|
└── target_host string?
|
||||||
|
```
|
||||||
|
|
||||||
|
`collected_at` is the primary sort key used by Reanimator to deduplicate ingests.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## HardwareSnapshot
|
||||||
|
|
||||||
|
All component arrays are `omitempty` — absent when the collector finds nothing.
|
||||||
|
|
||||||
|
| JSON key | Go type | Source |
|
||||||
|
|-------------------|----------------------------|------------------------------|
|
||||||
|
| `board` | HardwareBoard | dmidecode type 1/2 |
|
||||||
|
| `firmware` | []HardwareFirmwareRecord | dmidecode type 0/13 |
|
||||||
|
| `cpus` | []HardwareCPU | dmidecode type 4 |
|
||||||
|
| `memory` | []HardwareMemory | dmidecode type 17 |
|
||||||
|
| `storage` | []HardwareStorage | lsblk + nvme-cli + smartctl |
|
||||||
|
| `pcie_devices` | []HardwarePCIeDevice | lspci |
|
||||||
|
| `power_supplies` | []HardwarePowerSupply | ipmitool fru + sdr |
|
||||||
|
| `sensors` | *HardwareSensors | sensors -j |
|
||||||
|
| `event_logs` | []HardwareEventLog | ipmitool sel + journald |
|
||||||
|
| `platform_config` | *json.RawMessage | reserved, nil until used |
|
||||||
|
| `vroc_license` | *string | vroc-cli |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Identity keys
|
||||||
|
|
||||||
|
Reanimator uses these fields to match components across successive audits:
|
||||||
|
|
||||||
|
| Component | Identity key |
|
||||||
|
|----------------|------------------------------------------------|
|
||||||
|
| Board | `board.serial_number` (required, never empty) |
|
||||||
|
| CPU | `serial_number` if present; else generated key |
|
||||||
|
| Memory DIMM | `serial_number` — absent DIMMs have `present: false` |
|
||||||
|
| Storage | `serial_number` if present; else `linux_device` from Telemetry |
|
||||||
|
| PCIe device | `bdf` (Bus:Device.Function address) |
|
||||||
|
| PSU | `slot` |
|
||||||
|
|
||||||
|
Components without a stable identity are still emitted but may not be matched across runs.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## HardwareComponentStatus (embedded in all components)
|
||||||
|
|
||||||
|
```go
|
||||||
|
type HardwareComponentStatus struct {
|
||||||
|
Status *string `json:"status,omitempty"` // OK | Warning | Critical | Unknown
|
||||||
|
ErrorDescription *string `json:"error_description,omitempty"`
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Status is set by collectors and overwritten at render time by `ApplySATOverlay`
|
||||||
|
(latest SAT run results are always merged on top before display).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## HardwarePCIeDevice
|
||||||
|
|
||||||
|
The most enriched component type. Key fields:
|
||||||
|
|
||||||
|
| JSON key | Meaning |
|
||||||
|
|----------------------|------------------------------------------------|
|
||||||
|
| `bdf` | PCI address (identity key), e.g. `0000:4b:00.0` |
|
||||||
|
| `vendor_id` | Numeric PCI vendor ID (hex). Use this for classification — not `manufacturer`. |
|
||||||
|
| `device_id` | Numeric PCI device ID (hex) |
|
||||||
|
| `device_class` | Human-readable class, e.g. `VideoController` |
|
||||||
|
| `manufacturer` | String label from lspci — for display only |
|
||||||
|
| `model` | From nvidia-smi / rocm-smi — display name |
|
||||||
|
| `link_speed` | Current PCIe link speed, e.g. `Gen4` |
|
||||||
|
| `max_link_speed` | Max negotiated speed |
|
||||||
|
| `link_width` | Current lane count |
|
||||||
|
| `max_link_width` | Max lane count |
|
||||||
|
| `temperature_c` | From nvidia-smi / rocm-smi |
|
||||||
|
| `power_w` | Current power draw |
|
||||||
|
| `ecc_uncorrected_total` | Cumulative ECC uncorrected errors (NVIDIA) |
|
||||||
|
| `ecc_corrected_total` | Cumulative ECC corrected errors (NVIDIA) |
|
||||||
|
| `hw_slowdown` | HW throttle active (NVIDIA) |
|
||||||
|
| `telemetry` | Free-form map for vendor-specific extras |
|
||||||
|
|
||||||
|
**Classification rule**: use `vendor_id` (numeric PCI ID), never `manufacturer` string.
|
||||||
|
|
||||||
|
| Vendor | vendor_id |
|
||||||
|
|-----------|-----------|
|
||||||
|
| NVIDIA | `0x10de` |
|
||||||
|
| AMD | `0x1002` |
|
||||||
|
| Mellanox | `0x15b3` |
|
||||||
|
| Aspeed | `0x1a03` |
|
||||||
|
| Intel | `0x8086` |
|
||||||
|
|
||||||
|
Constants live in `audit/internal/collector/pci_vendors.go`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## HardwareMemory
|
||||||
|
|
||||||
|
`location` field exists in the Go struct with `json:"-"` — it is intentionally excluded
|
||||||
|
from JSON output because the Reanimator schema does not include it. It is used internally
|
||||||
|
for DIMM telemetry matching only (`collector/memory_telemetry.go`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## HardwareSensors
|
||||||
|
|
||||||
|
Sensor structs (`HardwareFanSensor`, `HardwareTemperatureSensor`,
|
||||||
|
`HardwarePowerSensor`, `HardwareOtherSensor`) do **not** have a `location` field.
|
||||||
|
Location was removed in contract v2.8. The Go types mirror the schema exactly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## JSON naming convention
|
||||||
|
|
||||||
|
All JSON keys are `snake_case`. Go field names are `CamelCase`. The mapping is
|
||||||
|
maintained by struct tags in `audit/internal/schema/hardware.go`.
|
||||||
|
|
||||||
|
All pointer fields use `omitempty` — absent means not collected (not zero).
|
||||||
@@ -0,0 +1,41 @@
|
|||||||
|
# Decision: Skip PCIe link-speed warnings for disabled devices
|
||||||
|
|
||||||
|
**Date:** 2026-06-12
|
||||||
|
**Status:** active
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
On HGX H100 SXM5 baseboards, the Microchip Switchtec PM41028 PSX PCIe switch
|
||||||
|
(vendor 11F8, device 4128, NVIDIA subsystem 10DE:1643) appears in `lspci` as a
|
||||||
|
"Memory controller". Its upstream link trains at Gen3 x2 while the device is
|
||||||
|
capable of Gen4 x16. The device is permanently in a disabled state: memory access
|
||||||
|
and bus-mastering are both off (Mem-, BusMaster-); `/sys/bus/pci/devices/<bdf>/enable`
|
||||||
|
reads `0`.
|
||||||
|
|
||||||
|
This chip is the PCIe fabric management endpoint for the NVSwitch interconnect — it
|
||||||
|
carries only management traffic at low bandwidth and is intentionally not activated
|
||||||
|
by any Linux driver. The bee audit was reporting a `statusWarning` with message
|
||||||
|
"PCIe link speed degraded" for this device, which is misleading because the device
|
||||||
|
is not in the data path.
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
`applyPCIeLinkSpeedWarning` reads `/sys/bus/pci/devices/<bdf>/enable` via the
|
||||||
|
existing `readPCIIntAttribute` helper. If the value is `0` the function returns
|
||||||
|
early without setting any warning status.
|
||||||
|
|
||||||
|
The check is vendor-agnostic: it applies to any PCIe device that Linux has not
|
||||||
|
activated, regardless of make or model. This is consistent with the
|
||||||
|
`no-hardcoded-vendors` contract — no vendor ID, device ID, or name string is
|
||||||
|
used as a condition.
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
- PCIe fabric management endpoints, IPMI virtual devices, and other permanently
|
||||||
|
disabled PCIe functions no longer produce spurious link-degradation warnings.
|
||||||
|
- Real link degradation on active devices (GPUs, NICs, NVMe, NVLink bridges)
|
||||||
|
continues to be detected and reported as before.
|
||||||
|
- NVLink bridge cards retain their existing `statusCritical` path (they are always
|
||||||
|
enabled, so the early return is never taken for them).
|
||||||
|
- The Switchtec device on HGX H100 boards shows `statusOK` with no
|
||||||
|
`error_description` in the audit JSON.
|
||||||
@@ -7,3 +7,4 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
|||||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||||
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||||
| 2026-04-29 | Treat embedded submodules as read-only | active |
|
| 2026-04-29 | Treat embedded submodules as read-only | active |
|
||||||
|
| 2026-06-12 | Skip PCIe link-speed warnings for disabled devices | active |
|
||||||
|
|||||||
312
bible-local/docs/grub-bitmap-error-history.md
Normal file
312
bible-local/docs/grub-bitmap-error-history.md
Normal file
@@ -0,0 +1,312 @@
|
|||||||
|
# GRUB Bitmap Error History
|
||||||
|
|
||||||
|
## Symptom
|
||||||
|
|
||||||
|
On some servers GRUB prints:
|
||||||
|
|
||||||
|
```text
|
||||||
|
error: null src bitmap in grub_video_bitmap_create_scaled.
|
||||||
|
Press any key to continue...
|
||||||
|
```
|
||||||
|
|
||||||
|
The important new observation as of `v10.7` is:
|
||||||
|
|
||||||
|
- the error still appears even when the logo image block is removed from
|
||||||
|
`iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt`
|
||||||
|
- therefore the current error can no longer be explained only by
|
||||||
|
`bee-logo.png` / `bee-logo.tga`
|
||||||
|
|
||||||
|
That does not prove the theme system is healthy. It proves only that the
|
||||||
|
currently remaining failure is deeper than "bad logo file".
|
||||||
|
|
||||||
|
## Current State
|
||||||
|
|
||||||
|
Current source files:
|
||||||
|
|
||||||
|
- [iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt](../../iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt)
|
||||||
|
has no `image` block anymore
|
||||||
|
- [iso/builder/config/bootloaders/grub-efi/config.cfg](../../iso/builder/config/bootloaders/grub-efi/config.cfg)
|
||||||
|
still does `insmod tga` and then `source /boot/grub/theme.cfg`
|
||||||
|
|
||||||
|
Implication:
|
||||||
|
|
||||||
|
- if the error still fires, the trigger is likely elsewhere in GRUB theme
|
||||||
|
rendering or in the assets/config GRUB resolves while sourcing `theme.cfg`
|
||||||
|
- the old "PNG parser fragility" story is no longer a sufficient explanation
|
||||||
|
for the current failure mode
|
||||||
|
|
||||||
|
Current artifact facts:
|
||||||
|
|
||||||
|
- the provided `easy-bee-nvidia-v10.7-amd64.logs` build logs reference
|
||||||
|
`linux-image-6.1.0-45`
|
||||||
|
- the provided `easy-bee-nvidia-v10.7-amd64.iso` contains
|
||||||
|
`live/initrd.img-6.1.0-45-amd64` and `live/vmlinuz-6.1.0-45-amd64`
|
||||||
|
- a later `BOOT FAILED!` screenshot showed `live/initrd.img-6.1.0-44-amd64`
|
||||||
|
and `live/vmlinuz-6.1.0-44-amd64`
|
||||||
|
|
||||||
|
Implication:
|
||||||
|
|
||||||
|
- the `BOOT FAILED!` screenshot is not from the same artifact as the provided
|
||||||
|
`v10.7` ISO/log set
|
||||||
|
- until the exact ISO filename and checksum are tied to that failure, the
|
||||||
|
GRUB bitmap issue and the live-boot failure must be treated as separate
|
||||||
|
problems
|
||||||
|
|
||||||
|
## Chronology
|
||||||
|
|
||||||
|
### 1. Initial bee GRUB theme introduction
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `d52ec67` `Stability hardening, build script fixes, GRUB bee logo`
|
||||||
|
|
||||||
|
What changed:
|
||||||
|
|
||||||
|
- bee-branded GRUB theme introduced
|
||||||
|
- image block with explicit `width` / `height`
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- bitmap error appeared
|
||||||
|
|
||||||
|
### 2. Remove explicit scaling dimensions
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `aa284ae` `fix(iso): avoid grub logo scaling error`
|
||||||
|
|
||||||
|
What changed:
|
||||||
|
|
||||||
|
- removed `width = 400`
|
||||||
|
- removed `height = 400`
|
||||||
|
|
||||||
|
Reason stated by the change:
|
||||||
|
|
||||||
|
- try to avoid the scaling path
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- error persisted
|
||||||
|
|
||||||
|
Conclusion:
|
||||||
|
|
||||||
|
- explicit width/height were not the sole trigger
|
||||||
|
|
||||||
|
### 3. Rework PNG handling and menu rendering
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `6112094` `fix(grub): fix bitmap error and menu rendering`
|
||||||
|
|
||||||
|
Commit message says the change was intended to:
|
||||||
|
|
||||||
|
- convert `bee-logo.png` to RGBA and strip metadata
|
||||||
|
- move `terminal_output gfxterm` before `insmod png` / theme load
|
||||||
|
- remove ASCII banner from GRUB menu area
|
||||||
|
- fix theme typography/layout fields
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- error persisted
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
|
||||||
|
- this was still operating under the assumption that the issue was the PNG
|
||||||
|
payload or the order of gfxterm/theme init
|
||||||
|
|
||||||
|
### 4. Convert logo PNG back to RGB
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `333c44f` `Fix GRUB splash: convert bee-logo.png from RGBA to RGB`
|
||||||
|
|
||||||
|
Intended reason:
|
||||||
|
|
||||||
|
- GRUB might dislike RGBA PNG and want RGB PNG
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- error still persisted according to later project notes
|
||||||
|
|
||||||
|
### 5. Add post-build canonical GRUB/isolinux sync
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `0cdfbc5` `fix(iso): restore boot UX and boot logs`
|
||||||
|
|
||||||
|
What this introduced:
|
||||||
|
|
||||||
|
- post-`lb build` rewriting of `binary/boot/grub/grub.cfg`
|
||||||
|
- post-`lb build` rewriting of `binary/isolinux/live.cfg`
|
||||||
|
- forced rebuild of `binary_checksums`, `binary_iso`, `binary_zsync`
|
||||||
|
|
||||||
|
Why it was added:
|
||||||
|
|
||||||
|
- restore canonical EASY-BEE boot UX after live-build wrote its own bootloader
|
||||||
|
outputs
|
||||||
|
- restore expected boot menu and logs
|
||||||
|
|
||||||
|
Important note:
|
||||||
|
|
||||||
|
- this commit did not directly solve the bitmap issue
|
||||||
|
- it added a second layer of bootloader mutation after live-build
|
||||||
|
|
||||||
|
### 6. Switch from PNG to TGA
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `626763e` `Fix GRUB bitmap error: switch from PNG to TGA for splash logo`
|
||||||
|
|
||||||
|
Commit message says:
|
||||||
|
|
||||||
|
- GRUB PNG reader was considered fragile
|
||||||
|
- switch to uncompressed 24-bit TGA
|
||||||
|
- `config.cfg`: `insmod png` -> `insmod tga`
|
||||||
|
- `theme.txt`: `bee-logo.png` -> `bee-logo.tga`
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- this did not eliminate the problem in the current lineage
|
||||||
|
- today the system still errors even after the entire image block was removed
|
||||||
|
|
||||||
|
Conclusion:
|
||||||
|
|
||||||
|
- switching PNG -> TGA was not a durable root-cause fix
|
||||||
|
|
||||||
|
### 7. Patch EFI image after build
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `4f20c92` `Make UEFI boot safe and remove GRUB logo`
|
||||||
|
|
||||||
|
What this introduced:
|
||||||
|
|
||||||
|
- `sync_efi_grub_theme_assets`
|
||||||
|
- direct `mtools` patching of `efi.img`
|
||||||
|
- copying `config.cfg`, `theme.cfg`, and `live-theme/*` into the EFI FAT image
|
||||||
|
- removal of the logo image block from `theme.txt`
|
||||||
|
|
||||||
|
Why it was added:
|
||||||
|
|
||||||
|
- make UEFI path "safe"
|
||||||
|
- keep EFI GRUB image aligned with canonical bootloader assets
|
||||||
|
|
||||||
|
Observed result:
|
||||||
|
|
||||||
|
- later this became the direct cause of `Disk full` during build once
|
||||||
|
`bee-logo.tga` was large enough
|
||||||
|
- and even with the logo removed from `theme.txt`, the bitmap error still
|
||||||
|
remained
|
||||||
|
|
||||||
|
Conclusion:
|
||||||
|
|
||||||
|
- EFI post-build patching increased build complexity
|
||||||
|
- removing the logo alone did not remove the runtime GRUB error
|
||||||
|
|
||||||
|
### 8. Remove ASCII logo banners
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `14505ef` `Remove easy bee ASCII logo banners`
|
||||||
|
|
||||||
|
What changed:
|
||||||
|
|
||||||
|
- web loading page ASCII cleanup only
|
||||||
|
|
||||||
|
Relevance here:
|
||||||
|
|
||||||
|
- none for GRUB bitmap error
|
||||||
|
- included here only to avoid confusion with other "logo removal" work
|
||||||
|
|
||||||
|
### 9. Remove EFI post-build patching
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `5dc022d` `Drop post-build EFI bootloader patching`
|
||||||
|
|
||||||
|
Why it was done:
|
||||||
|
|
||||||
|
- stop mutating `efi.img` post-build
|
||||||
|
- remove dependence on `mtools` for EFI patching
|
||||||
|
- remove the `Disk full` failure mode
|
||||||
|
|
||||||
|
Impact:
|
||||||
|
|
||||||
|
- this did not target the GRUB bitmap error directly
|
||||||
|
- it targeted build-system complexity and EFI image overflow
|
||||||
|
|
||||||
|
### 10. Restore only GRUB/isolinux post-build sync
|
||||||
|
|
||||||
|
Relevant commit:
|
||||||
|
|
||||||
|
- `42774d4` `Restore post-build GRUB and isolinux sync`
|
||||||
|
|
||||||
|
Why it was needed:
|
||||||
|
|
||||||
|
- removing all post-build sync caused final ISO validation to fail with
|
||||||
|
missing canonical EASY-BEE boot entries
|
||||||
|
- memtest was still fine, but final GRUB menu was no longer canonical
|
||||||
|
|
||||||
|
What it restored:
|
||||||
|
|
||||||
|
- only `binary/boot/grub/grub.cfg`
|
||||||
|
- only `binary/isolinux/live.cfg`
|
||||||
|
|
||||||
|
What it did not restore:
|
||||||
|
|
||||||
|
- no EFI FAT image patching
|
||||||
|
- no `mtools` path
|
||||||
|
|
||||||
|
## What Is Proven False
|
||||||
|
|
||||||
|
The current evidence rules out several simplistic explanations:
|
||||||
|
|
||||||
|
- "the error is only caused by explicit image scaling"
|
||||||
|
- "the error is only caused by PNG vs TGA"
|
||||||
|
- "the error is only caused by the logo file itself"
|
||||||
|
|
||||||
|
Why:
|
||||||
|
|
||||||
|
- scaling dimensions were removed and error persisted
|
||||||
|
- PNG was replaced with TGA and error still survived in the lineage
|
||||||
|
- the image block itself is now absent, and the error still occurs
|
||||||
|
|
||||||
|
## Working Hypotheses Left
|
||||||
|
|
||||||
|
The remaining plausible layers are:
|
||||||
|
|
||||||
|
- GRUB theme engine still tries to render some bitmap-related element even
|
||||||
|
without the logo image block
|
||||||
|
- GRUB is resolving stale theme assets from the built EFI/ISO path rather than
|
||||||
|
what we think the source tree says
|
||||||
|
- `theme.cfg` / `theme.txt` / GRUB module loading order still triggers a bitmap
|
||||||
|
code path elsewhere
|
||||||
|
- live-build may still package a stale `theme.txt` or stale `live-theme`
|
||||||
|
directory into the final image
|
||||||
|
- the GRUB environment on the failing hardware may behave differently from the
|
||||||
|
assumptions in our source tree
|
||||||
|
|
||||||
|
## Decision Boundary
|
||||||
|
|
||||||
|
Before making another change, the next step should be evidence gathering from
|
||||||
|
the real built artifact, not another speculative edit.
|
||||||
|
|
||||||
|
That means checking on the actual built ISO or EFI image:
|
||||||
|
|
||||||
|
- exact `boot/grub/theme.cfg`
|
||||||
|
- exact `boot/grub/live-theme/theme.txt`
|
||||||
|
- exact contents of `boot/grub/live-theme/`
|
||||||
|
- whether the final image still contains a stale logo reference
|
||||||
|
- whether the EFI path and non-EFI path differ
|
||||||
|
|
||||||
|
## Relevant Commits
|
||||||
|
|
||||||
|
- `d52ec67` `Stability hardening, build script fixes, GRUB bee logo`
|
||||||
|
- `aa284ae` `fix(iso): avoid grub logo scaling error`
|
||||||
|
- `6112094` `fix(grub): fix bitmap error and menu rendering`
|
||||||
|
- `333c44f` `Fix GRUB splash: convert bee-logo.png from RGBA to RGB`
|
||||||
|
- `0cdfbc5` `fix(iso): restore boot UX and boot logs`
|
||||||
|
- `626763e` `Fix GRUB bitmap error: switch from PNG to TGA for splash logo`
|
||||||
|
- `4f20c92` `Make UEFI boot safe and remove GRUB logo`
|
||||||
|
- `5dc022d` `Drop post-build EFI bootloader patching`
|
||||||
|
- `42774d4` `Restore post-build GRUB and isolinux sync`
|
||||||
Submodule internal/chart updated: ac8120c8ab...8105c7ec08
@@ -9,7 +9,7 @@ NCCL_TESTS_VERSION=2.13.10
|
|||||||
NVCC_VERSION=12.8
|
NVCC_VERSION=12.8
|
||||||
CUBLAS_VERSION=13.1.1.3-1
|
CUBLAS_VERSION=13.1.1.3-1
|
||||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||||
DCGM_VERSION=4.5.3-1
|
DCGM_VERSION=4.6.0-1
|
||||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||||
ROCM_VERSION=6.3.4
|
ROCM_VERSION=6.3.4
|
||||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||||
|
|||||||
@@ -16,6 +16,12 @@ else
|
|||||||
LB_LINUX_PACKAGES="linux-image"
|
LB_LINUX_PACKAGES="linux-image"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ -n "${BEE_ISO_VOLUME:-}" ]; then
|
||||||
|
LB_ISO_VOLUME="${BEE_ISO_VOLUME}"
|
||||||
|
else
|
||||||
|
LB_ISO_VOLUME="EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}"
|
||||||
|
fi
|
||||||
|
|
||||||
lb config noauto \
|
lb config noauto \
|
||||||
--distribution bookworm \
|
--distribution bookworm \
|
||||||
--architectures amd64 \
|
--architectures amd64 \
|
||||||
@@ -30,9 +36,9 @@ lb config noauto \
|
|||||||
--linux-flavours "amd64" \
|
--linux-flavours "amd64" \
|
||||||
--linux-packages "${LB_LINUX_PACKAGES}" \
|
--linux-packages "${LB_LINUX_PACKAGES}" \
|
||||||
--memtest memtest86+ \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "${LB_ISO_VOLUME}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
--bootappend-live "boot=live live-media=/dev/disk/by-label/${LB_ISO_VOLUME} live-media-label=${LB_ISO_VOLUME} components video=1920x1080 console=ttyS0,115200n8 console=tty0 loglevel=3 systemd.show_status=1 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
--debootstrap-options "--include=ca-certificates" \
|
--debootstrap-options "--include=ca-certificates" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ BUILDER_DIR="${REPO_ROOT}/iso/builder"
|
|||||||
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
|
CONTAINER_TOOL="${CONTAINER_TOOL:-docker}"
|
||||||
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
IMAGE_TAG="${BEE_BUILDER_IMAGE:-bee-iso-builder}"
|
||||||
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
BUILDER_PLATFORM="${BEE_BUILDER_PLATFORM:-linux/amd64}"
|
||||||
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/container-cache}"
|
CACHE_DIR="${BEE_BUILDER_CACHE_DIR:-${REPO_ROOT}/dist/cache}"
|
||||||
AUTH_KEYS=""
|
AUTH_KEYS=""
|
||||||
CLEAN_CACHE=0
|
CLEAN_CACHE=0
|
||||||
VARIANT="all"
|
VARIANT="all"
|
||||||
@@ -54,14 +54,14 @@ if [ "$CLEAN_CACHE" = "1" ]; then
|
|||||||
"${CACHE_DIR:?}/bee" \
|
"${CACHE_DIR:?}/bee" \
|
||||||
"${CACHE_DIR:?}/lb-packages"
|
"${CACHE_DIR:?}/lb-packages"
|
||||||
echo "=== cleaning live-build work dirs ==="
|
echo "=== cleaning live-build work dirs ==="
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia"
|
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nvidia"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nvidia-legacy"
|
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nvidia-legacy"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-amd"
|
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-amd"
|
||||||
rm -rf "${REPO_ROOT}/dist/live-build-work-nogpu"
|
rm -rf "${REPO_ROOT}/dist/cache/live-build-work-nogpu"
|
||||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia"
|
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nvidia"
|
||||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nvidia-legacy"
|
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nvidia-legacy"
|
||||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-amd"
|
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-amd"
|
||||||
rm -rf "${REPO_ROOT}/dist/overlay-stage-nogpu"
|
rm -rf "${REPO_ROOT}/dist/cache/overlay-stage-nogpu"
|
||||||
echo "=== caches cleared, proceeding with build ==="
|
echo "=== caches cleared, proceeding with build ==="
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@@ -51,8 +51,8 @@ case "$BUILD_VARIANT" in
|
|||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
BUILD_WORK_DIR="${DIST_DIR}/live-build-work-${BUILD_VARIANT}"
|
BUILD_WORK_DIR="${DIST_DIR}/cache/live-build-work-${BUILD_VARIANT}"
|
||||||
OVERLAY_STAGE_DIR="${DIST_DIR}/overlay-stage-${BUILD_VARIANT}"
|
OVERLAY_STAGE_DIR="${DIST_DIR}/cache/overlay-stage-${BUILD_VARIANT}"
|
||||||
|
|
||||||
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
export BEE_GPU_VENDOR BEE_NVIDIA_MODULE_FLAVOR BUILD_VARIANT
|
||||||
|
|
||||||
@@ -63,18 +63,33 @@ export PATH="$PATH:/usr/local/go/bin"
|
|||||||
|
|
||||||
# Allow git to read the bind-mounted repo (different UID inside container).
|
# Allow git to read the bind-mounted repo (different UID inside container).
|
||||||
git config --global safe.directory "${REPO_ROOT}"
|
git config --global safe.directory "${REPO_ROOT}"
|
||||||
mkdir -p "${DIST_DIR}"
|
mkdir -p "${DIST_DIR}/cache" "${DIST_DIR}/release"
|
||||||
mkdir -p "${CACHE_ROOT}"
|
mkdir -p "${CACHE_ROOT}"
|
||||||
: "${GOCACHE:=${CACHE_ROOT}/go-build}"
|
: "${GOCACHE:=${CACHE_ROOT}/go-build}"
|
||||||
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
|
: "${GOMODCACHE:=${CACHE_ROOT}/go-mod}"
|
||||||
export GOCACHE GOMODCACHE
|
export GOCACHE GOMODCACHE
|
||||||
|
|
||||||
resolve_audit_version() {
|
resolve_project_version() {
|
||||||
|
if [ -n "${BEE_VERSION:-}" ]; then
|
||||||
|
echo "${BEE_VERSION}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ -n "${BEE_AUDIT_VERSION:-}" ] && [ -n "${BEE_ISO_VERSION:-}" ] && [ "${BEE_AUDIT_VERSION}" != "${BEE_ISO_VERSION}" ]; then
|
||||||
|
echo "ERROR: BEE_AUDIT_VERSION (${BEE_AUDIT_VERSION}) and BEE_ISO_VERSION (${BEE_ISO_VERSION}) differ; versioning must stay synchronized" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
if [ -n "${BEE_AUDIT_VERSION:-}" ]; then
|
if [ -n "${BEE_AUDIT_VERSION:-}" ]; then
|
||||||
echo "${BEE_AUDIT_VERSION}"
|
echo "${BEE_AUDIT_VERSION}"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
if [ -n "${BEE_ISO_VERSION:-}" ]; then
|
||||||
|
echo "${BEE_ISO_VERSION}"
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
||||||
case "${tag}" in
|
case "${tag}" in
|
||||||
v*)
|
v*)
|
||||||
@@ -97,35 +112,6 @@ resolve_audit_version() {
|
|||||||
date +%Y%m%d
|
date +%Y%m%d
|
||||||
}
|
}
|
||||||
|
|
||||||
# ISO image versioned separately from the audit binary (iso/v* tags).
|
|
||||||
resolve_iso_version() {
|
|
||||||
if [ -n "${BEE_ISO_VERSION:-}" ]; then
|
|
||||||
echo "${BEE_ISO_VERSION}"
|
|
||||||
return 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Plain v* tags (e.g. v2.7) take priority — this is the current tagging scheme
|
|
||||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'v[0-9]*' --abbrev=7 --dirty 2>/dev/null || true)"
|
|
||||||
case "${tag}" in
|
|
||||||
v*)
|
|
||||||
echo "${tag#v}"
|
|
||||||
return 0
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
# Legacy iso/v* tags fallback
|
|
||||||
tag="$(git -C "${REPO_ROOT}" describe --tags --match 'iso/v*' --abbrev=7 --dirty 2>/dev/null || true)"
|
|
||||||
case "${tag}" in
|
|
||||||
iso/v*)
|
|
||||||
echo "${tag#iso/v}"
|
|
||||||
return 0
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
# Fall back to audit version so the name is still meaningful
|
|
||||||
resolve_audit_version
|
|
||||||
}
|
|
||||||
|
|
||||||
sync_builder_workdir() {
|
sync_builder_workdir() {
|
||||||
src_dir="$1"
|
src_dir="$1"
|
||||||
dst_dir="$2"
|
dst_dir="$2"
|
||||||
@@ -530,12 +516,12 @@ validate_iso_live_boot_entries() {
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
grep -q 'menuentry "EASY-BEE"' "$grub_cfg" || {
|
grep -q 'menuentry "EASY-BEE v' "$grub_cfg" || {
|
||||||
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
|
echo "ERROR: GRUB default EASY-BEE entry is missing" >&2
|
||||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
grep -q 'menuentry "EASY-BEE -- load to RAM (toram)"' "$grub_cfg" || {
|
grep -q 'menuentry "EASY-BEE v.* -- load to RAM (toram)"' "$grub_cfg" || {
|
||||||
echo "ERROR: GRUB toram entry is missing" >&2
|
echo "ERROR: GRUB toram entry is missing" >&2
|
||||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
exit 1
|
exit 1
|
||||||
@@ -550,6 +536,11 @@ validate_iso_live_boot_entries() {
|
|||||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
grep -q 'linux .*live-media-label=EASY_BEE_' "$grub_cfg" || {
|
||||||
|
echo "ERROR: GRUB live entry is missing live-media-label pinning" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
grep -q 'append .*boot=live ' "$isolinux_cfg" || {
|
grep -q 'append .*boot=live ' "$isolinux_cfg" || {
|
||||||
echo "ERROR: isolinux live entry is missing boot=live" >&2
|
echo "ERROR: isolinux live entry is missing boot=live" >&2
|
||||||
@@ -561,11 +552,50 @@ validate_iso_live_boot_entries() {
|
|||||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
grep -q 'append .*live-media-label=EASY_BEE_' "$isolinux_cfg" || {
|
||||||
|
echo "ERROR: isolinux live entry is missing live-media-label pinning" >&2
|
||||||
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
rm -f "$grub_cfg" "$isolinux_cfg"
|
rm -f "$grub_cfg" "$isolinux_cfg"
|
||||||
echo "=== live boot validation OK ==="
|
echo "=== live boot validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
validate_iso_grub_assets() {
|
||||||
|
iso_path="$1"
|
||||||
|
echo "=== validating GRUB assets in ISO ==="
|
||||||
|
|
||||||
|
[ -f "$iso_path" ] || {
|
||||||
|
echo "ERROR: ISO not found for GRUB asset validation: $iso_path" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
require_iso_reader "$iso_path" >/dev/null 2>&1 || {
|
||||||
|
echo "ERROR: ISO reader unavailable for GRUB asset validation" >&2
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
iso_files="$(mktemp)"
|
||||||
|
iso_list_files "$iso_path" > "$iso_files" || {
|
||||||
|
echo "ERROR: failed to list ISO files for GRUB asset validation" >&2
|
||||||
|
rm -f "$iso_files"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
for required in \
|
||||||
|
boot/grub/config.cfg \
|
||||||
|
boot/grub/grub.cfg; do
|
||||||
|
grep -q "^${required}$" "$iso_files" || {
|
||||||
|
echo "ERROR: missing GRUB asset in ISO: ${required}" >&2
|
||||||
|
rm -f "$iso_files"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
done
|
||||||
|
|
||||||
|
rm -f "$iso_files"
|
||||||
|
echo "=== GRUB asset validation OK ==="
|
||||||
|
}
|
||||||
|
|
||||||
validate_iso_nvidia_runtime() {
|
validate_iso_nvidia_runtime() {
|
||||||
iso_path="$1"
|
iso_path="$1"
|
||||||
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
[ "$BEE_GPU_VENDOR" = "nvidia" ] || return 0
|
||||||
@@ -578,29 +608,37 @@ validate_iso_nvidia_runtime() {
|
|||||||
|
|
||||||
squashfs_tmp="$(mktemp)"
|
squashfs_tmp="$(mktemp)"
|
||||||
squashfs_list="$(mktemp)"
|
squashfs_list="$(mktemp)"
|
||||||
iso_read_member "$iso_path" live/filesystem.squashfs "$squashfs_tmp" || {
|
iso_files="$(mktemp)"
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
iso_list_files "$iso_path" > "$iso_files" || {
|
||||||
nvidia_runtime_fail "failed to extract live/filesystem.squashfs from ISO"
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
}
|
nvidia_runtime_fail "failed to list ISO files for NVIDIA runtime validation"
|
||||||
unsquashfs -ll "$squashfs_tmp" > "$squashfs_list" 2>/dev/null || {
|
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
|
||||||
nvidia_runtime_fail "failed to inspect filesystem.squashfs from ISO"
|
|
||||||
}
|
}
|
||||||
|
grep '^live/.*\.squashfs$' "$iso_files" | while IFS= read -r squashfs_member; do
|
||||||
|
iso_read_member "$iso_path" "$squashfs_member" "$squashfs_tmp" || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
|
nvidia_runtime_fail "failed to extract $squashfs_member from ISO"
|
||||||
|
}
|
||||||
|
unsquashfs -ll "$squashfs_tmp" >> "$squashfs_list" 2>/dev/null || {
|
||||||
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
|
nvidia_runtime_fail "failed to inspect $squashfs_member from ISO"
|
||||||
|
}
|
||||||
|
: > "$squashfs_tmp"
|
||||||
|
done
|
||||||
|
|
||||||
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
|
grep -Eq 'usr/bin/dcgmi$' "$squashfs_list" || {
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
|
nvidia_runtime_fail "dcgmi missing from final NVIDIA ISO"
|
||||||
}
|
}
|
||||||
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
|
grep -Eq 'usr/bin/nv-hostengine$' "$squashfs_list" || {
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
|
nvidia_runtime_fail "nv-hostengine missing from final NVIDIA ISO"
|
||||||
}
|
}
|
||||||
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
|
grep -Eq 'usr/bin/dcgmproftester([0-9]+)?$' "$squashfs_list" || {
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
|
nvidia_runtime_fail "dcgmproftester missing from final NVIDIA ISO"
|
||||||
}
|
}
|
||||||
|
|
||||||
rm -f "$squashfs_tmp" "$squashfs_list"
|
rm -f "$squashfs_tmp" "$squashfs_list" "$iso_files"
|
||||||
echo "=== NVIDIA runtime validation OK ==="
|
echo "=== NVIDIA runtime validation OK ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -694,30 +732,25 @@ write_canonical_grub_cfg() {
|
|||||||
kernel="$2"
|
kernel="$2"
|
||||||
append_live="$3"
|
append_live="$3"
|
||||||
initrd="$4"
|
initrd="$4"
|
||||||
|
version_label="${PROJECT_VERSION_EFFECTIVE}"
|
||||||
|
|
||||||
cat > "$cfg" <<EOF
|
cat > "$cfg" <<EOF
|
||||||
source /boot/grub/config.cfg
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
echo ""
|
menuentry "EASY-BEE v${version_label}" {
|
||||||
echo " ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗"
|
linux ${kernel} ${append_live} nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
echo " ██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝"
|
|
||||||
echo " █████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗"
|
|
||||||
echo " ██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝"
|
|
||||||
echo " ███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗"
|
|
||||||
echo " ╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝"
|
|
||||||
echo " Hardware Audit LiveCD"
|
|
||||||
echo ""
|
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
|
||||||
linux ${kernel} ${append_live} bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE -- load to RAM (toram)" {
|
menuentry "EASY-BEE v${version_label} -- load to RAM (toram)" {
|
||||||
linux ${kernel} ${append_live} toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux ${kernel} ${append_live} toram nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE v${version_label} -- no GUI / no X11" {
|
||||||
|
linux ${kernel} ${append_live} nomodeset bee.gui=off bee.nvidia.mode=gsp-off pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd ${initrd}
|
||||||
|
}
|
||||||
|
|
||||||
if [ "\${grub_platform}" = "efi" ]; then
|
if [ "\${grub_platform}" = "efi" ]; then
|
||||||
menuentry "Memory Test (memtest86+)" {
|
menuentry "Memory Test (memtest86+)" {
|
||||||
@@ -742,21 +775,28 @@ write_canonical_isolinux_cfg() {
|
|||||||
kernel="$2"
|
kernel="$2"
|
||||||
initrd="$3"
|
initrd="$3"
|
||||||
append_live="$4"
|
append_live="$4"
|
||||||
|
version_label="${PROJECT_VERSION_EFFECTIVE}"
|
||||||
|
|
||||||
cat > "$cfg" <<EOF
|
cat > "$cfg" <<EOF
|
||||||
label live-@FLAVOUR@-normal
|
label live-@FLAVOUR@-normal
|
||||||
menu label ^EASY-BEE
|
menu label ^EASY-BEE v${version_label}
|
||||||
menu default
|
|
||||||
linux ${kernel}
|
linux ${kernel}
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
append ${append_live} nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE v${version_label} (^load to RAM)
|
||||||
|
menu default
|
||||||
linux ${kernel}
|
linux ${kernel}
|
||||||
initrd ${initrd}
|
initrd ${initrd}
|
||||||
append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
append ${append_live} toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-console
|
||||||
|
menu label EASY-BEE v${version_label} (^no GUI / no X11)
|
||||||
|
linux ${kernel}
|
||||||
|
initrd ${initrd}
|
||||||
|
append ${append_live} nomodeset bee.gui=off bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-gsp-off
|
label live-@FLAVOUR@-gsp-off
|
||||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
linux ${kernel}
|
linux ${kernel}
|
||||||
@@ -800,10 +840,7 @@ enforce_live_build_bootloader_assets() {
|
|||||||
|
|
||||||
if [ -f "$grub_cfg" ]; then
|
if [ -f "$grub_cfg" ]; then
|
||||||
if extract_live_grub_entry "$grub_cfg"; then
|
if extract_live_grub_entry "$grub_cfg"; then
|
||||||
mkdir -p "$grub_dir/live-theme"
|
|
||||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/config.cfg" "$grub_dir/config.cfg"
|
||||||
cp "${BUILDER_DIR}/config/bootloaders/grub-efi/theme.cfg" "$grub_dir/theme.cfg"
|
|
||||||
cp -R "${BUILDER_DIR}/config/bootloaders/grub-efi/live-theme/." "$grub_dir/live-theme/"
|
|
||||||
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
write_canonical_grub_cfg "$grub_cfg" "$grub_kernel" "${live_build_append:-$grub_append}" "$grub_initrd"
|
||||||
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
echo "bootloader sync: rewrote binary/boot/grub/grub.cfg with canonical EASY-BEE menu"
|
||||||
else
|
else
|
||||||
@@ -857,8 +894,11 @@ FULL_BUILD_MARKER="${BUILD_WORK_DIR}/.bee-full-build-marker"
|
|||||||
# hooks, archives, Dockerfile, auto/config) require a full lb build.
|
# hooks, archives, Dockerfile, auto/config) require a full lb build.
|
||||||
needs_full_build() {
|
needs_full_build() {
|
||||||
[ -f "${FULL_BUILD_MARKER}" ] || return 0
|
[ -f "${FULL_BUILD_MARKER}" ] || return 0
|
||||||
[ -f "${BUILD_WORK_DIR}/binary/live/filesystem.squashfs" ] || return 0
|
|
||||||
[ -f "${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso" ] || return 0
|
[ -f "${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso" ] || return 0
|
||||||
|
# Accept any versioned squashfs (filesystem-v*.squashfs or legacy filesystem.squashfs)
|
||||||
|
_any_sq=$(find "${BUILD_WORK_DIR}/binary/live" -maxdepth 1 \
|
||||||
|
-name 'filesystem*.squashfs' 2>/dev/null | head -1)
|
||||||
|
[ -n "$_any_sq" ] || return 0
|
||||||
|
|
||||||
_heavy=$(find \
|
_heavy=$(find \
|
||||||
"${BUILDER_DIR}/VERSIONS" \
|
"${BUILDER_DIR}/VERSIONS" \
|
||||||
@@ -881,40 +921,109 @@ needs_full_build() {
|
|||||||
# Fast-path: unsquash existing filesystem, rsync overlay on top, repack.
|
# Fast-path: unsquash existing filesystem, rsync overlay on top, repack.
|
||||||
# Requires ~10 GB free in BEE_CACHE_DIR for the unpacked squashfs.
|
# Requires ~10 GB free in BEE_CACHE_DIR for the unpacked squashfs.
|
||||||
fast_path_repack_squashfs() {
|
fast_path_repack_squashfs() {
|
||||||
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
_old_sq=$(find "${BUILD_WORK_DIR}/binary/live" -maxdepth 1 \
|
||||||
|
-name 'filesystem*.squashfs' | sort | head -1)
|
||||||
|
_sq="${BUILD_WORK_DIR}/binary/live/${SQUASHFS_FILENAME}"
|
||||||
_tmp="${BEE_CACHE_DIR}/fast-unsquash-${BUILD_VARIANT}"
|
_tmp="${BEE_CACHE_DIR}/fast-unsquash-${BUILD_VARIANT}"
|
||||||
echo "=== fast-path: unsquash ($(du -sh "$_sq" | cut -f1) compressed) ==="
|
echo "=== fast-path: unsquash $(basename "$_old_sq") ($(du -sh "$_old_sq" | cut -f1) compressed) ==="
|
||||||
rm -rf "$_tmp"
|
rm -rf "$_tmp"
|
||||||
unsquashfs -d "$_tmp" "$_sq"
|
unsquashfs -d "$_tmp" "$_old_sq"
|
||||||
echo "=== fast-path: syncing overlay stage ==="
|
echo "=== fast-path: syncing overlay stage ==="
|
||||||
rsync -a --checksum "${OVERLAY_STAGE_DIR}/" "$_tmp/"
|
rsync -a --checksum "${OVERLAY_STAGE_DIR}/" "$_tmp/"
|
||||||
echo "=== fast-path: repacking squashfs ==="
|
echo "=== fast-path: repacking as ${SQUASHFS_FILENAME} ==="
|
||||||
_sq_new="${_sq}.new"
|
_sq_new="${_sq}.new"
|
||||||
rm -f "$_sq_new"
|
rm -f "$_sq_new"
|
||||||
mksquashfs "$_tmp" "$_sq_new" -comp zstd -b 1048576 -noappend -no-progress
|
mksquashfs "$_tmp" "$_sq_new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs
|
||||||
mv "$_sq_new" "$_sq"
|
mv "$_sq_new" "$_sq"
|
||||||
rm -rf "$_tmp"
|
rm -rf "$_tmp"
|
||||||
|
[ "$_old_sq" != "$_sq" ] && rm -f "$_old_sq"
|
||||||
echo "=== fast-path: squashfs repacked ($(du -sh "$_sq" | cut -f1)) ==="
|
echo "=== fast-path: squashfs repacked ($(du -sh "$_sq" | cut -f1)) ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
# Fast-path: rebuild ISO by replacing only live/filesystem.squashfs via xorriso.
|
# Fast-path: rebuild ISO replacing the squashfs via xorriso.
|
||||||
# Boot structure (El Torito, EFI, MBR hybrid) is replayed from the prior ISO.
|
# Boot structure (El Torito, EFI, MBR hybrid) is replayed from the prior ISO.
|
||||||
fast_path_rebuild_iso() {
|
fast_path_rebuild_iso() {
|
||||||
_sq="${BUILD_WORK_DIR}/binary/live/filesystem.squashfs"
|
_sq="${BUILD_WORK_DIR}/binary/live/${SQUASHFS_FILENAME}"
|
||||||
_prior="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso"
|
_prior="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso"
|
||||||
_new="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso.new"
|
_new="${BUILD_WORK_DIR}/live-image-amd64.hybrid.iso.new"
|
||||||
echo "=== fast-path: rebuilding ISO with xorriso ==="
|
echo "=== fast-path: rebuilding ISO with xorriso ==="
|
||||||
rm -f "$_new"
|
rm -f "$_new"
|
||||||
|
# Remove any old squashfs entries from the prior ISO before adding the new one
|
||||||
|
_old_entries=$(xorriso -indev "$_prior" -find /live -name 'filesystem*.squashfs' -- 2>/dev/null \
|
||||||
|
| grep -E '^/live/filesystem.*\.squashfs$' || true)
|
||||||
|
_rm_args=""
|
||||||
|
for _e in $_old_entries; do
|
||||||
|
_rm_args="$_rm_args -rm $_e --"
|
||||||
|
done
|
||||||
|
# shellcheck disable=SC2086
|
||||||
xorriso \
|
xorriso \
|
||||||
-indev "$_prior" \
|
-indev "$_prior" \
|
||||||
-outdev "$_new" \
|
-outdev "$_new" \
|
||||||
-map "$_sq" /live/filesystem.squashfs \
|
${_rm_args} \
|
||||||
|
-map "$_sq" /live/${SQUASHFS_FILENAME} \
|
||||||
-boot_image any replay \
|
-boot_image any replay \
|
||||||
-commit
|
-commit
|
||||||
mv "$_new" "$_prior"
|
mv "$_new" "$_prior"
|
||||||
echo "=== fast-path: ISO rebuilt ==="
|
echo "=== fast-path: ISO rebuilt ==="
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dir_has_entries() {
|
||||||
|
_dir="$1"
|
||||||
|
[ -d "$_dir" ] || return 1
|
||||||
|
find "$_dir" -mindepth 1 -print -quit 2>/dev/null | grep -q .
|
||||||
|
}
|
||||||
|
|
||||||
|
move_tree_to_layer() {
|
||||||
|
_src_root="$1"
|
||||||
|
_rel="$2"
|
||||||
|
_dst_root="$3"
|
||||||
|
[ -e "${_src_root}/${_rel}" ] || return 0
|
||||||
|
mkdir -p "${_dst_root}/$(dirname "$_rel")"
|
||||||
|
mv "${_src_root}/${_rel}" "${_dst_root}/${_rel}"
|
||||||
|
}
|
||||||
|
|
||||||
|
split_live_squashfs_layers() {
|
||||||
|
lb_dir="$1"
|
||||||
|
live_dir="${lb_dir}/binary/live"
|
||||||
|
base_sq="${live_dir}/filesystem.squashfs"
|
||||||
|
usr_sq="${live_dir}/10-usr.squashfs"
|
||||||
|
fw_sq="${live_dir}/20-firmware.squashfs"
|
||||||
|
|
||||||
|
[ -f "$base_sq" ] || return 0
|
||||||
|
command -v unsquashfs >/dev/null 2>&1 || return 0
|
||||||
|
command -v mksquashfs >/dev/null 2>&1 || return 0
|
||||||
|
|
||||||
|
tmp_root="$(mktemp -d)"
|
||||||
|
tmp_usr="$(mktemp -d)"
|
||||||
|
tmp_fw="$(mktemp -d)"
|
||||||
|
|
||||||
|
echo "=== splitting live squashfs into smaller layers ==="
|
||||||
|
unsquashfs -d "$tmp_root/root" "$base_sq" >/dev/null
|
||||||
|
mkdir -p "$tmp_usr/root" "$tmp_fw/root"
|
||||||
|
|
||||||
|
move_tree_to_layer "$tmp_root/root" "usr" "$tmp_usr/root"
|
||||||
|
move_tree_to_layer "$tmp_root/root" "lib/firmware" "$tmp_fw/root"
|
||||||
|
move_tree_to_layer "$tmp_root/root" "usr/lib/firmware" "$tmp_fw/root"
|
||||||
|
move_tree_to_layer "$tmp_root/root" "boot/firmware" "$tmp_fw/root"
|
||||||
|
|
||||||
|
rm -f "$usr_sq" "$fw_sq"
|
||||||
|
mksquashfs "$tmp_root/root" "${base_sq}.new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs >/dev/null
|
||||||
|
mv "${base_sq}.new" "$base_sq"
|
||||||
|
|
||||||
|
if dir_has_entries "$tmp_usr/root"; then
|
||||||
|
mksquashfs "$tmp_usr/root" "${usr_sq}.new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs >/dev/null
|
||||||
|
mv "${usr_sq}.new" "$usr_sq"
|
||||||
|
fi
|
||||||
|
if dir_has_entries "$tmp_fw/root"; then
|
||||||
|
mksquashfs "$tmp_fw/root" "${fw_sq}.new" -comp zstd -b 1048576 -noappend -no-progress -no-xattrs >/dev/null
|
||||||
|
mv "${fw_sq}.new" "$fw_sq"
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "=== live squashfs layers ==="
|
||||||
|
find "$live_dir" -maxdepth 1 -type f -name '*.squashfs' -exec du -sh {} \; | sort
|
||||||
|
rm -rf "$tmp_root" "$tmp_usr" "$tmp_fw"
|
||||||
|
}
|
||||||
|
|
||||||
recover_iso_memtest() {
|
recover_iso_memtest() {
|
||||||
lb_dir="$1"
|
lb_dir="$1"
|
||||||
iso_path="$2"
|
iso_path="$2"
|
||||||
@@ -992,11 +1101,12 @@ recover_iso_memtest() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
AUDIT_VERSION_EFFECTIVE="$(resolve_audit_version)"
|
PROJECT_VERSION_EFFECTIVE="$(resolve_project_version)"
|
||||||
ISO_VERSION_EFFECTIVE="$(resolve_iso_version)"
|
SQUASHFS_FILENAME="filesystem-v${PROJECT_VERSION_EFFECTIVE}.squashfs"
|
||||||
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${ISO_VERSION_EFFECTIVE}-amd64"
|
ISO_BASENAME="easy-bee-${BUILD_VARIANT}-v${PROJECT_VERSION_EFFECTIVE}-amd64"
|
||||||
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
# Versioned output directory: dist/easy-bee-v4.1/ — all final artefacts live here.
|
||||||
OUT_DIR="${DIST_DIR}/easy-bee-v${ISO_VERSION_EFFECTIVE}"
|
OUT_DIR="${DIST_DIR}/release/easy-bee-v${PROJECT_VERSION_EFFECTIVE}"
|
||||||
|
ISO_VERSION_LABEL_TOKEN="$(printf '%s' "${PROJECT_VERSION_EFFECTIVE}" | tr '[:lower:].-' '[:upper:]__')"
|
||||||
mkdir -p "${OUT_DIR}"
|
mkdir -p "${OUT_DIR}"
|
||||||
LOG_DIR="${OUT_DIR}/${ISO_BASENAME}.logs"
|
LOG_DIR="${OUT_DIR}/${ISO_BASENAME}.logs"
|
||||||
LOG_ARCHIVE="${OUT_DIR}/${ISO_BASENAME}.logs.tar.gz"
|
LOG_ARCHIVE="${OUT_DIR}/${ISO_BASENAME}.logs.tar.gz"
|
||||||
@@ -1172,7 +1282,7 @@ fi
|
|||||||
|
|
||||||
echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
|
echo "=== bee ISO build (variant: ${BUILD_VARIANT}) ==="
|
||||||
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
echo "Debian: ${DEBIAN_VERSION}, Kernel ABI: ${DEBIAN_KERNEL_ABI}, Go: ${GO_VERSION}"
|
||||||
echo "Audit version: ${AUDIT_VERSION_EFFECTIVE}, ISO version: ${ISO_VERSION_EFFECTIVE}"
|
echo "Project version: ${PROJECT_VERSION_EFFECTIVE}"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
run_step "sync git submodules" "05-git-submodules" \
|
run_step "sync git submodules" "05-git-submodules" \
|
||||||
@@ -1180,7 +1290,7 @@ run_step "sync git submodules" "05-git-submodules" \
|
|||||||
|
|
||||||
# --- compile bee binary (static, Linux amd64) ---
|
# --- compile bee binary (static, Linux amd64) ---
|
||||||
# Shared between variants — built once, reused on second pass.
|
# Shared between variants — built once, reused on second pass.
|
||||||
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
BEE_BIN="${DIST_DIR}/cache/bee-linux-amd64"
|
||||||
NEED_BUILD=1
|
NEED_BUILD=1
|
||||||
if [ -f "$BEE_BIN" ]; then
|
if [ -f "$BEE_BIN" ]; then
|
||||||
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
||||||
@@ -1192,7 +1302,7 @@ if [ "$NEED_BUILD" = "1" ]; then
|
|||||||
"cd '${REPO_ROOT}/audit' && \
|
"cd '${REPO_ROOT}/audit' && \
|
||||||
env GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
env GOOS=linux GOARCH=amd64 CGO_ENABLED=0 \
|
||||||
go build \
|
go build \
|
||||||
-ldflags '-s -w -X main.Version=${AUDIT_VERSION_EFFECTIVE}' \
|
-ldflags '-s -w -X main.Version=${PROJECT_VERSION_EFFECTIVE}' \
|
||||||
-o '${BEE_BIN}' \
|
-o '${BEE_BIN}' \
|
||||||
./cmd/bee"
|
./cmd/bee"
|
||||||
echo "binary: $BEE_BIN"
|
echo "binary: $BEE_BIN"
|
||||||
@@ -1211,16 +1321,16 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# --- NVIDIA-only build steps ---
|
# --- NVIDIA-only build steps ---
|
||||||
GPU_BURN_WORKER_BIN="${DIST_DIR}/bee-gpu-burn-worker-linux-amd64"
|
GPU_BURN_WORKER_BIN="${DIST_DIR}/cache/bee-gpu-burn-worker-linux-amd64"
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
run_step "download cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace" "20-cublas" \
|
run_step "download cuBLAS/cuBLASLt/cudart ${NCCL_CUDA_VERSION} userspace" "20-cublas" \
|
||||||
sh "${BUILDER_DIR}/build-cublas.sh" \
|
sh "${BUILDER_DIR}/build-cublas.sh" \
|
||||||
"${CUBLAS_VERSION}" \
|
"${CUBLAS_VERSION}" \
|
||||||
"${CUDA_USERSPACE_VERSION}" \
|
"${CUDA_USERSPACE_VERSION}" \
|
||||||
"${NCCL_CUDA_VERSION}" \
|
"${NCCL_CUDA_VERSION}" \
|
||||||
"${DIST_DIR}"
|
"${DIST_DIR}/cache"
|
||||||
|
|
||||||
CUBLAS_CACHE="${DIST_DIR}/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
CUBLAS_CACHE="${DIST_DIR}/cache/cublas-${CUBLAS_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
echo "=== bee-gpu-burn FP4 header probe ==="
|
echo "=== bee-gpu-burn FP4 header probe ==="
|
||||||
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
fp4_type_match="$(grep -Rsnm 1 'CUDA_R_4F_E2M1' "${CUBLAS_CACHE}/include" 2>/dev/null || true)"
|
||||||
@@ -1346,7 +1456,7 @@ fi
|
|||||||
|
|
||||||
# --- copy bee binary into overlay ---
|
# --- copy bee binary into overlay ---
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
cp "$BEE_BIN" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||||
|
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ] && [ -f "$GPU_BURN_WORKER_BIN" ]; then
|
||||||
@@ -1363,7 +1473,7 @@ cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smokete
|
|||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||||
|
|
||||||
# --- vendor utilities (optional pre-fetched binaries) ---
|
# --- vendor utilities (optional pre-fetched binaries) ---
|
||||||
for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
for tool in storcli64 sas2ircu sas3ircu arcconf ssacli saa; do
|
||||||
if [ -f "${VENDOR_DIR}/${tool}" ]; then
|
if [ -f "${VENDOR_DIR}/${tool}" ]; then
|
||||||
cp "${VENDOR_DIR}/${tool}" "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}"
|
cp "${VENDOR_DIR}/${tool}" "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}" || true
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}" || true
|
||||||
@@ -1373,13 +1483,23 @@ for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# saa requires acpica_bin/acpidump relative to its own location
|
||||||
|
if [ -f "${VENDOR_DIR}/acpica_bin/acpidump" ]; then
|
||||||
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin/acpica_bin"
|
||||||
|
cp "${VENDOR_DIR}/acpica_bin/acpidump" "${OVERLAY_STAGE_DIR}/usr/local/bin/acpica_bin/acpidump"
|
||||||
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/acpica_bin/acpidump" || true
|
||||||
|
echo "vendor tool: acpica_bin/acpidump (included)"
|
||||||
|
else
|
||||||
|
echo "vendor tool: acpica_bin/acpidump (not found, skipped)"
|
||||||
|
fi
|
||||||
|
|
||||||
# --- NVIDIA kernel modules and userspace libs ---
|
# --- NVIDIA kernel modules and userspace libs ---
|
||||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||||
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
sh "${BUILDER_DIR}/build-nvidia-module.sh" "${NVIDIA_DRIVER_VERSION}" "${DIST_DIR}/cache" "${DEBIAN_KERNEL_ABI}" "${BEE_NVIDIA_MODULE_FLAVOR}"
|
||||||
|
|
||||||
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
KVER="${DEBIAN_KERNEL_ABI}-amd64"
|
||||||
NVIDIA_CACHE="${DIST_DIR}/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
NVIDIA_CACHE="${DIST_DIR}/cache/nvidia-${BEE_NVIDIA_MODULE_FLAVOR}-${NVIDIA_DRIVER_VERSION}-${KVER}"
|
||||||
|
|
||||||
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
# Inject .ko files into overlay at /usr/local/lib/nvidia/
|
||||||
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
OVERLAY_KMOD_DIR="${OVERLAY_STAGE_DIR}/usr/local/lib/nvidia"
|
||||||
@@ -1405,9 +1525,9 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
|
|
||||||
# --- build / download NCCL ---
|
# --- build / download NCCL ---
|
||||||
run_step "download NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" "50-nccl" \
|
run_step "download NCCL ${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}" "50-nccl" \
|
||||||
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}" "${NCCL_SHA256:-}"
|
sh "${BUILDER_DIR}/build-nccl.sh" "${NCCL_VERSION}" "${NCCL_CUDA_VERSION}" "${DIST_DIR}/cache" "${NCCL_SHA256:-}"
|
||||||
|
|
||||||
NCCL_CACHE="${DIST_DIR}/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
NCCL_CACHE="${DIST_DIR}/cache/nccl-${NCCL_VERSION}+cuda${NCCL_CUDA_VERSION}"
|
||||||
|
|
||||||
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
# Inject libnccl.so.* into overlay alongside other NVIDIA userspace libs
|
||||||
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
cp "${NCCL_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/"
|
||||||
@@ -1423,19 +1543,19 @@ if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
|||||||
"${NCCL_TESTS_VERSION}" \
|
"${NCCL_TESTS_VERSION}" \
|
||||||
"${NCCL_VERSION}" \
|
"${NCCL_VERSION}" \
|
||||||
"${NCCL_CUDA_VERSION}" \
|
"${NCCL_CUDA_VERSION}" \
|
||||||
"${DIST_DIR}" \
|
"${DIST_DIR}/cache" \
|
||||||
"${NVCC_VERSION}" \
|
"${NVCC_VERSION}" \
|
||||||
"${DEBIAN_VERSION}"
|
"${DEBIAN_VERSION}"
|
||||||
|
|
||||||
NCCL_TESTS_CACHE="${DIST_DIR}/nccl-tests-${NCCL_TESTS_VERSION}"
|
NCCL_TESTS_CACHE="${DIST_DIR}/cache/nccl-tests-${NCCL_TESTS_VERSION}"
|
||||||
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
cp "${NCCL_TESTS_CACHE}/bin/all_reduce_perf" "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/all_reduce_perf"
|
||||||
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
cp "${NCCL_TESTS_CACHE}/lib/"* "${OVERLAY_STAGE_DIR}/usr/lib/" 2>/dev/null || true
|
||||||
echo "=== all_reduce_perf injected ==="
|
echo "=== all_reduce_perf injected ==="
|
||||||
|
|
||||||
run_step "build john jumbo ${JOHN_JUMBO_COMMIT}" "70-john" \
|
run_step "build john jumbo ${JOHN_JUMBO_COMMIT}" "70-john" \
|
||||||
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}"
|
sh "${BUILDER_DIR}/build-john.sh" "${JOHN_JUMBO_COMMIT}" "${DIST_DIR}/cache"
|
||||||
JOHN_CACHE="${DIST_DIR}/john-${JOHN_JUMBO_COMMIT}"
|
JOHN_CACHE="${DIST_DIR}/cache/john-${JOHN_JUMBO_COMMIT}"
|
||||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john"
|
||||||
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
|
rsync -a --delete "${JOHN_CACHE}/run/" "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/john/run/"
|
||||||
ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"
|
ln -sfn ../lib/bee/john/run/john "${OVERLAY_STAGE_DIR}/usr/local/bin/john"
|
||||||
@@ -1467,8 +1587,10 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
cat > "${OVERLAY_STAGE_DIR}/etc/bee-release" <<EOF
|
||||||
BEE_ISO_VERSION=${ISO_VERSION_EFFECTIVE}
|
BEE_VERSION=${PROJECT_VERSION_EFFECTIVE}
|
||||||
BEE_AUDIT_VERSION=${AUDIT_VERSION_EFFECTIVE}
|
export BEE_VERSION
|
||||||
|
BEE_ISO_VERSION=${PROJECT_VERSION_EFFECTIVE}
|
||||||
|
BEE_AUDIT_VERSION=${PROJECT_VERSION_EFFECTIVE}
|
||||||
BEE_BUILD_VARIANT=${BUILD_VARIANT}
|
BEE_BUILD_VARIANT=${BUILD_VARIANT}
|
||||||
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
BEE_GPU_VENDOR=${BEE_GPU_VENDOR}
|
||||||
BUILD_DATE=${BUILD_DATE}
|
BUILD_DATE=${BUILD_DATE}
|
||||||
@@ -1561,6 +1683,7 @@ if ! needs_full_build; then
|
|||||||
fast_path_rebuild_iso
|
fast_path_rebuild_iso
|
||||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||||
validate_iso_live_boot_entries "$ISO_RAW"
|
validate_iso_live_boot_entries "$ISO_RAW"
|
||||||
|
validate_iso_grub_assets "$ISO_RAW"
|
||||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
echo ""
|
echo ""
|
||||||
@@ -1575,15 +1698,30 @@ echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
|
|||||||
|
|
||||||
# Export for auto/config
|
# Export for auto/config
|
||||||
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
|
||||||
export BEE_GPU_VENDOR_UPPER
|
# ISO 9660 volume ID is limited to 32 characters; truncate the version token to fit.
|
||||||
|
_vol_prefix="EASY_BEE_${BEE_GPU_VENDOR_UPPER}_V"
|
||||||
|
_max_token=$(( 32 - ${#_vol_prefix} ))
|
||||||
|
_vol_token="$(printf '%s' "${ISO_VERSION_LABEL_TOKEN}" | cut -c1-${_max_token})"
|
||||||
|
BEE_ISO_VOLUME="${_vol_prefix}${_vol_token}"
|
||||||
|
unset _vol_prefix _max_token _vol_token
|
||||||
|
export BEE_GPU_VENDOR_UPPER BEE_ISO_VOLUME
|
||||||
|
|
||||||
cd "${LB_DIR}"
|
cd "${LB_DIR}"
|
||||||
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
run_step_sh "live-build clean" "80-lb-clean" "lb clean --all 2>&1 | tail -3"
|
||||||
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
run_step_sh "live-build config" "81-lb-config" "lb config 2>&1 | tail -5"
|
||||||
dump_memtest_debug "pre-build" "${LB_DIR}"
|
dump_memtest_debug "pre-build" "${LB_DIR}"
|
||||||
|
export MKSQUASHFS_OPTIONS="-no-xattrs"
|
||||||
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
run_step_sh "live-build build" "90-lb-build" "lb build 2>&1"
|
||||||
echo "=== enforcing canonical bootloader assets ==="
|
echo "=== enforcing canonical bootloader assets ==="
|
||||||
enforce_live_build_bootloader_assets "${LB_DIR}"
|
enforce_live_build_bootloader_assets "${LB_DIR}"
|
||||||
|
# Rename lb's default filesystem.squashfs to the versioned filename so the
|
||||||
|
# ISO contains a version-stamped squashfs (e.g. filesystem-v10.15.squashfs).
|
||||||
|
_std_sq="${LB_DIR}/binary/live/filesystem.squashfs"
|
||||||
|
_ver_sq="${LB_DIR}/binary/live/${SQUASHFS_FILENAME}"
|
||||||
|
if [ -f "${_std_sq}" ] && [ "${_std_sq}" != "${_ver_sq}" ]; then
|
||||||
|
mv "${_std_sq}" "${_ver_sq}"
|
||||||
|
echo "=== squashfs renamed: filesystem.squashfs → ${SQUASHFS_FILENAME} ==="
|
||||||
|
fi
|
||||||
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
reset_live_build_stage "${LB_DIR}" "binary_checksums"
|
||||||
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
reset_live_build_stage "${LB_DIR}" "binary_iso"
|
||||||
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
reset_live_build_stage "${LB_DIR}" "binary_zsync"
|
||||||
@@ -1615,6 +1753,7 @@ if [ -f "$ISO_RAW" ]; then
|
|||||||
fi
|
fi
|
||||||
validate_iso_memtest "$ISO_RAW"
|
validate_iso_memtest "$ISO_RAW"
|
||||||
validate_iso_live_boot_entries "$ISO_RAW"
|
validate_iso_live_boot_entries "$ISO_RAW"
|
||||||
|
validate_iso_grub_assets "$ISO_RAW"
|
||||||
validate_iso_nvidia_runtime "$ISO_RAW"
|
validate_iso_nvidia_runtime "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
touch "${FULL_BUILD_MARKER}"
|
touch "${FULL_BUILD_MARKER}"
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
set default=0
|
set default=1
|
||||||
set timeout=5
|
set timeout=10
|
||||||
|
set color_normal=yellow/black
|
||||||
|
set color_highlight=white/brown
|
||||||
|
|
||||||
if [ x$feature_default_font_path = xy ] ; then
|
if [ x$feature_default_font_path = xy ] ; then
|
||||||
font=unicode
|
font=unicode
|
||||||
@@ -8,7 +10,7 @@ else
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if loadfont $font ; then
|
if loadfont $font ; then
|
||||||
set gfxmode=1920x1080,1280x1024,auto
|
set gfxmode=1280x1024,auto
|
||||||
set gfxpayload=keep
|
set gfxpayload=keep
|
||||||
insmod efi_gop
|
insmod efi_gop
|
||||||
insmod efi_uga
|
insmod efi_uga
|
||||||
@@ -26,6 +28,3 @@ insmod gfxterm
|
|||||||
|
|
||||||
terminal_input console serial
|
terminal_input console serial
|
||||||
terminal_output gfxterm serial
|
terminal_output gfxterm serial
|
||||||
|
|
||||||
insmod tga
|
|
||||||
source /boot/grub/theme.cfg
|
|
||||||
|
|||||||
@@ -1,15 +1,25 @@
|
|||||||
source /boot/grub/config.cfg
|
source /boot/grub/config.cfg
|
||||||
|
|
||||||
menuentry "EASY-BEE" {
|
menuentry "EASY-BEE v@VERSION@" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
menuentry "EASY-BEE -- load to RAM (toram)" {
|
menuentry "EASY-BEE v@VERSION@ -- load to RAM (toram)" {
|
||||||
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram bee.display=kms bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
initrd @INITRD_LIVE@
|
initrd @INITRD_LIVE@
|
||||||
}
|
}
|
||||||
|
|
||||||
|
menuentry "EASY-BEE v@VERSION@ -- no GUI / no X11" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ nomodeset bee.gui=off bee.nvidia.mode=gsp-off pci=realloc net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
menuentry "*** WIPE ALL DISKS (irreversible!) ***" {
|
||||||
|
linux @KERNEL_LIVE@ @APPEND_LIVE@ toram nomodeset bee.gui=off bee.wipe=all net.ifnames=0 biosdevname=0
|
||||||
|
initrd @INITRD_LIVE@
|
||||||
|
}
|
||||||
|
|
||||||
if [ "${grub_platform}" = "efi" ]; then
|
if [ "${grub_platform}" = "efi" ]; then
|
||||||
menuentry "Memory Test (memtest86+)" {
|
menuentry "Memory Test (memtest86+)" {
|
||||||
|
|||||||
@@ -5,13 +5,6 @@ title-text: ""
|
|||||||
message-font: "Unifont Regular 16"
|
message-font: "Unifont Regular 16"
|
||||||
terminal-font: "Unifont Regular 16"
|
terminal-font: "Unifont Regular 16"
|
||||||
|
|
||||||
#bee logo - centered, upper third of screen
|
|
||||||
+ image {
|
|
||||||
top = 4%
|
|
||||||
left = 50%-200
|
|
||||||
file = "bee-logo.tga"
|
|
||||||
}
|
|
||||||
|
|
||||||
#help bar at the bottom
|
#help bar at the bottom
|
||||||
+ label {
|
+ label {
|
||||||
top = 100%-50
|
top = 100%-50
|
||||||
|
|||||||
@@ -1,16 +1,22 @@
|
|||||||
label live-@FLAVOUR@-normal
|
label live-@FLAVOUR@-normal
|
||||||
menu label ^EASY-BEE
|
menu label ^EASY-BEE v@VERSION@
|
||||||
menu default
|
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-toram
|
label live-@FLAVOUR@-toram
|
||||||
menu label EASY-BEE (^load to RAM)
|
menu label EASY-BEE v@VERSION@ (^load to RAM)
|
||||||
|
menu default
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
append @APPEND_LIVE@ toram nomodeset bee.nvidia.mode=normal net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
|
label live-@FLAVOUR@-console
|
||||||
|
menu label EASY-BEE v@VERSION@ (^no GUI / no X11)
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ nomodeset bee.gui=off bee.nvidia.mode=gsp-off net.ifnames=0 biosdevname=0 mitigations=off transparent_hugepage=always numa_balancing=disable pcie_aspm=off intel_idle.max_cstate=1 processor.max_cstate=1 nowatchdog nosoftlockup
|
||||||
|
|
||||||
label live-@FLAVOUR@-gsp-off
|
label live-@FLAVOUR@-gsp-off
|
||||||
menu label EASY-BEE (^NVIDIA GSP=off)
|
menu label EASY-BEE (^NVIDIA GSP=off)
|
||||||
linux @LINUX@
|
linux @LINUX@
|
||||||
@@ -35,6 +41,12 @@ label live-@FLAVOUR@-failsafe
|
|||||||
initrd @INITRD@
|
initrd @INITRD@
|
||||||
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
append @APPEND_LIVE@ nomodeset bee.nvidia.mode=gsp-off noapic noapm nodma nomce nolapic nosmp vga=normal net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
|
label wipe-disks
|
||||||
|
menu label *** WIPE ALL DISKS (irreversible!) ***
|
||||||
|
linux @LINUX@
|
||||||
|
initrd @INITRD@
|
||||||
|
append @APPEND_LIVE@ toram nomodeset bee.gui=off bee.wipe=all net.ifnames=0 biosdevname=0
|
||||||
|
|
||||||
label memtest
|
label memtest
|
||||||
menu label ^Memory Test (memtest86+)
|
menu label ^Memory Test (memtest86+)
|
||||||
linux /boot/memtest86+x64.bin
|
linux /boot/memtest86+x64.bin
|
||||||
|
|||||||
@@ -67,7 +67,9 @@ chmod +x /usr/local/bin/bee-log-run 2>/dev/null || true
|
|||||||
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-selfheal 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-boot-status 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-install 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-gui-gate 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-remount-medium 2>/dev/null || true
|
||||||
|
chmod +x /usr/local/bin/bee-check-nvswitch 2>/dev/null || true
|
||||||
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-nvidia-load 2>/dev/null || true
|
||||||
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
chmod +x /usr/local/bin/bee-gpu-burn 2>/dev/null || true
|
||||||
|
|||||||
57
iso/builder/config/hooks/normal/9012-wipe.hook.chroot
Executable file
57
iso/builder/config/hooks/normal/9012-wipe.hook.chroot
Executable file
@@ -0,0 +1,57 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# 9012-wipe.hook.chroot
|
||||||
|
#
|
||||||
|
# Adds bee-initramfs-wipe to the initramfs so that selecting the
|
||||||
|
# "WIPE ALL DISKS" boot menu entry runs the wipe tool before squashfs
|
||||||
|
# is mounted — i.e. it works even when live boot fails.
|
||||||
|
#
|
||||||
|
# Two files are installed inside the chroot:
|
||||||
|
# /etc/initramfs-tools/hooks/bee-wipe — copies binaries into initrd
|
||||||
|
# /etc/initramfs-tools/scripts/local-premount/bee-wipe — runs at boot
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
HOOK_DIR="/etc/initramfs-tools/hooks"
|
||||||
|
SCRIPT_DIR="/etc/initramfs-tools/scripts/local-premount"
|
||||||
|
|
||||||
|
mkdir -p "${HOOK_DIR}" "${SCRIPT_DIR}"
|
||||||
|
|
||||||
|
# ── initramfs hook: copy binaries ────────────────────────────────────────────
|
||||||
|
cat > "${HOOK_DIR}/bee-wipe" << 'EOF'
|
||||||
|
#!/bin/sh
|
||||||
|
PREREQ=""
|
||||||
|
prereqs() { echo "$PREREQ"; }
|
||||||
|
case "$1" in prereqs) prereqs; exit 0 ;; esac
|
||||||
|
|
||||||
|
. /usr/share/initramfs-tools/hook-functions
|
||||||
|
|
||||||
|
for bin in lsblk blkid blkdiscard blockdev; do
|
||||||
|
b=$(command -v "$bin" 2>/dev/null) && copy_exec "$b" /bin
|
||||||
|
done
|
||||||
|
|
||||||
|
[ -x /usr/sbin/nvme ] && copy_exec /usr/sbin/nvme /sbin
|
||||||
|
|
||||||
|
copy_exec /usr/local/bin/bee-initramfs-wipe /bin/bee-wipe
|
||||||
|
EOF
|
||||||
|
|
||||||
|
chmod +x "${HOOK_DIR}/bee-wipe"
|
||||||
|
|
||||||
|
# ── initramfs premount script: trigger on bee.wipe=all ───────────────────────
|
||||||
|
cat > "${SCRIPT_DIR}/bee-wipe" << 'EOF'
|
||||||
|
#!/bin/sh
|
||||||
|
PREREQ=""
|
||||||
|
prereqs() { echo "$PREREQ"; }
|
||||||
|
case "$1" in prereqs) prereqs; exit 0 ;; esac
|
||||||
|
|
||||||
|
grep -qw 'bee.wipe=all' /proc/cmdline 2>/dev/null || exit 0
|
||||||
|
exec /bin/bee-wipe
|
||||||
|
EOF
|
||||||
|
|
||||||
|
chmod +x "${SCRIPT_DIR}/bee-wipe"
|
||||||
|
|
||||||
|
echo "9012-wipe: installed initramfs hook and premount script"
|
||||||
|
|
||||||
|
KVER=$(ls /lib/modules | sort -V | tail -1)
|
||||||
|
echo "9012-wipe: rebuilding initramfs for kernel ${KVER}"
|
||||||
|
update-initramfs -u -k "${KVER}"
|
||||||
|
echo "9012-wipe: done"
|
||||||
37
iso/builder/config/hooks/normal/9998-strip-xattrs.hook.chroot
Executable file
37
iso/builder/config/hooks/normal/9998-strip-xattrs.hook.chroot
Executable file
@@ -0,0 +1,37 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# 9998-strip-xattrs.hook.chroot
|
||||||
|
#
|
||||||
|
# mksquashfs 4.5.1 (Debian bookworm) writes a non-INVALID xattr_id_table_start
|
||||||
|
# even with -no-xattrs when the source tree contains POSIX ACL xattrs set by
|
||||||
|
# dpkg/install-time. Linux 6.1 squashfs driver then fails with
|
||||||
|
# "unable to read xattr id index table" and aborts the mount.
|
||||||
|
#
|
||||||
|
# Strip all xattrs from the live chroot before mksquashfs sees the tree so the
|
||||||
|
# resulting squashfs has SQUASHFS_INVALID_BLK in xattr_id_table_start.
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
def strip(path):
|
||||||
|
try:
|
||||||
|
for attr in os.listxattr(path, follow_symlinks=False):
|
||||||
|
try:
|
||||||
|
os.removexattr(path, attr, follow_symlinks=False)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
removed = 0
|
||||||
|
for root, dirs, files in os.walk('/', topdown=True, followlinks=False):
|
||||||
|
for name in dirs + files:
|
||||||
|
p = os.path.join(root, name)
|
||||||
|
try:
|
||||||
|
attrs = os.listxattr(p, follow_symlinks=False)
|
||||||
|
if attrs:
|
||||||
|
strip(p)
|
||||||
|
removed += len(attrs)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
strip(root)
|
||||||
|
|
||||||
|
print(f"9998-strip-xattrs: removed xattrs from {removed} entries")
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
# AMD GPU firmware
|
# AMD GPU firmware
|
||||||
firmware-amd-graphics
|
firmware-amd-graphics
|
||||||
|
nvtop
|
||||||
|
|
||||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
# DCGM 4 is packaged per CUDA major. The image ships NVIDIA driver 590 with
|
||||||
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
# CUDA 13 userspace, so install the CUDA 13 build plus proprietary components
|
||||||
# explicitly.
|
# explicitly.
|
||||||
|
nvtop
|
||||||
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
nvidia-fabricmanager=%%NVIDIA_FABRICMANAGER_VERSION%%
|
||||||
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-cuda13=1:%%DCGM_VERSION%%
|
||||||
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
datacenter-gpu-manager-4-proprietary=1:%%DCGM_VERSION%%
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ exfat-fuse
|
|||||||
ntfs-3g
|
ntfs-3g
|
||||||
|
|
||||||
# Utilities
|
# Utilities
|
||||||
|
infiniband-diags
|
||||||
bash
|
bash
|
||||||
procps
|
procps
|
||||||
lsof
|
lsof
|
||||||
@@ -46,7 +47,6 @@ less
|
|||||||
vim-tiny
|
vim-tiny
|
||||||
mc
|
mc
|
||||||
htop
|
htop
|
||||||
nvtop
|
|
||||||
sudo
|
sudo
|
||||||
zstd
|
zstd
|
||||||
mstflint
|
mstflint
|
||||||
|
|||||||
@@ -1,11 +1,4 @@
|
|||||||
|
EASY BEE
|
||||||
███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
|
||||||
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
|
||||||
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
|
||||||
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
|
||||||
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
|
||||||
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝
|
|
||||||
|
|
||||||
Hardware Audit LiveCD
|
Hardware Audit LiveCD
|
||||||
Build: %%BUILD_INFO%%
|
Build: %%BUILD_INFO%%
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: hardware audit
|
Description=Bee: hardware audit
|
||||||
After=bee-preflight.service bee-network.service bee-nvidia.service bee-blackbox.service
|
After=bee-preflight.service bee-nvidia.service bee-blackbox.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: bring up network interfaces via DHCP
|
Description=Bee: bring up network interfaces via DHCP
|
||||||
After=local-fs.target bee-blackbox.service
|
After=bee-web.service bee-audit.service
|
||||||
Before=network-online.target bee-audit.service
|
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
|
|||||||
@@ -2,6 +2,8 @@
|
|||||||
Description=Bee: load NVIDIA kernel modules and create device nodes
|
Description=Bee: load NVIDIA kernel modules and create device nodes
|
||||||
After=local-fs.target udev.service bee-blackbox.service
|
After=local-fs.target udev.service bee-blackbox.service
|
||||||
Before=bee-audit.service
|
Before=bee-audit.service
|
||||||
|
# Skip silently if bee-nvidia-load is absent (non-nvidia builds).
|
||||||
|
ConditionPathExists=/usr/local/bin/bee-nvidia-load
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=Bee: runtime preflight self-check
|
Description=Bee: runtime preflight self-check
|
||||||
After=bee-network.service bee-nvidia.service bee-blackbox.service
|
After=bee-nvidia.service bee-blackbox.service
|
||||||
Before=bee-audit.service
|
Before=bee-audit.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ Description=Bee: run self-heal checks periodically
|
|||||||
|
|
||||||
[Timer]
|
[Timer]
|
||||||
OnBootSec=45sec
|
OnBootSec=45sec
|
||||||
OnUnitActiveSec=60sec
|
OnUnitActiveSec=3min
|
||||||
AccuracySec=15sec
|
AccuracySec=15sec
|
||||||
Unit=bee-selfheal.service
|
Unit=bee-selfheal.service
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,2 @@
|
|||||||
|
[Service]
|
||||||
|
ExecCondition=/usr/local/bin/bee-gui-gate
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
[Unit]
|
||||||
|
# bee-nvidia.service loads the NVIDIA kernel modules; fabricmanager must wait
|
||||||
|
# for them to be fully initialized before attempting to open /dev/nvidiactl.
|
||||||
|
After=bee-nvidia.service
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
# Skip fabricmanager on systems without NVSwitch hardware.
|
||||||
|
# ExecCondition exits 1-254 → unit is silently skipped (inactive, not failed).
|
||||||
|
ExecCondition=/usr/local/bin/bee-check-nvswitch
|
||||||
@@ -3,8 +3,14 @@
|
|||||||
# Shows live service status until all bee services are done or failed,
|
# Shows live service status until all bee services are done or failed,
|
||||||
# then exits so getty can show the login prompt.
|
# then exits so getty can show the login prompt.
|
||||||
|
|
||||||
CRITICAL="bee-preflight bee-nvidia bee-audit"
|
GPU_VENDOR="$(cat /etc/bee-gpu-vendor 2>/dev/null || echo nvidia)"
|
||||||
ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
|
if [ "$GPU_VENDOR" = "nvidia" ]; then
|
||||||
|
CRITICAL="bee-preflight bee-nvidia bee-audit"
|
||||||
|
ALL="bee-sshsetup ssh bee-network bee-nvidia bee-preflight bee-audit bee-web"
|
||||||
|
else
|
||||||
|
CRITICAL="bee-preflight bee-audit"
|
||||||
|
ALL="bee-sshsetup ssh bee-network bee-preflight bee-audit bee-web"
|
||||||
|
fi
|
||||||
|
|
||||||
svc_state() { systemctl is-active "$1.service" 2>/dev/null || echo "inactive"; }
|
svc_state() { systemctl is-active "$1.service" 2>/dev/null || echo "inactive"; }
|
||||||
|
|
||||||
@@ -51,12 +57,7 @@ while true; do
|
|||||||
printf '\033[H\033[2J'
|
printf '\033[H\033[2J'
|
||||||
|
|
||||||
printf '\n'
|
printf '\n'
|
||||||
printf ' \033[33m███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗\033[0m\n'
|
printf ' \033[33mEASY BEE\033[0m\n'
|
||||||
printf ' \033[33m██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝\033[0m\n'
|
|
||||||
printf ' \033[33m█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗\033[0m\n'
|
|
||||||
printf ' \033[33m██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝\033[0m\n'
|
|
||||||
printf ' \033[33m███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗\033[0m\n'
|
|
||||||
printf ' \033[33m╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝\033[0m\n'
|
|
||||||
printf ' Hardware Audit LiveCD\n'
|
printf ' Hardware Audit LiveCD\n'
|
||||||
printf '\n'
|
printf '\n'
|
||||||
|
|
||||||
|
|||||||
4
iso/overlay/usr/local/bin/bee-check-nvswitch
Normal file
4
iso/overlay/usr/local/bin/bee-check-nvswitch
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# Exit 0 if NVSwitch hardware is detected; exit 1 to skip fabricmanager on non-NVSwitch systems.
|
||||||
|
# NVSwitch appears in lspci as vendor 10de, class 0680 (Bridge, Other).
|
||||||
|
lspci -Dn 2>/dev/null | awk '$2 == "0680:" && $3 ~ /^10de:/ { found=1; exit } END { exit(found ? 0 : 1) }'
|
||||||
27
iso/overlay/usr/local/bin/bee-gui-gate
Executable file
27
iso/overlay/usr/local/bin/bee-gui-gate
Executable file
@@ -0,0 +1,27 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# bee-gui-gate — skip starting the local GUI when bee.gui=off is set.
|
||||||
|
|
||||||
|
set -eu
|
||||||
|
|
||||||
|
cmdline_param() {
|
||||||
|
key="$1"
|
||||||
|
for token in $(cat /proc/cmdline 2>/dev/null); do
|
||||||
|
case "$token" in
|
||||||
|
"$key"=*)
|
||||||
|
echo "${token#*=}"
|
||||||
|
return 0
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
mode="$(cmdline_param bee.gui || true)"
|
||||||
|
case "${mode}" in
|
||||||
|
off|false|0|tty|console|text|nogui)
|
||||||
|
echo "bee-gui-gate: bee.gui=${mode}; skipping lightdm"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
exit 0
|
||||||
166
iso/overlay/usr/local/bin/bee-initramfs-wipe
Executable file
166
iso/overlay/usr/local/bin/bee-initramfs-wipe
Executable file
@@ -0,0 +1,166 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
# bee-initramfs-wipe — interactive disk wipe running entirely in the initramfs.
|
||||||
|
# Triggered by bee.wipe=all on the kernel cmdline (via local-premount hook).
|
||||||
|
# Works before squashfs is mounted, so it runs even when live boot fails.
|
||||||
|
|
||||||
|
RED='\033[1;31m'
|
||||||
|
YEL='\033[1;33m'
|
||||||
|
GRN='\033[1;32m'
|
||||||
|
CYN='\033[1;36m'
|
||||||
|
NC='\033[0m'
|
||||||
|
|
||||||
|
p() { printf '%b\n' "$*"; }
|
||||||
|
pp() { printf '%b' "$*"; }
|
||||||
|
|
||||||
|
banner() {
|
||||||
|
p ""
|
||||||
|
p "${RED}╔══════════════════════════════════════════════════════════╗${NC}"
|
||||||
|
p "${RED}║ BEE DRIVE WIPE — initramfs stage ║${NC}"
|
||||||
|
p "${RED}╚══════════════════════════════════════════════════════════╝${NC}"
|
||||||
|
p ""
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── find boot device ─────────────────────────────────────────────────────────
|
||||||
|
# boot_dev — print the whole-disk device node that carries the live medium.
#
# Reads the live-media-label=<label> token from /proc/cmdline (the last
# occurrence wins), resolves it with `blkid -L`, and reduces a partition node
# to its parent disk. Prints nothing when the token is absent or the label
# cannot be resolved.
boot_dev() {
    local label token
    for token in $(cat /proc/cmdline 2>/dev/null); do
        case "$token" in
            live-media-label=*) label="${token#*=}" ;;
        esac
    done
    [ -z "$label" ] && return

    local dev
    dev=$(blkid -L "$label" 2>/dev/null) || return

    # Reduce a partition node to its disk. Device families whose *disk* name
    # already ends in a digit (nvme0n1, mmcblk0, loop0, sr0) use a "p<N>"
    # partition suffix; blindly stripping trailing digits from them would
    # corrupt a whole-disk node (nvme0n1 -> nvme0n) and the boot medium would
    # then no longer match the /dev/<name> comparison done by list_disks.
    case "$dev" in
        /dev/nvme*n*p[0-9]*|/dev/mmcblk*p[0-9]*|/dev/loop*p[0-9]*)
            # /dev/nvme0n1p1 -> /dev/nvme0n1, /dev/mmcblk0p2 -> /dev/mmcblk0
            echo "${dev%p[0-9]*}"
            ;;
        /dev/nvme*|/dev/mmcblk*|/dev/loop*|/dev/sr*)
            # Already a whole device; keep the trailing digit.
            echo "$dev"
            ;;
        *)
            # /dev/sdb1 -> /dev/sdb (a bare /dev/sdb passes through unchanged)
            echo "$dev" | sed 's/[0-9]\+$//'
            ;;
    esac
}
|
||||||
|
|
||||||
|
# ── enumerate candidate disks ─────────────────────────────────────────────────
|
||||||
|
list_disks() {
|
||||||
|
local boot
|
||||||
|
boot=$(boot_dev)
|
||||||
|
|
||||||
|
lsblk -d -n -o NAME,TYPE,SIZE,MODEL 2>/dev/null | while read -r name type size model; do
|
||||||
|
[ "$type" = "disk" ] || continue
|
||||||
|
[ "$size" = "0B" ] && continue
|
||||||
|
local dev="/dev/$name"
|
||||||
|
[ "$dev" = "$boot" ] && continue
|
||||||
|
printf '%s\t%s\t%s\n' "$dev" "$size" "${model:-}"
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
# ── wipe one disk ─────────────────────────────────────────────────────────────
|
||||||
|
# wipe_one DEV — erase a single whole-disk device, trying the strongest
# method first:
#   1. NVMe devices: `nvme format --ses=1`
#   2. any device:   `blkdiscard -f`
#   3. fallback:     zero the first and the last 32 MiB with dd
# Returns 0 on success, 1 when the dd fallback reported an error (the caller
# may ignore the status; the outcome is also printed).
wipe_one() {
    local dev="$1"
    p ""
    p "=== ${YEL}${dev}${NC} ==="

    if echo "$dev" | grep -q '^/dev/nvme'; then
        if nvme format --ses=1 "$dev" 2>&1; then
            p " ${GRN}nvme format OK${NC}"
            blockdev --flushbufs "$dev" 2>/dev/null || true
            return
        fi
        p " nvme format failed — falling back to blkdiscard"
    fi

    if blkdiscard -f "$dev" 2>&1; then
        p " ${GRN}blkdiscard OK${NC}"
        blockdev --flushbufs "$dev" 2>/dev/null || true
        return
    fi

    p " blkdiscard not supported — zeroing partition tables (HDD fallback)"
    local size_bytes mb32 sector skip failed
    size_bytes=$(blockdev --getsize64 "$dev" 2>/dev/null || echo 0)
    mb32=$(( 32 * 1024 * 1024 ))
    sector=512
    failed=0

    # Head of the disk: MBR / primary GPT / filesystem signatures.
    dd if=/dev/zero of="$dev" bs=4M count=8 conv=fsync status=progress 2>&1 || failed=1

    if [ "$size_bytes" -gt $(( mb32 * 2 )) ]; then
        # Tail of the disk: the backup GPT sits in the very last sectors, so
        # the zeroed window must end exactly at the end of the device. Seeking
        # in 512-byte units achieves that; seeking in 4 MiB units would round
        # the start down and leave up to 4 MiB at the end untouched whenever
        # the disk size is not a multiple of 4 MiB.
        # NOTE(review): assumes the reported size is a multiple of 512 bytes.
        skip=$(( (size_bytes - mb32) / sector ))
        dd if=/dev/zero of="$dev" bs="$sector" count=$(( mb32 / sector )) seek="$skip" conv=fsync status=progress 2>&1 || failed=1
    fi

    blockdev --flushbufs "$dev" 2>/dev/null || true

    # Do not claim success when dd failed: the operator must know the disk
    # may still carry a readable partition table.
    if [ "$failed" -ne 0 ]; then
        p " ${RED}dd reported errors — partition tables may NOT be fully zeroed${NC}"
        return 1
    fi
    p " ${GRN}done (partition tables zeroed)${NC}"
}
|
||||||
|
|
||||||
|
# ── main ──────────────────────────────────────────────────────────────────────
|
||||||
|
banner
|
||||||
|
|
||||||
|
BOOT=$(boot_dev)
|
||||||
|
[ -n "$BOOT" ] && p "Boot device (excluded): ${CYN}${BOOT}${NC}\n"
|
||||||
|
|
||||||
|
# build indexed list
|
||||||
|
i=0
|
||||||
|
DEVS=""
|
||||||
|
IFS='
|
||||||
|
'
|
||||||
|
for line in $(list_disks); do
|
||||||
|
i=$(( i + 1 ))
|
||||||
|
dev=$(echo "$line" | cut -f1)
|
||||||
|
size=$(echo "$line" | cut -f2)
|
||||||
|
model=$(echo "$line" | cut -f3)
|
||||||
|
DEVS="${DEVS}${i}:${dev}:${size}:${model}
|
||||||
|
"
|
||||||
|
printf " ${CYN}[%d]${NC} %-16s %8s %s\n" "$i" "$dev" "$size" "$model"
|
||||||
|
done
|
||||||
|
IFS='
|
||||||
|
'
|
||||||
|
|
||||||
|
if [ "$i" -eq 0 ]; then
|
||||||
|
p "\nNo physical disks found (boot device excluded)."
|
||||||
|
p "Dropping to shell — type 'exit' to continue boot."
|
||||||
|
exec /bin/sh
|
||||||
|
fi
|
||||||
|
|
||||||
|
p ""
|
||||||
|
pp "Enter numbers to wipe (space-separated), ${YEL}all${NC} for all, ${YEL}q${NC} to abort: "
|
||||||
|
read -r SELECTION
|
||||||
|
|
||||||
|
case "$SELECTION" in
|
||||||
|
q|Q|'') p "\nAborted."; exec /bin/sh ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
# resolve selection → list of devs
|
||||||
|
SELECTED=""
|
||||||
|
if [ "$SELECTION" = "all" ] || [ "$SELECTION" = "ALL" ]; then
|
||||||
|
SELECTED=$(echo "$DEVS" | grep -v '^$' | cut -d: -f2 | tr '\n' ' ')
|
||||||
|
else
|
||||||
|
for num in $SELECTION; do
|
||||||
|
match=$(echo "$DEVS" | grep "^${num}:" | cut -d: -f2)
|
||||||
|
if [ -z "$match" ]; then
|
||||||
|
p "${RED}Unknown index: ${num}${NC}"; exec /bin/sh
|
||||||
|
fi
|
||||||
|
SELECTED="${SELECTED}${match} "
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
SELECTED=$(echo "$SELECTED" | tr -s ' ' | sed 's/ $//')
|
||||||
|
|
||||||
|
p ""
|
||||||
|
p "Selected for wipe: ${YEL}${SELECTED}${NC}"
|
||||||
|
p "${RED}WARNING: This is IRREVERSIBLE. All data on the selected disks will be lost.${NC}"
|
||||||
|
p ""
|
||||||
|
pp "Type YES to confirm, anything else to abort: "
|
||||||
|
read -r CONFIRM
|
||||||
|
|
||||||
|
if [ "$CONFIRM" != "YES" ]; then
|
||||||
|
p "\nAborted — no disks were touched."
|
||||||
|
exec /bin/sh
|
||||||
|
fi
|
||||||
|
|
||||||
|
p "\nStarting wipe..."
|
||||||
|
for dev in $SELECTED; do
|
||||||
|
wipe_one "$dev"
|
||||||
|
done
|
||||||
|
|
||||||
|
sync
|
||||||
|
p ""
|
||||||
|
p "${GRN}=== All selected disks wiped and flushed. ===${NC}"
|
||||||
|
p ""
|
||||||
|
pp "Press Enter to reboot..."
|
||||||
|
read -r _
|
||||||
|
reboot
|
||||||
@@ -8,7 +8,7 @@
|
|||||||
# Layout (UEFI): GPT, /dev/sdX1=EFI 512MB vfat, /dev/sdX2=root ext4
|
# Layout (UEFI): GPT, /dev/sdX1=EFI 512MB vfat, /dev/sdX2=root ext4
|
||||||
# Layout (BIOS): MBR, /dev/sdX1=root ext4
|
# Layout (BIOS): MBR, /dev/sdX1=root ext4
|
||||||
#
|
#
|
||||||
# Squashfs source: /run/live/medium/live/filesystem.squashfs
|
# Squashfs sources: /run/live/medium/live/*.squashfs
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
@@ -62,9 +62,9 @@ for tool in parted mkfs.vfat mkfs.ext4 unsquashfs grub-install update-grub; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
SQUASHFS="/run/live/medium/live/filesystem.squashfs"
|
mapfile -t SQUASHFS_FILES < <(find /run/live/medium/live -maxdepth 1 -type f -name '*.squashfs' | sort)
|
||||||
if [ ! -f "$SQUASHFS" ]; then
|
if [ "${#SQUASHFS_FILES[@]}" -eq 0 ]; then
|
||||||
echo "ERROR: squashfs not found at $SQUASHFS" >&2
|
echo "ERROR: no squashfs files found under /run/live/medium/live" >&2
|
||||||
echo " The live medium may have been disconnected." >&2
|
echo " The live medium may have been disconnected." >&2
|
||||||
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
|
echo " Reconnect the disc and run: bee-remount-medium --wait" >&2
|
||||||
echo " Then re-run bee-install." >&2
|
echo " Then re-run bee-install." >&2
|
||||||
@@ -106,7 +106,10 @@ log "=== BEE DISK INSTALLER ==="
|
|||||||
log "Target device : $DEVICE"
|
log "Target device : $DEVICE"
|
||||||
log "Root partition: $PART_ROOT"
|
log "Root partition: $PART_ROOT"
|
||||||
[ "$UEFI" = "1" ] && log "EFI partition : $PART_EFI"
|
[ "$UEFI" = "1" ] && log "EFI partition : $PART_EFI"
|
||||||
log "Squashfs : $SQUASHFS ($(du -sh "$SQUASHFS" | cut -f1))"
|
log "Squashfs : ${#SQUASHFS_FILES[@]} layer(s)"
|
||||||
|
for sf in "${SQUASHFS_FILES[@]}"; do
|
||||||
|
log " - $sf ($(du -sh "$sf" | cut -f1))"
|
||||||
|
done
|
||||||
log "Log : $LOGFILE"
|
log "Log : $LOGFILE"
|
||||||
log ""
|
log ""
|
||||||
|
|
||||||
@@ -163,7 +166,9 @@ log " Mounted."
|
|||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
log "--- Step 5/7: Unpacking filesystem (this takes 10-20 minutes) ---"
|
||||||
log " Source: $SQUASHFS"
|
for sf in "${SQUASHFS_FILES[@]}"; do
|
||||||
|
log " Source: $sf"
|
||||||
|
done
|
||||||
log " Target: $MOUNT_ROOT"
|
log " Target: $MOUNT_ROOT"
|
||||||
|
|
||||||
# unsquashfs does not support resume, so retry the entire unpack step if the
|
# unsquashfs does not support resume, so retry the entire unpack step if the
|
||||||
@@ -177,9 +182,9 @@ while true; do
|
|||||||
fi
|
fi
|
||||||
[ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
|
[ "$UNPACK_ATTEMPTS" -gt 1 ] && log " Retry attempt $UNPACK_ATTEMPTS / $UNPACK_MAX ..."
|
||||||
|
|
||||||
# Re-check squashfs is reachable before each attempt
|
mapfile -t SQUASHFS_FILES < <(find /run/live/medium/live -maxdepth 1 -type f -name '*.squashfs' | sort)
|
||||||
if [ ! -f "$SQUASHFS" ]; then
|
if [ "${#SQUASHFS_FILES[@]}" -eq 0 ]; then
|
||||||
log " SOURCE LOST: $SQUASHFS not found."
|
log " SOURCE LOST: no squashfs files found under /run/live/medium/live."
|
||||||
log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
|
log " Reconnect the disc and run 'bee-remount-medium --wait' in another terminal,"
|
||||||
log " then press Enter here to retry."
|
log " then press Enter here to retry."
|
||||||
read -r _
|
read -r _
|
||||||
@@ -194,12 +199,17 @@ while true; do
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
UNPACK_OK=0
|
UNPACK_OK=0
|
||||||
unsquashfs -f -d "$MOUNT_ROOT" "$SQUASHFS" 2>&1 | \
|
for sf in "${SQUASHFS_FILES[@]}"; do
|
||||||
grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
|
log " Unpacking $(basename "$sf") ..."
|
||||||
while IFS= read -r line; do log " $line"; done || UNPACK_OK=$?
|
unsquashfs -f -d "$MOUNT_ROOT" "$sf" 2>&1 | \
|
||||||
|
grep -E '^\[|^inod|^created|^extract|^ERROR|failed' | \
|
||||||
|
while IFS= read -r line; do log " $line"; done || UNPACK_OK=$?
|
||||||
|
[ "$UNPACK_OK" -eq 0 ] || break
|
||||||
|
done
|
||||||
|
|
||||||
# Check squashfs is still reachable (gone = disc pulled during copy)
|
# Check squashfs is still reachable (gone = disc pulled during copy)
|
||||||
if [ ! -f "$SQUASHFS" ]; then
|
mapfile -t SQUASHFS_FILES < <(find /run/live/medium/live -maxdepth 1 -type f -name '*.squashfs' | sort)
|
||||||
|
if [ "${#SQUASHFS_FILES[@]}" -eq 0 ]; then
|
||||||
log " WARNING: source medium lost during unpack — will retry after remount."
|
log " WARNING: source medium lost during unpack — will retry after remount."
|
||||||
log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
|
log " Run 'bee-remount-medium --wait' in another terminal, then press Enter."
|
||||||
read -r _
|
read -r _
|
||||||
|
|||||||
@@ -1,8 +1,9 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# bee-network.sh — bring up all physical network interfaces via DHCP
|
# bee-network.sh — bring up all physical network interfaces via DHCP
|
||||||
# Unattended: runs silently, logs results, never blocks.
|
# Unattended: starts later in boot, runs quietly, and gives up after a bounded timeout.
|
||||||
|
|
||||||
LOG_PREFIX="bee-network"
|
LOG_PREFIX="bee-network"
|
||||||
|
DHCP_TIMEOUT_SECS=300
|
||||||
|
|
||||||
log() { echo "[$LOG_PREFIX] $*"; }
|
log() { echo "[$LOG_PREFIX] $*"; }
|
||||||
|
|
||||||
@@ -19,9 +20,50 @@ if command -v udevadm >/dev/null 2>&1; then
|
|||||||
udevadm settle --timeout=5 >/dev/null 2>&1 || log "WARN: udevadm settle timed out"
|
udevadm settle --timeout=5 >/dev/null 2>&1 || log "WARN: udevadm settle timed out"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# start_dhcp IFACE — bring IFACE up and try to obtain an IPv4 lease.
#
# Runs dhclient in one-shot mode under `timeout`, bounded by
# DHCP_TIMEOUT_SECS. Logs the carrier state, the acquired address (when one
# is visible on the interface) or the failure reason.
# Returns 0 when dhclient succeeded, 1 otherwise.
start_dhcp() {
    iface="$1"
    if ! ip link set "$iface" up; then
        log "WARN: could not bring up $iface"
        return 1
    fi

    carrier=$(cat "/sys/class/net/$iface/carrier" 2>/dev/null || true)
    if [ "$carrier" = "1" ]; then
        log "carrier detected on $iface"
    else
        log "carrier not detected on $iface"
    fi

    # Release any previous lease state before starting a fresh attempt.
    dhclient -r "$iface" >/dev/null 2>&1 || true

    # Capture the exit status directly. Reading $? after an `if ...; fi`
    # whose condition failed (and that has no else branch) always yields 0,
    # which would make the timeout case below unreachable.
    rc=0
    timeout "${DHCP_TIMEOUT_SECS}" dhclient -4 -q -1 "$iface" >/dev/null 2>&1 || rc=$?

    if [ "$rc" -eq 0 ]; then
        addr="$(ip -4 -o addr show dev "$iface" scope global 2>/dev/null | awk '{print $4}' | head -1)"
        if [ -n "$addr" ]; then
            log "DHCP lease acquired on $iface ($addr)"
        else
            log "DHCP lease acquired on $iface"
        fi
        return 0
    fi

    case "$rc" in
        124)
            # 124 is the status `timeout` returns when it had to kill the command.
            log "DHCP timed out on $iface after ${DHCP_TIMEOUT_SECS}s"
            ;;
        *)
            log "DHCP failed on $iface (exit $rc)"
            ;;
    esac
    dhclient -r "$iface" >/dev/null 2>&1 || true
    return 1
}
|
||||||
|
|
||||||
started_ifaces=""
|
started_ifaces=""
|
||||||
started_count=0
|
started_count=0
|
||||||
scan_pass=1
|
scan_pass=1
|
||||||
|
pids=""
|
||||||
|
pid_ifaces=""
|
||||||
|
|
||||||
# Some server NICs appear a bit later after module/firmware init. Do a small
|
# Some server NICs appear a bit later after module/firmware init. Do a small
|
||||||
# bounded rescan window without turning network bring-up into a boot blocker.
|
# bounded rescan window without turning network bring-up into a boot blocker.
|
||||||
@@ -34,22 +76,11 @@ while [ "$scan_pass" -le 3 ]; do
|
|||||||
*" $iface "*) continue ;;
|
*" $iface "*) continue ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
log "bringing up $iface"
|
log "starting DHCP on $iface (timeout ${DHCP_TIMEOUT_SECS}s)"
|
||||||
if ! ip link set "$iface" up; then
|
start_dhcp "$iface" &
|
||||||
log "WARN: could not bring up $iface"
|
pid="$!"
|
||||||
continue
|
pids="$pids $pid"
|
||||||
fi
|
pid_ifaces="$pid_ifaces $pid:$iface"
|
||||||
|
|
||||||
carrier=$(cat "/sys/class/net/$iface/carrier" 2>/dev/null || true)
|
|
||||||
if [ "$carrier" = "1" ]; then
|
|
||||||
log "carrier detected on $iface"
|
|
||||||
else
|
|
||||||
log "carrier not detected yet on $iface"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# DHCP in background — non-blocking, keep dhclient verbose output in the service log.
|
|
||||||
dhclient -4 -v -nw "$iface" &
|
|
||||||
log "DHCP started for $iface (pid $!)"
|
|
||||||
|
|
||||||
started_ifaces="$started_ifaces $iface"
|
started_ifaces="$started_ifaces $iface"
|
||||||
started_count=$((started_count + 1))
|
started_count=$((started_count + 1))
|
||||||
@@ -68,4 +99,15 @@ if [ "$started_count" -eq 0 ]; then
|
|||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log "done (interfaces started: $started_count)"
|
success_count=0
|
||||||
|
for pid_iface in $pid_ifaces; do
|
||||||
|
pid="${pid_iface%%:*}"
|
||||||
|
iface="${pid_iface#*:}"
|
||||||
|
if wait "$pid"; then
|
||||||
|
success_count=$((success_count + 1))
|
||||||
|
else
|
||||||
|
log "DHCP did not complete successfully on $iface"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
log "done (interfaces scanned: $started_count, leases acquired: $success_count)"
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
|
# bee-remount-medium — find and remount the live ISO medium to /run/live/medium
|
||||||
#
|
#
|
||||||
# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
|
# Run this after reconnecting the ISO source disc (USB/CD) if the live medium
|
||||||
# was lost and /run/live/medium/live/filesystem.squashfs is missing.
|
# was lost and /run/live/medium/live/*.squashfs are missing.
|
||||||
#
|
#
|
||||||
# Usage: bee-remount-medium [--wait]
|
# Usage: bee-remount-medium [--wait]
|
||||||
# --wait keep retrying every 5 seconds until the medium is found (useful
|
# --wait keep retrying every 5 seconds until the medium is found (useful
|
||||||
@@ -11,7 +11,7 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
MEDIUM_DIR="/run/live/medium"
|
MEDIUM_DIR="/run/live/medium"
|
||||||
SQUASHFS_REL="live/filesystem.squashfs"
|
SQUASHFS_GLOB="live/*.squashfs"
|
||||||
WAIT_MODE=0
|
WAIT_MODE=0
|
||||||
|
|
||||||
for arg in "$@"; do
|
for arg in "$@"; do
|
||||||
@@ -28,6 +28,10 @@ done
|
|||||||
log() { echo "[$(date +%H:%M:%S)] $*"; }
|
log() { echo "[$(date +%H:%M:%S)] $*"; }
|
||||||
die() { log "ERROR: $*" >&2; exit 1; }
|
die() { log "ERROR: $*" >&2; exit 1; }
|
||||||
|
|
||||||
|
if [ "$(id -u)" -ne 0 ]; then
|
||||||
|
die "bee-remount-medium must be run as root (use sudo or a root shell)"
|
||||||
|
fi
|
||||||
|
|
||||||
# Return all candidate block devices (optical + removable USB mass storage)
|
# Return all candidate block devices (optical + removable USB mass storage)
|
||||||
find_candidates() {
|
find_candidates() {
|
||||||
# CD/DVD drives
|
# CD/DVD drives
|
||||||
@@ -52,7 +56,7 @@ try_mount() {
|
|||||||
local tmpdir
|
local tmpdir
|
||||||
tmpdir=$(mktemp -d /tmp/bee-probe-XXXXXX)
|
tmpdir=$(mktemp -d /tmp/bee-probe-XXXXXX)
|
||||||
if mount -o ro "$dev" "$tmpdir" 2>/dev/null; then
|
if mount -o ro "$dev" "$tmpdir" 2>/dev/null; then
|
||||||
if [ -f "${tmpdir}/${SQUASHFS_REL}" ]; then
|
if find "${tmpdir}/live" -maxdepth 1 -type f -name '*.squashfs' 2>/dev/null | grep -q .; then
|
||||||
# Unmount probe mount and mount properly onto live path
|
# Unmount probe mount and mount properly onto live path
|
||||||
umount "$tmpdir" 2>/dev/null || true
|
umount "$tmpdir" 2>/dev/null || true
|
||||||
rmdir "$tmpdir" 2>/dev/null || true
|
rmdir "$tmpdir" 2>/dev/null || true
|
||||||
@@ -78,8 +82,9 @@ attempt() {
|
|||||||
for dev in $(find_candidates); do
|
for dev in $(find_candidates); do
|
||||||
log " Trying $dev ..."
|
log " Trying $dev ..."
|
||||||
if try_mount "$dev"; then
|
if try_mount "$dev"; then
|
||||||
local sq="${MEDIUM_DIR}/${SQUASHFS_REL}"
|
local count
|
||||||
log "SUCCESS: squashfs available at $sq ($(du -sh "$sq" | cut -f1))"
|
count=$(find "${MEDIUM_DIR}/live" -maxdepth 1 -type f -name '*.squashfs' 2>/dev/null | wc -l | tr -d ' ')
|
||||||
|
log "SUCCESS: ${count} squashfs layer(s) available under ${MEDIUM_DIR}/live"
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
@@ -96,5 +101,5 @@ if [ "$WAIT_MODE" = "1" ]; then
|
|||||||
sleep 5
|
sleep 5
|
||||||
done
|
done
|
||||||
else
|
else
|
||||||
attempt || die "No ISO medium with ${SQUASHFS_REL} found. Reconnect the disc and re-run, or use --wait."
|
attempt || die "No ISO medium with ${SQUASHFS_GLOB} found. Reconnect the disc and re-run, or use --wait."
|
||||||
fi
|
fi
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user