Compare commits
101 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fc9b446d2e | |||
|
|
ea68318744 | ||
|
|
518082c2e2 | ||
|
|
056dce0b98 | ||
|
|
24f2e65b6e | ||
|
|
7f27b9aa38 | ||
|
|
cf29131116 | ||
|
|
13e6324853 | ||
|
|
892ef6fb7d | ||
|
|
ce46a97975 | ||
|
|
258ecb3453 | ||
|
|
cbb0d1e522 | ||
|
|
bab941ccf1 | ||
|
|
b49c71a980 | ||
|
|
85d1acdaa3 | ||
|
|
a2d7513153 | ||
|
|
5b5d8609d3 | ||
|
|
e7442972d1 | ||
|
|
4c6daa1c5e | ||
|
|
e420888d71 | ||
|
|
8149360410 | ||
|
|
4262c5b798 | ||
|
|
b2e177af31 | ||
|
|
271dadda03 | ||
|
|
20766ccc76 | ||
|
|
966944d6d8 | ||
| ce6b1e0eb7 | |||
| 4066e842a9 | |||
| 7d2e904d14 | |||
| 2320925433 | |||
| e169a7722c | |||
| 74a3c65f64 | |||
| 884988cb2a | |||
| 963bc960ca | |||
| 4f6579e040 | |||
| dc07580adc | |||
|
|
87e78e230e | ||
|
|
805a3b277d | ||
|
|
5bc9bd7fb3 | ||
|
|
0939a647ea | ||
|
|
7640f20714 | ||
|
|
1593bf3e76 | ||
|
|
ae80d7711e | ||
|
|
ca78b9df65 | ||
|
|
5cafe63f33 | ||
|
|
b75e65bcb1 | ||
|
|
8d173175eb | ||
|
|
5cbde0448e | ||
|
|
49a09fde05 | ||
|
|
f3962422c8 | ||
|
|
ee36e3c711 | ||
|
|
cca3b21d35 | ||
|
|
75c33e073e | ||
| 7b4bcc745a | |||
| 42774d44a6 | |||
| 5dc022ddf8 | |||
| 6623e159f5 | |||
| bbd6d009f8 | |||
| 6c2b188ec9 | |||
| 14505ef24a | |||
| 4f20c9246d | |||
| eed157c2db | |||
| a2c8aea0df | |||
| b21f03cd26 | |||
| cac5b9c86e | |||
| b5d04ef045 | |||
| fcd64438ea | |||
| 0e39e7d960 | |||
|
|
58d6da0e4f | ||
|
|
7ce73e34a4 | ||
|
|
8a21809ade | ||
|
|
626763e31d | ||
|
|
0b8a2ff83f | ||
|
|
2c22b01fe3 | ||
|
|
ec89616585 | ||
|
|
c0dbbf96ad | ||
|
|
76484b123c | ||
|
|
8901596152 | ||
|
|
7c504e5056 | ||
|
|
333c44f3ba | ||
|
|
3bca821d3e | ||
|
|
3648e37a1e | ||
|
|
d109e08fab | ||
|
|
11d00b9442 | ||
|
|
6defa5ae15 | ||
|
|
c76658ed00 | ||
|
|
2163017a98 | ||
| 29179917c3 | |||
| be4b439804 | |||
| 749fc8a94d | |||
| 6112094d45 | |||
| e9a2bc9f9d | |||
|
|
7a8f884664 | ||
|
|
8bf8dfa45b | ||
|
|
6a22199aff | ||
|
|
ddb2bb5d1c | ||
|
|
aa284ae754 | ||
|
|
8512098174 | ||
|
|
6b5d22c194 | ||
|
|
a35e90a93e | ||
|
|
1ced81707f |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -1,5 +1,5 @@
|
||||
.env
|
||||
.DS_Store
|
||||
dist/
|
||||
iso/out/
|
||||
build-cache/
|
||||
audit/bee
|
||||
|
||||
@@ -2,6 +2,7 @@ package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -63,14 +64,20 @@ func run(args []string, stdout, stderr io.Writer) (exitCode int) {
|
||||
return runExport(args[1:], stdout, stderr)
|
||||
case "preflight":
|
||||
return runPreflight(args[1:], stdout, stderr)
|
||||
case "install-to-ram":
|
||||
return runInstallToRAM(args[1:], stdout, stderr)
|
||||
case "support-bundle":
|
||||
return runSupportBundle(args[1:], stdout, stderr)
|
||||
case "web":
|
||||
return runWeb(args[1:], stdout, stderr)
|
||||
case "blackbox":
|
||||
return runBlackbox(args[1:], stdout, stderr)
|
||||
case "sat":
|
||||
return runSAT(args[1:], stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark(args[1:], stdout, stderr)
|
||||
case "bee-worker":
|
||||
return runBeeWorker(args[1:], stdout, stderr)
|
||||
case "version", "--version", "-version":
|
||||
fmt.Fprintln(stdout, Version)
|
||||
return 0
|
||||
@@ -85,11 +92,14 @@ func printRootUsage(w io.Writer) {
|
||||
fmt.Fprintln(w, `bee commands:
|
||||
bee audit --runtime auto|local|livecd --output stdout|file:<path>
|
||||
bee preflight --output stdout|file:<path>
|
||||
bee install-to-ram
|
||||
bee export --target <device>
|
||||
bee support-bundle --output stdout|file:<path>
|
||||
bee web --listen :80 [--audit-path `+app.DefaultAuditJSONPath+`]
|
||||
bee blackbox --export-dir `+app.DefaultExportDir+` [--state-file `+app.DefaultBlackboxStatePath+`]
|
||||
bee sat nvidia|memory|storage|cpu [--duration <seconds>]
|
||||
bee benchmark nvidia [--profile standard|stability|overnight]
|
||||
bee bee-worker --export-dir `+app.DefaultExportDir+` --task-id TASK-001
|
||||
bee version
|
||||
bee help [command]`)
|
||||
}
|
||||
@@ -102,14 +112,20 @@ func runHelp(args []string, stdout, stderr io.Writer) int {
|
||||
return runExport([]string{"--help"}, stdout, stdout)
|
||||
case "preflight":
|
||||
return runPreflight([]string{"--help"}, stdout, stdout)
|
||||
case "install-to-ram":
|
||||
return runInstallToRAM([]string{"--help"}, stdout, stdout)
|
||||
case "support-bundle":
|
||||
return runSupportBundle([]string{"--help"}, stdout, stdout)
|
||||
case "web":
|
||||
return runWeb([]string{"--help"}, stdout, stdout)
|
||||
case "blackbox":
|
||||
return runBlackbox([]string{"--help"}, stdout, stdout)
|
||||
case "sat":
|
||||
return runSAT([]string{"--help"}, stdout, stderr)
|
||||
case "benchmark":
|
||||
return runBenchmark([]string{"--help"}, stdout, stderr)
|
||||
case "bee-worker":
|
||||
return runBeeWorker([]string{"--help"}, stdout, stderr)
|
||||
case "version":
|
||||
fmt.Fprintln(stdout, "usage: bee version")
|
||||
return 0
|
||||
@@ -241,6 +257,32 @@ func runPreflight(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runInstallToRAM(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("install-to-ram", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintln(stderr, "usage: bee install-to-ram")
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
|
||||
application := app.New(platform.New())
|
||||
logLine := func(s string) { fmt.Fprintln(stdout, s) }
|
||||
if err := application.RunInstallToRAM(context.Background(), logLine); err != nil {
|
||||
slog.Error("run install-to-ram", "err", err)
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runSupportBundle(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("support-bundle", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
@@ -335,6 +377,33 @@ func runWeb(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runBlackbox(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("blackbox", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with logs, SAT results, and support bundles")
|
||||
statePath := fs.String("state-file", app.DefaultBlackboxStatePath, "blackbox state file")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(stderr, "usage: bee blackbox [--export-dir %s] [--state-file %s]\n", app.DefaultExportDir, app.DefaultBlackboxStatePath)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
slog.Info("starting bee blackbox", "export_dir", *exportDir, "state_file", *statePath)
|
||||
if err := app.RunBlackbox(context.Background(), *exportDir, *statePath, platform.New()); err != nil && !errors.Is(err, context.Canceled) {
|
||||
slog.Error("run blackbox", "err", err)
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
func runSAT(args []string, stdout, stderr io.Writer) int {
|
||||
if len(args) == 0 {
|
||||
fmt.Fprintln(stderr, "usage: bee sat nvidia|memory|storage|cpu [--duration <seconds>]")
|
||||
@@ -462,6 +531,28 @@ func runBenchmark(args []string, stdout, stderr io.Writer) int {
|
||||
return 0
|
||||
}
|
||||
|
||||
func runBeeWorker(args []string, stdout, stderr io.Writer) int {
|
||||
fs := flag.NewFlagSet("bee-worker", flag.ContinueOnError)
|
||||
fs.SetOutput(stderr)
|
||||
exportDir := fs.String("export-dir", app.DefaultExportDir, "directory with task state and artifacts")
|
||||
taskID := fs.String("task-id", "", "task identifier, e.g. TASK-001")
|
||||
fs.Usage = func() {
|
||||
fmt.Fprintf(stderr, "usage: bee bee-worker --export-dir %s --task-id TASK-001\n", app.DefaultExportDir)
|
||||
fs.PrintDefaults()
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if err == flag.ErrHelp {
|
||||
return 0
|
||||
}
|
||||
return 2
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
fs.Usage()
|
||||
return 2
|
||||
}
|
||||
return webui.RunPersistedTask(*exportDir, *taskID, stdout, stderr)
|
||||
}
|
||||
|
||||
func parseBenchmarkIndexCSV(raw string) ([]int, error) {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
405
audit/internal/app/app_format.go
Normal file
405
audit/internal/app/app_format.go
Normal file
@@ -0,0 +1,405 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func hostnameOr(fallback string) string {
|
||||
hn, err := os.Hostname()
|
||||
if err != nil || strings.TrimSpace(hn) == "" {
|
||||
return fallback
|
||||
}
|
||||
return hn
|
||||
}
|
||||
|
||||
func sanitizeFilename(v string) string {
|
||||
var out []rune
|
||||
for _, r := range v {
|
||||
switch {
|
||||
case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9', r == '-', r == '_', r == '.':
|
||||
out = append(out, r)
|
||||
default:
|
||||
out = append(out, '-')
|
||||
}
|
||||
}
|
||||
if len(out) == 0 {
|
||||
return "unknown"
|
||||
}
|
||||
return string(out)
|
||||
}
|
||||
|
||||
func bodyOr(body, fallback string) string {
|
||||
body = strings.TrimSpace(body)
|
||||
if body == "" {
|
||||
return fallback
|
||||
}
|
||||
return body
|
||||
}
|
||||
|
||||
func trimPtr(value *string) string {
|
||||
if value == nil {
|
||||
return ""
|
||||
}
|
||||
return strings.TrimSpace(*value)
|
||||
}
|
||||
|
||||
func joinSortedKeys(values map[string]struct{}) string {
|
||||
if len(values) == 0 {
|
||||
return ""
|
||||
}
|
||||
keys := make([]string, 0, len(values))
|
||||
for key := range values {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
return strings.Join(keys, "/")
|
||||
}
|
||||
|
||||
func humanizeMB(totalMB int) string {
|
||||
if totalMB <= 0 {
|
||||
return ""
|
||||
}
|
||||
gb := float64(totalMB) / 1024.0
|
||||
if gb >= 1024.0 {
|
||||
tb := gb / 1024.0
|
||||
return fmt.Sprintf("%.1f TB", tb)
|
||||
}
|
||||
if gb == float64(int64(gb)) {
|
||||
return fmt.Sprintf("%.0f GB", gb)
|
||||
}
|
||||
return fmt.Sprintf("%.1f GB", gb)
|
||||
}
|
||||
|
||||
func humanizeGB(totalGB int) string {
|
||||
if totalGB <= 0 {
|
||||
return ""
|
||||
}
|
||||
tb := float64(totalGB) / 1024.0
|
||||
if tb >= 1.0 {
|
||||
return fmt.Sprintf("%.1f TB", tb)
|
||||
}
|
||||
return fmt.Sprintf("%d GB", totalGB)
|
||||
}
|
||||
|
||||
func parseKeyValueSummary(raw string) map[string]string {
|
||||
out := map[string]string{}
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
key, value, ok := strings.Cut(line, "=")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
out[strings.TrimSpace(key)] = strings.TrimSpace(value)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func firstNonEmpty(values ...string) string {
|
||||
for _, value := range values {
|
||||
value = strings.TrimSpace(value)
|
||||
if value != "" {
|
||||
return value
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func cleanSummaryKey(key string) string {
|
||||
idx := strings.Index(key, "-")
|
||||
if idx <= 0 {
|
||||
return key
|
||||
}
|
||||
prefix := key[:idx]
|
||||
for _, c := range prefix {
|
||||
if c < '0' || c > '9' {
|
||||
return key
|
||||
}
|
||||
}
|
||||
return key[idx+1:]
|
||||
}
|
||||
|
||||
func isGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
// Exclude Aspeed BMC VGA adapters (not compute GPUs).
|
||||
if dev.VendorID != nil && *dev.VendorID == collector.AspeedVendorID {
|
||||
return false
|
||||
}
|
||||
class := trimPtr(dev.DeviceClass)
|
||||
// AMD Instinct / Radeon compute GPUs always carry ProcessingAccelerator or DisplayController.
|
||||
// Do NOT match AMD vendor alone — CPU chipset PCIe devices share that vendor ID.
|
||||
if class == "VideoController" || class == "DisplayController" || class == "ProcessingAccelerator" {
|
||||
return true
|
||||
}
|
||||
// NVIDIA devices sometimes expose class values outside the standard GPU set.
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||
}
|
||||
|
||||
func formatSystemLine(board schema.HardwareBoard) string {
|
||||
model := strings.TrimSpace(strings.Join([]string{
|
||||
trimPtr(board.Manufacturer),
|
||||
trimPtr(board.ProductName),
|
||||
}, " "))
|
||||
serial := strings.TrimSpace(board.SerialNumber)
|
||||
switch {
|
||||
case model != "" && serial != "":
|
||||
return fmt.Sprintf("System: %s | S/N %s", model, serial)
|
||||
case model != "":
|
||||
return "System: " + model
|
||||
case serial != "":
|
||||
return "System S/N: " + serial
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
func formatCPULine(cpus []schema.HardwareCPU) string {
|
||||
if len(cpus) == 0 {
|
||||
return ""
|
||||
}
|
||||
modelCounts := map[string]int{}
|
||||
unknown := 0
|
||||
for _, cpu := range cpus {
|
||||
model := trimPtr(cpu.Model)
|
||||
if model == "" {
|
||||
unknown++
|
||||
continue
|
||||
}
|
||||
modelCounts[model]++
|
||||
}
|
||||
if len(modelCounts) == 1 && unknown == 0 {
|
||||
for model, count := range modelCounts {
|
||||
return fmt.Sprintf("CPU: %d x %s", count, model)
|
||||
}
|
||||
}
|
||||
parts := make([]string, 0, len(modelCounts)+1)
|
||||
if len(modelCounts) > 0 {
|
||||
keys := make([]string, 0, len(modelCounts))
|
||||
for key := range modelCounts {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
for _, key := range keys {
|
||||
parts = append(parts, fmt.Sprintf("%d x %s", modelCounts[key], key))
|
||||
}
|
||||
}
|
||||
if unknown > 0 {
|
||||
parts = append(parts, fmt.Sprintf("%d x unknown", unknown))
|
||||
}
|
||||
return "CPU: " + strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
func formatMemoryLine(dimms []schema.HardwareMemory) string {
|
||||
totalMB := 0
|
||||
present := 0
|
||||
types := map[string]struct{}{}
|
||||
for _, dimm := range dimms {
|
||||
if dimm.Present != nil && !*dimm.Present {
|
||||
continue
|
||||
}
|
||||
if dimm.SizeMB == nil || *dimm.SizeMB <= 0 {
|
||||
continue
|
||||
}
|
||||
present++
|
||||
totalMB += *dimm.SizeMB
|
||||
if value := trimPtr(dimm.Type); value != "" {
|
||||
types[value] = struct{}{}
|
||||
}
|
||||
}
|
||||
if totalMB == 0 {
|
||||
return ""
|
||||
}
|
||||
typeText := joinSortedKeys(types)
|
||||
line := fmt.Sprintf("Memory: %s", humanizeMB(totalMB))
|
||||
if typeText != "" {
|
||||
line += " " + typeText
|
||||
}
|
||||
if present > 0 {
|
||||
line += fmt.Sprintf(" (%d DIMMs)", present)
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
func formatStorageLine(disks []schema.HardwareStorage) string {
|
||||
count := 0
|
||||
totalGB := 0
|
||||
for _, disk := range disks {
|
||||
if disk.Present != nil && !*disk.Present {
|
||||
continue
|
||||
}
|
||||
count++
|
||||
if disk.SizeGB != nil && *disk.SizeGB > 0 {
|
||||
totalGB += *disk.SizeGB
|
||||
}
|
||||
}
|
||||
if count == 0 {
|
||||
return ""
|
||||
}
|
||||
line := fmt.Sprintf("Storage: %d drives", count)
|
||||
if totalGB > 0 {
|
||||
line += fmt.Sprintf(" / %s", humanizeGB(totalGB))
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
func formatGPULine(devices []schema.HardwarePCIeDevice) string {
|
||||
gpus := map[string]int{}
|
||||
for _, dev := range devices {
|
||||
if !isGPUDevice(dev) {
|
||||
continue
|
||||
}
|
||||
name := firstNonEmpty(trimPtr(dev.Model), trimPtr(dev.Manufacturer), "unknown")
|
||||
gpus[name]++
|
||||
}
|
||||
if len(gpus) == 0 {
|
||||
return ""
|
||||
}
|
||||
keys := make([]string, 0, len(gpus))
|
||||
for key := range gpus {
|
||||
keys = append(keys, key)
|
||||
}
|
||||
sort.Strings(keys)
|
||||
parts := make([]string, 0, len(keys))
|
||||
for _, key := range keys {
|
||||
parts = append(parts, fmt.Sprintf("%d x %s", gpus[key], key))
|
||||
}
|
||||
return "GPU: " + strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
func formatIPLine(list func() ([]platform.InterfaceInfo, error)) string {
|
||||
if list == nil {
|
||||
return ""
|
||||
}
|
||||
ifaces, err := list()
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
seen := map[string]struct{}{}
|
||||
var ips []string
|
||||
for _, iface := range ifaces {
|
||||
for _, ip := range iface.IPv4 {
|
||||
ip = strings.TrimSpace(ip)
|
||||
if ip == "" {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[ip]; ok {
|
||||
continue
|
||||
}
|
||||
seen[ip] = struct{}{}
|
||||
ips = append(ips, ip)
|
||||
}
|
||||
}
|
||||
if len(ips) == 0 {
|
||||
return ""
|
||||
}
|
||||
sort.Strings(ips)
|
||||
return "IP: " + strings.Join(ips, ", ")
|
||||
}
|
||||
|
||||
func formatSATDetail(raw string) string {
|
||||
var b strings.Builder
|
||||
kv := parseKeyValueSummary(raw)
|
||||
|
||||
if t, ok := kv["run_at_utc"]; ok {
|
||||
fmt.Fprintf(&b, "Run: %s\n\n", t)
|
||||
}
|
||||
|
||||
lines := strings.Split(raw, "\n")
|
||||
var stepKeys []string
|
||||
seenStep := map[string]bool{}
|
||||
for _, line := range lines {
|
||||
if idx := strings.Index(line, "_status="); idx >= 0 {
|
||||
key := line[:idx]
|
||||
if !seenStep[key] && key != "overall" {
|
||||
seenStep[key] = true
|
||||
stepKeys = append(stepKeys, key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for _, key := range stepKeys {
|
||||
status := kv[key+"_status"]
|
||||
display := cleanSummaryKey(key)
|
||||
switch status {
|
||||
case "OK":
|
||||
fmt.Fprintf(&b, "PASS %s\n", display)
|
||||
case "FAILED":
|
||||
fmt.Fprintf(&b, "FAIL %s\n", display)
|
||||
case "UNSUPPORTED":
|
||||
fmt.Fprintf(&b, "SKIP %s\n", display)
|
||||
default:
|
||||
fmt.Fprintf(&b, "? %s\n", display)
|
||||
}
|
||||
}
|
||||
|
||||
if overall, ok := kv["overall_status"]; ok {
|
||||
ok2 := kv["job_ok"]
|
||||
failed := kv["job_failed"]
|
||||
fmt.Fprintf(&b, "\nOverall: %s (ok=%s failed=%s)", overall, ok2, failed)
|
||||
}
|
||||
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
func formatSATSummary(label, raw string) string {
|
||||
values := parseKeyValueSummary(raw)
|
||||
var body strings.Builder
|
||||
fmt.Fprintf(&body, "%s:", label)
|
||||
if overall := firstNonEmpty(values["overall_status"], "UNKNOWN"); overall != "" {
|
||||
fmt.Fprintf(&body, " %s", overall)
|
||||
}
|
||||
if ok := firstNonEmpty(values["job_ok"], "0"); ok != "" {
|
||||
fmt.Fprintf(&body, " ok=%s", ok)
|
||||
}
|
||||
if failed := firstNonEmpty(values["job_failed"], "0"); failed != "" {
|
||||
fmt.Fprintf(&body, " failed=%s", failed)
|
||||
}
|
||||
if unsupported := firstNonEmpty(values["job_unsupported"], "0"); unsupported != "" && unsupported != "0" {
|
||||
fmt.Fprintf(&body, " unsupported=%s", unsupported)
|
||||
}
|
||||
if devices := strings.TrimSpace(values["devices"]); devices != "" {
|
||||
fmt.Fprintf(&body, "\nDevices: %s", devices)
|
||||
}
|
||||
return body.String()
|
||||
}
|
||||
|
||||
func latestSATSummaries() []string {
|
||||
patterns := []struct {
|
||||
label string
|
||||
prefix string
|
||||
}{
|
||||
{label: "NVIDIA SAT", prefix: "gpu-nvidia-"},
|
||||
{label: "NVIDIA Targeted Stress Validate (dcgmi diag targeted_stress)", prefix: "gpu-nvidia-targeted-stress-"},
|
||||
{label: "NVIDIA Max Compute Load (dcgmproftester)", prefix: "gpu-nvidia-compute-"},
|
||||
{label: "NVIDIA Targeted Power (dcgmi diag targeted_power)", prefix: "gpu-nvidia-targeted-power-"},
|
||||
{label: "NVIDIA Pulse Test (dcgmi diag pulse_test)", prefix: "gpu-nvidia-pulse-"},
|
||||
{label: "NVIDIA Interconnect Test (NCCL all_reduce_perf)", prefix: "gpu-nvidia-nccl-"},
|
||||
{label: "NVIDIA Bandwidth Test (NVBandwidth)", prefix: "gpu-nvidia-bandwidth-"},
|
||||
{label: "Memory SAT", prefix: "memory-"},
|
||||
{label: "Storage SAT", prefix: "storage-"},
|
||||
{label: "CPU SAT", prefix: "cpu-"},
|
||||
}
|
||||
var out []string
|
||||
for _, item := range patterns {
|
||||
matches, err := filepath.Glob(filepath.Join(DefaultSATBaseDir, item.prefix+"*/summary.txt"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
continue
|
||||
}
|
||||
sort.Strings(matches)
|
||||
raw, err := os.ReadFile(matches[len(matches)-1])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
out = append(out, formatSATSummary(item.label, string(raw)))
|
||||
}
|
||||
return out
|
||||
}
|
||||
76
audit/internal/app/app_install.go
Normal file
76
audit/internal/app/app_install.go
Normal file
@@ -0,0 +1,76 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListRemovableTargets() ([]platform.RemovableTarget, error) {
|
||||
return a.exports.ListRemovableTargets()
|
||||
}
|
||||
|
||||
func (a *App) ExportLatestAudit(target platform.RemovableTarget) (string, error) {
|
||||
if _, err := os.Stat(DefaultAuditJSONPath); err != nil {
|
||||
return "", err
|
||||
}
|
||||
filename := fmt.Sprintf("audit-%s-%s.json", sanitizeFilename(hostnameOr("unknown")), time.Now().UTC().Format("20060102-150405"))
|
||||
tmpPath := filepath.Join(os.TempDir(), filename)
|
||||
data, err := readFileLimited(DefaultAuditJSONPath, 100<<20)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if normalized, normErr := ApplySATOverlay(data); normErr == nil {
|
||||
data = normalized
|
||||
}
|
||||
if err := os.WriteFile(tmpPath, data, 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(tmpPath)
|
||||
return a.exports.ExportFileToTarget(tmpPath, target)
|
||||
}
|
||||
|
||||
func (a *App) ExportLatestAuditResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportLatestAudit(target)
|
||||
body := "Audit export failed."
|
||||
if err == nil {
|
||||
body = "Audit exported."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Audit exported to " + path
|
||||
}
|
||||
return ActionResult{Title: "Export audit", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundle(target platform.RemovableTarget) (string, error) {
|
||||
archive, err := BuildSupportBundle(DefaultExportDir)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer os.Remove(archive)
|
||||
return a.exports.ExportFileToTarget(archive, target)
|
||||
}
|
||||
|
||||
func (a *App) ExportSupportBundleResult(target platform.RemovableTarget) (ActionResult, error) {
|
||||
path, err := a.ExportSupportBundle(target)
|
||||
body := "Support bundle export failed."
|
||||
if err == nil {
|
||||
body = "Support bundle exported. USB target unmounted and safe to remove."
|
||||
}
|
||||
if err == nil && path != "" {
|
||||
body = "Support bundle exported to " + path + ".\n\nUSB target unmounted and safe to remove."
|
||||
}
|
||||
return ActionResult{Title: "Export support bundle", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ListInstallDisks() ([]platform.InstallDisk, error) {
|
||||
return a.installer.ListInstallDisks()
|
||||
}
|
||||
|
||||
func (a *App) InstallToDisk(ctx context.Context, device string, logFile string) error {
|
||||
return a.installer.InstallToDisk(ctx, device, logFile)
|
||||
}
|
||||
106
audit/internal/app/app_network.go
Normal file
106
audit/internal/app/app_network.go
Normal file
@@ -0,0 +1,106 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListInterfaces() ([]platform.InterfaceInfo, error) {
|
||||
return a.network.ListInterfaces()
|
||||
}
|
||||
|
||||
func (a *App) DefaultRoute() string {
|
||||
return a.network.DefaultRoute()
|
||||
}
|
||||
|
||||
func (a *App) DHCPOne(iface string) (string, error) {
|
||||
return a.network.DHCPOne(iface)
|
||||
}
|
||||
|
||||
func (a *App) DHCPOneResult(iface string) (ActionResult, error) {
|
||||
body, err := a.network.DHCPOne(iface)
|
||||
return ActionResult{Title: "DHCP: " + iface, Body: bodyOr(body, "DHCP completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) DHCPAll() (string, error) {
|
||||
return a.network.DHCPAll()
|
||||
}
|
||||
|
||||
func (a *App) DHCPAllResult() (ActionResult, error) {
|
||||
body, err := a.network.DHCPAll()
|
||||
return ActionResult{Title: "DHCP: all interfaces", Body: bodyOr(body, "DHCP completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4(cfg platform.StaticIPv4Config) (string, error) {
|
||||
return a.network.SetStaticIPv4(cfg)
|
||||
}
|
||||
|
||||
func (a *App) SetInterfaceState(iface string, up bool) error {
|
||||
return a.network.SetInterfaceState(iface, up)
|
||||
}
|
||||
|
||||
func (a *App) GetInterfaceState(iface string) (bool, error) {
|
||||
return a.network.GetInterfaceState(iface)
|
||||
}
|
||||
|
||||
func (a *App) CaptureNetworkSnapshot() (platform.NetworkSnapshot, error) {
|
||||
return a.network.CaptureNetworkSnapshot()
|
||||
}
|
||||
|
||||
func (a *App) RestoreNetworkSnapshot(snapshot platform.NetworkSnapshot) error {
|
||||
return a.network.RestoreNetworkSnapshot(snapshot)
|
||||
}
|
||||
|
||||
func (a *App) SetStaticIPv4Result(cfg platform.StaticIPv4Config) (ActionResult, error) {
|
||||
body, err := a.network.SetStaticIPv4(cfg)
|
||||
return ActionResult{Title: "Static IPv4: " + cfg.Interface, Body: bodyOr(body, "Static IPv4 updated.")}, err
|
||||
}
|
||||
|
||||
func (a *App) NetworkStatus() (ActionResult, error) {
|
||||
ifaces, err := a.network.ListInterfaces()
|
||||
if err != nil {
|
||||
return ActionResult{Title: "Network status"}, err
|
||||
}
|
||||
if len(ifaces) == 0 {
|
||||
return ActionResult{Title: "Network status", Body: "No physical interfaces found."}, nil
|
||||
}
|
||||
var body strings.Builder
|
||||
for _, iface := range ifaces {
|
||||
ipv4 := "(no IPv4)"
|
||||
if len(iface.IPv4) > 0 {
|
||||
ipv4 = strings.Join(iface.IPv4, ", ")
|
||||
}
|
||||
fmt.Fprintf(&body, "- %s: state=%s ip=%s\n", iface.Name, iface.State, ipv4)
|
||||
}
|
||||
if gw := a.network.DefaultRoute(); gw != "" {
|
||||
fmt.Fprintf(&body, "\nDefault route: %s\n", gw)
|
||||
}
|
||||
return ActionResult{Title: "Network status", Body: strings.TrimSpace(body.String())}, nil
|
||||
}
|
||||
|
||||
func (a *App) DefaultStaticIPv4FormFields(iface string) []string {
|
||||
return []string{
|
||||
"",
|
||||
"24",
|
||||
strings.TrimSpace(a.network.DefaultRoute()),
|
||||
"77.88.8.8 77.88.8.1 1.1.1.1 8.8.8.8",
|
||||
}
|
||||
}
|
||||
|
||||
func (a *App) ParseStaticIPv4Config(iface string, fields []string) platform.StaticIPv4Config {
|
||||
get := func(index int) string {
|
||||
if index >= 0 && index < len(fields) {
|
||||
return strings.TrimSpace(fields[index])
|
||||
}
|
||||
return ""
|
||||
}
|
||||
return platform.StaticIPv4Config{
|
||||
Interface: iface,
|
||||
Address: get(0),
|
||||
Prefix: get(1),
|
||||
Gateway: get(2),
|
||||
DNS: strings.Fields(get(3)),
|
||||
}
|
||||
}
|
||||
370
audit/internal/app/app_packs.go
Normal file
370
audit/internal/app/app_packs.go
Normal file
@@ -0,0 +1,370 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaAcceptancePack(baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunNvidiaAcceptancePack(baseDir, nil)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA SAT", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUs() ([]platform.NvidiaGPU, error) {
|
||||
return a.sat.ListNvidiaGPUs()
|
||||
}
|
||||
|
||||
func (a *App) ListNvidiaGPUStatuses() ([]platform.NvidiaGPUStatus, error) {
|
||||
return a.sat.ListNvidiaGPUStatuses()
|
||||
}
|
||||
|
||||
func (a *App) ResetNvidiaGPU(index int) (ActionResult, error) {
|
||||
out, err := a.sat.ResetNvidiaGPU(index)
|
||||
return ActionResult{Title: fmt.Sprintf("Reset NVIDIA GPU %d", index), Body: strings.TrimSpace(out)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaAcceptancePackWithOptions(ctx context.Context, baseDir string, diagLevel int, gpuIndices []int, logFunc func(string)) (ActionResult, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
path, err := a.sat.RunNvidiaAcceptancePackWithOptions(ctx, baseDir, diagLevel, gpuIndices, logFunc)
|
||||
body := "Archive written."
|
||||
if path != "" {
|
||||
body = "Archive written to " + path
|
||||
}
|
||||
return ActionResult{Title: "NVIDIA DCGM", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedStressValidatePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedStressValidatePack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPack(baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaStressPackCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmark(baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
return a.RunNvidiaBenchmarkCtx(context.Background(), baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBenchmarkCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPerfDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "performance", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaBenchmark(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerBenchCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchPowerDir
|
||||
}
|
||||
resolved, err := a.ensureBenchmarkPowerAutotune(ctx, baseDir, opts, "power-fit", logFunc)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
opts.ServerPowerSource = resolved.SelectedSource
|
||||
return a.sat.RunNvidiaPowerBench(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPowerSourceAutotuneCtx(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultBeeBenchAutotuneDir
|
||||
}
|
||||
return a.sat.RunNvidiaPowerSourceAutotune(ctx, baseDir, opts, benchmarkKind, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) LoadBenchmarkPowerAutotune() (*platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
return platform.LoadBenchmarkPowerAutotuneConfig(DefaultBeeBenchPowerSourceConfigPath)
|
||||
}
|
||||
|
||||
func (a *App) ensureBenchmarkPowerAutotune(ctx context.Context, baseDir string, opts platform.NvidiaBenchmarkOptions, benchmarkKind string, logFunc func(string)) (platform.BenchmarkPowerAutotuneConfig, error) {
|
||||
cfgPath := platform.BenchmarkPowerSourceConfigPath(baseDir)
|
||||
if cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath); err == nil {
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("benchmark autotune: using saved server power source %s", cfg.SelectedSource))
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
if logFunc != nil {
|
||||
logFunc("benchmark autotune: no saved power source config, running autotune first")
|
||||
}
|
||||
autotuneDir := filepath.Join(filepath.Dir(baseDir), "autotune")
|
||||
if _, err := a.RunNvidiaPowerSourceAutotuneCtx(ctx, autotuneDir, opts, benchmarkKind, logFunc); err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
cfg, err := platform.LoadBenchmarkPowerAutotuneConfig(cfgPath)
|
||||
if err != nil {
|
||||
return platform.BenchmarkPowerAutotuneConfig{}, err
|
||||
}
|
||||
return *cfg, nil
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaOfficialComputePack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, staggerSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaOfficialComputePack(ctx, baseDir, durationSec, gpuIndices, staggerSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaTargetedPowerPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaTargetedPowerPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaPulseTestPack(ctx context.Context, baseDir string, durationSec int, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaPulseTestPack(ctx, baseDir, durationSec, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaBandwidthPack(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaBandwidthPack(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNvidiaStressPackCtx(ctx context.Context, baseDir string, opts platform.NvidiaStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNvidiaStressPack(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryAcceptancePackCtx(context.Background(), baseDir, 256, 1, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackCtx(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunMemoryAcceptancePack(ctx, baseDir, sizeMB, passes, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunMemoryAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Memory SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunCPUAcceptancePack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunCPUAcceptancePackResult(baseDir string, durationSec int) (ActionResult, error) {
|
||||
path, err := a.RunCPUAcceptancePack(baseDir, durationSec, nil)
|
||||
return ActionResult{Title: "CPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(context.Background(), baseDir, false, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackCtx(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunStorageAcceptancePack(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunStorageAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) DetectGPUVendor() string {
|
||||
return a.sat.DetectGPUVendor()
|
||||
}
|
||||
|
||||
func (a *App) ListAMDGPUs() ([]platform.AMDGPUInfo, error) {
|
||||
return a.sat.ListAMDGPUs()
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDAcceptancePackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDAcceptancePack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunAMDAcceptancePack(baseDir, nil)
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunSATStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunMemoryStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunSATStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.sat.RunSATStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDStressPackCtx(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDStressPack(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTest(ctx context.Context, baseDir string, opts platform.FanStressOptions) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunFanStressTest(ctx, baseDir, opts)
|
||||
}
|
||||
|
||||
func (a *App) RunPlatformStress(ctx context.Context, baseDir string, opts platform.PlatformStressOptions, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunPlatformStress(ctx, baseDir, opts, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTests(ctx context.Context, baseDir string, gpuIndices []int, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunNCCLTests(ctx, baseDir, gpuIndices, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunNCCLTestsResult(ctx context.Context) (ActionResult, error) {
|
||||
path, err := a.RunNCCLTests(ctx, DefaultSATBaseDir, nil, nil)
|
||||
body := "Results: " + path
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "NCCL bandwidth test", Body: body}, err
|
||||
}
|
||||
|
||||
func (a *App) RunFanStressTestResult(ctx context.Context, opts platform.FanStressOptions) (ActionResult, error) {
|
||||
path, err := a.RunFanStressTest(ctx, "", opts)
|
||||
body := formatFanStressResult(path)
|
||||
if err != nil && err != context.Canceled {
|
||||
body += "\nERROR: " + err.Error()
|
||||
}
|
||||
return ActionResult{Title: "GPU Platform Stress Test", Body: body}, err
|
||||
}
|
||||
|
||||
// formatFanStressResult formats the summary.txt from a fan-stress run, including
|
||||
// the per-step pass/fail display and the analysis section (throttling, max temps, fan response).
|
||||
func formatFanStressResult(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
content := strings.TrimSpace(string(raw))
|
||||
kv := parseKeyValueSummary(content)
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(formatSATDetail(content))
|
||||
|
||||
// Append analysis section.
|
||||
var analysis []string
|
||||
if v, ok := kv["throttling_detected"]; ok {
|
||||
label := "NO"
|
||||
if v == "true" {
|
||||
label = "YES ← throttling detected during load"
|
||||
}
|
||||
analysis = append(analysis, "Throttling: "+label)
|
||||
}
|
||||
if v, ok := kv["max_gpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max GPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["max_cpu_temp_c"]; ok && v != "0.0" {
|
||||
analysis = append(analysis, "Max CPU temp: "+v+"°C")
|
||||
}
|
||||
if v, ok := kv["fan_response_sec"]; ok && v != "N/A" && v != "-1.0" {
|
||||
analysis = append(analysis, "Fan response: "+v+"s")
|
||||
}
|
||||
|
||||
if len(analysis) > 0 {
|
||||
b.WriteString("\n\n=== Analysis ===\n")
|
||||
for _, line := range analysis {
|
||||
b.WriteString(line + "\n")
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// satResultBody reads summary.txt from the SAT run directory (archive path without .tar.gz)
|
||||
// and returns a formatted human-readable result. Falls back to a plain message if unreadable.
|
||||
func satResultBody(archivePath string) string {
|
||||
if archivePath == "" {
|
||||
return "No output produced."
|
||||
}
|
||||
runDir := strings.TrimSuffix(archivePath, ".tar.gz")
|
||||
raw, err := os.ReadFile(filepath.Join(runDir, "summary.txt"))
|
||||
if err != nil {
|
||||
return "Archive written to " + archivePath
|
||||
}
|
||||
return formatSATDetail(strings.TrimSpace(string(raw)))
|
||||
}
|
||||
67
audit/internal/app/app_services.go
Normal file
67
audit/internal/app/app_services.go
Normal file
@@ -0,0 +1,67 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func (a *App) ListBeeServices() ([]string, error) {
|
||||
return a.services.ListBeeServices()
|
||||
}
|
||||
|
||||
func (a *App) ServiceState(name string) string {
|
||||
return a.services.ServiceState(name)
|
||||
}
|
||||
|
||||
func (a *App) ServiceStatus(name string) (string, error) {
|
||||
return a.services.ServiceStatus(name)
|
||||
}
|
||||
|
||||
func (a *App) ServiceStatusResult(name string) (ActionResult, error) {
|
||||
body, err := a.services.ServiceStatus(name)
|
||||
return ActionResult{Title: "service status: " + name, Body: bodyOr(body, "No status output.")}, err
|
||||
}
|
||||
|
||||
func (a *App) ServiceDo(name string, action platform.ServiceAction) (string, error) {
|
||||
return a.services.ServiceDo(name, action)
|
||||
}
|
||||
|
||||
func (a *App) ServiceActionResult(name string, action platform.ServiceAction) (ActionResult, error) {
|
||||
body, err := a.services.ServiceDo(name, action)
|
||||
return ActionResult{Title: "service " + string(action) + ": " + name, Body: bodyOr(body, "Action completed.")}, err
|
||||
}
|
||||
|
||||
func (a *App) TailFile(path string, lines int) string {
|
||||
return a.tools.TailFile(path, lines)
|
||||
}
|
||||
|
||||
func (a *App) CheckTools(names []string) []platform.ToolStatus {
|
||||
return a.tools.CheckTools(names)
|
||||
}
|
||||
|
||||
func (a *App) ToolCheckResult(names []string) ActionResult {
|
||||
if len(names) == 0 {
|
||||
return ActionResult{Title: "Required tools", Body: "No tools checked."}
|
||||
}
|
||||
var body strings.Builder
|
||||
for _, tool := range a.tools.CheckTools(names) {
|
||||
status := "MISSING"
|
||||
if tool.OK {
|
||||
status = "OK (" + tool.Path + ")"
|
||||
}
|
||||
fmt.Fprintf(&body, "- %s: %s\n", tool.Name, status)
|
||||
}
|
||||
return ActionResult{Title: "Required tools", Body: strings.TrimSpace(body.String())}
|
||||
}
|
||||
|
||||
func (a *App) AuditLogTailResult() ActionResult {
|
||||
logTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditLogPath, 40))
|
||||
jsonTail := strings.TrimSpace(a.tools.TailFile(DefaultAuditJSONPath, 20))
|
||||
body := strings.TrimSpace(logTail + "\n\n" + jsonTail)
|
||||
if body == "" {
|
||||
body = "No audit logs found."
|
||||
}
|
||||
return ActionResult{Title: "Audit log tail", Body: body}
|
||||
}
|
||||
782
audit/internal/app/blackbox.go
Normal file
782
audit/internal/app/blackbox.go
Normal file
@@ -0,0 +1,782 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
const (
|
||||
blackboxMarkerName = ".bee-blackbox"
|
||||
blackboxDiscoverInterval = 2 * time.Second
|
||||
blackboxMinFlushPeriod = 1 * time.Second
|
||||
blackboxMaxFlushPeriod = 30 * time.Second
|
||||
blackboxRecoveryFastCount = 5
|
||||
)
|
||||
|
||||
var DefaultBlackboxStatePath = DefaultExportDir + "/blackbox-state.json"
|
||||
|
||||
var (
|
||||
blackboxExecCommand = exec.Command
|
||||
blackboxNow = func() time.Time { return time.Now().UTC() }
|
||||
)
|
||||
|
||||
type BlackboxMarker struct {
|
||||
Version int `json:"version"`
|
||||
EnrollmentID string `json:"enrollment_id"`
|
||||
CreatedAtUTC string `json:"created_at_utc"`
|
||||
Host string `json:"host,omitempty"`
|
||||
}
|
||||
|
||||
type BlackboxTargetStatus struct {
|
||||
EnrollmentID string `json:"enrollment_id"`
|
||||
Device string `json:"device"`
|
||||
FS platform.RemovableTarget `json:"fs"`
|
||||
BootFolder string `json:"boot_folder"`
|
||||
Status string `json:"status"`
|
||||
LastSyncAtUTC string `json:"last_sync_at_utc,omitempty"`
|
||||
LastCycleDuration string `json:"last_cycle_duration,omitempty"`
|
||||
FlushPeriod string `json:"flush_period"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
Mountpoint string `json:"mountpoint,omitempty"`
|
||||
}
|
||||
|
||||
type BlackboxState struct {
|
||||
Status string `json:"status"`
|
||||
BootStartedAtUTC string `json:"boot_started_at_utc"`
|
||||
BootFolder string `json:"boot_folder"`
|
||||
UpdatedAtUTC string `json:"updated_at_utc"`
|
||||
Targets []BlackboxTargetStatus `json:"targets"`
|
||||
}
|
||||
|
||||
type blackboxRuntime struct {
|
||||
exportDir string
|
||||
statePath string
|
||||
system *platform.System
|
||||
bootStarted time.Time
|
||||
bootFolder string
|
||||
|
||||
mu sync.Mutex
|
||||
workers map[string]*blackboxWorker
|
||||
}
|
||||
|
||||
type discoveredBlackboxTarget struct {
|
||||
marker BlackboxMarker
|
||||
target platform.RemovableTarget
|
||||
seenMount string
|
||||
mountedByBee bool
|
||||
}
|
||||
|
||||
type blackboxWorker struct {
|
||||
runtime *blackboxRuntime
|
||||
enrollmentID string
|
||||
|
||||
mu sync.Mutex
|
||||
target platform.RemovableTarget
|
||||
marker BlackboxMarker
|
||||
mountpoint string
|
||||
mountedByBee bool
|
||||
status string
|
||||
lastSyncAt time.Time
|
||||
lastDuration time.Duration
|
||||
flushPeriod time.Duration
|
||||
lastError string
|
||||
fastCycles int
|
||||
stopCh chan struct{}
|
||||
stoppedCh chan struct{}
|
||||
}
|
||||
|
||||
func RunBlackbox(ctx context.Context, exportDir, statePath string, system *platform.System) error {
|
||||
exportDir = strings.TrimSpace(exportDir)
|
||||
if exportDir == "" {
|
||||
exportDir = DefaultExportDir
|
||||
}
|
||||
statePath = strings.TrimSpace(statePath)
|
||||
if statePath == "" {
|
||||
statePath = DefaultBlackboxStatePath
|
||||
}
|
||||
if system == nil {
|
||||
system = platform.New()
|
||||
}
|
||||
bootStarted, err := bootStartedAtUTC()
|
||||
if err != nil {
|
||||
bootStarted = blackboxNow()
|
||||
}
|
||||
rt := &blackboxRuntime{
|
||||
exportDir: exportDir,
|
||||
statePath: statePath,
|
||||
system: system,
|
||||
bootStarted: bootStarted,
|
||||
bootFolder: SupportBundleBaseName(bootStarted),
|
||||
workers: make(map[string]*blackboxWorker),
|
||||
}
|
||||
_ = os.MkdirAll(filepath.Dir(statePath), 0755)
|
||||
rt.persistState()
|
||||
ticker := time.NewTicker(blackboxDiscoverInterval)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
rt.reconcile()
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
rt.stopAll()
|
||||
return ctx.Err()
|
||||
case <-ticker.C:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func ReadBlackboxState(path string) (BlackboxState, error) {
|
||||
path = strings.TrimSpace(path)
|
||||
if path == "" {
|
||||
path = DefaultBlackboxStatePath
|
||||
}
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return BlackboxState{}, err
|
||||
}
|
||||
var state BlackboxState
|
||||
if err := json.Unmarshal(raw, &state); err != nil {
|
||||
return BlackboxState{}, err
|
||||
}
|
||||
return state, nil
|
||||
}
|
||||
|
||||
func EnableBlackboxTarget(target platform.RemovableTarget) (BlackboxMarker, error) {
|
||||
target = sanitizeRemovableTarget(target)
|
||||
if target.Device == "" {
|
||||
return BlackboxMarker{}, fmt.Errorf("device is required")
|
||||
}
|
||||
mountpoint, mountedByBee, err := ensureMountedTarget(target, "marker")
|
||||
if err != nil {
|
||||
return BlackboxMarker{}, err
|
||||
}
|
||||
defer func() {
|
||||
if mountedByBee {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
}()
|
||||
|
||||
marker, _, err := readBlackboxMarker(mountpoint)
|
||||
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return BlackboxMarker{}, err
|
||||
}
|
||||
if marker.EnrollmentID == "" {
|
||||
marker = BlackboxMarker{
|
||||
Version: 1,
|
||||
EnrollmentID: newBlackboxEnrollmentID(),
|
||||
CreatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||
Host: hostnameOr("unknown"),
|
||||
}
|
||||
}
|
||||
if err := writeBlackboxMarker(mountpoint, marker); err != nil {
|
||||
return BlackboxMarker{}, err
|
||||
}
|
||||
return marker, nil
|
||||
}
|
||||
|
||||
func DisableBlackboxTarget(device, enrollmentID string) error {
|
||||
device = strings.TrimSpace(device)
|
||||
enrollmentID = strings.TrimSpace(enrollmentID)
|
||||
if device == "" && enrollmentID == "" {
|
||||
return fmt.Errorf("device or enrollment_id is required")
|
||||
}
|
||||
system := platform.New()
|
||||
targets, err := system.ListRemovableTargets()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, target := range targets {
|
||||
target = sanitizeRemovableTarget(target)
|
||||
mountpoint, mountedByBee, mountErr := ensureMountedTarget(target, "marker")
|
||||
if mountErr != nil {
|
||||
continue
|
||||
}
|
||||
remove := false
|
||||
marker, _, err := readBlackboxMarker(mountpoint)
|
||||
if err == nil {
|
||||
if enrollmentID != "" && marker.EnrollmentID == enrollmentID {
|
||||
remove = true
|
||||
}
|
||||
if device != "" && target.Device == device {
|
||||
remove = true
|
||||
}
|
||||
}
|
||||
if remove {
|
||||
err = os.Remove(filepath.Join(mountpoint, blackboxMarkerName))
|
||||
}
|
||||
if mountedByBee {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
if remove {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return os.ErrNotExist
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) reconcile() {
|
||||
discovered, _ := rt.discoverMarkedTargets()
|
||||
|
||||
rt.mu.Lock()
|
||||
defer rt.mu.Unlock()
|
||||
|
||||
seen := make(map[string]struct{}, len(discovered))
|
||||
for _, found := range discovered {
|
||||
seen[found.marker.EnrollmentID] = struct{}{}
|
||||
worker, ok := rt.workers[found.marker.EnrollmentID]
|
||||
if !ok {
|
||||
worker = newBlackboxWorker(rt, found)
|
||||
rt.workers[found.marker.EnrollmentID] = worker
|
||||
go worker.run()
|
||||
continue
|
||||
}
|
||||
worker.update(found)
|
||||
}
|
||||
for id, worker := range rt.workers {
|
||||
if _, ok := seen[id]; ok {
|
||||
continue
|
||||
}
|
||||
worker.stop()
|
||||
delete(rt.workers, id)
|
||||
}
|
||||
rt.persistStateLocked()
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) stopAll() {
|
||||
rt.mu.Lock()
|
||||
workers := make([]*blackboxWorker, 0, len(rt.workers))
|
||||
for _, worker := range rt.workers {
|
||||
workers = append(workers, worker)
|
||||
}
|
||||
rt.workers = map[string]*blackboxWorker{}
|
||||
rt.persistStateLocked()
|
||||
rt.mu.Unlock()
|
||||
for _, worker := range workers {
|
||||
worker.stop()
|
||||
}
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) discoverMarkedTargets() ([]discoveredBlackboxTarget, error) {
|
||||
targets, err := rt.system.ListRemovableTargets()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var out []discoveredBlackboxTarget
|
||||
for _, rawTarget := range targets {
|
||||
target := sanitizeRemovableTarget(rawTarget)
|
||||
if target.Device == "" {
|
||||
continue
|
||||
}
|
||||
mountpoint, mountedByBee, err := ensureMountedTarget(target, "probe")
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
marker, ok, err := readBlackboxMarker(mountpoint)
|
||||
if mountedByBee && !ok {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
if err != nil || !ok || marker.EnrollmentID == "" {
|
||||
continue
|
||||
}
|
||||
if mountedByBee {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
out = append(out, discoveredBlackboxTarget{
|
||||
marker: marker,
|
||||
target: target,
|
||||
seenMount: mountpoint,
|
||||
mountedByBee: mountedByBee,
|
||||
})
|
||||
}
|
||||
sort.Slice(out, func(i, j int) bool {
|
||||
return out[i].marker.EnrollmentID < out[j].marker.EnrollmentID
|
||||
})
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func newBlackboxWorker(rt *blackboxRuntime, found discoveredBlackboxTarget) *blackboxWorker {
|
||||
return &blackboxWorker{
|
||||
runtime: rt,
|
||||
enrollmentID: found.marker.EnrollmentID,
|
||||
target: found.target,
|
||||
marker: found.marker,
|
||||
flushPeriod: blackboxMinFlushPeriod,
|
||||
status: "running",
|
||||
stopCh: make(chan struct{}),
|
||||
stoppedCh: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) run() {
|
||||
defer close(w.stoppedCh)
|
||||
for {
|
||||
start := time.Now()
|
||||
err := w.syncCycle()
|
||||
duration := time.Since(start)
|
||||
w.finishCycle(duration, err)
|
||||
|
||||
wait := w.currentFlushPeriod()
|
||||
timer := time.NewTimer(wait)
|
||||
select {
|
||||
case <-w.stopCh:
|
||||
timer.Stop()
|
||||
w.cleanup()
|
||||
return
|
||||
case <-timer.C:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) update(found discoveredBlackboxTarget) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.target = found.target
|
||||
w.marker = found.marker
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) stop() {
|
||||
select {
|
||||
case <-w.stopCh:
|
||||
default:
|
||||
close(w.stopCh)
|
||||
}
|
||||
<-w.stoppedCh
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
return w.flushPeriod
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||
w.mu.Lock()
|
||||
w.lastDuration = duration
|
||||
if err != nil {
|
||||
w.status = "degraded"
|
||||
w.lastError = err.Error()
|
||||
w.fastCycles = 0
|
||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, false, 0)
|
||||
} else {
|
||||
w.status = "running"
|
||||
w.lastSyncAt = blackboxNow()
|
||||
w.lastError = ""
|
||||
if duration <= w.flushPeriod/2 {
|
||||
w.fastCycles++
|
||||
} else {
|
||||
w.fastCycles = 0
|
||||
}
|
||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
// persistState must be called without w.mu held: it acquires rt.mu then
|
||||
// each worker.mu inside persistStateLocked, so holding w.mu here would
|
||||
// cause a deadlock (w.mu → rt.mu → w.mu).
|
||||
w.runtime.persistState()
|
||||
}
|
||||
|
||||
func adjustFlushPeriod(current, duration time.Duration, success bool, fastCycles int) time.Duration {
|
||||
if current <= 0 {
|
||||
current = blackboxMinFlushPeriod
|
||||
}
|
||||
if duration <= 0 {
|
||||
duration = current
|
||||
}
|
||||
next := current
|
||||
if duration > current {
|
||||
growA := time.Duration(float64(current) * 1.25)
|
||||
growB := time.Duration(float64(duration) * 1.25)
|
||||
if growB > growA {
|
||||
next = growB
|
||||
} else {
|
||||
next = growA
|
||||
}
|
||||
}
|
||||
if success && fastCycles >= blackboxRecoveryFastCount {
|
||||
next = time.Duration(float64(current) * 0.9)
|
||||
}
|
||||
if next < blackboxMinFlushPeriod {
|
||||
next = blackboxMinFlushPeriod
|
||||
}
|
||||
if next > blackboxMaxFlushPeriod {
|
||||
next = blackboxMaxFlushPeriod
|
||||
}
|
||||
return next
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) syncCycle() error {
|
||||
target, marker := w.snapshotTarget()
|
||||
mountpoint, mountedByBee, err := ensureMountedTarget(target, marker.EnrollmentID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
w.recordMountpoint(mountpoint, mountedByBee)
|
||||
|
||||
root := filepath.Join(mountpoint, w.runtime.bootFolder)
|
||||
if err := os.MkdirAll(filepath.Join(root, "export"), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := syncDirectoryTree(w.runtime.exportDir, filepath.Join(root, "export")); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := w.captureSnapshots(root); err != nil {
|
||||
return err
|
||||
}
|
||||
return syncFilesystem(root)
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) cleanup() {
|
||||
w.mu.Lock()
|
||||
mountpoint := w.mountpoint
|
||||
mountedByBee := w.mountedByBee
|
||||
w.mu.Unlock()
|
||||
if mountedByBee && mountpoint != "" {
|
||||
_ = unmountTarget(mountpoint)
|
||||
}
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) snapshotTarget() (platform.RemovableTarget, BlackboxMarker) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
return w.target, w.marker
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) recordMountpoint(mountpoint string, mountedByBee bool) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.mountpoint = mountpoint
|
||||
w.mountedByBee = mountedByBee
|
||||
}
|
||||
|
||||
func (w *blackboxWorker) captureSnapshots(root string) error {
|
||||
if err := captureCommandAtomic(filepath.Join(root, "systemd", "combined.journal.log"), "journalctl", "--no-pager", "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, svc := range supportBundleServices {
|
||||
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".journal.log"), "journalctl", "--no-pager", "-u", svc, "--since", w.runtime.bootStarted.Format(time.RFC3339)); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := captureCommandAtomic(filepath.Join(root, "systemd", svc+".status.txt"), "systemctl", "status", svc, "--no-pager"); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := captureCommandAtomic(filepath.Join(root, "system", "dmesg.txt"), "dmesg"); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, item := range supportBundleOptionalFiles {
|
||||
if err := copyFileIfChanged(item.src, filepath.Join(root, item.name)); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) persistState() {
|
||||
rt.mu.Lock()
|
||||
defer rt.mu.Unlock()
|
||||
rt.persistStateLocked()
|
||||
}
|
||||
|
||||
func (rt *blackboxRuntime) persistStateLocked() {
|
||||
state := BlackboxState{
|
||||
Status: "disabled",
|
||||
BootStartedAtUTC: rt.bootStarted.Format(time.RFC3339),
|
||||
BootFolder: rt.bootFolder,
|
||||
UpdatedAtUTC: blackboxNow().Format(time.RFC3339),
|
||||
Targets: make([]BlackboxTargetStatus, 0, len(rt.workers)),
|
||||
}
|
||||
if len(rt.workers) > 0 {
|
||||
state.Status = "running"
|
||||
}
|
||||
for _, worker := range rt.workers {
|
||||
worker.mu.Lock()
|
||||
targetState := BlackboxTargetStatus{
|
||||
EnrollmentID: worker.enrollmentID,
|
||||
Device: worker.target.Device,
|
||||
FS: worker.target,
|
||||
BootFolder: rt.bootFolder,
|
||||
Status: worker.status,
|
||||
FlushPeriod: worker.flushPeriod.String(),
|
||||
LastError: worker.lastError,
|
||||
Mountpoint: worker.mountpoint,
|
||||
}
|
||||
if !worker.lastSyncAt.IsZero() {
|
||||
targetState.LastSyncAtUTC = worker.lastSyncAt.Format(time.RFC3339)
|
||||
}
|
||||
if worker.lastDuration > 0 {
|
||||
targetState.LastCycleDuration = worker.lastDuration.String()
|
||||
}
|
||||
if worker.status == "degraded" {
|
||||
state.Status = "degraded"
|
||||
}
|
||||
worker.mu.Unlock()
|
||||
state.Targets = append(state.Targets, targetState)
|
||||
}
|
||||
sort.Slice(state.Targets, func(i, j int) bool {
|
||||
return state.Targets[i].EnrollmentID < state.Targets[j].EnrollmentID
|
||||
})
|
||||
_ = writeJSONAtomic(rt.statePath, state)
|
||||
}
|
||||
|
||||
func bootStartedAtUTC() (time.Time, error) {
|
||||
raw, err := os.ReadFile("/proc/stat")
|
||||
if err != nil {
|
||||
return time.Time{}, err
|
||||
}
|
||||
for _, line := range strings.Split(string(raw), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if !strings.HasPrefix(line, "btime ") {
|
||||
continue
|
||||
}
|
||||
parts := strings.Fields(line)
|
||||
if len(parts) != 2 {
|
||||
break
|
||||
}
|
||||
sec, err := time.ParseDuration(parts[1] + "s")
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
return time.Unix(int64(sec/time.Second), 0).UTC(), nil
|
||||
}
|
||||
return time.Time{}, fmt.Errorf("boot time not found")
|
||||
}
|
||||
|
||||
func newBlackboxEnrollmentID() string {
|
||||
var buf [8]byte
|
||||
if _, err := rand.Read(buf[:]); err != nil {
|
||||
return fmt.Sprintf("bb-%d", time.Now().UnixNano())
|
||||
}
|
||||
return "bb-" + hex.EncodeToString(buf[:])
|
||||
}
|
||||
|
||||
func sanitizeRemovableTarget(target platform.RemovableTarget) platform.RemovableTarget {
|
||||
target.Device = strings.TrimSpace(target.Device)
|
||||
target.FSType = strings.TrimSpace(target.FSType)
|
||||
target.Size = strings.TrimSpace(target.Size)
|
||||
target.Label = strings.TrimSpace(target.Label)
|
||||
target.Model = strings.TrimSpace(target.Model)
|
||||
target.Mountpoint = strings.TrimSpace(target.Mountpoint)
|
||||
return target
|
||||
}
|
||||
|
||||
func ensureMountedTarget(target platform.RemovableTarget, suffix string) (mountpoint string, mountedByBee bool, retErr error) {
|
||||
target = sanitizeRemovableTarget(target)
|
||||
if target.Mountpoint != "" {
|
||||
if err := ensureWritableBlackboxMountpoint(target.Mountpoint); err == nil {
|
||||
return target.Mountpoint, false, nil
|
||||
}
|
||||
}
|
||||
mountpoint = filepath.Join("/tmp", "bee-blackbox-"+sanitizeFilename(suffix))
|
||||
if err := os.MkdirAll(mountpoint, 0755); err != nil {
|
||||
return "", false, err
|
||||
}
|
||||
if raw, err := blackboxExecCommand("mount", target.Device, mountpoint).CombinedOutput(); err != nil {
|
||||
return "", false, formatBlackboxMountTargetError(target, string(raw), err)
|
||||
}
|
||||
if err := ensureWritableBlackboxMountpoint(mountpoint); err != nil {
|
||||
_ = unmountTarget(mountpoint)
|
||||
return "", false, err
|
||||
}
|
||||
return mountpoint, true, nil
|
||||
}
|
||||
|
||||
func unmountTarget(mountpoint string) error {
|
||||
_ = blackboxExecCommand("sync").Run()
|
||||
raw, err := blackboxExecCommand("umount", mountpoint).CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(raw))
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func readBlackboxMarker(mountpoint string) (BlackboxMarker, bool, error) {
|
||||
raw, err := os.ReadFile(filepath.Join(mountpoint, blackboxMarkerName))
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
return BlackboxMarker{}, false, os.ErrNotExist
|
||||
}
|
||||
return BlackboxMarker{}, false, err
|
||||
}
|
||||
var marker BlackboxMarker
|
||||
if err := json.Unmarshal(raw, &marker); err != nil {
|
||||
return BlackboxMarker{}, false, err
|
||||
}
|
||||
return marker, true, nil
|
||||
}
|
||||
|
||||
func writeBlackboxMarker(mountpoint string, marker BlackboxMarker) error {
|
||||
if marker.Version == 0 {
|
||||
marker.Version = 1
|
||||
}
|
||||
return writeJSONAtomic(filepath.Join(mountpoint, blackboxMarkerName), marker)
|
||||
}
|
||||
|
||||
func syncDirectoryTree(srcDir, dstDir string) error {
|
||||
seen := make(map[string]struct{})
|
||||
err := filepath.WalkDir(srcDir, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel, err := filepath.Rel(srcDir, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel = filepath.Clean(rel)
|
||||
if rel == "." {
|
||||
seen["."] = struct{}{}
|
||||
return os.MkdirAll(dstDir, 0755)
|
||||
}
|
||||
seen[rel] = struct{}{}
|
||||
dstPath := filepath.Join(dstDir, rel)
|
||||
if d.IsDir() {
|
||||
info, err := d.Info()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.MkdirAll(dstPath, info.Mode().Perm())
|
||||
}
|
||||
return copyFileIfChanged(path, dstPath)
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return removeMissingPaths(dstDir, seen)
|
||||
}
|
||||
|
||||
func removeMissingPaths(dstDir string, seen map[string]struct{}) error {
|
||||
return filepath.WalkDir(dstDir, func(path string, d fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel, err := filepath.Rel(dstDir, path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
rel = filepath.Clean(rel)
|
||||
if rel == "." {
|
||||
return nil
|
||||
}
|
||||
if _, ok := seen[rel]; ok {
|
||||
return nil
|
||||
}
|
||||
return os.RemoveAll(path)
|
||||
})
|
||||
}
|
||||
|
||||
func copyFileIfChanged(src, dst string) error {
|
||||
info, err := os.Stat(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.IsDir() {
|
||||
return os.MkdirAll(dst, info.Mode().Perm())
|
||||
}
|
||||
srcData, err := os.ReadFile(src)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if dstData, err := os.ReadFile(dst); err == nil && bytes.Equal(dstData, srcData) {
|
||||
return nil
|
||||
}
|
||||
return writeFileAtomic(dst, srcData, info.Mode().Perm())
|
||||
}
|
||||
|
||||
func captureCommandAtomic(dst string, name string, args ...string) error {
|
||||
raw, err := blackboxExecCommand(name, args...).CombinedOutput()
|
||||
if len(raw) == 0 {
|
||||
if err != nil {
|
||||
raw = []byte(err.Error() + "\n")
|
||||
} else {
|
||||
raw = []byte("no output\n")
|
||||
}
|
||||
}
|
||||
return writeFileAtomic(dst, raw, 0644)
|
||||
}
|
||||
|
||||
func writeJSONAtomic(path string, v any) error {
|
||||
raw, err := json.MarshalIndent(v, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
raw = append(raw, '\n')
|
||||
return writeFileAtomic(path, raw, 0644)
|
||||
}
|
||||
|
||||
func writeFileAtomic(path string, data []byte, perm os.FileMode) error {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
if existing, err := os.ReadFile(path); err == nil && bytes.Equal(existing, data) {
|
||||
return nil
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
f, err := os.OpenFile(tmp, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if _, err := f.Write(data); err != nil {
|
||||
_ = f.Close()
|
||||
return err
|
||||
}
|
||||
if err := f.Sync(); err != nil {
|
||||
_ = f.Close()
|
||||
return err
|
||||
}
|
||||
if err := f.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.Rename(tmp, path); err != nil {
|
||||
return err
|
||||
}
|
||||
return syncFilesystem(filepath.Dir(path))
|
||||
}
|
||||
|
||||
func syncFilesystem(path string) error {
|
||||
return blackboxExecCommand("sync").Run()
|
||||
}
|
||||
|
||||
func ensureWritableBlackboxMountpoint(mountpoint string) error {
|
||||
probe, err := os.CreateTemp(mountpoint, ".bee-blackbox-write-test-*")
|
||||
if err != nil {
|
||||
return fmt.Errorf("target filesystem is not writable: %w", err)
|
||||
}
|
||||
name := probe.Name()
|
||||
if closeErr := probe.Close(); closeErr != nil {
|
||||
_ = os.Remove(name)
|
||||
return closeErr
|
||||
}
|
||||
if err := os.Remove(name); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func formatBlackboxMountTargetError(target platform.RemovableTarget, raw string, err error) error {
|
||||
msg := strings.TrimSpace(raw)
|
||||
fstype := strings.ToLower(strings.TrimSpace(target.FSType))
|
||||
if fstype == "exfat" && strings.Contains(strings.ToLower(msg), "unknown filesystem type 'exfat'") {
|
||||
return fmt.Errorf("mount %s: exFAT support is missing in this ISO build: %w", target.Device, err)
|
||||
}
|
||||
if msg == "" {
|
||||
return err
|
||||
}
|
||||
return fmt.Errorf("%s: %w", msg, err)
|
||||
}
|
||||
52
audit/internal/app/blackbox_test.go
Normal file
52
audit/internal/app/blackbox_test.go
Normal file
@@ -0,0 +1,52 @@
|
||||
package app
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
func TestAdjustFlushPeriodGrowsOnSlowCycle(t *testing.T) {
|
||||
current := 2 * time.Second
|
||||
got := adjustFlushPeriod(current, 4*time.Second, false, 0)
|
||||
if got <= current {
|
||||
t.Fatalf("adjustFlushPeriod=%s want > %s", got, current)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAdjustFlushPeriodShrinksAfterFastCycles(t *testing.T) {
|
||||
current := 10 * time.Second
|
||||
got := adjustFlushPeriod(current, 2*time.Second, true, blackboxRecoveryFastCount)
|
||||
if got >= current {
|
||||
t.Fatalf("adjustFlushPeriod=%s want < %s", got, current)
|
||||
}
|
||||
if got < blackboxMinFlushPeriod {
|
||||
t.Fatalf("adjustFlushPeriod=%s below min %s", got, blackboxMinFlushPeriod)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadBlackboxState(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "blackbox-state.json")
|
||||
want := BlackboxState{
|
||||
Status: "running",
|
||||
BootStartedAtUTC: "2026-04-24T00:00:00Z",
|
||||
BootFolder: "boot-folder",
|
||||
UpdatedAtUTC: "2026-04-24T00:00:01Z",
|
||||
Targets: []BlackboxTargetStatus{{
|
||||
EnrollmentID: "bb-1",
|
||||
Device: "/dev/sdb1",
|
||||
Status: "running",
|
||||
FlushPeriod: "1s",
|
||||
}},
|
||||
}
|
||||
if err := writeJSONAtomic(path, want); err != nil {
|
||||
t.Fatalf("writeJSONAtomic: %v", err)
|
||||
}
|
||||
got, err := ReadBlackboxState(path)
|
||||
if err != nil {
|
||||
t.Fatalf("ReadBlackboxState: %v", err)
|
||||
}
|
||||
if got.Status != want.Status || got.BootFolder != want.BootFolder || len(got.Targets) != 1 || got.Targets[0].EnrollmentID != "bb-1" {
|
||||
t.Fatalf("state=%+v", got)
|
||||
}
|
||||
}
|
||||
@@ -3,10 +3,11 @@ package app
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
@@ -313,17 +314,20 @@ func statusSeverity(status string) int {
|
||||
}
|
||||
|
||||
func matchesGPUVendor(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Controller") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Accelerator") {
|
||||
if dev.DeviceClass == nil || !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Display") && !strings.Contains(strings.TrimSpace(*dev.DeviceClass), "Video") {
|
||||
return false
|
||||
}
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
class := strings.TrimSpace(*dev.DeviceClass)
|
||||
isGPUClass := strings.Contains(class, "Controller") || strings.Contains(class, "Accelerator") ||
|
||||
strings.Contains(class, "Display") || strings.Contains(class, "Video")
|
||||
if !isGPUClass {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(ptrString(dev.Manufacturer)))
|
||||
switch vendor {
|
||||
case "amd":
|
||||
return strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd/ati")
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.AMDVendorID
|
||||
case "nvidia":
|
||||
return strings.Contains(manufacturer, "nvidia")
|
||||
return dev.VendorID != nil && *dev.VendorID == collector.NvidiaVendorID
|
||||
default:
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/collector"
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
@@ -46,10 +47,12 @@ func TestApplyLatestSATStatusesMarksAMDGPUs(t *testing.T) {
|
||||
|
||||
class := "DisplayController"
|
||||
manufacturer := "Advanced Micro Devices, Inc. [AMD/ATI]"
|
||||
amdVendorID := collector.AMDVendorID
|
||||
snap := schema.HardwareSnapshot{
|
||||
PCIeDevices: []schema.HardwarePCIeDevice{{
|
||||
DeviceClass: &class,
|
||||
Manufacturer: &manufacturer,
|
||||
VendorID: &amdVendorID,
|
||||
}},
|
||||
}
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
)
|
||||
|
||||
var supportBundleServices = []string{
|
||||
"bee-blackbox.service",
|
||||
"bee-audit.service",
|
||||
"bee-web.service",
|
||||
"bee-network.service",
|
||||
@@ -23,6 +24,8 @@ var supportBundleServices = []string{
|
||||
"bee-selfheal.service",
|
||||
"bee-selfheal.timer",
|
||||
"bee-sshsetup.service",
|
||||
"display-manager.service",
|
||||
"lightdm.service",
|
||||
"nvidia-dcgm.service",
|
||||
"nvidia-fabricmanager.service",
|
||||
}
|
||||
@@ -43,12 +46,128 @@ var supportBundleCommands = []struct {
|
||||
{name: "system/mount.txt", cmd: []string{"mount"}},
|
||||
{name: "system/df-h.txt", cmd: []string{"df", "-h"}},
|
||||
{name: "system/dmesg.txt", cmd: []string{"dmesg"}},
|
||||
{name: "system/dmesg-gui-video-input.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v dmesg >/dev/null 2>&1; then
|
||||
dmesg | grep -iE 'nvidia|drm|fb|framebuffer|vesa|efi|lightdm|Xorg|input|hid|usb|keyboard|mouse|virtual keyboard|virtual mouse|ami|aspeed|ast' || echo "no GUI/video/input kernel messages found"
|
||||
else
|
||||
echo "dmesg not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/kernel-aer-nvidia.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v dmesg >/dev/null 2>&1; then
|
||||
dmesg | grep -iE 'AER|NVRM|Xid|pcieport|nvidia' || echo "no AER/NVRM/Xid kernel messages found"
|
||||
else
|
||||
echo "dmesg not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/loginctl-sessions.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v loginctl >/dev/null 2>&1; then
|
||||
loginctl list-sessions 2>&1 || true
|
||||
else
|
||||
echo "loginctl not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/loginctl-seats.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v loginctl >/dev/null 2>&1; then
|
||||
loginctl list-seats 2>&1 || true
|
||||
echo
|
||||
for seat in $(loginctl list-seats --no-legend 2>/dev/null | awk '{print $1}'); do
|
||||
echo "=== $seat ==="
|
||||
loginctl seat-status "$seat" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
else
|
||||
echo "loginctl not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/ps-gui.txt", cmd: []string{"sh", "-c", `
|
||||
ps -ef | grep -iE 'lightdm|Xorg|X$|openbox|chromium|chrome|xinit|xsession' | grep -v grep || echo "no GUI processes found"
|
||||
`}},
|
||||
{name: "system/lspci-video-vv.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v lspci >/dev/null 2>&1; then
|
||||
echo "lspci not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for dev in $(lspci -Dn | awk '$2 ~ /^03(00|02):$/ {print $1}'); do
|
||||
found=1
|
||||
echo "=== $dev ==="
|
||||
lspci -s "$dev" -vv 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no display-class PCI devices found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/proc-fb.txt", cmd: []string{"cat", "/proc/fb"}},
|
||||
{name: "system/drm-cards.txt", cmd: []string{"sh", "-c", `
|
||||
if [ -d /sys/class/drm ]; then
|
||||
for path in /sys/class/drm/card*; do
|
||||
[ -e "$path" ] || continue
|
||||
card=$(basename "$path")
|
||||
echo "=== $card ==="
|
||||
for f in status enabled dpms modes; do
|
||||
[ -r "$path/$f" ] && printf " %-8s %s\n" "$f" "$(cat "$path/$f" 2>/dev/null)"
|
||||
done
|
||||
device=$(readlink -f "$path/device" 2>/dev/null || true)
|
||||
[ -n "$device" ] && echo " device ${device##*/}"
|
||||
echo
|
||||
done
|
||||
else
|
||||
echo "/sys/class/drm not present"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/input-devices.txt", cmd: []string{"sh", "-c", `
|
||||
if [ -r /proc/bus/input/devices ]; then
|
||||
cat /proc/bus/input/devices
|
||||
else
|
||||
echo "/proc/bus/input/devices not readable"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/udevadm-input.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v udevadm >/dev/null 2>&1; then
|
||||
echo "udevadm not found"
|
||||
exit 0
|
||||
fi
|
||||
found=0
|
||||
for dev in /dev/input/event*; do
|
||||
[ -e "$dev" ] || continue
|
||||
found=1
|
||||
echo "=== $dev ==="
|
||||
udevadm info --query=all --name="$dev" 2>&1 || true
|
||||
echo
|
||||
done
|
||||
if [ "$found" -eq 0 ]; then
|
||||
echo "no /dev/input/event* devices found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/xinput-list.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v xinput >/dev/null 2>&1; then
|
||||
DISPLAY=:0 xinput --list 2>&1 || true
|
||||
else
|
||||
echo "xinput not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/libinput-list-devices.txt", cmd: []string{"sh", "-c", `
|
||||
if command -v libinput >/dev/null 2>&1; then
|
||||
libinput list-devices 2>&1 || true
|
||||
else
|
||||
echo "libinput not found"
|
||||
fi
|
||||
`}},
|
||||
{name: "system/systemctl-gui-units.txt", cmd: []string{"sh", "-c", `
|
||||
if ! command -v systemctl >/dev/null 2>&1; then
|
||||
echo "systemctl not found"
|
||||
exit 0
|
||||
fi
|
||||
echo "=== unit files ==="
|
||||
systemctl list-unit-files --no-pager --all 'lightdm*' 'display-manager*' 2>&1 || true
|
||||
echo
|
||||
echo "=== active units ==="
|
||||
systemctl list-units --no-pager --all 'lightdm*' 'display-manager*' 2>&1 || true
|
||||
echo
|
||||
echo "=== failed units ==="
|
||||
systemctl --failed --no-pager 2>&1 | grep -iE 'lightdm|display-manager|Xorg' || echo "no failed GUI units"
|
||||
`}},
|
||||
{name: "system/nvidia-smi-q.txt", cmd: []string{"nvidia-smi", "-q"}},
|
||||
{name: "system/nvidia-smi-topo.txt", cmd: []string{"sh", "-c", `
|
||||
@@ -235,6 +354,13 @@ var supportBundleOptionalFiles = []struct {
|
||||
}{
|
||||
{name: "system/kern.log", src: "/var/log/kern.log"},
|
||||
{name: "system/syslog.txt", src: "/var/log/syslog"},
|
||||
{name: "system/Xorg.0.log", src: "/var/log/Xorg.0.log"},
|
||||
{name: "system/Xorg.0.log.old", src: "/var/log/Xorg.0.log.old"},
|
||||
{name: "system/lightdm/lightdm.log", src: "/var/log/lightdm/lightdm.log"},
|
||||
{name: "system/lightdm/x-0.log", src: "/var/log/lightdm/x-0.log"},
|
||||
{name: "system/lightdm/x-0-greeter.log", src: "/var/log/lightdm/x-0-greeter.log"},
|
||||
{name: "system/home-bee-xsession-errors.log", src: "/home/bee/.xsession-errors"},
|
||||
{name: "system/home-bee-chromium-debug.log", src: "/tmp/bee-chrome/chrome_debug.log"},
|
||||
{name: "system/fabricmanager.log", src: "/var/log/fabricmanager.log"},
|
||||
{name: "system/nvlsm.log", src: "/var/log/nvlsm.log"},
|
||||
{name: "system/fabricmanager/fabricmanager.log", src: "/var/log/fabricmanager/fabricmanager.log"},
|
||||
@@ -256,11 +382,6 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
date := now.Format("2006-01-02")
|
||||
tod := now.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
|
||||
stageRoot := filepath.Join(os.TempDir(), fmt.Sprintf("bee-support-stage-%s-%s", sanitizeFilename(hostnameOr("unknown")), now.Format("20060102-150405")))
|
||||
if err := os.MkdirAll(stageRoot, 0755); err != nil {
|
||||
@@ -294,7 +415,7 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return "", err
|
||||
}
|
||||
|
||||
archiveName := fmt.Sprintf("%s (BEE-SP v%s) %s %s %s.tar.gz", date, ver, model, sn, tod)
|
||||
archiveName := SupportBundleBaseName(now) + ".tar.gz"
|
||||
archivePath := filepath.Join(os.TempDir(), archiveName)
|
||||
if err := createSupportTarGz(archivePath, stageRoot); err != nil {
|
||||
return "", err
|
||||
@@ -302,6 +423,16 @@ func BuildSupportBundle(exportDir string) (string, error) {
|
||||
return archivePath, nil
|
||||
}
|
||||
|
||||
func SupportBundleBaseName(at time.Time) string {
|
||||
at = at.UTC()
|
||||
date := at.Format("2006-01-02")
|
||||
tod := at.Format("150405")
|
||||
ver := bundleVersion()
|
||||
model := serverModelForBundle()
|
||||
sn := serverSerialForBundle()
|
||||
return fmt.Sprintf("%s (BEE-SP v%s) %s %s %s", date, ver, model, sn, tod)
|
||||
}
|
||||
|
||||
func LatestSupportBundlePath() (string, error) {
|
||||
return latestSupportBundlePath(os.TempDir())
|
||||
}
|
||||
|
||||
@@ -84,11 +84,10 @@ func hasAMDGPUDevices(devs []schema.HardwarePCIeDevice) bool {
|
||||
}
|
||||
|
||||
func isAMDGPUDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.Manufacturer == nil || dev.DeviceClass == nil {
|
||||
if dev.DeviceClass == nil {
|
||||
return false
|
||||
}
|
||||
manufacturer := strings.ToLower(strings.TrimSpace(*dev.Manufacturer))
|
||||
return strings.Contains(manufacturer, "advanced micro devices") && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
return dev.VendorID != nil && *dev.VendorID == AMDVendorID && isGPUClass(strings.TrimSpace(*dev.DeviceClass))
|
||||
}
|
||||
|
||||
func queryAMDGPUs() (map[string]amdGPUInfo, error) {
|
||||
|
||||
@@ -3,6 +3,7 @@ package collector
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
@@ -17,14 +18,6 @@ var execDmidecode = func(typeNum string) (string, error) {
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
var execIpmitool = func(args ...string) (string, error) {
|
||||
out, err := exec.Command("ipmitool", args...).Output()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(out), nil
|
||||
}
|
||||
|
||||
// collectBoard runs dmidecode for types 0, 1, 2 and returns the board record
|
||||
// plus the BIOS firmware entry. Any failure is logged and returns zero values.
|
||||
func collectBoard() (schema.HardwareBoard, []schema.HardwareFirmwareRecord) {
|
||||
@@ -80,19 +73,23 @@ func parseBoard(type1, type2 string) schema.HardwareBoard {
|
||||
|
||||
// collectBMCFirmware collects BMC firmware version via ipmitool mc info.
|
||||
// Returns nil if ipmitool is missing, /dev/ipmi0 is absent, or any error occurs.
|
||||
func collectBMCFirmware() []schema.HardwareFirmwareRecord {
|
||||
func collectBMCFirmware(manufacturer string) []schema.HardwareFirmwareRecord {
|
||||
if _, err := exec.LookPath("ipmitool"); err != nil {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat("/dev/ipmi0"); err != nil {
|
||||
return nil
|
||||
}
|
||||
out, err := execIpmitool("mc", "info")
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
ctx, cancel := context.WithTimeout(context.Background(), profile.mcInfoTimeout)
|
||||
defer cancel()
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "mc", "info")
|
||||
raw, err := cmd.Output()
|
||||
if err != nil {
|
||||
slog.Info("bmc: ipmitool mc info unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
version := parseBMCFirmwareRevision(out)
|
||||
version := parseBMCFirmwareRevision(string(raw))
|
||||
if version == "" {
|
||||
return nil
|
||||
}
|
||||
@@ -177,15 +174,19 @@ func cleanDMIValue(v string) string {
|
||||
upper := strings.ToUpper(v)
|
||||
placeholders := []string{
|
||||
"TO BE FILLED BY O.E.M.",
|
||||
"TO BE FILLED BY O.E.M",
|
||||
"NOT SPECIFIED",
|
||||
"NOT SETTABLE",
|
||||
"NOT PRESENT",
|
||||
"NOT AVAILABLE",
|
||||
"UNKNOWN",
|
||||
"N/A",
|
||||
"NONE",
|
||||
"NULL",
|
||||
"DEFAULT STRING",
|
||||
"0",
|
||||
"0123456789",
|
||||
"1234567890",
|
||||
}
|
||||
for _, p := range placeholders {
|
||||
if upper == p {
|
||||
|
||||
@@ -84,6 +84,10 @@ func TestCleanDMIValue(t *testing.T) {
|
||||
{" Inspur ", "Inspur"},
|
||||
{"", ""},
|
||||
{"0", ""},
|
||||
{"0123456789", ""},
|
||||
{"1234567890", ""},
|
||||
{"Not Available", ""},
|
||||
{"To Be Filled By O.E.M", ""},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
got := cleanDMIValue(tt.input)
|
||||
@@ -109,6 +113,80 @@ func TestParseDMIFields(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBoard_Dell(t *testing.T) {
|
||||
type1 := mustReadFile(t, "testdata/dmidecode_type1_dell.txt")
|
||||
type2 := mustReadFile(t, "testdata/dmidecode_type2_dell.txt")
|
||||
|
||||
board := parseBoard(type1, type2)
|
||||
|
||||
if board.SerialNumber != "7SG9F63" {
|
||||
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "7SG9F63")
|
||||
}
|
||||
if board.Manufacturer == nil || *board.Manufacturer != "Dell Inc." {
|
||||
t.Errorf("manufacturer: got %v, want Dell Inc.", board.Manufacturer)
|
||||
}
|
||||
if board.ProductName == nil || *board.ProductName != "PowerEdge R740xd" {
|
||||
t.Errorf("product_name: got %v, want PowerEdge R740xd", board.ProductName)
|
||||
}
|
||||
// part number comes from type2 Product Name
|
||||
if board.PartNumber == nil || *board.PartNumber != "0F9N89" {
|
||||
t.Errorf("part_number: got %v, want 0F9N89", board.PartNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBoard_HPE(t *testing.T) {
|
||||
type1 := mustReadFile(t, "testdata/dmidecode_type1_hpe.txt")
|
||||
type2 := mustReadFile(t, "testdata/dmidecode_type2_hpe.txt")
|
||||
|
||||
board := parseBoard(type1, type2)
|
||||
|
||||
if board.SerialNumber != "CZJ9320CXN" {
|
||||
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "CZJ9320CXN")
|
||||
}
|
||||
if board.Manufacturer == nil || *board.Manufacturer != "HPE" {
|
||||
t.Errorf("manufacturer: got %v, want HPE", board.Manufacturer)
|
||||
}
|
||||
if board.ProductName == nil || *board.ProductName != "ProLiant DL380 Gen10" {
|
||||
t.Errorf("product_name: got %v, want ProLiant DL380 Gen10", board.ProductName)
|
||||
}
|
||||
if board.PartNumber == nil || *board.PartNumber != "ProLiant DL380 Gen10" {
|
||||
t.Errorf("part_number: got %v, want ProLiant DL380 Gen10", board.PartNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBoard_Supermicro_Placeholders(t *testing.T) {
|
||||
type1 := mustReadFile(t, "testdata/dmidecode_type1_supermicro.txt")
|
||||
type2 := mustReadFile(t, "testdata/dmidecode_type2_supermicro.txt")
|
||||
|
||||
board := parseBoard(type1, type2)
|
||||
|
||||
if board.SerialNumber != "S214726X2A36789" {
|
||||
t.Errorf("serial_number: got %q, want %q", board.SerialNumber, "S214726X2A36789")
|
||||
}
|
||||
if board.Manufacturer == nil || *board.Manufacturer != "Supermicro" {
|
||||
t.Errorf("manufacturer: got %v, want Supermicro", board.Manufacturer)
|
||||
}
|
||||
if board.ProductName == nil || *board.ProductName != "SYS-6028R-WTR" {
|
||||
t.Errorf("product_name: got %v, want SYS-6028R-WTR", board.ProductName)
|
||||
}
|
||||
// "X10DRW-i" is the real part number from type 2
|
||||
if board.PartNumber == nil || *board.PartNumber != "X10DRW-i" {
|
||||
t.Errorf("part_number: got %v, want X10DRW-i", board.PartNumber)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseBIOSFirmware_Dell(t *testing.T) {
|
||||
type0 := mustReadFile(t, "testdata/dmidecode_type0_dell.txt")
|
||||
fw := parseBIOSFirmware(type0)
|
||||
|
||||
if len(fw) != 1 {
|
||||
t.Fatalf("expected 1 firmware record, got %d", len(fw))
|
||||
}
|
||||
if fw[0].Version != "2.5.4" {
|
||||
t.Errorf("version: got %q, want 2.5.4", fw[0].Version)
|
||||
}
|
||||
}
|
||||
|
||||
func mustReadFile(t *testing.T, path string) string {
|
||||
t.Helper()
|
||||
b, err := os.ReadFile(path)
|
||||
|
||||
@@ -23,7 +23,7 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
board, biosFW := collectBoard()
|
||||
snap.Board = board
|
||||
snap.Firmware = append(snap.Firmware, biosFW...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware()...)
|
||||
snap.Firmware = append(snap.Firmware, collectBMCFirmware(derefString(snap.Board.Manufacturer))...)
|
||||
|
||||
snap.CPUs = collectCPUs()
|
||||
|
||||
@@ -34,19 +34,23 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
}
|
||||
snap.CPUs = enrichCPUsWithTelemetry(snap.CPUs, sensorDoc)
|
||||
snap.Memory = enrichMemoryWithTelemetry(snap.Memory, sensorDoc)
|
||||
bestEffortRescanHotplugStorage()
|
||||
snap.Storage = collectStorage()
|
||||
snap.PCIeDevices = collectPCIe()
|
||||
snap.PCIeDevices = enrichPCIeWithAMD(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithPCISerials(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNVIDIA(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichNVLinkBridgesWithGPUTopo(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithMellanox(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithNICTelemetry(snap.PCIeDevices)
|
||||
snap.PCIeDevices = enrichPCIeWithRAIDTelemetry(snap.PCIeDevices)
|
||||
snap.Storage = enrichStorageWithVROC(snap.Storage, snap.PCIeDevices)
|
||||
snap.Storage = appendUniqueStorage(snap.Storage, collectRAIDStorage(snap.PCIeDevices))
|
||||
snap.PowerSupplies = collectPSUs()
|
||||
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
|
||||
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
|
||||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||
snap.Sensors = mergeIPMISensors(buildSensorsFromDoc(sensorDoc), collectIPMISensors())
|
||||
snap.EventLogs = append(collectIPMISEL(), collectDmesgErrors()...)
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
// remaining collectors added in steps 1.8 – 1.10
|
||||
|
||||
129
audit/internal/collector/dmesg_events.go
Normal file
129
audit/internal/collector/dmesg_events.go
Normal file
@@ -0,0 +1,129 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// dmesg -T output: [Thu Jun 18 14:23:45 2026] message
|
||||
// dmesg without -T: [ 123.456789] message
|
||||
var dmesgTimestampRE = regexp.MustCompile(`^\[([^\]]+)\]\s*(.*)$`)
|
||||
|
||||
// Keywords that indicate an error or hardware problem worth capturing.
|
||||
var dmesgErrorPatterns = []*regexp.Regexp{
|
||||
regexp.MustCompile(`(?i)\berr(or)?\b`),
|
||||
regexp.MustCompile(`(?i)\bfail(ed|ure)?\b`),
|
||||
regexp.MustCompile(`(?i)\bfault\b`),
|
||||
regexp.MustCompile(`(?i)\bwarn(ing)?\b`),
|
||||
regexp.MustCompile(`(?i)\bAER\b`),
|
||||
regexp.MustCompile(`(?i)\bXid\b`),
|
||||
regexp.MustCompile(`(?i)\bNVRM\b`),
|
||||
regexp.MustCompile(`(?i)\bpanic\b`),
|
||||
regexp.MustCompile(`(?i)\bcorrected\b`),
|
||||
regexp.MustCompile(`(?i)\buncorrect`),
|
||||
regexp.MustCompile(`(?i)\bECC\b`),
|
||||
regexp.MustCompile(`(?i)\btimeout\b`),
|
||||
regexp.MustCompile(`(?i)\breset\b`),
|
||||
regexp.MustCompile(`(?i)\bdead\b`),
|
||||
regexp.MustCompile(`(?i)\bhang\b`),
|
||||
regexp.MustCompile(`(?i)\bstall\b`),
|
||||
regexp.MustCompile(`(?i)\bdisabled\b`),
|
||||
}
|
||||
|
||||
// collectDmesgErrors runs `dmesg -T` (or `dmesg` without -T on failure) and
|
||||
// returns only lines that match known error/warning patterns.
|
||||
func collectDmesgErrors() []schema.HardwareEventLog {
|
||||
out, err := exec.Command("dmesg", "-T").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
// Fallback: dmesg without human-readable timestamps
|
||||
out, err = exec.Command("dmesg").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
entries := parseDmesgErrors(string(out))
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Info("dmesg: collected error entries", "count", len(entries))
|
||||
return entries
|
||||
}
|
||||
|
||||
func parseDmesgErrors(output string) []schema.HardwareEventLog {
|
||||
var entries []schema.HardwareEventLog
|
||||
collectedAt := time.Now().UTC().Format(time.RFC3339)
|
||||
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var timestamp, message string
|
||||
if m := dmesgTimestampRE.FindStringSubmatch(line); m != nil {
|
||||
timestamp = strings.TrimSpace(m[1])
|
||||
message = strings.TrimSpace(m[2])
|
||||
} else {
|
||||
message = line
|
||||
}
|
||||
|
||||
if message == "" {
|
||||
continue
|
||||
}
|
||||
if !matchesAny(message, dmesgErrorPatterns) {
|
||||
continue
|
||||
}
|
||||
|
||||
severity := dmesgSeverity(message)
|
||||
source := "dmesg"
|
||||
|
||||
var eventTime *string
|
||||
if timestamp != "" {
|
||||
t := timestamp
|
||||
eventTime = &t
|
||||
} else {
|
||||
eventTime = &collectedAt
|
||||
}
|
||||
|
||||
entries = append(entries, schema.HardwareEventLog{
|
||||
Source: source,
|
||||
EventTime: eventTime,
|
||||
Severity: &severity,
|
||||
Message: message,
|
||||
})
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func matchesAny(s string, patterns []*regexp.Regexp) bool {
|
||||
for _, p := range patterns {
|
||||
if p.MatchString(s) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func dmesgSeverity(msg string) string {
|
||||
lower := strings.ToLower(msg)
|
||||
switch {
|
||||
case strings.Contains(lower, "panic") ||
|
||||
strings.Contains(lower, "aer") ||
|
||||
strings.Contains(lower, "uncorrect") ||
|
||||
strings.Contains(lower, "xid") ||
|
||||
strings.Contains(lower, "nvrm"):
|
||||
return statusCritical
|
||||
case strings.Contains(lower, "error") ||
|
||||
strings.Contains(lower, "fault") ||
|
||||
strings.Contains(lower, "fail") ||
|
||||
strings.Contains(lower, "dead") ||
|
||||
strings.Contains(lower, "hang"):
|
||||
return statusCritical
|
||||
default:
|
||||
return statusWarning
|
||||
}
|
||||
}
|
||||
92
audit/internal/collector/ipmi_profile.go
Normal file
92
audit/internal/collector/ipmi_profile.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package collector
|
||||
|
||||
// Package-level IPMI tuning profiles.
|
||||
//
|
||||
// Each profile is matched by board manufacturer (already known before PSU
|
||||
// collection runs). The profile drives two things:
|
||||
// - Per-command timeouts — prevents infinite hangs on slow BMCs.
|
||||
// - FRU early-exit — streaming parser stops reading once all PSU entries
|
||||
// are found, avoiding the tail of non-PSU FRU records.
|
||||
//
|
||||
// To add a new vendor: append to ipmiProfiles. The first matching entry wins.
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ipmiProfile holds tuning parameters for one or more board manufacturers.
|
||||
type ipmiProfile struct {
|
||||
// name is shown in log messages.
|
||||
name string
|
||||
// manufacturers is a list of lowercase substrings matched against the
|
||||
// board manufacturer string from dmidecode type 1.
|
||||
manufacturers []string
|
||||
// fruTimeout is the hard deadline for the entire `ipmitool fru print`
|
||||
// command. Zero means no timeout (not recommended).
|
||||
fruTimeout time.Duration
|
||||
// sdrTimeout is the hard deadline for `ipmitool sdr`.
|
||||
sdrTimeout time.Duration
|
||||
// mcInfoTimeout is the hard deadline for `ipmitool mc info`.
|
||||
mcInfoTimeout time.Duration
|
||||
// fruEarlyExit instructs the streaming FRU parser to stop reading
|
||||
// after it has found at least one PSU entry and the current block is
|
||||
// complete. Useful on servers with many non-PSU FRU devices.
|
||||
fruEarlyExit bool
|
||||
}
|
||||
|
||||
// ipmiProfiles is the ordered list of profiles. First match wins.
|
||||
var ipmiProfiles = []ipmiProfile{
|
||||
{
|
||||
// Lenovo XCC-based servers (ThinkSystem SR6xx / SR8xx / ST series).
|
||||
// SR650 V3 has 54 FRU devices; each IPMI read takes ~2 s, so the
|
||||
// full `fru print` scan takes ~108 s on a loaded BMC. Enable early
|
||||
// exit so collection stops once PSU records are found.
|
||||
name: "lenovo",
|
||||
manufacturers: []string{"lenovo"},
|
||||
fruTimeout: 90 * time.Second,
|
||||
sdrTimeout: 45 * time.Second,
|
||||
mcInfoTimeout: 15 * time.Second,
|
||||
fruEarlyExit: true,
|
||||
},
|
||||
{
|
||||
// HPE iLO-based servers (ProLiant DL/ML/BL).
|
||||
name: "hpe",
|
||||
manufacturers: []string{"hp", "hewlett packard"},
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
},
|
||||
{
|
||||
// Dell iDRAC-based servers.
|
||||
name: "dell",
|
||||
manufacturers: []string{"dell"},
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
},
|
||||
}
|
||||
|
||||
// defaultIPMIProfile is used when no vendor profile matches.
|
||||
var defaultIPMIProfile = ipmiProfile{
|
||||
name: "default",
|
||||
fruTimeout: 60 * time.Second,
|
||||
sdrTimeout: 30 * time.Second,
|
||||
mcInfoTimeout: 10 * time.Second,
|
||||
fruEarlyExit: false,
|
||||
}
|
||||
|
||||
// selectIPMIProfile returns the profile for the given board manufacturer.
|
||||
func selectIPMIProfile(manufacturer string) ipmiProfile {
|
||||
mfgLower := strings.ToLower(strings.TrimSpace(manufacturer))
|
||||
for _, p := range ipmiProfiles {
|
||||
for _, m := range p.manufacturers {
|
||||
if strings.Contains(mfgLower, m) {
|
||||
return p
|
||||
}
|
||||
}
|
||||
}
|
||||
return defaultIPMIProfile
|
||||
}
|
||||
90
audit/internal/collector/ipmi_sel.go
Normal file
90
audit/internal/collector/ipmi_sel.go
Normal file
@@ -0,0 +1,90 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// collectIPMISEL runs `ipmitool sel list` and returns parsed event log entries.
|
||||
// Returns nil if ipmitool is unavailable or the SEL is empty.
|
||||
func collectIPMISEL() []schema.HardwareEventLog {
|
||||
out, err := exec.Command("ipmitool", "sel", "list").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
entries := parseIPMISELOutput(string(out))
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Info("ipmi sel: collected", "entries", len(entries))
|
||||
return entries
|
||||
}
|
||||
|
||||
// parseIPMISELOutput parses `ipmitool sel list` output.
|
||||
// Line format: ID | date | time | sensor | event description | direction
|
||||
// Example: 1 | 06/18/2026 | 14:23:45 | Temperature #0x30 | Upper Critical going high | Asserted
|
||||
func parseIPMISELOutput(output string) []schema.HardwareEventLog {
|
||||
var entries []schema.HardwareEventLog
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(line, "|", 6)
|
||||
if len(parts) < 5 {
|
||||
continue
|
||||
}
|
||||
id := strings.TrimSpace(parts[0])
|
||||
date := strings.TrimSpace(parts[1])
|
||||
timeStr := strings.TrimSpace(parts[2])
|
||||
sensor := strings.TrimSpace(parts[3])
|
||||
event := strings.TrimSpace(parts[4])
|
||||
direction := ""
|
||||
if len(parts) == 6 {
|
||||
direction = strings.TrimSpace(parts[5])
|
||||
}
|
||||
|
||||
var eventTime *string
|
||||
if date != "" && timeStr != "" {
|
||||
t := fmt.Sprintf("%s %s", date, timeStr)
|
||||
eventTime = &t
|
||||
}
|
||||
|
||||
message := event
|
||||
if direction != "" && strings.EqualFold(direction, "Deasserted") {
|
||||
message = event + " (Deasserted)"
|
||||
}
|
||||
|
||||
severity := ipmiSELSeverity(event)
|
||||
isActive := !strings.EqualFold(direction, "Deasserted")
|
||||
|
||||
entry := schema.HardwareEventLog{
|
||||
Source: "ipmi-sel",
|
||||
EventTime: eventTime,
|
||||
Severity: &severity,
|
||||
MessageID: &id,
|
||||
Message: message,
|
||||
IsActive: &isActive,
|
||||
}
|
||||
if sensor != "" {
|
||||
entry.ComponentRef = &sensor
|
||||
}
|
||||
entries = append(entries, entry)
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func ipmiSELSeverity(event string) string {
|
||||
lower := strings.ToLower(event)
|
||||
switch {
|
||||
case strings.Contains(lower, "critical") || strings.Contains(lower, "non-recoverable"):
|
||||
return statusCritical
|
||||
case strings.Contains(lower, "non-critical") || strings.Contains(lower, "warning") || strings.Contains(lower, "degraded"):
|
||||
return statusWarning
|
||||
default:
|
||||
return "info"
|
||||
}
|
||||
}
|
||||
216
audit/internal/collector/ipmi_sensors.go
Normal file
216
audit/internal/collector/ipmi_sensors.go
Normal file
@@ -0,0 +1,216 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// collectIPMISensors runs `ipmitool sensor` and returns parsed sensor readings.
|
||||
// Returns nil if ipmitool is unavailable or produces no output.
|
||||
func collectIPMISensors() *schema.HardwareSensors {
|
||||
out, err := exec.Command("ipmitool", "sensor").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
result := parseIPMISensorOutput(string(out))
|
||||
if result == nil {
|
||||
return nil
|
||||
}
|
||||
slog.Info("ipmi sensors: collected",
|
||||
"fans", len(result.Fans),
|
||||
"temperatures", len(result.Temperatures),
|
||||
"power", len(result.Power),
|
||||
"other", len(result.Other),
|
||||
)
|
||||
return result
|
||||
}
|
||||
|
||||
// parseIPMISensorOutput parses `ipmitool sensor` text output.
|
||||
// Each line: name | value | unit | status | lnr | lcr | lnc | unc | ucr | unr
|
||||
func parseIPMISensorOutput(output string) *schema.HardwareSensors {
|
||||
result := &schema.HardwareSensors{}
|
||||
seen := map[string]struct{}{}
|
||||
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 4 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(parts[0])
|
||||
rawVal := strings.TrimSpace(parts[1])
|
||||
unit := strings.TrimSpace(parts[2])
|
||||
status := strings.TrimSpace(parts[3])
|
||||
|
||||
if name == "" || rawVal == "na" || rawVal == "N/A" || rawVal == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
value, err := strconv.ParseFloat(rawVal, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
statusStr := normalizeIPMISensorStatus(status)
|
||||
|
||||
switch {
|
||||
case strings.EqualFold(unit, "RPM"):
|
||||
if duplicateSensor(seen, "fan", name) {
|
||||
continue
|
||||
}
|
||||
rpm := int(value)
|
||||
item := schema.HardwareFanSensor{Name: name, RPM: &rpm}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Fans = append(result.Fans, item)
|
||||
|
||||
case strings.EqualFold(unit, "degrees C") || strings.EqualFold(unit, "C"):
|
||||
if duplicateSensor(seen, "temp", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwareTemperatureSensor{Name: name, Celsius: &value}
|
||||
if len(parts) >= 9 {
|
||||
if unc := parseIPMIThreshold(parts[7]); unc != nil {
|
||||
item.ThresholdWarningCelsius = unc
|
||||
}
|
||||
if ucr := parseIPMIThreshold(parts[8]); ucr != nil {
|
||||
item.ThresholdCriticalCelsius = ucr
|
||||
}
|
||||
}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
} else {
|
||||
item.Status = deriveTemperatureStatus(item.Celsius, item.ThresholdWarningCelsius, item.ThresholdCriticalCelsius)
|
||||
}
|
||||
result.Temperatures = append(result.Temperatures, item)
|
||||
|
||||
case strings.EqualFold(unit, "Volts") || strings.EqualFold(unit, "V"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, VoltageV: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
case strings.EqualFold(unit, "Watts") || strings.EqualFold(unit, "W"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, PowerW: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
case strings.EqualFold(unit, "Amps") || strings.EqualFold(unit, "A"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, CurrentA: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
default:
|
||||
if duplicateSensor(seen, "other", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwareOtherSensor{Name: name, Value: &value}
|
||||
if unit != "" {
|
||||
item.Unit = &unit
|
||||
}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Other = append(result.Other, item)
|
||||
}
|
||||
}
|
||||
|
||||
if len(result.Fans) == 0 && len(result.Temperatures) == 0 && len(result.Power) == 0 && len(result.Other) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func parseIPMIThreshold(raw string) *float64 {
|
||||
s := strings.TrimSpace(raw)
|
||||
if s == "" || s == "na" || s == "N/A" {
|
||||
return nil
|
||||
}
|
||||
v, err := strconv.ParseFloat(s, 64)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
return &v
|
||||
}
|
||||
|
||||
func normalizeIPMISensorStatus(s string) string {
|
||||
switch strings.ToLower(s) {
|
||||
case "ok":
|
||||
return statusOK
|
||||
case "cr", "ucr", "lcr":
|
||||
return statusCritical
|
||||
case "nc", "unc", "lnc", "nr", "unr", "lnr":
|
||||
return statusWarning
|
||||
case "ns", "na":
|
||||
return ""
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// mergeIPMISensors appends IPMI sensor entries into existing, skipping names already present.
|
||||
func mergeIPMISensors(existing, ipmi *schema.HardwareSensors) *schema.HardwareSensors {
|
||||
if ipmi == nil {
|
||||
return existing
|
||||
}
|
||||
if existing == nil {
|
||||
return ipmi
|
||||
}
|
||||
|
||||
existingNames := map[string]struct{}{}
|
||||
for _, s := range existing.Fans {
|
||||
existingNames["fan\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Temperatures {
|
||||
existingNames["temp\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Power {
|
||||
existingNames["power\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Other {
|
||||
existingNames["other\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
|
||||
for _, s := range ipmi.Fans {
|
||||
if _, ok := existingNames["fan\x00"+s.Name]; !ok {
|
||||
existing.Fans = append(existing.Fans, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Temperatures {
|
||||
if _, ok := existingNames["temp\x00"+s.Name]; !ok {
|
||||
existing.Temperatures = append(existing.Temperatures, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Power {
|
||||
if _, ok := existingNames["power\x00"+s.Name]; !ok {
|
||||
existing.Power = append(existing.Power, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Other {
|
||||
if _, ok := existingNames["other\x00"+s.Name]; !ok {
|
||||
existing.Other = append(existing.Other, s)
|
||||
}
|
||||
}
|
||||
return existing
|
||||
}
|
||||
87
audit/internal/collector/memory_test.go
Normal file
87
audit/internal/collector/memory_test.go
Normal file
@@ -0,0 +1,87 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseMemory_Mixed(t *testing.T) {
|
||||
out := mustReadFile(t, "testdata/dmidecode_type17_mixed.txt")
|
||||
dimms := parseMemory(out)
|
||||
|
||||
if len(dimms) != 3 {
|
||||
t.Fatalf("expected 3 DIMMs, got %d", len(dimms))
|
||||
}
|
||||
|
||||
// slot 0: populated, 16 GB Supermicro-style
|
||||
d0 := dimms[0]
|
||||
if d0.Present == nil || !*d0.Present {
|
||||
t.Errorf("dimm0: expected present=true")
|
||||
}
|
||||
if d0.SizeMB == nil || *d0.SizeMB != 16384 {
|
||||
t.Errorf("dimm0: size_mb=%v, want 16384", d0.SizeMB)
|
||||
}
|
||||
if d0.Slot == nil || *d0.Slot != "P1-DIMMA1" {
|
||||
t.Errorf("dimm0: slot=%v, want P1-DIMMA1", d0.Slot)
|
||||
}
|
||||
if d0.Location == nil || *d0.Location != "P0_Node0_Channel0_Dimm0" {
|
||||
t.Errorf("dimm0: location=%v, want P0_Node0_Channel0_Dimm0", d0.Location)
|
||||
}
|
||||
if d0.Manufacturer == nil || *d0.Manufacturer != "Micron" {
|
||||
t.Errorf("dimm0: manufacturer=%v, want Micron", d0.Manufacturer)
|
||||
}
|
||||
if d0.PartNumber == nil || *d0.PartNumber != "36ASF2G72PZ-2G1A2" {
|
||||
t.Errorf("dimm0: part_number=%v, want 36ASF2G72PZ-2G1A2", d0.PartNumber)
|
||||
}
|
||||
if d0.MaxSpeedMHz == nil || *d0.MaxSpeedMHz != 2133 {
|
||||
t.Errorf("dimm0: max_speed_mhz=%v, want 2133", d0.MaxSpeedMHz)
|
||||
}
|
||||
|
||||
// slot 1: empty
|
||||
d1 := dimms[1]
|
||||
if d1.Present == nil || *d1.Present {
|
||||
t.Errorf("dimm1: expected present=false")
|
||||
}
|
||||
if d1.Status == nil || *d1.Status != statusEmpty {
|
||||
t.Errorf("dimm1: status=%v, want %s", d1.Status, statusEmpty)
|
||||
}
|
||||
if d1.SizeMB != nil {
|
||||
t.Errorf("dimm1: size_mb should be nil for empty slot, got %v", d1.SizeMB)
|
||||
}
|
||||
|
||||
// slot 2: populated, 32768 MB Dell-style size
|
||||
d2 := dimms[2]
|
||||
if d2.Present == nil || !*d2.Present {
|
||||
t.Errorf("dimm2: expected present=true")
|
||||
}
|
||||
if d2.SizeMB == nil || *d2.SizeMB != 32768 {
|
||||
t.Errorf("dimm2: size_mb=%v, want 32768", d2.SizeMB)
|
||||
}
|
||||
if d2.Manufacturer == nil || *d2.Manufacturer != "Samsung" {
|
||||
t.Errorf("dimm2: manufacturer=%v, want Samsung", d2.Manufacturer)
|
||||
}
|
||||
if d2.CurrentSpeedMHz == nil || *d2.CurrentSpeedMHz != 2400 {
|
||||
t.Errorf("dimm2: current_speed_mhz=%v, want 2400", d2.CurrentSpeedMHz)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMemorySizeMB(t *testing.T) {
|
||||
tests := []struct {
|
||||
input string
|
||||
want int
|
||||
}{
|
||||
{"16 GB", 16384},
|
||||
{"32 GB", 32768},
|
||||
{"8 GB", 8192},
|
||||
{"16384 MB", 16384},
|
||||
{"32768 MB", 32768},
|
||||
{"No Module Installed", 0},
|
||||
{"0", 0},
|
||||
{"", 0},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
got := parseMemorySizeMB(tt.input)
|
||||
if got != tt.want {
|
||||
t.Errorf("parseMemorySizeMB(%q) = %d, want %d", tt.input, got, tt.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -11,7 +11,6 @@ import (
|
||||
"time"
|
||||
)
|
||||
|
||||
const mellanoxVendorID = 0x15b3
|
||||
const nicProbeTimeout = 2 * time.Second
|
||||
|
||||
var (
|
||||
@@ -80,16 +79,7 @@ func enrichPCIeWithMellanox(devs []schema.HardwarePCIeDevice) []schema.HardwareP
|
||||
}
|
||||
|
||||
func isMellanoxDevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.VendorID != nil && *dev.VendorID == mellanoxVendorID {
|
||||
return true
|
||||
}
|
||||
if dev.Manufacturer != nil {
|
||||
m := strings.ToLower(*dev.Manufacturer)
|
||||
if strings.Contains(m, "mellanox") || strings.Contains(m, "nvidia networking") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
return dev.VendorID != nil && *dev.VendorID == MellanoxVendorID
|
||||
}
|
||||
|
||||
func queryMellanoxFromMstflint(bdf string) (firmware, serial string) {
|
||||
|
||||
@@ -55,7 +55,7 @@ func TestEnrichPCIeWithMellanox_mstflint(t *testing.T) {
|
||||
}
|
||||
netIfacesByBDF = func(string) []string { return nil }
|
||||
|
||||
vendorID := mellanoxVendorID
|
||||
vendorID := MellanoxVendorID
|
||||
bdf := "0000:18:00.0"
|
||||
manufacturer := "Mellanox Technologies"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
@@ -99,7 +99,7 @@ func TestEnrichPCIeWithMellanox_fallbackEthtool(t *testing.T) {
|
||||
return "driver: mlx5_core\nfirmware-version: 28.40.1000\n", nil
|
||||
}
|
||||
|
||||
vendorID := mellanoxVendorID
|
||||
vendorID := MellanoxVendorID
|
||||
bdf := "0000:18:00.0"
|
||||
manufacturer := "NVIDIA Networking"
|
||||
devs := []schema.HardwarePCIeDevice{{
|
||||
|
||||
@@ -10,8 +10,6 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
const nvidiaVendorID = 0x10de
|
||||
|
||||
type nvidiaGPUInfo struct {
|
||||
Index int
|
||||
BDF string
|
||||
@@ -240,13 +238,7 @@ func normalizePCIeBDF(bdf string) string {
|
||||
}
|
||||
|
||||
func isNVIDIADevice(dev schema.HardwarePCIeDevice) bool {
|
||||
if dev.VendorID != nil && *dev.VendorID == nvidiaVendorID {
|
||||
return true
|
||||
}
|
||||
if dev.Manufacturer != nil && strings.Contains(strings.ToLower(*dev.Manufacturer), "nvidia") {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
return dev.VendorID != nil && *dev.VendorID == NvidiaVendorID
|
||||
}
|
||||
|
||||
func setPCIeFallback(dev *schema.HardwarePCIeDevice) {
|
||||
|
||||
@@ -57,7 +57,7 @@ func TestNormalizePCIeBDF(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
vendorID := nvidiaVendorID
|
||||
vendorID := NvidiaVendorID
|
||||
bdf := "0000:65:00.0"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
status := "OK"
|
||||
@@ -104,7 +104,7 @@ func TestEnrichPCIeWithNVIDIAData_driverLoaded(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestEnrichPCIeWithNVIDIAData_driverMissingFallback(t *testing.T) {
|
||||
vendorID := nvidiaVendorID
|
||||
vendorID := NvidiaVendorID
|
||||
bdf := "0000:17:00.0"
|
||||
manufacturer := "NVIDIA Corporation"
|
||||
devices := []schema.HardwarePCIeDevice{
|
||||
|
||||
11
audit/internal/collector/pci_vendors.go
Normal file
11
audit/internal/collector/pci_vendors.go
Normal file
@@ -0,0 +1,11 @@
|
||||
package collector
|
||||
|
||||
// PCI vendor IDs for hardware classification.
|
||||
// Source: https://pcisig.com / https://pci-ids.ucw.cz/
|
||||
const (
|
||||
NvidiaVendorID = 0x10de
|
||||
AMDVendorID = 0x1002
|
||||
AspeedVendorID = 0x1a03
|
||||
MellanoxVendorID = 0x15b3
|
||||
IntelVendorID = 0x8086
|
||||
)
|
||||
@@ -4,7 +4,9 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@@ -124,35 +126,39 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
dev.Status = &status
|
||||
|
||||
// Slot is the BDF: "0000:00:02.0"
|
||||
if bdf := fields["Slot"]; bdf != "" {
|
||||
dev.Slot = &bdf
|
||||
dev.BDF = &bdf
|
||||
bdfStr := fields["Slot"]
|
||||
if bdfStr != "" {
|
||||
dev.Slot = &bdfStr
|
||||
dev.BDF = &bdfStr
|
||||
// parse vendor_id and device_id from sysfs
|
||||
vendorID, deviceID := readPCIIDs(bdf)
|
||||
vendorID, deviceID := readPCIIDs(bdfStr)
|
||||
if vendorID != 0 {
|
||||
dev.VendorID = &vendorID
|
||||
}
|
||||
if deviceID != 0 {
|
||||
dev.DeviceID = &deviceID
|
||||
}
|
||||
if numaNode, ok := readPCINumaNode(bdf); ok {
|
||||
if numaNode, ok := readPCINumaNode(bdfStr); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
} else if numaNode, ok := parsePCINumaNode(fields["NUMANode"]); ok {
|
||||
dev.NUMANode = &numaNode
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "current_link_width"); ok {
|
||||
if group, ok := readPCIIOMMUGroup(bdfStr); ok {
|
||||
dev.IOMMUGroup = &group
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdfStr, "current_link_width"); ok {
|
||||
dev.LinkWidth = &width
|
||||
}
|
||||
if width, ok := readPCIIntAttribute(bdf, "max_link_width"); ok {
|
||||
if width, ok := readPCIIntAttribute(bdfStr, "max_link_width"); ok {
|
||||
dev.MaxLinkWidth = &width
|
||||
}
|
||||
if speed, ok := readPCIStringAttribute(bdf, "current_link_speed"); ok {
|
||||
if speed, ok := readPCIStringAttribute(bdfStr, "current_link_speed"); ok {
|
||||
linkSpeed := normalizePCILinkSpeed(speed)
|
||||
if linkSpeed != "" {
|
||||
dev.LinkSpeed = &linkSpeed
|
||||
}
|
||||
}
|
||||
if speed, ok := readPCIStringAttribute(bdf, "max_link_speed"); ok {
|
||||
if speed, ok := readPCIStringAttribute(bdfStr, "max_link_speed"); ok {
|
||||
linkSpeed := normalizePCILinkSpeed(speed)
|
||||
if linkSpeed != "" {
|
||||
dev.MaxLinkSpeed = &linkSpeed
|
||||
@@ -173,12 +179,35 @@ func parseLspciDevice(fields map[string]string) schema.HardwarePCIeDevice {
|
||||
|
||||
// SVendor/SDevice available but not in schema — skip
|
||||
|
||||
// Warn if PCIe link is running below its maximum negotiated speed.
|
||||
// Detect NVLink bridge mezzanine cards (CPU→HGX internal link).
|
||||
// These are Mellanox x2 devices with no host net interfaces and a DeviceName
|
||||
// containing "NVLINK". The targeted lspci call is only executed for the small
|
||||
// number of narrow-link Mellanox cards that pass the cheap pre-filter.
|
||||
if bdfStr != "" && isNVLinkBridgeCandidate(bdfStr, dev) && confirmNVLinkBridgeDeviceName(bdfStr) {
|
||||
markNVLinkBridge(&dev)
|
||||
}
|
||||
|
||||
// Warn (or Critical for NVLink bridges) if PCIe link is running below max.
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
return dev
|
||||
}
|
||||
|
||||
// readPCIIOMMUGroup resolves the IOMMU group number for a BDF via the
|
||||
// iommu_group symlink in sysfs: .../devices/<bdf>/iommu_group -> .../kernel/iommu_groups/<N>
|
||||
func readPCIIOMMUGroup(bdf string) (int, bool) {
|
||||
link := "/sys/bus/pci/devices/" + bdf + "/iommu_group"
|
||||
target, err := os.Readlink(link)
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
n, err := strconv.Atoi(filepath.Base(target))
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return n, true
|
||||
}
|
||||
|
||||
// readPCIIDs reads vendor and device IDs from sysfs for a given BDF.
|
||||
func readPCIIDs(bdf string) (vendorID, deviceID int) {
|
||||
base := "/sys/bus/pci/devices/" + bdf
|
||||
@@ -245,17 +274,37 @@ func readPCIStringAttribute(bdf, attribute string) (string, bool) {
|
||||
return value, true
|
||||
}
|
||||
|
||||
// applyPCIeLinkSpeedWarning sets the device status to Warning if the current PCIe link
|
||||
// speed is below the maximum negotiated speed supported by both ends.
|
||||
// applyPCIeLinkSpeedWarning sets device status when the current PCIe link speed is
|
||||
// below the device maximum. Regular PCIe slots get Warning; NVLink bridge cards
|
||||
// get Critical because they are fixed internal connectors that must always train
|
||||
// to max speed — any downgrade signals a hardware fault.
|
||||
//
|
||||
// Disabled devices (sysfs enable==0) are skipped: they carry no data traffic and
|
||||
// their link state has no operational impact. This covers management endpoints
|
||||
// (e.g. PCIe switch fabric controllers on HGX baseboards) that the kernel never
|
||||
// activates but that lspci still reports with link stats.
|
||||
func applyPCIeLinkSpeedWarning(dev *schema.HardwarePCIeDevice) {
|
||||
if dev.LinkSpeed == nil || dev.MaxLinkSpeed == nil {
|
||||
return
|
||||
}
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) < pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
if pcieLinkSpeedRank(*dev.LinkSpeed) >= pcieLinkSpeedRank(*dev.MaxLinkSpeed) {
|
||||
return
|
||||
}
|
||||
if dev.BDF != nil {
|
||||
if enabled, ok := readPCIIntAttribute(*dev.BDF, "enable"); ok && enabled == 0 {
|
||||
return
|
||||
}
|
||||
}
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
|
||||
isNVLinkBridge := dev.DeviceClass != nil && *dev.DeviceClass == "NVLinkBridge"
|
||||
if isNVLinkBridge {
|
||||
crit := statusCritical
|
||||
dev.Status = &crit
|
||||
} else {
|
||||
warn := statusWarning
|
||||
dev.Status = &warn
|
||||
desc := fmt.Sprintf("PCIe link speed degraded: running at %s, capable of %s", *dev.LinkSpeed, *dev.MaxLinkSpeed)
|
||||
dev.ErrorDescription = &desc
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
206
audit/internal/collector/pcie_nvlink_bridge.go
Normal file
206
audit/internal/collector/pcie_nvlink_bridge.go
Normal file
@@ -0,0 +1,206 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var nv5re = regexp.MustCompile(`(?i)^NV(\d+)$`)
|
||||
|
||||
// isNVLinkBridgeCandidate returns true for Mellanox PCIe devices that look like
|
||||
// NVLink bridge mezzanine cards: narrow link (x2), no host net interfaces.
|
||||
// These are the CPU-side PCIe control plane of the NVSwitch fabric on HGX/DGX systems.
|
||||
func isNVLinkBridgeCandidate(bdf string, dev schema.HardwarePCIeDevice) bool {
|
||||
if !isMellanoxDevice(dev) {
|
||||
return false
|
||||
}
|
||||
if dev.LinkWidth == nil || *dev.LinkWidth > 2 {
|
||||
return false
|
||||
}
|
||||
if len(netIfacesByBDF(bdf)) > 0 {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// confirmNVLinkBridgeDeviceName checks if the lspci DeviceName for bdf contains
|
||||
// "NVLINK". This is a targeted single-device call, only executed for candidates
|
||||
// already pre-filtered by isNVLinkBridgeCandidate.
|
||||
func confirmNVLinkBridgeDeviceName(bdf string) bool {
|
||||
out, err := exec.Command("lspci", "-s", bdf, "-v").Output()
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
if strings.Contains(strings.ToUpper(strings.TrimSpace(line)), "NVLINK") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// markNVLinkBridge overwrites device_class and adds telemetry flags on a detected
|
||||
// NVLink bridge card. Must be called before applyPCIeLinkSpeedWarning so that the
|
||||
// correct severity (Critical) is applied.
|
||||
func markNVLinkBridge(dev *schema.HardwarePCIeDevice) {
|
||||
class := "NVLinkBridge"
|
||||
dev.DeviceClass = &class
|
||||
if dev.Telemetry == nil {
|
||||
dev.Telemetry = map[string]any{}
|
||||
}
|
||||
dev.Telemetry["nvlink_bridge"] = true
|
||||
}
|
||||
|
||||
// enrichNVLinkBridgesWithGPUTopo cross-references NVLink bridge PCIe status with
|
||||
// the GPU-side NVLink topology reported by nvidia-smi. For each bridge device it
|
||||
// adds nvlink_topo_all_active and nvlink_topo_min_links to the telemetry, and
|
||||
// upgrades a degraded-link Warning to Critical when the fabric is also affected.
|
||||
func enrichNVLinkBridgesWithGPUTopo(devs []schema.HardwarePCIeDevice) []schema.HardwarePCIeDevice {
|
||||
hasBridge := false
|
||||
for _, d := range devs {
|
||||
if d.DeviceClass != nil && *d.DeviceClass == "NVLinkBridge" {
|
||||
hasBridge = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasBridge {
|
||||
return devs
|
||||
}
|
||||
|
||||
topo, err := queryNVIDIANVLinkTopo()
|
||||
if err != nil {
|
||||
slog.Info("nvlink-bridge: nvidia-smi topo unavailable, skipping cross-reference", "err", err)
|
||||
return devs
|
||||
}
|
||||
|
||||
for i := range devs {
|
||||
if devs[i].DeviceClass == nil || *devs[i].DeviceClass != "NVLinkBridge" {
|
||||
continue
|
||||
}
|
||||
if devs[i].Telemetry == nil {
|
||||
devs[i].Telemetry = map[string]any{}
|
||||
}
|
||||
devs[i].Telemetry["nvlink_topo_all_active"] = topo.AllActive
|
||||
devs[i].Telemetry["nvlink_topo_min_links"] = topo.MinNVLinks
|
||||
devs[i].Telemetry["nvlink_topo_gpu_count"] = topo.GPUCount
|
||||
|
||||
// If the bridge PCIe is already degraded AND the fabric is also degraded
|
||||
// (missing NVLink connections), escalate to Critical.
|
||||
if devs[i].Status != nil && *devs[i].Status == statusCritical && !topo.AllActive {
|
||||
devs[i].Telemetry["nvlink_fabric_affected"] = true
|
||||
}
|
||||
}
|
||||
|
||||
slog.Info("nvlink-bridge: topo cross-reference applied",
|
||||
"gpu_count", topo.GPUCount,
|
||||
"all_active", topo.AllActive,
|
||||
"min_links", topo.MinNVLinks,
|
||||
)
|
||||
return devs
|
||||
}
|
||||
|
||||
// nvlinkTopoResult summarises the GPU NVLink connectivity matrix.
|
||||
type nvlinkTopoResult struct {
|
||||
GPUCount int
|
||||
AllActive bool // true if every GPU pair has at least one NVLink bond
|
||||
MinNVLinks int // minimum NVLink bonds seen across any GPU pair (0 = some pair disconnected)
|
||||
}
|
||||
|
||||
// queryNVIDIANVLinkTopo runs nvidia-smi topo -m and parses the NVLink matrix.
|
||||
func queryNVIDIANVLinkTopo() (nvlinkTopoResult, error) {
|
||||
out, err := exec.Command("nvidia-smi", "topo", "-m").Output()
|
||||
if err != nil {
|
||||
return nvlinkTopoResult{}, err
|
||||
}
|
||||
return parseNVIDIATopologyMatrix(string(out)), nil
|
||||
}
|
||||
|
||||
// parseNVIDIATopologyMatrix extracts the minimum NVLink bond count from the
|
||||
// nvidia-smi topo -m matrix.
|
||||
//
|
||||
// Format (abbreviated):
|
||||
//
|
||||
// GPU0 GPU1 ... NIC0 NIC1
|
||||
// GPU0 X NV18 ... NODE NODE
|
||||
// GPU1 NV18 X ... NODE NODE
|
||||
// NIC0 NODE NODE... X PIX
|
||||
//
|
||||
// The header row starts with "GPU0"; its columns may include non-GPU entries
|
||||
// (NIC, CPU) which are ignored. Only GPU×GPU cells containing NV# values are
|
||||
// counted. X is self; non-NV tokens (NODE, SYS, PHB, PIX) are skipped.
|
||||
func parseNVIDIATopologyMatrix(raw string) nvlinkTopoResult {
|
||||
lines := strings.Split(raw, "\n")
|
||||
|
||||
// Locate the header line and record which column indices are GPU columns.
|
||||
headerIdx := -1
|
||||
var gpuColIndices []int // 0-based indices within fields (excluding the row label)
|
||||
var gpuCount int
|
||||
for i, line := range lines {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if strings.HasPrefix(trimmed, "GPU0") {
|
||||
parts := strings.Fields(trimmed)
|
||||
for j, col := range parts {
|
||||
if strings.HasPrefix(col, "GPU") {
|
||||
gpuColIndices = append(gpuColIndices, j)
|
||||
}
|
||||
}
|
||||
gpuCount = len(gpuColIndices)
|
||||
if gpuCount >= 2 {
|
||||
headerIdx = i
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if headerIdx < 0 || gpuCount == 0 {
|
||||
return nvlinkTopoResult{}
|
||||
}
|
||||
|
||||
minLinks := -1 // -1 = no NV pair seen yet
|
||||
allActive := true
|
||||
|
||||
for _, line := range lines[headerIdx+1:] {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if !strings.HasPrefix(trimmed, "GPU") {
|
||||
continue
|
||||
}
|
||||
cells := strings.Fields(trimmed)
|
||||
// cells[0] is the row label (e.g. "GPU0"); cells[1..] are column values.
|
||||
// gpuColIndices are 0-based within the header fields, so they map to
|
||||
// cells[idx+1] in the data rows (shift by 1 for the row label).
|
||||
for _, colIdx := range gpuColIndices {
|
||||
dataIdx := colIdx + 1
|
||||
if dataIdx >= len(cells) {
|
||||
continue
|
||||
}
|
||||
cell := cells[dataIdx]
|
||||
m := nv5re.FindStringSubmatch(cell)
|
||||
if len(m) != 2 {
|
||||
continue
|
||||
}
|
||||
n, err := strconv.Atoi(m[1])
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if n == 0 {
|
||||
allActive = false
|
||||
}
|
||||
if minLinks < 0 || n < minLinks {
|
||||
minLinks = n
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if minLinks < 0 {
|
||||
minLinks = 0
|
||||
}
|
||||
|
||||
return nvlinkTopoResult{
|
||||
GPUCount: gpuCount,
|
||||
AllActive: allActive && minLinks > 0,
|
||||
MinNVLinks: minLinks,
|
||||
}
|
||||
}
|
||||
124
audit/internal/collector/pcie_nvlink_bridge_test.go
Normal file
124
audit/internal/collector/pcie_nvlink_bridge_test.go
Normal file
@@ -0,0 +1,124 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseNVIDIATopologyMatrix(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// Real-world B200 HGX output: 8 GPUs, all pairs connected via NV18.
|
||||
input := ` GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 NIC0 NIC1
|
||||
GPU0 X NV18 NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU1 NV18 X NV18 NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU2 NV18 NV18 X NV18 NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU3 NV18 NV18 NV18 X NV18 NV18 NV18 NV18 NODE NODE
|
||||
GPU4 NV18 NV18 NV18 NV18 X NV18 NV18 NV18 SYS SYS
|
||||
GPU5 NV18 NV18 NV18 NV18 NV18 X NV18 NV18 SYS SYS
|
||||
GPU6 NV18 NV18 NV18 NV18 NV18 NV18 X NV18 SYS SYS
|
||||
GPU7 NV18 NV18 NV18 NV18 NV18 NV18 NV18 X SYS SYS
|
||||
NIC0 NODE NODE NODE NODE SYS SYS SYS SYS X PIX
|
||||
`
|
||||
got := parseNVIDIATopologyMatrix(input)
|
||||
|
||||
if got.GPUCount != 8 {
|
||||
t.Fatalf("GPUCount=%d want 8", got.GPUCount)
|
||||
}
|
||||
if !got.AllActive {
|
||||
t.Fatalf("AllActive=false want true")
|
||||
}
|
||||
if got.MinNVLinks != 18 {
|
||||
t.Fatalf("MinNVLinks=%d want 18", got.MinNVLinks)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVIDIATopologyMatrixPartialDegradation(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// GPU1-GPU3 pair shows NV12 (reduced) instead of NV18.
|
||||
input := ` GPU0 GPU1 GPU2 GPU3
|
||||
GPU0 X NV18 NV18 NV18
|
||||
GPU1 NV18 X NV18 NV12
|
||||
GPU2 NV18 NV18 X NV18
|
||||
GPU3 NV18 NV12 NV18 X
|
||||
`
|
||||
got := parseNVIDIATopologyMatrix(input)
|
||||
|
||||
if got.MinNVLinks != 12 {
|
||||
t.Fatalf("MinNVLinks=%d want 12", got.MinNVLinks)
|
||||
}
|
||||
if !got.AllActive {
|
||||
t.Fatalf("AllActive=false want true (12 links is still active)")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVIDIATopologyMatrixDisconnected(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// GPU0-GPU1 pair fully disconnected (NV0).
|
||||
input := ` GPU0 GPU1
|
||||
GPU0 X NV0
|
||||
GPU1 NV0 X
|
||||
`
|
||||
got := parseNVIDIATopologyMatrix(input)
|
||||
|
||||
if got.AllActive {
|
||||
t.Fatalf("AllActive=true want false (NV0 means no links)")
|
||||
}
|
||||
if got.MinNVLinks != 0 {
|
||||
t.Fatalf("MinNVLinks=%d want 0", got.MinNVLinks)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVIDIATopologyMatrixEmpty(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
got := parseNVIDIATopologyMatrix("no gpus here")
|
||||
if got.GPUCount != 0 {
|
||||
t.Fatalf("GPUCount=%d want 0", got.GPUCount)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyPCIeLinkSpeedWarningNVLinkBridgeEscalates(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
bridgeClass := "NVLinkBridge"
|
||||
linkSpeed := "Gen3"
|
||||
maxLinkSpeed := "Gen4"
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
dev.DeviceClass = &bridgeClass
|
||||
dev.LinkSpeed = &linkSpeed
|
||||
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||
s := statusOK
|
||||
dev.Status = &s
|
||||
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
if dev.Status == nil || *dev.Status != statusCritical {
|
||||
t.Fatalf("status=%v want Critical for NVLink bridge degradation", dev.Status)
|
||||
}
|
||||
if dev.ErrorDescription == nil {
|
||||
t.Fatal("ErrorDescription nil, want degradation message")
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplyPCIeLinkSpeedWarningRegularCardIsWarning(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
regularClass := "NetworkController"
|
||||
linkSpeed := "Gen3"
|
||||
maxLinkSpeed := "Gen4"
|
||||
dev := schema.HardwarePCIeDevice{}
|
||||
dev.DeviceClass = ®ularClass
|
||||
dev.LinkSpeed = &linkSpeed
|
||||
dev.MaxLinkSpeed = &maxLinkSpeed
|
||||
s := statusOK
|
||||
dev.Status = &s
|
||||
|
||||
applyPCIeLinkSpeedWarning(&dev)
|
||||
|
||||
if dev.Status == nil || *dev.Status != statusWarning {
|
||||
t.Fatalf("status=%v want Warning for regular card degradation", dev.Status)
|
||||
}
|
||||
}
|
||||
@@ -2,6 +2,8 @@ package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"bufio"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
@@ -10,16 +12,29 @@ import (
|
||||
"strings"
|
||||
)
|
||||
|
||||
func collectPSUs() []schema.HardwarePowerSupply {
|
||||
func collectPSUs(manufacturer string) []schema.HardwarePowerSupply {
|
||||
profile := selectIPMIProfile(manufacturer)
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
if out, err := exec.Command("ipmitool", "fru", "print").Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
fruCtx, fruCancel := context.WithTimeout(context.Background(), profile.fruTimeout)
|
||||
defer fruCancel()
|
||||
|
||||
if profile.fruEarlyExit {
|
||||
psus = collectFRUEarlyExit(fruCtx)
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
cmd := exec.CommandContext(fruCtx, "ipmitool", "fru", "print")
|
||||
if out, err := cmd.Output(); err == nil {
|
||||
psus = parseFRU(string(out))
|
||||
} else {
|
||||
slog.Info("psu: fru unavailable", "err", err)
|
||||
}
|
||||
}
|
||||
|
||||
sdrData := map[int]psuSDR{}
|
||||
if sdrOut, err := exec.Command("ipmitool", "sdr").Output(); err == nil {
|
||||
sdrCtx, sdrCancel := context.WithTimeout(context.Background(), profile.sdrTimeout)
|
||||
defer sdrCancel()
|
||||
cmd := exec.CommandContext(sdrCtx, "ipmitool", "sdr")
|
||||
if sdrOut, err := cmd.Output(); err == nil {
|
||||
sdrData = parsePSUSDR(string(sdrOut))
|
||||
if len(psus) == 0 {
|
||||
psus = synthesizePSUsFromSDR(sdrData)
|
||||
@@ -30,7 +45,66 @@ func collectPSUs() []schema.HardwarePowerSupply {
|
||||
slog.Info("psu: ipmitool unavailable, skipping", "err", err)
|
||||
return nil
|
||||
}
|
||||
slog.Info("psu: collected", "count", len(psus))
|
||||
slog.Info("psu: collected", "count", len(psus), "profile", profile.name)
|
||||
return psus
|
||||
}
|
||||
|
||||
// collectFRUEarlyExit streams ipmitool fru print line-by-line and stops reading
|
||||
// as soon as it has found all PSU blocks and the next block is not a PSU.
|
||||
// This avoids scanning all 50+ non-PSU FRU devices on Lenovo XCC servers.
|
||||
func collectFRUEarlyExit(ctx context.Context) []schema.HardwarePowerSupply {
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "print")
|
||||
pipe, err := cmd.StdoutPipe()
|
||||
if err != nil {
|
||||
slog.Info("psu: fru pipe unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
slog.Info("psu: fru start failed", "err", err)
|
||||
return nil
|
||||
}
|
||||
|
||||
var psus []schema.HardwarePowerSupply
|
||||
var currentBlock strings.Builder
|
||||
slot := 0
|
||||
psuFound := false
|
||||
stoppedEarly := false
|
||||
|
||||
scanner := bufio.NewScanner(pipe)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
if strings.HasPrefix(line, "FRU Device Description") {
|
||||
if currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
psuFound = true
|
||||
slot++
|
||||
}
|
||||
currentBlock.Reset()
|
||||
}
|
||||
// Stop once we've collected PSUs and hit a non-PSU block header.
|
||||
if psuFound && !isPSUHeader(strings.ToLower(line)) {
|
||||
stoppedEarly = true
|
||||
break
|
||||
}
|
||||
}
|
||||
currentBlock.WriteString(line)
|
||||
currentBlock.WriteByte('\n')
|
||||
}
|
||||
|
||||
if !stoppedEarly && currentBlock.Len() > 0 {
|
||||
if psu, ok := parseFRUBlock(currentBlock.String(), slot); ok {
|
||||
psus = append(psus, psu)
|
||||
}
|
||||
}
|
||||
|
||||
// Kill the process immediately on early exit rather than waiting for context timeout.
|
||||
if cmd.Process != nil {
|
||||
cmd.Process.Kill() //nolint:errcheck
|
||||
}
|
||||
cmd.Wait() //nolint:errcheck
|
||||
slog.Info("psu: fru early-exit complete", "psus_found", len(psus), "stopped_early", stoppedEarly)
|
||||
return psus
|
||||
}
|
||||
|
||||
|
||||
@@ -733,6 +733,37 @@ func parseMDStatArrays(raw string) []mdArray {
|
||||
return arrays
|
||||
}
|
||||
|
||||
// collectVROCLicense runs mdadm --detail-platform and extracts the License field.
|
||||
// Returns nil when VROC is absent or the platform does not report a license.
|
||||
func collectVROCLicense(pcie []schema.HardwarePCIeDevice) *string {
|
||||
if !hasVROCController(pcie) {
|
||||
return nil
|
||||
}
|
||||
out, err := raidToolQuery("mdadm", "--detail-platform")
|
||||
if err != nil {
|
||||
slog.Info("vroc: mdadm --detail-platform unavailable", "err", err)
|
||||
return nil
|
||||
}
|
||||
return parseMDAdmPlatformLicense(string(out))
|
||||
}
|
||||
|
||||
func parseMDAdmPlatformLicense(raw string) *string {
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
trimmed := strings.TrimSpace(line)
|
||||
if !strings.HasPrefix(strings.ToLower(trimmed), "license") {
|
||||
continue
|
||||
}
|
||||
if idx := strings.Index(trimmed, ":"); idx >= 0 {
|
||||
val := strings.TrimSpace(trimmed[idx+1:])
|
||||
if val != "" {
|
||||
v := strings.ToLower(val)
|
||||
return &v
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func queryDeviceSerial(devPath string) string {
|
||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||
var ctrl nvmeIDCtrl
|
||||
|
||||
@@ -58,7 +58,6 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
||||
|
||||
for _, chip := range chips {
|
||||
features := doc[chip]
|
||||
location := sensorLocation(chip)
|
||||
|
||||
keys := make([]string, 0, len(features))
|
||||
for key := range features {
|
||||
@@ -80,25 +79,25 @@ func buildSensorsFromDoc(doc sensorsDoc) *schema.HardwareSensors {
|
||||
}
|
||||
switch classifySensorFeature(feature) {
|
||||
case "fan":
|
||||
item := buildFanSensor(name, location, feature)
|
||||
item := buildFanSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "fan", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Fans = append(result.Fans, *item)
|
||||
case "temp":
|
||||
item := buildTempSensor(name, location, feature)
|
||||
item := buildTempSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "temp", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Temperatures = append(result.Temperatures, *item)
|
||||
case "power":
|
||||
item := buildPowerSensor(name, location, feature)
|
||||
item := buildPowerSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "power", item.Name) {
|
||||
continue
|
||||
}
|
||||
result.Power = append(result.Power, *item)
|
||||
default:
|
||||
item := buildOtherSensor(name, location, feature)
|
||||
item := buildOtherSensor(name, feature)
|
||||
if item == nil || duplicateSensor(seen, "other", item.Name) {
|
||||
continue
|
||||
}
|
||||
@@ -128,14 +127,6 @@ func duplicateSensor(seen map[string]struct{}, sensorType, name string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
func sensorLocation(chip string) *string {
|
||||
chip = strings.TrimSpace(chip)
|
||||
if chip == "" {
|
||||
return nil
|
||||
}
|
||||
return &chip
|
||||
}
|
||||
|
||||
func classifySensorFeature(feature map[string]any) string {
|
||||
for key := range feature {
|
||||
switch {
|
||||
@@ -154,24 +145,24 @@ func classifySensorFeature(feature map[string]any) string {
|
||||
return "other"
|
||||
}
|
||||
|
||||
func buildFanSensor(name string, location *string, feature map[string]any) *schema.HardwareFanSensor {
|
||||
func buildFanSensor(name string, feature map[string]any) *schema.HardwareFanSensor {
|
||||
rpm, ok := firstFeatureInt(feature, "_input")
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareFanSensor{Name: name, Location: location, RPM: &rpm}
|
||||
item := &schema.HardwareFanSensor{Name: name, RPM: &rpm}
|
||||
if status := sensorStatusFromFeature(feature); status != nil {
|
||||
item.Status = status
|
||||
}
|
||||
return item
|
||||
}
|
||||
|
||||
func buildTempSensor(name string, location *string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||
func buildTempSensor(name string, feature map[string]any) *schema.HardwareTemperatureSensor {
|
||||
celsius, ok := firstFeatureFloat(feature, "_input")
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareTemperatureSensor{Name: name, Location: location, Celsius: &celsius}
|
||||
item := &schema.HardwareTemperatureSensor{Name: name, Celsius: &celsius}
|
||||
if warning, ok := firstFeatureFloatWithSuffixes(feature, []string{"_max", "_high"}); ok {
|
||||
item.ThresholdWarningCelsius = &warning
|
||||
}
|
||||
@@ -186,8 +177,8 @@ func buildTempSensor(name string, location *string, feature map[string]any) *sch
|
||||
return item
|
||||
}
|
||||
|
||||
func buildPowerSensor(name string, location *string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||
item := &schema.HardwarePowerSensor{Name: name, Location: location}
|
||||
func buildPowerSensor(name string, feature map[string]any) *schema.HardwarePowerSensor {
|
||||
item := &schema.HardwarePowerSensor{Name: name}
|
||||
if v, ok := firstFeatureFloatWithContains(feature, []string{"power"}); ok {
|
||||
item.PowerW = &v
|
||||
}
|
||||
@@ -206,12 +197,12 @@ func buildPowerSensor(name string, location *string, feature map[string]any) *sc
|
||||
return item
|
||||
}
|
||||
|
||||
func buildOtherSensor(name string, location *string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||
func buildOtherSensor(name string, feature map[string]any) *schema.HardwareOtherSensor {
|
||||
value, unit, ok := firstGenericSensorValue(feature)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
item := &schema.HardwareOtherSensor{Name: name, Location: location, Value: &value}
|
||||
item := &schema.HardwareOtherSensor{Name: name, Value: &value}
|
||||
if unit != "" {
|
||||
item.Unit = &unit
|
||||
}
|
||||
|
||||
@@ -4,12 +4,70 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
"encoding/json"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
pciRescanPath = "/sys/bus/pci/rescan"
|
||||
scsiHostScanGlob = "/sys/class/scsi_host/host*/scan"
|
||||
hotplugWriteFile = os.WriteFile
|
||||
hotplugExecCommand = exec.Command
|
||||
hotplugGlob = filepath.Glob
|
||||
nvmeLBAFCompactRE = regexp.MustCompile(`(?im)^\s*lbaf\s+\d+\s*:\s*ms:(\d+)\s+lbads:(\d+).*?\(in use\)\s*$`)
|
||||
nvmeLBAFVerboseRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+\d+\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*?\(in use\)\s*$`)
|
||||
sgReadcapBlockRE = regexp.MustCompile(`(?im)logical block length\s*=\s*(\d+)\s+bytes`)
|
||||
sgReadcapProtRE = regexp.MustCompile(`(?im)prot_en\s*=\s*1`)
|
||||
)
|
||||
|
||||
func bestEffortRescanHotplugStorage() {
|
||||
if err := hotplugWriteFile(pciRescanPath, []byte("1\n"), 0644); err != nil {
|
||||
slog.Info("storage: pci rescan skipped", "path", pciRescanPath, "err", err)
|
||||
} else {
|
||||
slog.Info("storage: triggered pci rescan for hotplug discovery")
|
||||
}
|
||||
|
||||
hostPaths, err := hotplugGlob(scsiHostScanGlob)
|
||||
if err != nil {
|
||||
slog.Info("storage: scsi host scan skipped", "pattern", scsiHostScanGlob, "err", err)
|
||||
} else {
|
||||
for _, path := range hostPaths {
|
||||
// SAS HBAs (e.g. smartpqi) block indefinitely in sas_user_scan when
|
||||
// written to — SAS topology is discovered by the driver itself.
|
||||
// Detect via two methods: (1) sas_host class registration, and
|
||||
// (2) driver proc_name — smartpqi uses scsi_transport_sas but does
|
||||
// not register a sas_host object, so (1) alone misses it.
|
||||
host := filepath.Base(filepath.Dir(path))
|
||||
if _, err := os.Stat("/sys/class/sas_host/" + host); err == nil {
|
||||
slog.Info("storage: scsi host scan skipped (SAS host)", "path", path)
|
||||
continue
|
||||
}
|
||||
if procName, err := os.ReadFile("/sys/class/scsi_host/" + host + "/proc_name"); err == nil {
|
||||
switch strings.TrimSpace(string(procName)) {
|
||||
case "smartpqi", "hpsa":
|
||||
slog.Info("storage: scsi host scan skipped (SAS transport driver)",
|
||||
"path", path, "driver", strings.TrimSpace(string(procName)))
|
||||
continue
|
||||
}
|
||||
}
|
||||
if err := hotplugWriteFile(path, []byte("- - -\n"), 0644); err != nil {
|
||||
slog.Info("storage: scsi host scan write failed", "path", path, "err", err)
|
||||
continue
|
||||
}
|
||||
slog.Info("storage: triggered scsi host scan", "path", path)
|
||||
}
|
||||
}
|
||||
|
||||
out, err := hotplugExecCommand("udevadm", "settle", "--timeout=10").CombinedOutput()
|
||||
if err != nil {
|
||||
slog.Info("storage: udev settle after hotplug rescan failed", "err", err, "output", strings.TrimSpace(string(out)))
|
||||
}
|
||||
}
|
||||
|
||||
func collectStorage() []schema.HardwareStorage {
|
||||
devs := discoverStorageDevices()
|
||||
result := make([]schema.HardwareStorage, 0, len(devs))
|
||||
@@ -26,15 +84,41 @@ func collectStorage() []schema.HardwareStorage {
|
||||
return result
|
||||
}
|
||||
|
||||
// jsonInt64 accepts both a bare JSON number and a JSON-quoted number string.
|
||||
// lsblk -J emits LOG-SEC / PHY-SEC as integers on util-linux ≥ 2.37 (Debian 12)
|
||||
// but older versions emit them as strings. This type handles both.
|
||||
type jsonInt64 int64
|
||||
|
||||
func (j *jsonInt64) UnmarshalJSON(data []byte) error {
|
||||
// bare number: 512
|
||||
var n int64
|
||||
if err := json.Unmarshal(data, &n); err == nil {
|
||||
*j = jsonInt64(n)
|
||||
return nil
|
||||
}
|
||||
// quoted string: "512"
|
||||
var s string
|
||||
if err := json.Unmarshal(data, &s); err == nil {
|
||||
n, err := strconv.ParseInt(strings.TrimSpace(s), 10, 64)
|
||||
if err == nil {
|
||||
*j = jsonInt64(n)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
return nil // null or unexpected type — leave zero
|
||||
}
|
||||
|
||||
// lsblkDevice is a minimal lsblk JSON record.
|
||||
type lsblkDevice struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Size string `json:"size"`
|
||||
Serial string `json:"serial"`
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Size string `json:"size"`
|
||||
Serial string `json:"serial"`
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
LogSec jsonInt64 `json:"log-sec"`
|
||||
PhySec jsonInt64 `json:"phy-sec"`
|
||||
}
|
||||
|
||||
type lsblkRoot struct {
|
||||
@@ -101,7 +185,7 @@ func isVirtualHDiskModel(model string) bool {
|
||||
|
||||
func lsblkDevices() []lsblkDevice {
|
||||
out, err := exec.Command("lsblk", "-J", "-d",
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL,LOG-SEC,PHY-SEC").Output()
|
||||
if err != nil {
|
||||
slog.Warn("storage: lsblk failed", "err", err)
|
||||
return nil
|
||||
@@ -208,6 +292,7 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
present := true
|
||||
s := schema.HardwareStorage{Present: &present}
|
||||
s.Telemetry = map[string]any{"linux_device": "/dev/" + dev.Name}
|
||||
applyStorageBlockGeometry(&s, dev)
|
||||
|
||||
tran := strings.ToLower(dev.Tran)
|
||||
devPath := "/dev/" + dev.Name
|
||||
@@ -250,6 +335,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
|
||||
var info smartctlInfo
|
||||
var raw map[string]any
|
||||
_ = json.Unmarshal(out, &raw)
|
||||
if err := json.Unmarshal(out, &info); err == nil {
|
||||
if v := cleanDMIValue(info.ModelName); v != "" {
|
||||
s.Model = &v
|
||||
@@ -302,8 +389,11 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
value := float64(attr.Raw.Value)
|
||||
s.LifeRemainingPct = &value
|
||||
case 241:
|
||||
value := attr.Raw.Value
|
||||
value := smartLBAsToBytes(attr.Raw.Value)
|
||||
s.WrittenBytes = &value
|
||||
case 242:
|
||||
value := smartLBAsToBytes(attr.Raw.Value)
|
||||
s.ReadBytes = &value
|
||||
case 197:
|
||||
pending = attr.Raw.Value
|
||||
s.CurrentPendingSectors = &pending
|
||||
@@ -321,6 +411,8 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
offlineUncorrectable: uncorrectable,
|
||||
lifeRemainingPct: lifeRemaining,
|
||||
}
|
||||
applySCSISmartctlTelemetry(&s, raw, &status)
|
||||
applySCSIProtectionBlockGeometry(&s, devPath)
|
||||
setStorageHealthStatus(&s, status)
|
||||
return s
|
||||
}
|
||||
@@ -332,20 +424,23 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
|
||||
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
||||
// nvme-cli emits most counters as JSON strings (e.g. "power_on_hours":"49"),
|
||||
// so all numeric fields use jsonInt64 which accepts both bare numbers and
|
||||
// quoted strings. Field names match nvme-cli JSON output, not NVMe spec prose.
|
||||
type nvmeSmartLog struct {
|
||||
CriticalWarning int `json:"critical_warning"`
|
||||
PercentageUsed int `json:"percentage_used"`
|
||||
AvailableSpare int `json:"available_spare"`
|
||||
SpareThreshold int `json:"spare_thresh"`
|
||||
Temperature int64 `json:"temperature"`
|
||||
PowerOnHours int64 `json:"power_on_hours"`
|
||||
PowerCycles int64 `json:"power_cycles"`
|
||||
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
|
||||
DataUnitsRead int64 `json:"data_units_read"`
|
||||
DataUnitsWritten int64 `json:"data_units_written"`
|
||||
ControllerBusy int64 `json:"controller_busy_time"`
|
||||
MediaErrors int64 `json:"media_errors"`
|
||||
NumErrLogEntries int64 `json:"num_err_log_entries"`
|
||||
CriticalWarning jsonInt64 `json:"critical_warning"`
|
||||
PercentageUsed jsonInt64 `json:"percent_used"`
|
||||
AvailableSpare jsonInt64 `json:"avail_spare"`
|
||||
SpareThreshold jsonInt64 `json:"spare_thresh"`
|
||||
Temperature jsonInt64 `json:"temperature"`
|
||||
PowerOnHours jsonInt64 `json:"power_on_hours"`
|
||||
PowerCycles jsonInt64 `json:"power_cycles"`
|
||||
UnsafeShutdowns jsonInt64 `json:"unsafe_shutdowns"`
|
||||
DataUnitsRead jsonInt64 `json:"data_units_read"`
|
||||
DataUnitsWritten jsonInt64 `json:"data_units_written"`
|
||||
ControllerBusy jsonInt64 `json:"controller_busy_time"`
|
||||
MediaErrors jsonInt64 `json:"media_errors"`
|
||||
NumErrLogEntries jsonInt64 `json:"num_err_log_entries"`
|
||||
}
|
||||
|
||||
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
||||
@@ -368,6 +463,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
Interface: &iface,
|
||||
Telemetry: map[string]any{"linux_device": "/dev/" + dev.Name},
|
||||
}
|
||||
applyStorageBlockGeometry(&s, dev)
|
||||
|
||||
devPath := "/dev/" + dev.Name
|
||||
if v := cleanDMIValue(strings.TrimSpace(dev.Model)); v != "" {
|
||||
@@ -402,19 +498,23 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
}
|
||||
}
|
||||
}
|
||||
applyNVMeBlockGeometry(&s, devPath)
|
||||
|
||||
// smart-log: wear telemetry
|
||||
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
||||
var log nvmeSmartLog
|
||||
if json.Unmarshal(out, &log) == nil {
|
||||
if log.PowerOnHours > 0 {
|
||||
s.PowerOnHours = &log.PowerOnHours
|
||||
v := int64(log.PowerOnHours)
|
||||
s.PowerOnHours = &v
|
||||
}
|
||||
if log.PowerCycles > 0 {
|
||||
s.PowerCycles = &log.PowerCycles
|
||||
v := int64(log.PowerCycles)
|
||||
s.PowerCycles = &v
|
||||
}
|
||||
if log.UnsafeShutdowns > 0 {
|
||||
s.UnsafeShutdowns = &log.UnsafeShutdowns
|
||||
v := int64(log.UnsafeShutdowns)
|
||||
s.UnsafeShutdowns = &v
|
||||
}
|
||||
if log.PercentageUsed > 0 {
|
||||
v := float64(log.PercentageUsed)
|
||||
@@ -423,11 +523,11 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
s.LifeRemainingPct = &remaining
|
||||
}
|
||||
if log.DataUnitsWritten > 0 {
|
||||
v := nvmeDataUnitsToBytes(log.DataUnitsWritten)
|
||||
v := nvmeDataUnitsToBytes(int64(log.DataUnitsWritten))
|
||||
s.WrittenBytes = &v
|
||||
}
|
||||
if log.DataUnitsRead > 0 {
|
||||
v := nvmeDataUnitsToBytes(log.DataUnitsRead)
|
||||
v := nvmeDataUnitsToBytes(int64(log.DataUnitsRead))
|
||||
s.ReadBytes = &v
|
||||
}
|
||||
if log.AvailableSpare > 0 {
|
||||
@@ -435,23 +535,25 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
s.AvailableSparePct = &v
|
||||
}
|
||||
if log.MediaErrors > 0 {
|
||||
s.MediaErrors = &log.MediaErrors
|
||||
v := int64(log.MediaErrors)
|
||||
s.MediaErrors = &v
|
||||
}
|
||||
if log.NumErrLogEntries > 0 {
|
||||
s.ErrorLogEntries = &log.NumErrLogEntries
|
||||
v := int64(log.NumErrLogEntries)
|
||||
s.ErrorLogEntries = &v
|
||||
}
|
||||
if log.Temperature > 0 {
|
||||
v := float64(log.Temperature - 273)
|
||||
s.TemperatureC = &v
|
||||
}
|
||||
setStorageHealthStatus(&s, storageHealthStatus{
|
||||
criticalWarning: log.CriticalWarning,
|
||||
criticalWarning: int(log.CriticalWarning),
|
||||
percentageUsed: int64(log.PercentageUsed),
|
||||
availableSpare: int64(log.AvailableSpare),
|
||||
spareThreshold: int64(log.SpareThreshold),
|
||||
unsafeShutdowns: log.UnsafeShutdowns,
|
||||
mediaErrors: log.MediaErrors,
|
||||
errorLogEntries: log.NumErrLogEntries,
|
||||
unsafeShutdowns: int64(log.UnsafeShutdowns),
|
||||
mediaErrors: int64(log.MediaErrors),
|
||||
errorLogEntries: int64(log.NumErrLogEntries),
|
||||
})
|
||||
return s
|
||||
}
|
||||
@@ -477,6 +579,251 @@ func nvmeDataUnitsToBytes(units int64) int64 {
|
||||
return units * 512000
|
||||
}
|
||||
|
||||
func smartLBAsToBytes(lbas int64) int64 {
|
||||
if lbas <= 0 {
|
||||
return 0
|
||||
}
|
||||
return lbas * 512
|
||||
}
|
||||
|
||||
func applySCSISmartctlTelemetry(s *schema.HardwareStorage, raw map[string]any, status *storageHealthStatus) {
|
||||
if s == nil || len(raw) == 0 {
|
||||
return
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:power_on_time.hours",
|
||||
"path:accumulated_power_on_time.hours",
|
||||
"path:power_on_time.hour",
|
||||
"path:accumulated_power_on_time.hour",
|
||||
); ok && v > 0 && s.PowerOnHours == nil {
|
||||
s.PowerOnHours = &v
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:power_cycle_count",
|
||||
"path:start_stop_cycle_count",
|
||||
"path:accumulated_start_stop_cycles",
|
||||
); ok && v > 0 && s.PowerCycles == nil {
|
||||
s.PowerCycles = &v
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:scsi_grown_defect_list",
|
||||
"path:grown_defect_list",
|
||||
); ok && v > 0 && s.ReallocatedSectors == nil {
|
||||
s.ReallocatedSectors = &v
|
||||
if status != nil && status.reallocatedSectors == 0 {
|
||||
status.reallocatedSectors = v
|
||||
}
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:percentage_used_endurance_indicator",
|
||||
"path:scsi_percentage_used_endurance_indicator",
|
||||
); ok && v > 0 {
|
||||
if s.LifeUsedPct == nil {
|
||||
fv := float64(v)
|
||||
s.LifeUsedPct = &fv
|
||||
}
|
||||
if s.LifeRemainingPct == nil && v <= 100 {
|
||||
remaining := float64(100 - v)
|
||||
s.LifeRemainingPct = &remaining
|
||||
if status != nil && status.lifeRemainingPct == 0 {
|
||||
status.lifeRemainingPct = int64(remaining)
|
||||
}
|
||||
}
|
||||
}
|
||||
blockSize, hasBlockSize := firstInt64(raw,
|
||||
"path:logical_block_size",
|
||||
"path:block_size",
|
||||
"path:user_capacity.block_size",
|
||||
)
|
||||
if hasBlockSize && blockSize > 0 {
|
||||
if s.LogicalBlockSizeBytes == nil {
|
||||
s.LogicalBlockSizeBytes = &blockSize
|
||||
}
|
||||
if s.MetadataBytesPerBlock == nil {
|
||||
zero := int64(0)
|
||||
s.MetadataBytesPerBlock = &zero
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
s.Telemetry["logical_block_size_bytes"] = *s.LogicalBlockSizeBytes
|
||||
s.Telemetry["metadata_bytes_per_block"] = *s.MetadataBytesPerBlock
|
||||
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:logical_blocks_written",
|
||||
"path:total_lbas_written",
|
||||
); ok && v > 0 && s.WrittenBytes == nil {
|
||||
bytes := v * blockSize
|
||||
s.WrittenBytes = &bytes
|
||||
}
|
||||
if v, ok := firstInt64(raw,
|
||||
"path:logical_blocks_read",
|
||||
"path:total_lbas_read",
|
||||
); ok && v > 0 && s.ReadBytes == nil {
|
||||
bytes := v * blockSize
|
||||
s.ReadBytes = &bytes
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func applyStorageBlockGeometry(s *schema.HardwareStorage, dev lsblkDevice) {
|
||||
if s == nil {
|
||||
return
|
||||
}
|
||||
logical := int64(dev.LogSec)
|
||||
physical := int64(dev.PhySec)
|
||||
if logical <= 0 && physical <= 0 {
|
||||
return
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
if logical > 0 {
|
||||
s.LogicalBlockSizeBytes = &logical
|
||||
s.Telemetry["logical_block_size_bytes"] = logical
|
||||
if s.MetadataBytesPerBlock == nil {
|
||||
zero := int64(0)
|
||||
s.MetadataBytesPerBlock = &zero
|
||||
s.Telemetry["metadata_bytes_per_block"] = zero
|
||||
}
|
||||
}
|
||||
if physical > 0 {
|
||||
s.PhysicalBlockSizeBytes = &physical
|
||||
s.Telemetry["physical_block_size_bytes"] = physical
|
||||
}
|
||||
if s.LogicalBlockSizeBytes != nil && s.MetadataBytesPerBlock != nil {
|
||||
s.Telemetry["block_format"] = formatBlockFormat(*s.LogicalBlockSizeBytes, *s.MetadataBytesPerBlock)
|
||||
}
|
||||
}
|
||||
|
||||
func applyNVMeBlockGeometry(s *schema.HardwareStorage, devPath string) {
|
||||
if s == nil || strings.TrimSpace(devPath) == "" {
|
||||
return
|
||||
}
|
||||
out, err := exec.Command("nvme", "id-ns", devPath, "-H").CombinedOutput()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(string(out))
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
setStorageBlockGeometry(s, dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
func applySCSIProtectionBlockGeometry(s *schema.HardwareStorage, devPath string) {
|
||||
if s == nil || strings.TrimSpace(devPath) == "" {
|
||||
return
|
||||
}
|
||||
out, err := exec.Command("sg_readcap", "-l", devPath).CombinedOutput()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(string(out))
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
setStorageBlockGeometry(s, dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
func setStorageBlockGeometry(s *schema.HardwareStorage, dataBytes, metadataBytes int64) {
|
||||
if s == nil || dataBytes <= 0 || metadataBytes < 0 {
|
||||
return
|
||||
}
|
||||
if s.Telemetry == nil {
|
||||
s.Telemetry = map[string]any{}
|
||||
}
|
||||
s.LogicalBlockSizeBytes = &dataBytes
|
||||
s.MetadataBytesPerBlock = &metadataBytes
|
||||
s.Telemetry["logical_block_size_bytes"] = dataBytes
|
||||
s.Telemetry["metadata_bytes_per_block"] = metadataBytes
|
||||
s.Telemetry["block_format"] = formatBlockFormat(dataBytes, metadataBytes)
|
||||
}
|
||||
|
||||
func formatBlockFormat(dataBytes, metadataBytes int64) string {
|
||||
return strconv.FormatInt(dataBytes, 10) + "+" + strconv.FormatInt(metadataBytes, 10)
|
||||
}
|
||||
|
||||
func parseNVMeBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
|
||||
if m := nvmeLBAFCompactRE.FindStringSubmatch(raw); len(m) == 3 {
|
||||
ms, errMS := strconv.ParseInt(m[1], 10, 64)
|
||||
lbads, errLBADS := strconv.ParseInt(m[2], 10, 64)
|
||||
if errMS == nil && errLBADS == nil && lbads >= 0 && lbads < 63 {
|
||||
return 1 << lbads, ms, true
|
||||
}
|
||||
}
|
||||
if m := nvmeLBAFVerboseRE.FindStringSubmatch(raw); len(m) == 3 {
|
||||
ms, errMS := strconv.ParseInt(m[1], 10, 64)
|
||||
ds, errDS := strconv.ParseInt(m[2], 10, 64)
|
||||
if errMS == nil && errDS == nil && ds > 0 {
|
||||
return ds, ms, true
|
||||
}
|
||||
}
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
func parseSCSIBlockFormat(raw string) (dataBytes, metadataBytes int64, ok bool) {
|
||||
m := sgReadcapBlockRE.FindStringSubmatch(raw)
|
||||
if len(m) != 2 {
|
||||
return 0, 0, false
|
||||
}
|
||||
blockBytes, err := strconv.ParseInt(m[1], 10, 64)
|
||||
if err != nil || blockBytes <= 0 {
|
||||
return 0, 0, false
|
||||
}
|
||||
if sgReadcapProtRE.MatchString(raw) {
|
||||
return blockBytes, 8, true
|
||||
}
|
||||
return blockBytes, 0, true
|
||||
}
|
||||
|
||||
func firstInt64(root map[string]any, candidates ...string) (int64, bool) {
|
||||
for _, candidate := range candidates {
|
||||
if !strings.HasPrefix(candidate, "path:") {
|
||||
continue
|
||||
}
|
||||
path := strings.TrimPrefix(candidate, "path:")
|
||||
if v, ok := nestedInt64(root, strings.Split(path, ".")); ok {
|
||||
return v, true
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
func nestedInt64(root map[string]any, path []string) (int64, bool) {
|
||||
var current any = root
|
||||
for _, key := range path {
|
||||
obj, ok := current.(map[string]any)
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
current, ok = obj[key]
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
switch v := current.(type) {
|
||||
case float64:
|
||||
return int64(v), true
|
||||
case float32:
|
||||
return int64(v), true
|
||||
case int:
|
||||
return int64(v), true
|
||||
case int64:
|
||||
return v, true
|
||||
case int32:
|
||||
return int64(v), true
|
||||
case json.Number:
|
||||
n, err := v.Int64()
|
||||
return n, err == nil
|
||||
case string:
|
||||
n, err := strconv.ParseInt(strings.TrimSpace(v), 10, 64)
|
||||
return n, err == nil
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
|
||||
type storageHealthStatus struct {
|
||||
hasOverall bool
|
||||
overallPassed bool
|
||||
|
||||
69
audit/internal/collector/storage_block_format_test.go
Normal file
69
audit/internal/collector/storage_block_format_test.go
Normal file
@@ -0,0 +1,69 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseNVMeBlockFormatCompact(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseNVMeBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 0 {
|
||||
t.Fatalf("got %d+%d want 512+0", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVMeBlockFormatVerbose(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
LBA Format 0 : Metadata Size: 8 bytes - Data Size: 512 bytes - Relative Performance: 0 Better (in use)
|
||||
LBA Format 1 : Metadata Size: 0 bytes - Data Size: 4096 bytes - Relative Performance: 1 Best
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseNVMeBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseNVMeBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 8 {
|
||||
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSCSIBlockFormatWithProtection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
Read Capacity results:
|
||||
Protection: prot_en=1, p_type=1, p_i_exponent=0
|
||||
Logical block length=512 bytes
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseSCSIBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 512 || metadataBytes != 8 {
|
||||
t.Fatalf("got %d+%d want 512+8", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSCSIBlockFormatWithoutProtection(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := `
|
||||
Read Capacity results:
|
||||
Protection: prot_en=0, p_type=0, p_i_exponent=0
|
||||
Logical block length=4096 bytes
|
||||
`
|
||||
dataBytes, metadataBytes, ok := parseSCSIBlockFormat(raw)
|
||||
if !ok {
|
||||
t.Fatal("parseSCSIBlockFormat returned ok=false")
|
||||
}
|
||||
if dataBytes != 4096 || metadataBytes != 0 {
|
||||
t.Fatalf("got %d+%d want 4096+0", dataBytes, metadataBytes)
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,13 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestMergeStorageDevicePrefersNonEmptyFields(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -31,3 +38,130 @@ func TestParseStorageBytes(t *testing.T) {
|
||||
t.Fatalf("parseStorageBytes invalid=%d want 0", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// util-linux ≥ 2.37 emits LOG-SEC / PHY-SEC as bare JSON numbers.
|
||||
// Older versions emit quoted strings. Both must parse without error
|
||||
// so that the entire lsblkDevices() call does not return nil on Debian 12.
|
||||
cases := []struct {
|
||||
json string
|
||||
want int64
|
||||
}{
|
||||
{`512`, 512},
|
||||
{`4096`, 4096},
|
||||
{`"512"`, 512},
|
||||
{`"4096"`, 4096},
|
||||
{`null`, 0},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
var v jsonInt64
|
||||
if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
|
||||
t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
|
||||
}
|
||||
if int64(v) != tc.want {
|
||||
t.Fatalf("UnmarshalJSON(%s)=%d want %d", tc.json, int64(v), tc.want)
|
||||
}
|
||||
}
|
||||
|
||||
// Simulate the exact JSON shape that triggered the bug on Debian 12.
|
||||
input := []byte(`{
|
||||
"blockdevices": [
|
||||
{"name":"sda","type":"disk","size":"3.6T","serial":"S1234","model":"SEAGATE","tran":"sata","hctl":"0:0:0:0","log-sec":512,"phy-sec":4096},
|
||||
{"name":"sdb","type":"disk","size":"3.6T","serial":"S5678","model":"SEAGATE","tran":"sata","hctl":"0:0:1:0","log-sec":512,"phy-sec":4096}
|
||||
]
|
||||
}`)
|
||||
var root lsblkRoot
|
||||
if err := json.Unmarshal(input, &root); err != nil {
|
||||
t.Fatalf("lsblkRoot unmarshal with integer log-sec/phy-sec: %v", err)
|
||||
}
|
||||
if len(root.Blockdevices) != 2 {
|
||||
t.Fatalf("got %d blockdevices want 2", len(root.Blockdevices))
|
||||
}
|
||||
if int64(root.Blockdevices[0].LogSec) != 512 {
|
||||
t.Fatalf("LogSec=%d want 512", root.Blockdevices[0].LogSec)
|
||||
}
|
||||
if int64(root.Blockdevices[0].PhySec) != 4096 {
|
||||
t.Fatalf("PhySec=%d want 4096", root.Blockdevices[0].PhySec)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBestEffortRescanHotplugStorage(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tmp := t.TempDir()
|
||||
rescanPath := filepath.Join(tmp, "pci-rescan")
|
||||
scanDir := filepath.Join(tmp, "scsi_host")
|
||||
host0Path := filepath.Join(scanDir, "host0", "scan")
|
||||
host1Path := filepath.Join(scanDir, "host1", "scan")
|
||||
argsPath := filepath.Join(tmp, "udevadm-args")
|
||||
toolPath := filepath.Join(tmp, "udevadm")
|
||||
if err := os.MkdirAll(filepath.Dir(host0Path), 0755); err != nil {
|
||||
t.Fatalf("mkdir host0: %v", err)
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(host1Path), 0755); err != nil {
|
||||
t.Fatalf("mkdir host1: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(host0Path, nil, 0644); err != nil {
|
||||
t.Fatalf("touch host0 scan: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(host1Path, nil, 0644); err != nil {
|
||||
t.Fatalf("touch host1 scan: %v", err)
|
||||
}
|
||||
script := "#!/bin/sh\nprintf '%s' \"$*\" > \"" + argsPath + "\"\n"
|
||||
if err := os.WriteFile(toolPath, []byte(script), 0755); err != nil {
|
||||
t.Fatalf("write udevadm stub: %v", err)
|
||||
}
|
||||
|
||||
oldPath := os.Getenv("PATH")
|
||||
if err := os.Setenv("PATH", tmp+string(os.PathListSeparator)+oldPath); err != nil {
|
||||
t.Fatalf("set PATH: %v", err)
|
||||
}
|
||||
defer func() { _ = os.Setenv("PATH", oldPath) }()
|
||||
|
||||
oldRescanPath := pciRescanPath
|
||||
oldSCSIGlob := scsiHostScanGlob
|
||||
oldWriteFile := hotplugWriteFile
|
||||
oldExecCommand := hotplugExecCommand
|
||||
oldGlob := hotplugGlob
|
||||
pciRescanPath = rescanPath
|
||||
scsiHostScanGlob = filepath.Join(scanDir, "host*", "scan")
|
||||
hotplugWriteFile = os.WriteFile
|
||||
hotplugExecCommand = exec.Command
|
||||
hotplugGlob = filepath.Glob
|
||||
defer func() {
|
||||
pciRescanPath = oldRescanPath
|
||||
scsiHostScanGlob = oldSCSIGlob
|
||||
hotplugWriteFile = oldWriteFile
|
||||
hotplugExecCommand = oldExecCommand
|
||||
hotplugGlob = oldGlob
|
||||
}()
|
||||
|
||||
bestEffortRescanHotplugStorage()
|
||||
|
||||
raw, err := os.ReadFile(rescanPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read rescan file: %v", err)
|
||||
}
|
||||
if string(raw) != "1\n" {
|
||||
t.Fatalf("rescan payload=%q want %q", string(raw), "1\n")
|
||||
}
|
||||
for _, path := range []string{host0Path, host1Path} {
|
||||
raw, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("read scsi scan file %s: %v", path, err)
|
||||
}
|
||||
if string(raw) != "- - -\n" {
|
||||
t.Fatalf("scsi scan payload at %s =%q want %q", path, string(raw), "- - -\n")
|
||||
}
|
||||
}
|
||||
|
||||
args, err := os.ReadFile(argsPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read udevadm args: %v", err)
|
||||
}
|
||||
if got := strings.TrimSpace(string(args)); got != "settle --timeout=10" {
|
||||
t.Fatalf("udevadm args=%q want %q", got, "settle --timeout=10")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,11 +1,65 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most
|
||||
// counters are quoted strings and field names differ from NVMe spec prose)
|
||||
// is correctly parsed into nvmeSmartLog.
|
||||
func TestNVMeSmartLogUnmarshal(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
// Real nvme-cli output: counters are JSON strings, spare is "avail_spare",
|
||||
// percentage used is "percent_used".
|
||||
raw := `{
|
||||
"critical_warning": 0,
|
||||
"temperature": 310,
|
||||
"avail_spare": 100,
|
||||
"spare_thresh": 5,
|
||||
"percent_used": 0,
|
||||
"data_units_read": "10925415",
|
||||
"data_units_written": "8497672",
|
||||
"controller_busy_time": "305",
|
||||
"power_cycles": "53",
|
||||
"power_on_hours": "49",
|
||||
"unsafe_shutdowns": "22",
|
||||
"media_errors": "0",
|
||||
"num_err_log_entries": "0"
|
||||
}`
|
||||
var log nvmeSmartLog
|
||||
if err := json.Unmarshal([]byte(raw), &log); err != nil {
|
||||
t.Fatalf("json.Unmarshal failed: %v", err)
|
||||
}
|
||||
if log.PowerOnHours != 49 {
|
||||
t.Errorf("PowerOnHours=%d want 49", log.PowerOnHours)
|
||||
}
|
||||
if log.PowerCycles != 53 {
|
||||
t.Errorf("PowerCycles=%d want 53", log.PowerCycles)
|
||||
}
|
||||
if log.AvailableSpare != 100 {
|
||||
t.Errorf("AvailableSpare=%d want 100", log.AvailableSpare)
|
||||
}
|
||||
if log.SpareThreshold != 5 {
|
||||
t.Errorf("SpareThreshold=%d want 5", log.SpareThreshold)
|
||||
}
|
||||
if log.PercentageUsed != 0 {
|
||||
t.Errorf("PercentageUsed=%d want 0", log.PercentageUsed)
|
||||
}
|
||||
if log.Temperature != 310 {
|
||||
t.Errorf("Temperature=%d want 310", log.Temperature)
|
||||
}
|
||||
if log.MediaErrors != 0 {
|
||||
t.Errorf("MediaErrors=%d want 0", log.MediaErrors)
|
||||
}
|
||||
if log.UnsafeShutdowns != 22 {
|
||||
t.Errorf("UnsafeShutdowns=%d want 22", log.UnsafeShutdowns)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSetStorageHealthStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
101
audit/internal/collector/storage_scsi_test.go
Normal file
101
audit/internal/collector/storage_scsi_test.go
Normal file
@@ -0,0 +1,101 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
func TestApplySCSISmartctlTelemetry(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
raw := map[string]any{
|
||||
"power_on_time": map[string]any{
|
||||
"hours": float64(32123),
|
||||
},
|
||||
"accumulated_start_stop_cycles": float64(17),
|
||||
"scsi_grown_defect_list": float64(4),
|
||||
"percentage_used_endurance_indicator": float64(12),
|
||||
"logical_block_size": float64(4096),
|
||||
"logical_blocks_written": float64(1000),
|
||||
"logical_blocks_read": float64(2000),
|
||||
}
|
||||
|
||||
var disk schema.HardwareStorage
|
||||
status := storageHealthStatus{}
|
||||
applySCSISmartctlTelemetry(&disk, raw, &status)
|
||||
|
||||
if disk.PowerOnHours == nil || *disk.PowerOnHours != 32123 {
|
||||
t.Fatalf("power_on_hours=%v want 32123", disk.PowerOnHours)
|
||||
}
|
||||
if disk.PowerCycles == nil || *disk.PowerCycles != 17 {
|
||||
t.Fatalf("power_cycles=%v want 17", disk.PowerCycles)
|
||||
}
|
||||
if disk.ReallocatedSectors == nil || *disk.ReallocatedSectors != 4 {
|
||||
t.Fatalf("reallocated=%v want 4", disk.ReallocatedSectors)
|
||||
}
|
||||
if disk.WrittenBytes == nil || *disk.WrittenBytes != 4096000 {
|
||||
t.Fatalf("written_bytes=%v want 4096000", disk.WrittenBytes)
|
||||
}
|
||||
if disk.ReadBytes == nil || *disk.ReadBytes != 8192000 {
|
||||
t.Fatalf("read_bytes=%v want 8192000", disk.ReadBytes)
|
||||
}
|
||||
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 4096 {
|
||||
t.Fatalf("logical_block_size_bytes=%v want 4096", disk.LogicalBlockSizeBytes)
|
||||
}
|
||||
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
|
||||
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
|
||||
}
|
||||
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 12 {
|
||||
t.Fatalf("life_used_pct=%v want 12", disk.LifeUsedPct)
|
||||
}
|
||||
if disk.LifeRemainingPct == nil || *disk.LifeRemainingPct != 88 {
|
||||
t.Fatalf("life_remaining_pct=%v want 88", disk.LifeRemainingPct)
|
||||
}
|
||||
if status.reallocatedSectors != 4 {
|
||||
t.Fatalf("status.reallocated=%d want 4", status.reallocatedSectors)
|
||||
}
|
||||
if status.lifeRemainingPct != 88 {
|
||||
t.Fatalf("status.life_remaining_pct=%d want 88", status.lifeRemainingPct)
|
||||
}
|
||||
}
|
||||
|
||||
func TestApplySCSISmartctlTelemetryDoesNotOverwriteExistingValues(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
powerOnHours := int64(10)
|
||||
writtenBytes := int64(20)
|
||||
lifeRemaining := 30.0
|
||||
disk := schema.HardwareStorage{
|
||||
PowerOnHours: &powerOnHours,
|
||||
WrittenBytes: &writtenBytes,
|
||||
LifeRemainingPct: &lifeRemaining,
|
||||
}
|
||||
raw := map[string]any{
|
||||
"power_on_time": map[string]any{"hours": float64(999)},
|
||||
"logical_block_size": float64(512),
|
||||
"logical_blocks_written": float64(999),
|
||||
"percentage_used_endurance_indicator": float64(50),
|
||||
}
|
||||
|
||||
applySCSISmartctlTelemetry(&disk, raw, nil)
|
||||
|
||||
if *disk.PowerOnHours != 10 {
|
||||
t.Fatalf("power_on_hours overwritten: got %d want 10", *disk.PowerOnHours)
|
||||
}
|
||||
if *disk.WrittenBytes != 20 {
|
||||
t.Fatalf("written_bytes overwritten: got %d want 20", *disk.WrittenBytes)
|
||||
}
|
||||
if disk.LogicalBlockSizeBytes == nil || *disk.LogicalBlockSizeBytes != 512 {
|
||||
t.Fatalf("logical_block_size_bytes=%v want 512", disk.LogicalBlockSizeBytes)
|
||||
}
|
||||
if disk.MetadataBytesPerBlock == nil || *disk.MetadataBytesPerBlock != 0 {
|
||||
t.Fatalf("metadata_bytes_per_block=%v want 0", disk.MetadataBytesPerBlock)
|
||||
}
|
||||
if *disk.LifeRemainingPct != 30 {
|
||||
t.Fatalf("life_remaining_pct overwritten: got %v want 30", *disk.LifeRemainingPct)
|
||||
}
|
||||
if disk.LifeUsedPct == nil || *disk.LifeUsedPct != 50 {
|
||||
t.Fatalf("life_used_pct=%v want 50", disk.LifeUsedPct)
|
||||
}
|
||||
}
|
||||
25
audit/internal/collector/storage_telemetry_test.go
Normal file
25
audit/internal/collector/storage_telemetry_test.go
Normal file
@@ -0,0 +1,25 @@
|
||||
package collector
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestSmartLBAsToBytes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
lbas int64
|
||||
want int64
|
||||
}{
|
||||
{name: "zero", lbas: 0, want: 0},
|
||||
{name: "single lba", lbas: 1, want: 512},
|
||||
{name: "multiple lbas", lbas: 2048, want: 1048576},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := smartLBAsToBytes(tt.lbas); got != tt.want {
|
||||
t.Fatalf("smartLBAsToBytes(%d)=%d want %d", tt.lbas, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
27
audit/internal/collector/testdata/dmidecode_type0_dell.txt
vendored
Normal file
27
audit/internal/collector/testdata/dmidecode_type0_dell.txt
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
# dmidecode 3.2
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x0000, DMI type 0, 26 bytes
|
||||
BIOS Information
|
||||
Vendor: Dell Inc.
|
||||
Version: 2.5.4
|
||||
Release Date: 01/13/2020
|
||||
Address: 0xF0000
|
||||
Runtime Size: 64 kB
|
||||
ROM Size: 32 MB
|
||||
Characteristics:
|
||||
ISA is supported
|
||||
PCI is supported
|
||||
PNP is supported
|
||||
BIOS is upgradeable
|
||||
BIOS shadowing is allowed
|
||||
Boot from CD is supported
|
||||
Selectable boot is supported
|
||||
EDD is supported
|
||||
ACPI is supported
|
||||
USB legacy is supported
|
||||
BIOS boot specification is supported
|
||||
Targeted content distribution is supported
|
||||
UEFI is supported
|
||||
BIOS Revision: 2.5
|
||||
59
audit/internal/collector/testdata/dmidecode_type17_mixed.txt
vendored
Normal file
59
audit/internal/collector/testdata/dmidecode_type17_mixed.txt
vendored
Normal file
@@ -0,0 +1,59 @@
|
||||
# dmidecode 3.1
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 2.8 present.
|
||||
|
||||
Handle 0x0026, DMI type 17, 40 bytes
|
||||
Memory Device
|
||||
Array Handle: 0x0025
|
||||
Error Information Handle: Not Provided
|
||||
Total Width: 72 bits
|
||||
Data Width: 64 bits
|
||||
Size: 16 GB
|
||||
Form Factor: DIMM
|
||||
Set: None
|
||||
Locator: P1-DIMMA1
|
||||
Bank Locator: P0_Node0_Channel0_Dimm0
|
||||
Type: DDR4
|
||||
Type Detail: Synchronous
|
||||
Speed: 2133 MT/s
|
||||
Manufacturer: Micron
|
||||
Serial Number: 1A2B3C4D
|
||||
Asset Tag: Not Specified
|
||||
Part Number: 36ASF2G72PZ-2G1A2
|
||||
Rank: 2
|
||||
Configured Memory Speed: 2133 MT/s
|
||||
|
||||
Handle 0x0027, DMI type 17, 40 bytes
|
||||
Memory Device
|
||||
Array Handle: 0x0025
|
||||
Error Information Handle: Not Provided
|
||||
Total Width: Unknown
|
||||
Data Width: Unknown
|
||||
Size: No Module Installed
|
||||
Form Factor: DIMM
|
||||
Set: None
|
||||
Locator: P1-DIMMA2
|
||||
Bank Locator: P0_Node0_Channel0_Dimm1
|
||||
Type: DDR4
|
||||
Type Detail: Synchronous
|
||||
|
||||
Handle 0x0028, DMI type 17, 84 bytes
|
||||
Memory Device
|
||||
Array Handle: 0x0025
|
||||
Error Information Handle: Not Provided
|
||||
Total Width: 72 bits
|
||||
Data Width: 64 bits
|
||||
Size: 32768 MB
|
||||
Form Factor: DIMM
|
||||
Set: 1
|
||||
Locator: A1
|
||||
Bank Locator: Not Specified
|
||||
Type: DDR4
|
||||
Type Detail: Synchronous Registered (Buffered)
|
||||
Speed: 2933 MT/s
|
||||
Manufacturer: Samsung
|
||||
Serial Number: 5E6F7A8B
|
||||
Asset Tag: Not Specified
|
||||
Part Number: M393A4K40CB2-CVF
|
||||
Rank: 2
|
||||
Configured Memory Speed: 2400 MT/s
|
||||
14
audit/internal/collector/testdata/dmidecode_type1_dell.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_dell.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
# dmidecode 3.2
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x0100, DMI type 1, 27 bytes
|
||||
System Information
|
||||
Manufacturer: Dell Inc.
|
||||
Product Name: PowerEdge R740xd
|
||||
Version: Not Specified
|
||||
Serial Number: 7SG9F63
|
||||
UUID: b1c2d3e4-f5a6-7890-bcde-f12345678901
|
||||
Wake-up Type: Power Switch
|
||||
SKU Number: SKU=NotProvided;ModelName=PowerEdge R740xd
|
||||
Family: PowerEdge
|
||||
14
audit/internal/collector/testdata/dmidecode_type1_hpe.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_hpe.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
# dmidecode 3.3
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x008E, DMI type 1, 27 bytes
|
||||
System Information
|
||||
Manufacturer: HPE
|
||||
Product Name: ProLiant DL380 Gen10
|
||||
Version: Not Specified
|
||||
Serial Number: CZJ9320CXN
|
||||
UUID: c2d3e4f5-a6b7-8901-cdef-012345678902
|
||||
Wake-up Type: Power Switch
|
||||
SKU Number: 868703-B21
|
||||
Family: ProLiant
|
||||
14
audit/internal/collector/testdata/dmidecode_type1_supermicro.txt
vendored
Normal file
14
audit/internal/collector/testdata/dmidecode_type1_supermicro.txt
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
# dmidecode 3.1
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 2.8 present.
|
||||
|
||||
Handle 0x0001, DMI type 1, 27 bytes
|
||||
System Information
|
||||
Manufacturer: Supermicro
|
||||
Product Name: SYS-6028R-WTR
|
||||
Version: 0123456789
|
||||
Serial Number: S214726X2A36789
|
||||
UUID: d3e4f5a6-b7c8-9012-def0-123456789003
|
||||
Wake-up Type: Power Switch
|
||||
SKU Number: Default string
|
||||
Family: Default string
|
||||
10
audit/internal/collector/testdata/dmidecode_type2_dell.txt
vendored
Normal file
10
audit/internal/collector/testdata/dmidecode_type2_dell.txt
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
# dmidecode 3.2
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x0200, DMI type 2, 8 bytes
|
||||
Base Board Information
|
||||
Manufacturer: Dell Inc.
|
||||
Product Name: 0F9N89
|
||||
Version: A00
|
||||
Serial Number: 7SG9F63
|
||||
19
audit/internal/collector/testdata/dmidecode_type2_hpe.txt
vendored
Normal file
19
audit/internal/collector/testdata/dmidecode_type2_hpe.txt
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
# dmidecode 3.3
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 3.1.0 present.
|
||||
|
||||
Handle 0x00A4, DMI type 2, 15 bytes
|
||||
Base Board Information
|
||||
Manufacturer: HPE
|
||||
Product Name: ProLiant DL380 Gen10
|
||||
Version: Not Specified
|
||||
Serial Number: CZJ9320CXN
|
||||
Asset Tag: CZJ9320CXN
|
||||
Features:
|
||||
Board is a hosting board
|
||||
Board is removable
|
||||
Board is replaceable
|
||||
Location In Chassis: Not Specified
|
||||
Chassis Handle: 0x0000
|
||||
Type: Motherboard
|
||||
Contained Object Handles: 0
|
||||
18
audit/internal/collector/testdata/dmidecode_type2_supermicro.txt
vendored
Normal file
18
audit/internal/collector/testdata/dmidecode_type2_supermicro.txt
vendored
Normal file
@@ -0,0 +1,18 @@
|
||||
# dmidecode 3.1
|
||||
Getting SMBIOS data from sysfs.
|
||||
SMBIOS 2.8 present.
|
||||
|
||||
Handle 0x0002, DMI type 2, 15 bytes
|
||||
Base Board Information
|
||||
Manufacturer: Supermicro
|
||||
Product Name: X10DRW-i
|
||||
Version: 1.02
|
||||
Serial Number: S214726X2A36789
|
||||
Asset Tag: Default string
|
||||
Features:
|
||||
Board is a hosting board
|
||||
Board is replaceable
|
||||
Location In Chassis: Default string
|
||||
Chassis Handle: 0x0003
|
||||
Type: Motherboard
|
||||
Contained Object Handles: 0
|
||||
@@ -28,6 +28,35 @@ md125 : active raid1 nvme2n1[0] nvme3n1[1]
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseMDAdmPlatformLicense(t *testing.T) {
|
||||
premium := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.3.0.1138
|
||||
RAID Levels : raid0 raid1 raid5 raid10
|
||||
Total Disks : 4
|
||||
License : Premium
|
||||
`
|
||||
got := parseMDAdmPlatformLicense(premium)
|
||||
if got == nil || *got != "premium" {
|
||||
t.Fatalf("expected 'premium', got %v", got)
|
||||
}
|
||||
|
||||
standard := `Platform : Intel(R) Virtual RAID on CPU
|
||||
License : Standard
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(standard)
|
||||
if got == nil || *got != "standard" {
|
||||
t.Fatalf("expected 'standard', got %v", got)
|
||||
}
|
||||
|
||||
noLicense := `Platform : Intel(R) Virtual RAID on CPU
|
||||
Version : 1.0.0
|
||||
`
|
||||
got = parseMDAdmPlatformLicense(noLicense)
|
||||
if got != nil {
|
||||
t.Fatalf("expected nil, got %v", *got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHasVROCController(t *testing.T) {
|
||||
intel := vendorIntel
|
||||
model := "Volume Management Device NVMe RAID Controller"
|
||||
|
||||
@@ -105,6 +105,7 @@ var (
|
||||
benchmarkSkippedPattern = regexp.MustCompile(`^([a-z0-9_]+)(?:\[\d+\])?=SKIPPED (.+)$`)
|
||||
benchmarkIterationsPattern = regexp.MustCompile(`^([a-z0-9_]+)_iterations=(\d+)$`)
|
||||
benchmarkGeteuid = os.Geteuid
|
||||
benchmarkResetNvidiaGPU = resetNvidiaGPU
|
||||
benchmarkSleep = time.Sleep
|
||||
)
|
||||
|
||||
@@ -249,6 +250,35 @@ func setBenchmarkPowerLimit(ctx context.Context, verboseLog string, gpuIndex, po
|
||||
return nil
|
||||
}
|
||||
|
||||
func resetBenchmarkGPU(ctx context.Context, verboseLog string, gpuIndex int, logFunc func(string)) error {
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset via shared NVIDIA recover path", gpuIndex))
|
||||
}
|
||||
out, err := benchmarkResetNvidiaGPU(gpuIndex)
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] start power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||
"cmd: bee-nvidia-recover reset-gpu "+strconv.Itoa(gpuIndex),
|
||||
)
|
||||
if trimmed := strings.TrimSpace(out); trimmed != "" && logFunc != nil {
|
||||
for _, line := range strings.Split(trimmed, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line != "" {
|
||||
logFunc(line)
|
||||
}
|
||||
}
|
||||
}
|
||||
rc := 0
|
||||
if err != nil {
|
||||
rc = 1
|
||||
}
|
||||
appendSATVerboseLog(verboseLog,
|
||||
fmt.Sprintf("[%s] finish power-preflight-gpu-%d-reset.log", time.Now().UTC().Format(time.RFC3339), gpuIndex),
|
||||
fmt.Sprintf("rc: %d", rc),
|
||||
"",
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int, logFunc func(string)) []int {
|
||||
if len(gpuIndices) == 0 {
|
||||
return nil
|
||||
@@ -266,8 +296,7 @@ func resetBenchmarkGPUs(ctx context.Context, verboseLog string, gpuIndices []int
|
||||
}
|
||||
var failed []int
|
||||
for _, idx := range gpuIndices {
|
||||
name := fmt.Sprintf("power-preflight-gpu-%d-reset.log", idx)
|
||||
if _, err := runSATCommandCtx(ctx, verboseLog, name, []string{"nvidia-smi", "-i", strconv.Itoa(idx), "-r"}, nil, logFunc); err != nil {
|
||||
if err := resetBenchmarkGPU(ctx, verboseLog, idx, logFunc); err != nil {
|
||||
failed = append(failed, idx)
|
||||
if logFunc != nil {
|
||||
logFunc(fmt.Sprintf("power benchmark pre-flight: GPU %d reset failed: %v", idx, err))
|
||||
@@ -4440,8 +4469,7 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
_ = os.MkdirAll(singleDir, 0755)
|
||||
singleInfo := cloneBenchmarkGPUInfoMap(infoByIndex)
|
||||
if failed := resetBenchmarkGPUs(ctx, verboseLog, []int{idx}, logFunc); len(failed) > 0 {
|
||||
result.Findings = append(result.Findings,
|
||||
fmt.Sprintf("GPU %d reset pre-flight did not complete before its first power test; throttle counters may contain stale state.", idx))
|
||||
return "", fmt.Errorf("power benchmark pre-flight: failed to reset GPU %d; benchmark aborted to keep measurements clean", idx)
|
||||
}
|
||||
logFunc(fmt.Sprintf("power calibration: GPU %d single-card baseline", idx))
|
||||
singlePowerStopCh := make(chan struct{})
|
||||
|
||||
@@ -2,7 +2,7 @@ package platform
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
@@ -188,18 +188,16 @@ func TestBenchmarkCalibrationThrottleReasonIgnoresPowerReasons(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldExec := satExecCommand
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 1000 }
|
||||
satExecCommand = func(name string, args ...string) *exec.Cmd {
|
||||
t.Fatalf("unexpected command: %s %v", name, args)
|
||||
return nil
|
||||
benchmarkResetNvidiaGPU = func(int) (string, error) {
|
||||
t.Fatal("unexpected reset call")
|
||||
return "", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
satExecCommand = oldExec
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
var logs []string
|
||||
@@ -215,44 +213,52 @@ func TestResetBenchmarkGPUsSkipsWithoutRoot(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsResetsEachGPU(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
dir := t.TempDir()
|
||||
script := filepath.Join(dir, "nvidia-smi")
|
||||
argsLog := filepath.Join(dir, "args.log")
|
||||
if err := os.WriteFile(script, []byte("#!/bin/sh\nprintf '%s\\n' \"$*\" >> "+argsLog+"\nprintf 'ok\\n'\n"), 0755); err != nil {
|
||||
t.Fatalf("write script: %v", err)
|
||||
}
|
||||
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldLookPath := satLookPath
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
satLookPath = func(file string) (string, error) {
|
||||
if file == "nvidia-smi" {
|
||||
return script, nil
|
||||
}
|
||||
return exec.LookPath(file)
|
||||
var calls []int
|
||||
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||
calls = append(calls, index)
|
||||
return "ok\n", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
satLookPath = oldLookPath
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(dir, "verbose.log"), []int{2, 5}, nil)
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||
if len(failed) != 0 {
|
||||
t.Fatalf("failed=%v want no failures", failed)
|
||||
}
|
||||
raw, err := os.ReadFile(argsLog)
|
||||
if err != nil {
|
||||
t.Fatalf("read args log: %v", err)
|
||||
if got, want := fmt.Sprint(calls), "[2 5]"; got != want {
|
||||
t.Fatalf("calls=%v want %s", calls, want)
|
||||
}
|
||||
got := strings.Fields(string(raw))
|
||||
want := []string{"-i", "2", "-r", "-i", "5", "-r"}
|
||||
if strings.Join(got, " ") != strings.Join(want, " ") {
|
||||
t.Fatalf("args=%v want %v", got, want)
|
||||
}
|
||||
|
||||
func TestResetBenchmarkGPUsTracksFailuresFromSharedReset(t *testing.T) {
|
||||
oldGeteuid := benchmarkGeteuid
|
||||
oldSleep := benchmarkSleep
|
||||
oldReset := benchmarkResetNvidiaGPU
|
||||
benchmarkGeteuid = func() int { return 0 }
|
||||
benchmarkSleep = func(time.Duration) {}
|
||||
benchmarkResetNvidiaGPU = func(index int) (string, error) {
|
||||
if index == 5 {
|
||||
return "busy\n", exec.ErrNotFound
|
||||
}
|
||||
return "ok\n", nil
|
||||
}
|
||||
t.Cleanup(func() {
|
||||
benchmarkGeteuid = oldGeteuid
|
||||
benchmarkSleep = oldSleep
|
||||
benchmarkResetNvidiaGPU = oldReset
|
||||
})
|
||||
|
||||
failed := resetBenchmarkGPUs(context.Background(), filepath.Join(t.TempDir(), "verbose.log"), []int{2, 5}, nil)
|
||||
if got, want := fmt.Sprint(failed), "[5]"; got != want {
|
||||
t.Fatalf("failed=%v want %s", failed, want)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -38,6 +38,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
},
|
||||
// PCIe AER correctable from the NVIDIA driver — "bus correctable error" in SEL.
|
||||
// Severity is warning (not critical): correctable errors are hardware-recovered.
|
||||
{
|
||||
Name: "nvidia-aer-correctable",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER.*[Cc]orrect`),
|
||||
Category: "gpu",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "nvidia-aer",
|
||||
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
@@ -54,6 +63,15 @@ var HardwareErrorPatterns = []ErrorPattern{
|
||||
},
|
||||
|
||||
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
|
||||
// PCIe AER correctable from the root port — captures the reported device BDF
|
||||
// (second BDF in "pcieport X: AER: Correctable error received: Y").
|
||||
{
|
||||
Name: "pcie-aer-correctable",
|
||||
Re: mustPat(`(?i)pcieport.*AER:.*[Cc]orrect.*:\s*([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
|
||||
Category: "pcie",
|
||||
Severity: "warning",
|
||||
BDFGroup: 1,
|
||||
},
|
||||
{
|
||||
Name: "pcie-aer",
|
||||
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
@@ -18,7 +19,7 @@ type InstallDisk struct {
|
||||
MountedParts []string // partition mount points currently active
|
||||
}
|
||||
|
||||
const squashfsPath = "/run/live/medium/live/filesystem.squashfs"
|
||||
const squashfsGlob = "/run/live/medium/live/*.squashfs"
|
||||
|
||||
// ListInstallDisks returns block devices suitable for installation.
|
||||
// Excludes the current live boot medium but includes USB drives.
|
||||
@@ -176,11 +177,22 @@ func inferLiveBootKind(fsType, source, deviceType, transport string) string {
|
||||
// squashfs size × 1.5 to allow for extracted filesystem and bootloader.
|
||||
// Returns 0 if the squashfs is not available (non-live environment).
|
||||
func MinInstallBytes() int64 {
|
||||
fi, err := os.Stat(squashfsPath)
|
||||
if err != nil {
|
||||
files, err := filepath.Glob(squashfsGlob)
|
||||
if err != nil || len(files) == 0 {
|
||||
return 0
|
||||
}
|
||||
return fi.Size() * 3 / 2
|
||||
var total int64
|
||||
for _, path := range files {
|
||||
fi, statErr := os.Stat(path)
|
||||
if statErr != nil {
|
||||
continue
|
||||
}
|
||||
total += fi.Size()
|
||||
}
|
||||
if total == 0 {
|
||||
return 0
|
||||
}
|
||||
return total * 3 / 2
|
||||
}
|
||||
|
||||
// toramActive returns true when the live system was booted with toram.
|
||||
@@ -222,12 +234,10 @@ func DiskWarnings(d InstallDisk) []string {
|
||||
humanBytes(min), humanBytes(d.SizeBytes)))
|
||||
}
|
||||
if toramActive() {
|
||||
sqFi, err := os.Stat(squashfsPath)
|
||||
if err == nil {
|
||||
free := freeMemBytes()
|
||||
if free > 0 && free < sqFi.Size()*2 {
|
||||
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||
}
|
||||
free := freeMemBytes()
|
||||
min := MinInstallBytes()
|
||||
if free > 0 && min > 0 && free < (min*4/3) {
|
||||
w = append(w, "toram mode — low RAM, extraction may be slow or fail")
|
||||
}
|
||||
}
|
||||
return w
|
||||
|
||||
@@ -14,6 +14,22 @@ import (
|
||||
const installToRAMDir = "/dev/shm/bee-live"
|
||||
const copyProgressLogStep int64 = 100 * 1024 * 1024
|
||||
|
||||
var liveMediumSquashfsGlob = func() ([]string, error) {
|
||||
return filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
}
|
||||
|
||||
var runRemountMedium = func() ([]byte, error) {
|
||||
return exec.Command("bee-remount-medium").CombinedOutput()
|
||||
}
|
||||
|
||||
var umountLiveMedium = func() error {
|
||||
return exec.Command("umount", "/run/live/medium").Run()
|
||||
}
|
||||
|
||||
var ejectDevice = func(device string) error {
|
||||
return exec.Command("eject", device).Run()
|
||||
}
|
||||
|
||||
func (s *System) IsLiveMediaInRAM() bool {
|
||||
return s.LiveMediaRAMState().InRAM
|
||||
}
|
||||
@@ -140,8 +156,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
||||
return nil
|
||||
}
|
||||
|
||||
squashfsFiles, err := filepath.Glob("/run/live/medium/live/*.squashfs")
|
||||
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||
squashfsFiles, sourceAvailable := ensureLiveMediumAvailable(log)
|
||||
|
||||
dstDir := installToRAMDir
|
||||
|
||||
@@ -171,7 +186,7 @@ func (s *System) RunInstallToRAM(ctx context.Context, logFunc func(string)) (ret
|
||||
}
|
||||
goto bindMedium
|
||||
}
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry", dstDir)
|
||||
return fmt.Errorf("no squashfs files found in /run/live/medium/live/ and no prior RAM copy in %s — reconnect the installation medium and retry (or run bee-remount-medium as root)", dstDir)
|
||||
}
|
||||
|
||||
{
|
||||
@@ -254,10 +269,83 @@ bindMedium:
|
||||
if status.InRAM {
|
||||
log(fmt.Sprintf("Verification passed: live medium now served from %s.", describeLiveBootSource(status)))
|
||||
}
|
||||
log("Done. Squashfs files are in RAM. Installation media can be safely disconnected.")
|
||||
detachInstallMedium(status, log)
|
||||
log("Done. Squashfs files are in RAM. Installation media has been detached when possible.")
|
||||
return nil
|
||||
}
|
||||
|
||||
func tryRemountLiveMedium(log func(string)) error {
|
||||
output, err := runRemountMedium()
|
||||
trimmed := strings.TrimSpace(string(output))
|
||||
if err != nil {
|
||||
if trimmed != "" && log != nil {
|
||||
for _, line := range strings.Split(trimmed, "\n") {
|
||||
log("bee-remount-medium: " + line)
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
if trimmed != "" && log != nil {
|
||||
for _, line := range strings.Split(trimmed, "\n") {
|
||||
log("bee-remount-medium: " + line)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func ensureLiveMediumAvailable(log func(string)) ([]string, bool) {
|
||||
squashfsFiles, err := liveMediumSquashfsGlob()
|
||||
sourceAvailable := err == nil && len(squashfsFiles) > 0
|
||||
if sourceAvailable {
|
||||
return squashfsFiles, true
|
||||
}
|
||||
|
||||
if log != nil {
|
||||
log("Live medium not mounted at /run/live/medium — attempting automatic remount scan...")
|
||||
}
|
||||
if remountErr := tryRemountLiveMedium(log); remountErr != nil {
|
||||
if log != nil {
|
||||
log(fmt.Sprintf("Automatic remount did not restore the live medium: %v", remountErr))
|
||||
}
|
||||
return squashfsFiles, false
|
||||
}
|
||||
|
||||
squashfsFiles, err = liveMediumSquashfsGlob()
|
||||
sourceAvailable = err == nil && len(squashfsFiles) > 0
|
||||
if sourceAvailable && log != nil {
|
||||
log("Live medium restored after remount scan.")
|
||||
}
|
||||
return squashfsFiles, sourceAvailable
|
||||
}
|
||||
|
||||
func detachInstallMedium(status LiveBootSource, log func(string)) {
|
||||
if log == nil {
|
||||
log = func(string) {}
|
||||
}
|
||||
|
||||
log("Detaching original installation medium...")
|
||||
if err := umountLiveMedium(); err != nil {
|
||||
log(fmt.Sprintf("Warning: could not unmount /run/live/medium: %v", err))
|
||||
} else {
|
||||
log("Unmounted /run/live/medium.")
|
||||
}
|
||||
|
||||
device := strings.TrimSpace(status.Device)
|
||||
if device == "" {
|
||||
device = strings.TrimSpace(status.Source)
|
||||
}
|
||||
if device == "" || !strings.HasPrefix(device, "/dev/") {
|
||||
log("No block device identified for eject; skipping media eject.")
|
||||
return
|
||||
}
|
||||
|
||||
if err := ejectDevice(device); err != nil {
|
||||
log(fmt.Sprintf("Warning: could not eject %s: %v", device, err))
|
||||
return
|
||||
}
|
||||
log(fmt.Sprintf("Ejected %s.", device))
|
||||
}
|
||||
|
||||
func verifyInstallToRAMStatus(status LiveBootSource, dstDir string, mediumRebound bool, log func(string)) error {
|
||||
if status.InRAM {
|
||||
return nil
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestInferLiveBootKind(t *testing.T) {
|
||||
t.Parallel()
|
||||
@@ -124,3 +127,156 @@ func TestShouldLogCopyProgress(t *testing.T) {
|
||||
t.Fatal("expected final completion log")
|
||||
}
|
||||
}
|
||||
|
||||
func TestTryRemountLiveMedium(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
orig := runRemountMedium
|
||||
t.Cleanup(func() {
|
||||
runRemountMedium = orig
|
||||
})
|
||||
|
||||
t.Run("success", func(t *testing.T) {
|
||||
runRemountMedium = func() ([]byte, error) {
|
||||
return []byte("[10:57:31] Mounted /dev/sr1 on /run/live/medium\n"), nil
|
||||
}
|
||||
var logs []string
|
||||
if err := tryRemountLiveMedium(func(msg string) { logs = append(logs, msg) }); err != nil {
|
||||
t.Fatalf("tryRemountLiveMedium() error = %v", err)
|
||||
}
|
||||
if len(logs) != 1 || logs[0] != "bee-remount-medium: [10:57:31] Mounted /dev/sr1 on /run/live/medium" {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("failure", func(t *testing.T) {
|
||||
runRemountMedium = func() ([]byte, error) {
|
||||
return []byte("must be run as root\n"), fmt.Errorf("exit status 1")
|
||||
}
|
||||
var logs []string
|
||||
err := tryRemountLiveMedium(func(msg string) { logs = append(logs, msg) })
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
if len(logs) != 1 || logs[0] != "bee-remount-medium: must be run as root" {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func TestEnsureLiveMediumAvailableRemountsSource(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
origGlob := liveMediumSquashfsGlob
|
||||
origRemount := runRemountMedium
|
||||
t.Cleanup(func() {
|
||||
liveMediumSquashfsGlob = origGlob
|
||||
runRemountMedium = origRemount
|
||||
})
|
||||
|
||||
callCount := 0
|
||||
liveMediumSquashfsGlob = func() ([]string, error) {
|
||||
callCount++
|
||||
if callCount == 1 {
|
||||
return nil, nil
|
||||
}
|
||||
return []string{"/run/live/medium/live/filesystem.squashfs"}, nil
|
||||
}
|
||||
runRemountMedium = func() ([]byte, error) {
|
||||
return []byte("Mounted /dev/sr1 on /run/live/medium\n"), nil
|
||||
}
|
||||
|
||||
var logs []string
|
||||
files, ok := ensureLiveMediumAvailable(func(msg string) { logs = append(logs, msg) })
|
||||
if !ok {
|
||||
t.Fatal("expected live medium to become available after remount")
|
||||
}
|
||||
if callCount < 2 {
|
||||
t.Fatalf("liveMediumSquashfsGlob called %d times, want at least 2", callCount)
|
||||
}
|
||||
if len(files) != 1 || files[0] != "/run/live/medium/live/filesystem.squashfs" {
|
||||
t.Fatalf("files=%v", files)
|
||||
}
|
||||
found := false
|
||||
for _, msg := range logs {
|
||||
if msg == "Live medium restored after remount scan." {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("expected remount success log, logs=%v", logs)
|
||||
}
|
||||
}
|
||||
|
||||
func TestDetachInstallMedium(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
origUmount := umountLiveMedium
|
||||
origEject := ejectDevice
|
||||
t.Cleanup(func() {
|
||||
umountLiveMedium = origUmount
|
||||
ejectDevice = origEject
|
||||
})
|
||||
|
||||
t.Run("success", func(t *testing.T) {
|
||||
var umountCalled bool
|
||||
var ejected string
|
||||
umountLiveMedium = func() error {
|
||||
umountCalled = true
|
||||
return nil
|
||||
}
|
||||
ejectDevice = func(device string) error {
|
||||
ejected = device
|
||||
return nil
|
||||
}
|
||||
var logs []string
|
||||
detachInstallMedium(LiveBootSource{Kind: "cdrom", Device: "/dev/sr1"}, func(msg string) { logs = append(logs, msg) })
|
||||
if !umountCalled {
|
||||
t.Fatal("expected umountLiveMedium to be called")
|
||||
}
|
||||
if ejected != "/dev/sr1" {
|
||||
t.Fatalf("ejected=%q want /dev/sr1", ejected)
|
||||
}
|
||||
if len(logs) < 3 {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("no device", func(t *testing.T) {
|
||||
umountLiveMedium = func() error { return nil }
|
||||
ejectDevice = func(device string) error {
|
||||
t.Fatalf("unexpected eject for %q", device)
|
||||
return nil
|
||||
}
|
||||
var logs []string
|
||||
detachInstallMedium(LiveBootSource{Kind: "ram", Source: "tmpfs"}, func(msg string) { logs = append(logs, msg) })
|
||||
found := false
|
||||
for _, msg := range logs {
|
||||
if msg == "No block device identified for eject; skipping media eject." {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("eject failure is warning only", func(t *testing.T) {
|
||||
umountLiveMedium = func() error { return nil }
|
||||
ejectDevice = func(device string) error { return fmt.Errorf("exit status 1") }
|
||||
var logs []string
|
||||
detachInstallMedium(LiveBootSource{Kind: "usb", Device: "/dev/sdb1"}, func(msg string) { logs = append(logs, msg) })
|
||||
found := false
|
||||
for _, msg := range logs {
|
||||
if msg == "Warning: could not eject /dev/sdb1: exit status 1" {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Fatalf("logs=%v", logs)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -18,6 +18,7 @@ var workerPatterns = []string{
|
||||
"stress-ng",
|
||||
"stressapptest",
|
||||
"memtester",
|
||||
"nvbandwidth",
|
||||
// DCGM diagnostic workers — nvvs is spawned by dcgmi diag and survives
|
||||
// if dcgmi is killed mid-run, leaving the GPU occupied (DCGM_ST_IN_USE).
|
||||
"nvvs",
|
||||
@@ -71,13 +72,19 @@ func KillTestWorkers() []KilledProcess {
|
||||
if idx := strings.LastIndexByte(exe, '/'); idx >= 0 {
|
||||
base = exe[idx+1:]
|
||||
}
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
break
|
||||
}
|
||||
if shouldKillWorkerProcess(exe, base) {
|
||||
_ = syscall.Kill(pid, syscall.SIGKILL)
|
||||
killed = append(killed, KilledProcess{PID: pid, Name: base})
|
||||
}
|
||||
}
|
||||
return killed
|
||||
}
|
||||
|
||||
func shouldKillWorkerProcess(exe, base string) bool {
|
||||
for _, pat := range workerPatterns {
|
||||
if strings.Contains(base, pat) || strings.Contains(exe, pat) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
39
audit/internal/platform/kill_workers_test.go
Normal file
39
audit/internal/platform/kill_workers_test.go
Normal file
@@ -0,0 +1,39 @@
|
||||
package platform
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestShouldKillWorkerProcess(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
exe string
|
||||
base string
|
||||
want bool
|
||||
}{
|
||||
{
|
||||
name: "nvbandwidth executable",
|
||||
exe: "/usr/libexec/datacenter-gpu-manager-4/plugins/cuda13/nvbandwidth",
|
||||
base: "nvbandwidth",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "dcgmi executable",
|
||||
exe: "/usr/bin/dcgmi",
|
||||
base: "dcgmi",
|
||||
want: true,
|
||||
},
|
||||
{
|
||||
name: "unrelated process",
|
||||
exe: "/usr/bin/bash",
|
||||
base: "bash",
|
||||
want: false,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := shouldKillWorkerProcess(tt.exe, tt.base); got != tt.want {
|
||||
t.Fatalf("shouldKillWorkerProcess(%q, %q)=%v want %v", tt.exe, tt.base, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -258,7 +258,7 @@ func (s *System) GetInterfaceState(iface string) (bool, error) {
|
||||
func interfaceAdminState(iface string) (bool, error) {
|
||||
raw, err := exec.Command("ip", "-o", "link", "show", "dev", iface).Output()
|
||||
if err != nil {
|
||||
return false, err
|
||||
return false, fmt.Errorf("ip link show dev %s: %w", iface, err)
|
||||
}
|
||||
return parseInterfaceAdminState(string(raw))
|
||||
}
|
||||
@@ -288,7 +288,7 @@ func interfaceIPv4Addrs(iface string) ([]string, error) {
|
||||
if errors.As(err, &exitErr) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("ip addr show dev %s: %w", iface, err)
|
||||
}
|
||||
var ipv4 []string
|
||||
for _, line := range strings.Split(strings.TrimSpace(string(raw)), "\n") {
|
||||
|
||||
@@ -3,6 +3,8 @@ package platform
|
||||
import (
|
||||
"fmt"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
@@ -28,3 +30,22 @@ func runNvidiaRecover(args ...string) (string, error) {
|
||||
raw, err := exec.Command("sudo", helperArgs...).CombinedOutput()
|
||||
return string(raw), err
|
||||
}
|
||||
|
||||
func resetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "GPU reset completed.\n"
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
|
||||
func restartNvidiaDrivers() (string, error) {
|
||||
out, err := runNvidiaRecover("restart-drivers")
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "NVIDIA drivers restarted.\n"
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
|
||||
@@ -55,7 +55,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
if err == nil {
|
||||
health.Interfaces = make([]schema.RuntimeInterface, 0, len(interfaces))
|
||||
hasIPv4 := false
|
||||
missingIPv4 := false
|
||||
for _, iface := range interfaces {
|
||||
outcome := "no_offer"
|
||||
if len(iface.IPv4) > 0 {
|
||||
@@ -63,8 +62,6 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
hasIPv4 = true
|
||||
} else if strings.EqualFold(iface.State, "DOWN") {
|
||||
outcome = "link_down"
|
||||
} else {
|
||||
missingIPv4 = true
|
||||
}
|
||||
health.Interfaces = append(health.Interfaces, schema.RuntimeInterface{
|
||||
Name: iface.Name,
|
||||
@@ -73,17 +70,9 @@ func (s *System) CollectRuntimeHealth(exportDir string) (schema.RuntimeHealth, e
|
||||
Outcome: outcome,
|
||||
})
|
||||
}
|
||||
switch {
|
||||
case hasIPv4 && !missingIPv4:
|
||||
if hasIPv4 {
|
||||
health.NetworkStatus = "OK"
|
||||
case hasIPv4:
|
||||
health.NetworkStatus = "PARTIAL"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "dhcp_partial",
|
||||
Severity: "warning",
|
||||
Description: "At least one interface did not obtain IPv4 connectivity.",
|
||||
})
|
||||
default:
|
||||
} else {
|
||||
health.NetworkStatus = "FAILED"
|
||||
health.Issues = append(health.Issues, schema.RuntimeIssue{
|
||||
Code: "dhcp_failed",
|
||||
|
||||
@@ -182,9 +182,16 @@ func (s *System) DetectGPUVendor() string {
|
||||
return "amd"
|
||||
}
|
||||
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||||
text := strings.ToLower(string(raw))
|
||||
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
||||
return "amd"
|
||||
// Only match AMD GPU device classes [0300]=VGA, [0302]=3D controller, [0380]=Display.
|
||||
// AMD CPUs also appear in lspci as "Advanced Micro Devices" (Root Complex, IOMMU, etc.)
|
||||
// so matching vendor alone causes false positives on AMD CPU servers without GPUs.
|
||||
for _, line := range strings.Split(strings.ToLower(string(raw)), "\n") {
|
||||
if !strings.Contains(line, "advanced micro devices") && !strings.Contains(line, "amd/ati") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(line, "[0300]") || strings.Contains(line, "[0302]") || strings.Contains(line, "[0380]") {
|
||||
return "amd"
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
@@ -404,14 +411,7 @@ func normalizeNvidiaBusID(v string) string {
|
||||
}
|
||||
|
||||
func (s *System) ResetNvidiaGPU(index int) (string, error) {
|
||||
if index < 0 {
|
||||
return "", fmt.Errorf("gpu index must be >= 0")
|
||||
}
|
||||
out, err := runNvidiaRecover("reset-gpu", strconv.Itoa(index))
|
||||
if strings.TrimSpace(out) == "" && err == nil {
|
||||
out = "GPU reset completed.\n"
|
||||
}
|
||||
return out, err
|
||||
return resetNvidiaGPU(index)
|
||||
}
|
||||
|
||||
// RunNCCLTests runs nccl-tests all_reduce_perf across the selected NVIDIA GPUs.
|
||||
|
||||
@@ -62,7 +62,7 @@ func (s *System) ServiceState(name string) string {
|
||||
|
||||
func (s *System) ServiceDo(name string, action ServiceAction) (string, error) {
|
||||
if name == "bee-nvidia" && action == ServiceRestart {
|
||||
return runNvidiaRecover("restart-drivers")
|
||||
return restartNvidiaDrivers()
|
||||
}
|
||||
// bee-web runs as the bee user; sudo is required to control system services.
|
||||
// /etc/sudoers.d/bee grants bee NOPASSWD:ALL.
|
||||
|
||||
@@ -25,6 +25,9 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sensor"}, File: "ipmitool-sensor.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sel", "list"}, File: "ipmitool-sel.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sel", "time", "get"}, File: "ipmitool-sel-time.txt"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
// core/internal/ingest/parser_hardware.go. No import dependency on core.
|
||||
package schema
|
||||
|
||||
import "encoding/json"
|
||||
|
||||
// HardwareIngestRequest is the top-level output document produced by `bee audit`.
|
||||
// It is accepted as-is by the core /api/ingest/hardware endpoint.
|
||||
type HardwareIngestRequest struct {
|
||||
@@ -64,8 +66,10 @@ type HardwareSnapshot struct {
|
||||
Storage []HardwareStorage `json:"storage,omitempty"`
|
||||
PCIeDevices []HardwarePCIeDevice `json:"pcie_devices,omitempty"`
|
||||
PowerSupplies []HardwarePowerSupply `json:"power_supplies,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
Sensors *HardwareSensors `json:"sensors,omitempty"`
|
||||
EventLogs []HardwareEventLog `json:"event_logs,omitempty"`
|
||||
PlatformConfig *json.RawMessage `json:"platform_config,omitempty"`
|
||||
VROCLicense *string `json:"vroc_license,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareHealthSummary struct {
|
||||
@@ -122,7 +126,7 @@ type HardwareCPU struct {
|
||||
type HardwareMemory struct {
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Location *string `json:"-"` // internal: used for DIMM telemetry matching only
|
||||
Present *bool `json:"present,omitempty"`
|
||||
SizeMB *int `json:"size_mb,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
@@ -143,30 +147,33 @@ type HardwareMemory struct {
|
||||
|
||||
type HardwareStorage struct {
|
||||
HardwareComponentStatus
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
SizeGB *int `json:"size_gb,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
Interface *string `json:"interface,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
Slot *string `json:"slot,omitempty"`
|
||||
Type *string `json:"type,omitempty"`
|
||||
Model *string `json:"model,omitempty"`
|
||||
SizeGB *int `json:"size_gb,omitempty"`
|
||||
LogicalBlockSizeBytes *int64 `json:"logical_block_size_bytes,omitempty"`
|
||||
PhysicalBlockSizeBytes *int64 `json:"physical_block_size_bytes,omitempty"`
|
||||
MetadataBytesPerBlock *int64 `json:"metadata_bytes_per_block,omitempty"`
|
||||
SerialNumber *string `json:"serial_number,omitempty"`
|
||||
Manufacturer *string `json:"manufacturer,omitempty"`
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
Interface *string `json:"interface,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
TemperatureC *float64 `json:"temperature_c,omitempty"`
|
||||
PowerOnHours *int64 `json:"power_on_hours,omitempty"`
|
||||
PowerCycles *int64 `json:"power_cycles,omitempty"`
|
||||
UnsafeShutdowns *int64 `json:"unsafe_shutdowns,omitempty"`
|
||||
MediaErrors *int64 `json:"media_errors,omitempty"`
|
||||
ErrorLogEntries *int64 `json:"error_log_entries,omitempty"`
|
||||
WrittenBytes *int64 `json:"written_bytes,omitempty"`
|
||||
ReadBytes *int64 `json:"read_bytes,omitempty"`
|
||||
LifeUsedPct *float64 `json:"life_used_pct,omitempty"`
|
||||
LifeRemainingPct *float64 `json:"life_remaining_pct,omitempty"`
|
||||
AvailableSparePct *float64 `json:"available_spare_pct,omitempty"`
|
||||
ReallocatedSectors *int64 `json:"reallocated_sectors,omitempty"`
|
||||
CurrentPendingSectors *int64 `json:"current_pending_sectors,omitempty"`
|
||||
OfflineUncorrectable *int64 `json:"offline_uncorrectable,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
}
|
||||
|
||||
type HardwarePCIeDevice struct {
|
||||
@@ -211,6 +218,7 @@ type HardwarePCIeDevice struct {
|
||||
Firmware *string `json:"firmware,omitempty"`
|
||||
MacAddresses []string `json:"mac_addresses,omitempty"`
|
||||
Present *bool `json:"present,omitempty"`
|
||||
IOMMUGroup *int `json:"iommu_group,omitempty"`
|
||||
Telemetry map[string]any `json:"-"`
|
||||
}
|
||||
|
||||
@@ -256,15 +264,13 @@ type HardwareSensors struct {
|
||||
}
|
||||
|
||||
type HardwareFanSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
RPM *int `json:"rpm,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
Name string `json:"name"`
|
||||
RPM *int `json:"rpm,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
type HardwarePowerSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
VoltageV *float64 `json:"voltage_v,omitempty"`
|
||||
CurrentA *float64 `json:"current_a,omitempty"`
|
||||
PowerW *float64 `json:"power_w,omitempty"`
|
||||
@@ -273,7 +279,6 @@ type HardwarePowerSensor struct {
|
||||
|
||||
type HardwareTemperatureSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Celsius *float64 `json:"celsius,omitempty"`
|
||||
ThresholdWarningCelsius *float64 `json:"threshold_warning_celsius,omitempty"`
|
||||
ThresholdCriticalCelsius *float64 `json:"threshold_critical_celsius,omitempty"`
|
||||
@@ -281,11 +286,10 @@ type HardwareTemperatureSensor struct {
|
||||
}
|
||||
|
||||
type HardwareOtherSensor struct {
|
||||
Name string `json:"name"`
|
||||
Location *string `json:"location,omitempty"`
|
||||
Value *float64 `json:"value,omitempty"`
|
||||
Unit *string `json:"unit,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
Name string `json:"name"`
|
||||
Value *float64 `json:"value,omitempty"`
|
||||
Unit *string `json:"unit,omitempty"`
|
||||
Status *string `json:"status,omitempty"`
|
||||
}
|
||||
|
||||
type HardwareEventLog struct {
|
||||
|
||||
@@ -44,3 +44,57 @@ func TestHardwareSnapshotMarshalsNewContractFields(t *testing.T) {
|
||||
t.Fatalf("missing event_logs payload: %s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHardwareSnapshotMarshalsStorageTelemetryFields(t *testing.T) {
|
||||
powerOnHours := int64(12450)
|
||||
writtenBytes := int64(9876543210)
|
||||
readBytes := int64(1234567890)
|
||||
lifeRemainingPct := 91.0
|
||||
logicalBlockSizeBytes := int64(512)
|
||||
physicalBlockSizeBytes := int64(4096)
|
||||
metadataBytesPerBlock := int64(8)
|
||||
|
||||
payload := HardwareIngestRequest{
|
||||
CollectedAt: "2026-03-15T15:00:00Z",
|
||||
Hardware: HardwareSnapshot{
|
||||
Board: HardwareBoard{SerialNumber: "SRV-001"},
|
||||
Storage: []HardwareStorage{
|
||||
{
|
||||
SerialNumber: stringPtr("DISK-001"),
|
||||
Model: stringPtr("TestDisk"),
|
||||
LogicalBlockSizeBytes: &logicalBlockSizeBytes,
|
||||
PhysicalBlockSizeBytes: &physicalBlockSizeBytes,
|
||||
MetadataBytesPerBlock: &metadataBytesPerBlock,
|
||||
PowerOnHours: &powerOnHours,
|
||||
WrittenBytes: &writtenBytes,
|
||||
ReadBytes: &readBytes,
|
||||
LifeRemainingPct: &lifeRemainingPct,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
data, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
t.Fatalf("marshal: %v", err)
|
||||
}
|
||||
text := string(data)
|
||||
for _, needle := range []string{
|
||||
`"storage":[{`,
|
||||
`"logical_block_size_bytes":512`,
|
||||
`"physical_block_size_bytes":4096`,
|
||||
`"metadata_bytes_per_block":8`,
|
||||
`"power_on_hours":12450`,
|
||||
`"written_bytes":9876543210`,
|
||||
`"read_bytes":1234567890`,
|
||||
`"life_remaining_pct":91`,
|
||||
} {
|
||||
if !strings.Contains(text, needle) {
|
||||
t.Fatalf("missing %q in payload: %s", needle, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func stringPtr(v string) *string {
|
||||
return &v
|
||||
}
|
||||
|
||||
@@ -125,6 +125,8 @@ func defaultTaskPriority(target string, params taskParams) int {
|
||||
return taskPriorityInstall
|
||||
case "install-to-ram":
|
||||
return taskPriorityInstallToRAM
|
||||
case "nvme-format":
|
||||
return taskPriorityInstall
|
||||
case "audit":
|
||||
return taskPriorityAudit
|
||||
case "nvidia-bench-perf", "nvidia-bench-power", "nvidia-bench-autotune":
|
||||
@@ -806,15 +808,14 @@ func (h *handler) handleAPISATAbort(w http.ResponseWriter, r *http.Request) {
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
if t.job == nil || !t.job.abort() {
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "not_running"})
|
||||
return
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "aborting"})
|
||||
return
|
||||
}
|
||||
globalQueue.mu.Unlock()
|
||||
writeJSON(w, map[string]string{"status": "aborted"})
|
||||
@@ -1039,6 +1040,81 @@ func (h *handler) handleAPIExportUSBBundle(w http.ResponseWriter, r *http.Reques
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": result.Body})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxStatus(w http.ResponseWriter, _ *http.Request) {
|
||||
state, err := app.ReadBlackboxState(filepath.Join(h.opts.ExportDir, "blackbox-state.json"))
|
||||
if err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
writeJSON(w, app.BlackboxState{Status: "disabled", Targets: []app.BlackboxTargetStatus{}})
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
if state.Targets == nil {
|
||||
state.Targets = []app.BlackboxTargetStatus{}
|
||||
}
|
||||
writeJSON(w, state)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxEnable(w http.ResponseWriter, r *http.Request) {
|
||||
if h.opts.App == nil {
|
||||
writeError(w, http.StatusServiceUnavailable, "app not configured")
|
||||
return
|
||||
}
|
||||
var target platform.RemovableTarget
|
||||
if err := json.NewDecoder(r.Body).Decode(&target); err != nil || strings.TrimSpace(target.Device) == "" {
|
||||
writeError(w, http.StatusBadRequest, "device is required")
|
||||
return
|
||||
}
|
||||
targets, err := h.opts.App.ListRemovableTargets()
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
allowed := false
|
||||
for _, candidate := range targets {
|
||||
if candidate.Device == target.Device {
|
||||
target = candidate
|
||||
allowed = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !allowed {
|
||||
writeError(w, http.StatusBadRequest, "device not in removable target list")
|
||||
return
|
||||
}
|
||||
marker, err := app.EnableBlackboxTarget(target)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]any{
|
||||
"status": "ok",
|
||||
"message": "Black-box marker written.",
|
||||
"enrollment_id": marker.EnrollmentID,
|
||||
})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIBlackboxDisable(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Device string `json:"device"`
|
||||
EnrollmentID string `json:"enrollment_id"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if err := app.DisableBlackboxTarget(req.Device, req.EnrollmentID); err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
writeError(w, http.StatusNotFound, "black-box target not found")
|
||||
return
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "ok", "message": "Black-box marker removed."})
|
||||
}
|
||||
|
||||
// ── GPU presence ──────────────────────────────────────────────────────────────
|
||||
|
||||
func (h *handler) handleAPIGNVIDIAGPUs(w http.ResponseWriter, _ *http.Request) {
|
||||
@@ -1216,12 +1292,28 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
||||
_ = json.NewEncoder(w).Encode(map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISystemReboot(w http.ResponseWriter, r *http.Request) {
|
||||
if err := exec.Command("systemctl", "reboot").Start(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "reboot failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "rebooting"})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISystemShutdown(w http.ResponseWriter, r *http.Request) {
|
||||
if err := exec.Command("systemctl", "poweroff").Start(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "shutdown failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "shutting down"})
|
||||
}
|
||||
|
||||
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
var standardTools = []string{
|
||||
"dmidecode", "smartctl", "nvme", "lspci", "ipmitool",
|
||||
"nvidia-smi", "dcgmi", "nv-hostengine", "memtester", "stress-ng", "nvtop",
|
||||
"mstflint", "qrencode",
|
||||
"mstflint", "saa",
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIToolsCheck(w http.ResponseWriter, r *http.Request) {
|
||||
@@ -1603,6 +1695,56 @@ func (h *handler) handleAPIBenchmarkResults(w http.ResponseWriter, r *http.Reque
|
||||
fmt.Fprint(w, renderBenchmarkResultsCard(h.opts.ExportDir))
|
||||
}
|
||||
|
||||
// ── Hardware summary / component detail ──────────────────────────────────────
|
||||
|
||||
// handleAPIHardwareSummary returns the hardware summary card HTML fragment for
|
||||
// htmx polling (hx-get="/api/hardware-summary" hx-swap="outerHTML").
|
||||
func (h *handler) handleAPIHardwareSummary(w http.ResponseWriter, _ *http.Request) {
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
fmt.Fprint(w, renderHardwareSummaryCard(h.opts))
|
||||
}
|
||||
|
||||
// handleAPIComponentDetail returns an HTML fragment describing the current and
|
||||
// historical status for one component type (cpu, memory, storage, gpu, psu).
|
||||
func (h *handler) handleAPIComponentDetail(w http.ResponseWriter, r *http.Request) {
|
||||
compType := r.PathValue("type")
|
||||
var exact, prefixes []string
|
||||
var title string
|
||||
switch compType {
|
||||
case "cpu":
|
||||
title = "CPU"
|
||||
exact = []string{"cpu:all"}
|
||||
case "memory":
|
||||
title = "Memory"
|
||||
exact = []string{"memory:all"}
|
||||
prefixes = []string{"memory:"}
|
||||
case "storage":
|
||||
title = "Storage"
|
||||
exact = []string{"storage:all"}
|
||||
prefixes = []string{"storage:"}
|
||||
case "gpu":
|
||||
title = "GPU"
|
||||
prefixes = []string{"pcie:gpu:"}
|
||||
case "psu":
|
||||
title = "PSU"
|
||||
prefixes = []string{"psu:"}
|
||||
default:
|
||||
http.NotFound(w, r)
|
||||
return
|
||||
}
|
||||
|
||||
var records []app.ComponentStatusRecord
|
||||
if h.opts.App != nil && h.opts.App.StatusDB != nil {
|
||||
all := h.opts.App.StatusDB.All()
|
||||
records = matchedRecords(all, exact, prefixes)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "text/html; charset=utf-8")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
fmt.Fprint(w, renderComponentDetail(title, records))
|
||||
}
|
||||
|
||||
func (h *handler) rollbackPendingNetworkChange() error {
|
||||
h.pendingNetMu.Lock()
|
||||
pnc := h.pendingNet
|
||||
|
||||
@@ -3,6 +3,8 @@ package webui
|
||||
import (
|
||||
"encoding/json"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
@@ -44,6 +46,66 @@ func TestHandleAPISATRunDecodesBodyWithoutContentLength(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBlackboxStatusReturnsDisabledWhenStateMissing(t *testing.T) {
|
||||
h := &handler{opts: HandlerOptions{ExportDir: t.TempDir()}}
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||
|
||||
h.handleAPIBlackboxStatus(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
var state app.BlackboxState
|
||||
if err := json.Unmarshal(rec.Body.Bytes(), &state); err != nil {
|
||||
t.Fatalf("decode state: %v", err)
|
||||
}
|
||||
if state.Status != "disabled" {
|
||||
t.Fatalf("status=%q want disabled", state.Status)
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBlackboxStatusReturnsPersistedState(t *testing.T) {
|
||||
exportDir := t.TempDir()
|
||||
statePath := filepath.Join(exportDir, "blackbox-state.json")
|
||||
if err := os.WriteFile(statePath, []byte(`{"status":"running","boot_folder":"boot-folder","targets":[{"enrollment_id":"bb-1","device":"/dev/sdb1","status":"running","flush_period":"1s"}]}`), 0644); err != nil {
|
||||
t.Fatalf("write state: %v", err)
|
||||
}
|
||||
h := &handler{opts: HandlerOptions{ExportDir: exportDir}}
|
||||
rec := httptest.NewRecorder()
|
||||
req := httptest.NewRequest("GET", "/api/blackbox/status", nil)
|
||||
|
||||
h.handleAPIBlackboxStatus(rec, req)
|
||||
|
||||
if rec.Code != 200 {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
if !strings.Contains(rec.Body.String(), `"boot_folder":"boot-folder"`) {
|
||||
t.Fatalf("body=%s", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNVMeFormatModes(t *testing.T) {
|
||||
raw := `
|
||||
lbaf 0 : ms:0 lbads:9 rp:0x2 (in use)
|
||||
lbaf 1 : ms:8 lbads:9 rp:0x1
|
||||
lbaf 2 : ms:0 lbads:12 rp:0
|
||||
`
|
||||
modes := parseNVMeFormatModes(raw)
|
||||
if len(modes) != 3 {
|
||||
t.Fatalf("modes=%#v want 3 modes", modes)
|
||||
}
|
||||
if modes[0].Mode != 0 || modes[0].DataBytes != 512 || modes[0].MetadataBytes != 0 || !modes[0].InUse {
|
||||
t.Fatalf("mode 0=%#v", modes[0])
|
||||
}
|
||||
if modes[1].Label != "MODE 1 (512+8)" {
|
||||
t.Fatalf("mode 1 label=%q", modes[1].Label)
|
||||
}
|
||||
if modes[2].DataBytes != 4096 || modes[2].MetadataBytes != 0 {
|
||||
t.Fatalf("mode 2=%#v", modes[2])
|
||||
}
|
||||
}
|
||||
|
||||
func TestHandleAPIBenchmarkNvidiaRunQueuesSelectedGPUs(t *testing.T) {
|
||||
globalQueue.mu.Lock()
|
||||
originalTasks := globalQueue.tasks
|
||||
|
||||
76
audit/internal/webui/health_poller.go
Normal file
76
audit/internal/webui/health_poller.go
Normal file
@@ -0,0 +1,76 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/collector"
|
||||
)
|
||||
|
||||
const healthPollInterval = 60 * time.Second
|
||||
const psuIPMITimeout = 15 * time.Second
|
||||
|
||||
// healthPoller runs periodic health checks for hardware components that do not
|
||||
// emit kernel log events (e.g. PSU). Results are written to ComponentStatusDB.
|
||||
type healthPoller struct {
|
||||
statusDB *app.ComponentStatusDB
|
||||
}
|
||||
|
||||
func newHealthPoller(statusDB *app.ComponentStatusDB) *healthPoller {
|
||||
return &healthPoller{statusDB: statusDB}
|
||||
}
|
||||
|
||||
func (p *healthPoller) start() {
|
||||
goRecoverLoop("health poller", 5*time.Second, p.run)
|
||||
}
|
||||
|
||||
func (p *healthPoller) run() {
|
||||
ticker := time.NewTicker(healthPollInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
p.pollPSU()
|
||||
}
|
||||
}
|
||||
|
||||
func (p *healthPoller) pollPSU() {
|
||||
if p.statusDB == nil {
|
||||
return
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), psuIPMITimeout)
|
||||
defer cancel()
|
||||
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "sdr")
|
||||
var out bytes.Buffer
|
||||
cmd.Stdout = &out
|
||||
if err := cmd.Run(); err != nil {
|
||||
// IPMI not available or not a server — skip silently.
|
||||
slog.Debug("health poller: ipmitool sdr unavailable", "err", err)
|
||||
return
|
||||
}
|
||||
|
||||
slots := collector.PSUSlotsFromSDR(out.String())
|
||||
if len(slots) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
const source = "watchdog:psu"
|
||||
for slot, psu := range slots {
|
||||
key := "psu:" + slot
|
||||
status := psu.Status
|
||||
if status == "" {
|
||||
status = "Unknown"
|
||||
}
|
||||
detail := ""
|
||||
switch status {
|
||||
case "Critical":
|
||||
detail = "PSU sensor reported non-OK state"
|
||||
case "Warning":
|
||||
detail = "PSU sensor in warning state"
|
||||
}
|
||||
p.statusDB.Record(key, source, status, detail)
|
||||
}
|
||||
}
|
||||
280
audit/internal/webui/huawei_elabel.go
Normal file
280
audit/internal/webui/huawei_elabel.go
Normal file
@@ -0,0 +1,280 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type huaweiField struct {
|
||||
Name string `json:"name"`
|
||||
Key string `json:"key"`
|
||||
Value string `json:"value"`
|
||||
ReadOnly bool `json:"read_only,omitempty"`
|
||||
}
|
||||
|
||||
type huaweiChange struct {
|
||||
Key string `json:"key"`
|
||||
Value string `json:"value"`
|
||||
}
|
||||
|
||||
type huaweiFieldDef struct {
|
||||
Name string
|
||||
Key string
|
||||
FruID byte
|
||||
TypeID byte
|
||||
FieldID byte
|
||||
Special string // "chassis-type" | "guid"
|
||||
}
|
||||
|
||||
var huaweiElabelDefs = []huaweiFieldDef{
|
||||
{"Device Name", "DeviceName", 0x00, 0x06, 0x01, ""},
|
||||
{"Device Serial Number", "DeviceSerialNumber", 0x00, 0x06, 0x03, ""},
|
||||
{"Product Name", "ProductName", 0x00, 0x03, 0x01, ""},
|
||||
{"Product Serial Number", "ProductSerialNumber", 0x00, 0x03, 0x04, ""},
|
||||
{"Product Asset Tag", "ProductAssetTag", 0x00, 0x03, 0x05, ""},
|
||||
{"Product Manufacturer", "ProductManufacturer", 0x00, 0x03, 0x00, ""},
|
||||
{"Mainboard Manufacturer", "MainboardManufacturer", 0x00, 0x02, 0x01, ""},
|
||||
{"Board Product Name", "BoardProductName", 0x00, 0x02, 0x02, ""},
|
||||
{"Chassis Part Number", "ChassisPartnumber", 0x00, 0x01, 0x01, ""},
|
||||
{"Chassis Type", "ChassisType", 0x00, 0x01, 0x00, "chassis-type"},
|
||||
{"IO Chassis Serial", "IOChassisSerialNumber", 0x01, 0x03, 0x04, ""},
|
||||
{"IO Chassis Asset Tag", "IOChassisAssetTag", 0x01, 0x03, 0x05, ""},
|
||||
{"GUID", "GUID", 0x00, 0x00, 0x00, "guid"},
|
||||
}
|
||||
|
||||
// huaweiGetRaw reads a string elabel field via OEM IPMI raw command.
|
||||
// Protocol: ipmitool raw 0x30 0x90 0x05 <fru_id> <type_id> <field_id> 0x00 0x30
|
||||
// Response: <length_byte> <ascii_byte1> ... (null-terminated)
|
||||
func huaweiGetRaw(ctx context.Context, def huaweiFieldDef) (string, error) {
|
||||
if def.Special == "guid" {
|
||||
return huaweiGetGUID(ctx)
|
||||
}
|
||||
args := []string{
|
||||
"0x30", "0x90", "0x05",
|
||||
fmt.Sprintf("0x%02x", def.FruID),
|
||||
fmt.Sprintf("0x%02x", def.TypeID),
|
||||
fmt.Sprintf("0x%02x", def.FieldID),
|
||||
"0x00", "0x30",
|
||||
}
|
||||
out, err := exec.CommandContext(ctx, "ipmitool", append([]string{"raw"}, args...)...).CombinedOutput()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return huaweiParseStringResponse(strings.TrimSpace(string(out)), def.Special), nil
|
||||
}
|
||||
|
||||
// huaweiParseStringResponse decodes the OEM IPMI response bytes to a string.
|
||||
// Format: <length_byte> <byte1> <byte2> ...
|
||||
func huaweiParseStringResponse(hexOut, special string) string {
|
||||
parts := strings.Fields(hexOut)
|
||||
if len(parts) < 2 {
|
||||
return ""
|
||||
}
|
||||
if special == "chassis-type" {
|
||||
// Response: <length=1> <type_byte>
|
||||
if len(parts) >= 2 {
|
||||
n, err := strconv.ParseUint(parts[1], 16, 8)
|
||||
if err == nil {
|
||||
return fmt.Sprintf("0x%02x", n)
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
var sb strings.Builder
|
||||
for _, p := range parts[1:] {
|
||||
b, err := strconv.ParseUint(p, 16, 8)
|
||||
if err != nil || b == 0 {
|
||||
break
|
||||
}
|
||||
sb.WriteByte(byte(b))
|
||||
}
|
||||
return strings.TrimRight(sb.String(), "\x00")
|
||||
}
|
||||
|
||||
// huaweiGetGUID reads the system GUID via standard IPMI Get System GUID (0x06 0x08).
|
||||
func huaweiGetGUID(ctx context.Context) (string, error) {
|
||||
out, err := exec.CommandContext(ctx, "ipmitool", "raw", "0x06", "0x08").CombinedOutput()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
parts := strings.Fields(strings.TrimSpace(string(out)))
|
||||
if len(parts) != 16 {
|
||||
return "", nil
|
||||
}
|
||||
// Format as UUID: 4-2-2-2-6 byte groups
|
||||
// iBMC returns bytes in reversed order; re-reverse to get canonical UUID.
|
||||
var bytes [16]string
|
||||
for i, p := range parts {
|
||||
bytes[15-i] = p
|
||||
}
|
||||
return fmt.Sprintf("%s%s%s%s-%s%s-%s%s-%s%s-%s%s%s%s%s%s",
|
||||
bytes[0], bytes[1], bytes[2], bytes[3],
|
||||
bytes[4], bytes[5],
|
||||
bytes[6], bytes[7],
|
||||
bytes[8], bytes[9],
|
||||
bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15],
|
||||
), nil
|
||||
}
|
||||
|
||||
// huaweiChunks splits a value into 19-byte chunks for the OEM IPMI SET protocol.
|
||||
// Key byte: bit7=1 means more chunks follow; bits 0-6 = offset into string.
|
||||
func huaweiChunks(value string) [][]string {
|
||||
if len(value) == 0 {
|
||||
return [][]string{{"0x00", "0x01", "0x00"}}
|
||||
}
|
||||
const maxLen = 63
|
||||
if len(value) > maxLen {
|
||||
value = value[:maxLen]
|
||||
}
|
||||
const chunkSize = 19
|
||||
var chunks [][]string
|
||||
for offset := 0; offset < len(value); {
|
||||
end := offset + chunkSize
|
||||
if end > len(value) {
|
||||
end = len(value)
|
||||
}
|
||||
isLast := end >= len(value)
|
||||
key := byte(offset)
|
||||
if !isLast {
|
||||
key |= 0x80
|
||||
}
|
||||
args := []string{
|
||||
fmt.Sprintf("0x%02x", key),
|
||||
fmt.Sprintf("0x%02x", end-offset),
|
||||
}
|
||||
for _, b := range []byte(value[offset:end]) {
|
||||
args = append(args, fmt.Sprintf("0x%02x", b))
|
||||
}
|
||||
chunks = append(chunks, args)
|
||||
offset = end
|
||||
}
|
||||
return chunks
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIHuaweiElabelRead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var fields []huaweiField
|
||||
for _, def := range huaweiElabelDefs {
|
||||
val, err := huaweiGetRaw(ctx, def)
|
||||
if err != nil {
|
||||
// First field failure likely means no Huawei BMC — abort with error.
|
||||
if len(fields) == 0 {
|
||||
msg := strings.TrimSpace(err.Error())
|
||||
writeError(w, http.StatusInternalServerError, "huawei elabel not available: "+msg)
|
||||
return
|
||||
}
|
||||
val = ""
|
||||
}
|
||||
fields = append(fields, huaweiField{
|
||||
Name: def.Name,
|
||||
Key: def.Key,
|
||||
Value: val,
|
||||
ReadOnly: def.Special == "guid" || def.Special == "chassis-type",
|
||||
})
|
||||
}
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIHuaweiElabelWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []huaweiChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
|
||||
defByKey := make(map[string]huaweiFieldDef, len(huaweiElabelDefs))
|
||||
for _, d := range huaweiElabelDefs {
|
||||
defByKey[d.Key] = d
|
||||
}
|
||||
|
||||
for _, c := range req.Changes {
|
||||
def, ok := defByKey[c.Key]
|
||||
if !ok {
|
||||
writeError(w, http.StatusUnprocessableEntity, "unknown field key: "+c.Key)
|
||||
return
|
||||
}
|
||||
if def.Special == "guid" || def.Special == "chassis-type" {
|
||||
writeError(w, http.StatusUnprocessableEntity, "field is read-only: "+c.Key)
|
||||
return
|
||||
}
|
||||
if len(c.Value) > 63 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value too long (max 63 chars): "+c.Key)
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch < 0x20 || ch > 0x7E {
|
||||
writeError(w, http.StatusUnprocessableEntity, "non-printable character in value for: "+c.Key)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("huawei-elabel-write"),
|
||||
Name: fmt.Sprintf("Huawei Elabel Write (%d field(s))", len(req.Changes)),
|
||||
Target: "huawei-elabel-write",
|
||||
Priority: defaultTaskPriority("huawei-elabel-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{HuaweiElabelChanges: req.Changes},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runHuaweiElabelWriteTask(ctx context.Context, j *jobState, p taskParams) error {
|
||||
defByKey := make(map[string]huaweiFieldDef, len(huaweiElabelDefs))
|
||||
for _, d := range huaweiElabelDefs {
|
||||
defByKey[d.Key] = d
|
||||
}
|
||||
|
||||
// Enable device name effective flag before writing.
|
||||
enableCmd := exec.CommandContext(ctx, "ipmitool", "raw", "0x30", "0x90", "0x21", "0x04", "0x01")
|
||||
if out, err := enableCmd.CombinedOutput(); err != nil {
|
||||
j.append("Warning: enable flag: " + strings.TrimSpace(string(out)))
|
||||
}
|
||||
|
||||
for _, c := range p.HuaweiElabelChanges {
|
||||
def := defByKey[c.Key]
|
||||
setPrefix := []string{
|
||||
"0x30", "0x90", "0x04",
|
||||
fmt.Sprintf("0x%02x", def.FruID),
|
||||
fmt.Sprintf("0x%02x", def.TypeID),
|
||||
fmt.Sprintf("0x%02x", def.FieldID),
|
||||
}
|
||||
|
||||
chunks := huaweiChunks(c.Value)
|
||||
j.append(fmt.Sprintf("Setting %s = %q (%d chunk(s))", c.Key, c.Value, len(chunks)))
|
||||
|
||||
for _, chunk := range chunks {
|
||||
args := append([]string{"raw"}, setPrefix...)
|
||||
args = append(args, chunk...)
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", args...)
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("set %s: %w", c.Key, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Commit after each field.
|
||||
commitCmd := exec.CommandContext(ctx, "ipmitool", "raw", "0x30", "0x90", "0x06", "0x00", "0xAA")
|
||||
if out, err := commitCmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("commit after %s: %w (output: %s)", c.Key, err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
j.append("Committed " + c.Key)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
204
audit/internal/webui/ipmi_fru.go
Normal file
204
audit/internal/webui/ipmi_fru.go
Normal file
@@ -0,0 +1,204 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
type fruField struct {
|
||||
Name string `json:"name"`
|
||||
Value string `json:"value"`
|
||||
Editable bool `json:"editable"`
|
||||
Area string `json:"area,omitempty"`
|
||||
Index int `json:"index,omitempty"`
|
||||
}
|
||||
|
||||
type fruChange struct {
|
||||
Area string `json:"area"`
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
Value string `json:"value"`
|
||||
}
|
||||
|
||||
// fruEditableFields maps display name → area + index for ipmitool fru edit.
|
||||
var fruEditableFields = map[string]struct {
|
||||
Area string
|
||||
Index int
|
||||
}{
|
||||
// Chassis — vendor doc names and ipmitool abbreviated names
|
||||
"Chassis Part Number": {"c", 0},
|
||||
"Chassis Serial Number": {"c", 1},
|
||||
"Chassis Serial": {"c", 1},
|
||||
"Chassis Extra": {"c", 2},
|
||||
// Board — vendor doc names and ipmitool abbreviated names
|
||||
"Board Manufacturer": {"b", 0},
|
||||
"Board Mfg": {"b", 0},
|
||||
"Board Product Name": {"b", 1},
|
||||
"Board Product": {"b", 1},
|
||||
"Board Serial Number": {"b", 2},
|
||||
"Board Serial": {"b", 2},
|
||||
"Board Part Number": {"b", 3},
|
||||
// Product — vendor doc names and ipmitool abbreviated names
|
||||
"Product Manufacturer": {"p", 0},
|
||||
"Product Name": {"p", 1},
|
||||
"Product Part Number": {"p", 2},
|
||||
"Product Version": {"p", 3},
|
||||
"Product Serial Number": {"p", 4},
|
||||
"Product Serial": {"p", 4},
|
||||
}
|
||||
|
||||
func parseFRUOutput(output string) []fruField {
|
||||
var fields []fruField
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
// Lines look like: " Field Name : value"
|
||||
trimmed := strings.TrimLeft(line, " \t")
|
||||
if trimmed == "" {
|
||||
continue
|
||||
}
|
||||
colon := strings.Index(trimmed, " : ")
|
||||
if colon < 0 {
|
||||
// try ": " with no leading space before colon
|
||||
colon = strings.Index(trimmed, ": ")
|
||||
if colon < 0 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(trimmed[:colon])
|
||||
value := strings.TrimSpace(trimmed[colon+2:])
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
editable, area, idx := fruFieldMeta(name)
|
||||
fields = append(fields, fruField{Name: name, Value: value, Editable: editable, Area: area, Index: idx})
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(trimmed[:colon])
|
||||
value := strings.TrimSpace(trimmed[colon+3:])
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
editable, area, idx := fruFieldMeta(name)
|
||||
fields = append(fields, fruField{Name: name, Value: value, Editable: editable, Area: area, Index: idx})
|
||||
}
|
||||
return fields
|
||||
}
|
||||
|
||||
func fruFieldMeta(name string) (editable bool, area string, index int) {
|
||||
if e, ok := fruEditableFields[name]; ok {
|
||||
return true, e.Area, e.Index
|
||||
}
|
||||
// All fields are shown as editable; server will reject unknown fields.
|
||||
return true, "", 0
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIIPMIFRURead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
out, err := exec.CommandContext(ctx, "ipmitool", "fru", "print", "0").CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, "ipmitool fru print: "+msg)
|
||||
return
|
||||
}
|
||||
|
||||
fields := parseFRUOutput(string(out))
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIIPMIFRUWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []fruChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
validAreas := map[string]bool{"c": true, "b": true, "p": true}
|
||||
for i, c := range req.Changes {
|
||||
if c.Area == "" {
|
||||
e, ok := fruEditableFields[c.Name]
|
||||
if !ok {
|
||||
writeError(w, http.StatusUnprocessableEntity, "field not writable via ipmitool: "+c.Name)
|
||||
return
|
||||
}
|
||||
req.Changes[i].Area = e.Area
|
||||
req.Changes[i].Index = e.Index
|
||||
c = req.Changes[i]
|
||||
}
|
||||
if !validAreas[c.Area] {
|
||||
writeError(w, http.StatusUnprocessableEntity, "invalid area: "+c.Area)
|
||||
return
|
||||
}
|
||||
if c.Index < 0 || c.Index > 9 {
|
||||
writeError(w, http.StatusUnprocessableEntity, fmt.Sprintf("invalid index %d", c.Index))
|
||||
return
|
||||
}
|
||||
if len(c.Value) > 64 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value too long (max 64 chars)")
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch > unicode.MaxASCII || (ch < 0x20 && ch != 0) {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value contains non-printable characters")
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("ipmi-fru-write"),
|
||||
Name: fmt.Sprintf("IPMI FRU Write (%d field(s))", len(req.Changes)),
|
||||
Target: "ipmi-fru-write",
|
||||
Priority: defaultTaskPriority("ipmi-fru-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{FRUChanges: req.Changes},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runIPMIFRUWriteTask(ctx context.Context, j *jobState, exportDir string, p taskParams) error {
|
||||
// Backup current FRU state
|
||||
backupDir := filepath.Join(exportDir, "fru-backups")
|
||||
if err := os.MkdirAll(backupDir, 0755); err != nil {
|
||||
return fmt.Errorf("mkdir fru-backups: %w", err)
|
||||
}
|
||||
stamp := time.Now().Format("20060102150405")
|
||||
backupPath := filepath.Join(backupDir, "fru-"+stamp+".txt")
|
||||
|
||||
backupOut, err := exec.CommandContext(ctx, "ipmitool", "fru", "print", "0").CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("backup fru print: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(backupPath, backupOut, 0644); err != nil {
|
||||
return fmt.Errorf("write backup: %w", err)
|
||||
}
|
||||
j.append("Backup saved to " + backupPath)
|
||||
|
||||
// Apply changes
|
||||
for _, c := range p.FRUChanges {
|
||||
j.append(fmt.Sprintf("Setting %s (%s %d) = %q", c.Name, c.Area, c.Index, c.Value))
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "edit", "0", "field", c.Area, fmt.Sprintf("%d", c.Index), c.Value)
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("fru edit %s %d: %w", c.Area, c.Index, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ type jobState struct {
|
||||
cancel func() // optional cancel function; nil if job is not cancellable
|
||||
logPath string
|
||||
serialPrefix string
|
||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||
logFile *os.File // kept open for the task lifetime to avoid per-line open/close
|
||||
logBuf *bufio.Writer
|
||||
}
|
||||
|
||||
@@ -53,13 +53,21 @@ func (j *jobState) abort() bool {
|
||||
}
|
||||
|
||||
func (j *jobState) append(line string) {
|
||||
j.appendWithOptions(line, true, true)
|
||||
}
|
||||
|
||||
func (j *jobState) appendFromLog(line string) {
|
||||
j.appendWithOptions(line, false, false)
|
||||
}
|
||||
|
||||
func (j *jobState) appendWithOptions(line string, persistLog, serialMirror bool) {
|
||||
j.mu.Lock()
|
||||
defer j.mu.Unlock()
|
||||
j.lines = append(j.lines, line)
|
||||
if j.logPath != "" {
|
||||
if persistLog && j.logPath != "" {
|
||||
j.writeLogLineLocked(line)
|
||||
}
|
||||
if j.serialPrefix != "" {
|
||||
if serialMirror && j.serialPrefix != "" {
|
||||
taskSerialWriteLine(j.serialPrefix + line)
|
||||
}
|
||||
for _, ch := range j.subs {
|
||||
@@ -83,6 +91,7 @@ func (j *jobState) writeLogLineLocked(line string) {
|
||||
j.logBuf = bufio.NewWriterSize(f, 64*1024)
|
||||
}
|
||||
_, _ = j.logBuf.WriteString(line + "\n")
|
||||
_ = j.logBuf.Flush()
|
||||
}
|
||||
|
||||
// closeLog flushes and closes the log file. Called after all task output is done.
|
||||
|
||||
@@ -73,6 +73,9 @@ func (w *kmsgWatcher) run() {
|
||||
w.mu.Lock()
|
||||
if w.window != nil {
|
||||
w.recordEvent(evt)
|
||||
} else {
|
||||
evtCopy := evt
|
||||
goRecoverOnce("kmsg flush immediate", func() { w.flushImmediate(evtCopy) })
|
||||
}
|
||||
w.mu.Unlock()
|
||||
}
|
||||
@@ -162,7 +165,9 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu", "pcie":
|
||||
case "gpu":
|
||||
key = "pcie:gpu:" + normalizeBDF(id)
|
||||
case "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
@@ -180,6 +185,54 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
|
||||
}
|
||||
}
|
||||
|
||||
// flushImmediate writes a single kmsg event directly to the status DB without a SAT window.
|
||||
// Called when an error is detected outside of any SAT task (always-on watching).
|
||||
func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
|
||||
if w.statusDB == nil {
|
||||
return
|
||||
}
|
||||
const source = "watchdog:kmsg"
|
||||
detail := "kernel: " + truncate(evt.raw, 120)
|
||||
|
||||
var severity string
|
||||
for _, p := range platform.HardwareErrorPatterns {
|
||||
if p.Re.MatchString(evt.raw) {
|
||||
if p.Severity == "critical" {
|
||||
severity = "Critical"
|
||||
} else {
|
||||
severity = "Warning"
|
||||
}
|
||||
break
|
||||
}
|
||||
}
|
||||
if severity == "" {
|
||||
severity = "Warning"
|
||||
}
|
||||
|
||||
if len(evt.ids) == 0 {
|
||||
key := "cpu:all"
|
||||
if evt.category == "memory" {
|
||||
key = "memory:all"
|
||||
}
|
||||
w.statusDB.Record(key, source, severity, detail)
|
||||
return
|
||||
}
|
||||
for _, id := range evt.ids {
|
||||
var key string
|
||||
switch evt.category {
|
||||
case "gpu":
|
||||
key = "pcie:gpu:" + normalizeBDF(id)
|
||||
case "pcie":
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
case "storage":
|
||||
key = "storage:" + id
|
||||
default:
|
||||
key = "pcie:" + normalizeBDF(id)
|
||||
}
|
||||
w.statusDB.Record(key, source, severity, detail)
|
||||
}
|
||||
}
|
||||
|
||||
// parseKmsgLine parses a single /dev/kmsg line and returns an event if it matches
|
||||
// any pattern in platform.HardwareErrorPatterns.
|
||||
// kmsg format: "<priority>,<sequence>,<timestamp_usec>,-;message text"
|
||||
|
||||
@@ -17,6 +17,7 @@ func layoutHead(title string) string {
|
||||
<style>
|
||||
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
|
||||
*{box-sizing:border-box;margin:0;padding:0}
|
||||
dialog{margin:auto}
|
||||
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
|
||||
a{color:var(--accent);text-decoration:none}
|
||||
/* Sidebar */
|
||||
@@ -67,6 +68,10 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||
/* Nav separator and tasks count badge */
|
||||
.nav-sep{height:1px;background:rgba(255,255,255,.12);margin:6px 0}
|
||||
.tasks-nav-count{background:var(--accent);color:#fff;border-radius:10px;padding:1px 7px;font-size:11px;font-weight:700;display:none;margin-left:auto}
|
||||
.tasks-nav-count.active{display:inline}
|
||||
/* Output terminal */
|
||||
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||
.terminal-wrap{position:relative}.terminal-copy{position:absolute;top:6px;right:6px;background:#2d2f30;border:1px solid #444;color:#aaa;font-size:11px;padding:2px 8px;border-radius:3px;cursor:pointer;opacity:.7}.terminal-copy:hover{opacity:1}
|
||||
@@ -92,14 +97,21 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
}
|
||||
|
||||
func layoutNav(active string, buildLabel string) string {
|
||||
items := []struct{ id, label, href, onclick string }{
|
||||
{"dashboard", "Dashboard", "/", ""},
|
||||
{"audit", "Audit", "/audit", ""},
|
||||
{"validate", "Validate", "/validate", ""},
|
||||
{"burn", "Burn", "/burn", ""},
|
||||
{"benchmark", "Benchmark", "/benchmark", ""},
|
||||
{"tasks", "Tasks", "/tasks", ""},
|
||||
{"tools", "Tools", "/tools", ""},
|
||||
type navItem struct {
|
||||
id, label, href string
|
||||
sep bool
|
||||
}
|
||||
items := []navItem{
|
||||
{id: "dashboard", label: "Dashboard", href: "/"},
|
||||
{id: "audit", label: "1. Audit", href: "/audit"},
|
||||
{id: "check", label: "2. Check", href: "/check"},
|
||||
{id: "load", label: "3. Load", href: "/load"},
|
||||
{id: "burn", label: "4. Burn", href: "/burn"},
|
||||
{id: "benchmark", label: "5. Benchmark", href: "/benchmark"},
|
||||
{sep: true},
|
||||
{id: "tasks", label: "Tasks", href: "/tasks"},
|
||||
{id: "tools", label: "Tools", href: "/tools"},
|
||||
{id: "settings", label: "Settings", href: "/settings"},
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<aside class="sidebar">`)
|
||||
@@ -119,19 +131,24 @@ func layoutNav(active string, buildLabel string) string {
|
||||
}
|
||||
b.WriteString(`<nav class="nav">`)
|
||||
for _, item := range items {
|
||||
if item.sep {
|
||||
b.WriteString(`<div class="nav-sep"></div>`)
|
||||
continue
|
||||
}
|
||||
cls := "nav-item"
|
||||
if item.id == active {
|
||||
cls += " active"
|
||||
}
|
||||
if item.onclick != "" {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" onclick="%s">%s</a>`,
|
||||
cls, item.href, item.onclick, item.label))
|
||||
if item.id == "tasks" {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" id="tasks-nav-item">%s<span class="tasks-nav-count" id="tasks-nav-count"></span></a>`, cls, item.href, item.label))
|
||||
} else {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`,
|
||||
cls, item.href, item.label))
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`, cls, item.href, item.label))
|
||||
}
|
||||
}
|
||||
b.WriteString(`</nav>`)
|
||||
b.WriteString(`<script>`)
|
||||
b.WriteString(`(function(){function u(){fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(d){var n=Array.isArray(d)?d.filter(function(t){return t.status==='pending'||t.status==='running';}).length:0;var c=document.getElementById('tasks-nav-count');var el=document.getElementById('tasks-nav-item');if(c){c.textContent=n>0?String(n):'';c.className='tasks-nav-count'+(n>0?' active':'');}if(el){el.style.color=n>0?'#f6c90e':'';}}).catch(function(){});}u();setInterval(u,5000);})();`)
|
||||
b.WriteString(`</script>`)
|
||||
b.WriteString(`</aside>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
368
audit/internal/webui/nvme_format.go
Normal file
368
audit/internal/webui/nvme_format.go
Normal file
@@ -0,0 +1,368 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type nvmeFormatMode struct {
|
||||
Mode int `json:"mode"`
|
||||
DataBytes int64 `json:"data_bytes"`
|
||||
MetadataBytes int64 `json:"metadata_bytes"`
|
||||
InUse bool `json:"in_use"`
|
||||
Label string `json:"label"`
|
||||
}
|
||||
|
||||
type nvmeFormatDisk struct {
|
||||
Device string `json:"device"`
|
||||
Model string `json:"model,omitempty"`
|
||||
Serial string `json:"serial,omitempty"`
|
||||
Size string `json:"size,omitempty"`
|
||||
CurrentMode int `json:"current_mode"`
|
||||
CurrentFormat string `json:"current_format"`
|
||||
Modes []nvmeFormatMode `json:"modes"`
|
||||
Error string `json:"error,omitempty"`
|
||||
}
|
||||
|
||||
type nvmeListJSON struct {
|
||||
Devices []struct {
|
||||
DevicePath string `json:"DevicePath"`
|
||||
ModelNumber string `json:"ModelNumber"`
|
||||
SerialNumber string `json:"SerialNumber"`
|
||||
PhysicalSize int64 `json:"PhysicalSize"`
|
||||
} `json:"Devices"`
|
||||
}
|
||||
|
||||
var (
|
||||
nvmeFormatDeviceRE = regexp.MustCompile(`^/dev/nvme[0-9]+n[0-9]+$`)
|
||||
nvmeLBAFCompactLineRE = regexp.MustCompile(`(?im)^\s*lbaf\s+(\d+)\s*:\s*ms:(\d+)\s+lbads:(\d+).*$`)
|
||||
nvmeLBAFVerboseLineRE = regexp.MustCompile(`(?im)^\s*LBA Format\s+(\d+)\s*:\s*Metadata Size:\s*(\d+)\s+bytes\s*-\s*Data Size:\s*(\d+)\s+bytes.*$`)
|
||||
nvmeCommandContext = exec.CommandContext
|
||||
nvmeListFormatsTimeout = 20 * time.Second
|
||||
)
|
||||
|
||||
func listNVMeFormatDisks(ctx context.Context) ([]nvmeFormatDisk, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, nvmeListFormatsTimeout)
|
||||
defer cancel()
|
||||
out, err := nvmeCommandContext(ctx, "nvme", "list", "-o", "json").Output()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var root nvmeListJSON
|
||||
if err := json.Unmarshal(out, &root); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
disks := make([]nvmeFormatDisk, 0, len(root.Devices))
|
||||
seen := map[string]struct{}{}
|
||||
for _, dev := range root.Devices {
|
||||
path := strings.TrimSpace(dev.DevicePath)
|
||||
if !nvmeFormatDeviceRE.MatchString(path) {
|
||||
continue
|
||||
}
|
||||
if _, ok := seen[path]; ok {
|
||||
continue
|
||||
}
|
||||
seen[path] = struct{}{}
|
||||
disk := nvmeFormatDisk{
|
||||
Device: path,
|
||||
Model: strings.TrimSpace(dev.ModelNumber),
|
||||
Serial: strings.TrimSpace(dev.SerialNumber),
|
||||
Size: formatNVMeBytes(dev.PhysicalSize),
|
||||
CurrentMode: -1,
|
||||
}
|
||||
modes, parseErr := readNVMeFormatModes(ctx, path)
|
||||
if parseErr != nil {
|
||||
disk.Error = parseErr.Error()
|
||||
}
|
||||
disk.Modes = modes
|
||||
for _, mode := range modes {
|
||||
if mode.InUse {
|
||||
disk.CurrentMode = mode.Mode
|
||||
disk.CurrentFormat = formatNVMeBlock(mode.DataBytes, mode.MetadataBytes)
|
||||
break
|
||||
}
|
||||
}
|
||||
disks = append(disks, disk)
|
||||
}
|
||||
sort.Slice(disks, func(i, j int) bool { return disks[i].Device < disks[j].Device })
|
||||
return disks, nil
|
||||
}
|
||||
|
||||
func readNVMeFormatModes(ctx context.Context, device string) ([]nvmeFormatMode, error) {
|
||||
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||
return nil, fmt.Errorf("invalid NVMe device")
|
||||
}
|
||||
out, err := nvmeCommandContext(ctx, "nvme", "id-ns", device, "-H").CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
return nil, fmt.Errorf("%s", msg)
|
||||
}
|
||||
modes := parseNVMeFormatModes(string(out))
|
||||
if len(modes) == 0 {
|
||||
return nil, fmt.Errorf("no LBA format modes found")
|
||||
}
|
||||
return modes, nil
|
||||
}
|
||||
|
||||
func parseNVMeFormatModes(raw string) []nvmeFormatMode {
|
||||
byMode := map[int]nvmeFormatMode{}
|
||||
for _, m := range nvmeLBAFCompactLineRE.FindAllStringSubmatch(raw, -1) {
|
||||
mode, errMode := strconv.Atoi(m[1])
|
||||
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||
lbads, errLBADS := strconv.Atoi(m[3])
|
||||
if errMode != nil || errMS != nil || errLBADS != nil || lbads < 0 || lbads >= 63 {
|
||||
continue
|
||||
}
|
||||
data := int64(1) << lbads
|
||||
line := m[0]
|
||||
byMode[mode] = nvmeFormatMode{
|
||||
Mode: mode,
|
||||
DataBytes: data,
|
||||
MetadataBytes: metadata,
|
||||
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||
}
|
||||
}
|
||||
for _, m := range nvmeLBAFVerboseLineRE.FindAllStringSubmatch(raw, -1) {
|
||||
mode, errMode := strconv.Atoi(m[1])
|
||||
metadata, errMS := strconv.ParseInt(m[2], 10, 64)
|
||||
data, errData := strconv.ParseInt(m[3], 10, 64)
|
||||
if errMode != nil || errMS != nil || errData != nil || data <= 0 {
|
||||
continue
|
||||
}
|
||||
line := m[0]
|
||||
byMode[mode] = nvmeFormatMode{
|
||||
Mode: mode,
|
||||
DataBytes: data,
|
||||
MetadataBytes: metadata,
|
||||
InUse: strings.Contains(strings.ToLower(line), "in use"),
|
||||
Label: fmt.Sprintf("MODE %d (%s)", mode, formatNVMeBlock(data, metadata)),
|
||||
}
|
||||
}
|
||||
modes := make([]nvmeFormatMode, 0, len(byMode))
|
||||
for _, mode := range byMode {
|
||||
modes = append(modes, mode)
|
||||
}
|
||||
sort.Slice(modes, func(i, j int) bool { return modes[i].Mode < modes[j].Mode })
|
||||
return modes
|
||||
}
|
||||
|
||||
func runNVMeFormatTask(ctx context.Context, j *jobState, device string, lbaf int) error {
|
||||
if !nvmeFormatDeviceRE.MatchString(device) {
|
||||
return fmt.Errorf("invalid NVMe device")
|
||||
}
|
||||
modes, err := readNVMeFormatModes(ctx, device)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var selected nvmeFormatMode
|
||||
found := false
|
||||
for _, mode := range modes {
|
||||
if mode.Mode == lbaf {
|
||||
selected = mode
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
return fmt.Errorf("MODE %d is not available on %s", lbaf, device)
|
||||
}
|
||||
ms := 0
|
||||
if selected.MetadataBytes > 0 {
|
||||
ms = 1
|
||||
}
|
||||
j.append(fmt.Sprintf("Formatting %s to %s with --lbaf=%d --ms=%d --force", device, formatNVMeBlock(selected.DataBytes, selected.MetadataBytes), selected.Mode, ms))
|
||||
cmd := nvmeCommandContext(ctx, "nvme", "format", device, fmt.Sprintf("--lbaf=%d", selected.Mode), fmt.Sprintf("--ms=%d", ms), "--force")
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINVMeFormats(w http.ResponseWriter, r *http.Request) {
|
||||
disks, err := listNVMeFormatDisks(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, disks)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPINVMeFormatRun(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Device string `json:"device"`
|
||||
LBAF int `json:"lbaf"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if !nvmeFormatDeviceRE.MatchString(req.Device) {
|
||||
writeError(w, http.StatusBadRequest, "invalid NVMe device")
|
||||
return
|
||||
}
|
||||
disks, err := listNVMeFormatDisks(r.Context())
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, err.Error())
|
||||
return
|
||||
}
|
||||
var label string
|
||||
allowed := false
|
||||
for _, disk := range disks {
|
||||
if disk.Device != req.Device {
|
||||
continue
|
||||
}
|
||||
for _, mode := range disk.Modes {
|
||||
if mode.Mode == req.LBAF {
|
||||
allowed = true
|
||||
label = mode.Label
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if !allowed {
|
||||
writeError(w, http.StatusBadRequest, "LBA format mode is not available for this device")
|
||||
return
|
||||
}
|
||||
name := fmt.Sprintf("NVMe Format %s to %s", filepath.Base(req.Device), label)
|
||||
t := &Task{
|
||||
ID: newJobID("nvme-format"),
|
||||
Name: name,
|
||||
Target: "nvme-format",
|
||||
Priority: defaultTaskPriority("nvme-format", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
Device: req.Device,
|
||||
LBAF: req.LBAF,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID, "job_id": t.ID})
|
||||
}
|
||||
|
||||
func formatNVMeBlock(dataBytes, metadataBytes int64) string {
|
||||
return strconv.FormatInt(dataBytes, 10) + "+" + strconv.FormatInt(metadataBytes, 10)
|
||||
}
|
||||
|
||||
func formatNVMeBytes(n int64) string {
|
||||
if n <= 0 {
|
||||
return ""
|
||||
}
|
||||
units := []string{"B", "KB", "MB", "GB", "TB", "PB"}
|
||||
v := float64(n)
|
||||
unit := 0
|
||||
for v >= 1000 && unit < len(units)-1 {
|
||||
v /= 1000
|
||||
unit++
|
||||
}
|
||||
if unit == 0 {
|
||||
return fmt.Sprintf("%d B", n)
|
||||
}
|
||||
return fmt.Sprintf("%.1f %s", v, units[unit])
|
||||
}
|
||||
|
||||
func renderNVMeFormatInline() string {
|
||||
return `<div id="nvme-format-status" style="font-size:13px;color:var(--muted);margin-bottom:12px">Loading NVMe disks...</div>
|
||||
<div id="nvme-format-table"><p style="color:var(--muted);font-size:13px">Loading...</p></div>
|
||||
<script>
|
||||
function nvmeFormatEsc(s) {
|
||||
return String(s == null ? '' : s).replace(/[&<>"']/g, function(c) {
|
||||
return {'&':'&','<':'<','>':'>','"':'"',"'":'''}[c];
|
||||
});
|
||||
}
|
||||
function loadNVMeFormats() {
|
||||
var status = document.getElementById('nvme-format-status');
|
||||
var table = document.getElementById('nvme-format-table');
|
||||
status.textContent = 'Loading NVMe disks...';
|
||||
status.style.color = 'var(--muted)';
|
||||
table.innerHTML = '<p style="color:var(--muted);font-size:13px">Loading...</p>';
|
||||
fetch('/api/tools/nvme-formats').then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(disks) {
|
||||
window._nvmeFormatDisks = Array.isArray(disks) ? disks : [];
|
||||
if (!window._nvmeFormatDisks.length) {
|
||||
status.textContent = 'No NVMe disks found.';
|
||||
table.innerHTML = '';
|
||||
return;
|
||||
}
|
||||
status.textContent = window._nvmeFormatDisks.length + ' NVMe disk(s) found.';
|
||||
var rows = window._nvmeFormatDisks.map(function(d, idx) {
|
||||
var current = d.current_format ? (d.current_format + ' / MODE ' + d.current_mode) : 'unknown';
|
||||
var detail = [d.model || '', d.serial || '', d.size || ''].filter(Boolean).join(' | ');
|
||||
var options = (d.modes || []).map(function(m) {
|
||||
return '<option value="' + m.mode + '"' + (m.in_use ? ' selected' : '') + '>' + nvmeFormatEsc(m.label) + '</option>';
|
||||
}).join('');
|
||||
var disabled = options ? '' : ' disabled';
|
||||
var err = d.error ? '<div style="font-size:12px;color:var(--crit-fg,#9f3a38);margin-top:4px">' + nvmeFormatEsc(d.error) + '</div>' : '';
|
||||
return '<tr>'
|
||||
+ '<td style="font-family:monospace;white-space:nowrap">' + nvmeFormatEsc(d.device) + (detail ? '<div style="font-family:inherit;font-size:12px;color:var(--muted)">' + nvmeFormatEsc(detail) + '</div>' : '') + '</td>'
|
||||
+ '<td style="white-space:nowrap">' + nvmeFormatEsc(current) + err + '</td>'
|
||||
+ '<td style="white-space:nowrap"><select id="nvme-format-select-' + idx + '"' + disabled + '>' + options + '</select></td>'
|
||||
+ '<td style="white-space:nowrap"><button class="btn btn-sm btn-primary" onclick="nvmeFormatRun(' + idx + ', this)"' + disabled + '>Apply</button><div class="nvme-format-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div></td>'
|
||||
+ '</tr>';
|
||||
}).join('');
|
||||
table.innerHTML = '<table><tr><th>Disk</th><th>Current block / mode</th><th>New mode</th><th>Action</th></tr>' + rows + '</table>';
|
||||
}).catch(function(e) {
|
||||
status.textContent = 'Error loading NVMe disks: ' + e.message;
|
||||
status.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
table.innerHTML = '';
|
||||
});
|
||||
}
|
||||
function nvmeWaitTaskDone(taskID, rowMsg) {
|
||||
var timer = setInterval(function() {
|
||||
fetch('/api/tasks').then(function(r) { return r.json(); }).then(function(tasks) {
|
||||
var task = (tasks || []).find(function(t) { return t.id === taskID; });
|
||||
if (!task) return;
|
||||
if (task.status === 'done' || task.status === 'failed' || task.status === 'cancelled') {
|
||||
clearInterval(timer);
|
||||
rowMsg.textContent = 'Task ' + taskID + ': ' + task.status + (task.error ? ' - ' + task.error : '');
|
||||
rowMsg.style.color = task.status === 'done' ? 'var(--ok,green)' : 'var(--crit-fg,#9f3a38)';
|
||||
loadNVMeFormats();
|
||||
}
|
||||
}).catch(function(){});
|
||||
}, 1500);
|
||||
}
|
||||
function nvmeFormatRun(idx, btn) {
|
||||
var disk = (window._nvmeFormatDisks || [])[idx];
|
||||
var select = document.getElementById('nvme-format-select-' + idx);
|
||||
var row = btn.closest('td');
|
||||
var rowMsg = row.querySelector('.nvme-format-row-msg');
|
||||
if (!disk || !select) return;
|
||||
var lbaf = parseInt(select.value, 10);
|
||||
var mode = (disk.modes || []).find(function(m) { return m.mode === lbaf; });
|
||||
if (!mode) return;
|
||||
if (!window.confirm('Format ' + disk.device + ' to ' + mode.label + '? This erases data on the namespace.')) return;
|
||||
btn.disabled = true;
|
||||
rowMsg.style.color = 'var(--muted)';
|
||||
rowMsg.textContent = 'Queued...';
|
||||
fetch('/api/tools/nvme-format/run', {
|
||||
method:'POST',
|
||||
headers:{'Content-Type':'application/json'},
|
||||
body:JSON.stringify({device: disk.device, lbaf: lbaf})
|
||||
}).then(function(r) { return r.json().then(function(d) { if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status)); return d; }); }).then(function(d) {
|
||||
rowMsg.textContent = 'Task ' + d.task_id + ' queued.';
|
||||
nvmeWaitTaskDone(d.task_id, rowMsg);
|
||||
}).catch(function(e) {
|
||||
rowMsg.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
rowMsg.textContent = 'Error: ' + e.message;
|
||||
}).finally(function() {
|
||||
btn.disabled = false;
|
||||
});
|
||||
}
|
||||
loadNVMeFormats();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderNVMeFormatCard() string {
|
||||
return `<div class="card"><div class="card-head">NVMe Block Format <button class="btn btn-sm btn-secondary" onclick="loadNVMeFormats()" style="margin-left:auto">↻ Refresh</button></div><div class="card-body">` +
|
||||
`<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Lists NVMe namespaces and changes their LBA format through a queued task.</p>` +
|
||||
renderNVMeFormatInline() + `</div></div>`
|
||||
}
|
||||
@@ -611,3 +611,7 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// renderSpeed and renderEndurance are legacy wrappers; canonical page is 5. Benchmark at /benchmark.
|
||||
func renderSpeed(opts HandlerOptions) string { return renderBenchmark(opts) }
|
||||
func renderEndurance(opts HandlerOptions) string { return renderBenchmark(opts) }
|
||||
|
||||
@@ -2,7 +2,7 @@ package webui
|
||||
|
||||
func renderBurn() string {
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn exposes sustained GPU compute load recipes. DCGM diagnostics (` + "targeted_stress, targeted_power, pulse_test" + `) and LINPACK remain in <a href="/validate">Validate → Stress mode</a>; NCCL and NVBandwidth are available directly from <a href="/validate">Validate</a>.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn runs sustained GPU compute and CPU/memory stress recipes. DCGM targeted diagnostics (<code>targeted_stress</code>, <code>targeted_power</code>, <code>pulse_test</code>) and NCCL/NVBandwidth are on the <a href="/load">3. Load</a> page. For performance benchmarks, see <a href="/benchmark">5. Benchmark</a>.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
|
||||
@@ -102,47 +102,69 @@ window.supportBundleDownload = function() {
|
||||
|
||||
func renderUSBExportCard() string {
|
||||
return `<div class="card" style="margin-top:16px">
|
||||
<div class="card-head">Export to USB
|
||||
<button class="btn btn-sm btn-secondary" onclick="usbRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||
<div class="card-head">USB Black-Box
|
||||
<button class="btn btn-sm btn-secondary" onclick="blackboxRefresh()" style="margin-left:auto">↻ Refresh</button>
|
||||
</div>
|
||||
<div class="card-body">` + renderUSBExportInline() + `</div>
|
||||
</div>`
|
||||
}
|
||||
|
||||
func renderUSBExportInline() string {
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Write audit JSON or support bundle directly to a removable USB drive.</p>
|
||||
return `<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Marks removable USB devices as black-box targets. The dedicated bee-blackbox service mirrors export files and system logs into a boot-scoped folder and resumes automatically after restart.</p>
|
||||
<div id="usb-status" style="font-size:13px;color:var(--muted)">Scanning for USB devices...</div>
|
||||
<div id="blackbox-summary" style="margin-top:8px;font-size:13px;color:var(--muted)">Loading black-box status...</div>
|
||||
<div id="usb-targets" style="margin-top:12px"></div>
|
||||
<div id="usb-msg" style="margin-top:10px;font-size:13px"></div>
|
||||
<script>
|
||||
(function(){
|
||||
function usbRefresh() {
|
||||
function blackboxRefresh() {
|
||||
document.getElementById('usb-status').textContent = 'Scanning...';
|
||||
document.getElementById('blackbox-summary').textContent = 'Loading black-box status...';
|
||||
document.getElementById('usb-targets').innerHTML = '';
|
||||
document.getElementById('usb-msg').textContent = '';
|
||||
fetch('/api/export/usb').then(r=>r.json()).then(targets => {
|
||||
window._usbTargets = Array.isArray(targets) ? targets : [];
|
||||
Promise.all([
|
||||
fetch('/api/export/usb').then(r=>r.json()),
|
||||
fetch('/api/blackbox/status').then(r=>r.json())
|
||||
]).then(function(values) {
|
||||
const targets = Array.isArray(values[0]) ? values[0] : [];
|
||||
const state = values[1] || {};
|
||||
const active = Array.isArray(state.targets) ? state.targets : [];
|
||||
window._usbTargets = targets;
|
||||
window._blackboxTargets = active;
|
||||
const st = document.getElementById('usb-status');
|
||||
const ct = document.getElementById('usb-targets');
|
||||
const summary = document.getElementById('blackbox-summary');
|
||||
if (state.boot_folder) {
|
||||
summary.textContent = 'Service state: ' + (state.status || 'unknown') + '. Boot folder: ' + state.boot_folder + '.';
|
||||
} else {
|
||||
summary.textContent = 'Service state: ' + (state.status || 'disabled') + '.';
|
||||
}
|
||||
if (!targets || targets.length === 0) {
|
||||
st.textContent = 'No removable USB devices found.';
|
||||
return;
|
||||
} else {
|
||||
st.textContent = targets.length + ' device(s) found:';
|
||||
}
|
||||
st.textContent = targets.length + ' device(s) found:';
|
||||
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Actions</th></tr>' +
|
||||
const byDevice = {};
|
||||
active.forEach(function(item) { byDevice[item.device] = item; });
|
||||
ct.innerHTML = '<table><tr><th>Device</th><th>FS</th><th>Size</th><th>Label</th><th>Model</th><th>Black-Box</th><th>Actions</th></tr>' +
|
||||
targets.map((t, idx) => {
|
||||
const dev = t.device || '';
|
||||
const label = t.label || '';
|
||||
const model = t.model || '';
|
||||
const state = byDevice[dev];
|
||||
const status = state ? (state.status + (state.flush_period ? ', flush ' + state.flush_period : '')) : 'not enrolled';
|
||||
const detail = state && state.last_error ? ('<div style="font-size:12px;color:var(--err,red)">'+state.last_error+'</div>') : '';
|
||||
return '<tr>' +
|
||||
'<td style="font-family:monospace">'+dev+'</td>' +
|
||||
'<td>'+t.fs_type+'</td>' +
|
||||
'<td>'+t.size+'</td>' +
|
||||
'<td>'+label+'</td>' +
|
||||
'<td style="font-size:12px;color:var(--muted)">'+model+'</td>' +
|
||||
'<td style="font-size:12px">'+status+detail+'</td>' +
|
||||
'<td style="white-space:nowrap">' +
|
||||
'<button class="btn btn-sm btn-primary" onclick="usbExport(\'audit\','+idx+',this)">Audit JSON</button> ' +
|
||||
'<button class="btn btn-sm btn-secondary" onclick="usbExport(\'bundle\','+idx+',this)">Support Bundle</button>' +
|
||||
(state
|
||||
? '<button class="btn btn-sm btn-secondary" onclick="blackboxDisable('+idx+',this)">Disable</button>'
|
||||
: '<button class="btn btn-sm btn-primary" onclick="blackboxEnable('+idx+',this)">Enable</button>') +
|
||||
'<div class="usb-row-msg" style="margin-top:6px;font-size:12px;color:var(--muted)"></div>' +
|
||||
'</td></tr>';
|
||||
}).join('') + '</table>';
|
||||
@@ -150,7 +172,7 @@ function usbRefresh() {
|
||||
document.getElementById('usb-status').textContent = 'Error: ' + e;
|
||||
});
|
||||
}
|
||||
window.usbExport = function(type, targetIndex, btn) {
|
||||
window.blackboxEnable = function(targetIndex, btn) {
|
||||
const target = (window._usbTargets || [])[targetIndex];
|
||||
if (!target) {
|
||||
const msg = document.getElementById('usb-msg');
|
||||
@@ -164,15 +186,15 @@ window.usbExport = function(type, targetIndex, btn) {
|
||||
const originalText = btn ? btn.textContent : '';
|
||||
if (btn) {
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Exporting...';
|
||||
btn.textContent = 'Enabling...';
|
||||
}
|
||||
if (rowMsg) {
|
||||
rowMsg.style.color = 'var(--muted)';
|
||||
rowMsg.textContent = 'Working...';
|
||||
}
|
||||
msg.style.color = 'var(--muted)';
|
||||
msg.textContent = 'Exporting ' + (type === 'bundle' ? 'support bundle' : 'audit JSON') + ' to ' + (target.device||'') + '...';
|
||||
fetch('/api/export/usb/'+type, {
|
||||
msg.textContent = 'Enabling black-box on ' + (target.device||'') + '...';
|
||||
fetch('/api/blackbox/enable', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type':'application/json'},
|
||||
body: JSON.stringify(target)
|
||||
@@ -199,10 +221,64 @@ window.usbExport = function(type, targetIndex, btn) {
|
||||
btn.disabled = false;
|
||||
btn.textContent = originalText;
|
||||
}
|
||||
setTimeout(blackboxRefresh, 300);
|
||||
});
|
||||
};
|
||||
window.usbRefresh = usbRefresh;
|
||||
usbRefresh();
|
||||
window.blackboxDisable = function(targetIndex, btn) {
|
||||
const target = (window._usbTargets || [])[targetIndex];
|
||||
const active = (window._blackboxTargets || []).find(function(item){ return item.device === (target && target.device); });
|
||||
if (!target || !active) {
|
||||
const msg = document.getElementById('usb-msg');
|
||||
msg.style.color = 'var(--err,red)';
|
||||
msg.textContent = 'Error: black-box target not found. Refresh and try again.';
|
||||
return;
|
||||
}
|
||||
const msg = document.getElementById('usb-msg');
|
||||
const row = btn ? btn.closest('td') : null;
|
||||
const rowMsg = row ? row.querySelector('.usb-row-msg') : null;
|
||||
const originalText = btn ? btn.textContent : '';
|
||||
if (btn) {
|
||||
btn.disabled = true;
|
||||
btn.textContent = 'Disabling...';
|
||||
}
|
||||
if (rowMsg) {
|
||||
rowMsg.style.color = 'var(--muted)';
|
||||
rowMsg.textContent = 'Working...';
|
||||
}
|
||||
msg.style.color = 'var(--muted)';
|
||||
msg.textContent = 'Disabling black-box on ' + (target.device||'') + '...';
|
||||
fetch('/api/blackbox/disable', {
|
||||
method:'POST',
|
||||
headers:{'Content-Type':'application/json'},
|
||||
body: JSON.stringify({device: target.device, enrollment_id: active.enrollment_id})
|
||||
}).then(async r => {
|
||||
const d = await r.json();
|
||||
if (!r.ok) throw new Error(d.error || ('HTTP ' + r.status));
|
||||
return d;
|
||||
}).then(d => {
|
||||
msg.style.color = 'var(--ok,green)';
|
||||
msg.textContent = d.message || 'Done.';
|
||||
if (rowMsg) {
|
||||
rowMsg.style.color = 'var(--ok,green)';
|
||||
rowMsg.textContent = d.message || 'Done.';
|
||||
}
|
||||
}).catch(e => {
|
||||
msg.style.color = 'var(--err,red)';
|
||||
msg.textContent = 'Error: '+e;
|
||||
if (rowMsg) {
|
||||
rowMsg.style.color = 'var(--err,red)';
|
||||
rowMsg.textContent = 'Error: ' + e;
|
||||
}
|
||||
}).finally(() => {
|
||||
if (btn) {
|
||||
btn.disabled = false;
|
||||
btn.textContent = originalText;
|
||||
}
|
||||
setTimeout(blackboxRefresh, 300);
|
||||
});
|
||||
};
|
||||
window.blackboxRefresh = blackboxRefresh;
|
||||
blackboxRefresh();
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
@@ -326,92 +402,226 @@ loadNvidiaSelfHeal();
|
||||
}
|
||||
|
||||
func renderTools() string {
|
||||
return `<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">System Install</div>
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||
renderInstallInline() + `
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let kind = d.kind || 'unknown';
|
||||
let label = source;
|
||||
if (kind === 'ram') label = 'RAM';
|
||||
else if (kind === 'usb') label = 'USB (' + source + ')';
|
||||
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
|
||||
else if (kind === 'disk') label = 'disk (' + source + ')';
|
||||
else label = source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
txt.textContent = d.message || 'Checking...';
|
||||
if (d.status === 'ok' || d.in_ram) {
|
||||
txt.style.color = 'var(--ok, green)';
|
||||
} else if (d.status === 'failed') {
|
||||
txt.style.color = 'var(--err, #b91c1c)';
|
||||
} else {
|
||||
txt.style.color = 'var(--muted)';
|
||||
}
|
||||
if (d.can_start_task) {
|
||||
btn.style.display = '';
|
||||
btn.disabled = false;
|
||||
} else {
|
||||
btn.style.display = 'none';
|
||||
}
|
||||
});
|
||||
function installToRAM() {
|
||||
document.getElementById('ram-install-btn').disabled = true;
|
||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||
window.location.href = '/tasks#' + d.task_id;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
return renderNVMeFormatCard() + `
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Export to USB</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
` + renderFRUEditorCard() + `
|
||||
|
||||
` + renderRAIDMgmtCard()
|
||||
}
|
||||
|
||||
func renderFRUEditorCard() string {
|
||||
return `<div class="card"><div class="card-head card-head-actions">FRU / Elabel<div class="card-head-buttons"><button class="btn btn-sm btn-secondary" onclick="fruAllRead()">Read All</button></div></div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits hardware identity fields from all available sources. Each field shows its source method.</p>
|
||||
<div id="fru-all-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
|
||||
<div id="fru-src-status" style="display:none;margin-bottom:10px"></div>
|
||||
<div id="fru-all-table"></div>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
|
||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||
renderNvidiaSelfHealInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
|
||||
<style>
|
||||
.fru-chip{display:inline-block;font-size:10px;font-weight:600;letter-spacing:.02em;padding:1px 6px;border-radius:3px;vertical-align:middle;white-space:nowrap;margin-right:8px;flex-shrink:0}
|
||||
.fru-chip-ipmi{background:#e8e8e8;color:#555}
|
||||
.fru-chip-huawei{background:#fff0e6;color:#b83}
|
||||
.fru-chip-saa{background:#e6f0ff;color:#557}
|
||||
.fru-inp-wrap{display:flex;align-items:center;gap:0}
|
||||
</style>
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||
const rows = tools.map(t =>
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
|
||||
).join('');
|
||||
document.getElementById('tools-table').innerHTML =
|
||||
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||
(function(){
|
||||
var _actBtn='width:22px;height:22px;padding:0;font-size:13px;line-height:1;border:1px solid var(--line);border-radius:3px;background:var(--surface);cursor:pointer;vertical-align:middle;';
|
||||
var _inp='width:100%;padding:3px 6px;border:1.5px solid #888;border-radius:3px;font-size:13px;font-family:monospace;background:var(--surface);color:var(--ink);';
|
||||
|
||||
var SOURCES = [
|
||||
{
|
||||
id: 'ipmi-fru',
|
||||
label: 'IPMI FRU',
|
||||
chipClass: 'fru-chip-ipmi',
|
||||
url: '/api/tools/ipmi-fru',
|
||||
writeUrl: '/api/tools/ipmi-fru/write',
|
||||
rowAttrs: function(f) {
|
||||
return 'data-source="ipmi-fru" data-area="'+esc(f.area||'')+'" data-index="'+(f.index||0)+'" data-name="'+esc(f.name)+'"';
|
||||
},
|
||||
writeBody: function(inp) {
|
||||
return JSON.stringify({changes:[{area:inp.dataset.area,index:parseInt(inp.dataset.index,10),name:inp.dataset.name,value:inp.value}]});
|
||||
},
|
||||
fieldName: function(f) { return f.name; },
|
||||
fieldValue: function(f) { return f.value||''; },
|
||||
readOnly: function(f) { return false; },
|
||||
},
|
||||
{
|
||||
id: 'huawei',
|
||||
label: 'Huawei iBMC',
|
||||
chipClass: 'fru-chip-huawei',
|
||||
url: '/api/tools/huawei-elabel',
|
||||
writeUrl: '/api/tools/huawei-elabel/write',
|
||||
rowAttrs: function(f) {
|
||||
return 'data-source="huawei" data-key="'+esc(f.key)+'"';
|
||||
},
|
||||
writeBody: function(inp) {
|
||||
return JSON.stringify({changes:[{key:inp.dataset.key,value:inp.value}]});
|
||||
},
|
||||
fieldName: function(f) { return f.name; },
|
||||
fieldValue: function(f) { return f.value||''; },
|
||||
readOnly: function(f) { return !!f.read_only; },
|
||||
},
|
||||
{
|
||||
id: 'saa-dmi',
|
||||
label: 'SAA DMI',
|
||||
chipClass: 'fru-chip-saa',
|
||||
url: '/api/tools/saa-dmi',
|
||||
writeUrl: '/api/tools/saa-dmi/write',
|
||||
rowAttrs: function(f) {
|
||||
return 'data-source="saa-dmi" data-shn="'+esc(f.shn)+'"';
|
||||
},
|
||||
writeBody: function(inp) {
|
||||
return JSON.stringify({changes:[{shn:inp.dataset.shn,value:inp.value}]});
|
||||
},
|
||||
fieldName: function(f) { return f.name; },
|
||||
fieldValue: function(f) { return f.value||''; },
|
||||
readOnly: function(f) { return false; },
|
||||
},
|
||||
];
|
||||
|
||||
function esc(s){return String(s==null?'':s).replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>').replace(/"/g,'"');}
|
||||
|
||||
function renderSrcStatus(perSource) {
|
||||
var bar = document.getElementById('fru-src-status');
|
||||
if (!perSource.length) { bar.style.display = 'none'; bar.innerHTML = ''; return; }
|
||||
var html = '';
|
||||
perSource.forEach(function(p) {
|
||||
var state, color;
|
||||
if (p.ok) {
|
||||
state = p.count + ' field(s) available';
|
||||
color = 'var(--ok-fg,green)';
|
||||
} else if (/not activated|product key|SFT-DCMS|SFT-OOB/i.test(p.reason)) {
|
||||
state = 'requires Supermicro license (SFT-OOB-LIC / SFT-DCMS-SINGLE) — activate on BMC';
|
||||
color = 'var(--crit-fg,#9f3a38)';
|
||||
} else {
|
||||
state = p.reason || 'unavailable';
|
||||
color = 'var(--muted)';
|
||||
}
|
||||
html += '<div style="display:flex;align-items:center;gap:8px;font-size:12px;margin:3px 0">'
|
||||
+ '<span class="fru-chip '+p.src.chipClass+'">'+p.src.label+'</span>'
|
||||
+ '<span style="color:'+color+'">'+esc(state)+'</span>'
|
||||
+ '</div>';
|
||||
});
|
||||
bar.innerHTML = html;
|
||||
bar.style.display = '';
|
||||
}
|
||||
checkTools();
|
||||
|
||||
window.fruAllRead = function() {
|
||||
var status = document.getElementById('fru-all-status');
|
||||
var table = document.getElementById('fru-all-table');
|
||||
status.textContent = 'Reading…'; status.style.color = 'var(--muted)';
|
||||
table.innerHTML = '';
|
||||
|
||||
var fetches = SOURCES.map(function(src) {
|
||||
return fetch(src.url, {cache:'no-store'})
|
||||
.then(function(r){ return r.json().then(function(d){ if(!r.ok) throw new Error(d.error||r.statusText); return d; }); });
|
||||
});
|
||||
|
||||
Promise.allSettled(fetches).then(function(results) {
|
||||
var rows = '';
|
||||
var totalFields = 0;
|
||||
var perSource = [];
|
||||
|
||||
results.forEach(function(res, i) {
|
||||
var src = SOURCES[i];
|
||||
if (res.status === 'rejected' || !Array.isArray(res.value) || res.value.length === 0) {
|
||||
var reason = '';
|
||||
if (res.status === 'rejected' && res.reason) reason = res.reason.message;
|
||||
else reason = 'no editable fields returned';
|
||||
perSource.push({src:src, ok:false, count:0, reason:reason});
|
||||
return;
|
||||
}
|
||||
perSource.push({src:src, ok:true, count:res.value.length, reason:''});
|
||||
res.value.forEach(function(f) {
|
||||
var val = esc(src.fieldValue(f));
|
||||
var ro = src.readOnly(f);
|
||||
var attrs = ro ? '' : (' '+src.rowAttrs(f));
|
||||
rows += '<tr>'
|
||||
+ '<td style="white-space:nowrap;padding-right:4px;vertical-align:middle">'
|
||||
+ '<span class="fru-chip '+src.chipClass+'">'+src.label+'</span>'
|
||||
+ '</td>'
|
||||
+ '<td style="color:var(--muted);white-space:nowrap;padding-right:16px;vertical-align:middle;font-size:13px">'+esc(src.fieldName(f))+'</td>'
|
||||
+ '<td style="vertical-align:middle">'
|
||||
+ (ro
|
||||
? '<span style="font-family:monospace;font-size:13px;color:var(--muted)">'+val+'</span>'
|
||||
: '<input class="fru-uni-inp" style="'+_inp+'" value="'+val+'" data-original="'+val+'"'+attrs+' oninput="fruUniChanged(this)">')
|
||||
+ '</td>'
|
||||
+ '<td class="fru-uni-act" style="display:none;white-space:nowrap;padding-left:6px;vertical-align:middle">'
|
||||
+ '<button style="'+_actBtn+'color:var(--ok-fg,green);margin-right:3px" title="Save" onclick="fruUniSave(this)">✓</button>'
|
||||
+ '<button style="'+_actBtn+'color:var(--crit-fg,#9f3a38)" title="Cancel" onclick="fruUniCancel(this)">✗</button>'
|
||||
+ '<span class="fru-uni-msg" style="font-size:11px;margin-left:5px;color:var(--muted)"></span>'
|
||||
+ '</td>'
|
||||
+ '</tr>';
|
||||
totalFields++;
|
||||
});
|
||||
});
|
||||
|
||||
renderSrcStatus(perSource);
|
||||
|
||||
if (totalFields === 0) {
|
||||
status.textContent = 'No editable fields available — see per-source status below.';
|
||||
status.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
table.innerHTML = '';
|
||||
return;
|
||||
}
|
||||
|
||||
table.innerHTML = '<table style="width:100%;border-collapse:collapse">'+rows+'</table>';
|
||||
status.textContent = totalFields + ' field(s) loaded';
|
||||
status.style.color = 'var(--muted)';
|
||||
});
|
||||
};
|
||||
|
||||
window.fruUniChanged = function(inp) {
|
||||
var row = inp.closest('tr');
|
||||
row.querySelector('.fru-uni-act').style.display = inp.value !== inp.dataset.original ? '' : 'none';
|
||||
row.querySelector('.fru-uni-msg').textContent = '';
|
||||
};
|
||||
|
||||
window.fruUniCancel = function(btn) {
|
||||
var row = btn.closest('tr');
|
||||
var inp = row.querySelector('.fru-uni-inp');
|
||||
inp.value = inp.dataset.original;
|
||||
row.querySelector('.fru-uni-act').style.display = 'none';
|
||||
row.querySelector('.fru-uni-msg').textContent = '';
|
||||
};
|
||||
|
||||
window.fruUniSave = function(btn) {
|
||||
var row = btn.closest('tr');
|
||||
var inp = row.querySelector('.fru-uni-inp');
|
||||
var msg = row.querySelector('.fru-uni-msg');
|
||||
var cancelBtn = row.querySelectorAll('.fru-uni-act button')[1];
|
||||
var src = SOURCES.find(function(s){ return s.id === inp.dataset.source; });
|
||||
if (!src) { msg.textContent = 'Unknown source'; msg.style.color='var(--crit-fg)'; return; }
|
||||
|
||||
btn.disabled = true; cancelBtn.disabled = true;
|
||||
msg.textContent = '…'; msg.style.color = 'var(--muted)';
|
||||
|
||||
fetch(src.writeUrl, {method:'POST', headers:{'Content-Type':'application/json'}, body:src.writeBody(inp)})
|
||||
.then(function(r){ return r.json().then(function(d){ if(!r.ok) throw new Error(d.error||r.statusText); return d; }); })
|
||||
.then(function(d) {
|
||||
var poll = setInterval(function() {
|
||||
fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(tasks){
|
||||
var t = Array.isArray(tasks) ? tasks.find(function(x){return x.id===d.task_id;}) : null;
|
||||
if (!t) return;
|
||||
if (t.status==='done') {
|
||||
clearInterval(poll);
|
||||
inp.dataset.original = inp.value;
|
||||
row.querySelector('.fru-uni-act').style.display = 'none';
|
||||
msg.textContent = ''; msg.style.color = '';
|
||||
} else if (t.status==='failed'||t.status==='cancelled') {
|
||||
clearInterval(poll);
|
||||
msg.textContent = t.error||t.status; msg.style.color = 'var(--crit-fg)';
|
||||
btn.disabled = false; cancelBtn.disabled = false;
|
||||
}
|
||||
});
|
||||
}, 1500);
|
||||
})
|
||||
.catch(function(e) {
|
||||
msg.textContent = 'Error: '+e.message; msg.style.color = 'var(--crit-fg)';
|
||||
btn.disabled = false; cancelBtn.disabled = false;
|
||||
});
|
||||
};
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
|
||||
|
||||
@@ -207,7 +207,7 @@ func renderInstall() string {
|
||||
func renderTasks() string {
|
||||
return `<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px;flex-wrap:wrap">
|
||||
<button class="btn btn-danger btn-sm" onclick="cancelAll()">Cancel All</button>
|
||||
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Send SIGKILL to all running test processes (bee-gpu-burn, stress-ng, stressapptest, memtester)">Kill Workers</button>
|
||||
<button class="btn btn-sm" style="background:#b45309;color:#fff" onclick="killWorkers()" title="Abort running tasks and kill orphaned test processes (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)">Abort Tasks And Kill Orphans</button>
|
||||
<span id="kill-toast" style="font-size:12px;color:var(--muted);display:none"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">Open a task to view its saved logs and charts.</span>
|
||||
</div>
|
||||
@@ -289,7 +289,7 @@ function cancelAll() {
|
||||
fetch('/api/tasks/cancel-all',{method:'POST'}).then(()=>loadTasks());
|
||||
}
|
||||
function killWorkers() {
|
||||
if (!confirm('Send SIGKILL to all running test workers (bee-gpu-burn, stress-ng, stressapptest, memtester)?\n\nThis will also cancel all queued and running tasks.')) return;
|
||||
if (!confirm('Abort all queued/running tasks and kill orphaned test workers (bee-gpu-burn, dcgmi, nvvs, nvbandwidth, stress-ng, stressapptest, memtester)?\n\nRunning bee-worker processes will first be asked to stop gracefully; orphaned test processes will then be killed.')) return;
|
||||
fetch('/api/tasks/kill-workers',{method:'POST'})
|
||||
.then(r=>r.json())
|
||||
.then(d=>{
|
||||
|
||||
115
audit/internal/webui/page_settings.go
Normal file
115
audit/internal/webui/page_settings.go
Normal file
@@ -0,0 +1,115 @@
|
||||
package webui
|
||||
|
||||
import "html"
|
||||
|
||||
func renderSettings(opts HandlerOptions) string {
|
||||
version := opts.BuildLabel
|
||||
if version == "" {
|
||||
version = "dev"
|
||||
}
|
||||
return `<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">System Install</div>
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||
renderInstallInline() + `
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let kind = d.kind || 'unknown';
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let label = kind==='ram'?'RAM':kind==='usb'?'USB ('+source+')':kind==='cdrom'?'CD-ROM ('+source+')':kind==='disk'?'disk ('+source+')':source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
txt.textContent = d.blocked_reason || d.message || 'Checking...';
|
||||
txt.style.color = (d.status==='ok'||d.in_ram)?'var(--ok,green)':d.status==='failed'?'var(--err,#b91c1c)':'var(--muted)';
|
||||
if (d.can_start_task) { btn.style.display=''; btn.disabled=false; } else { btn.style.display='none'; }
|
||||
});
|
||||
function installToRAM() {
|
||||
document.getElementById('ram-install-btn').disabled = true;
|
||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||
window.location.href = '/tasks#' + d.task_id;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||
const rows = tools.map(t =>
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK?'badge-ok':'badge-err')+'">'+(t.OK?'✓ '+t.Path:'✗ missing')+'</span></td></tr>'
|
||||
).join('');
|
||||
document.getElementById('tools-table').innerHTML = '<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||
});
|
||||
}
|
||||
checkTools();
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||
renderNvidiaSelfHealInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Build Info</div>
|
||||
<div class="card-body">
|
||||
<table style="width:auto">
|
||||
<tbody>
|
||||
<tr><td style="color:var(--muted);padding-right:24px">Version</td><td>` + html.EscapeString(version) + `</td></tr>
|
||||
<tr><td style="color:var(--muted);padding-right:24px">Title</td><td>` + html.EscapeString(opts.Title) + `</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Power</div>
|
||||
<div class="card-body">
|
||||
<div style="display:flex;gap:8px;align-items:center">
|
||||
<button class="btn btn-secondary btn-sm" onclick="systemPower('reboot')">Reboot</button>
|
||||
<button class="btn btn-secondary btn-sm" onclick="systemPower('shutdown')">Shutdown</button>
|
||||
<span id="power-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
function systemPower(action) {
|
||||
var label = action === 'reboot' ? 'reboot' : 'shut down';
|
||||
if (!confirm('Are you sure you want to ' + label + ' the server?')) return;
|
||||
var el = document.getElementById('power-status');
|
||||
if (el) el.textContent = action === 'reboot' ? 'Rebooting...' : 'Shutting down...';
|
||||
fetch('/api/system/' + action, {method: 'POST'})
|
||||
.then(function(r) { return r.json(); })
|
||||
.catch(function(e) { if (el) el.textContent = 'Error: ' + e.message; });
|
||||
}
|
||||
</script>
|
||||
|
||||
`
|
||||
}
|
||||
@@ -11,6 +11,13 @@ import (
|
||||
"bee/audit/internal/schema"
|
||||
)
|
||||
|
||||
// PCI vendor IDs used for GPU classification (source: pci-ids.ucw.cz).
|
||||
const (
|
||||
pciVendorNvidia = 0x10de
|
||||
pciVendorAMD = 0x1002
|
||||
pciVendorAspeed = 0x1a03
|
||||
)
|
||||
|
||||
type validateInventory struct {
|
||||
CPU string
|
||||
Memory string
|
||||
@@ -61,6 +68,14 @@ func validateTotalStressSec(n int) int {
|
||||
}
|
||||
|
||||
func renderValidate(opts HandlerOptions) string {
|
||||
return renderValidateMode(opts, false)
|
||||
}
|
||||
|
||||
func renderValidateStress(opts HandlerOptions) string {
|
||||
return renderValidateMode(opts, true)
|
||||
}
|
||||
|
||||
func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
@@ -69,26 +84,49 @@ func renderValidate(opts HandlerOptions) string {
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
estStr := validateTotalStr
|
||||
if stressDefault {
|
||||
estStr = stressTotalStr
|
||||
}
|
||||
alert := `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>`
|
||||
if stressDefault {
|
||||
alert = `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Stress mode:</strong> Runs extended load tests — CPU stress-ng, memory passes, DCGM targeted diagnostics. Higher wear than Validate.</div>`
|
||||
}
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Validate Profile</div>
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
<div style="margin-top:12px">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
stressOnlyCards := ""
|
||||
if stressDefault {
|
||||
stressOnlyCards = renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`,
|
||||
))
|
||||
}
|
||||
|
||||
satStressModeJS := "function satStressMode() { return false; }"
|
||||
if stressDefault {
|
||||
satStressModeJS = "function satStressMode() { return true; }"
|
||||
}
|
||||
|
||||
return alert + `
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Run All</button>
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">est. ` + estStr + gpuNote + `</span>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
@@ -115,7 +153,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Run All.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||
@@ -136,46 +174,19 @@ func renderValidate(opts HandlerOptions) string {
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||
)) +
|
||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-targeted-power">` +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-pulse">` +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-interconnect">` +
|
||||
stressOnlyCards +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-bandwidth">` +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
@@ -190,36 +201,15 @@ func renderValidate(opts HandlerOptions) string {
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satStressMode() {
|
||||
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||
}
|
||||
function satModeChanged() {
|
||||
const stress = satStressMode();
|
||||
[
|
||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
card.style.opacity = stress ? '1' : '0.5';
|
||||
const hint = document.getElementById(item.hint);
|
||||
if (hint) hint.style.display = stress ? 'none' : '';
|
||||
}
|
||||
});
|
||||
}
|
||||
` + satStressModeJS + `
|
||||
function satLabels() {
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
@@ -634,25 +624,307 @@ func validateFirstNonEmpty(values ...string) string {
|
||||
}
|
||||
|
||||
func validateIsVendorGPU(dev schema.HardwarePCIeDevice, vendor string) bool {
|
||||
model := strings.ToLower(validateTrimPtr(dev.Model))
|
||||
manufacturer := strings.ToLower(validateTrimPtr(dev.Manufacturer))
|
||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||
if strings.Contains(model, "aspeed") || strings.Contains(manufacturer, "aspeed") {
|
||||
if dev.VendorID != nil && *dev.VendorID == pciVendorAspeed {
|
||||
return false
|
||||
}
|
||||
class := strings.ToLower(validateTrimPtr(dev.DeviceClass))
|
||||
isGPUClass := class == "videocontroller" || class == "processingaccelerator" || class == "displaycontroller"
|
||||
switch vendor {
|
||||
case "nvidia":
|
||||
return strings.Contains(model, "nvidia") || strings.Contains(manufacturer, "nvidia")
|
||||
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorNvidia
|
||||
case "amd":
|
||||
isGPUClass := class == "processingaccelerator" || class == "displaycontroller" || class == "videocontroller"
|
||||
isAMDVendor := strings.Contains(manufacturer, "advanced micro devices") || strings.Contains(manufacturer, "amd") || strings.Contains(manufacturer, "ati")
|
||||
isAMDModel := strings.Contains(model, "instinct") || strings.Contains(model, "radeon") || strings.Contains(model, "amd")
|
||||
return isGPUClass && (isAMDVendor || isAMDModel)
|
||||
return isGPUClass && dev.VendorID != nil && *dev.VendorID == pciVendorAMD
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// renderCheck renders the non-destructive Check page (step 2).
|
||||
// Shows validate-mode tests only: CPU, Memory, Storage, NVIDIA L2, NCCL, NVBandwidth, AMD.
|
||||
// Stress-mode tests (targeted-stress, targeted-power, pulse) are on the Load page.
|
||||
func renderCheck(opts HandlerOptions) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
gpuNote := ""
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Check tests collect diagnostics only — no writes to disks, no sustained load, no hardware wear counters incremented. For stress testing, go to <a href="/burn">4. Burn</a>.</div>
|
||||
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||
<button type="button" class="btn btn-primary" onclick="runAllCheckSAT()">Run All Checks</button>
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">est. ` + validateTotalStr + gpuNote + `</span>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
inv.CPU,
|
||||
`Collects CPU inventory and temperatures, then runs a bounded CPU stress pass.`,
|
||||
`<code>lscpu</code>, <code>sensors</code>, <code>stress-ng</code>`,
|
||||
validateFmtDur(platform.SATEstimatedCPUValidateSec)+` (stress-ng 60 s).`,
|
||||
)) +
|
||||
renderSATCard("memory", "Memory", "runSAT('memory')", "", renderValidateCardBody(
|
||||
inv.Memory,
|
||||
`Runs a RAM validation pass and records memory state around the test.`,
|
||||
`<code>free</code>, <code>memtester</code>`,
|
||||
validateFmtDur(platform.SATEstimatedMemoryValidateSec)+` (256 MB × 1 pass).`,
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds (NVMe: instant device query; SATA/SAS: short self-test).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||
</div>
|
||||
<div id="sat-gpu-list" style="border:1px solid var(--border);border-radius:4px;padding:12px;min-height:88px">
|
||||
<p style="color:var(--muted);font-size:13px">Loading NVIDIA GPUs...</p>
|
||||
</div>
|
||||
<p id="sat-gpu-selection-note" style="font-size:12px;color:var(--muted);margin:10px 0 0">Select at least one NVIDIA GPU to enable NVIDIA check tasks.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("nvidia", "NVIDIA GPU", "runNvidiaValidateSet('nvidia')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs NVIDIA diagnostics and board inventory checks (DCGM Level 2).`,
|
||||
`<code>nvidia-smi</code>, <code>dmidecode</code>, <code>dcgmi diag</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec)+` (Level 2, all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
inv.AMD,
|
||||
`Runs AMD GPU inventory, MEM integrity, and MEM bandwidth checks.`,
|
||||
`GPU Validate: <code>rocm-smi</code>, <code>dmidecode</code>; MEM Integrity: <code>rvs mem</code>; MEM Bandwidth: <code>rocm-bandwidth-test</code>, <code>rvs babel</code>`,
|
||||
`<div style="display:flex;flex-direction:column;gap:4px"><label class="cb-row"><input type="checkbox" id="sat-amd-target" checked><span>GPU Validate</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-mem-target" checked><span>MEM Integrity</span></label><label class="cb-row"><input type="checkbox" id="sat-amd-bandwidth-target" checked><span>MEM Bandwidth</span></label></div>`,
|
||||
)) +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
.cb-row { display:flex; align-items:flex-start; gap:8px; padding:4px 0; cursor:pointer; font-size:13px; }
|
||||
.cb-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satLabels() {
|
||||
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
let satNvidiaGPUsPromise = null;
|
||||
function loadSatNvidiaGPUs() {
|
||||
if (!satNvidiaGPUsPromise) {
|
||||
satNvidiaGPUsPromise = fetch('/api/gpu/nvidia').then(r => {
|
||||
if (!r.ok) throw new Error('Failed to load NVIDIA GPUs.');
|
||||
return r.json();
|
||||
}).then(list => Array.isArray(list) ? list : []);
|
||||
}
|
||||
return satNvidiaGPUsPromise;
|
||||
}
|
||||
function satSelectedGPUIndices() {
|
||||
return Array.from(document.querySelectorAll('.sat-nvidia-checkbox'))
|
||||
.filter(el => el.checked && !el.disabled)
|
||||
.map(el => parseInt(el.value, 10))
|
||||
.filter(v => !Number.isNaN(v))
|
||||
.sort((a, b) => a - b);
|
||||
}
|
||||
function satUpdateGPUSelectionNote() {
|
||||
const note = document.getElementById('sat-gpu-selection-note');
|
||||
if (!note) return;
|
||||
const sel = satSelectedGPUIndices();
|
||||
note.textContent = sel.length
|
||||
? 'Selected GPUs: ' + sel.join(', ') + '. Multi-GPU tests will use all selected GPUs.'
|
||||
: 'Select at least one NVIDIA GPU to enable NVIDIA check tasks.';
|
||||
}
|
||||
function satRenderGPUList(gpus) {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (!root) return;
|
||||
if (!gpus || !gpus.length) {
|
||||
root.innerHTML = '<p style="color:var(--muted);font-size:13px">No NVIDIA GPUs detected.</p>';
|
||||
satUpdateGPUSelectionNote(); return;
|
||||
}
|
||||
root.innerHTML = gpus.map(gpu => {
|
||||
const mem = gpu.memory_mb > 0 ? ' · ' + gpu.memory_mb + ' MiB' : '';
|
||||
return '<label class="sat-gpu-row"><input class="sat-nvidia-checkbox" type="checkbox" value="' + gpu.index + '" checked onchange="satUpdateGPUSelectionNote()"><span><strong>GPU ' + gpu.index + '</strong> — ' + gpu.name + mem + '</span></label>';
|
||||
}).join('');
|
||||
satUpdateGPUSelectionNote();
|
||||
}
|
||||
function satSelectAllGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = true; }); satUpdateGPUSelectionNote(); }
|
||||
function satSelectNoGPUs() { document.querySelectorAll('.sat-nvidia-checkbox').forEach(el => { el.checked = false; }); satUpdateGPUSelectionNote(); }
|
||||
function satGPULoadInit() {
|
||||
loadSatNvidiaGPUs().then(satRenderGPUList).catch(err => {
|
||||
const root = document.getElementById('sat-gpu-list');
|
||||
if (root) root.innerHTML = '<p style="color:var(--crit-fg);font-size:13px">Error: ' + err.message + '</p>';
|
||||
satUpdateGPUSelectionNote();
|
||||
});
|
||||
}
|
||||
function satRequestBody(target, overrides) {
|
||||
const body = {};
|
||||
const labels = satLabels();
|
||||
body.display_name = labels[target] || ('Check ' + target);
|
||||
body.stress_mode = false;
|
||||
if (target === 'cpu') body.duration = 60;
|
||||
if (overrides) Object.keys(overrides).forEach(k => { body[k] = overrides[k]; });
|
||||
return body;
|
||||
}
|
||||
function enqueueSATTarget(target, overrides) {
|
||||
return fetch('/api/sat/' + target + '/run', {method:'POST', headers:{'Content-Type':'application/json'}, body:JSON.stringify(satRequestBody(target, overrides))}).then(r => r.json());
|
||||
}
|
||||
function streamSATTask(taskId, title, resetTerminal) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
document.getElementById('sat-output').style.display = 'block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
if (resetTerminal) term.textContent = '';
|
||||
term.textContent += 'Task ' + taskId + ' queued. Streaming log...\n';
|
||||
return new Promise(resolve => {
|
||||
satES = new EventSource('/api/tasks/' + taskId + '/stream');
|
||||
satES.onmessage = e => { term.textContent += e.data + '\n'; term.scrollTop = term.scrollHeight; };
|
||||
satES.addEventListener('done', e => {
|
||||
satES.close(); satES = null;
|
||||
term.textContent += (e.data ? '\nERROR: ' + e.data : '\nCompleted.') + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: !e.data, error: e.data || ''});
|
||||
});
|
||||
satES.onerror = () => {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
term.textContent += '\nERROR: stream disconnected.\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
resolve({ok: false, error: 'stream disconnected'});
|
||||
};
|
||||
});
|
||||
}
|
||||
function selectedAMDValidateTargets() {
|
||||
const targets = [];
|
||||
const gpu = document.getElementById('sat-amd-target');
|
||||
const mem = document.getElementById('sat-amd-mem-target');
|
||||
const bw = document.getElementById('sat-amd-bandwidth-target');
|
||||
if (gpu && gpu.checked && !gpu.disabled) targets.push('amd');
|
||||
if (mem && mem.checked && !mem.disabled) targets.push('amd-mem');
|
||||
if (bw && bw.checked && !bw.disabled) targets.push('amd-bandwidth');
|
||||
return targets;
|
||||
}
|
||||
function runSAT(target) { return runSATWithOverrides(target, null); }
|
||||
function runSATWithOverrides(target, overrides) {
|
||||
const title = (overrides && overrides.display_name) || target;
|
||||
document.getElementById('sat-output').style.display = 'block';
|
||||
document.getElementById('sat-title').textContent = '— ' + title;
|
||||
const term = document.getElementById('sat-terminal');
|
||||
term.textContent = 'Enqueuing ' + title + ' test...\n';
|
||||
return enqueueSATTarget(target, overrides).then(d => streamSATTask(d.task_id, title, false));
|
||||
}
|
||||
function runNvidiaFabricValidate(target) {
|
||||
const indices = satSelectedGPUIndices();
|
||||
if (!indices.length) { alert('No NVIDIA GPUs available.'); return; }
|
||||
runSATWithOverrides(target, {gpu_indices: indices, display_name: satLabels()[target] || target});
|
||||
}
|
||||
function runNvidiaValidateSet(target) {
|
||||
const sel = satSelectedGPUIndices();
|
||||
if (!sel.length) { alert('Select at least one NVIDIA GPU.'); return; }
|
||||
return runSATWithOverrides(target, {gpu_indices: sel, display_name: satLabels()[target] || target});
|
||||
}
|
||||
function runAMDValidateSet() {
|
||||
const targets = selectedAMDValidateTargets();
|
||||
if (!targets.length) return;
|
||||
if (targets.length === 1) return runSAT(targets[0]);
|
||||
const term = document.getElementById('sat-terminal');
|
||||
document.getElementById('sat-output').style.display = 'block';
|
||||
document.getElementById('sat-title').textContent = '— amd';
|
||||
term.textContent = 'Running AMD check set...\n';
|
||||
const labels = satLabels();
|
||||
const runNext = idx => {
|
||||
if (idx >= targets.length) return Promise.resolve();
|
||||
const t = targets[idx];
|
||||
term.textContent += '\n[' + (idx + 1) + '/' + targets.length + '] ' + labels[t] + '\n';
|
||||
return enqueueSATTarget(t).then(d => streamSATTask(d.task_id, labels[t], false)).then(() => runNext(idx + 1));
|
||||
};
|
||||
return runNext(0);
|
||||
}
|
||||
function runAllCheckSAT() {
|
||||
const status = document.getElementById('sat-all-status');
|
||||
status.textContent = 'Enqueuing...';
|
||||
const nvidiaIndices = satSelectedGPUIndices();
|
||||
const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
const baseTargets = ['cpu', 'memory', 'storage'];
|
||||
const amdTargets = selectedAMDValidateTargets();
|
||||
const expanded = [];
|
||||
baseTargets.forEach(t => expanded.push({target: t}));
|
||||
if (nvidiaIndices.length) {
|
||||
nvidiaAllTargets.forEach(t => {
|
||||
const btn = document.getElementById('sat-btn-' + t);
|
||||
if (!(btn && btn.disabled)) expanded.push({target: t, overrides: {gpu_indices: nvidiaIndices, display_name: satLabels()[t] || t}});
|
||||
});
|
||||
}
|
||||
amdTargets.forEach(t => expanded.push({target: t}));
|
||||
if (!expanded.length) { status.textContent = 'No tasks selected.'; return; }
|
||||
const total = expanded.length;
|
||||
const runNext = idx => {
|
||||
if (idx >= expanded.length) { status.textContent = 'Completed ' + total + ' task(s).'; return Promise.resolve(); }
|
||||
const item = expanded[idx];
|
||||
status.textContent = 'Running ' + (idx + 1) + '/' + total + '...';
|
||||
return enqueueSATTarget(item.target, item.overrides).then(() => runNext(idx + 1));
|
||||
};
|
||||
runNext(0).catch(err => { status.textContent = 'Error: ' + err.message; });
|
||||
}
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
if (!btn) return;
|
||||
btn.disabled = true; btn.title = reason; btn.style.opacity = '0.4';
|
||||
const card = btn.closest('.card');
|
||||
if (card) {
|
||||
let note = card.querySelector('.sat-unavail');
|
||||
if (!note) {
|
||||
note = document.createElement('p');
|
||||
note.className = 'sat-unavail';
|
||||
note.style.cssText = 'color:var(--muted);font-size:12px;margin:0 0 8px';
|
||||
const body = card.querySelector('.card-body');
|
||||
if (body) body.insertBefore(note, body.firstChild);
|
||||
}
|
||||
note.textContent = reason;
|
||||
}
|
||||
}
|
||||
fetch('/api/gpu/presence').then(r => r.json()).then(gp => {
|
||||
if (!gp.nvidia) ['nvidia','nvidia-interconnect','nvidia-bandwidth'].forEach(t => disableSATCard(t, 'No NVIDIA GPU detected'));
|
||||
if (!gp.amd) {
|
||||
disableSATCard('amd', 'No AMD GPU detected');
|
||||
['sat-amd-target','sat-amd-mem-target','sat-amd-bandwidth-target'].forEach(id => {
|
||||
const cb = document.getElementById(id);
|
||||
if (cb) { cb.disabled = true; cb.checked = false; }
|
||||
});
|
||||
}
|
||||
});
|
||||
satGPULoadInit();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderSATCard(id, label, runAction, headerActions, body string) string {
|
||||
actions := `<button id="sat-btn-` + id + `" class="btn btn-primary btn-sm" onclick="` + runAction + `">Run</button>`
|
||||
if strings.TrimSpace(headerActions) != "" {
|
||||
|
||||
@@ -5,7 +5,9 @@ import (
|
||||
"fmt"
|
||||
"html"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
@@ -22,41 +24,54 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
body = renderDashboard(opts)
|
||||
case "audit":
|
||||
pageID = "audit"
|
||||
title = "Audit"
|
||||
title = "1. Audit"
|
||||
body = renderAudit()
|
||||
case "validate":
|
||||
pageID = "validate"
|
||||
title = "Validate"
|
||||
body = renderValidate(opts)
|
||||
case "check":
|
||||
pageID = "check"
|
||||
title = "2. Check"
|
||||
body = renderCheck(opts)
|
||||
case "load":
|
||||
pageID = "load"
|
||||
title = "3. Load"
|
||||
body = renderValidateStress(opts)
|
||||
case "burn":
|
||||
pageID = "burn"
|
||||
title = "Burn"
|
||||
title = "4. Burn"
|
||||
body = renderBurn()
|
||||
case "benchmark":
|
||||
pageID = "benchmark"
|
||||
title = "Benchmark"
|
||||
title = "5. Benchmark"
|
||||
body = renderBenchmark(opts)
|
||||
case "tools":
|
||||
pageID = "tools"
|
||||
title = "Tools"
|
||||
body = renderTools()
|
||||
case "settings":
|
||||
pageID = "settings"
|
||||
title = "Settings"
|
||||
body = renderSettings(opts)
|
||||
// Legacy routes (redirected at HTTP level in handlePage; these are fallbacks)
|
||||
case "validate", "tests":
|
||||
pageID = "load"
|
||||
title = "3. Load"
|
||||
body = renderValidate(opts)
|
||||
case "burn-in":
|
||||
pageID = "burn"
|
||||
title = "4. Burn"
|
||||
body = renderBurn()
|
||||
case "speed", "endurance":
|
||||
pageID = "benchmark"
|
||||
title = "5. Benchmark"
|
||||
body = renderBenchmark(opts)
|
||||
case "tasks":
|
||||
pageID = "tasks"
|
||||
title = "Tasks"
|
||||
body = renderTasks()
|
||||
case "tools":
|
||||
pageID = "tools"
|
||||
title = "Tools"
|
||||
body = renderTools()
|
||||
// Legacy routes kept accessible but not in nav
|
||||
// Hidden pages (not in nav, accessible by direct URL)
|
||||
case "metrics":
|
||||
pageID = "metrics"
|
||||
title = "Live Metrics"
|
||||
body = renderMetrics()
|
||||
case "tests":
|
||||
pageID = "validate"
|
||||
title = "Acceptance Tests"
|
||||
body = renderValidate(opts)
|
||||
case "burn-in":
|
||||
pageID = "burn"
|
||||
title = "Burn-in Tests"
|
||||
body = renderBurn()
|
||||
case "network":
|
||||
pageID = "network"
|
||||
title = "Network"
|
||||
@@ -85,6 +100,7 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
body +
|
||||
`</div></div>` +
|
||||
renderAuditModal() +
|
||||
`<dialog id="component-detail-dialog" style="min-width:600px;max-width:900px;width:90vw;padding:0;border:1px solid var(--border);border-radius:8px;background:var(--surface)"><div id="component-detail-body" style="padding-bottom:20px"></div></dialog>` +
|
||||
`<script>
|
||||
// Add copy button to every .terminal on the page
|
||||
document.querySelectorAll('.terminal').forEach(function(t){
|
||||
@@ -94,6 +110,17 @@ document.querySelectorAll('.terminal').forEach(function(t){
|
||||
btn.onclick=function(){navigator.clipboard.writeText(t.textContent).then(function(){btn.textContent='Copied!';setTimeout(function(){btn.textContent='Copy';},1500);});};
|
||||
w.appendChild(btn);
|
||||
});
|
||||
function openComponentDetail(type) {
|
||||
var dlg = document.getElementById('component-detail-dialog');
|
||||
var body = document.getElementById('component-detail-body');
|
||||
body.innerHTML = '<div style="padding:20px;color:var(--muted)">Loading…</div>';
|
||||
dlg.showModal();
|
||||
fetch('/api/components/' + type).then(function(r){ return r.text(); }).then(function(html){
|
||||
body.innerHTML = html;
|
||||
}).catch(function(){
|
||||
body.innerHTML = '<div style="padding:20px;color:var(--crit-fg)">Error loading details.</div>';
|
||||
});
|
||||
}
|
||||
</script>` +
|
||||
`</body></html>`
|
||||
}
|
||||
@@ -106,6 +133,14 @@ func renderDashboard(opts HandlerOptions) string {
|
||||
b.WriteString(renderHardwareSummaryCard(opts))
|
||||
b.WriteString(renderHealthCard(opts))
|
||||
b.WriteString(renderMetrics())
|
||||
b.WriteString(`<script>
|
||||
setInterval(function(){
|
||||
fetch('/api/hardware-summary').then(function(r){return r.text();}).then(function(html){
|
||||
var el=document.getElementById('hw-summary-card');
|
||||
if(el){el.outerHTML=html;}
|
||||
}).catch(function(){});
|
||||
},30000);
|
||||
</script>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
@@ -184,13 +219,14 @@ func renderAudit() string {
|
||||
}
|
||||
|
||||
func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
const cardID = ` id="hw-summary-card"`
|
||||
data, err := loadSnapshot(opts.AuditPath)
|
||||
if err != nil {
|
||||
return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
||||
return `<div class="card"` + cardID + `><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
||||
}
|
||||
var ingest schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(data, &ingest); err != nil {
|
||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||
return `<div class="card"` + cardID + `><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||
}
|
||||
hw := ingest.Hardware
|
||||
|
||||
@@ -200,7 +236,7 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
}
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
||||
b.WriteString(`<div class="card"` + cardID + `><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
||||
|
||||
// Server identity block above the component table.
|
||||
{
|
||||
@@ -229,22 +265,32 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
}
|
||||
|
||||
b.WriteString(`<table style="width:auto">`)
|
||||
writeRow := func(label, value, badgeHTML string) {
|
||||
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
||||
// writeRow renders one component row. compType is the URL path segment for the detail
|
||||
// endpoint (e.g. "cpu"). Pass "" for rows that have no detail view.
|
||||
writeRow := func(label, value, badgeHTML, compType string) {
|
||||
var labelHTML string
|
||||
if compType != "" {
|
||||
labelHTML = fmt.Sprintf(
|
||||
`<span style="cursor:pointer;text-decoration:underline dotted;text-underline-offset:3px" onclick="openComponentDetail('%s')">%s</span>`,
|
||||
compType, html.EscapeString(label))
|
||||
} else {
|
||||
labelHTML = html.EscapeString(label)
|
||||
}
|
||||
fmt.Fprintf(&b, `<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||
labelHTML, html.EscapeString(value), badgeHTML)
|
||||
}
|
||||
|
||||
writeRow("CPU", hwDescribeCPU(hw),
|
||||
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)))
|
||||
renderComponentChips(matchedRecords(records, []string{"cpu:all"}, nil)), "cpu")
|
||||
|
||||
writeRow("Memory", hwDescribeMemory(hw),
|
||||
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})))
|
||||
renderComponentChips(matchedRecords(records, []string{"memory:all"}, []string{"memory:"})), "memory")
|
||||
|
||||
writeRow("Storage", hwDescribeStorage(hw),
|
||||
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})))
|
||||
renderComponentChips(matchedRecords(records, []string{"storage:all"}, []string{"storage:"})), "storage")
|
||||
|
||||
writeRow("GPU", hwDescribeGPU(hw),
|
||||
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})))
|
||||
renderComponentChips(matchedRecords(records, nil, []string{"pcie:gpu:"})), "gpu")
|
||||
|
||||
psuMatched := matchedRecords(records, nil, []string{"psu:"})
|
||||
if len(psuMatched) == 0 && len(hw.PowerSupplies) > 0 {
|
||||
@@ -252,10 +298,10 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
psuStatus := hwPSUStatus(hw.PowerSupplies)
|
||||
psuMatched = []app.ComponentStatusRecord{{ComponentKey: "psu:ipmi", Status: psuStatus}}
|
||||
}
|
||||
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched))
|
||||
writeRow("PSU", hwDescribePSU(hw), renderComponentChips(psuMatched), "psu")
|
||||
|
||||
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
||||
writeRow("Network", nicDesc, "")
|
||||
writeRow("Network", nicDesc, "", "")
|
||||
}
|
||||
|
||||
b.WriteString(`</table>`)
|
||||
@@ -614,7 +660,7 @@ func buildRuntimeNetworkRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
if status == "" {
|
||||
status = "UNKNOWN"
|
||||
}
|
||||
issue := runtimeIssueDescriptions(health.Issues, "dhcp_partial", "dhcp_failed")
|
||||
issue := runtimeIssueDescriptions(health.Issues, "dhcp_failed")
|
||||
return runtimeHealthRow{Title: "Network", Status: status, Source: "ListInterfaces / DHCP", Issue: issue}
|
||||
}
|
||||
|
||||
@@ -672,12 +718,12 @@ func buildRuntimeServicesRow(health schema.RuntimeHealth) runtimeHealthRow {
|
||||
nonActive := make([]string, 0)
|
||||
for _, svc := range health.Services {
|
||||
state := strings.TrimSpace(strings.ToLower(svc.Status))
|
||||
// "activating" and "deactivating" are transient states for oneshot services
|
||||
// (RemainAfterExit=yes) — the service is running normally, not failed.
|
||||
// Only "failed" and "inactive" (after services should be running) are problems.
|
||||
// "inactive" is OK for oneshot services that have completed successfully
|
||||
// (bee-sshsetup, bee-preflight, bee-audit, bee-network, etc.).
|
||||
// Only "failed" is a genuine problem.
|
||||
switch state {
|
||||
case "active", "activating", "deactivating", "reloading":
|
||||
// OK — service is running or transitioning normally
|
||||
case "active", "activating", "deactivating", "reloading", "inactive":
|
||||
// OK — service is running, transitioning normally, or completed successfully
|
||||
default:
|
||||
nonActive = append(nonActive, svc.Name+"="+svc.Status)
|
||||
}
|
||||
@@ -999,3 +1045,200 @@ func rowIssueHTML(issue string) string {
|
||||
}
|
||||
return html.EscapeString(issue)
|
||||
}
|
||||
|
||||
var aerStatusRe = regexp.MustCompile(`aer_status:\s*0x([0-9a-fA-F]{1,8})`)
|
||||
|
||||
// decodeAERStatus parses an AER status hex value from a kernel error detail string
|
||||
// and returns a human-readable list of set bit names with correctable/uncorrectable label,
|
||||
// or "" if no AER status is found.
|
||||
func decodeAERStatus(detail string) string {
|
||||
m := aerStatusRe.FindStringSubmatch(detail)
|
||||
if m == nil {
|
||||
return ""
|
||||
}
|
||||
v64, err := strconv.ParseUint(m[1], 16, 32)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
val := uint32(v64)
|
||||
|
||||
type bitDef struct {
|
||||
bit uint32
|
||||
name string
|
||||
}
|
||||
corrBits := []bitDef{
|
||||
{0, "Receiver Error"}, {6, "Replay Timer Timeout"}, {7, "Advisory Non-Fatal"},
|
||||
{8, "Corrected Internal Error"}, {9, "Header Log Overflow"},
|
||||
{13, "Replay Num Rollover"}, {14, "Bad DLLP"}, {15, "Bad TLP"},
|
||||
}
|
||||
uncorrBits := []bitDef{
|
||||
{4, "Data Link Protocol Error"}, {5, "Surprise Down Error"},
|
||||
{12, "Poisoned TLP Received"}, {13, "Flow Control Protocol Error"},
|
||||
{14, "Completion Timeout"}, {15, "Completer Abort"}, {16, "Unexpected Completion"},
|
||||
{17, "Receiver Overflow"}, {18, "Malformed TLP"}, {19, "ECRC Error"},
|
||||
{20, "Unsupported Request Error"}, {21, "ACS Violation"}, {22, "Uncorrectable Internal Error"},
|
||||
}
|
||||
var corrNames, uncorrNames []string
|
||||
for _, b := range corrBits {
|
||||
if val&(1<<b.bit) != 0 {
|
||||
corrNames = append(corrNames, b.name)
|
||||
}
|
||||
}
|
||||
for _, b := range uncorrBits {
|
||||
if val&(1<<b.bit) != 0 {
|
||||
uncorrNames = append(uncorrNames, b.name)
|
||||
}
|
||||
}
|
||||
if len(corrNames) >= len(uncorrNames) && len(corrNames) > 0 {
|
||||
return strings.Join(corrNames, ", ") + " (correctable)"
|
||||
}
|
||||
if len(uncorrNames) > 0 {
|
||||
return strings.Join(uncorrNames, ", ") + " (uncorrectable)"
|
||||
}
|
||||
return fmt.Sprintf("unknown bits: 0x%08x", val)
|
||||
}
|
||||
|
||||
// renderSparkline returns a small inline SVG showing non-OK events over time.
|
||||
// Events are positioned proportionally along the time axis; if all share the same
|
||||
// timestamp they are spaced evenly. Width is always 100px.
|
||||
func renderSparkline(history []app.ComponentStatusEntry) string {
|
||||
const (
|
||||
svgW = 100
|
||||
svgH = 20
|
||||
barW = 3
|
||||
barH = 14
|
||||
)
|
||||
var events []app.ComponentStatusEntry
|
||||
for _, e := range history {
|
||||
if e.Status != "OK" {
|
||||
events = append(events, e)
|
||||
}
|
||||
}
|
||||
if len(events) == 0 {
|
||||
return ""
|
||||
}
|
||||
n := len(events)
|
||||
barColor := func(status string) string {
|
||||
if status == "Critical" {
|
||||
return "#c0392b"
|
||||
}
|
||||
return "#d97706"
|
||||
}
|
||||
yTop := (svgH - barH) / 2
|
||||
|
||||
var bars strings.Builder
|
||||
if n == 1 {
|
||||
x := (svgW - barW) / 2
|
||||
fmt.Fprintf(&bars, `<rect x="%d" y="%d" width="%d" height="%d" fill="%s" rx="1"/>`,
|
||||
x, yTop, barW, barH, barColor(events[0].Status))
|
||||
} else {
|
||||
minT := events[0].At
|
||||
maxT := events[n-1].At
|
||||
dur := maxT.Sub(minT).Seconds()
|
||||
for i, e := range events {
|
||||
var x int
|
||||
if dur <= 0 {
|
||||
step := svgW / n
|
||||
x = i*step + (step-barW)/2
|
||||
} else {
|
||||
frac := e.At.Sub(minT).Seconds() / dur
|
||||
x = int(frac * float64(svgW-barW))
|
||||
}
|
||||
fmt.Fprintf(&bars, `<rect x="%d" y="%d" width="%d" height="%d" fill="%s" rx="1"/>`,
|
||||
x, yTop, barW, barH, barColor(e.Status))
|
||||
}
|
||||
}
|
||||
return fmt.Sprintf(
|
||||
`<svg width="%d" height="%d" style="display:inline-block;vertical-align:middle;margin-left:6px;flex-shrink:0" xmlns="http://www.w3.org/2000/svg">`+
|
||||
`<rect x="0" y="0" width="%d" height="%d" fill="var(--surface-alt,#ebebeb)" rx="3"/>%s</svg>`,
|
||||
svgW, svgH, svgW, svgH, bars.String())
|
||||
}
|
||||
|
||||
// renderComponentDetail renders a modal content fragment for one component type.
|
||||
// Called by handleAPIComponentDetail and displayed inside #component-detail-dialog.
|
||||
func renderComponentDetail(title string, records []app.ComponentStatusRecord) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, `<div style="padding:20px 24px 0">`)
|
||||
fmt.Fprintf(&b, `<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:16px">`)
|
||||
fmt.Fprintf(&b, `<span style="font-size:16px;font-weight:700">%s — Status Detail</span>`, html.EscapeString(title))
|
||||
b.WriteString(`<button class="btn btn-sm btn-secondary" onclick="document.getElementById('component-detail-dialog').close()">Close</button>`)
|
||||
b.WriteString(`</div>`)
|
||||
|
||||
if len(records) == 0 {
|
||||
b.WriteString(`<p style="color:var(--muted)">No status data recorded yet for this component type.</p>`)
|
||||
b.WriteString(`</div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
sort.Slice(records, func(i, j int) bool {
|
||||
return records[i].ComponentKey < records[j].ComponentKey
|
||||
})
|
||||
|
||||
for _, rec := range records {
|
||||
letter, cls := chipLetterClass(rec.Status)
|
||||
|
||||
// Count non-OK events across the full history for the badge + sparkline.
|
||||
warnCount := 0
|
||||
for _, e := range rec.History {
|
||||
if e.Status != "OK" {
|
||||
warnCount++
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, `<div style="margin-bottom:20px">`)
|
||||
fmt.Fprintf(&b, `<div style="display:flex;align-items:center;gap:8px;margin-bottom:8px;flex-wrap:wrap">`)
|
||||
fmt.Fprintf(&b, `<span class="chip %s">%s</span>`, cls, letter)
|
||||
fmt.Fprintf(&b, `<span style="font-weight:700;font-size:13px">%s</span>`, html.EscapeString(rec.ComponentKey))
|
||||
if !rec.LastCheckedAt.IsZero() {
|
||||
fmt.Fprintf(&b, `<span style="color:var(--muted);font-size:12px">checked %s</span>`, rec.LastCheckedAt.Format("2006-01-02 15:04:05"))
|
||||
}
|
||||
if warnCount > 0 {
|
||||
noun := "events"
|
||||
if warnCount == 1 {
|
||||
noun = "event"
|
||||
}
|
||||
fmt.Fprintf(&b,
|
||||
`<span style="font-size:11px;background:var(--warn-bg,#fffbeb);color:var(--warn-fg,#92400e);border:1px solid var(--warn-border,#fde68a);border-radius:10px;padding:1px 7px;white-space:nowrap">%d %s</span>`,
|
||||
warnCount, noun)
|
||||
b.WriteString(renderSparkline(rec.History))
|
||||
}
|
||||
b.WriteString(`</div>`)
|
||||
|
||||
if rec.ErrorSummary != "" {
|
||||
fmt.Fprintf(&b, `<div style="font-size:12px;margin-bottom:4px;color:var(--muted)">%s</div>`, html.EscapeString(rec.ErrorSummary))
|
||||
if decoded := decodeAERStatus(rec.ErrorSummary); decoded != "" {
|
||||
fmt.Fprintf(&b,
|
||||
`<div style="font-size:12px;margin-bottom:8px;color:var(--muted)"><span style="background:var(--surface-alt,#f5f5f5);border-radius:4px;padding:1px 6px;font-family:monospace">AER: %s</span></div>`,
|
||||
html.EscapeString(decoded))
|
||||
}
|
||||
}
|
||||
|
||||
// History table — newest first, cap at 20 entries.
|
||||
history := rec.History
|
||||
if len(history) > 20 {
|
||||
history = history[len(history)-20:]
|
||||
}
|
||||
b.WriteString(`<table style="width:100%;font-size:12px;border-collapse:collapse">`)
|
||||
b.WriteString(`<tr style="color:var(--muted)"><th style="text-align:left;padding:2px 10px 2px 0;white-space:nowrap">Time</th><th style="text-align:left;padding:2px 10px 2px 0">Status</th><th style="text-align:left;padding:2px 10px 2px 0">Source</th><th style="text-align:left;padding:2px 0">Detail</th></tr>`)
|
||||
for i := len(history) - 1; i >= 0; i-- {
|
||||
e := history[i]
|
||||
eLetter, eCls := chipLetterClass(e.Status)
|
||||
detail := e.Detail
|
||||
if detail == "" {
|
||||
detail = "—"
|
||||
}
|
||||
fmt.Fprintf(&b,
|
||||
`<tr><td style="padding:3px 10px 3px 0;white-space:nowrap;color:var(--muted)">%s</td><td style="padding:3px 10px 3px 0"><span class="chip %s" style="font-size:10px;width:16px;height:16px">%s</span></td><td style="padding:3px 10px 3px 0;white-space:nowrap">%s</td><td style="padding:3px 0;color:var(--muted)">%s</td></tr>`,
|
||||
html.EscapeString(e.At.Format("2006-01-02 15:04:05")),
|
||||
eCls, eLetter,
|
||||
html.EscapeString(e.Source),
|
||||
html.EscapeString(detail),
|
||||
)
|
||||
}
|
||||
b.WriteString(`</table>`)
|
||||
b.WriteString(`</div>`)
|
||||
}
|
||||
|
||||
b.WriteString(`</div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
689
audit/internal/webui/raid_mgmt.go
Normal file
689
audit/internal/webui/raid_mgmt.go
Normal file
@@ -0,0 +1,689 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// --- Response types ---
|
||||
|
||||
type raidDriveInfo struct {
|
||||
Slot string `json:"slot,omitempty"`
|
||||
Device string `json:"device,omitempty"`
|
||||
Model string `json:"model,omitempty"`
|
||||
SizeGB float64 `json:"size_gb,omitempty"`
|
||||
Serial string `json:"serial,omitempty"`
|
||||
State string `json:"state,omitempty"`
|
||||
}
|
||||
|
||||
type raidArrayInfo struct {
|
||||
Name string `json:"name"`
|
||||
Level string `json:"level,omitempty"`
|
||||
Members []string `json:"members"`
|
||||
Degraded bool `json:"degraded"`
|
||||
}
|
||||
|
||||
type raidControllerInfo struct {
|
||||
ID string `json:"id"`
|
||||
Type string `json:"type"`
|
||||
Index int `json:"index"`
|
||||
Model string `json:"model"`
|
||||
ForeignDrives []raidDriveInfo `json:"foreign_drives"`
|
||||
FreeDrives []raidDriveInfo `json:"free_drives"`
|
||||
Arrays []raidArrayInfo `json:"arrays,omitempty"`
|
||||
}
|
||||
|
||||
type raidStatusResp struct {
|
||||
Controllers []raidControllerInfo `json:"controllers"`
|
||||
}
|
||||
|
||||
// --- LSI/storcli detection ---
|
||||
|
||||
func detectLSIControllers() []raidControllerInfo {
|
||||
ctrlOut, err := exec.Command("storcli64", "/call", "show", "J").Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var ctrlDoc struct {
|
||||
Controllers []struct {
|
||||
ResponseData struct {
|
||||
Basics struct {
|
||||
Controller int `json:"Controller"`
|
||||
Model string `json:"Model"`
|
||||
} `json:"Basics"`
|
||||
} `json:"Response Data"`
|
||||
} `json:"Controllers"`
|
||||
}
|
||||
if err := json.Unmarshal(ctrlOut, &ctrlDoc); err != nil || len(ctrlDoc.Controllers) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
driveOut, _ := exec.Command("storcli64", "/call/eall/sall", "show", "all", "J").Output()
|
||||
|
||||
var driveDoc struct {
|
||||
Controllers []struct {
|
||||
ResponseData struct {
|
||||
DriveInformation []struct {
|
||||
EIDSlt string `json:"EID:Slt"`
|
||||
State string `json:"State"`
|
||||
Size string `json:"Size"`
|
||||
Intf string `json:"Intf"`
|
||||
Med string `json:"Med"`
|
||||
Model string `json:"Model"`
|
||||
SN string `json:"SN"`
|
||||
} `json:"Drive Information"`
|
||||
} `json:"Response Data"`
|
||||
} `json:"Controllers"`
|
||||
}
|
||||
if len(driveOut) > 0 {
|
||||
json.Unmarshal(driveOut, &driveDoc) //nolint:errcheck
|
||||
}
|
||||
|
||||
var controllers []raidControllerInfo
|
||||
for i, c := range ctrlDoc.Controllers {
|
||||
ctrl := raidControllerInfo{
|
||||
ID: fmt.Sprintf("lsi-%d", c.ResponseData.Basics.Controller),
|
||||
Type: "lsi",
|
||||
Index: c.ResponseData.Basics.Controller,
|
||||
Model: c.ResponseData.Basics.Model,
|
||||
ForeignDrives: []raidDriveInfo{},
|
||||
FreeDrives: []raidDriveInfo{},
|
||||
}
|
||||
if ctrl.Model == "" {
|
||||
ctrl.Model = fmt.Sprintf("LSI Controller %d", ctrl.Index)
|
||||
}
|
||||
|
||||
if i < len(driveDoc.Controllers) {
|
||||
for _, d := range driveDoc.Controllers[i].ResponseData.DriveInformation {
|
||||
info := raidDriveInfo{
|
||||
Slot: strings.TrimSpace(d.EIDSlt),
|
||||
Model: strings.TrimSpace(d.Model),
|
||||
State: strings.TrimSpace(d.State),
|
||||
SizeGB: raidParseHumanSizeGB(d.Size),
|
||||
Serial: strings.TrimSpace(d.SN),
|
||||
}
|
||||
switch strings.TrimSpace(d.State) {
|
||||
case "Frgn":
|
||||
ctrl.ForeignDrives = append(ctrl.ForeignDrives, info)
|
||||
case "UGood", "JBOD":
|
||||
ctrl.FreeDrives = append(ctrl.FreeDrives, info)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
controllers = append(controllers, ctrl)
|
||||
}
|
||||
return controllers
|
||||
}
|
||||
|
||||
// --- VROC/mdadm detection ---
|
||||
|
||||
var raidMDStatDegradedRx = regexp.MustCompile(`\[[U_]+\]`)
|
||||
|
||||
type mdStatEntry struct {
|
||||
Name string
|
||||
Level string
|
||||
Members []string
|
||||
Degraded bool
|
||||
}
|
||||
|
||||
func parseRAIDMDStat(raw string) []mdStatEntry {
|
||||
var entries []mdStatEntry
|
||||
var cur *mdStatEntry
|
||||
for _, line := range strings.Split(raw, "\n") {
|
||||
if strings.HasPrefix(line, "Personalities") || strings.HasPrefix(line, "unused devices") {
|
||||
continue
|
||||
}
|
||||
if idx := strings.Index(line, " : "); idx > 0 {
|
||||
name := strings.TrimSpace(line[:idx])
|
||||
rest := line[idx+3:]
|
||||
entry := mdStatEntry{Name: name}
|
||||
for _, tok := range strings.Fields(rest) {
|
||||
if strings.HasPrefix(tok, "raid") || strings.HasPrefix(tok, "linear") {
|
||||
entry.Level = tok
|
||||
}
|
||||
if bk := strings.Index(tok, "["); bk > 0 && strings.HasSuffix(tok, "]") {
|
||||
entry.Members = append(entry.Members, tok[:bk])
|
||||
}
|
||||
}
|
||||
entries = append(entries, entry)
|
||||
cur = &entries[len(entries)-1]
|
||||
continue
|
||||
}
|
||||
if cur != nil {
|
||||
if m := raidMDStatDegradedRx.FindString(line); m != "" && strings.Contains(m, "_") {
|
||||
cur.Degraded = true
|
||||
}
|
||||
}
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func detectVROCController() *raidControllerInfo {
|
||||
out, err := exec.Command("mdadm", "--detail-platform").CombinedOutput()
|
||||
if err != nil && len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
hasVROC := false
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
lower := strings.ToLower(line)
|
||||
if strings.Contains(lower, "license") || strings.Contains(lower, "intel") || strings.Contains(lower, "platform") {
|
||||
hasVROC = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasVROC {
|
||||
return nil
|
||||
}
|
||||
|
||||
ctrl := &raidControllerInfo{
|
||||
ID: "vroc-0",
|
||||
Type: "vroc",
|
||||
Model: "Intel VROC",
|
||||
ForeignDrives: []raidDriveInfo{},
|
||||
FreeDrives: []raidDriveInfo{},
|
||||
}
|
||||
|
||||
inArray := map[string]bool{}
|
||||
raw, err := os.ReadFile("/proc/mdstat")
|
||||
if err == nil {
|
||||
for _, arr := range parseRAIDMDStat(string(raw)) {
|
||||
ctrl.Arrays = append(ctrl.Arrays, raidArrayInfo{
|
||||
Name: arr.Name,
|
||||
Level: arr.Level,
|
||||
Members: arr.Members,
|
||||
Degraded: arr.Degraded,
|
||||
})
|
||||
for _, m := range arr.Members {
|
||||
inArray[m] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lsblkOut, err := exec.Command("lsblk", "-J", "-d", "-o", "NAME,SIZE,TYPE,MODEL,SERIAL").Output()
|
||||
if err == nil {
|
||||
var lsblkDoc struct {
|
||||
BlockDevices []struct {
|
||||
Name string `json:"name"`
|
||||
Size string `json:"size"`
|
||||
Type string `json:"type"`
|
||||
Model string `json:"model"`
|
||||
Serial string `json:"serial"`
|
||||
} `json:"blockdevices"`
|
||||
}
|
||||
if json.Unmarshal(lsblkOut, &lsblkDoc) == nil {
|
||||
for _, d := range lsblkDoc.BlockDevices {
|
||||
if d.Type != "disk" || inArray[d.Name] {
|
||||
continue
|
||||
}
|
||||
ctrl.FreeDrives = append(ctrl.FreeDrives, raidDriveInfo{
|
||||
Device: "/dev/" + d.Name,
|
||||
Model: strings.TrimSpace(d.Model),
|
||||
Serial: strings.TrimSpace(d.Serial),
|
||||
State: "available",
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ctrl
|
||||
}
|
||||
|
||||
// --- API handlers ---
|
||||
|
||||
func (h *handler) handleAPIRAIDStatus(w http.ResponseWriter, r *http.Request) {
|
||||
resp := raidStatusResp{Controllers: []raidControllerInfo{}}
|
||||
|
||||
if lsi := detectLSIControllers(); len(lsi) > 0 {
|
||||
resp.Controllers = append(resp.Controllers, lsi...)
|
||||
}
|
||||
if vroc := detectVROCController(); vroc != nil {
|
||||
resp.Controllers = append(resp.Controllers, *vroc)
|
||||
}
|
||||
|
||||
writeJSON(w, resp)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIRAIDForeignAction(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
ControllerID string `json:"controller_id"`
|
||||
Action string `json:"action"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if req.Action != "import" && req.Action != "clear" {
|
||||
writeError(w, http.StatusBadRequest, "action must be 'import' or 'clear'")
|
||||
return
|
||||
}
|
||||
ctrlIdx, ok := parseLSIControllerIndex(req.ControllerID)
|
||||
if !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid controller_id")
|
||||
return
|
||||
}
|
||||
|
||||
target := "raid-foreign-clear"
|
||||
name := fmt.Sprintf("RAID Foreign Clear (ctrl %d)", ctrlIdx)
|
||||
if req.Action == "import" {
|
||||
target = "raid-foreign-import"
|
||||
name = fmt.Sprintf("RAID Foreign Import (ctrl %d)", ctrlIdx)
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID(target),
|
||||
Name: name,
|
||||
Target: target,
|
||||
Priority: defaultTaskPriority(target, taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{RAIDController: ctrlIdx},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIRAIDCreateMirror(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
ControllerID string `json:"controller_id"`
|
||||
Devices []string `json:"devices"`
|
||||
ArrayName string `json:"array_name"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Devices) < 2 {
|
||||
writeError(w, http.StatusBadRequest, "at least 2 devices required")
|
||||
return
|
||||
}
|
||||
|
||||
var target, name string
|
||||
var params taskParams
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(req.ControllerID, "lsi-"):
|
||||
ctrlIdx, ok := parseLSIControllerIndex(req.ControllerID)
|
||||
if !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid controller_id")
|
||||
return
|
||||
}
|
||||
target = "raid-lsi-create-mirror"
|
||||
name = fmt.Sprintf("Create RAID 1 Mirror (LSI ctrl %d)", ctrlIdx)
|
||||
params = taskParams{RAIDController: ctrlIdx, RAIDDevices: req.Devices}
|
||||
|
||||
case req.ControllerID == "vroc-0":
|
||||
arrayName := strings.TrimSpace(req.ArrayName)
|
||||
if arrayName == "" {
|
||||
arrayName = "bee-mirror0"
|
||||
}
|
||||
target = "raid-vroc-create-mirror"
|
||||
name = fmt.Sprintf("Create VROC RAID 1 (%s)", arrayName)
|
||||
params = taskParams{RAIDDevices: req.Devices, RAIDArrayName: arrayName}
|
||||
|
||||
default:
|
||||
writeError(w, http.StatusBadRequest, "unknown controller_id")
|
||||
return
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID(target),
|
||||
Name: name,
|
||||
Target: target,
|
||||
Priority: defaultTaskPriority(target, taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: params,
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func parseLSIControllerIndex(id string) (int, bool) {
|
||||
if !strings.HasPrefix(id, "lsi-") {
|
||||
return 0, false
|
||||
}
|
||||
n, err := strconv.Atoi(strings.TrimPrefix(id, "lsi-"))
|
||||
if err != nil || n < 0 {
|
||||
return 0, false
|
||||
}
|
||||
return n, true
|
||||
}
|
||||
|
||||
// --- Task runner functions ---
|
||||
|
||||
func runRAIDForeignClearTask(ctx context.Context, j *jobState, ctrl int) error {
|
||||
j.append(fmt.Sprintf("Clearing foreign configuration on controller %d...", ctrl))
|
||||
cmd := exec.CommandContext(ctx, "storcli64", fmt.Sprintf("/c%d/fall", ctrl), "del", "noprompt")
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func runRAIDForeignImportTask(ctx context.Context, j *jobState, ctrl int) error {
|
||||
j.append(fmt.Sprintf("Importing foreign configuration on controller %d...", ctrl))
|
||||
cmd := exec.CommandContext(ctx, "storcli64", fmt.Sprintf("/c%d/fall", ctrl), "import", "noprompt")
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func runRAIDLSICreateMirrorTask(ctx context.Context, j *jobState, ctrl int, drives []string) error {
|
||||
driveList := strings.Join(drives, ",")
|
||||
j.append(fmt.Sprintf("Creating RAID 1 on controller %d with drives: %s", ctrl, driveList))
|
||||
cmd := exec.CommandContext(ctx, "storcli64",
|
||||
fmt.Sprintf("/c%d", ctrl),
|
||||
"add", "vd", "type=raid1",
|
||||
fmt.Sprintf("drives=%s", driveList),
|
||||
"pdperarray=2",
|
||||
)
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func runRAIDVROCCreateMirrorTask(ctx context.Context, j *jobState, devices []string, arrayName string) error {
|
||||
if arrayName == "" {
|
||||
arrayName = "bee-mirror0"
|
||||
}
|
||||
devPath := "/dev/md/" + arrayName
|
||||
args := []string{
|
||||
"--create", devPath,
|
||||
"--level=1",
|
||||
fmt.Sprintf("--raid-devices=%d", len(devices)),
|
||||
"--run",
|
||||
}
|
||||
args = append(args, devices...)
|
||||
j.append(fmt.Sprintf("Creating VROC RAID 1 array %s with: %s", devPath, strings.Join(devices, " ")))
|
||||
cmd := exec.CommandContext(ctx, "mdadm", args...)
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
// raidParseHumanSizeGB parses storcli size strings like "1.818 TB", "745.211 GB".
|
||||
func raidParseHumanSizeGB(s string) float64 {
|
||||
s = strings.TrimSpace(s)
|
||||
if s == "" {
|
||||
return 0
|
||||
}
|
||||
upper := strings.ToUpper(s)
|
||||
var mul float64
|
||||
var numStr string
|
||||
switch {
|
||||
case strings.Contains(upper, " TB"):
|
||||
mul = 1024
|
||||
numStr = strings.TrimSpace(strings.SplitN(upper, " T", 2)[0])
|
||||
case strings.Contains(upper, " GB"):
|
||||
mul = 1
|
||||
numStr = strings.TrimSpace(strings.SplitN(upper, " G", 2)[0])
|
||||
case strings.Contains(upper, " MB"):
|
||||
mul = 1.0 / 1024
|
||||
numStr = strings.TrimSpace(strings.SplitN(upper, " M", 2)[0])
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
v, err := strconv.ParseFloat(numStr, 64)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return v * mul
|
||||
}
|
||||
|
||||
// --- UI card ---
|
||||
|
||||
func renderRAIDMgmtCard() string {
|
||||
return `<div class="card"><div class="card-head card-head-actions">RAID Controller Management<div class="card-head-buttons"><button class="btn btn-sm btn-secondary" onclick="raidLoad()">↻ Refresh</button></div></div><div class="card-body">
|
||||
<div id="raid-status" style="font-size:13px;color:var(--muted);margin-bottom:8px">Loading...</div>
|
||||
<div id="raid-content"></div>
|
||||
<div id="raid-out-wrap" style="display:none;margin-top:14px">
|
||||
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||
<span id="raid-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||
<span id="raid-out-status" style="font-size:12px"></span>
|
||||
</div>
|
||||
<div id="raid-terminal" class="terminal" style="max-height:260px;width:100%;box-sizing:border-box"></div>
|
||||
</div>
|
||||
</div></div>
|
||||
<script>
|
||||
(function(){
|
||||
function escHtml(s) {
|
||||
return String(s||'').replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>').replace(/"/g,'"');
|
||||
}
|
||||
|
||||
var _raidControllers = [];
|
||||
|
||||
function raidLoad() {
|
||||
var status = document.getElementById('raid-status');
|
||||
var content = document.getElementById('raid-content');
|
||||
status.textContent = 'Detecting RAID controllers...';
|
||||
status.style.color = 'var(--muted)';
|
||||
content.innerHTML = '';
|
||||
fetch('/api/tools/raid/status', {cache:'no-store'})
|
||||
.then(function(r) {
|
||||
if (!r.ok) return r.json().then(function(e) { throw new Error(e.error || r.statusText); });
|
||||
return r.json();
|
||||
})
|
||||
.then(function(data) {
|
||||
_raidControllers = data.controllers || [];
|
||||
if (_raidControllers.length === 0) {
|
||||
status.textContent = 'No RAID controllers detected.';
|
||||
return;
|
||||
}
|
||||
status.textContent = _raidControllers.length + ' controller(s) detected.';
|
||||
content.innerHTML = _raidControllers.map(function(c, i) {
|
||||
return raidRenderController(c, i);
|
||||
}).join('<hr style="margin:16px 0;border:none;border-top:1px solid var(--border)">');
|
||||
})
|
||||
.catch(function(e) {
|
||||
status.textContent = 'Error: ' + e.message;
|
||||
status.style.color = 'var(--crit-fg)';
|
||||
});
|
||||
}
|
||||
|
||||
function raidRenderController(c, idx) {
|
||||
var html = '';
|
||||
var typeLabel = c.type === 'lsi' ? 'LSI / Broadcom' : 'Intel VROC';
|
||||
html += '<div style="font-weight:600;font-size:13px;margin-bottom:10px">' + typeLabel + ' — ' + escHtml(c.model) + '</div>';
|
||||
|
||||
if (c.type === 'lsi') {
|
||||
var foreign = c.foreign_drives || [];
|
||||
if (foreign.length > 0) {
|
||||
html += '<div style="background:var(--warn-bg,rgba(240,192,0,0.1));border:1px solid var(--warn-border,#c8a800);border-radius:4px;padding:10px 12px;margin-bottom:12px">';
|
||||
html += '<div style="font-weight:600;font-size:13px;margin-bottom:6px">⚠︎ Foreign Configuration Detected (' + foreign.length + ' drive(s))</div>';
|
||||
html += '<table style="margin-bottom:10px"><tr><th>Slot</th><th>Model</th><th>Size</th><th>State</th></tr>';
|
||||
foreign.forEach(function(d) {
|
||||
html += '<tr>'
|
||||
+ '<td style="font-family:monospace">' + escHtml(d.slot) + '</td>'
|
||||
+ '<td>' + escHtml(d.model||'—') + '</td>'
|
||||
+ '<td>' + (d.size_gb > 0 ? Math.round(d.size_gb) + ' GB' : '—') + '</td>'
|
||||
+ '<td><span class="badge badge-warn">' + escHtml(d.state) + '</span></td>'
|
||||
+ '</tr>';
|
||||
});
|
||||
html += '</table>';
|
||||
html += '<div style="display:flex;gap:8px;flex-wrap:wrap">';
|
||||
html += '<button class="btn btn-sm btn-primary" onclick="raidForeignAction(\'' + escHtml(c.id) + '\',\'import\',this)">Import Foreign Config</button>';
|
||||
html += '<button class="btn btn-sm btn-secondary" style="color:var(--crit-fg)" onclick="raidForeignAction(\'' + escHtml(c.id) + '\',\'clear\',this)">Clear Foreign Config</button>';
|
||||
html += '</div></div>';
|
||||
}
|
||||
|
||||
html += raidRenderMirrorSection(c, idx, 'lsi');
|
||||
}
|
||||
|
||||
if (c.type === 'vroc') {
|
||||
var arrays = c.arrays || [];
|
||||
if (arrays.length > 0) {
|
||||
html += '<div style="font-size:12px;font-weight:600;color:var(--muted);margin-bottom:6px;text-transform:uppercase;letter-spacing:.04em">Active Arrays</div>';
|
||||
html += '<table style="margin-bottom:14px"><tr><th>Name</th><th>Level</th><th>Members</th><th>Status</th></tr>';
|
||||
arrays.forEach(function(a) {
|
||||
var badge = a.degraded
|
||||
? '<span class="badge badge-err">Degraded</span>'
|
||||
: '<span class="badge badge-ok">OK</span>';
|
||||
html += '<tr>'
|
||||
+ '<td style="font-family:monospace">' + escHtml(a.name) + '</td>'
|
||||
+ '<td>' + escHtml(a.level||'—') + '</td>'
|
||||
+ '<td style="font-family:monospace;font-size:12px">' + (a.members||[]).map(escHtml).join(', ') + '</td>'
|
||||
+ '<td>' + badge + '</td>'
|
||||
+ '</tr>';
|
||||
});
|
||||
html += '</table>';
|
||||
}
|
||||
|
||||
html += raidRenderMirrorSection(c, idx, 'vroc');
|
||||
}
|
||||
|
||||
return html;
|
||||
}
|
||||
|
||||
function raidRenderMirrorSection(c, idx, kind) {
|
||||
var free = c.free_drives || [];
|
||||
var html = '<div style="font-size:12px;font-weight:600;color:var(--muted);margin-bottom:6px;text-transform:uppercase;letter-spacing:.04em">Create RAID 1 Mirror</div>';
|
||||
|
||||
if (free.length < 2) {
|
||||
html += '<p style="font-size:13px;color:var(--muted)">No unconfigured drives available (need at least 2).</p>';
|
||||
return html;
|
||||
}
|
||||
|
||||
html += '<p style="font-size:13px;color:var(--muted);margin-bottom:8px">Select exactly 2 drives:</p>';
|
||||
html += '<div>';
|
||||
free.forEach(function(d) {
|
||||
var val = kind === 'lsi' ? d.slot : d.device;
|
||||
var label = kind === 'lsi'
|
||||
? escHtml(d.slot) + (d.model ? ' — ' + escHtml(d.model) : '') + (d.size_gb > 0 ? ' (' + Math.round(d.size_gb) + ' GB)' : '')
|
||||
: escHtml(d.device) + (d.model ? ' — ' + escHtml(d.model) : '') + (d.serial ? ' [' + escHtml(d.serial) + ']' : '');
|
||||
html += '<label style="display:block;margin-bottom:4px;font-size:13px;cursor:pointer">'
|
||||
+ '<input type="checkbox" class="raid-mirror-check-' + idx + '" value="' + escHtml(val) + '"> '
|
||||
+ label + '</label>';
|
||||
});
|
||||
html += '</div>';
|
||||
|
||||
if (kind === 'vroc') {
|
||||
html += '<div style="margin-top:10px;display:flex;align-items:center;gap:8px;flex-wrap:wrap">'
|
||||
+ '<label style="font-size:13px">Array name: <input type="text" id="vroc-arrayname-' + idx + '" value="bee-mirror0" style="font-family:monospace;padding:2px 6px;width:140px"></label>';
|
||||
} else {
|
||||
html += '<div style="margin-top:10px;display:flex;gap:8px">';
|
||||
}
|
||||
|
||||
html += '<button class="btn btn-sm btn-primary raid-mirror-btn-' + idx + '" onclick="raidCreateMirror(\'' + escHtml(c.id) + '\',' + idx + ',\'' + kind + '\',this)">Create Mirror</button>';
|
||||
html += '</div>';
|
||||
|
||||
return html;
|
||||
}
|
||||
|
||||
function raidForeignAction(ctrlID, action, btn) {
|
||||
if (action === 'clear' && !confirm('Clear foreign configuration on ' + ctrlID + '?\n\nThis will DELETE the foreign RAID metadata. Data on those drives may become inaccessible.')) {
|
||||
return;
|
||||
}
|
||||
var original = btn ? btn.textContent : '';
|
||||
if (btn) { btn.disabled = true; btn.textContent = action === 'import' ? 'Importing...' : 'Clearing...'; }
|
||||
raidShowOutput('RAID foreign ' + action, '', '');
|
||||
fetch('/api/tools/raid/foreign', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type': 'application/json'},
|
||||
body: JSON.stringify({controller_id: ctrlID, action: action})
|
||||
})
|
||||
.then(function(r) { return r.json(); })
|
||||
.then(function(d) {
|
||||
if (d.error) throw new Error(d.error);
|
||||
var actionLabel = action === 'import' ? 'Import foreign config' : 'Clear foreign config';
|
||||
raidStreamTask(d.task_id, actionLabel, function() {
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
raidLoad();
|
||||
});
|
||||
})
|
||||
.catch(function(e) {
|
||||
raidShowOutput('Error', 'failed', e.message);
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
});
|
||||
}
|
||||
|
||||
function raidCreateMirror(ctrlID, idx, kind, btn) {
|
||||
var checks = document.querySelectorAll('.raid-mirror-check-' + idx + ':checked');
|
||||
if (checks.length !== 2) {
|
||||
alert('Select exactly 2 drives.');
|
||||
return;
|
||||
}
|
||||
var devices = Array.from(checks).map(function(c) { return c.value; });
|
||||
var arrayName = '';
|
||||
if (kind === 'vroc') {
|
||||
var nameEl = document.getElementById('vroc-arrayname-' + idx);
|
||||
arrayName = nameEl ? nameEl.value.trim() : 'bee-mirror0';
|
||||
if (!arrayName) arrayName = 'bee-mirror0';
|
||||
}
|
||||
var original = btn ? btn.textContent : '';
|
||||
if (btn) { btn.disabled = true; btn.textContent = 'Creating...'; }
|
||||
raidShowOutput('Create RAID 1', '', '');
|
||||
fetch('/api/tools/raid/create-mirror', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type': 'application/json'},
|
||||
body: JSON.stringify({controller_id: ctrlID, devices: devices, array_name: arrayName})
|
||||
})
|
||||
.then(function(r) { return r.json(); })
|
||||
.then(function(d) {
|
||||
if (d.error) throw new Error(d.error);
|
||||
raidStreamTask(d.task_id, 'Create RAID 1 mirror', function() {
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
raidLoad();
|
||||
});
|
||||
})
|
||||
.catch(function(e) {
|
||||
raidShowOutput('Error', 'failed', e.message);
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
});
|
||||
}
|
||||
|
||||
function raidShowOutput(label, status, text) {
|
||||
var wrap = document.getElementById('raid-out-wrap');
|
||||
var labelEl = document.getElementById('raid-out-label');
|
||||
var statusEl = document.getElementById('raid-out-status');
|
||||
var term = document.getElementById('raid-terminal');
|
||||
wrap.style.display = 'block';
|
||||
labelEl.textContent = label;
|
||||
if (status === 'ok') {
|
||||
statusEl.textContent = '✓ done';
|
||||
statusEl.style.color = 'var(--ok-fg)';
|
||||
} else if (status === 'failed') {
|
||||
statusEl.textContent = '✗ failed';
|
||||
statusEl.style.color = 'var(--crit-fg)';
|
||||
} else {
|
||||
statusEl.textContent = status;
|
||||
statusEl.style.color = 'var(--muted)';
|
||||
}
|
||||
if (text !== undefined) {
|
||||
term.textContent = text;
|
||||
term.scrollTop = term.scrollHeight;
|
||||
}
|
||||
}
|
||||
|
||||
function raidStreamTask(taskID, taskName, onDone) {
|
||||
var term = document.getElementById('raid-terminal');
|
||||
term.textContent = '';
|
||||
raidShowOutput(taskName || 'Running…', 'running…', undefined);
|
||||
var es = new EventSource('/api/tasks/' + taskID + '/stream');
|
||||
es.onmessage = function(e) {
|
||||
term.textContent += e.data + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
};
|
||||
es.addEventListener('done', function(e) {
|
||||
es.close();
|
||||
if (!e.data) {
|
||||
raidShowOutput(taskName, 'ok', undefined);
|
||||
} else {
|
||||
raidShowOutput(taskName, 'failed', undefined);
|
||||
term.textContent += '\nFailed: ' + e.data;
|
||||
term.scrollTop = term.scrollHeight;
|
||||
}
|
||||
if (onDone) onDone();
|
||||
});
|
||||
es.onerror = function() {
|
||||
es.close();
|
||||
raidShowOutput(taskName, 'failed', undefined);
|
||||
if (onDone) onDone();
|
||||
};
|
||||
}
|
||||
|
||||
window.raidLoad = raidLoad;
|
||||
raidLoad();
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
214
audit/internal/webui/saa_dmi.go
Normal file
214
audit/internal/webui/saa_dmi.go
Normal file
@@ -0,0 +1,214 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type dmiField struct {
|
||||
Name string `json:"name"`
|
||||
Shn string `json:"shn"`
|
||||
Value string `json:"value"`
|
||||
}
|
||||
|
||||
type saaChange struct {
|
||||
Shn string `json:"shn"`
|
||||
Value string `json:"value"`
|
||||
}
|
||||
|
||||
var (
|
||||
shnRE = regexp.MustCompile(`^[A-Za-z0-9_]{1,16}$`)
|
||||
dmiSectionRE = regexp.MustCompile(`^\[(.+?)\]$`)
|
||||
// Item Name {SHN} = value // comment
|
||||
// SHN may contain parentheses, e.g. {PS(4)LC} for power supply fields
|
||||
dmiItemRE = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9_()\-]{1,24})\}\s*=\s*(.*)$`)
|
||||
dmiVersionRE = regexp.MustCompile(`(?i)^version\s*=`)
|
||||
)
|
||||
|
||||
|
||||
// parseDMIFile parses the DMI.txt produced by "saa GetDmiInfo".
|
||||
// Real format (from SAA User Guide 4.8.1):
|
||||
//
|
||||
// [System]
|
||||
// Version {SYVS} = "A Version" // string value
|
||||
// Serial Number {SYSN} = $DEFAULT$ // string value
|
||||
// UUID {SYUU} = 00112233-... // hex value
|
||||
func parseDMIFile(content string) []dmiField {
|
||||
var fields []dmiField
|
||||
currentSection := ""
|
||||
for _, line := range strings.Split(content, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" || strings.HasPrefix(line, "//") || strings.HasPrefix(line, "#") {
|
||||
continue
|
||||
}
|
||||
if dmiVersionRE.MatchString(line) {
|
||||
continue
|
||||
}
|
||||
if m := dmiSectionRE.FindStringSubmatch(line); m != nil {
|
||||
currentSection = strings.TrimSpace(m[1])
|
||||
continue
|
||||
}
|
||||
m := dmiItemRE.FindStringSubmatch(line)
|
||||
if m == nil {
|
||||
continue
|
||||
}
|
||||
itemName := strings.TrimSpace(m[1])
|
||||
shn := m[2]
|
||||
rawValue := strings.TrimSpace(m[3])
|
||||
// strip trailing comment (space + //)
|
||||
if idx := strings.LastIndex(rawValue, " //"); idx >= 0 {
|
||||
rawValue = strings.TrimSpace(rawValue[:idx])
|
||||
}
|
||||
// strip surrounding double quotes from string values
|
||||
if len(rawValue) >= 2 && rawValue[0] == '"' && rawValue[len(rawValue)-1] == '"' {
|
||||
rawValue = rawValue[1 : len(rawValue)-1]
|
||||
}
|
||||
displayName := itemName
|
||||
if currentSection != "" {
|
||||
displayName = currentSection + " / " + itemName
|
||||
}
|
||||
fields = append(fields, dmiField{Name: displayName, Shn: shn, Value: rawValue})
|
||||
}
|
||||
return fields
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISAADMIRead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
tmpDir, err := os.MkdirTemp("", "bee-saa-*")
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "create temp dir: "+err.Error())
|
||||
return
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
dmiFile := filepath.Join(tmpDir, "DMI.txt")
|
||||
cmd := exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite")
|
||||
cmd.Dir = "/usr/local/bin"
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, "saa GetDmiInfo: "+msg)
|
||||
return
|
||||
}
|
||||
|
||||
raw, err := os.ReadFile(dmiFile)
|
||||
if err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "read DMI file: "+err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
fields := parseDMIFile(string(raw))
|
||||
if len(fields) == 0 {
|
||||
writeError(w, http.StatusInternalServerError, "no DMI fields found (file may be empty — reboot the server and try again)")
|
||||
return
|
||||
}
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISAADMIWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []saaChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid request body")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
for _, c := range req.Changes {
|
||||
if !shnRE.MatchString(c.Shn) {
|
||||
writeError(w, http.StatusUnprocessableEntity, "invalid shn: "+c.Shn)
|
||||
return
|
||||
}
|
||||
if len(c.Value) == 0 || len(c.Value) > 64 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value length out of range for shn: "+c.Shn)
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch < 0x20 || ch > 0x7E {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value contains non-printable character for shn: "+c.Shn)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("saa-dmi-write"),
|
||||
Name: fmt.Sprintf("SAA DMI Write (%d field(s))", len(req.Changes)),
|
||||
Target: "saa-dmi-write",
|
||||
Priority: defaultTaskPriority("saa-dmi-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{
|
||||
SAADmiChanges: req.Changes,
|
||||
},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runSAADMIWriteTask(ctx context.Context, j *jobState, exportDir string, p taskParams) error {
|
||||
tmpDir, err := os.MkdirTemp("", "bee-saa-*")
|
||||
if err != nil {
|
||||
return fmt.Errorf("create temp dir: %w", err)
|
||||
}
|
||||
defer os.RemoveAll(tmpDir)
|
||||
dmiFile := filepath.Join(tmpDir, "DMI.txt")
|
||||
|
||||
j.append("Reading current DMI configuration...")
|
||||
getCmd := exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite")
|
||||
getCmd.Dir = "/usr/local/bin"
|
||||
if err := streamCmdJob(j, getCmd); err != nil {
|
||||
return fmt.Errorf("GetDmiInfo: %w", err)
|
||||
}
|
||||
|
||||
backupDir := filepath.Join(exportDir, "dmi-backups")
|
||||
if err := os.MkdirAll(backupDir, 0o755); err != nil {
|
||||
return fmt.Errorf("create backup dir: %w", err)
|
||||
}
|
||||
backupName := "dmi-" + time.Now().UTC().Format("20060102-150405") + ".txt"
|
||||
backupPath := filepath.Join(backupDir, backupName)
|
||||
raw, err := os.ReadFile(dmiFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read DMI file: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(backupPath, raw, 0o644); err != nil {
|
||||
return fmt.Errorf("write backup: %w", err)
|
||||
}
|
||||
j.append("Backup saved: dmi-backups/" + backupName)
|
||||
|
||||
for _, c := range p.SAADmiChanges {
|
||||
j.append("Setting " + c.Shn + " = " + c.Value)
|
||||
cmd := exec.CommandContext(ctx, "saa", "-c", "EditDmiInfo", "--file", dmiFile, "--shn", c.Shn, "--value", c.Value)
|
||||
cmd.Dir = "/usr/local/bin"
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("EditDmiInfo %s: %w", c.Shn, err)
|
||||
}
|
||||
}
|
||||
|
||||
j.append("Applying changes to hardware...")
|
||||
changeCmd := exec.CommandContext(ctx, "saa", "-c", "ChangeDmiInfo", "--file", dmiFile)
|
||||
changeCmd.Dir = "/usr/local/bin"
|
||||
if err := streamCmdJob(j, changeCmd); err != nil {
|
||||
return fmt.Errorf("ChangeDmiInfo: %w", err)
|
||||
}
|
||||
|
||||
j.append("Done. Reboot the server for changes to take effect.")
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -221,6 +221,11 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
h.kmsg = newKmsgWatcher(opts.App.StatusDB)
|
||||
h.kmsg.start()
|
||||
globalQueue.kmsgWatcher = h.kmsg
|
||||
|
||||
// Start periodic health poller for components that don't emit kernel log events (e.g. PSU).
|
||||
if opts.App.StatusDB != nil {
|
||||
newHealthPoller(opts.App.StatusDB).start()
|
||||
}
|
||||
}
|
||||
|
||||
globalQueue.startWorker(&opts)
|
||||
@@ -301,11 +306,23 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Export
|
||||
mux.HandleFunc("GET /api/export/list", h.handleAPIExportList)
|
||||
mux.HandleFunc("GET /api/export/usb", h.handleAPIExportUSBTargets)
|
||||
mux.HandleFunc("POST /api/export/usb/audit", h.handleAPIExportUSBAudit)
|
||||
mux.HandleFunc("POST /api/export/usb/bundle", h.handleAPIExportUSBBundle)
|
||||
mux.HandleFunc("GET /api/blackbox/status", h.handleAPIBlackboxStatus)
|
||||
mux.HandleFunc("POST /api/blackbox/enable", h.handleAPIBlackboxEnable)
|
||||
mux.HandleFunc("POST /api/blackbox/disable", h.handleAPIBlackboxDisable)
|
||||
|
||||
// Tools
|
||||
mux.HandleFunc("GET /api/tools/check", h.handleAPIToolsCheck)
|
||||
mux.HandleFunc("GET /api/tools/nvme-formats", h.handleAPINVMeFormats)
|
||||
mux.HandleFunc("POST /api/tools/nvme-format/run", h.handleAPINVMeFormatRun)
|
||||
mux.HandleFunc("GET /api/tools/saa-dmi", h.handleAPISAADMIRead)
|
||||
mux.HandleFunc("POST /api/tools/saa-dmi/write", h.handleAPISAADMIWrite)
|
||||
mux.HandleFunc("GET /api/tools/ipmi-fru", h.handleAPIIPMIFRURead)
|
||||
mux.HandleFunc("POST /api/tools/ipmi-fru/write", h.handleAPIIPMIFRUWrite)
|
||||
mux.HandleFunc("GET /api/tools/huawei-elabel", h.handleAPIHuaweiElabelRead)
|
||||
mux.HandleFunc("POST /api/tools/huawei-elabel/write", h.handleAPIHuaweiElabelWrite)
|
||||
mux.HandleFunc("GET /api/tools/raid/status", h.handleAPIRAIDStatus)
|
||||
mux.HandleFunc("POST /api/tools/raid/foreign", h.handleAPIRAIDForeignAction)
|
||||
mux.HandleFunc("POST /api/tools/raid/create-mirror", h.handleAPIRAIDCreateMirror)
|
||||
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
@@ -317,6 +334,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// System
|
||||
mux.HandleFunc("GET /api/system/ram-status", h.handleAPIRAMStatus)
|
||||
mux.HandleFunc("POST /api/system/install-to-ram", h.handleAPIInstallToRAM)
|
||||
mux.HandleFunc("POST /api/system/reboot", h.handleAPISystemReboot)
|
||||
mux.HandleFunc("POST /api/system/shutdown", h.handleAPISystemShutdown)
|
||||
|
||||
// Preflight
|
||||
mux.HandleFunc("GET /api/preflight", h.handleAPIPreflight)
|
||||
@@ -325,6 +344,10 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("GET /api/install/disks", h.handleAPIInstallDisks)
|
||||
mux.HandleFunc("POST /api/install/run", h.handleAPIInstallRun)
|
||||
|
||||
// Hardware component detail (fragment for modal in Hardware Summary card)
|
||||
mux.HandleFunc("GET /api/hardware-summary", h.handleAPIHardwareSummary)
|
||||
mux.HandleFunc("GET /api/components/{type}", h.handleAPIComponentDetail)
|
||||
|
||||
// Metrics — SSE stream of live sensor data + server-side SVG charts + CSV export
|
||||
mux.HandleFunc("GET /api/metrics/stream", h.handleAPIMetricsStream)
|
||||
mux.HandleFunc("GET /api/metrics/latest", h.handleAPIMetricsLatest)
|
||||
@@ -571,6 +594,7 @@ func (h *handler) handleExportIndex(w http.ResponseWriter, r *http.Request) {
|
||||
|
||||
func (h *handler) handleViewer(w http.ResponseWriter, r *http.Request) {
|
||||
snapshot, _ := loadSnapshot(h.opts.AuditPath)
|
||||
snapshot = enrichSnapshotForViewer(snapshot)
|
||||
body, err := viewer.RenderHTML(snapshot, h.opts.Title)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
@@ -1290,8 +1314,8 @@ const loadingPageHTML = `<!DOCTYPE html>
|
||||
*{margin:0;padding:0;box-sizing:border-box}
|
||||
html,body{height:100%;background:#0f1117;display:flex;align-items:center;justify-content:center;font-family:'Courier New',monospace;color:#e2e8f0}
|
||||
.wrap{text-align:center;width:420px}
|
||||
.logo{font-size:11px;line-height:1.4;color:#f6c90e;margin-bottom:6px;white-space:pre;text-align:left}
|
||||
.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px;padding-left:2px}
|
||||
.brand{font-size:22px;letter-spacing:.18em;color:#f6c90e;margin-bottom:6px;text-align:left}
|
||||
.subtitle{font-size:12px;color:#a0aec0;text-align:left;margin-bottom:24px}
|
||||
.spinner{width:36px;height:36px;border:3px solid #2d3748;border-top-color:#f6c90e;border-radius:50%;animation:spin .8s linear infinite;margin:0 auto 14px}
|
||||
.spinner.hidden{display:none}
|
||||
@keyframes spin{to{transform:rotate(360deg)}}
|
||||
@@ -1309,12 +1333,7 @@ td:first-child{color:#718096;width:55%}
|
||||
</head>
|
||||
<body>
|
||||
<div class="wrap">
|
||||
<div class="logo"> ███████╗ █████╗ ███████╗██╗ ██╗ ██████╗ ███████╗███████╗
|
||||
██╔════╝██╔══██╗██╔════╝╚██╗ ██╔╝ ██╔══██╗██╔════╝██╔════╝
|
||||
█████╗ ███████║███████╗ ╚████╔╝ █████╗██████╔╝█████╗ █████╗
|
||||
██╔══╝ ██╔══██║╚════██║ ╚██╔╝ ╚════╝██╔══██╗██╔══╝ ██╔══╝
|
||||
███████╗██║ ██║███████║ ██║ ██████╔╝███████╗███████╗
|
||||
╚══════╝╚═╝ ╚═╝╚══════╝ ╚═╝ ╚═════╝ ╚══════╝╚══════╝</div>
|
||||
<div class="brand">EASY BEE</div>
|
||||
<div class="subtitle">Hardware Audit LiveCD</div>
|
||||
<div class="spinner" id="spin"></div>
|
||||
<div class="status" id="st">Connecting to bee-web...</div>
|
||||
@@ -1324,8 +1343,20 @@ td:first-child{color:#718096;width:55%}
|
||||
<script>
|
||||
(function(){
|
||||
var gone = false;
|
||||
var pollStarted = false;
|
||||
var fallbackOpenTimer = null;
|
||||
var AUTO_OPEN_DELAY_MS = 15000;
|
||||
function go(){ if(!gone){gone=true;window.location.replace('/');} }
|
||||
|
||||
function scheduleFallbackOpen(){
|
||||
if(fallbackOpenTimer!==null) return;
|
||||
fallbackOpenTimer=setTimeout(function(){
|
||||
document.getElementById('spin').className='spinner hidden';
|
||||
document.getElementById('st').textContent='Startup checks are taking too long — opening app...';
|
||||
go();
|
||||
},AUTO_OPEN_DELAY_MS);
|
||||
}
|
||||
|
||||
function icon(s){
|
||||
if(s==='active') return '<span class="ok">● active</span>';
|
||||
if(s==='failed') return '<span class="fail">✕ failed</span>';
|
||||
@@ -1357,6 +1388,7 @@ function pollServices(){
|
||||
tbl.innerHTML=html;
|
||||
if(allSettled(svcs)){
|
||||
clearInterval(pollTimer);
|
||||
if(fallbackOpenTimer!==null) clearTimeout(fallbackOpenTimer);
|
||||
document.getElementById('spin').className='spinner hidden';
|
||||
document.getElementById('st').textContent='Ready \u2014 opening...';
|
||||
setTimeout(go,800);
|
||||
@@ -1371,8 +1403,12 @@ function probe(){
|
||||
if(r.ok){
|
||||
document.getElementById('st').textContent='bee-web running \u2014 checking services...';
|
||||
document.getElementById('btn').style.display='';
|
||||
pollServices();
|
||||
pollTimer=setInterval(pollServices,1500);
|
||||
scheduleFallbackOpen();
|
||||
if(!pollStarted){
|
||||
pollStarted=true;
|
||||
pollServices();
|
||||
pollTimer=setInterval(pollServices,1500);
|
||||
}
|
||||
} else {
|
||||
document.getElementById('st').textContent='bee-web starting (status '+r.status+')...';
|
||||
setTimeout(probe,500);
|
||||
@@ -1394,14 +1430,17 @@ func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
|
||||
if page == "" {
|
||||
page = "dashboard"
|
||||
}
|
||||
// Redirect old routes to new names
|
||||
// Redirect legacy routes to new named pages
|
||||
switch page {
|
||||
case "tests":
|
||||
http.Redirect(w, r, "/validate", http.StatusMovedPermanently)
|
||||
case "validate", "tests":
|
||||
http.Redirect(w, r, "/load", http.StatusMovedPermanently)
|
||||
return
|
||||
case "burn-in":
|
||||
http.Redirect(w, r, "/burn", http.StatusMovedPermanently)
|
||||
return
|
||||
case "speed", "endurance":
|
||||
http.Redirect(w, r, "/benchmark", http.StatusMovedPermanently)
|
||||
return
|
||||
}
|
||||
body := renderPage(page, h.opts)
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
|
||||
@@ -604,6 +604,25 @@ func TestReadyIsOKWhenAuditPathIsUnset(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadingPageHasFallbackAutoOpen(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/loading", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d body=%s", rec.Code, rec.Body.String())
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`var AUTO_OPEN_DELAY_MS = 15000;`,
|
||||
`function scheduleFallbackOpen(){`,
|
||||
`Startup checks are taking too long — opening app...`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("loading page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditPageRendersViewerFrameAndActions(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -647,35 +666,51 @@ func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
||||
|
||||
func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
|
||||
// /tools: only NVMe Block Format and Supermicro DMI remain
|
||||
recTools := httptest.NewRecorder()
|
||||
handler.ServeHTTP(recTools, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
if recTools.Code != http.StatusOK {
|
||||
t.Fatalf("tools status=%d", recTools.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `NVIDIA Self Heal`) {
|
||||
t.Fatalf("tools page missing nvidia self heal section: %s", body)
|
||||
toolsBody := recTools.Body.String()
|
||||
if !strings.Contains(toolsBody, `NVMe Block Format`) {
|
||||
t.Fatalf("tools page missing nvme block format section: %s", toolsBody)
|
||||
}
|
||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||
if !strings.Contains(toolsBody, `/api/tools/nvme-formats`) || !strings.Contains(toolsBody, `/api/tools/nvme-format/run`) {
|
||||
t.Fatalf("tools page missing nvme format api usage: %s", toolsBody)
|
||||
}
|
||||
if !strings.Contains(body, `nvidiaRestartDrivers()`) {
|
||||
t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
|
||||
|
||||
// /settings: system install, support bundle, tool check, nvidia self heal, network, services
|
||||
recSettings := httptest.NewRecorder()
|
||||
handler.ServeHTTP(recSettings, httptest.NewRequest(http.MethodGet, "/settings", nil))
|
||||
if recSettings.Code != http.StatusOK {
|
||||
t.Fatalf("settings status=%d", recSettings.Code)
|
||||
}
|
||||
if !strings.Contains(body, `/api/gpu/nvidia-status`) {
|
||||
t.Fatalf("tools page missing nvidia status api usage: %s", body)
|
||||
settingsBody := recSettings.Body.String()
|
||||
if !strings.Contains(settingsBody, `NVIDIA Self Heal`) {
|
||||
t.Fatalf("settings page missing nvidia self heal section: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `nvidiaResetGPU(`) {
|
||||
t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
|
||||
if !strings.Contains(settingsBody, `Restart GPU Drivers`) {
|
||||
t.Fatalf("settings page missing restart gpu drivers button: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
if !strings.Contains(settingsBody, `nvidiaRestartDrivers()`) {
|
||||
t.Fatalf("settings page missing nvidiaRestartDrivers action: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `Export to USB`) {
|
||||
t.Fatalf("tools page missing export to usb section: %s", body)
|
||||
if !strings.Contains(settingsBody, `/api/gpu/nvidia-status`) {
|
||||
t.Fatalf("settings page missing nvidia status api usage: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `Support Bundle</button>`) {
|
||||
t.Fatalf("tools page missing support bundle usb button: %s", body)
|
||||
if !strings.Contains(settingsBody, `nvidiaResetGPU(`) {
|
||||
t.Fatalf("settings page missing nvidiaResetGPU action: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(settingsBody, `id="boot-source-text"`) {
|
||||
t.Fatalf("settings page missing boot source field: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(settingsBody, `USB Black-Box`) {
|
||||
t.Fatalf("settings page missing usb black-box section: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(settingsBody, `/api/blackbox/status`) {
|
||||
t.Fatalf("settings page missing black-box status api usage: %s", settingsBody)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -766,46 +801,45 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaTargetedStressCard(t *testing.T) {
|
||||
func TestCheckPageRendersGPUSelectionAndNvidiaCards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA GPU Targeted Stress`,
|
||||
`nvidia-targeted-stress`,
|
||||
`controlled NVIDIA DCGM load`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
`NVIDIA GPU Selection`,
|
||||
`All NVIDIA validate tasks use only the GPUs selected here.`,
|
||||
`Select All`,
|
||||
`id="sat-gpu-list"`,
|
||||
`Select All`,
|
||||
`id="sat-btn-nvidia"`,
|
||||
`NVIDIA Interconnect (NCCL)`,
|
||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||
`Non-destructive`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
t.Fatalf("check page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestValidatePageRendersNvidiaFabricCardsInValidateMode(t *testing.T) {
|
||||
func TestCheckPageRendersNvidiaFabricCards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/validate", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/check", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`NVIDIA Interconnect (NCCL)`,
|
||||
`Validate and Stress:`,
|
||||
`NVIDIA Bandwidth (NVBandwidth)`,
|
||||
`nvbandwidth runs all built-in tests without a time limit`,
|
||||
`nvbandwidth`,
|
||||
`all_reduce_perf`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("validate page missing %q: %s", needle, body)
|
||||
t.Fatalf("check page missing %q: %s", needle, body)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -822,7 +856,6 @@ func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
`NVIDIA Max Compute Load`,
|
||||
`dcgmproftester`,
|
||||
`NCCL`,
|
||||
`Validate → Stress mode`,
|
||||
`id="burn-gpu-list"`,
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
@@ -1016,6 +1049,39 @@ func TestViewerRendersLatestSnapshot(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestViewerRendersDerivedStorageBlockFormat(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
body := `{
|
||||
"collected_at":"2026-04-29T00:05:00Z",
|
||||
"hardware":{
|
||||
"board":{"serial_number":"SERIAL-NEW"},
|
||||
"storage":[
|
||||
{
|
||||
"serial_number":"DISK-1",
|
||||
"model":"Test NVMe",
|
||||
"logical_block_size_bytes":512,
|
||||
"physical_block_size_bytes":4096,
|
||||
"metadata_bytes_per_block":8
|
||||
}
|
||||
]
|
||||
}
|
||||
}`
|
||||
if err := os.WriteFile(path, []byte(body), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/viewer", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
if !strings.Contains(rec.Body.String(), "512+8") {
|
||||
t.Fatalf("viewer body missing derived block format: %s", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
@@ -1038,6 +1104,36 @@ func TestAuditJSONServesLatestSnapshot(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAuditJSONDoesNotInjectDerivedStorageBlockFormat(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
body := `{
|
||||
"hardware":{
|
||||
"board":{"serial_number":"SERIAL-API"},
|
||||
"storage":[
|
||||
{
|
||||
"serial_number":"DISK-1",
|
||||
"logical_block_size_bytes":512,
|
||||
"metadata_bytes_per_block":8
|
||||
}
|
||||
]
|
||||
}
|
||||
}`
|
||||
if err := os.WriteFile(path, []byte(body), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
handler := NewHandler(HandlerOptions{AuditPath: path})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/audit.json", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
if strings.Contains(rec.Body.String(), "block_format") {
|
||||
t.Fatalf("audit.json should remain contract-only: %s", rec.Body.String())
|
||||
}
|
||||
}
|
||||
|
||||
func TestMissingAuditJSONReturnsNotFound(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{AuditPath: "/missing/audit.json"})
|
||||
rec := httptest.NewRecorder()
|
||||
@@ -1131,7 +1227,8 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
],
|
||||
"services":[
|
||||
{"name":"bee-web","status":"active"},
|
||||
{"name":"bee-nvidia","status":"inactive"}
|
||||
{"name":"bee-audit","status":"inactive"},
|
||||
{"name":"bee-nvidia","status":"failed"}
|
||||
]
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
|
||||
@@ -1185,7 +1282,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
`Bee Services`,
|
||||
`CUDA runtime is not ready for GPU SAT.`,
|
||||
`Missing: nvidia-smi`,
|
||||
`bee-nvidia=inactive`,
|
||||
`bee-nvidia=failed`,
|
||||
// Hardware Summary card — component health badges
|
||||
`Hardware Summary`,
|
||||
`>CPU<`,
|
||||
|
||||
545
audit/internal/webui/task_runner.go
Normal file
545
audit/internal/webui/task_runner.go
Normal file
@@ -0,0 +1,545 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
"bee/audit/internal/platform"
|
||||
"bee/audit/internal/runtimeenv"
|
||||
)
|
||||
|
||||
type taskRunnerState struct {
|
||||
PID int `json:"pid"`
|
||||
Status string `json:"status"`
|
||||
Error string `json:"error,omitempty"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
func taskRunnerStatePath(t *Task) string {
|
||||
if t == nil || strings.TrimSpace(t.ArtifactsDir) == "" {
|
||||
return ""
|
||||
}
|
||||
return filepath.Join(t.ArtifactsDir, "runner-state.json")
|
||||
}
|
||||
|
||||
func writeTaskRunnerState(t *Task, state taskRunnerState) error {
|
||||
path := taskRunnerStatePath(t)
|
||||
if path == "" {
|
||||
return nil
|
||||
}
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
data, err := json.MarshalIndent(state, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, data, 0644); err != nil {
|
||||
return err
|
||||
}
|
||||
return os.Rename(tmp, path)
|
||||
}
|
||||
|
||||
func readTaskRunnerState(t *Task) (taskRunnerState, bool) {
|
||||
path := taskRunnerStatePath(t)
|
||||
if path == "" {
|
||||
return taskRunnerState{}, false
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil || len(data) == 0 {
|
||||
return taskRunnerState{}, false
|
||||
}
|
||||
var state taskRunnerState
|
||||
if err := json.Unmarshal(data, &state); err != nil {
|
||||
return taskRunnerState{}, false
|
||||
}
|
||||
return state, true
|
||||
}
|
||||
|
||||
func processAlive(pid int) bool {
|
||||
if pid <= 0 {
|
||||
return false
|
||||
}
|
||||
err := syscall.Kill(pid, 0)
|
||||
return err == nil || err == syscall.EPERM
|
||||
}
|
||||
|
||||
func finalizeTaskForResult(t *Task, errMsg string, cancelled bool) {
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
switch {
|
||||
case cancelled:
|
||||
t.Status = TaskCancelled
|
||||
t.ErrMsg = "aborted"
|
||||
case strings.TrimSpace(errMsg) != "":
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = errMsg
|
||||
default:
|
||||
t.Status = TaskDone
|
||||
t.ErrMsg = ""
|
||||
}
|
||||
}
|
||||
|
||||
func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx context.Context) {
|
||||
if opts == nil {
|
||||
j.append("ERROR: handler options not configured")
|
||||
j.finish("handler options not configured")
|
||||
return
|
||||
}
|
||||
a := opts.App
|
||||
|
||||
recovered := len(j.lines) > 0
|
||||
j.append(fmt.Sprintf("Starting %s...", t.Name))
|
||||
if recovered {
|
||||
j.append(fmt.Sprintf("Recovered after bee-web restart at %s", time.Now().UTC().Format(time.RFC3339)))
|
||||
}
|
||||
|
||||
var (
|
||||
archive string
|
||||
err error
|
||||
)
|
||||
|
||||
switch t.Target {
|
||||
case "nvidia":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
diagLevel := 2
|
||||
if t.params.StressMode {
|
||||
diagLevel = 3
|
||||
}
|
||||
if len(t.params.GPUIndices) > 0 || diagLevel > 0 {
|
||||
result, e := a.RunNvidiaAcceptancePackWithOptions(ctx, "", diagLevel, t.params.GPUIndices, j.append)
|
||||
if e != nil {
|
||||
err = e
|
||||
} else {
|
||||
archive = result.Body
|
||||
}
|
||||
} else {
|
||||
archive, err = a.RunNvidiaAcceptancePack("", j.append)
|
||||
}
|
||||
case "nvidia-targeted-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if dur <= 0 {
|
||||
dur = 300
|
||||
}
|
||||
archive, err = a.RunNvidiaTargetedStressValidatePack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
case "nvidia-bench-perf":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaBenchmarkCtx(ctx, "", platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
SizeMB: t.params.SizeMB,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RunNCCL: t.params.RunNCCL,
|
||||
ParallelGPUs: t.params.ParallelGPUs,
|
||||
RampStep: t.params.RampStep,
|
||||
RampTotal: t.params.RampTotal,
|
||||
RampRunID: t.params.RampRunID,
|
||||
}, j.append)
|
||||
case "nvidia-bench-power":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaPowerBenchCtx(ctx, app.DefaultBeeBenchPowerDir, platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
RampStep: t.params.RampStep,
|
||||
RampTotal: t.params.RampTotal,
|
||||
RampRunID: t.params.RampRunID,
|
||||
}, j.append)
|
||||
case "nvidia-bench-autotune":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaPowerSourceAutotuneCtx(ctx, app.DefaultBeeBenchAutotuneDir, platform.NvidiaBenchmarkOptions{
|
||||
Profile: t.params.BenchmarkProfile,
|
||||
SizeMB: t.params.SizeMB,
|
||||
}, t.params.BenchmarkKind, j.append)
|
||||
case "nvidia-compute":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if planErr != nil {
|
||||
err = planErr
|
||||
break
|
||||
}
|
||||
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||
dur = rampPlan.DurationSec
|
||||
}
|
||||
if rampPlan.StaggerSeconds > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||
}
|
||||
archive, err = a.RunNvidiaOfficialComputePack(ctx, "", dur, t.params.GPUIndices, rampPlan.StaggerSeconds, j.append)
|
||||
case "nvidia-targeted-power":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = a.RunNvidiaTargetedPowerPack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
case "nvidia-pulse":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = a.RunNvidiaPulseTestPack(ctx, "", dur, t.params.GPUIndices, j.append)
|
||||
case "nvidia-bandwidth":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNvidiaBandwidthPack(ctx, "", t.params.GPUIndices, j.append)
|
||||
case "nvidia-interconnect":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = a.RunNCCLTests(ctx, "", t.params.GPUIndices, j.append)
|
||||
case "nvidia-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
rampPlan, planErr := resolveNvidiaRampPlan(t.params.BurnProfile, t.params.StaggerGPUStart, t.params.GPUIndices)
|
||||
if planErr != nil {
|
||||
err = planErr
|
||||
break
|
||||
}
|
||||
if t.params.BurnProfile != "" && t.params.StaggerGPUStart && dur <= 0 {
|
||||
dur = rampPlan.DurationSec
|
||||
}
|
||||
if rampPlan.StaggerSeconds > 0 {
|
||||
j.append(fmt.Sprintf("NVIDIA staggered ramp-up enabled: %ds per GPU; post-ramp hold: %ds; total runtime: %ds", rampPlan.StaggerSeconds, dur, rampPlan.TotalDurationSec))
|
||||
}
|
||||
archive, err = runNvidiaStressPackCtx(a, ctx, "", platform.NvidiaStressOptions{
|
||||
DurationSec: dur,
|
||||
Loader: t.params.Loader,
|
||||
GPUIndices: t.params.GPUIndices,
|
||||
ExcludeGPUIndices: t.params.ExcludeGPUIndices,
|
||||
StaggerSeconds: rampPlan.StaggerSeconds,
|
||||
}, j.append)
|
||||
case "memory":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
sizeMB, passes := resolveMemoryValidatePreset(t.params.BurnProfile, t.params.StressMode)
|
||||
j.append(fmt.Sprintf("Memory validate preset: %d MB x %d pass(es)", sizeMB, passes))
|
||||
archive, err = runMemoryAcceptancePackCtx(a, ctx, "", sizeMB, passes, j.append)
|
||||
case "storage":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||
case "cpu":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
if dur <= 0 {
|
||||
if t.params.StressMode {
|
||||
dur = 1800
|
||||
} else {
|
||||
dur = 60
|
||||
}
|
||||
}
|
||||
j.append(fmt.Sprintf("CPU stress duration: %ds", dur))
|
||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||
case "amd":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "amd-mem":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||
case "amd-bandwidth":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||
case "amd-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runAMDStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "memory-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runMemoryStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "sat-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
dur = resolveBurnPreset(t.params.BurnProfile).DurationSec
|
||||
}
|
||||
archive, err = runSATStressPackCtx(a, ctx, "", dur, j.append)
|
||||
case "platform-stress":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
runOpts := resolvePlatformStressPreset(t.params.BurnProfile)
|
||||
runOpts.Components = t.params.PlatformComponents
|
||||
archive, err = a.RunPlatformStress(ctx, "", runOpts, j.append)
|
||||
case "audit":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
result, e := a.RunAuditNow(opts.RuntimeMode)
|
||||
if e != nil {
|
||||
err = e
|
||||
} else {
|
||||
for _, line := range splitLines(result.Body) {
|
||||
j.append(line)
|
||||
}
|
||||
}
|
||||
case "support-bundle":
|
||||
j.append("Building support bundle...")
|
||||
archive, err = buildSupportBundle(opts.ExportDir)
|
||||
case "install":
|
||||
if strings.TrimSpace(t.params.Device) == "" {
|
||||
err = fmt.Errorf("device is required")
|
||||
break
|
||||
}
|
||||
installLogPath := platform.InstallLogPath(t.params.Device)
|
||||
j.append("Install log: " + installLogPath)
|
||||
err = streamCmdJob(j, installCommand(ctx, t.params.Device, installLogPath))
|
||||
case "install-to-ram":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
err = a.RunInstallToRAM(ctx, j.append)
|
||||
case "nvme-format":
|
||||
if strings.TrimSpace(t.params.Device) == "" {
|
||||
err = fmt.Errorf("device is required")
|
||||
break
|
||||
}
|
||||
err = runNVMeFormatTask(ctx, j, t.params.Device, t.params.LBAF)
|
||||
case "saa-dmi-write":
|
||||
if len(t.params.SAADmiChanges) == 0 {
|
||||
err = fmt.Errorf("no changes provided")
|
||||
break
|
||||
}
|
||||
err = runSAADMIWriteTask(ctx, j, opts.ExportDir, t.params)
|
||||
case "ipmi-fru-write":
|
||||
if len(t.params.FRUChanges) == 0 {
|
||||
err = fmt.Errorf("no changes provided")
|
||||
break
|
||||
}
|
||||
err = runIPMIFRUWriteTask(ctx, j, opts.ExportDir, t.params)
|
||||
case "huawei-elabel-write":
|
||||
if len(t.params.HuaweiElabelChanges) == 0 {
|
||||
err = fmt.Errorf("no changes provided")
|
||||
break
|
||||
}
|
||||
err = runHuaweiElabelWriteTask(ctx, j, t.params)
|
||||
case "raid-foreign-clear":
|
||||
err = runRAIDForeignClearTask(ctx, j, t.params.RAIDController)
|
||||
case "raid-foreign-import":
|
||||
err = runRAIDForeignImportTask(ctx, j, t.params.RAIDController)
|
||||
case "raid-lsi-create-mirror":
|
||||
if len(t.params.RAIDDevices) < 2 {
|
||||
err = fmt.Errorf("at least 2 drives required")
|
||||
break
|
||||
}
|
||||
err = runRAIDLSICreateMirrorTask(ctx, j, t.params.RAIDController, t.params.RAIDDevices)
|
||||
case "raid-vroc-create-mirror":
|
||||
if len(t.params.RAIDDevices) < 2 {
|
||||
err = fmt.Errorf("at least 2 devices required")
|
||||
break
|
||||
}
|
||||
err = runRAIDVROCCreateMirrorTask(ctx, j, t.params.RAIDDevices, t.params.RAIDArrayName)
|
||||
default:
|
||||
j.append("ERROR: unknown target: " + t.Target)
|
||||
j.finish("unknown target")
|
||||
return
|
||||
}
|
||||
|
||||
if archive != "" {
|
||||
archivePath := app.ExtractArchivePath(archive)
|
||||
if err == nil && app.ReadSATOverallStatus(archivePath) == "FAILED" {
|
||||
err = fmt.Errorf("SAT overall_status=FAILED (see summary.txt)")
|
||||
}
|
||||
if opts.App != nil && opts.App.StatusDB != nil {
|
||||
app.ApplySATResultToDB(opts.App.StatusDB, t.Target, archivePath)
|
||||
}
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
if ctx.Err() != nil {
|
||||
j.append("Aborted.")
|
||||
j.finish("aborted")
|
||||
} else {
|
||||
j.append("ERROR: " + err.Error())
|
||||
j.finish(err.Error())
|
||||
}
|
||||
return
|
||||
}
|
||||
if archive != "" {
|
||||
j.append("Archive: " + archive)
|
||||
}
|
||||
j.finish("")
|
||||
}
|
||||
|
||||
func loadPersistedTask(statePath, taskID string) (*Task, error) {
|
||||
data, err := os.ReadFile(statePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var persisted []persistedTask
|
||||
if err := json.Unmarshal(data, &persisted); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, pt := range persisted {
|
||||
if pt.ID != taskID {
|
||||
continue
|
||||
}
|
||||
t := &Task{
|
||||
ID: pt.ID,
|
||||
Name: pt.Name,
|
||||
Target: pt.Target,
|
||||
Priority: pt.Priority,
|
||||
Status: pt.Status,
|
||||
CreatedAt: pt.CreatedAt,
|
||||
StartedAt: pt.StartedAt,
|
||||
DoneAt: pt.DoneAt,
|
||||
ErrMsg: pt.ErrMsg,
|
||||
LogPath: pt.LogPath,
|
||||
ArtifactsDir: pt.ArtifactsDir,
|
||||
ReportJSONPath: pt.ReportJSONPath,
|
||||
ReportHTMLPath: pt.ReportHTMLPath,
|
||||
params: pt.Params,
|
||||
}
|
||||
ensureTaskReportPaths(t)
|
||||
return t, nil
|
||||
}
|
||||
return nil, fmt.Errorf("task %s not found", taskID)
|
||||
}
|
||||
|
||||
func RunPersistedTask(exportDir, taskID string, stdout, stderr io.Writer) int {
|
||||
if strings.TrimSpace(exportDir) == "" || strings.TrimSpace(taskID) == "" {
|
||||
fmt.Fprintln(stderr, "bee task-run: --export-dir and --task-id are required")
|
||||
return 2
|
||||
}
|
||||
|
||||
runtimeInfo, err := runtimeenv.Detect("auto")
|
||||
if err != nil {
|
||||
slog.Warn("resolve runtime for task-run", "err", err)
|
||||
}
|
||||
opts := &HandlerOptions{
|
||||
ExportDir: exportDir,
|
||||
App: app.New(platform.New()),
|
||||
RuntimeMode: runtimeInfo.Mode,
|
||||
}
|
||||
statePath := filepath.Join(exportDir, "tasks-state.json")
|
||||
task, err := loadPersistedTask(statePath, taskID)
|
||||
if err != nil {
|
||||
fmt.Fprintln(stderr, err.Error())
|
||||
return 1
|
||||
}
|
||||
if task.StartedAt == nil || task.StartedAt.IsZero() {
|
||||
now := time.Now()
|
||||
task.StartedAt = &now
|
||||
}
|
||||
if task.Status == "" {
|
||||
task.Status = TaskRunning
|
||||
}
|
||||
if err := writeTaskRunnerState(task, taskRunnerState{
|
||||
PID: os.Getpid(),
|
||||
Status: TaskRunning,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
fmt.Fprintln(stderr, err.Error())
|
||||
return 1
|
||||
}
|
||||
|
||||
ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
|
||||
defer cancel()
|
||||
|
||||
j := newTaskJobState(task.LogPath, taskSerialPrefix(task))
|
||||
executeTaskWithOptions(opts, task, j, ctx)
|
||||
finalizeTaskForResult(task, j.err, ctx.Err() != nil)
|
||||
if err := writeTaskReportArtifacts(task); err != nil {
|
||||
appendJobLog(task.LogPath, "WARN: task report generation failed: "+err.Error())
|
||||
}
|
||||
j.closeLog()
|
||||
if err := writeTaskRunnerState(task, taskRunnerState{
|
||||
PID: os.Getpid(),
|
||||
Status: task.Status,
|
||||
Error: task.ErrMsg,
|
||||
UpdatedAt: time.Now().UTC(),
|
||||
}); err != nil {
|
||||
fmt.Fprintln(stderr, err.Error())
|
||||
}
|
||||
if task.ErrMsg != "" {
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"os"
|
||||
@@ -13,6 +14,7 @@ import (
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/app"
|
||||
@@ -55,6 +57,7 @@ var taskNames = map[string]string{
|
||||
"support-bundle": "Support Bundle",
|
||||
"install": "Install to Disk",
|
||||
"install-to-ram": "Install to RAM",
|
||||
"nvme-format": "NVMe Block Format Change",
|
||||
}
|
||||
|
||||
// burnNames maps target → human-readable name when a burn profile is set.
|
||||
@@ -110,8 +113,9 @@ type Task struct {
|
||||
ReportHTMLPath string `json:"report_html_path,omitempty"`
|
||||
|
||||
// runtime fields (not serialised)
|
||||
job *jobState
|
||||
params taskParams
|
||||
job *jobState
|
||||
runnerPID int
|
||||
params taskParams
|
||||
}
|
||||
|
||||
// taskParams holds optional parameters parsed from the run request.
|
||||
@@ -133,8 +137,15 @@ type taskParams struct {
|
||||
RampTotal int `json:"ramp_total,omitempty"`
|
||||
RampRunID string `json:"ramp_run_id,omitempty"`
|
||||
DisplayName string `json:"display_name,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
LBAF int `json:"lbaf,omitempty"`
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
SAADmiChanges []saaChange `json:"saa_dmi_changes,omitempty"`
|
||||
FRUChanges []fruChange `json:"fru_changes,omitempty"`
|
||||
HuaweiElabelChanges []huaweiChange `json:"huawei_elabel_changes,omitempty"`
|
||||
RAIDController int `json:"raid_controller,omitempty"`
|
||||
RAIDDevices []string `json:"raid_devices,omitempty"`
|
||||
RAIDArrayName string `json:"raid_array_name,omitempty"`
|
||||
}
|
||||
|
||||
type persistedTask struct {
|
||||
@@ -328,6 +339,13 @@ var (
|
||||
installCommand = func(ctx context.Context, device string, logPath string) *exec.Cmd {
|
||||
return exec.CommandContext(ctx, "bee-install", device, logPath)
|
||||
}
|
||||
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return exec.Command(exe, "bee-worker", "--export-dir", exportDir, "--task-id", taskID), nil
|
||||
}
|
||||
)
|
||||
|
||||
// enqueue adds a task to the queue and notifies the worker.
|
||||
@@ -365,6 +383,11 @@ func (q *taskQueue) prune() {
|
||||
|
||||
// nextPending returns the highest-priority pending task (nil if none).
|
||||
func (q *taskQueue) nextPending() *Task {
|
||||
for _, t := range q.tasks {
|
||||
if t.Status == TaskRunning {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
var best *Task
|
||||
for _, t := range q.tasks {
|
||||
if t.Status != TaskPending {
|
||||
@@ -484,6 +507,7 @@ func (q *taskQueue) startWorker(opts *HandlerOptions) {
|
||||
if !q.started {
|
||||
q.loadLocked()
|
||||
q.started = true
|
||||
q.resumeRunningTasksLocked()
|
||||
goRecoverLoop("task worker", 2*time.Second, q.worker)
|
||||
}
|
||||
hasPending := q.nextPending() != nil
|
||||
@@ -517,15 +541,12 @@ func (q *taskQueue) worker() {
|
||||
t.StartedAt = &now
|
||||
t.DoneAt = nil
|
||||
t.ErrMsg = ""
|
||||
j := newTaskJobState(t.LogPath, taskSerialPrefix(t))
|
||||
j := newTaskJobState(t.LogPath)
|
||||
t.job = j
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
|
||||
taskCtx, taskCancel := context.WithCancel(context.Background())
|
||||
j.cancel = taskCancel
|
||||
q.executeTask(t, j, taskCtx)
|
||||
taskCancel()
|
||||
q.runTaskExternal(t, j)
|
||||
|
||||
q.mu.Lock()
|
||||
q.prune()
|
||||
@@ -537,6 +558,218 @@ func (q *taskQueue) worker() {
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) resumeRunningTasksLocked() {
|
||||
for _, t := range q.tasks {
|
||||
if t.Status != TaskRunning {
|
||||
continue
|
||||
}
|
||||
if t.job == nil {
|
||||
t.job = newTaskJobState(t.LogPath)
|
||||
}
|
||||
q.attachExternalTaskControlsLocked(t, t.job)
|
||||
q.startRecoveredTaskMonitorLocked(t, t.job)
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) attachExternalTaskControlsLocked(t *Task, j *jobState) {
|
||||
if t == nil || j == nil {
|
||||
return
|
||||
}
|
||||
j.cancel = func() {
|
||||
pid := t.runnerPID
|
||||
if pid <= 0 {
|
||||
if state, ok := readTaskRunnerState(t); ok {
|
||||
pid = state.PID
|
||||
}
|
||||
}
|
||||
if pid > 0 {
|
||||
_ = syscall.Kill(pid, syscall.SIGTERM)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) startRecoveredTaskMonitorLocked(t *Task, j *jobState) {
|
||||
if t == nil || j == nil || t.runnerPID <= 0 {
|
||||
return
|
||||
}
|
||||
goRecoverOnce("task runner monitor", func() {
|
||||
stopTail := make(chan struct{})
|
||||
doneTail := make(chan struct{})
|
||||
go q.followTaskLog(t, j, stopTail, doneTail)
|
||||
for processAlive(t.runnerPID) {
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
}
|
||||
close(stopTail)
|
||||
<-doneTail
|
||||
q.finishExternalTask(t, j, nil)
|
||||
})
|
||||
}
|
||||
|
||||
func (q *taskQueue) runTaskExternal(t *Task, j *jobState) {
|
||||
startedKmsgWatch := false
|
||||
if q.kmsgWatcher != nil && isSATTarget(t.Target) {
|
||||
q.kmsgWatcher.NotifyTaskStarted(t.ID, t.Target)
|
||||
startedKmsgWatch = true
|
||||
}
|
||||
defer func() {
|
||||
if startedKmsgWatch && q.kmsgWatcher != nil {
|
||||
q.kmsgWatcher.NotifyTaskFinished(t.ID)
|
||||
}
|
||||
}()
|
||||
|
||||
stopTail := make(chan struct{})
|
||||
doneTail := make(chan struct{})
|
||||
defer func() {
|
||||
close(stopTail)
|
||||
<-doneTail
|
||||
}()
|
||||
go q.followTaskLog(t, j, stopTail, doneTail)
|
||||
|
||||
cmd, err := externalTaskRunnerCommand(q.opts.ExportDir, t.ID)
|
||||
if err != nil {
|
||||
j.appendFromLog("ERROR: " + err.Error())
|
||||
q.finishExternalTask(t, j, err)
|
||||
return
|
||||
}
|
||||
if err := cmd.Start(); err != nil {
|
||||
j.appendFromLog("ERROR: " + err.Error())
|
||||
q.finishExternalTask(t, j, err)
|
||||
return
|
||||
}
|
||||
|
||||
q.mu.Lock()
|
||||
t.runnerPID = cmd.Process.Pid
|
||||
q.attachExternalTaskControlsLocked(t, j)
|
||||
q.persistLocked()
|
||||
q.mu.Unlock()
|
||||
|
||||
waitErr := cmd.Wait()
|
||||
time.Sleep(200 * time.Millisecond)
|
||||
q.finishExternalTask(t, j, waitErr)
|
||||
}
|
||||
|
||||
func (q *taskQueue) followTaskLog(t *Task, j *jobState, stop <-chan struct{}, done chan<- struct{}) {
|
||||
defer close(done)
|
||||
path := ""
|
||||
if t != nil {
|
||||
path = t.LogPath
|
||||
}
|
||||
if strings.TrimSpace(path) == "" {
|
||||
return
|
||||
}
|
||||
offset := int64(0)
|
||||
if info, err := os.Stat(path); err == nil {
|
||||
offset = info.Size()
|
||||
}
|
||||
var partial string
|
||||
ticker := time.NewTicker(250 * time.Millisecond)
|
||||
defer ticker.Stop()
|
||||
flush := func() {
|
||||
data, newOffset, err := readTaskLogDelta(path, offset)
|
||||
if err != nil || len(data) == 0 {
|
||||
offset = newOffset
|
||||
return
|
||||
}
|
||||
offset = newOffset
|
||||
text := partial + strings.ReplaceAll(string(data), "\r\n", "\n")
|
||||
lines := strings.Split(text, "\n")
|
||||
partial = lines[len(lines)-1]
|
||||
for _, line := range lines[:len(lines)-1] {
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
j.appendFromLog(line)
|
||||
}
|
||||
}
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
flush()
|
||||
case <-stop:
|
||||
flush()
|
||||
if strings.TrimSpace(partial) != "" {
|
||||
j.appendFromLog(partial)
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func readTaskLogDelta(path string, offset int64) ([]byte, int64, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, offset, err
|
||||
}
|
||||
defer f.Close()
|
||||
info, err := f.Stat()
|
||||
if err != nil {
|
||||
return nil, offset, err
|
||||
}
|
||||
if info.Size() < offset {
|
||||
offset = 0
|
||||
}
|
||||
if _, err := f.Seek(offset, io.SeekStart); err != nil {
|
||||
return nil, offset, err
|
||||
}
|
||||
data, err := io.ReadAll(io.LimitReader(f, 1<<20))
|
||||
return data, offset + int64(len(data)), err
|
||||
}
|
||||
|
||||
func (q *taskQueue) finishExternalTask(t *Task, j *jobState, waitErr error) {
|
||||
q.mu.Lock()
|
||||
defer q.mu.Unlock()
|
||||
if t.Status == TaskDone || t.Status == TaskFailed || t.Status == TaskCancelled {
|
||||
if j != nil && !j.isDone() {
|
||||
j.finish(t.ErrMsg)
|
||||
j.closeLog()
|
||||
}
|
||||
select {
|
||||
case q.trigger <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
state, ok := readTaskRunnerState(t)
|
||||
switch {
|
||||
case ok && state.Status != TaskRunning:
|
||||
t.Status = state.Status
|
||||
t.ErrMsg = state.Error
|
||||
now := state.UpdatedAt
|
||||
if now.IsZero() {
|
||||
now = time.Now()
|
||||
}
|
||||
t.DoneAt = &now
|
||||
case waitErr != nil:
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = waitErr.Error()
|
||||
t.DoneAt = &now
|
||||
default:
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.ErrMsg = "task runner exited without final state"
|
||||
t.DoneAt = &now
|
||||
}
|
||||
t.runnerPID = 0
|
||||
q.finalizeTaskArtifactPathsLocked(t)
|
||||
q.persistLocked()
|
||||
|
||||
if j != nil && !j.isDone() {
|
||||
j.finish(t.ErrMsg)
|
||||
j.closeLog()
|
||||
}
|
||||
if t.ErrMsg != "" {
|
||||
taskSerialEvent(t, "finished with status="+t.Status+" error="+t.ErrMsg)
|
||||
} else {
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
}
|
||||
select {
|
||||
case q.trigger <- struct{}{}:
|
||||
default:
|
||||
}
|
||||
}
|
||||
|
||||
func (q *taskQueue) executeTask(t *Task, j *jobState, ctx context.Context) {
|
||||
startedKmsgWatch := false
|
||||
defer q.finalizeTaskRun(t, j)
|
||||
@@ -985,15 +1218,11 @@ func (h *handler) handleAPITasksCancel(w http.ResponseWriter, r *http.Request) {
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
case TaskRunning:
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
if t.job == nil || !t.job.abort() {
|
||||
writeError(w, http.StatusConflict, "task is not cancellable")
|
||||
return
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
now := time.Now()
|
||||
t.DoneAt = &now
|
||||
globalQueue.persistLocked()
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
writeJSON(w, map[string]string{"status": "cancelled"})
|
||||
writeJSON(w, map[string]string{"status": "aborting"})
|
||||
default:
|
||||
writeError(w, http.StatusConflict, "task is not running or pending")
|
||||
}
|
||||
@@ -1039,12 +1268,6 @@ func (h *handler) handleAPITasksCancelAll(w http.ResponseWriter, _ *http.Request
|
||||
if t.job != nil {
|
||||
t.job.abort()
|
||||
}
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
platform.KillTestWorkers()
|
||||
}
|
||||
t.Status = TaskCancelled
|
||||
t.DoneAt = &now
|
||||
taskSerialEvent(t, "finished with status="+t.Status)
|
||||
n++
|
||||
}
|
||||
}
|
||||
@@ -1175,18 +1398,29 @@ func (q *taskQueue) loadLocked() {
|
||||
}
|
||||
q.assignTaskLogPathLocked(t)
|
||||
if t.Status == TaskRunning {
|
||||
// The task was interrupted by a bee-web restart. Child processes
|
||||
// (e.g. bee-gpu-burn-worker, dcgmi/nvvs) survive the restart in
|
||||
// their own process groups. Kill any matching stale workers before
|
||||
// marking the task failed so the next GPU test does not inherit a
|
||||
// busy DCGM slot or duplicate workers.
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
_ = platform.KillTestWorkers()
|
||||
state, ok := readTaskRunnerState(t)
|
||||
switch {
|
||||
case ok && state.Status == TaskRunning && processAlive(state.PID):
|
||||
t.runnerPID = state.PID
|
||||
t.job = newTaskJobState(t.LogPath)
|
||||
case ok && state.Status != TaskRunning:
|
||||
t.runnerPID = state.PID
|
||||
t.Status = state.Status
|
||||
t.ErrMsg = state.Error
|
||||
now := state.UpdatedAt
|
||||
if now.IsZero() {
|
||||
now = time.Now()
|
||||
}
|
||||
t.DoneAt = &now
|
||||
default:
|
||||
if taskMayLeaveOrphanWorkers(t.Target) {
|
||||
_ = platform.KillTestWorkers()
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.DoneAt = &now
|
||||
t.ErrMsg = "interrupted by bee-web restart"
|
||||
}
|
||||
now := time.Now()
|
||||
t.Status = TaskFailed
|
||||
t.DoneAt = &now
|
||||
t.ErrMsg = "interrupted by bee-web restart"
|
||||
} else if t.Status == TaskPending {
|
||||
t.StartedAt = nil
|
||||
t.DoneAt = nil
|
||||
|
||||
@@ -126,6 +126,23 @@ func TestNewTaskJobStateLoadsExistingLog(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestJobAppendFlushesTaskLogImmediately(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "task.log")
|
||||
j := newTaskJobState(path)
|
||||
|
||||
j.append("live-line")
|
||||
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if string(data) != "live-line\n" {
|
||||
t.Fatalf("log=%q want live-line newline", string(data))
|
||||
}
|
||||
j.closeLog()
|
||||
}
|
||||
|
||||
func TestTaskQueueSnapshotSortsNewestFirst(t *testing.T) {
|
||||
now := time.Date(2026, 4, 2, 12, 0, 0, 0, time.UTC)
|
||||
q := &taskQueue{
|
||||
@@ -849,3 +866,82 @@ func TestExecuteTaskMarksPanicsAsFailedAndClosesKmsgWindow(t *testing.T) {
|
||||
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRunTaskExternalOpensAndClosesKmsgWindow(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
releasePath := filepath.Join(dir, "release")
|
||||
readyPath := filepath.Join(dir, "ready")
|
||||
q := &taskQueue{
|
||||
opts: &HandlerOptions{ExportDir: dir},
|
||||
logsDir: filepath.Join(dir, "tasks"),
|
||||
kmsgWatcher: newKmsgWatcher(nil),
|
||||
trigger: make(chan struct{}, 1),
|
||||
}
|
||||
if err := os.MkdirAll(q.logsDir, 0755); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
tk := &Task{
|
||||
ID: "cpu-external-1",
|
||||
Name: "CPU SAT",
|
||||
Target: "cpu",
|
||||
Status: TaskRunning,
|
||||
CreatedAt: time.Now(),
|
||||
}
|
||||
q.assignTaskLogPathLocked(tk)
|
||||
j := newTaskJobState(tk.LogPath)
|
||||
|
||||
orig := externalTaskRunnerCommand
|
||||
externalTaskRunnerCommand = func(exportDir, taskID string) (*exec.Cmd, error) {
|
||||
script := "printf ready > \"$1\"; while [ ! -f \"$2\" ]; do sleep 0.05; done"
|
||||
return exec.Command("sh", "-c", script, "sh", readyPath, releasePath), nil
|
||||
}
|
||||
defer func() { externalTaskRunnerCommand = orig }()
|
||||
|
||||
done := make(chan struct{})
|
||||
go func() {
|
||||
q.runTaskExternal(tk, j)
|
||||
close(done)
|
||||
}()
|
||||
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
if _, err := os.Stat(readyPath); err == nil {
|
||||
break
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
if _, err := os.Stat(readyPath); err != nil {
|
||||
t.Fatalf("external runner did not start: %v", err)
|
||||
}
|
||||
|
||||
q.kmsgWatcher.mu.Lock()
|
||||
activeCount := q.kmsgWatcher.activeCount
|
||||
window := q.kmsgWatcher.window
|
||||
q.kmsgWatcher.mu.Unlock()
|
||||
if activeCount != 1 {
|
||||
t.Fatalf("activeCount while running=%d want 1", activeCount)
|
||||
}
|
||||
if window == nil || len(window.targets) != 1 || window.targets[0] != "cpu" {
|
||||
t.Fatalf("window while running=%+v", window)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(releasePath, []byte("1\n"), 0644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
select {
|
||||
case <-done:
|
||||
case <-time.After(2 * time.Second):
|
||||
t.Fatal("runTaskExternal did not return")
|
||||
}
|
||||
|
||||
q.kmsgWatcher.mu.Lock()
|
||||
activeCount = q.kmsgWatcher.activeCount
|
||||
window = q.kmsgWatcher.window
|
||||
q.kmsgWatcher.mu.Unlock()
|
||||
if activeCount != 0 {
|
||||
t.Fatalf("activeCount after finish=%d want 0", activeCount)
|
||||
}
|
||||
if window != nil {
|
||||
t.Fatalf("expected kmsg window to be cleared, got %+v", window)
|
||||
}
|
||||
}
|
||||
|
||||
62
audit/internal/webui/viewer_snapshot.go
Normal file
62
audit/internal/webui/viewer_snapshot.go
Normal file
@@ -0,0 +1,62 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
func enrichSnapshotForViewer(snapshot []byte) []byte {
|
||||
if len(snapshot) == 0 {
|
||||
return snapshot
|
||||
}
|
||||
var root map[string]any
|
||||
if err := json.Unmarshal(snapshot, &root); err != nil {
|
||||
return snapshot
|
||||
}
|
||||
hardware, _ := root["hardware"].(map[string]any)
|
||||
if len(hardware) == 0 {
|
||||
return snapshot
|
||||
}
|
||||
storage, _ := hardware["storage"].([]any)
|
||||
if len(storage) == 0 {
|
||||
return snapshot
|
||||
}
|
||||
changed := false
|
||||
for _, item := range storage {
|
||||
row, _ := item.(map[string]any)
|
||||
if len(row) == 0 {
|
||||
continue
|
||||
}
|
||||
if _, exists := row["block_format"]; exists {
|
||||
continue
|
||||
}
|
||||
logical, okLogical := jsonNumberToInt64(row["logical_block_size_bytes"])
|
||||
metadata, okMetadata := jsonNumberToInt64(row["metadata_bytes_per_block"])
|
||||
if !okLogical || !okMetadata || logical <= 0 || metadata < 0 {
|
||||
continue
|
||||
}
|
||||
row["block_format"] = strconv.FormatInt(logical, 10) + "+" + strconv.FormatInt(metadata, 10)
|
||||
changed = true
|
||||
}
|
||||
if !changed {
|
||||
return snapshot
|
||||
}
|
||||
out, err := json.Marshal(root)
|
||||
if err != nil {
|
||||
return snapshot
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func jsonNumberToInt64(v any) (int64, bool) {
|
||||
switch x := v.(type) {
|
||||
case float64:
|
||||
return int64(x), true
|
||||
case int64:
|
||||
return x, true
|
||||
case int:
|
||||
return int64(x), true
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
2
bible
2
bible
Submodule bible updated: 1d89a4918e...1977730d93
@@ -9,5 +9,63 @@ Generic engineering rules live in `bible/rules/patterns/`.
|
||||
|---|---|
|
||||
| `architecture/system-overview.md` | What bee does, scope, tech stack |
|
||||
| `architecture/runtime-flows.md` | Boot sequence, audit flow, service order |
|
||||
| `docs/customer-gpu-test-methodology.md` | Customer-facing GPU PCIe Validate / Validate -> Stress test list |
|
||||
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
||||
| `decisions/` | Architectural decision log |
|
||||
| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
|
||||
| `decisions/` | Architectural decision log, including read-only submodule policy |
|
||||
| `proposals/` | RFCs and contract change proposals for Reanimator Core |
|
||||
|
||||
## Validate Test Matrix
|
||||
|
||||
### Validate
|
||||
|
||||
- CPU check
|
||||
- `lscpu`
|
||||
- `sensors`
|
||||
- `stress-ng`
|
||||
- Memory check
|
||||
- `free`
|
||||
- `timeout <timeout_sec> memtester`
|
||||
- `free`
|
||||
- NVMe storage check
|
||||
- `nvme id-ctrl`
|
||||
- `nvme smart-log`
|
||||
- `nvme device-self-test`
|
||||
- SATA/SAS storage check
|
||||
- `smartctl -H -A`
|
||||
- `smartctl -t short`
|
||||
- Basic NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 2`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
### Validate -> Stress
|
||||
|
||||
- Extended NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 3`
|
||||
- NVIDIA targeted stress
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_stress`
|
||||
- NVIDIA targeted power
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_power`
|
||||
- NVIDIA pulse test
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r pulse_test`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
185
bible-local/architecture/api-surface.md
Normal file
185
bible-local/architecture/api-surface.md
Normal file
@@ -0,0 +1,185 @@
|
||||
# API Surface
|
||||
|
||||
HTTP endpoints exposed by `bee web` (binds `0.0.0.0:80`).
|
||||
Handler registration: `audit/internal/webui/server.go` → `NewHandler()`.
|
||||
|
||||
---
|
||||
|
||||
## Health & readiness
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------|-----------------------------------------------------|
|
||||
| GET | `/healthz` | Always 200. Used by load balancers / boot scripts. |
|
||||
| GET | `/api/ready` | 200 when audit JSON exists and is readable. |
|
||||
| GET | `/loading` | HTML loading page shown before first audit. |
|
||||
|
||||
---
|
||||
|
||||
## Audit
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-----------------------|--------------------------------------------------------------|
|
||||
| GET | `/audit.json` | Latest audit JSON with SAT overlay applied. |
|
||||
| GET | `/runtime-health.json`| Latest runtime preflight JSON. |
|
||||
| POST | `/api/audit/run` | Enqueue a full `bee audit` run. Returns task ID. |
|
||||
| GET | `/api/audit/stream` | SSE: audit run log lines (`data:` + newline per line). |
|
||||
| GET | `/api/preflight` | Run runtime preflight check (synchronous, returns JSON). |
|
||||
| GET | `/api/hardware-summary` | Hardware health summary (status counts + failures). |
|
||||
| GET | `/api/components/{type}` | HTML fragment for component detail dialog (e.g. `cpu`, `memory`, `storage`, `pcie`). |
|
||||
|
||||
---
|
||||
|
||||
## SAT (System Acceptance Testing)
|
||||
|
||||
All SAT run endpoints enqueue an async task. Response: `{"task_id": "..."}`.
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|--------------------------------------------|-----------------------------------|
|
||||
| POST | `/api/sat/nvidia/run` | NVIDIA DCGM SAT |
|
||||
| POST | `/api/sat/nvidia-targeted-stress/run` | NVIDIA targeted stress validate |
|
||||
| POST | `/api/sat/nvidia-compute/run` | NVIDIA max compute load |
|
||||
| POST | `/api/sat/nvidia-targeted-power/run` | NVIDIA targeted power |
|
||||
| POST | `/api/sat/nvidia-pulse/run` | NVIDIA pulse test |
|
||||
| POST | `/api/sat/nvidia-interconnect/run` | NCCL all_reduce_perf |
|
||||
| POST | `/api/sat/nvidia-bandwidth/run` | NVBandwidth test |
|
||||
| POST | `/api/sat/nvidia-stress/run` | NVIDIA stress pack |
|
||||
| POST | `/api/sat/memory/run` | Memory acceptance |
|
||||
| POST | `/api/sat/storage/run` | Storage acceptance (smartctl) |
|
||||
| POST | `/api/sat/cpu/run` | CPU acceptance (stress-ng) |
|
||||
| POST | `/api/sat/amd/run` | AMD GPU SAT (ROCm) |
|
||||
| POST | `/api/sat/amd-mem/run` | AMD memory integrity + bandwidth |
|
||||
| POST | `/api/sat/amd-bandwidth/run` | AMD memory bandwidth |
|
||||
| POST | `/api/sat/amd-stress/run` | AMD GPU stress |
|
||||
| POST | `/api/sat/memory-stress/run` | Memory stress |
|
||||
| POST | `/api/sat/sat-stress/run` | Combined storage+memory stress |
|
||||
| POST | `/api/sat/platform-stress/run` | Fan + thermal stress |
|
||||
| GET | `/api/sat/stream` | SSE: live SAT log stream |
|
||||
| POST | `/api/sat/abort` | Abort the running SAT task |
|
||||
|
||||
---
|
||||
|
||||
## Benchmarks
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-----------------------------------------|----------------------------------------------|
|
||||
| POST | `/api/bee-bench/nvidia/perf/run` | NVIDIA performance benchmark |
|
||||
| POST | `/api/bee-bench/nvidia/power/run` | NVIDIA power benchmark |
|
||||
| POST | `/api/bee-bench/nvidia/autotune/run` | Power source autotune (prerequisite for benchmarks) |
|
||||
| GET | `/api/bee-bench/nvidia/autotune/status` | Current autotune result / status |
|
||||
| GET | `/api/benchmark/results` | List completed benchmark result archives |
|
||||
|
||||
---
|
||||
|
||||
## Tasks (async job queue)
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-----------------------------|----------------------------------------------------|
|
||||
| GET | `/api/tasks` | List all tasks with status |
|
||||
| POST | `/api/tasks/cancel-all` | Cancel all pending/running tasks |
|
||||
| POST | `/api/tasks/kill-workers` | Force-kill worker goroutines |
|
||||
| POST | `/api/tasks/{id}/cancel` | Cancel a specific task |
|
||||
| POST | `/api/tasks/{id}/priority` | Elevate task priority |
|
||||
| GET | `/api/tasks/{id}/stream` | SSE: live log stream for a task |
|
||||
| GET | `/api/tasks/{id}/charts` | List chart names for a task |
|
||||
| GET | `/api/tasks/{id}/chart/` | SVG chart for a task result |
|
||||
| GET | `/tasks/{id}` | HTML task detail page |
|
||||
|
||||
---
|
||||
|
||||
## Services
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|---------------------------|--------------------------------------------------|
|
||||
| GET | `/api/services` | List bee-* systemd services and their states |
|
||||
| POST | `/api/services/action` | start/stop/restart a service |
|
||||
|
||||
---
|
||||
|
||||
## Network
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------------------|-----------------------------------------------------|
|
||||
| GET | `/api/network` | List interfaces with state and IPv4 addresses |
|
||||
| POST | `/api/network/dhcp` | Run dhclient on one or all interfaces |
|
||||
| POST | `/api/network/static` | Set static IPv4 address |
|
||||
| POST | `/api/network/toggle` | Bring interface up or down |
|
||||
| POST | `/api/network/confirm` | Confirm pending network change (clears rollback) |
|
||||
| POST | `/api/network/rollback` | Restore pre-change network snapshot |
|
||||
|
||||
---
|
||||
|
||||
## Export
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-------------------------------|---------------------------------------------------|
|
||||
| GET | `/export/support.tar.gz` | Download support bundle (live-generated) |
|
||||
| GET | `/export/file` | Download a file from the export dir by path param |
|
||||
| GET | `/export/` | Browse export dir (HTML index) |
|
||||
| GET | `/api/export/list` | JSON list of files in export dir |
|
||||
| GET | `/api/export/usb` | List removable USB targets available for export |
|
||||
|
||||
---
|
||||
|
||||
## GPU
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------------------|----------------------------------------------------|
|
||||
| GET | `/api/gpu/presence` | `{"nvidia": bool, "amd": bool}` |
|
||||
| GET | `/api/gpu/nvidia` | List NVIDIA GPUs from nvidia-smi |
|
||||
| GET | `/api/gpu/nvidia-status` | Per-GPU status (ECC, power, throttle) |
|
||||
| POST | `/api/gpu/nvidia-reset` | GPU reset by index |
|
||||
| GET | `/api/gpu/tools` | nvidia-smi / rocm-smi tool availability |
|
||||
|
||||
---
|
||||
|
||||
## System
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|------------------------------|---------------------------------------------------|
|
||||
| GET | `/api/system/ram-status` | toram boot state and ISO copy status |
|
||||
| POST | `/api/system/install-to-ram` | Copy ISO to RAM (background task) |
|
||||
| GET | `/api/install/disks` | List block devices suitable for disk installation |
|
||||
| POST | `/api/install/run` | Install bee to disk (background task) |
|
||||
|
||||
---
|
||||
|
||||
## Tools & NVMe
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|-------------------------------|--------------------------------------------------|
|
||||
| GET | `/api/tools/check` | Check availability of required CLI tools |
|
||||
| GET | `/api/tools/nvme-formats` | List NVMe format options for a device |
|
||||
| POST | `/api/tools/nvme-format/run` | Run nvme-format on a device |
|
||||
|
||||
---
|
||||
|
||||
## Live metrics
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|------------------------------|---------------------------------------------------|
|
||||
| GET | `/api/metrics/stream` | SSE: live metrics (GPU power, temp, utilization) |
|
||||
| GET | `/api/metrics/latest` | Latest metrics snapshot (JSON) |
|
||||
| GET | `/api/metrics/chart/` | SVG chart for a metric over time |
|
||||
| GET | `/api/metrics/export.csv` | Download metrics history as CSV |
|
||||
|
||||
---
|
||||
|
||||
## Blackbox logging
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|----------------------------|-----------------------------------------------|
|
||||
| GET | `/api/blackbox/status` | Blackbox log state (enabled, size, path) |
|
||||
| POST | `/api/blackbox/enable` | Start recording blackbox log |
|
||||
| POST | `/api/blackbox/disable` | Stop recording, flush to disk |
|
||||
|
||||
---
|
||||
|
||||
## UI pages
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|------------|-----------------------------------------------|
|
||||
| GET | `/` | Main dashboard (serves all page routes) |
|
||||
| GET | `/viewer` | Standalone JSON viewer for uploaded audit files |
|
||||
|
||||
All pages are rendered server-side as HTML. The `/` route handles sub-paths such as
|
||||
`/network`, `/services`, `/sat`, `/benchmark`, `/install`, `/validate`, `/export`.
|
||||
137
bible-local/architecture/data-model.md
Normal file
137
bible-local/architecture/data-model.md
Normal file
@@ -0,0 +1,137 @@
|
||||
# Data Model
|
||||
|
||||
The canonical output of `bee audit` is a `HardwareIngestRequest` JSON document accepted
|
||||
by the Reanimator `/api/ingest/hardware` endpoint. The ingest endpoint uses a strict
|
||||
decoder — unknown fields cause `400 Bad Request`.
|
||||
|
||||
Source of truth: `audit/internal/schema/hardware.go`
|
||||
|
||||
---
|
||||
|
||||
## Top-level: HardwareIngestRequest
|
||||
|
||||
```
|
||||
HardwareIngestRequest
|
||||
├── collected_at string RFC3339 UTC timestamp of collection
|
||||
├── hardware HardwareSnapshot
|
||||
├── runtime RuntimeHealth? from bee-runtime-preflight service
|
||||
├── filename string?
|
||||
├── source_type string?
|
||||
├── protocol string?
|
||||
└── target_host string?
|
||||
```
|
||||
|
||||
`collected_at` is the primary sort key used by Reanimator to deduplicate ingests.
|
||||
|
||||
---
|
||||
|
||||
## HardwareSnapshot
|
||||
|
||||
All component arrays are `omitempty` — absent when the collector finds nothing.
|
||||
|
||||
| JSON key | Go type | Source |
|
||||
|-------------------|----------------------------|------------------------------|
|
||||
| `board` | HardwareBoard | dmidecode type 1/2 |
|
||||
| `firmware` | []HardwareFirmwareRecord | dmidecode type 0/13 |
|
||||
| `cpus` | []HardwareCPU | dmidecode type 4 |
|
||||
| `memory` | []HardwareMemory | dmidecode type 17 |
|
||||
| `storage` | []HardwareStorage | lsblk + nvme-cli + smartctl |
|
||||
| `pcie_devices` | []HardwarePCIeDevice | lspci |
|
||||
| `power_supplies` | []HardwarePowerSupply | ipmitool fru + sdr |
|
||||
| `sensors` | *HardwareSensors | sensors -j |
|
||||
| `event_logs` | []HardwareEventLog | ipmitool sel + journald |
|
||||
| `platform_config` | *json.RawMessage | reserved, nil until used |
|
||||
| `vroc_license` | *string | vroc-cli |
|
||||
|
||||
---
|
||||
|
||||
## Identity keys
|
||||
|
||||
Reanimator uses these fields to match components across successive audits:
|
||||
|
||||
| Component | Identity key |
|
||||
|----------------|------------------------------------------------|
|
||||
| Board | `board.serial_number` (required, never empty) |
|
||||
| CPU | `serial_number` if present; else generated key |
|
||||
| Memory DIMM | `serial_number` — absent DIMMs have `present: false` |
|
||||
| Storage | `serial_number` if present; else `linux_device` from Telemetry |
|
||||
| PCIe device | `bdf` (Bus:Device.Function address) |
|
||||
| PSU | `slot` |
|
||||
|
||||
Components without a stable identity are still emitted but may not be matched across runs.
|
||||
|
||||
---
|
||||
|
||||
## HardwareComponentStatus (embedded in all components)
|
||||
|
||||
```go
|
||||
type HardwareComponentStatus struct {
|
||||
Status *string `json:"status,omitempty"` // OK | Warning | Critical | Unknown
|
||||
ErrorDescription *string `json:"error_description,omitempty"`
|
||||
}
|
||||
```
|
||||
|
||||
Status is set by collectors and overwritten at render time by `ApplySATOverlay`
|
||||
(latest SAT run results are always merged on top before display).
|
||||
|
||||
---
|
||||
|
||||
## HardwarePCIeDevice
|
||||
|
||||
The most enriched component type. Key fields:
|
||||
|
||||
| JSON key | Meaning |
|
||||
|----------------------|------------------------------------------------|
|
||||
| `bdf` | PCI address (identity key), e.g. `0000:4b:00.0` |
|
||||
| `vendor_id` | Numeric PCI vendor ID (hex). Use this for classification — not `manufacturer`. |
|
||||
| `device_id` | Numeric PCI device ID (hex) |
|
||||
| `device_class` | Human-readable class, e.g. `VideoController` |
|
||||
| `manufacturer` | String label from lspci — for display only |
|
||||
| `model` | From nvidia-smi / rocm-smi — display name |
|
||||
| `link_speed` | Current PCIe link speed, e.g. `Gen4` |
|
||||
| `max_link_speed` | Max negotiated speed |
|
||||
| `link_width` | Current lane count |
|
||||
| `max_link_width` | Max lane count |
|
||||
| `temperature_c` | From nvidia-smi / rocm-smi |
|
||||
| `power_w` | Current power draw |
|
||||
| `ecc_uncorrected_total` | Cumulative ECC uncorrected errors (NVIDIA) |
|
||||
| `ecc_corrected_total` | Cumulative ECC corrected errors (NVIDIA) |
|
||||
| `hw_slowdown` | HW throttle active (NVIDIA) |
|
||||
| `telemetry` | Free-form map for vendor-specific extras |
|
||||
|
||||
**Classification rule**: use `vendor_id` (numeric PCI ID), never `manufacturer` string.
|
||||
|
||||
| Vendor | vendor_id |
|
||||
|-----------|-----------|
|
||||
| NVIDIA | `0x10de` |
|
||||
| AMD | `0x1002` |
|
||||
| Mellanox | `0x15b3` |
|
||||
| Aspeed | `0x1a03` |
|
||||
| Intel | `0x8086` |
|
||||
|
||||
Constants live in `audit/internal/collector/pci_vendors.go`.
|
||||
|
||||
---
|
||||
|
||||
## HardwareMemory
|
||||
|
||||
`location` field exists in the Go struct with `json:"-"` — it is intentionally excluded
|
||||
from JSON output because the Reanimator schema does not include it. It is used internally
|
||||
for DIMM telemetry matching only (`collector/memory_telemetry.go`).
|
||||
|
||||
---
|
||||
|
||||
## HardwareSensors
|
||||
|
||||
Sensor structs (`HardwareFanSensor`, `HardwareTemperatureSensor`,
|
||||
`HardwarePowerSensor`, `HardwareOtherSensor`) do **not** have a `location` field.
|
||||
Location was removed in contract v2.8. The Go types mirror the schema exactly.
|
||||
|
||||
---
|
||||
|
||||
## JSON naming convention
|
||||
|
||||
All JSON keys are `snake_case`. Go field names are `CamelCase`. The mapping is
|
||||
maintained by struct tags in `audit/internal/schema/hardware.go`.
|
||||
|
||||
All pointer fields use `omitempty` — absent means not collected (not zero).
|
||||
@@ -149,7 +149,6 @@ Current validation state:
|
||||
6. psu collector (ipmitool fru + sdr — silent if no /dev/ipmi0)
|
||||
7. nvidia enrichment (nvidia-smi — skipped if binary absent or driver not loaded)
|
||||
8. output JSON → /var/log/bee-audit.json
|
||||
9. QR summary to stdout (qrencode if available)
|
||||
```
|
||||
|
||||
Every collector returns `nil, nil` on tool-not-found. Errors are logged, never fatal.
|
||||
|
||||
@@ -58,6 +58,8 @@ Fills gaps where Redfish/logpile is blind:
|
||||
- `bee` should populate current component state, hardware inventory, telemetry, and `status_checked_at`.
|
||||
- Historical status transitions and component replacement logic belong to the centralized ingest/lifecycle system, not to `bee`.
|
||||
- Contract fields that have no honest local source on a generic Linux host may remain empty.
|
||||
- Embedded submodules such as `internal/chart/` and `bible/` are read-only for `bee` feature work.
|
||||
- If the UI needs extra information, `bee` must emit it through the standard audit JSON contract rather than patching `chart`.
|
||||
|
||||
## Tech stack
|
||||
|
||||
@@ -101,7 +103,7 @@ Fills gaps where Redfish/logpile is blind:
|
||||
| `iso/builder/` | ISO build scripts and `live-build` profile |
|
||||
| `iso/overlay/` | Source overlay copied into a staged build overlay |
|
||||
| `iso/vendor/` | Optional pre-built vendor binaries (storcli64, sas2ircu, sas3ircu, arcconf, ssacli, …) |
|
||||
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web` |
|
||||
| `internal/chart/` | Git submodule with `reanimator/chart`, embedded into `bee web`; update by submodule pointer only, never by local `bee`-specific edits |
|
||||
| `iso/builder/VERSIONS` | Pinned versions: Debian, Go, NVIDIA driver, kernel ABI |
|
||||
| `iso/builder/smoketest.sh` | Post-boot smoke test — run via SSH to verify live ISO |
|
||||
| `iso/overlay/etc/profile.d/bee.sh` | tty1 welcome message with web UI URLs |
|
||||
|
||||
@@ -1,5 +1,103 @@
|
||||
# Backlog
|
||||
|
||||
## Сбор SFP-модулей
|
||||
|
||||
**Статус:** не реализовано.
|
||||
|
||||
### Источник данных
|
||||
|
||||
`ethtool -m <iface>` / `ethtool --module-info <iface>` — читает EEPROM SFP/SFP+/QSFP28/QSFP-DD по стандарту MSA (SFF-8472 / SFF-8636).
|
||||
|
||||
Доступные поля из EEPROM:
|
||||
- Идентификатор модуля: `Identifier` (SFP, SFP+, QSFP28, …)
|
||||
- Тип коннектора: `Connector`
|
||||
- Вендор: `Vendor name`, `Vendor OUI`, `Vendor PN`, `Vendor SN`, `Vendor rev`
|
||||
- Оптика: `Wavelength`, `Transceiver type` (10GBase-SR, LR, DAC, …)
|
||||
- Телеметрия DOM (если модуль поддерживает): `Laser tx bias current`, `Transmit avg optical power`, `Receive avg optical power`, `Module temperature`, `Module voltage`
|
||||
- Статус: `Rx power high alarm`, `Tx power low warning`, …
|
||||
|
||||
Для QSFP28 данные повторяются на 4 канала (lane 0–3).
|
||||
|
||||
Инструмент требует root. На bee ISO — доступен (`ethtool` входит в образ).
|
||||
|
||||
### Scope для bee
|
||||
|
||||
1. Собирать список сетевых интерфейсов через `ip -j link show` (только `ether`, без `lo`/VLAN/bond).
|
||||
2. Для каждого интерфейса пробовать `ethtool -m <iface>`. Если модуль отсутствует или не поддерживает EEPROM read — тихо пропускать.
|
||||
3. Связывать интерфейс с PCIe-устройством через `ethtool -i <iface>` → поле `bus-info` (BDF) → сопоставление с `pcie_devices[].slot`.
|
||||
|
||||
### Gap в контракте
|
||||
|
||||
Текущий контракт v2.10 имеет в `pcie_devices[]` скалярные поля:
|
||||
- `sfp_temperature_c`, `sfp_tx_power_dbm`, `sfp_rx_power_dbm`, `sfp_voltage_v`, `sfp_bias_ma`
|
||||
|
||||
Этого **недостаточно**:
|
||||
- Одна NIC-карта может иметь несколько портов — нужен массив, а не скаляр.
|
||||
- Нет полей идентификации модуля (vendor, part_number, serial_number, wavelength, connector).
|
||||
- Нет разбивки по каналам для QSFP28.
|
||||
|
||||
### Предлагаемое расширение контракта
|
||||
|
||||
Добавить в `pcie_devices[]` массив `sfp_modules[]`:
|
||||
|
||||
```json
|
||||
"pcie_devices": [
|
||||
{
|
||||
"slot": "0000:3b:00.0",
|
||||
"device_class": "EthernetController",
|
||||
"model": "ConnectX-6 Dx",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09999",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 36.4,
|
||||
"voltage_v": 3.29,
|
||||
"tx_power_dbm": -1.8,
|
||||
"rx_power_dbm": -2.1,
|
||||
"bias_ma": 7.2
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Поля `sfp_modules[]`:
|
||||
|
||||
| Поле | Тип | Описание |
|
||||
|---|---|---|
|
||||
| `port` | int | Номер порта на NIC (0-based) |
|
||||
| `identifier` | string | `SFP`, `SFP+`, `QSFP28`, `QSFP-DD`, … |
|
||||
| `connector` | string | `LC`, `MPO`, `DAC`, … |
|
||||
| `vendor` | string | Производитель модуля |
|
||||
| `part_number` | string | Партномер |
|
||||
| `serial_number` | string | Серийный номер |
|
||||
| `revision` | string | Ревизия |
|
||||
| `wavelength_nm` | int | Длина волны, нм |
|
||||
| `transceiver_type` | string | `10GBase-SR`, `100GBase-SR4`, `DAC`, … |
|
||||
| `temperature_c` | float | Температура модуля, °C |
|
||||
| `voltage_v` | float | Напряжение, В |
|
||||
| `tx_power_dbm` | float | TX оптическая мощность, dBm |
|
||||
| `rx_power_dbm` | float | RX оптическая мощность, dBm |
|
||||
| `bias_ma` | float | Bias current, мА |
|
||||
|
||||
Старые скалярные поля `sfp_temperature_c` / `sfp_tx_power_dbm` / `sfp_rx_power_dbm` / `sfp_voltage_v` / `sfp_bias_ma` на уровне `pcie_devices[]` — **вывести из контракта** (deprecated), заменить на `sfp_modules[]`.
|
||||
|
||||
### Порядок реализации
|
||||
|
||||
1. Согласовать расширение контракта с Reanimator Core (bump до v2.11).
|
||||
2. Добавить `ethtool` parser в `audit/internal/collector/` — новый файл `sfp.go`.
|
||||
3. Дополнить schema в `audit/internal/schema/` типом `SFPModule`.
|
||||
4. Добавить `sfp_modules` в `PCIeDevice` в schema.
|
||||
5. Заполнять в NIC-коллекторе: связь интерфейс → BDF → `pcie_devices[].sfp_modules`.
|
||||
6. Показывать в TUI и web UI в разделе PCIe/NIC.
|
||||
|
||||
## BMC версия через IPMI
|
||||
|
||||
**Статус:** реализовано.
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
# Decision: Treat embedded submodules as read-only
|
||||
|
||||
## Context
|
||||
|
||||
`bee` embeds external git submodules such as:
|
||||
|
||||
- `internal/chart/` — `reanimator/chart`, a generic read-only viewer for Reanimator JSON snapshots
|
||||
- `bible/` — shared engineering rules and contracts
|
||||
|
||||
These repositories are reused by other projects. A local feature request in `bee`
|
||||
must not be solved by silently changing shared submodule behavior.
|
||||
|
||||
The concrete failure mode here was attempting to add project-specific storage
|
||||
telemetry presentation by editing `internal/chart/`. That couples a shared viewer
|
||||
to one host application's needs and creates hidden cross-project regressions.
|
||||
|
||||
## Decision
|
||||
|
||||
Embedded submodules are read-only from the point of view of `bee`.
|
||||
|
||||
- Do not implement `bee`-specific behavior by editing `internal/chart/`.
|
||||
- Do not implement `bee`-specific behavior by editing `bible/`.
|
||||
- If `bee` needs new data in the report, produce it in the standard audit JSON
|
||||
emitted by `bee` itself.
|
||||
- `chart` must continue to consume the canonical snapshot as an external viewer,
|
||||
without host-specific forks.
|
||||
- Updating a submodule pointer to an upstream commit is allowed.
|
||||
- Carrying local unmerged submodule commits as part of a `bee` feature is forbidden.
|
||||
|
||||
## Consequences
|
||||
|
||||
- Audit/report features must be expressed through the contract in
|
||||
`bible-local/docs/hardware-ingest-contract.md`.
|
||||
- `bee` owns collection, normalization, and serialization of storage telemetry in
|
||||
`hardware.storage[]`.
|
||||
- `chart` remains a pure visualization module that reads the snapshot it is given.
|
||||
- If a capability is genuinely missing in a shared submodule, it must be proposed
|
||||
and landed upstream as a generic change first, then pulled into `bee` via a
|
||||
normal submodule update.
|
||||
@@ -0,0 +1,41 @@
|
||||
# Decision: Skip PCIe link-speed warnings for disabled devices
|
||||
|
||||
**Date:** 2026-06-12
|
||||
**Status:** active
|
||||
|
||||
## Context
|
||||
|
||||
On HGX H100 SXM5 baseboards, the Microchip Switchtec PM41028 PSX PCIe switch
|
||||
(vendor 11F8, device 4128, NVIDIA subsystem 10DE:1643) appears in `lspci` as a
|
||||
"Memory controller". Its upstream link trains at Gen3 x2 while the device is
|
||||
capable of Gen4 x16. The device is permanently in a disabled state: memory access
|
||||
and bus-mastering are both off (Mem-, BusMaster-); `/sys/bus/pci/devices/<bdf>/enable`
|
||||
reads `0`.
|
||||
|
||||
This chip is the PCIe fabric management endpoint for the NVSwitch interconnect — it
|
||||
carries only management traffic at low bandwidth and is intentionally not activated
|
||||
by any Linux driver. The bee audit was reporting a `statusWarning` with message
|
||||
"PCIe link speed degraded" for this device, which is misleading because the device
|
||||
is not in the data path.
|
||||
|
||||
## Decision
|
||||
|
||||
`applyPCIeLinkSpeedWarning` reads `/sys/bus/pci/devices/<bdf>/enable` via the
|
||||
existing `readPCIIntAttribute` helper. If the value is `0` the function returns
|
||||
early without setting any warning status.
|
||||
|
||||
The check is vendor-agnostic: it applies to any PCIe device that Linux has not
|
||||
activated, regardless of make or model. This is consistent with the
|
||||
`no-hardcoded-vendors` contract — no vendor ID, device ID, or name string is
|
||||
used as a condition.
|
||||
|
||||
## Consequences
|
||||
|
||||
- PCIe fabric management endpoints, IPMI virtual devices, and other permanently
|
||||
disabled PCIe functions no longer produce spurious link-degradation warnings.
|
||||
- Real link degradation on active devices (GPUs, NICs, NVMe, NVLink bridges)
|
||||
continues to be detected and reported as before.
|
||||
- NVLink bridge cards retain their existing `statusCritical` path (they are always
|
||||
enabled, so the early return is never taken for them).
|
||||
- The Switchtec device on HGX H100 boards shows `statusOK` with no
|
||||
`error_description` in the audit JSON.
|
||||
@@ -6,3 +6,5 @@ One file per decision, named `YYYY-MM-DD-short-topic.md`.
|
||||
|---|---|---|
|
||||
| 2026-03-05 | Use NVIDIA proprietary driver | active |
|
||||
| 2026-04-01 | Treat memtest as explicit ISO content | active |
|
||||
| 2026-04-29 | Treat embedded submodules as read-only | active |
|
||||
| 2026-06-12 | Skip PCIe link-speed warnings for disabled devices | active |
|
||||
|
||||
54
bible-local/docs/customer-gpu-test-methodology.md
Normal file
54
bible-local/docs/customer-gpu-test-methodology.md
Normal file
@@ -0,0 +1,54 @@
|
||||
# GPU PCIe Test Methodology
|
||||
|
||||
## Validate
|
||||
|
||||
- CPU check
|
||||
- `lscpu`
|
||||
- `sensors`
|
||||
- `stress-ng`
|
||||
- Memory check
|
||||
- `free`
|
||||
- `timeout <timeout_sec> memtester`
|
||||
- `free`
|
||||
- NVMe storage check
|
||||
- `nvme id-ctrl`
|
||||
- `nvme smart-log`
|
||||
- `nvme device-self-test`
|
||||
- SATA/SAS storage check
|
||||
- `smartctl -H -A`
|
||||
- `smartctl -t short`
|
||||
- Basic NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 2`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
|
||||
## Validate -> Stress
|
||||
|
||||
- Extended NVIDIA GPU check
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dmidecode -t baseboard`
|
||||
- `dmidecode -t system`
|
||||
- `dcgmi diag -r 3`
|
||||
- NVIDIA targeted stress
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_stress`
|
||||
- NVIDIA targeted power
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r targeted_power`
|
||||
- NVIDIA pulse test
|
||||
- `nvidia-smi -pm 1`
|
||||
- `nvidia-smi -q`
|
||||
- `dcgmi diag -r pulse_test`
|
||||
- Inter-GPU communication check
|
||||
- `all_reduce_perf`
|
||||
- GPU bandwidth check
|
||||
- `dcgmi diag -r nvbandwidth`
|
||||
312
bible-local/docs/grub-bitmap-error-history.md
Normal file
312
bible-local/docs/grub-bitmap-error-history.md
Normal file
@@ -0,0 +1,312 @@
|
||||
# GRUB Bitmap Error History
|
||||
|
||||
## Symptom
|
||||
|
||||
On some servers GRUB prints:
|
||||
|
||||
```text
|
||||
error: null src bitmap in grub_video_bitmap_create_scaled.
|
||||
Press any key to continue...
|
||||
```
|
||||
|
||||
The important new observation as of `v10.7` is:
|
||||
|
||||
- the error still appears even when the logo image block is removed from
|
||||
`iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt`
|
||||
- therefore the current error can no longer be explained only by
|
||||
`bee-logo.png` / `bee-logo.tga`
|
||||
|
||||
That does not prove the theme system is healthy. It proves only that the
|
||||
currently remaining failure is deeper than "bad logo file".
|
||||
|
||||
## Current State
|
||||
|
||||
Current source files:
|
||||
|
||||
- [iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt](/Users/mchusavitin/Documents/git/bee/iso/builder/config/bootloaders/grub-efi/live-theme/theme.txt:1)
|
||||
has no `image` block anymore
|
||||
- [iso/builder/config/bootloaders/grub-efi/config.cfg](/Users/mchusavitin/Documents/git/bee/iso/builder/config/bootloaders/grub-efi/config.cfg:1)
|
||||
still does `insmod tga` and then `source /boot/grub/theme.cfg`
|
||||
|
||||
Implication:
|
||||
|
||||
- if the error still fires, the trigger is likely elsewhere in GRUB theme
|
||||
rendering or in the assets/config GRUB resolves while sourcing `theme.cfg`
|
||||
- the old "PNG parser fragility" story is no longer a sufficient explanation
|
||||
for the current failure mode
|
||||
|
||||
Current artifact facts:
|
||||
|
||||
- the provided `easy-bee-nvidia-v10.7-amd64.logs` build logs reference
|
||||
`linux-image-6.1.0-45`
|
||||
- the provided `easy-bee-nvidia-v10.7-amd64.iso` contains
|
||||
`live/initrd.img-6.1.0-45-amd64` and `live/vmlinuz-6.1.0-45-amd64`
|
||||
- a later `BOOT FAILED!` screenshot showed `live/initrd.img-6.1.0-44-amd64`
|
||||
and `live/vmlinuz-6.1.0-44-amd64`
|
||||
|
||||
Implication:
|
||||
|
||||
- the `BOOT FAILED!` screenshot is not from the same artifact as the provided
|
||||
`v10.7` ISO/log set
|
||||
- until the exact ISO filename and checksum are tied to that failure, the
|
||||
GRUB bitmap issue and the live-boot failure must be treated as separate
|
||||
problems
|
||||
|
||||
## Chronology
|
||||
|
||||
### 1. Initial bee GRUB theme introduction
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `d52ec67` `Stability hardening, build script fixes, GRUB bee logo`
|
||||
|
||||
What changed:
|
||||
|
||||
- bee-branded GRUB theme introduced
|
||||
- image block with explicit `width` / `height`
|
||||
|
||||
Observed result:
|
||||
|
||||
- bitmap error appeared
|
||||
|
||||
### 2. Remove explicit scaling dimensions
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `aa284ae` `fix(iso): avoid grub logo scaling error`
|
||||
|
||||
What changed:
|
||||
|
||||
- removed `width = 400`
|
||||
- removed `height = 400`
|
||||
|
||||
Reason stated by the change:
|
||||
|
||||
- try to avoid the scaling path
|
||||
|
||||
Observed result:
|
||||
|
||||
- error persisted
|
||||
|
||||
Conclusion:
|
||||
|
||||
- explicit width/height were not the sole trigger
|
||||
|
||||
### 3. Rework PNG handling and menu rendering
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `6112094` `fix(grub): fix bitmap error and menu rendering`
|
||||
|
||||
Commit message says the change was intended to:
|
||||
|
||||
- convert `bee-logo.png` to RGBA and strip metadata
|
||||
- move `terminal_output gfxterm` before `insmod png` / theme load
|
||||
- remove ASCII banner from GRUB menu area
|
||||
- fix theme typography/layout fields
|
||||
|
||||
Observed result:
|
||||
|
||||
- error persisted
|
||||
|
||||
Notes:
|
||||
|
||||
- this was still operating under the assumption that the issue was the PNG
|
||||
payload or the order of gfxterm/theme init
|
||||
|
||||
### 4. Convert logo PNG back to RGB
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `333c44f` `Fix GRUB splash: convert bee-logo.png from RGBA to RGB`
|
||||
|
||||
Intended reason:
|
||||
|
||||
- GRUB might dislike RGBA PNG and want RGB PNG
|
||||
|
||||
Observed result:
|
||||
|
||||
- error still persisted according to later project notes
|
||||
|
||||
### 5. Add post-build canonical GRUB/isolinux sync
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `0cdfbc5` `fix(iso): restore boot UX and boot logs`
|
||||
|
||||
What this introduced:
|
||||
|
||||
- post-`lb build` rewriting of `binary/boot/grub/grub.cfg`
|
||||
- post-`lb build` rewriting of `binary/isolinux/live.cfg`
|
||||
- forced rebuild of `binary_checksums`, `binary_iso`, `binary_zsync`
|
||||
|
||||
Why it was added:
|
||||
|
||||
- restore canonical EASY-BEE boot UX after live-build wrote its own bootloader
|
||||
outputs
|
||||
- restore expected boot menu and logs
|
||||
|
||||
Important note:
|
||||
|
||||
- this commit did not directly solve the bitmap issue
|
||||
- it added a second layer of bootloader mutation after live-build
|
||||
|
||||
### 6. Switch from PNG to TGA
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `626763e` `Fix GRUB bitmap error: switch from PNG to TGA for splash logo`
|
||||
|
||||
Commit message says:
|
||||
|
||||
- GRUB PNG reader was considered fragile
|
||||
- switch to uncompressed 24-bit TGA
|
||||
- `config.cfg`: `insmod png` -> `insmod tga`
|
||||
- `theme.txt`: `bee-logo.png` -> `bee-logo.tga`
|
||||
|
||||
Observed result:
|
||||
|
||||
- this did not eliminate the problem in the current lineage
|
||||
- today the system still errors even after the entire image block was removed
|
||||
|
||||
Conclusion:
|
||||
|
||||
- switching PNG -> TGA was not a durable root-cause fix
|
||||
|
||||
### 7. Patch EFI image after build
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `4f20c92` `Make UEFI boot safe and remove GRUB logo`
|
||||
|
||||
What this introduced:
|
||||
|
||||
- `sync_efi_grub_theme_assets`
|
||||
- direct `mtools` patching of `efi.img`
|
||||
- copying `config.cfg`, `theme.cfg`, and `live-theme/*` into the EFI FAT image
|
||||
- removal of the logo image block from `theme.txt`
|
||||
|
||||
Why it was added:
|
||||
|
||||
- make UEFI path "safe"
|
||||
- keep EFI GRUB image aligned with canonical bootloader assets
|
||||
|
||||
Observed result:
|
||||
|
||||
- later this became the direct cause of `Disk full` during build once
|
||||
`bee-logo.tga` was large enough
|
||||
- and even with the logo removed from `theme.txt`, the bitmap error still
|
||||
remained
|
||||
|
||||
Conclusion:
|
||||
|
||||
- EFI post-build patching increased build complexity
|
||||
- removing the logo alone did not remove the runtime GRUB error
|
||||
|
||||
### 8. Remove ASCII logo banners
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `14505ef` `Remove easy bee ASCII logo banners`
|
||||
|
||||
What changed:
|
||||
|
||||
- web loading page ASCII cleanup only
|
||||
|
||||
Relevance here:
|
||||
|
||||
- none for GRUB bitmap error
|
||||
- included here only to avoid confusion with other "logo removal" work
|
||||
|
||||
### 9. Remove EFI post-build patching
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `5dc022d` `Drop post-build EFI bootloader patching`
|
||||
|
||||
Why it was done:
|
||||
|
||||
- stop mutating `efi.img` post-build
|
||||
- remove dependence on `mtools` for EFI patching
|
||||
- remove the `Disk full` failure mode
|
||||
|
||||
Impact:
|
||||
|
||||
- this did not target the GRUB bitmap error directly
|
||||
- it targeted build-system complexity and EFI image overflow
|
||||
|
||||
### 10. Restore only GRUB/isolinux post-build sync
|
||||
|
||||
Relevant commit:
|
||||
|
||||
- `42774d4` `Restore post-build GRUB and isolinux sync`
|
||||
|
||||
Why it was needed:
|
||||
|
||||
- removing all post-build sync caused final ISO validation to fail with
|
||||
missing canonical EASY-BEE boot entries
|
||||
- memtest was still fine, but final GRUB menu was no longer canonical
|
||||
|
||||
What it restored:
|
||||
|
||||
- only `binary/boot/grub/grub.cfg`
|
||||
- only `binary/isolinux/live.cfg`
|
||||
|
||||
What it did not restore:
|
||||
|
||||
- no EFI FAT image patching
|
||||
- no `mtools` path
|
||||
|
||||
## What Is Proven False
|
||||
|
||||
The current evidence rules out several simplistic explanations:
|
||||
|
||||
- "the error is only caused by explicit image scaling"
|
||||
- "the error is only caused by PNG vs TGA"
|
||||
- "the error is only caused by the logo file itself"
|
||||
|
||||
Why:
|
||||
|
||||
- scaling dimensions were removed and error persisted
|
||||
- PNG was replaced with TGA and error still survived in the lineage
|
||||
- the image block itself is now absent, and the error still occurs
|
||||
|
||||
## Working Hypotheses Left
|
||||
|
||||
The remaining plausible layers are:
|
||||
|
||||
- GRUB theme engine still tries to render some bitmap-related element even
|
||||
without the logo image block
|
||||
- GRUB is resolving stale theme assets from the built EFI/ISO path rather than
|
||||
what we think the source tree says
|
||||
- `theme.cfg` / `theme.txt` / GRUB module loading order still triggers a bitmap
|
||||
code path elsewhere
|
||||
- live-build may still package a stale `theme.txt` or stale `live-theme`
|
||||
directory into the final image
|
||||
- the GRUB environment on the failing hardware may behave differently from the
|
||||
assumptions in our source tree
|
||||
|
||||
## Decision Boundary
|
||||
|
||||
Before making another change, the next step should be evidence gathering from
|
||||
the real built artifact, not another speculative edit.
|
||||
|
||||
That means checking on the actual built ISO or EFI image:
|
||||
|
||||
- exact `boot/grub/theme.cfg`
|
||||
- exact `boot/grub/live-theme/theme.txt`
|
||||
- exact contents of `boot/grub/live-theme/`
|
||||
- whether the final image still contains a stale logo reference
|
||||
- whether the EFI path and non-EFI path differ
|
||||
|
||||
## Relevant Commits
|
||||
|
||||
- `d52ec67` `Stability hardening, build script fixes, GRUB bee logo`
|
||||
- `aa284ae` `fix(iso): avoid grub logo scaling error`
|
||||
- `6112094` `fix(grub): fix bitmap error and menu rendering`
|
||||
- `333c44f` `Fix GRUB splash: convert bee-logo.png from RGBA to RGB`
|
||||
- `0cdfbc5` `fix(iso): restore boot UX and boot logs`
|
||||
- `626763e` `Fix GRUB bitmap error: switch from PNG to TGA for splash logo`
|
||||
- `4f20c92` `Make UEFI boot safe and remove GRUB logo`
|
||||
- `5dc022d` `Drop post-build EFI bootloader patching`
|
||||
- `42774d4` `Restore post-build GRUB and isolinux sync`
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user