Compare commits
28 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| abaeaea13f | |||
| 5b98005d5d | |||
| 33bc275da2 | |||
| 11ea640626 | |||
| 796acdfec1 | |||
| 2a7d366e50 | |||
| 5bfaecd417 | |||
| 8575cf06f8 | |||
| d1d5f63257 | |||
| fc9b446d2e | |||
| ea68318744 | |||
| 518082c2e2 | |||
| 056dce0b98 | |||
| 24f2e65b6e | |||
| 7f27b9aa38 | |||
| cf29131116 | |||
| 13e6324853 | |||
| 892ef6fb7d | |||
| ce46a97975 | |||
| 258ecb3453 | |||
| cbb0d1e522 | |||
| bab941ccf1 | |||
| b49c71a980 | |||
| 85d1acdaa3 | |||
| a2d7513153 | |||
| 5b5d8609d3 | |||
| e7442972d1 | |||
| 4c6daa1c5e |
@@ -134,6 +134,7 @@ type satRunner interface {
|
||||
ResetNvidiaGPU(index int) (string, error)
|
||||
RunMemoryAcceptancePack(ctx context.Context, baseDir string, sizeMB, passes int, logFunc func(string)) (string, error)
|
||||
RunStorageAcceptancePack(ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error)
|
||||
RunConfidentialComputingCheckPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunCPUAcceptancePack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
ListNvidiaGPUs() ([]platform.NvidiaGPU, error)
|
||||
DetectGPUVendor() string
|
||||
|
||||
@@ -206,6 +206,22 @@ func (a *App) RunStorageAcceptancePackResult(baseDir string) (ActionResult, erro
|
||||
return ActionResult{Title: "Storage SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunConfidentialComputingCheckPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunConfidentialComputingCheckPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunConfidentialComputingCheckPack(baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunConfidentialComputingCheckPackCtx(context.Background(), baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunConfidentialComputingCheckPackResult(baseDir string) (ActionResult, error) {
|
||||
path, err := a.RunConfidentialComputingCheckPack(baseDir, nil)
|
||||
return ActionResult{Title: "Confidential Computing Check", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) DetectGPUVendor() string {
|
||||
return a.sat.DetectGPUVendor()
|
||||
}
|
||||
|
||||
@@ -243,6 +243,10 @@ func (f fakeSAT) RunStorageAcceptancePack(_ context.Context, baseDir string, _ b
|
||||
return f.runStorageFn(baseDir)
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunConfidentialComputingCheckPack(_ context.Context, baseDir string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunCPUAcceptancePack(_ context.Context, baseDir string, durationSec int, _ func(string)) (string, error) {
|
||||
if f.runCPUFn != nil {
|
||||
return f.runCPUFn(baseDir, durationSec)
|
||||
|
||||
@@ -365,7 +365,6 @@ func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
||||
|
||||
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.lastDuration = duration
|
||||
if err != nil {
|
||||
w.status = "degraded"
|
||||
@@ -383,6 +382,10 @@ func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||
}
|
||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
// persistState must be called without w.mu held: it acquires rt.mu then
|
||||
// each worker.mu inside persistStateLocked, so holding w.mu here would
|
||||
// cause a deadlock (w.mu → rt.mu → w.mu).
|
||||
w.runtime.persistState()
|
||||
}
|
||||
|
||||
|
||||
@@ -49,7 +49,8 @@ func Run(_ runtimeenv.Mode) schema.HardwareIngestRequest {
|
||||
snap.VROCLicense = collectVROCLicense(snap.PCIeDevices)
|
||||
snap.PowerSupplies = collectPSUs(derefString(snap.Board.Manufacturer))
|
||||
snap.PowerSupplies = enrichPSUsWithTelemetry(snap.PowerSupplies, sensorDoc)
|
||||
snap.Sensors = buildSensorsFromDoc(sensorDoc)
|
||||
snap.Sensors = mergeIPMISensors(buildSensorsFromDoc(sensorDoc), collectIPMISensors())
|
||||
snap.EventLogs = append(collectIPMISEL(), collectDmesgErrors()...)
|
||||
finalizeSnapshot(&snap, collectedAt)
|
||||
|
||||
// remaining collectors added in steps 1.8 – 1.10
|
||||
|
||||
@@ -0,0 +1,129 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
var (
	// dmesgTimestampRE splits a dmesg line into its bracketed prefix
	// (capture 1) and the remaining message (capture 2). Both prefix forms
	// are accepted:
	//
	//	dmesg -T:         [Thu Jun 18 14:23:45 2026] message
	//	dmesg without -T: [  123.456789] message
	dmesgTimestampRE = regexp.MustCompile(`^\[([^\]]+)\]\s*(.*)$`)

	// dmesgErrorPatterns lists the case-insensitive keywords that mark a
	// dmesg message as an error or hardware problem worth capturing.
	dmesgErrorPatterns = func() []*regexp.Regexp {
		exprs := []string{
			`(?i)\berr(or)?\b`,
			`(?i)\bfail(ed|ure)?\b`,
			`(?i)\bfault\b`,
			`(?i)\bwarn(ing)?\b`,
			`(?i)\bAER\b`,
			`(?i)\bXid\b`,
			`(?i)\bNVRM\b`,
			`(?i)\bpanic\b`,
			`(?i)\bcorrected\b`,
			`(?i)\buncorrect`,
			`(?i)\bECC\b`,
			`(?i)\btimeout\b`,
			`(?i)\breset\b`,
			`(?i)\bdead\b`,
			`(?i)\bhang\b`,
			`(?i)\bstall\b`,
			`(?i)\bdisabled\b`,
		}
		compiled := make([]*regexp.Regexp, 0, len(exprs))
		for _, expr := range exprs {
			compiled = append(compiled, regexp.MustCompile(expr))
		}
		return compiled
	}()
)
|
||||
|
||||
// collectDmesgErrors runs `dmesg -T` (or `dmesg` without -T on failure) and
|
||||
// returns only lines that match known error/warning patterns.
|
||||
func collectDmesgErrors() []schema.HardwareEventLog {
|
||||
out, err := exec.Command("dmesg", "-T").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
// Fallback: dmesg without human-readable timestamps
|
||||
out, err = exec.Command("dmesg").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
entries := parseDmesgErrors(string(out))
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Info("dmesg: collected error entries", "count", len(entries))
|
||||
return entries
|
||||
}
|
||||
|
||||
func parseDmesgErrors(output string) []schema.HardwareEventLog {
|
||||
var entries []schema.HardwareEventLog
|
||||
collectedAt := time.Now().UTC().Format(time.RFC3339)
|
||||
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var timestamp, message string
|
||||
if m := dmesgTimestampRE.FindStringSubmatch(line); m != nil {
|
||||
timestamp = strings.TrimSpace(m[1])
|
||||
message = strings.TrimSpace(m[2])
|
||||
} else {
|
||||
message = line
|
||||
}
|
||||
|
||||
if message == "" {
|
||||
continue
|
||||
}
|
||||
if !matchesAny(message, dmesgErrorPatterns) {
|
||||
continue
|
||||
}
|
||||
|
||||
severity := dmesgSeverity(message)
|
||||
source := "dmesg"
|
||||
|
||||
var eventTime *string
|
||||
if timestamp != "" {
|
||||
t := timestamp
|
||||
eventTime = &t
|
||||
} else {
|
||||
eventTime = &collectedAt
|
||||
}
|
||||
|
||||
entries = append(entries, schema.HardwareEventLog{
|
||||
Source: source,
|
||||
EventTime: eventTime,
|
||||
Severity: &severity,
|
||||
Message: message,
|
||||
})
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
// matchesAny reports whether s matches at least one of the given patterns.
// It returns false for an empty or nil pattern list.
func matchesAny(s string, patterns []*regexp.Regexp) bool {
	for i := range patterns {
		if patterns[i].MatchString(s) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func dmesgSeverity(msg string) string {
|
||||
lower := strings.ToLower(msg)
|
||||
switch {
|
||||
case strings.Contains(lower, "panic") ||
|
||||
strings.Contains(lower, "aer") ||
|
||||
strings.Contains(lower, "uncorrect") ||
|
||||
strings.Contains(lower, "xid") ||
|
||||
strings.Contains(lower, "nvrm"):
|
||||
return statusCritical
|
||||
case strings.Contains(lower, "error") ||
|
||||
strings.Contains(lower, "fault") ||
|
||||
strings.Contains(lower, "fail") ||
|
||||
strings.Contains(lower, "dead") ||
|
||||
strings.Contains(lower, "hang"):
|
||||
return statusCritical
|
||||
default:
|
||||
return statusWarning
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,90 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// collectIPMISEL runs `ipmitool sel list` and returns parsed event log entries.
|
||||
// Returns nil if ipmitool is unavailable or the SEL is empty.
|
||||
func collectIPMISEL() []schema.HardwareEventLog {
|
||||
out, err := exec.Command("ipmitool", "sel", "list").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
entries := parseIPMISELOutput(string(out))
|
||||
if len(entries) == 0 {
|
||||
return nil
|
||||
}
|
||||
slog.Info("ipmi sel: collected", "entries", len(entries))
|
||||
return entries
|
||||
}
|
||||
|
||||
// parseIPMISELOutput parses `ipmitool sel list` output.
|
||||
// Line format: ID | date | time | sensor | event description | direction
|
||||
// Example: 1 | 06/18/2026 | 14:23:45 | Temperature #0x30 | Upper Critical going high | Asserted
|
||||
func parseIPMISELOutput(output string) []schema.HardwareEventLog {
|
||||
var entries []schema.HardwareEventLog
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(line, "|", 6)
|
||||
if len(parts) < 5 {
|
||||
continue
|
||||
}
|
||||
id := strings.TrimSpace(parts[0])
|
||||
date := strings.TrimSpace(parts[1])
|
||||
timeStr := strings.TrimSpace(parts[2])
|
||||
sensor := strings.TrimSpace(parts[3])
|
||||
event := strings.TrimSpace(parts[4])
|
||||
direction := ""
|
||||
if len(parts) == 6 {
|
||||
direction = strings.TrimSpace(parts[5])
|
||||
}
|
||||
|
||||
var eventTime *string
|
||||
if date != "" && timeStr != "" {
|
||||
t := fmt.Sprintf("%s %s", date, timeStr)
|
||||
eventTime = &t
|
||||
}
|
||||
|
||||
message := event
|
||||
if direction != "" && strings.EqualFold(direction, "Deasserted") {
|
||||
message = event + " (Deasserted)"
|
||||
}
|
||||
|
||||
severity := ipmiSELSeverity(event)
|
||||
isActive := !strings.EqualFold(direction, "Deasserted")
|
||||
|
||||
entry := schema.HardwareEventLog{
|
||||
Source: "ipmi-sel",
|
||||
EventTime: eventTime,
|
||||
Severity: &severity,
|
||||
MessageID: &id,
|
||||
Message: message,
|
||||
IsActive: &isActive,
|
||||
}
|
||||
if sensor != "" {
|
||||
entry.ComponentRef = &sensor
|
||||
}
|
||||
entries = append(entries, entry)
|
||||
}
|
||||
return entries
|
||||
}
|
||||
|
||||
func ipmiSELSeverity(event string) string {
|
||||
lower := strings.ToLower(event)
|
||||
switch {
|
||||
case strings.Contains(lower, "critical") || strings.Contains(lower, "non-recoverable"):
|
||||
return statusCritical
|
||||
case strings.Contains(lower, "non-critical") || strings.Contains(lower, "warning") || strings.Contains(lower, "degraded"):
|
||||
return statusWarning
|
||||
default:
|
||||
return "info"
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,216 @@
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bee/audit/internal/schema"
|
||||
"log/slog"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// collectIPMISensors runs `ipmitool sensor` and returns parsed sensor readings.
|
||||
// Returns nil if ipmitool is unavailable or produces no output.
|
||||
func collectIPMISensors() *schema.HardwareSensors {
|
||||
out, err := exec.Command("ipmitool", "sensor").Output()
|
||||
if err != nil || len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
result := parseIPMISensorOutput(string(out))
|
||||
if result == nil {
|
||||
return nil
|
||||
}
|
||||
slog.Info("ipmi sensors: collected",
|
||||
"fans", len(result.Fans),
|
||||
"temperatures", len(result.Temperatures),
|
||||
"power", len(result.Power),
|
||||
"other", len(result.Other),
|
||||
)
|
||||
return result
|
||||
}
|
||||
|
||||
// parseIPMISensorOutput parses `ipmitool sensor` text output.
|
||||
// Each line: name | value | unit | status | lnr | lcr | lnc | unc | ucr | unr
|
||||
func parseIPMISensorOutput(output string) *schema.HardwareSensors {
|
||||
result := &schema.HardwareSensors{}
|
||||
seen := map[string]struct{}{}
|
||||
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
parts := strings.Split(line, "|")
|
||||
if len(parts) < 4 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(parts[0])
|
||||
rawVal := strings.TrimSpace(parts[1])
|
||||
unit := strings.TrimSpace(parts[2])
|
||||
status := strings.TrimSpace(parts[3])
|
||||
|
||||
if name == "" || rawVal == "na" || rawVal == "N/A" || rawVal == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
value, err := strconv.ParseFloat(rawVal, 64)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
statusStr := normalizeIPMISensorStatus(status)
|
||||
|
||||
switch {
|
||||
case strings.EqualFold(unit, "RPM"):
|
||||
if duplicateSensor(seen, "fan", name) {
|
||||
continue
|
||||
}
|
||||
rpm := int(value)
|
||||
item := schema.HardwareFanSensor{Name: name, RPM: &rpm}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Fans = append(result.Fans, item)
|
||||
|
||||
case strings.EqualFold(unit, "degrees C") || strings.EqualFold(unit, "C"):
|
||||
if duplicateSensor(seen, "temp", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwareTemperatureSensor{Name: name, Celsius: &value}
|
||||
if len(parts) >= 9 {
|
||||
if unc := parseIPMIThreshold(parts[7]); unc != nil {
|
||||
item.ThresholdWarningCelsius = unc
|
||||
}
|
||||
if ucr := parseIPMIThreshold(parts[8]); ucr != nil {
|
||||
item.ThresholdCriticalCelsius = ucr
|
||||
}
|
||||
}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
} else {
|
||||
item.Status = deriveTemperatureStatus(item.Celsius, item.ThresholdWarningCelsius, item.ThresholdCriticalCelsius)
|
||||
}
|
||||
result.Temperatures = append(result.Temperatures, item)
|
||||
|
||||
case strings.EqualFold(unit, "Volts") || strings.EqualFold(unit, "V"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, VoltageV: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
case strings.EqualFold(unit, "Watts") || strings.EqualFold(unit, "W"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, PowerW: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
case strings.EqualFold(unit, "Amps") || strings.EqualFold(unit, "A"):
|
||||
if duplicateSensor(seen, "power", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwarePowerSensor{Name: name, CurrentA: &value}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Power = append(result.Power, item)
|
||||
|
||||
default:
|
||||
if duplicateSensor(seen, "other", name) {
|
||||
continue
|
||||
}
|
||||
item := schema.HardwareOtherSensor{Name: name, Value: &value}
|
||||
if unit != "" {
|
||||
item.Unit = &unit
|
||||
}
|
||||
if statusStr != "" {
|
||||
item.Status = &statusStr
|
||||
}
|
||||
result.Other = append(result.Other, item)
|
||||
}
|
||||
}
|
||||
|
||||
if len(result.Fans) == 0 && len(result.Temperatures) == 0 && len(result.Power) == 0 && len(result.Other) == 0 {
|
||||
return nil
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// parseIPMIThreshold parses one threshold column of `ipmitool sensor` output.
// It returns nil for an empty value, "na", "N/A", or anything that is not a
// valid float; otherwise it returns a pointer to the parsed number.
func parseIPMIThreshold(raw string) *float64 {
	switch text := strings.TrimSpace(raw); text {
	case "", "na", "N/A":
		return nil
	default:
		if parsed, err := strconv.ParseFloat(text, 64); err == nil {
			return &parsed
		}
		return nil
	}
}
|
||||
|
||||
func normalizeIPMISensorStatus(s string) string {
|
||||
switch strings.ToLower(s) {
|
||||
case "ok":
|
||||
return statusOK
|
||||
case "cr", "ucr", "lcr":
|
||||
return statusCritical
|
||||
case "nc", "unc", "lnc", "nr", "unr", "lnr":
|
||||
return statusWarning
|
||||
case "ns", "na":
|
||||
return ""
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
// mergeIPMISensors appends IPMI sensor entries into existing, skipping names already present.
|
||||
func mergeIPMISensors(existing, ipmi *schema.HardwareSensors) *schema.HardwareSensors {
|
||||
if ipmi == nil {
|
||||
return existing
|
||||
}
|
||||
if existing == nil {
|
||||
return ipmi
|
||||
}
|
||||
|
||||
existingNames := map[string]struct{}{}
|
||||
for _, s := range existing.Fans {
|
||||
existingNames["fan\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Temperatures {
|
||||
existingNames["temp\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Power {
|
||||
existingNames["power\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
for _, s := range existing.Other {
|
||||
existingNames["other\x00"+s.Name] = struct{}{}
|
||||
}
|
||||
|
||||
for _, s := range ipmi.Fans {
|
||||
if _, ok := existingNames["fan\x00"+s.Name]; !ok {
|
||||
existing.Fans = append(existing.Fans, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Temperatures {
|
||||
if _, ok := existingNames["temp\x00"+s.Name]; !ok {
|
||||
existing.Temperatures = append(existing.Temperatures, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Power {
|
||||
if _, ok := existingNames["power\x00"+s.Name]; !ok {
|
||||
existing.Power = append(existing.Power, s)
|
||||
}
|
||||
}
|
||||
for _, s := range ipmi.Other {
|
||||
if _, ok := existingNames["other\x00"+s.Name]; !ok {
|
||||
existing.Other = append(existing.Other, s)
|
||||
}
|
||||
}
|
||||
return existing
|
||||
}
|
||||
@@ -766,7 +766,7 @@ func parseMDAdmPlatformLicense(raw string) *string {
|
||||
|
||||
func queryDeviceSerial(devPath string) string {
|
||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||
var ctrl nvmeIDCtrl
|
||||
var ctrl NVMeIDCtrl
|
||||
if json.Unmarshal(out, &ctrl) == nil {
|
||||
if v := cleanDMIValue(strings.TrimSpace(ctrl.SerialNumber)); v != "" {
|
||||
return v
|
||||
|
||||
@@ -84,16 +84,19 @@ func collectStorage() []schema.HardwareStorage {
|
||||
return result
|
||||
}
|
||||
|
||||
// jsonInt64 accepts both a bare JSON number and a JSON-quoted number string.
|
||||
// lsblk -J emits LOG-SEC / PHY-SEC as integers on util-linux ≥ 2.37 (Debian 12)
|
||||
// but older versions emit them as strings. This type handles both.
|
||||
type jsonInt64 int64
|
||||
// JSONInt64 accepts a bare JSON number (512), a JSON-quoted number string
|
||||
// ("512" — lsblk -J on util-linux < 2.37, and nvme-cli for large 64-bit
|
||||
// counters that would lose precision as JS numbers), or a {"lo":n,"hi":n}
|
||||
// object (128-bit NVMe counters on some nvme-cli versions; hi is ignored as
|
||||
// no real counter exceeds 64 bits). Shared by lsblk and nvme-cli JSON output
|
||||
// across the collector and the human-readable disk report.
|
||||
type JSONInt64 int64
|
||||
|
||||
func (j *jsonInt64) UnmarshalJSON(data []byte) error {
|
||||
func (j *JSONInt64) UnmarshalJSON(data []byte) error {
|
||||
// bare number: 512
|
||||
var n int64
|
||||
if err := json.Unmarshal(data, &n); err == nil {
|
||||
*j = jsonInt64(n)
|
||||
*j = JSONInt64(n)
|
||||
return nil
|
||||
}
|
||||
// quoted string: "512"
|
||||
@@ -101,24 +104,32 @@ func (j *jsonInt64) UnmarshalJSON(data []byte) error {
|
||||
if err := json.Unmarshal(data, &s); err == nil {
|
||||
n, err := strconv.ParseInt(strings.TrimSpace(s), 10, 64)
|
||||
if err == nil {
|
||||
*j = jsonInt64(n)
|
||||
*j = JSONInt64(n)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
// {"lo":n,"hi":n} 128-bit counter object
|
||||
var obj struct {
|
||||
Lo int64 `json:"lo"`
|
||||
}
|
||||
if err := json.Unmarshal(data, &obj); err == nil {
|
||||
*j = JSONInt64(obj.Lo)
|
||||
return nil
|
||||
}
|
||||
return nil // null or unexpected type — leave zero
|
||||
}
|
||||
|
||||
// lsblkDevice is a minimal lsblk JSON record.
|
||||
type lsblkDevice struct {
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Size string `json:"size"`
|
||||
Serial string `json:"serial"`
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
LogSec jsonInt64 `json:"log-sec"`
|
||||
PhySec jsonInt64 `json:"phy-sec"`
|
||||
Name string `json:"name"`
|
||||
Type string `json:"type"`
|
||||
Size string `json:"size"`
|
||||
Serial string `json:"serial"`
|
||||
Model string `json:"model"`
|
||||
Tran string `json:"tran"`
|
||||
Hctl string `json:"hctl"`
|
||||
LogSec JSONInt64 `json:"log-sec"`
|
||||
PhySec JSONInt64 `json:"phy-sec"`
|
||||
}
|
||||
|
||||
type lsblkRoot struct {
|
||||
@@ -423,32 +434,36 @@ func enrichWithSmartctl(dev lsblkDevice) schema.HardwareStorage {
|
||||
return s
|
||||
}
|
||||
|
||||
// nvmeSmartLog is the subset of `nvme smart-log -o json` output we care about.
|
||||
// nvme-cli emits most counters as JSON strings (e.g. "power_on_hours":"49"),
|
||||
// so all numeric fields use jsonInt64 which accepts both bare numbers and
|
||||
// quoted strings. Field names match nvme-cli JSON output, not NVMe spec prose.
|
||||
type nvmeSmartLog struct {
|
||||
CriticalWarning jsonInt64 `json:"critical_warning"`
|
||||
PercentageUsed jsonInt64 `json:"percent_used"`
|
||||
AvailableSpare jsonInt64 `json:"avail_spare"`
|
||||
SpareThreshold jsonInt64 `json:"spare_thresh"`
|
||||
Temperature jsonInt64 `json:"temperature"`
|
||||
PowerOnHours jsonInt64 `json:"power_on_hours"`
|
||||
PowerCycles jsonInt64 `json:"power_cycles"`
|
||||
UnsafeShutdowns jsonInt64 `json:"unsafe_shutdowns"`
|
||||
DataUnitsRead jsonInt64 `json:"data_units_read"`
|
||||
DataUnitsWritten jsonInt64 `json:"data_units_written"`
|
||||
ControllerBusy jsonInt64 `json:"controller_busy_time"`
|
||||
MediaErrors jsonInt64 `json:"media_errors"`
|
||||
NumErrLogEntries jsonInt64 `json:"num_err_log_entries"`
|
||||
// NVMeSmartLog is the subset of `nvme smart-log -o json` output shared by the
|
||||
// structured collector and the human-readable disk report. nvme-cli emits
|
||||
// most counters as JSON strings (e.g. "power_on_hours":"49") or, on some
|
||||
// versions, as {"lo":n,"hi":n} objects — all numeric fields use JSONInt64,
|
||||
// which accepts bare numbers, quoted strings, and lo/hi objects. Field names
|
||||
// match nvme-cli JSON output, not NVMe spec prose.
|
||||
type NVMeSmartLog struct {
|
||||
CriticalWarning JSONInt64 `json:"critical_warning"`
|
||||
PercentageUsed JSONInt64 `json:"percent_used"`
|
||||
AvailableSpare JSONInt64 `json:"avail_spare"`
|
||||
SpareThreshold JSONInt64 `json:"spare_thresh"`
|
||||
Temperature JSONInt64 `json:"temperature"`
|
||||
PowerOnHours JSONInt64 `json:"power_on_hours"`
|
||||
PowerCycles JSONInt64 `json:"power_cycles"`
|
||||
UnsafeShutdowns JSONInt64 `json:"unsafe_shutdowns"`
|
||||
DataUnitsRead JSONInt64 `json:"data_units_read"`
|
||||
DataUnitsWritten JSONInt64 `json:"data_units_written"`
|
||||
ControllerBusy JSONInt64 `json:"controller_busy_time"`
|
||||
MediaErrors JSONInt64 `json:"media_errors"`
|
||||
NumErrLogEntries JSONInt64 `json:"num_err_log_entries"`
|
||||
}
|
||||
|
||||
// nvmeIDCtrl is the subset of `nvme id-ctrl -o json` output.
|
||||
type nvmeIDCtrl struct {
|
||||
ModelNumber string `json:"mn"`
|
||||
SerialNumber string `json:"sn"`
|
||||
FirmwareRev string `json:"fr"`
|
||||
TotalCapacity int64 `json:"tnvmcap"`
|
||||
// NVMeIDCtrl is the subset of `nvme id-ctrl -o json` output shared by the
|
||||
// structured collector and the human-readable disk report.
|
||||
type NVMeIDCtrl struct {
|
||||
ModelNumber string `json:"mn"`
|
||||
SerialNumber string `json:"sn"`
|
||||
FirmwareRev string `json:"fr"`
|
||||
TotalCapacity JSONInt64 `json:"tnvmcap"`
|
||||
NVMCapacity JSONInt64 `json:"nvmcap"`
|
||||
}
|
||||
|
||||
func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
@@ -481,7 +496,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
|
||||
// id-ctrl: model, serial, firmware, capacity
|
||||
if out, err := exec.Command("nvme", "id-ctrl", devPath, "-o", "json").Output(); err == nil {
|
||||
var ctrl nvmeIDCtrl
|
||||
var ctrl NVMeIDCtrl
|
||||
if json.Unmarshal(out, &ctrl) == nil {
|
||||
if v := cleanDMIValue(strings.TrimSpace(ctrl.ModelNumber)); v != "" {
|
||||
s.Model = &v
|
||||
@@ -502,7 +517,7 @@ func enrichWithNVMe(dev lsblkDevice) schema.HardwareStorage {
|
||||
|
||||
// smart-log: wear telemetry
|
||||
if out, err := exec.Command("nvme", "smart-log", devPath, "-o", "json").Output(); err == nil {
|
||||
var log nvmeSmartLog
|
||||
var log NVMeSmartLog
|
||||
if json.Unmarshal(out, &log) == nil {
|
||||
if log.PowerOnHours > 0 {
|
||||
v := int64(log.PowerOnHours)
|
||||
|
||||
@@ -56,7 +56,7 @@ func TestJsonInt64UnmarshalBothFormats(t *testing.T) {
|
||||
{`null`, 0},
|
||||
}
|
||||
for _, tc := range cases {
|
||||
var v jsonInt64
|
||||
var v JSONInt64
|
||||
if err := v.UnmarshalJSON([]byte(tc.json)); err != nil {
|
||||
t.Fatalf("UnmarshalJSON(%s): unexpected error %v", tc.json, err)
|
||||
}
|
||||
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
|
||||
// TestNVMeSmartLogUnmarshal verifies that nvme-cli JSON output (where most
|
||||
// counters are quoted strings and field names differ from NVMe spec prose)
|
||||
// is correctly parsed into nvmeSmartLog.
|
||||
// is correctly parsed into NVMeSmartLog.
|
||||
func TestNVMeSmartLogUnmarshal(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
@@ -30,7 +30,7 @@ func TestNVMeSmartLogUnmarshal(t *testing.T) {
|
||||
"media_errors": "0",
|
||||
"num_err_log_entries": "0"
|
||||
}`
|
||||
var log nvmeSmartLog
|
||||
var log NVMeSmartLog
|
||||
if err := json.Unmarshal([]byte(raw), &log); err != nil {
|
||||
t.Fatalf("json.Unmarshal failed: %v", err)
|
||||
}
|
||||
|
||||
@@ -4008,14 +4008,23 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
idleW = result.ServerPower.IdleW
|
||||
}
|
||||
|
||||
// Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff.
|
||||
// Build header: Run | GPU 0 | GPU 1 | ... | GPU total W | Server itself W | Server wall W | Per GPU wall W | Platform eff.
|
||||
headers := []string{"Run"}
|
||||
for _, idx := range allGPUIndices {
|
||||
headers = append(headers, fmt.Sprintf("GPU %d W", idx))
|
||||
}
|
||||
headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.")
|
||||
headers = append(headers, "GPU total W", "Server itself W", "Server wall W", "Per GPU wall W", "Platform eff.")
|
||||
|
||||
var rampRows [][]string
|
||||
if idleW > 0 {
|
||||
idleRow := []string{"0 (idle)"}
|
||||
for range allGPUIndices {
|
||||
idleRow = append(idleRow, "—")
|
||||
}
|
||||
// No load: GPU total is negligible, all draw is the server's own baseline.
|
||||
idleRow = append(idleRow, "—", fmt.Sprintf("%.0f", idleW), fmt.Sprintf("%.0f", idleW), "—", "—")
|
||||
rampRows = append(rampRows, idleRow)
|
||||
}
|
||||
for _, step := range result.RampSteps {
|
||||
row := []string{fmt.Sprintf("%d", step.StepIndex)}
|
||||
for _, idx := range allGPUIndices {
|
||||
@@ -4036,6 +4045,16 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
row = append(row, gpuPwr)
|
||||
}
|
||||
// GPU total W = sum of observed GPU power (nvidia-smi)
|
||||
gpuTotal := "—"
|
||||
if step.TotalObservedPowerW > 0 {
|
||||
gpuTotal = fmt.Sprintf("%.0f", step.TotalObservedPowerW)
|
||||
}
|
||||
// Server itself W = server wall power minus GPU total (non-GPU baseline draw)
|
||||
serverItself := "—"
|
||||
if step.ServerLoadedW > 0 && step.TotalObservedPowerW > 0 {
|
||||
serverItself = fmt.Sprintf("%.0f", step.ServerLoadedW-step.TotalObservedPowerW)
|
||||
}
|
||||
// Server wall W
|
||||
serverWall := "—"
|
||||
if step.ServerLoadedW > 0 {
|
||||
@@ -4055,7 +4074,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
platEff = fmt.Sprintf("%.2f", eff)
|
||||
}
|
||||
row = append(row, serverWall, perGPUWall, platEff)
|
||||
row = append(row, gpuTotal, serverItself, serverWall, perGPUWall, platEff)
|
||||
rampRows = append(rampRows, row)
|
||||
}
|
||||
b.WriteString(fmtMDTable(headers, rampRows))
|
||||
@@ -4617,6 +4636,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
ramp.AvgFanRPM = singleRun.AvgFanRPM
|
||||
ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
|
||||
}
|
||||
firstSummary := firstCalib.Summary
|
||||
ramp.PerGPUTelemetry = map[int]*BenchmarkTelemetrySummary{firstIdx: &firstSummary}
|
||||
if !firstCalib.Completed {
|
||||
ramp.Status = "FAILED"
|
||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||
|
||||
@@ -0,0 +1,248 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ConfidentialComputingStatus is the report of a Confidential Computing
// capability check. It combines what the GPU driver reports through
// `nvidia-smi conf-compute -q` with host-side evidence of a CPU TEE
// (AMD SEV-SNP / Intel TDX) and the verdicts derived from both.
type ConfidentialComputingStatus struct {
	CollectedAt time.Time `json:"collected_at"`

	// Fields parsed from `nvidia-smi conf-compute -q`.
	NvidiaSMIAvailable bool   `json:"nvidia_smi_available"`
	CCState            string `json:"cc_state,omitempty"`            // ON / OFF
	MultiGPUMode       string `json:"multi_gpu_mode,omitempty"`      // Protected PCIe / ...
	CPUCCCapability    string `json:"cpu_cc_capability,omitempty"`   // e.g. "INTEL TDX", "AMD SEV-SNP", "NONE"
	GPUCCCapability    string `json:"gpu_cc_capability,omitempty"`   // e.g. "CC Capable", "Not Capable"
	CCGPUsReadyState   string `json:"cc_gpus_ready_state,omitempty"` // Ready / Not Ready

	// Host-side evidence (BIOS + kernel cmdline + firmware) that the CPU TEE
	// is active in the running kernel. It does not depend on the GPU driver
	// and serves as a fallback when the NVIDIA driver is not loaded.
	HostAMDSEVSupported   bool `json:"host_amd_sev_supported"`
	HostAMDSEVESSupported bool `json:"host_amd_sev_es_supported"`
	HostAMDSEVSNPActive   bool `json:"host_amd_sev_snp_active"`
	HostIntelTDXActive    bool `json:"host_intel_tdx_active"`

	// GPUCanRunCC: the GPU firmware reports CC-capable.
	GPUCanRunCC bool `json:"gpu_can_run_cc"`
	// CPUCanRunCC: the GPU driver or the host kernel reports an
	// active/available CPU TEE (SEV-SNP or TDX).
	CPUCanRunCC bool `json:"cpu_can_run_cc"`
	// Ready: both CPU and GPU support Confidential Computing, whether or not
	// CC mode is currently enabled.
	Ready bool `json:"ready"`

	Notes []string `json:"notes,omitempty"`
}
|
||||
|
||||
// RunConfidentialComputingCheckPack runs a read-only check of whether this
|
||||
// server can run NVIDIA Confidential Computing: it queries the GPU driver
|
||||
// (`nvidia-smi conf-compute -q`) and inspects host kernel/dmesg evidence of
|
||||
// AMD SEV-SNP / Intel TDX support. It changes nothing on the system.
|
||||
func (s *System) RunConfidentialComputingCheckPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if ctx == nil {
|
||||
ctx = context.Background()
|
||||
}
|
||||
if baseDir == "" {
|
||||
baseDir = "/var/log/bee-sat"
|
||||
}
|
||||
ts := time.Now().UTC().Format("20060102-150405")
|
||||
runDir := filepath.Join(baseDir, "confidential-computing-"+ts)
|
||||
if err := os.MkdirAll(runDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
verboseLog := filepath.Join(runDir, "verbose.log")
|
||||
|
||||
status := ConfidentialComputingStatus{CollectedAt: time.Now().UTC()}
|
||||
|
||||
// GPU firmware / driver state.
|
||||
ccOut, ccErr := runSATCommandCtx(ctx, verboseLog, "nvidia-smi-conf-compute-q", []string{"nvidia-smi", "conf-compute", "-q"}, nil, logFunc)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "01-nvidia-smi-conf-compute-q.log"), ccOut, 0644)
|
||||
if ccErr == nil {
|
||||
status.NvidiaSMIAvailable = true
|
||||
fields := parseConfComputeFields(ccOut)
|
||||
status.CCState = fields["CC State"]
|
||||
status.MultiGPUMode = fields["Multi-GPU Mode"]
|
||||
status.CPUCCCapability = fields["CPU CC Capabilities"]
|
||||
status.GPUCCCapability = fields["GPU CC Capabilities"]
|
||||
status.CCGPUsReadyState = fields["CC GPUs Ready State"]
|
||||
} else {
|
||||
status.Notes = append(status.Notes, "nvidia-smi conf-compute -q unavailable (no NVIDIA driver, or GPU not present): "+firstLine(string(ccOut)))
|
||||
}
|
||||
|
||||
// Host kernel evidence, independent of the GPU driver.
|
||||
dmesgOut, _ := runSATCommandCtx(ctx, verboseLog, "dmesg", []string{"dmesg"}, nil, nil)
|
||||
ccDmesgLines := filterConfComputeDmesgLines(dmesgOut)
|
||||
_ = os.WriteFile(filepath.Join(runDir, "02-dmesg-cc-relevant.log"), []byte(strings.Join(ccDmesgLines, "\n")+"\n"), 0644)
|
||||
|
||||
lowerDmesg := strings.ToLower(strings.Join(ccDmesgLines, "\n"))
|
||||
status.HostAMDSEVSNPActive = strings.Contains(lowerDmesg, "sev-snp enabled")
|
||||
status.HostIntelTDXActive = strings.Contains(lowerDmesg, "tdx module") && strings.Contains(lowerDmesg, "module initialized") ||
|
||||
strings.Contains(lowerDmesg, "virt/tdx: module initialized")
|
||||
|
||||
for i, path := range []string{
|
||||
"/sys/module/kvm_amd/parameters/sev",
|
||||
"/sys/module/kvm_amd/parameters/sev_es",
|
||||
"/sys/module/kvm_amd/parameters/sev_snp",
|
||||
} {
|
||||
name := fmt.Sprintf("sysfs-%s", filepath.Base(path))
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, name, []string{"cat", path}, nil, nil)
|
||||
_ = os.WriteFile(filepath.Join(runDir, fmt.Sprintf("03-%02d-%s.log", i+1, name)), out, 0644)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
val := strings.TrimSpace(string(out))
|
||||
switch filepath.Base(path) {
|
||||
case "sev":
|
||||
status.HostAMDSEVSupported = strings.EqualFold(val, "Y")
|
||||
case "sev_es":
|
||||
status.HostAMDSEVESSupported = strings.EqualFold(val, "Y")
|
||||
case "sev_snp":
|
||||
if strings.EqualFold(val, "Y") {
|
||||
status.HostAMDSEVSNPActive = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
status.GPUCanRunCC = strings.EqualFold(strings.TrimSpace(status.GPUCCCapability), "CC Capable")
|
||||
cpuCapReported := strings.TrimSpace(status.CPUCCCapability)
|
||||
status.CPUCanRunCC = status.HostAMDSEVSNPActive || status.HostIntelTDXActive ||
|
||||
(cpuCapReported != "" && !strings.EqualFold(cpuCapReported, "NONE"))
|
||||
status.Ready = status.CPUCanRunCC && status.GPUCanRunCC
|
||||
|
||||
if !status.NvidiaSMIAvailable {
|
||||
status.Notes = append(status.Notes, "GPU CC capability unknown — install the NVIDIA driver to query it with `nvidia-smi conf-compute -q`.")
|
||||
}
|
||||
|
||||
summary := renderConfidentialComputingSummary(status)
|
||||
if err := os.WriteFile(filepath.Join(runDir, "summary.txt"), []byte(summary), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
report := renderConfidentialComputingReport(status)
|
||||
if err := os.WriteFile(filepath.Join(runDir, "confidential-computing-report.txt"), []byte(report), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return runDir, nil
|
||||
}
|
||||
|
||||
// parseConfComputeFields turns the "Key : Value" lines printed by
// `nvidia-smi conf-compute -q` into a map, for example:
//
//	CC State             : OFF
//	Multi-GPU Mode       : Protected PCIe
//	CPU CC Capabilities  : INTEL TDX
//	GPU CC Capabilities  : CC Capable
//	CC GPUs Ready State  : Not Ready
//
// Each line is split at its first ':'; key and value are whitespace-trimmed.
// Lines without a colon, or with an empty key or value, are ignored. When a
// key occurs more than once the last occurrence wins.
func parseConfComputeFields(out []byte) map[string]string {
	parsed := make(map[string]string)
	for _, row := range strings.Split(string(out), "\n") {
		kv := strings.SplitN(row, ":", 2)
		if len(kv) != 2 {
			continue
		}
		k, v := strings.TrimSpace(kv[0]), strings.TrimSpace(kv[1])
		if k != "" && v != "" {
			parsed[k] = v
		}
	}
	return parsed
}
|
||||
|
||||
// filterConfComputeDmesgLines returns, in their original order and spelling,
// the dmesg lines that mention CPU Confidential Computing features (AMD
// SEV / SEV-ES / SEV-SNP, Intel TDX).
//
// Matching is case-insensitive and requires "sev" or "tdx" to appear as a
// standalone token, i.e. not directly preceded or followed by an ASCII
// letter. This keeps lines such as "kvm_amd: SEV-SNP enabled", "sev_es" or
// "virt/tdx: module initialized" while dropping unrelated kernel messages
// that merely contain the letters, e.g. "AER: severity=Corrected" or
// "several".
func filterConfComputeDmesgLines(dmesgOut []byte) []string {
	var lines []string
	for _, raw := range bytes.Split(dmesgOut, []byte("\n")) {
		line := string(raw)
		lower := strings.ToLower(line)
		if hasConfComputeToken(lower, "sev") || hasConfComputeToken(lower, "tdx") {
			lines = append(lines, line)
		}
	}
	return lines
}

// hasConfComputeToken reports whether token occurs in lower without an ASCII
// lowercase letter immediately before or after it. Both arguments are
// expected to be lowercased already.
func hasConfComputeToken(lower, token string) bool {
	isLetter := func(c byte) bool { return c >= 'a' && c <= 'z' }
	for from := 0; from < len(lower); {
		rel := strings.Index(lower[from:], token)
		if rel < 0 {
			return false
		}
		start := from + rel
		end := start + len(token)
		letterBefore := start > 0 && isLetter(lower[start-1])
		letterAfter := end < len(lower) && isLetter(lower[end])
		if !letterBefore && !letterAfter {
			return true
		}
		// Keep scanning: a later occurrence on the same line may qualify.
		from = start + 1
	}
	return false
}
|
||||
|
||||
func renderConfidentialComputingSummary(status ConfidentialComputingStatus) string {
|
||||
var b strings.Builder
|
||||
fmt.Fprintf(&b, "run_at_utc=%s\n", status.CollectedAt.Format(time.RFC3339))
|
||||
fmt.Fprintf(&b, "nvidia_smi_available=%t\n", status.NvidiaSMIAvailable)
|
||||
fmt.Fprintf(&b, "cc_state=%s\n", status.CCState)
|
||||
fmt.Fprintf(&b, "multi_gpu_mode=%s\n", status.MultiGPUMode)
|
||||
fmt.Fprintf(&b, "cpu_cc_capability=%s\n", status.CPUCCCapability)
|
||||
fmt.Fprintf(&b, "gpu_cc_capability=%s\n", status.GPUCCCapability)
|
||||
fmt.Fprintf(&b, "cc_gpus_ready_state=%s\n", status.CCGPUsReadyState)
|
||||
fmt.Fprintf(&b, "host_amd_sev_supported=%t\n", status.HostAMDSEVSupported)
|
||||
fmt.Fprintf(&b, "host_amd_sev_es_supported=%t\n", status.HostAMDSEVESSupported)
|
||||
fmt.Fprintf(&b, "host_amd_sev_snp_active=%t\n", status.HostAMDSEVSNPActive)
|
||||
fmt.Fprintf(&b, "host_intel_tdx_active=%t\n", status.HostIntelTDXActive)
|
||||
fmt.Fprintf(&b, "cpu_can_run_cc=%t\n", status.CPUCanRunCC)
|
||||
fmt.Fprintf(&b, "gpu_can_run_cc=%t\n", status.GPUCanRunCC)
|
||||
fmt.Fprintf(&b, "ready=%t\n", status.Ready)
|
||||
if status.Ready {
|
||||
fmt.Fprintln(&b, "overall_status=OK")
|
||||
} else {
|
||||
fmt.Fprintln(&b, "overall_status=NOT_READY")
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func renderConfidentialComputingReport(status ConfidentialComputingStatus) string {
|
||||
var b strings.Builder
|
||||
line := strings.Repeat("=", 80)
|
||||
b.WriteString(line + "\n")
|
||||
b.WriteString("Confidential Computing Readiness\n")
|
||||
b.WriteString(line + "\n\n")
|
||||
|
||||
verdict := "NOT READY"
|
||||
if status.Ready {
|
||||
verdict = "READY"
|
||||
}
|
||||
fmt.Fprintf(&b, "Verdict: %s\n\n", verdict)
|
||||
|
||||
b.WriteString("-- CPU ----------------------------------------------------------------------\n")
|
||||
fmt.Fprintf(&b, " Reported by GPU driver : %s\n", nonEmptyOr(status.CPUCCCapability, "unknown"))
|
||||
fmt.Fprintf(&b, " AMD SEV supported : %t\n", status.HostAMDSEVSupported)
|
||||
fmt.Fprintf(&b, " AMD SEV-ES supported : %t\n", status.HostAMDSEVESSupported)
|
||||
fmt.Fprintf(&b, " AMD SEV-SNP active : %t\n", status.HostAMDSEVSNPActive)
|
||||
fmt.Fprintf(&b, " Intel TDX active : %t\n", status.HostIntelTDXActive)
|
||||
fmt.Fprintf(&b, " Can run CC : %t\n\n", status.CPUCanRunCC)
|
||||
|
||||
b.WriteString("-- GPU ----------------------------------------------------------------------\n")
|
||||
fmt.Fprintf(&b, " nvidia-smi available : %t\n", status.NvidiaSMIAvailable)
|
||||
fmt.Fprintf(&b, " GPU CC Capabilities : %s\n", nonEmptyOr(status.GPUCCCapability, "unknown"))
|
||||
fmt.Fprintf(&b, " CC State (current) : %s\n", nonEmptyOr(status.CCState, "unknown"))
|
||||
fmt.Fprintf(&b, " Multi-GPU Mode : %s\n", nonEmptyOr(status.MultiGPUMode, "unknown"))
|
||||
fmt.Fprintf(&b, " CC GPUs Ready State : %s\n", nonEmptyOr(status.CCGPUsReadyState, "unknown"))
|
||||
fmt.Fprintf(&b, " Can run CC : %t\n\n", status.GPUCanRunCC)
|
||||
|
||||
if len(status.Notes) > 0 {
|
||||
b.WriteString("-- Notes ----------------------------------------------------------------------\n")
|
||||
for _, n := range status.Notes {
|
||||
fmt.Fprintf(&b, " - %s\n", n)
|
||||
}
|
||||
b.WriteString("\n")
|
||||
}
|
||||
|
||||
fmt.Fprintf(&b, "Collected : %s\n", status.CollectedAt.Format("2006-01-02 15:04:05 UTC"))
|
||||
b.WriteString(line + "\n")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// nonEmptyOr returns v unchanged unless it is empty or whitespace-only, in
// which case fallback is returned instead.
func nonEmptyOr(v, fallback string) string {
	if strings.TrimSpace(v) != "" {
		return v
	}
	return fallback
}
|
||||
@@ -182,9 +182,16 @@ func (s *System) DetectGPUVendor() string {
|
||||
return "amd"
|
||||
}
|
||||
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||||
text := strings.ToLower(string(raw))
|
||||
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
||||
return "amd"
|
||||
// Only match AMD GPU device classes [0300]=VGA, [0302]=3D controller, [0380]=Display.
|
||||
// AMD CPUs also appear in lspci as "Advanced Micro Devices" (Root Complex, IOMMU, etc.)
|
||||
// so matching vendor alone causes false positives on AMD CPU servers without GPUs.
|
||||
for _, line := range strings.Split(strings.ToLower(string(raw)), "\n") {
|
||||
if !strings.Contains(line, "advanced micro devices") && !strings.Contains(line, "amd/ati") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(line, "[0300]") || strings.Contains(line, "[0302]") || strings.Contains(line, "[0380]") {
|
||||
return "amd"
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
@@ -723,12 +730,14 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
|
||||
}
|
||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||
commands := storageSATCommands(devPath, extended)
|
||||
deviceOutputs := make(map[string][]byte, len(commands))
|
||||
for cmdIndex, job := range commands {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
|
||||
deviceOutputs[job.name] = out
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
@@ -737,7 +746,28 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
|
||||
key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||
|
||||
// smartctl -t short only launches the self-test on the drive firmware and
|
||||
// returns immediately ("Testing has begun"); unlike `nvme device-self-test
|
||||
// --wait`, smartctl has no blocking mode, so we must poll the drive
|
||||
// ourselves until the self-test actually finishes.
|
||||
if job.name == "smartctl-self-test-short" && err == nil {
|
||||
statusName := "smartctl-self-test-status"
|
||||
statusOut := waitForSmartctlSelfTest(ctx, verboseLog, devPath, logFunc)
|
||||
deviceOutputs[statusName] = statusOut
|
||||
statusFile := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+2, statusName)
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, statusFile), statusOut, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
sStatus, sRC := classifySATResult(statusName, statusOut, nil)
|
||||
stats.Add(sStatus)
|
||||
sKey := filepath.Base(devPath) + "_" + strings.ReplaceAll(statusName, "-", "_")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", sKey, sRC)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", sKey, sStatus)
|
||||
}
|
||||
}
|
||||
reportText := GenerateDiskReportText(index+1, devPath, deviceOutputs, time.Now().UTC())
|
||||
_ = os.WriteFile(filepath.Join(runDir, "disk-"+prefix+"-report.txt"), []byte(reportText), 0644)
|
||||
}
|
||||
|
||||
writeSATStats(&summary, stats)
|
||||
@@ -1170,6 +1200,42 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
return out, err
|
||||
}
|
||||
|
||||
// smartctlSelfTestPollInterval/Timeout bound how long we poll the drive after
|
||||
// launching `smartctl -t short`, which SMART/ATA specs put at ~2 minutes.
|
||||
const (
|
||||
smartctlSelfTestPollInterval = 5 * time.Second
|
||||
smartctlSelfTestTimeout = 4 * time.Minute
|
||||
)
|
||||
|
||||
// waitForSmartctlSelfTest polls `smartctl -a` until the short self-test
|
||||
// started on devPath finishes (or the timeout/context elapses) and returns
|
||||
// the final output, which reflects the actual test result rather than the
|
||||
// "Testing has begun" launch acknowledgement.
|
||||
func waitForSmartctlSelfTest(ctx context.Context, verboseLog, devPath string, logFunc func(string)) []byte {
|
||||
deadline := time.Now().Add(smartctlSelfTestTimeout)
|
||||
var last []byte
|
||||
for {
|
||||
out, _ := runSATCommandCtx(ctx, verboseLog, "smartctl-self-test-status", []string{"smartctl", "-a", devPath}, nil, nil)
|
||||
last = out
|
||||
if ctx.Err() != nil {
|
||||
return last
|
||||
}
|
||||
lower := bytes.ToLower(out)
|
||||
if !bytes.Contains(lower, []byte("self-test routine in progress")) &&
|
||||
!bytes.Contains(lower, []byte("% of test remaining")) {
|
||||
return last
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
return last
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return last
|
||||
case <-time.After(smartctlSelfTestPollInterval):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func listStorageDevices() ([]string, error) {
|
||||
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
@@ -1178,26 +1244,27 @@ func listStorageDevices() ([]string, error) {
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
// storageSATCommands returns the commands to run for a single storage device.
|
||||
// extended=false (Check): read-only SMART/NVMe data collection, no self-test.
|
||||
// extended=true (Load): data collection + short self-test.
|
||||
func storageSATCommands(devPath string, extended bool) []satJob {
|
||||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||
selfTestLevel := "1"
|
||||
if extended {
|
||||
selfTestLevel = "2"
|
||||
}
|
||||
return []satJob{
|
||||
jobs := []satJob{
|
||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||||
}
|
||||
if extended {
|
||||
jobs = append(jobs, satJob{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}})
|
||||
}
|
||||
return jobs
|
||||
}
|
||||
jobs := []satJob{
|
||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", "-i", devPath}},
|
||||
}
|
||||
smartTestType := "short"
|
||||
if extended {
|
||||
smartTestType = "long"
|
||||
}
|
||||
return []satJob{
|
||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||||
jobs = append(jobs, satJob{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}})
|
||||
}
|
||||
return jobs
|
||||
}
|
||||
|
||||
func (s *satStats) Add(status string) {
|
||||
|
||||
@@ -14,14 +14,42 @@ import (
|
||||
func TestStorageSATCommands(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||
// Check mode (extended=false): read-only collection, no self-test.
|
||||
nvmeCheck := storageSATCommands("/dev/nvme0n1", false)
|
||||
if len(nvmeCheck) != 2 {
|
||||
t.Fatalf("check nvme: want 2 commands, got %d: %#v", len(nvmeCheck), nvmeCheck)
|
||||
}
|
||||
if nvmeCheck[0].name != "nvme-id-ctrl" || nvmeCheck[1].name != "nvme-smart-log" {
|
||||
t.Fatalf("check nvme: unexpected command names: %#v", nvmeCheck)
|
||||
}
|
||||
|
||||
sata := storageSATCommands("/dev/sda", false)
|
||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||
sataCheck := storageSATCommands("/dev/sda", false)
|
||||
if len(sataCheck) != 1 || sataCheck[0].cmd[0] != "smartctl" {
|
||||
t.Fatalf("check sata: want 1 smartctl command, got %#v", sataCheck)
|
||||
}
|
||||
|
||||
// Load mode (extended=true): collection + short self-test.
|
||||
nvmeLoad := storageSATCommands("/dev/nvme0n1", true)
|
||||
if len(nvmeLoad) != 3 || nvmeLoad[2].name != "nvme-device-self-test" {
|
||||
t.Fatalf("load nvme: want 3 commands with self-test last, got %#v", nvmeLoad)
|
||||
}
|
||||
if got := nvmeLoad[2].cmd[len(nvmeLoad[2].cmd)-3]; got != "-s" {
|
||||
t.Fatalf("load nvme: want -s flag, got %q", got)
|
||||
}
|
||||
if got := nvmeLoad[2].cmd[len(nvmeLoad[2].cmd)-2]; got != "1" {
|
||||
t.Fatalf("load nvme: want self-test level 1, got %q", got)
|
||||
}
|
||||
|
||||
sataLoad := storageSATCommands("/dev/sda", true)
|
||||
if len(sataLoad) != 2 || sataLoad[1].name != "smartctl-self-test-short" {
|
||||
t.Fatalf("load sata: want 2 commands with short self-test last, got %#v", sataLoad)
|
||||
}
|
||||
// cmd is: smartctl -t short /dev/sda
|
||||
if got := sataLoad[1].cmd[1]; got != "-t" {
|
||||
t.Fatalf("load sata: want -t flag at index 1, got %q", got)
|
||||
}
|
||||
if got := sataLoad[1].cmd[2]; got != "short" {
|
||||
t.Fatalf("load sata: want short at index 2, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -0,0 +1,548 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"bee/audit/internal/collector"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// GenerateDiskReportText builds a human-readable text report for one storage
|
||||
// device from the raw command outputs collected during storage SAT.
|
||||
//
|
||||
// outputs keys match satJob.name: "nvme-id-ctrl", "nvme-smart-log",
|
||||
// "smartctl-health", "smartctl-self-test-short".
|
||||
func GenerateDiskReportText(index int, devPath string, outputs map[string][]byte, ts time.Time) string {
|
||||
var b strings.Builder
|
||||
devName := filepath.Base(devPath)
|
||||
line := strings.Repeat("=", 80)
|
||||
b.WriteString(line + "\n")
|
||||
fmt.Fprintf(&b, "Disk %-3d %s\n", index, devPath)
|
||||
b.WriteString(line + "\n")
|
||||
|
||||
isNVMe := strings.Contains(devName, "nvme")
|
||||
if isNVMe {
|
||||
writeNVMeReport(&b, outputs)
|
||||
} else {
|
||||
writeSATAReport(&b, outputs)
|
||||
}
|
||||
|
||||
b.WriteString("\n")
|
||||
fmt.Fprintf(&b, "Collected : %s\n", ts.UTC().Format("2006-01-02 15:04:05 UTC"))
|
||||
b.WriteString(line + "\n")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// ── NVMe ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
|
||||
// id-ctrl
|
||||
var ctrl collector.NVMeIDCtrl
|
||||
if data := outputs["nvme-id-ctrl"]; len(data) > 0 {
|
||||
_ = json.Unmarshal(data, &ctrl)
|
||||
}
|
||||
|
||||
model := strings.TrimSpace(ctrl.ModelNumber)
|
||||
serial := strings.TrimSpace(ctrl.SerialNumber)
|
||||
firmware := strings.TrimSpace(ctrl.FirmwareRev)
|
||||
|
||||
capacityGB := ""
|
||||
if ctrl.TotalCapacity > 0 {
|
||||
capacityGB = formatCapacityGB(uint64(ctrl.TotalCapacity))
|
||||
} else if ctrl.NVMCapacity > 0 {
|
||||
capacityGB = formatCapacityGB(uint64(ctrl.NVMCapacity))
|
||||
}
|
||||
|
||||
writeField(b, "Model", model)
|
||||
writeField(b, "Serial", serial)
|
||||
writeField(b, "Firmware", firmware)
|
||||
if capacityGB != "" {
|
||||
writeField(b, "Capacity", capacityGB)
|
||||
}
|
||||
|
||||
// smart-log
|
||||
data := outputs["nvme-smart-log"]
|
||||
if len(data) == 0 {
|
||||
b.WriteString("\n(no SMART data)\n")
|
||||
return
|
||||
}
|
||||
var sl collector.NVMeSmartLog
|
||||
if err := json.Unmarshal(data, &sl); err != nil {
|
||||
fmt.Fprintf(b, "\n(SMART parse error: %v)\n", err)
|
||||
return
|
||||
}
|
||||
|
||||
tempC := int(sl.Temperature) - 273
|
||||
if tempC < 0 {
|
||||
tempC = 0
|
||||
}
|
||||
|
||||
critWarnStr := "OK"
|
||||
if sl.CriticalWarning != 0 {
|
||||
critWarnStr = fmt.Sprintf("0x%02X", sl.CriticalWarning)
|
||||
}
|
||||
|
||||
poh := uint64(sl.PowerOnHours)
|
||||
pc := uint64(sl.PowerCycles)
|
||||
us := uint64(sl.UnsafeShutdowns)
|
||||
me := uint64(sl.MediaErrors)
|
||||
nel := uint64(sl.NumErrLogEntries)
|
||||
|
||||
// data_units are in 1000 × 512-byte sectors = 512,000 bytes each
|
||||
readBytes := uint64(sl.DataUnitsRead) * 512000
|
||||
writtenBytes := uint64(sl.DataUnitsWritten) * 512000
|
||||
|
||||
writeSectionHeader(b, "Health")
|
||||
writeField(b, "Temperature", fmt.Sprintf("%d °C", tempC))
|
||||
writeField(b, "Critical Warning", critWarnStr)
|
||||
writeField(b, "Percentage Used", fmt.Sprintf("%d %%", sl.PercentageUsed))
|
||||
writeField(b, "Available Spare", fmt.Sprintf("%d %% (threshold: %d %%)", sl.AvailableSpare, sl.SpareThreshold))
|
||||
|
||||
writeSectionHeader(b, "Usage")
|
||||
writeField(b, "Power On Hours", fmt.Sprintf("%s h", formatUint(poh)))
|
||||
writeField(b, "Power Cycles", formatUint(pc))
|
||||
writeField(b, "Unsafe Shutdowns", formatUint(us))
|
||||
writeField(b, "Data Written", formatBytesHuman(float64(writtenBytes)))
|
||||
writeField(b, "Data Read", formatBytesHuman(float64(readBytes)))
|
||||
|
||||
writeSectionHeader(b, "Errors")
|
||||
writeField(b, "Media Errors", formatUint(me))
|
||||
writeField(b, "Error Log Entries", formatUint(nel))
|
||||
|
||||
capacityBytes := uint64(ctrl.TotalCapacity)
|
||||
if capacityBytes == 0 {
|
||||
capacityBytes = uint64(ctrl.NVMCapacity)
|
||||
}
|
||||
ri := resourceInfo{
|
||||
powerOnHours: poh,
|
||||
powerCycles: pc,
|
||||
writtenBytes: writtenBytes,
|
||||
readBytes: readBytes,
|
||||
capacityBytes: capacityBytes,
|
||||
}
|
||||
writeResourceSection(b, ri)
|
||||
|
||||
if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 {
|
||||
writeSectionHeader(b, "Self-Test")
|
||||
result := parseSelfTestResult(string(selfTest))
|
||||
writeField(b, "Result", result)
|
||||
}
|
||||
|
||||
writeConclusionSection(b, ri)
|
||||
}
|
||||
|
||||
// ── SATA / SAS (smartctl) ────────────────────────────────────────────────────
|
||||
|
||||
// Patterns for picking fields out of `smartctl` text output.

// smartHealthRE captures the verdict word (e.g. PASSED) of the overall-health
// self-assessment line.
var smartHealthRE = regexp.MustCompile(`(?i)SMART overall-health self-assessment test result:\s*(\S+)`)

// smartAttrLineRE matches one row of the vendor attribute table. Capture
// groups: 1 = ID, 2 = attribute name, 3 = value, 4 = worst, 5 = threshold,
// 6 = raw value (rest of the line). The flag column and the three columns
// between threshold and raw value are matched but not captured.
var smartAttrLineRE = regexp.MustCompile(
	`^\s*(\d{1,3})\s+(\S+)\s+0x[0-9a-fA-F]+\s+(\d{1,3})\s+(\d{1,3})\s+(\d{1,3})\s+\S+\s+\S+\s+\S+\s+(.+?)\s*$`,
)

// Identity lines; each pattern is case-insensitive, anchored to a line start
// and captures the remainder of that line.
var smartModelRE = regexp.MustCompile(`(?im)^Device Model:\s*(.+)$`)
var smartSerialRE = regexp.MustCompile(`(?im)^Serial Number:\s*(.+)$`)
var smartFirmwareRE = regexp.MustCompile(`(?im)^Firmware Version:\s*(.+)$`)
var smartCapacityRE = regexp.MustCompile(`(?im)^User Capacity:\s*(.+)$`)
|
||||
|
||||
// smartAttr holds one parsed row of the smartctl attribute table.
type smartAttr struct {
	ID   int    // numeric attribute ID
	Name string // attribute name as printed by smartctl

	// Normalized current value, worst recorded value and failure threshold.
	Value, Worst, Threshold int

	Raw string // raw value column, kept as text
}
|
||||
|
||||
func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
|
||||
data := outputs["smartctl-health"]
|
||||
if len(data) == 0 {
|
||||
b.WriteString("\n(no SMART data)\n")
|
||||
return
|
||||
}
|
||||
text := string(data)
|
||||
|
||||
// Identity
|
||||
if m := smartModelRE.FindStringSubmatch(text); m != nil {
|
||||
writeField(b, "Model", strings.TrimSpace(m[1]))
|
||||
}
|
||||
if m := smartSerialRE.FindStringSubmatch(text); m != nil {
|
||||
writeField(b, "Serial", strings.TrimSpace(m[1]))
|
||||
}
|
||||
if m := smartFirmwareRE.FindStringSubmatch(text); m != nil {
|
||||
writeField(b, "Firmware", strings.TrimSpace(m[1]))
|
||||
}
|
||||
var capacityBytes uint64
|
||||
if m := smartCapacityRE.FindStringSubmatch(text); m != nil {
|
||||
cap := strings.TrimSpace(m[1])
|
||||
capacityBytes = parseLeadingUint(cap)
|
||||
// trim everything after "[" if present (e.g. "500,107,862,016 bytes [500 GB]")
|
||||
if idx := strings.Index(cap, "["); idx > 0 {
|
||||
cap = strings.TrimSpace(cap[idx+1:])
|
||||
cap = strings.TrimSuffix(cap, "]")
|
||||
}
|
||||
writeField(b, "Capacity", cap)
|
||||
}
|
||||
|
||||
writeSectionHeader(b, "Health")
|
||||
health := "unknown"
|
||||
if m := smartHealthRE.FindStringSubmatch(text); m != nil {
|
||||
health = strings.TrimSpace(m[1])
|
||||
}
|
||||
writeField(b, "SMART Overall Health", health)
|
||||
|
||||
attrs := parseSMARTAttrs(text)
|
||||
if len(attrs) > 0 {
|
||||
writeSectionHeader(b, "SMART Attributes")
|
||||
fmt.Fprintf(b, " %-4s %-32s %5s %5s %5s %s\n", "ID", "Attribute", "Value", "Worst", "Thresh", "Raw")
|
||||
b.WriteString(" " + strings.Repeat("-", 72) + "\n")
|
||||
for _, a := range attrs {
|
||||
fmt.Fprintf(b, " %-4d %-32s %5d %5d %5d %s\n",
|
||||
a.ID, a.Name, a.Value, a.Worst, a.Threshold, a.Raw)
|
||||
}
|
||||
}
|
||||
|
||||
var poh, pc, writtenLBAs, readLBAs uint64
|
||||
var readValue int
|
||||
hasReadValue := false
|
||||
for _, a := range attrs {
|
||||
switch a.ID {
|
||||
case 9: // Power_On_Hours
|
||||
poh = parseLeadingUint(a.Raw)
|
||||
case 12: // Power_Cycle_Count
|
||||
pc = parseLeadingUint(a.Raw)
|
||||
case 241: // Total_LBAs_Written
|
||||
writtenLBAs = parseLeadingUint(a.Raw)
|
||||
case 242: // Total_LBAs_Read
|
||||
readLBAs = parseLeadingUint(a.Raw)
|
||||
readValue = a.Value
|
||||
hasReadValue = true
|
||||
}
|
||||
}
|
||||
const sataSectorBytes = 512
|
||||
ri := resourceInfo{
|
||||
powerOnHours: poh,
|
||||
powerCycles: pc,
|
||||
writtenBytes: writtenLBAs * sataSectorBytes,
|
||||
readBytes: readLBAs * sataSectorBytes,
|
||||
capacityBytes: capacityBytes,
|
||||
readPercent: 100 - readValue,
|
||||
hasReadPercent: hasReadValue,
|
||||
}
|
||||
writeResourceSection(b, ri)
|
||||
|
||||
selfTest := outputs["smartctl-self-test-status"]
|
||||
if len(selfTest) == 0 {
|
||||
selfTest = outputs["smartctl-self-test-short"]
|
||||
}
|
||||
if len(selfTest) > 0 {
|
||||
writeSectionHeader(b, "Self-Test")
|
||||
result := parseSelfTestResult(string(selfTest))
|
||||
writeField(b, "Result", result)
|
||||
}
|
||||
|
||||
writeConclusionSection(b, ri)
|
||||
}
|
||||
|
||||
func parseSMARTAttrs(text string) []smartAttr {
|
||||
var attrs []smartAttr
|
||||
inTable := false
|
||||
for _, line := range strings.Split(text, "\n") {
|
||||
if strings.Contains(line, "ATTRIBUTE_NAME") {
|
||||
inTable = true
|
||||
continue
|
||||
}
|
||||
if !inTable {
|
||||
continue
|
||||
}
|
||||
m := smartAttrLineRE.FindStringSubmatch(line)
|
||||
if m == nil {
|
||||
if strings.TrimSpace(line) == "" {
|
||||
inTable = false
|
||||
}
|
||||
continue
|
||||
}
|
||||
id, _ := strconv.Atoi(m[1])
|
||||
val, _ := strconv.Atoi(m[3])
|
||||
worst, _ := strconv.Atoi(m[4])
|
||||
thresh, _ := strconv.Atoi(m[5])
|
||||
attrs = append(attrs, smartAttr{
|
||||
ID: id,
|
||||
Name: m[2],
|
||||
Value: val,
|
||||
Worst: worst,
|
||||
Threshold: thresh,
|
||||
Raw: strings.TrimSpace(m[6]),
|
||||
})
|
||||
}
|
||||
return attrs
|
||||
}
|
||||
|
||||
// parseSelfTestResult reduces self-test command output to a one-line summary.
// It understands, in priority order:
//
//  1. `smartctl -a`: the "Self-test execution status:" line joined with its
//     wrapped description. Continuation lines must be indented (start with a
//     space or tab), be non-empty and contain no colon; at most three are
//     taken. Requiring indentation stops the unindented line that follows the
//     description in real smartctl output ("Total time to complete Offline")
//     from being appended to the summary.
//  2. `nvme device-self-test`: the first line mentioning "self-test status"
//     or "self test status".
//  3. `smartctl -t short` launch output: the first line containing
//     "testing has begun", "started" or "complete".
//  4. Otherwise the last non-empty line.
//
// Blank input yields "no output". All matching is case-insensitive and the
// returned line is whitespace-trimmed at both ends.
func parseSelfTestResult(text string) string {
	text = strings.TrimSpace(text)
	if text == "" {
		return "no output"
	}
	lines := strings.Split(text, "\n")

	// smartctl -a: "Self-test execution status: ( 0)\tThe previous
	// self-test routine completed\n\t\t\t\t\twithout error ..."
	for i, line := range lines {
		if !strings.Contains(strings.ToLower(line), "self-test execution status") {
			continue
		}
		parts := []string{strings.TrimSpace(line)}
		for j := i + 1; j < len(lines) && j < i+4; j++ {
			raw := lines[j]
			cont := strings.TrimSpace(raw)
			indented := strings.HasPrefix(raw, " ") || strings.HasPrefix(raw, "\t")
			if cont == "" || !indented || strings.Contains(cont, ":") {
				break
			}
			parts = append(parts, cont)
		}
		return strings.Join(parts, " ")
	}

	// nvme device-self-test: "Short Device Self-Test Status : 0x0" or similar.
	for _, line := range lines {
		l := strings.ToLower(line)
		if strings.Contains(l, "self-test status") || strings.Contains(l, "self test status") {
			return strings.TrimSpace(line)
		}
	}

	// smartctl -t short: "Testing has begun" or "Short BGST started".
	for _, line := range lines {
		l := strings.ToLower(line)
		if strings.Contains(l, "testing has begun") || strings.Contains(l, "started") || strings.Contains(l, "complete") {
			return strings.TrimSpace(line)
		}
	}

	// Fallback: last non-empty line.
	for i := len(lines) - 1; i >= 0; i-- {
		if s := strings.TrimSpace(lines[i]); s != "" {
			return s
		}
	}
	return "done"
}
|
||||
|
||||
// ── Resource (pseudographic usage bars) ────────────────────────────────────────
|
||||
|
||||
// Rated-endurance model used when the vendor's own TBW/DWPD rating isn't
// available from SMART/NVMe data: one drive-write-per-day sustained for five
// years, the baseline enterprise endurance spec.

// designLifeYears is the assumed service life, in years.
const designLifeYears = 5

// dwpd is the assumed number of full drive writes per day.
const dwpd = 1.0
|
||||
|
||||
// resourceInfo carries the usage counters the Resource and Conclusion
// sections are computed from.
type resourceInfo struct {
	// Lifetime counters; a zero capacityBytes means the capacity is unknown.
	powerOnHours, powerCycles               uint64
	writtenBytes, readBytes, capacityBytes uint64

	// readPercent is only meaningful when hasReadPercent is true, i.e. when
	// the source SMART attribute exposes a normalized read-wear value.
	readPercent    int
	hasReadPercent bool
}
|
||||
|
||||
func writeResourceSection(b *strings.Builder, r resourceInfo) {
|
||||
writeSectionHeader(b, "Resource")
|
||||
|
||||
const maxLifeHours = designLifeYears * 365 * 24
|
||||
upFrac := float64(r.powerOnHours) / float64(maxLifeHours)
|
||||
fmt.Fprintf(b, " %-9s %s %s / %s (%s)\n",
|
||||
"Uptime", progressBar(upFrac, 24), formatHoursHuman(r.powerOnHours), formatHoursHuman(maxLifeHours), formatPercent(upFrac*100))
|
||||
|
||||
if r.capacityBytes > 0 {
|
||||
maxWritten := float64(r.capacityBytes) * dwpd * designLifeYears * 365
|
||||
wFrac := float64(r.writtenBytes) / maxWritten
|
||||
fmt.Fprintf(b, " %-9s %s %s / %s (%s, %g DWPD×%dy)\n",
|
||||
"Written", progressBar(wFrac, 24), formatBytesHuman(float64(r.writtenBytes)), formatBytesHuman(maxWritten), formatPercent(wFrac*100), dwpd, designLifeYears)
|
||||
} else {
|
||||
fmt.Fprintf(b, " %-9s %s\n", "Written", formatBytesHuman(float64(r.writtenBytes)))
|
||||
}
|
||||
|
||||
if r.hasReadPercent {
|
||||
fmt.Fprintf(b, " %-9s %s %s (%d%%)\n",
|
||||
"Read", progressBar(float64(r.readPercent)/100, 24), formatBytesHuman(float64(r.readBytes)), r.readPercent)
|
||||
} else {
|
||||
fmt.Fprintf(b, " %-9s %s\n", "Read", formatBytesHuman(float64(r.readBytes)))
|
||||
}
|
||||
}
|
||||
|
||||
// ── Conclusion (new-vs-used verdict) ────────────────────────────────────────
|
||||
|
||||
// Upper bounds for still calling a drive "new". Reaching any single bound is
// enough to call the drive used. These are deliberately loose limits, not a
// wear/endurance judgment (the Resource section covers that).

// newDiskMaxWrittenFrac: data written, as a fraction of capacity. One full
// drive-write plus 10% headroom for provisioning/overprovisioning rounding.
const newDiskMaxWrittenFrac = 1.10

// newDiskMaxReadFrac: data read, as a fraction of capacity. A bit over two
// full drive-reads.
const newDiskMaxReadFrac = 2.10

// newDiskMaxUptimeHours: power-on time, one week expressed in hours.
const newDiskMaxUptimeHours = 7 * 24

// newDiskMaxPowerCycles: number of power cycles.
const newDiskMaxPowerCycles = 30
|
||||
|
||||
func writeConclusionSection(b *strings.Builder, r resourceInfo) {
|
||||
writeSectionHeader(b, "Conclusion")
|
||||
|
||||
var reasons, notes []string
|
||||
isNew := true
|
||||
|
||||
if r.capacityBytes > 0 {
|
||||
writtenFrac := float64(r.writtenBytes) / float64(r.capacityBytes)
|
||||
readFrac := float64(r.readBytes) / float64(r.capacityBytes)
|
||||
if writtenFrac >= newDiskMaxWrittenFrac {
|
||||
isNew = false
|
||||
reasons = append(reasons, fmt.Sprintf(
|
||||
"data written %s (%s of capacity)",
|
||||
formatBytesHuman(float64(r.writtenBytes)), formatPercent(writtenFrac*100)))
|
||||
}
|
||||
if readFrac >= newDiskMaxReadFrac {
|
||||
isNew = false
|
||||
reasons = append(reasons, fmt.Sprintf(
|
||||
"data read %s (%s of capacity)",
|
||||
formatBytesHuman(float64(r.readBytes)), formatPercent(readFrac*100)))
|
||||
}
|
||||
} else {
|
||||
notes = append(notes, "capacity unknown — write/read criteria not evaluated")
|
||||
}
|
||||
|
||||
if r.powerOnHours >= newDiskMaxUptimeHours {
|
||||
isNew = false
|
||||
reasons = append(reasons, fmt.Sprintf("uptime %s", formatHoursHuman(r.powerOnHours)))
|
||||
}
|
||||
|
||||
if r.powerCycles >= newDiskMaxPowerCycles {
|
||||
isNew = false
|
||||
reasons = append(reasons, fmt.Sprintf("power cycles %s", formatUint(r.powerCycles)))
|
||||
}
|
||||
|
||||
if isNew {
|
||||
writeField(b, "Disk Condition", "NEW")
|
||||
} else {
|
||||
writeField(b, "Disk Condition", "USED")
|
||||
b.WriteString(" Reason:\n")
|
||||
for _, reason := range reasons {
|
||||
fmt.Fprintf(b, " - %s\n", reason)
|
||||
}
|
||||
}
|
||||
for _, note := range notes {
|
||||
fmt.Fprintf(b, " Note: %s\n", note)
|
||||
}
|
||||
}
|
||||
|
||||
// progressBar draws a text bar of width cells between square brackets, with
// '#' for the filled share and '-' for the rest, e.g. "[######------]".
// frac is clamped to [0, 1]; NaN is treated as 0.
func progressBar(frac float64, width int) string {
	switch {
	case math.IsNaN(frac), frac < 0:
		frac = 0
	case frac > 1:
		frac = 1
	}
	done := int(math.Round(frac * float64(width)))
	fill := strings.Repeat("#", done)
	rest := strings.Repeat("-", width-done)
	return "[" + fill + rest + "]"
}
|
||||
|
||||
// formatBytesHuman formats a byte count using decimal (SI, powers of 1000)
// units from B up to PB, e.g. "1.23 TB". Plain bytes are printed without
// decimals; every larger unit gets two. Values beyond the PB range stay in PB.
func formatBytesHuman(n float64) string {
	units := [...]string{"B", "KB", "MB", "GB", "TB", "PB"}
	idx := 0
	for ; idx < len(units)-1 && n >= 1000; idx++ {
		n /= 1000
	}
	if idx > 0 {
		return fmt.Sprintf("%.2f %s", n, units[idx])
	}
	return fmt.Sprintf("%.0f %s", n, units[idx])
}
|
||||
|
||||
// formatHoursHuman turns an hour count into a human-scaled duration so that
// uptimes don't show as raw four/five-digit hour counts: hours below two
// days, whole days below a 365-day year, and years from there on (one
// decimal, dropped when the value is a whole number of years).
func formatHoursHuman(hours uint64) string {
	const (
		hoursPerDay = 24
		daysPerYear = 365
	)
	days := float64(hours) / hoursPerDay
	years := days / daysPerYear
	switch {
	case hours < 48:
		return fmt.Sprintf("%d h", hours)
	case days < daysPerYear:
		return fmt.Sprintf("%.0f d", days)
	case years == math.Trunc(years):
		return fmt.Sprintf("%.0f y", years)
	default:
		return fmt.Sprintf("%.1f y", years)
	}
}
|
||||
|
||||
// formatPercent formats pct (already expressed in percent) as a whole
// number, except for values strictly between 0 and 1, which get two decimals
// (e.g. "0.03%") because a rounded "0%" would hide any usage at all.
func formatPercent(pct float64) string {
	belowOne := pct > 0 && pct < 1
	if !belowOne {
		return fmt.Sprintf("%.0f%%", pct)
	}
	return fmt.Sprintf("%.2f%%", pct)
}
|
||||
|
||||
// parseLeadingUint reads the run of digits and commas at the start of s
// (after trimming surrounding whitespace), e.g. from a SMART raw value or
// "500,107,862,016 bytes", and returns it as a uint64 with the commas
// removed. Everything after that run is ignored; the ParseUint error is
// discarded, so input without leading digits yields 0.
func parseLeadingUint(s string) uint64 {
	trimmed := strings.TrimSpace(s)
	stop := strings.IndexFunc(trimmed, func(r rune) bool {
		return (r < '0' || r > '9') && r != ','
	})
	if stop >= 0 {
		trimmed = trimmed[:stop]
	}
	value, _ := strconv.ParseUint(strings.ReplaceAll(trimmed, ",", ""), 10, 64)
	return value
}
|
||||
|
||||
// ── Formatting helpers ────────────────────────────────────────────────────────
|
||||
|
||||
// writeSectionHeader appends a blank line followed by a section rule of the
// form "-- <title> ----…" to b. The rule is padded with dashes up to 76
// bytes; a longer title is written without padding.
func writeSectionHeader(b *strings.Builder, title string) {
	const ruleWidth = 76
	lead := "-- " + title + " "
	b.WriteString("\n")
	b.WriteString(lead + strings.Repeat("-", max(0, ruleWidth-len(lead))) + "\n")
}
|
||||
|
||||
// writeField appends one "label : value" line to b, with the label
// left-aligned and padded to 20 characters.
func writeField(b *strings.Builder, label, value string) {
	b.WriteString(fmt.Sprintf(" %-20s : %s\n", label, value))
}
|
||||
|
||||
// formatCapacityGB renders a byte count as a decimal (SI) capacity: whole
// gigabytes below 1000 GB, terabytes from there on.
//
// Terabyte values keep the two-significant-digit rendering ("1.9 TB",
// "15 TB"). From 99.5 TB upward, %.2g would switch to exponent notation
// ("1.2e+02 TB"), so those values are printed as whole terabytes instead.
func formatCapacityGB(bytes uint64) string {
	gb := float64(bytes) / 1e9
	if gb < 1000 {
		return fmt.Sprintf("%.0f GB", math.Round(gb))
	}
	tb := gb / 1000
	if tb >= 99.5 {
		return fmt.Sprintf("%.0f TB", tb)
	}
	return fmt.Sprintf("%.2g TB", tb)
}
|
||||
|
||||
// formatUint renders n in decimal with a comma between each group of three
// digits, e.g. 1234567 -> "1,234,567".
func formatUint(n uint64) string {
	digits := strconv.FormatUint(n, 10)

	// The leading group holds whatever is left after cutting full groups of
	// three from the right (a full three when the length divides evenly).
	head := len(digits) % 3
	if head == 0 {
		head = 3
	}

	out := make([]byte, 0, len(digits)+len(digits)/3)
	out = append(out, digits[:head]...)
	for pos := head; pos < len(digits); pos += 3 {
		out = append(out, ',')
		out = append(out, digits[pos:pos+3]...)
	}
	return string(out)
}
|
||||
|
||||
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|
||||
@@ -0,0 +1,151 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
var testNVMeIdCtrl = []byte(`{
|
||||
"mn": "SAMSUNG MZ1L2960HCJR-00A07 ",
|
||||
"sn": "S665NN0X415495",
|
||||
"fr": "GDC7602Q",
|
||||
"tnvmcap": 960197124096
|
||||
}`)
|
||||
|
||||
var testNVMeSmartLog = []byte(`{
|
||||
"critical_warning": 0,
|
||||
"temperature": 311,
|
||||
"avail_spare": 100,
|
||||
"spare_thresh": 10,
|
||||
"percent_used": 0,
|
||||
"data_units_read": 1023456,
|
||||
"data_units_written": 738281,
|
||||
"power_cycles": 32,
|
||||
"power_on_hours": 1234,
|
||||
"unsafe_shutdowns": 3,
|
||||
"media_errors": 0,
|
||||
"num_err_log_entries": 0
|
||||
}`)
|
||||
|
||||
// lo/hi variant emitted by some nvme-cli versions
|
||||
var testNVMeSmartLogLoHi = []byte(`{
|
||||
"critical_warning": 0,
|
||||
"temperature": {"lo": 311, "hi": 0},
|
||||
"avail_spare": 100,
|
||||
"spare_thresh": 10,
|
||||
"percent_used": 0,
|
||||
"data_units_read": {"lo": 1023456, "hi": 0},
|
||||
"data_units_written": {"lo": 738281, "hi": 0},
|
||||
"power_cycles": {"lo": 32, "hi": 0},
|
||||
"power_on_hours": {"lo": 1234, "hi": 0},
|
||||
"unsafe_shutdowns": {"lo": 3, "hi": 0},
|
||||
"media_errors": {"lo": 0, "hi": 0},
|
||||
"num_err_log_entries": {"lo": 0, "hi": 0}
|
||||
}`)
|
||||
|
||||
var testSmartCtlHealth = []byte(`
|
||||
smartctl 7.3 2022-02-28 r5338 [x86_64-linux-5.15.0] (local build)
|
||||
Copyright (C) 2002-22, Bruce Allen, Christian Franke, www.smartmontools.org
|
||||
|
||||
=== START OF INFORMATION SECTION ===
|
||||
Device Model: SAMSUNG MZ1L2960HCJR-00A07
|
||||
Serial Number: S665NN0X415495
|
||||
Firmware Version: GDC7602Q
|
||||
User Capacity: 960,197,124,096 bytes [960 GB]
|
||||
|
||||
=== START OF READ SMART DATA SECTION ===
|
||||
SMART overall-health self-assessment test result: PASSED
|
||||
|
||||
SMART Attributes Data Structure revision number: 1
|
||||
Vendor Specific SMART Attributes with Thresholds:
|
||||
ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
|
||||
5 Reallocated_Sector_Ct 0x0032 100 100 000 Old_age Always - 0
|
||||
9 Power_On_Hours 0x0032 100 100 000 Old_age Always - 1234
|
||||
12 Power_Cycle_Count 0x0032 100 100 000 Old_age Always - 45
|
||||
177 Wear_Leveling_Count 0x0013 097 097 000 Pre-fail Always - 30
|
||||
190 Airflow_Temperature_Cel 0x0032 063 045 000 Old_age Always - 37
|
||||
`)
|
||||
|
||||
func TestGenerateDiskReportNVMe(t *testing.T) {
|
||||
t.Parallel()
|
||||
outputs := map[string][]byte{
|
||||
"nvme-id-ctrl": testNVMeIdCtrl,
|
||||
"nvme-smart-log": testNVMeSmartLog,
|
||||
}
|
||||
report := GenerateDiskReportText(1, "/dev/nvme0n1", outputs, time.Unix(0, 0).UTC())
|
||||
|
||||
assertContains(t, report, "Disk 1", "/dev/nvme0n1")
|
||||
assertContains(t, report, "SAMSUNG MZ1L2960HCJR-00A07")
|
||||
assertContains(t, report, "S665NN0X415495")
|
||||
assertContains(t, report, "GDC7602Q")
|
||||
assertContains(t, report, "38 °C") // 311 K - 273
|
||||
assertContains(t, report, "1,234 h") // power_on_hours with separator
|
||||
assertContains(t, report, "32") // power_cycles
|
||||
assertContains(t, report, "3") // unsafe_shutdowns
|
||||
assertContains(t, report, "378.00 GB") // data_units_written * 512000, human-scaled
|
||||
}
|
||||
|
||||
// TestGenerateDiskReportNVMeDataUnitsScaleToTB verifies that heavy write/read
|
||||
// counters render in the "-- Usage --" section as TB/PB, not raw GB, matching
|
||||
// the "-- Resource --" section which already used formatBytesHuman.
|
||||
func TestGenerateDiskReportNVMeDataUnitsScaleToTB(t *testing.T) {
|
||||
t.Parallel()
|
||||
heavy := []byte(`{
|
||||
"critical_warning": 0,
|
||||
"temperature": 307,
|
||||
"avail_spare": 100,
|
||||
"spare_thresh": 10,
|
||||
"percent_used": 0,
|
||||
"data_units_read": "252420478",
|
||||
"data_units_written": "103834055",
|
||||
"power_cycles": "45",
|
||||
"power_on_hours": "45",
|
||||
"unsafe_shutdowns": "35",
|
||||
"media_errors": "0",
|
||||
"num_err_log_entries": "0"
|
||||
}`)
|
||||
outputs := map[string][]byte{
|
||||
"nvme-id-ctrl": testNVMeIdCtrl,
|
||||
"nvme-smart-log": heavy,
|
||||
}
|
||||
report := GenerateDiskReportText(1, "/dev/nvme0n1", outputs, time.Unix(0, 0).UTC())
|
||||
|
||||
assertContains(t, report, "Data Written : 53.16 TB")
|
||||
assertContains(t, report, "Data Read : 129.24 TB")
|
||||
}
|
||||
|
||||
func TestGenerateDiskReportNVMeLoHi(t *testing.T) {
|
||||
t.Parallel()
|
||||
outputs := map[string][]byte{
|
||||
"nvme-id-ctrl": testNVMeIdCtrl,
|
||||
"nvme-smart-log": testNVMeSmartLogLoHi,
|
||||
}
|
||||
report := GenerateDiskReportText(1, "/dev/nvme0n1", outputs, time.Unix(0, 0).UTC())
|
||||
assertContains(t, report, "38 °C")
|
||||
assertContains(t, report, "1,234 h")
|
||||
}
|
||||
|
||||
func TestGenerateDiskReportSATA(t *testing.T) {
|
||||
t.Parallel()
|
||||
outputs := map[string][]byte{
|
||||
"smartctl-health": testSmartCtlHealth,
|
||||
}
|
||||
report := GenerateDiskReportText(2, "/dev/sda", outputs, time.Unix(0, 0).UTC())
|
||||
|
||||
assertContains(t, report, "Disk 2", "/dev/sda")
|
||||
assertContains(t, report, "SAMSUNG MZ1L2960HCJR-00A07")
|
||||
assertContains(t, report, "S665NN0X415495")
|
||||
assertContains(t, report, "PASSED")
|
||||
assertContains(t, report, "Reallocated_Sector_Ct")
|
||||
assertContains(t, report, "Power_On_Hours")
|
||||
}
|
||||
|
||||
// assertContains reports a test error for every needle that does not occur
// in text. It keeps going after a miss, so one call lists all missing needles.
func assertContains(t *testing.T, text string, needles ...string) {
	t.Helper()
	for i := range needles {
		if strings.Contains(text, needles[i]) {
			continue
		}
		t.Errorf("report missing %q\nreport:\n%s", needles[i], text)
	}
}
|
||||
@@ -25,6 +25,9 @@ var techDumpFixedCommands = []struct {
|
||||
{Name: "sensors", Args: []string{"-j"}, File: "sensors.json"},
|
||||
{Name: "ipmitool", Args: []string{"fru", "print"}, File: "ipmitool-fru.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sdr"}, File: "ipmitool-sdr.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sensor"}, File: "ipmitool-sensor.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sel", "list"}, File: "ipmitool-sel.txt"},
|
||||
{Name: "ipmitool", Args: []string{"sel", "time", "get"}, File: "ipmitool-sel-time.txt"},
|
||||
{Name: "nvme", Args: []string{"list", "-o", "json"}, File: "nvme-list.json"},
|
||||
}
|
||||
|
||||
@@ -35,6 +38,7 @@ var techDumpNvidiaCommands = []struct {
|
||||
}{
|
||||
{Name: "nvidia-smi", Args: []string{"-q"}, File: "nvidia-smi-q.txt"},
|
||||
{Name: "nvidia-smi", Args: []string{"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--format=csv,noheader,nounits"}, File: "nvidia-smi-query.csv"},
|
||||
{Name: "nvidia-smi", Args: []string{"conf-compute", "-q"}, File: "nvidia-smi-conf-compute-q.txt"},
|
||||
}
|
||||
|
||||
type lsblkDumpRoot struct {
|
||||
|
||||
@@ -135,7 +135,7 @@ func defaultTaskPriority(target string, params taskParams) int {
|
||||
return taskPriorityBurn
|
||||
case "nvidia", "nvidia-targeted-stress", "nvidia-targeted-power", "nvidia-pulse",
|
||||
"nvidia-interconnect", "nvidia-bandwidth", "memory", "storage", "cpu",
|
||||
"amd", "amd-mem", "amd-bandwidth":
|
||||
"amd", "amd-mem", "amd-bandwidth", "confidential-computing":
|
||||
if params.StressMode {
|
||||
return taskPriorityValidateStress
|
||||
}
|
||||
@@ -1292,6 +1292,22 @@ func (h *handler) handleAPIInstallToRAM(w http.ResponseWriter, r *http.Request)
|
||||
_ = json.NewEncoder(w).Encode(map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISystemReboot(w http.ResponseWriter, r *http.Request) {
|
||||
if err := exec.Command("systemctl", "reboot").Start(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "reboot failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "rebooting"})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPISystemShutdown(w http.ResponseWriter, r *http.Request) {
|
||||
if err := exec.Command("systemctl", "poweroff").Start(); err != nil {
|
||||
writeError(w, http.StatusInternalServerError, "shutdown failed: "+err.Error())
|
||||
return
|
||||
}
|
||||
writeJSON(w, map[string]string{"status": "shutting down"})
|
||||
}
|
||||
|
||||
// ── Tools ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
var standardTools = []string{
|
||||
|
||||
@@ -0,0 +1,280 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// huaweiField is one elabel field as returned by the read endpoint.
type huaweiField struct {
	// Name is the human-readable label.
	Name string `json:"name"`
	// Key identifies the field in write requests.
	Key string `json:"key"`
	// Value is the current value; empty when it could not be read.
	Value string `json:"value"`
	// ReadOnly marks fields the write endpoint refuses to change.
	ReadOnly bool `json:"read_only,omitempty"`
}
|
||||
|
||||
// huaweiChange is one requested elabel modification: the field key and the
// value to store in it.
type huaweiChange struct {
	Key   string `json:"key"`   // key of the field to change
	Value string `json:"value"` // new value for that field
}
|
||||
|
||||
// huaweiFieldDef describes one elabel field: its display name, its API key,
// and the three address bytes sent in the OEM raw command.
type huaweiFieldDef struct {
	Name, Key               string // display label and API key
	FruID, TypeID, FieldID  byte   // address bytes of the field in the OEM raw command
	Special                 string // "chassis-type" | "guid"
}
|
||||
|
||||
var huaweiElabelDefs = []huaweiFieldDef{
|
||||
{"Device Name", "DeviceName", 0x00, 0x06, 0x01, ""},
|
||||
{"Device Serial Number", "DeviceSerialNumber", 0x00, 0x06, 0x03, ""},
|
||||
{"Product Name", "ProductName", 0x00, 0x03, 0x01, ""},
|
||||
{"Product Serial Number", "ProductSerialNumber", 0x00, 0x03, 0x04, ""},
|
||||
{"Product Asset Tag", "ProductAssetTag", 0x00, 0x03, 0x05, ""},
|
||||
{"Product Manufacturer", "ProductManufacturer", 0x00, 0x03, 0x00, ""},
|
||||
{"Mainboard Manufacturer", "MainboardManufacturer", 0x00, 0x02, 0x01, ""},
|
||||
{"Board Product Name", "BoardProductName", 0x00, 0x02, 0x02, ""},
|
||||
{"Chassis Part Number", "ChassisPartnumber", 0x00, 0x01, 0x01, ""},
|
||||
{"Chassis Type", "ChassisType", 0x00, 0x01, 0x00, "chassis-type"},
|
||||
{"IO Chassis Serial", "IOChassisSerialNumber", 0x01, 0x03, 0x04, ""},
|
||||
{"IO Chassis Asset Tag", "IOChassisAssetTag", 0x01, 0x03, 0x05, ""},
|
||||
{"GUID", "GUID", 0x00, 0x00, 0x00, "guid"},
|
||||
}
|
||||
|
||||
// huaweiGetRaw reads a string elabel field via OEM IPMI raw command.
|
||||
// Protocol: ipmitool raw 0x30 0x90 0x05 <fru_id> <type_id> <field_id> 0x00 0x30
|
||||
// Response: <length_byte> <ascii_byte1> ... (null-terminated)
|
||||
func huaweiGetRaw(ctx context.Context, def huaweiFieldDef) (string, error) {
|
||||
if def.Special == "guid" {
|
||||
return huaweiGetGUID(ctx)
|
||||
}
|
||||
args := []string{
|
||||
"0x30", "0x90", "0x05",
|
||||
fmt.Sprintf("0x%02x", def.FruID),
|
||||
fmt.Sprintf("0x%02x", def.TypeID),
|
||||
fmt.Sprintf("0x%02x", def.FieldID),
|
||||
"0x00", "0x30",
|
||||
}
|
||||
out, err := exec.CommandContext(ctx, "ipmitool", append([]string{"raw"}, args...)...).CombinedOutput()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return huaweiParseStringResponse(strings.TrimSpace(string(out)), def.Special), nil
|
||||
}
|
||||
|
||||
// huaweiParseStringResponse decodes the whitespace-separated hex bytes of an
// OEM IPMI response.
// Format: <length_byte> <byte1> <byte2> ...
//
// The first token is skipped. For special == "chassis-type" the second token
// is returned re-formatted as "0xNN". Otherwise the remaining tokens are
// decoded as bytes up to the first zero byte or unparsable token. Responses
// with fewer than two tokens yield "".
func huaweiParseStringResponse(hexOut, special string) string {
	tokens := strings.Fields(hexOut)
	if len(tokens) < 2 {
		return ""
	}
	payload := tokens[1:]

	if special == "chassis-type" {
		// Response: <length=1> <type_byte>
		kind, err := strconv.ParseUint(payload[0], 16, 8)
		if err != nil {
			return ""
		}
		return fmt.Sprintf("0x%02x", kind)
	}

	decoded := make([]byte, 0, len(payload))
	for _, tok := range payload {
		v, err := strconv.ParseUint(tok, 16, 8)
		if err != nil || v == 0 {
			break
		}
		decoded = append(decoded, byte(v))
	}
	return string(decoded)
}
|
||||
|
||||
// huaweiGetGUID reads the system GUID via standard IPMI Get System GUID
// (0x06 0x08) and formats it as a UUID string. A response that does not
// consist of exactly 16 tokens yields an empty string and no error.
func huaweiGetGUID(ctx context.Context) (string, error) {
	raw, err := exec.CommandContext(ctx, "ipmitool", "raw", "0x06", "0x08").CombinedOutput()
	if err != nil {
		return "", err
	}
	octets := strings.Fields(strings.TrimSpace(string(raw)))
	if len(octets) != 16 {
		return "", nil
	}

	// iBMC returns bytes in reversed order; re-reverse to get canonical UUID.
	for lo, hi := 0, len(octets)-1; lo < hi; lo, hi = lo+1, hi-1 {
		octets[lo], octets[hi] = octets[hi], octets[lo]
	}

	// Format as UUID: 4-2-2-2-6 byte groups
	return strings.Join(octets[0:4], "") + "-" +
		strings.Join(octets[4:6], "") + "-" +
		strings.Join(octets[6:8], "") + "-" +
		strings.Join(octets[8:10], "") + "-" +
		strings.Join(octets[10:16], ""), nil
}
|
||||
|
||||
// huaweiChunks splits a value into chunks of at most 19 bytes for the OEM
// IPMI SET protocol and returns, per chunk, the hex-formatted arguments:
// key byte, chunk length, then the data bytes.
// Key byte: bit7=1 means more chunks follow; bits 0-6 = offset into string.
// Values longer than 63 bytes are truncated to 63; an empty value produces a
// single fixed chunk.
func huaweiChunks(value string) [][]string {
	const (
		maxLen    = 63
		chunkSize = 19
	)
	if value == "" {
		return [][]string{{"0x00", "0x01", "0x00"}}
	}
	if len(value) > maxLen {
		value = value[:maxLen]
	}

	hexByte := func(b byte) string { return fmt.Sprintf("0x%02x", b) }

	var chunks [][]string
	for start := 0; start < len(value); start += chunkSize {
		stop := start + chunkSize
		more := stop < len(value)
		if !more {
			stop = len(value)
		}

		key := byte(start)
		if more {
			key |= 0x80
		}

		piece := value[start:stop]
		row := make([]string, 0, 2+len(piece))
		row = append(row, hexByte(key), hexByte(byte(len(piece))))
		for i := 0; i < len(piece); i++ {
			row = append(row, hexByte(piece[i]))
		}
		chunks = append(chunks, row)
	}
	return chunks
}
|
||||
|
||||
func (h *handler) handleAPIHuaweiElabelRead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var fields []huaweiField
|
||||
for _, def := range huaweiElabelDefs {
|
||||
val, err := huaweiGetRaw(ctx, def)
|
||||
if err != nil {
|
||||
// First field failure likely means no Huawei BMC — abort with error.
|
||||
if len(fields) == 0 {
|
||||
msg := strings.TrimSpace(err.Error())
|
||||
writeError(w, http.StatusInternalServerError, "huawei elabel not available: "+msg)
|
||||
return
|
||||
}
|
||||
val = ""
|
||||
}
|
||||
fields = append(fields, huaweiField{
|
||||
Name: def.Name,
|
||||
Key: def.Key,
|
||||
Value: val,
|
||||
ReadOnly: def.Special == "guid" || def.Special == "chassis-type",
|
||||
})
|
||||
}
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIHuaweiElabelWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []huaweiChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
|
||||
defByKey := make(map[string]huaweiFieldDef, len(huaweiElabelDefs))
|
||||
for _, d := range huaweiElabelDefs {
|
||||
defByKey[d.Key] = d
|
||||
}
|
||||
|
||||
for _, c := range req.Changes {
|
||||
def, ok := defByKey[c.Key]
|
||||
if !ok {
|
||||
writeError(w, http.StatusUnprocessableEntity, "unknown field key: "+c.Key)
|
||||
return
|
||||
}
|
||||
if def.Special == "guid" || def.Special == "chassis-type" {
|
||||
writeError(w, http.StatusUnprocessableEntity, "field is read-only: "+c.Key)
|
||||
return
|
||||
}
|
||||
if len(c.Value) > 63 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value too long (max 63 chars): "+c.Key)
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch < 0x20 || ch > 0x7E {
|
||||
writeError(w, http.StatusUnprocessableEntity, "non-printable character in value for: "+c.Key)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("huawei-elabel-write"),
|
||||
Name: fmt.Sprintf("Huawei Elabel Write (%d field(s))", len(req.Changes)),
|
||||
Target: "huawei-elabel-write",
|
||||
Priority: defaultTaskPriority("huawei-elabel-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{HuaweiElabelChanges: req.Changes},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runHuaweiElabelWriteTask(ctx context.Context, j *jobState, p taskParams) error {
|
||||
defByKey := make(map[string]huaweiFieldDef, len(huaweiElabelDefs))
|
||||
for _, d := range huaweiElabelDefs {
|
||||
defByKey[d.Key] = d
|
||||
}
|
||||
|
||||
// Enable device name effective flag before writing.
|
||||
enableCmd := exec.CommandContext(ctx, "ipmitool", "raw", "0x30", "0x90", "0x21", "0x04", "0x01")
|
||||
if out, err := enableCmd.CombinedOutput(); err != nil {
|
||||
j.append("Warning: enable flag: " + strings.TrimSpace(string(out)))
|
||||
}
|
||||
|
||||
for _, c := range p.HuaweiElabelChanges {
|
||||
def := defByKey[c.Key]
|
||||
setPrefix := []string{
|
||||
"0x30", "0x90", "0x04",
|
||||
fmt.Sprintf("0x%02x", def.FruID),
|
||||
fmt.Sprintf("0x%02x", def.TypeID),
|
||||
fmt.Sprintf("0x%02x", def.FieldID),
|
||||
}
|
||||
|
||||
chunks := huaweiChunks(c.Value)
|
||||
j.append(fmt.Sprintf("Setting %s = %q (%d chunk(s))", c.Key, c.Value, len(chunks)))
|
||||
|
||||
for _, chunk := range chunks {
|
||||
args := append([]string{"raw"}, setPrefix...)
|
||||
args = append(args, chunk...)
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", args...)
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("set %s: %w", c.Key, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Commit after each field.
|
||||
commitCmd := exec.CommandContext(ctx, "ipmitool", "raw", "0x30", "0x90", "0x06", "0x00", "0xAA")
|
||||
if out, err := commitCmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("commit after %s: %w (output: %s)", c.Key, err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
j.append("Committed " + c.Key)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,217 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// fruField is one "name : value" line parsed from `ipmitool fru print`,
// annotated with the area/index needed to edit it.
type fruField struct {
	// Name and Value are the text before and after the colon.
	Name  string `json:"name"`
	Value string `json:"value"`
	// Editable is the flag shown to the client.
	Editable bool `json:"editable"`
	// Area and Index address the field for `ipmitool fru edit`; Area is
	// empty when the field name is not recognized.
	Area  string `json:"area,omitempty"`
	Index int    `json:"index,omitempty"`
}
|
||||
|
||||
// fruChange is one requested FRU modification. Area and Index address the
// field directly; when Area is empty the write handler resolves them from
// Name instead.
type fruChange struct {
	Area  string `json:"area"`  // FRU area letter: "c", "b" or "p"
	Index int    `json:"index"` // field index within the area
	Name  string `json:"name"`  // display name of the field
	Value string `json:"value"` // new value to store
}
|
||||
|
||||
// fruEditableFields maps a field's display name to the area letter and field
// index used with `ipmitool fru edit`. Both the vendor doc names and the
// abbreviated names printed by ipmitool are listed, so several names can
// point at the same field.
var fruEditableFields = map[string]struct {
	Area  string
	Index int
}{
	// Chassis
	"Chassis Part Number":   {Area: "c", Index: 0},
	"Chassis Serial Number": {Area: "c", Index: 1},
	"Chassis Serial":        {Area: "c", Index: 1},

	// Board
	"Board Manufacturer":  {Area: "b", Index: 0},
	"Board Mfg":           {Area: "b", Index: 0},
	"Board Product Name":  {Area: "b", Index: 1},
	"Board Product":       {Area: "b", Index: 1},
	"Board Serial Number": {Area: "b", Index: 2},
	"Board Serial":        {Area: "b", Index: 2},
	"Board Part Number":   {Area: "b", Index: 3},

	// Product
	"Product Manufacturer":  {Area: "p", Index: 0},
	"Product Name":          {Area: "p", Index: 1},
	"Product Part Number":   {Area: "p", Index: 2},
	"Product Version":       {Area: "p", Index: 3},
	"Product Serial Number": {Area: "p", Index: 4},
	"Product Serial":        {Area: "p", Index: 4},
	"Product Asset Tag":     {Area: "p", Index: 5},
}
|
||||
|
||||
// fruExtraBaseIndex gives, for each area's repeated "<Area> Extra" custom
// fields, the area letter and the ipmitool field index of the first one, per
// the vendor FRU field doc (Chassis extra fields start at 2, Board at 5,
// Product at 7). ipmitool fru print emits one identically-named line per
// custom field, so parseFRUOutput counts occurrences to recover the real
// index for each one.
var fruExtraBaseIndex = map[string]struct {
	Area string
	Base int
}{
	"Chassis Extra": {Area: "c", Base: 2},
	"Board Extra":   {Area: "b", Base: 5},
	"Product Extra": {Area: "p", Base: 7},
}
|
||||
|
||||
func parseFRUOutput(output string) []fruField {
|
||||
var fields []fruField
|
||||
extraSeen := map[string]int{}
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
// Lines look like: " Field Name : value"
|
||||
trimmed := strings.TrimLeft(line, " \t")
|
||||
if trimmed == "" {
|
||||
continue
|
||||
}
|
||||
colon := strings.Index(trimmed, " : ")
|
||||
valueOffset := 3
|
||||
if colon < 0 {
|
||||
// try ": " with no leading space before colon
|
||||
colon = strings.Index(trimmed, ": ")
|
||||
valueOffset = 2
|
||||
if colon < 0 {
|
||||
continue
|
||||
}
|
||||
}
|
||||
name := strings.TrimSpace(trimmed[:colon])
|
||||
value := strings.TrimSpace(trimmed[colon+valueOffset:])
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
editable, area, idx := fruFieldMeta(name, extraSeen)
|
||||
fields = append(fields, fruField{Name: name, Value: value, Editable: editable, Area: area, Index: idx})
|
||||
}
|
||||
return fields
|
||||
}
|
||||
|
||||
func fruFieldMeta(name string, extraSeen map[string]int) (editable bool, area string, index int) {
|
||||
if e, ok := fruExtraBaseIndex[name]; ok {
|
||||
idx := e.Base + extraSeen[name]
|
||||
extraSeen[name]++
|
||||
return true, e.Area, idx
|
||||
}
|
||||
if e, ok := fruEditableFields[name]; ok {
|
||||
return true, e.Area, e.Index
|
||||
}
|
||||
// All fields are shown as editable; server will reject unknown fields.
|
||||
return true, "", 0
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIIPMIFRURead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
out, err := exec.CommandContext(ctx, "ipmitool", "fru", "print", "0").CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
msg = err.Error()
|
||||
}
|
||||
writeError(w, http.StatusInternalServerError, "ipmitool fru print: "+msg)
|
||||
return
|
||||
}
|
||||
|
||||
fields := parseFRUOutput(string(out))
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIIPMIFRUWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []fruChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
validAreas := map[string]bool{"c": true, "b": true, "p": true}
|
||||
for i, c := range req.Changes {
|
||||
if c.Area == "" {
|
||||
e, ok := fruEditableFields[c.Name]
|
||||
if !ok {
|
||||
writeError(w, http.StatusUnprocessableEntity, "field not writable via ipmitool: "+c.Name)
|
||||
return
|
||||
}
|
||||
req.Changes[i].Area = e.Area
|
||||
req.Changes[i].Index = e.Index
|
||||
c = req.Changes[i]
|
||||
}
|
||||
if !validAreas[c.Area] {
|
||||
writeError(w, http.StatusUnprocessableEntity, "invalid area: "+c.Area)
|
||||
return
|
||||
}
|
||||
if c.Index < 0 || c.Index > 9 {
|
||||
writeError(w, http.StatusUnprocessableEntity, fmt.Sprintf("invalid index %d", c.Index))
|
||||
return
|
||||
}
|
||||
if len(c.Value) > 64 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value too long (max 64 chars)")
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch > unicode.MaxASCII || (ch < 0x20 && ch != 0) {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value contains non-printable characters")
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("ipmi-fru-write"),
|
||||
Name: fmt.Sprintf("IPMI FRU Write (%d field(s))", len(req.Changes)),
|
||||
Target: "ipmi-fru-write",
|
||||
Priority: defaultTaskPriority("ipmi-fru-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{FRUChanges: req.Changes},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runIPMIFRUWriteTask(ctx context.Context, j *jobState, exportDir string, p taskParams) error {
|
||||
// Backup current FRU state
|
||||
backupDir := filepath.Join(exportDir, "fru-backups")
|
||||
if err := os.MkdirAll(backupDir, 0755); err != nil {
|
||||
return fmt.Errorf("mkdir fru-backups: %w", err)
|
||||
}
|
||||
stamp := time.Now().Format("20060102150405")
|
||||
backupPath := filepath.Join(backupDir, "fru-"+stamp+".txt")
|
||||
|
||||
backupOut, err := exec.CommandContext(ctx, "ipmitool", "fru", "print", "0").CombinedOutput()
|
||||
if err != nil {
|
||||
return fmt.Errorf("backup fru print: %w", err)
|
||||
}
|
||||
if err := os.WriteFile(backupPath, backupOut, 0644); err != nil {
|
||||
return fmt.Errorf("write backup: %w", err)
|
||||
}
|
||||
j.append("Backup saved to " + backupPath)
|
||||
|
||||
// Apply changes
|
||||
for _, c := range p.FRUChanges {
|
||||
j.append(fmt.Sprintf("Setting %s (%s %d) = %q", c.Name, c.Area, c.Index, c.Value))
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", "fru", "edit", "0", "field", c.Area, fmt.Sprintf("%d", c.Index), c.Value)
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("fru edit %s %d: %w", c.Area, c.Index, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -0,0 +1,59 @@
|
||||
package webui
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseFRUOutputExtraFields(t *testing.T) {
|
||||
// Realistic ipmitool fru print output: repeated "<Area> Extra" lines
|
||||
// (one per custom field) must resolve to sequential indices per the
|
||||
// vendor FRU doc (Chassis Extra starts at 2, Board Extra at 5, Product
|
||||
// Extra at 7), not all collapse onto the same index.
|
||||
out := `
|
||||
Product Manufacturer : Inspur
|
||||
Product Name : NF5280M6
|
||||
Product Part Number : PN123
|
||||
Product Version : 1.0
|
||||
Product Serial : SN123
|
||||
Product Asset Tag : ASSET01
|
||||
Product Extra : custom-p1
|
||||
Board Mfg : Inspur
|
||||
Board Product : BoardX
|
||||
Board Serial : BSN1
|
||||
Board Part Number : BPN1
|
||||
Board Extra : custom-b1
|
||||
Board Extra : custom-b2
|
||||
Board Extra : custom-b3
|
||||
Chassis Part Number : CPN1
|
||||
Chassis Serial : CSN1
|
||||
Chassis Extra : front-half
|
||||
Chassis Extra : back-half
|
||||
`
|
||||
fields := parseFRUOutput(out)
|
||||
|
||||
byName := map[string][]fruField{}
|
||||
for _, f := range fields {
|
||||
byName[f.Name] = append(byName[f.Name], f)
|
||||
}
|
||||
|
||||
assertMeta := func(name string, occurrence int, wantArea string, wantIndex int) {
|
||||
t.Helper()
|
||||
list := byName[name]
|
||||
if occurrence >= len(list) {
|
||||
t.Fatalf("expected occurrence %d of %q, got %d entries", occurrence, name, len(list))
|
||||
}
|
||||
f := list[occurrence]
|
||||
if f.Area != wantArea || f.Index != wantIndex {
|
||||
t.Errorf("%s[%d] = area:%q index:%d, want area:%q index:%d", name, occurrence, f.Area, f.Index, wantArea, wantIndex)
|
||||
}
|
||||
if !f.Editable {
|
||||
t.Errorf("%s[%d] expected editable", name, occurrence)
|
||||
}
|
||||
}
|
||||
|
||||
assertMeta("Product Asset Tag", 0, "p", 5)
|
||||
assertMeta("Product Extra", 0, "p", 7)
|
||||
assertMeta("Board Extra", 0, "b", 5)
|
||||
assertMeta("Board Extra", 1, "b", 6)
|
||||
assertMeta("Board Extra", 2, "b", 7)
|
||||
assertMeta("Chassis Extra", 0, "c", 2)
|
||||
assertMeta("Chassis Extra", 1, "c", 3)
|
||||
}
|
||||
@@ -68,10 +68,9 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
.chip-warn{background:var(--warn-bg);color:var(--warn-fg);border:1px solid #c9ba9b}
|
||||
.chip-fail{background:var(--crit-bg);color:var(--crit-fg);border:1px solid var(--crit-border)}
|
||||
.chip-unknown{background:var(--surface-2);color:var(--muted);border:1px solid var(--border)}
|
||||
/* Tasks nav badge */
|
||||
.tasks-nav-btn{display:flex;justify-content:space-between;align-items:center;padding:10px 16px;color:rgba(255,255,255,.55);font-size:12px;text-decoration:none;border-top:1px solid rgba(255,255,255,.12);margin-top:auto;transition:color .15s}
|
||||
.tasks-nav-btn:hover{color:#fff}
|
||||
.tasks-nav-count{background:var(--accent);color:#fff;border-radius:10px;padding:1px 7px;font-size:11px;font-weight:700;display:none}
|
||||
/* Nav separator and tasks count badge */
|
||||
.nav-sep{height:1px;background:rgba(255,255,255,.12);margin:6px 0}
|
||||
.tasks-nav-count{background:var(--accent);color:#fff;border-radius:10px;padding:1px 7px;font-size:11px;font-weight:700;display:none;margin-left:auto}
|
||||
.tasks-nav-count.active{display:inline}
|
||||
/* Output terminal */
|
||||
.terminal{background:#1b1c1d;border:1px solid rgba(0,0,0,.2);border-radius:4px;padding:14px;font-family:monospace;font-size:12px;color:#b5cea8;max-height:400px;overflow-y:auto;white-space:pre-wrap;word-break:break-all;user-select:text;-webkit-user-select:text}
|
||||
@@ -98,15 +97,21 @@ tbody tr:hover td{background:rgba(0,0,0,.03)}
|
||||
}
|
||||
|
||||
func layoutNav(active string, buildLabel string) string {
|
||||
items := []struct{ id, label, href string }{
|
||||
{"dashboard", "Dashboard", "/"},
|
||||
{"audit", "1. Audit", "/audit"},
|
||||
{"check", "2. Check", "/check"},
|
||||
{"load", "3. Load", "/load"},
|
||||
{"speed", "4. Speed", "/speed"},
|
||||
{"endurance", "5. Endurance", "/endurance"},
|
||||
{"tools", "6. Tools", "/tools"},
|
||||
{"settings", "7. Settings", "/settings"},
|
||||
type navItem struct {
|
||||
id, label, href string
|
||||
sep bool
|
||||
}
|
||||
items := []navItem{
|
||||
{id: "dashboard", label: "Dashboard", href: "/"},
|
||||
{id: "audit", label: "1. Audit", href: "/audit"},
|
||||
{id: "check", label: "2. Check", href: "/check"},
|
||||
{id: "load", label: "3. Load", href: "/load"},
|
||||
{id: "burn", label: "4. Burn", href: "/burn"},
|
||||
{id: "benchmark", label: "5. Benchmark", href: "/benchmark"},
|
||||
{sep: true},
|
||||
{id: "tasks", label: "Tasks", href: "/tasks"},
|
||||
{id: "tools", label: "Tools", href: "/tools"},
|
||||
{id: "settings", label: "Settings", href: "/settings"},
|
||||
}
|
||||
var b strings.Builder
|
||||
b.WriteString(`<aside class="sidebar">`)
|
||||
@@ -126,19 +131,23 @@ func layoutNav(active string, buildLabel string) string {
|
||||
}
|
||||
b.WriteString(`<nav class="nav">`)
|
||||
for _, item := range items {
|
||||
if item.sep {
|
||||
b.WriteString(`<div class="nav-sep"></div>`)
|
||||
continue
|
||||
}
|
||||
cls := "nav-item"
|
||||
if item.id == active {
|
||||
cls += " active"
|
||||
}
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`, cls, item.href, item.label))
|
||||
if item.id == "tasks" {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s" id="tasks-nav-item">%s<span class="tasks-nav-count" id="tasks-nav-count"></span></a>`, cls, item.href, item.label))
|
||||
} else {
|
||||
b.WriteString(fmt.Sprintf(`<a class="%s" href="%s">%s</a>`, cls, item.href, item.label))
|
||||
}
|
||||
}
|
||||
b.WriteString(`</nav>`)
|
||||
b.WriteString(`<a href="/tasks" class="tasks-nav-btn" id="tasks-nav-btn">`)
|
||||
b.WriteString(`<span>Tasks</span>`)
|
||||
b.WriteString(`<span class="tasks-nav-count" id="tasks-nav-count"></span>`)
|
||||
b.WriteString(`</a>`)
|
||||
b.WriteString(`<script>`)
|
||||
b.WriteString(`(function(){function u(){fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(d){var n=Array.isArray(d)?d.filter(function(t){return t.status==='pending'||t.status==='running';}).length:0;var c=document.getElementById('tasks-nav-count');var b=document.getElementById('tasks-nav-btn');if(c){c.textContent=n>0?String(n):'';c.className='tasks-nav-count'+(n>0?' active':'');}if(b){b.style.color=n>0?'#f6c90e':'';}}).catch(function(){});}u();setInterval(u,5000);})();`)
|
||||
b.WriteString(`(function(){function u(){fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(d){var n=Array.isArray(d)?d.filter(function(t){return t.status==='pending'||t.status==='running';}).length:0;var c=document.getElementById('tasks-nav-count');var el=document.getElementById('tasks-nav-item');if(c){c.textContent=n>0?String(n):'';c.className='tasks-nav-count'+(n>0?' active':'');}if(el){el.style.color=n>0?'#f6c90e':'';}}).catch(function(){});}u();setInterval(u,5000);})();`)
|
||||
b.WriteString(`</script>`)
|
||||
b.WriteString(`</aside>`)
|
||||
return b.String()
|
||||
|
||||
@@ -612,19 +612,6 @@ func renderPowerBenchmarkResultsCard(exportDir string) string {
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// renderSpeed renders the Speed page (step 4): performance benchmarks.
|
||||
// Uses the same benchmark infrastructure; defaults to Standard profile (throughput/bandwidth).
|
||||
// For long-duration stability/overnight runs, see Endurance (step 5).
|
||||
func renderSpeed(opts HandlerOptions) string {
|
||||
base := renderBenchmark(opts)
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Speed:</strong> Measures GPU compute throughput and memory bandwidth. For overnight stability testing, go to <a href="/endurance">5. Endurance</a>.</div>` + base
|
||||
}
|
||||
|
||||
// renderEndurance renders the Endurance page (step 5): long-duration reliability tests.
|
||||
// Focuses on Stability and Overnight profiles for multi-hour burn validation.
|
||||
// For short load tests, see Load (step 3). For throughput measurement, see Speed (step 4).
|
||||
func renderEndurance(opts HandlerOptions) string {
|
||||
base := renderBenchmark(opts)
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>Endurance:</strong> Long-duration reliability tests — Stability (several hours) and Overnight (8+ h) profiles. These profiles run hardware at sustained load; results show whether the server holds its performance envelope over time.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px">Use the <strong>Stability</strong> or <strong>Overnight</strong> profile in the setup card below. The Standard profile is available too but is better suited for the <a href="/speed">4. Speed</a> page.</div>` + base
|
||||
}
|
||||
// renderSpeed and renderEndurance are legacy wrappers; canonical page is 5. Benchmark at /benchmark.
|
||||
func renderSpeed(opts HandlerOptions) string { return renderBenchmark(opts) }
|
||||
// renderEndurance is a legacy wrapper: it returns exactly what
// renderBenchmark returns. The canonical page is "5. Benchmark" at /benchmark.
func renderEndurance(opts HandlerOptions) string {
	page := renderBenchmark(opts)
	return page
}
|
||||
|
||||
@@ -1,13 +1,8 @@
|
||||
package webui
|
||||
|
||||
// renderLoad renders the Load page (step 3): sustained stress tests.
|
||||
// For non-destructive status checks, see Check (step 2).
|
||||
// For DCGM targeted diagnostics (targeted_stress, targeted_power, pulse), see Check → Validate mode.
|
||||
// renderLoad returns the page produced by renderBurn unchanged.
func renderLoad() string {
	page := renderBurn()
	return page
}
|
||||
|
||||
func renderBurn() string {
|
||||
return `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Warning:</strong> Stress tests on this page run hardware at high load. Repeated or prolonged use may reduce hardware lifespan. Use only when necessary.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Load runs sustained GPU compute and CPU/memory stress recipes. DCGM diagnostics (<code>targeted_stress</code>, <code>targeted_power</code>, <code>pulse_test</code>) and NCCL/NVBandwidth are on the <a href="/check">2. Check</a> page. For overnight endurance runs, see <a href="/endurance">5. Endurance</a>.</div>
|
||||
<div class="alert alert-info" style="margin-bottom:16px"><strong>Scope:</strong> Burn runs sustained GPU compute and CPU/memory stress recipes. DCGM targeted diagnostics (<code>targeted_stress</code>, <code>targeted_power</code>, <code>pulse_test</code>) and NCCL/NVBandwidth are on the <a href="/load">3. Load</a> page. For performance benchmarks, see <a href="/benchmark">5. Benchmark</a>.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
|
||||
@@ -402,95 +402,226 @@ loadNvidiaSelfHeal();
|
||||
}
|
||||
|
||||
func renderTools() string {
|
||||
return `<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">System Install</div>
|
||||
<div class="card-body">
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||
renderInstallInline() + `
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let kind = d.kind || 'unknown';
|
||||
let label = source;
|
||||
if (kind === 'ram') label = 'RAM';
|
||||
else if (kind === 'usb') label = 'USB (' + source + ')';
|
||||
else if (kind === 'cdrom') label = 'CD-ROM (' + source + ')';
|
||||
else if (kind === 'disk') label = 'disk (' + source + ')';
|
||||
else label = source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
txt.textContent = d.blocked_reason || d.message || 'Checking...';
|
||||
if (d.status === 'ok' || d.in_ram) {
|
||||
txt.style.color = 'var(--ok, green)';
|
||||
} else if (d.status === 'failed') {
|
||||
txt.style.color = 'var(--err, #b91c1c)';
|
||||
} else {
|
||||
txt.style.color = 'var(--muted)';
|
||||
}
|
||||
if (d.can_start_task) {
|
||||
btn.style.display = '';
|
||||
btn.disabled = false;
|
||||
} else {
|
||||
btn.style.display = 'none';
|
||||
}
|
||||
});
|
||||
function installToRAM() {
|
||||
document.getElementById('ram-install-btn').disabled = true;
|
||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||
window.location.href = '/tasks#' + d.task_id;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
return renderNVMeFormatCard() + `
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
` + renderFRUEditorCard() + `
|
||||
|
||||
` + renderRAIDMgmtCard()
|
||||
}
|
||||
|
||||
func renderFRUEditorCard() string {
|
||||
return `<div class="card"><div class="card-head card-head-actions">FRU / Elabel<div class="card-head-buttons"><button class="btn btn-sm btn-secondary" onclick="fruAllRead()">Read All</button></div></div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits hardware identity fields from all available sources. Each field shows its source method.</p>
|
||||
<div id="fru-all-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
|
||||
<div id="fru-src-status" style="display:none;margin-bottom:10px"></div>
|
||||
<div id="fru-all-table"></div>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
|
||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||
renderNvidiaSelfHealInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
` + renderNVMeFormatCard() + `
|
||||
|
||||
` + renderSAADMICard() + `
|
||||
|
||||
<style>
|
||||
.fru-chip{display:inline-block;font-size:10px;font-weight:600;letter-spacing:.02em;padding:1px 6px;border-radius:3px;vertical-align:middle;white-space:nowrap;margin-right:8px;flex-shrink:0}
|
||||
.fru-chip-ipmi{background:#e8e8e8;color:#555}
|
||||
.fru-chip-huawei{background:#fff0e6;color:#b83}
|
||||
.fru-chip-saa{background:#e6f0ff;color:#557}
|
||||
.fru-inp-wrap{display:flex;align-items:center;gap:0}
|
||||
</style>
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||
const rows = tools.map(t =>
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK ? 'badge-ok' : 'badge-err')+'">'+(t.OK ? '✓ '+t.Path : '✗ missing')+'</span></td></tr>'
|
||||
).join('');
|
||||
document.getElementById('tools-table').innerHTML =
|
||||
'<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||
(function(){
|
||||
var _actBtn='width:22px;height:22px;padding:0;font-size:13px;line-height:1;border:1px solid var(--line);border-radius:3px;background:var(--surface);cursor:pointer;vertical-align:middle;';
|
||||
var _inp='width:100%;padding:3px 6px;border:1.5px solid #888;border-radius:3px;font-size:13px;font-family:monospace;background:var(--surface);color:var(--ink);';
|
||||
|
||||
var SOURCES = [
|
||||
{
|
||||
id: 'ipmi-fru',
|
||||
label: 'IPMI FRU',
|
||||
chipClass: 'fru-chip-ipmi',
|
||||
url: '/api/tools/ipmi-fru',
|
||||
writeUrl: '/api/tools/ipmi-fru/write',
|
||||
rowAttrs: function(f) {
|
||||
return 'data-source="ipmi-fru" data-area="'+esc(f.area||'')+'" data-index="'+(f.index||0)+'" data-name="'+esc(f.name)+'"';
|
||||
},
|
||||
writeBody: function(inp) {
|
||||
return JSON.stringify({changes:[{area:inp.dataset.area,index:parseInt(inp.dataset.index,10),name:inp.dataset.name,value:inp.value}]});
|
||||
},
|
||||
fieldName: function(f) { return f.name; },
|
||||
fieldValue: function(f) { return f.value||''; },
|
||||
readOnly: function(f) { return false; },
|
||||
},
|
||||
{
|
||||
id: 'huawei',
|
||||
label: 'Huawei iBMC',
|
||||
chipClass: 'fru-chip-huawei',
|
||||
url: '/api/tools/huawei-elabel',
|
||||
writeUrl: '/api/tools/huawei-elabel/write',
|
||||
rowAttrs: function(f) {
|
||||
return 'data-source="huawei" data-key="'+esc(f.key)+'"';
|
||||
},
|
||||
writeBody: function(inp) {
|
||||
return JSON.stringify({changes:[{key:inp.dataset.key,value:inp.value}]});
|
||||
},
|
||||
fieldName: function(f) { return f.name; },
|
||||
fieldValue: function(f) { return f.value||''; },
|
||||
readOnly: function(f) { return !!f.read_only; },
|
||||
},
|
||||
{
|
||||
id: 'saa-dmi',
|
||||
label: 'SAA DMI',
|
||||
chipClass: 'fru-chip-saa',
|
||||
url: '/api/tools/saa-dmi',
|
||||
writeUrl: '/api/tools/saa-dmi/write',
|
||||
rowAttrs: function(f) {
|
||||
return 'data-source="saa-dmi" data-shn="'+esc(f.shn)+'"';
|
||||
},
|
||||
writeBody: function(inp) {
|
||||
return JSON.stringify({changes:[{shn:inp.dataset.shn,value:inp.value}]});
|
||||
},
|
||||
fieldName: function(f) { return f.name; },
|
||||
fieldValue: function(f) { return f.value||''; },
|
||||
readOnly: function(f) { return false; },
|
||||
},
|
||||
];
|
||||
|
||||
function esc(s){return String(s==null?'':s).replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>').replace(/"/g,'"');}
|
||||
|
||||
function renderSrcStatus(perSource) {
|
||||
var bar = document.getElementById('fru-src-status');
|
||||
if (!perSource.length) { bar.style.display = 'none'; bar.innerHTML = ''; return; }
|
||||
var html = '';
|
||||
perSource.forEach(function(p) {
|
||||
var state, color;
|
||||
if (p.ok) {
|
||||
state = p.count + ' field(s) available';
|
||||
color = 'var(--ok-fg,green)';
|
||||
} else if (/not activated|product key|SFT-DCMS|SFT-OOB/i.test(p.reason)) {
|
||||
state = 'requires Supermicro license (SFT-OOB-LIC / SFT-DCMS-SINGLE) — activate on BMC';
|
||||
color = 'var(--crit-fg,#9f3a38)';
|
||||
} else {
|
||||
state = p.reason || 'unavailable';
|
||||
color = 'var(--muted)';
|
||||
}
|
||||
html += '<div style="display:flex;align-items:center;gap:8px;font-size:12px;margin:3px 0">'
|
||||
+ '<span class="fru-chip '+p.src.chipClass+'">'+p.src.label+'</span>'
|
||||
+ '<span style="color:'+color+'">'+esc(state)+'</span>'
|
||||
+ '</div>';
|
||||
});
|
||||
bar.innerHTML = html;
|
||||
bar.style.display = '';
|
||||
}
|
||||
checkTools();
|
||||
|
||||
window.fruAllRead = function() {
|
||||
var status = document.getElementById('fru-all-status');
|
||||
var table = document.getElementById('fru-all-table');
|
||||
status.textContent = 'Reading…'; status.style.color = 'var(--muted)';
|
||||
table.innerHTML = '';
|
||||
|
||||
var fetches = SOURCES.map(function(src) {
|
||||
return fetch(src.url, {cache:'no-store'})
|
||||
.then(function(r){ return r.json().then(function(d){ if(!r.ok) throw new Error(d.error||r.statusText); return d; }); });
|
||||
});
|
||||
|
||||
Promise.allSettled(fetches).then(function(results) {
|
||||
var rows = '';
|
||||
var totalFields = 0;
|
||||
var perSource = [];
|
||||
|
||||
results.forEach(function(res, i) {
|
||||
var src = SOURCES[i];
|
||||
if (res.status === 'rejected' || !Array.isArray(res.value) || res.value.length === 0) {
|
||||
var reason = '';
|
||||
if (res.status === 'rejected' && res.reason) reason = res.reason.message;
|
||||
else reason = 'no editable fields returned';
|
||||
perSource.push({src:src, ok:false, count:0, reason:reason});
|
||||
return;
|
||||
}
|
||||
perSource.push({src:src, ok:true, count:res.value.length, reason:''});
|
||||
res.value.forEach(function(f) {
|
||||
var val = esc(src.fieldValue(f));
|
||||
var ro = src.readOnly(f);
|
||||
var attrs = ro ? '' : (' '+src.rowAttrs(f));
|
||||
rows += '<tr>'
|
||||
+ '<td style="white-space:nowrap;padding-right:4px;vertical-align:middle">'
|
||||
+ '<span class="fru-chip '+src.chipClass+'">'+src.label+'</span>'
|
||||
+ '</td>'
|
||||
+ '<td style="color:var(--muted);white-space:nowrap;padding-right:16px;vertical-align:middle;font-size:13px">'+esc(src.fieldName(f))+'</td>'
|
||||
+ '<td style="vertical-align:middle">'
|
||||
+ (ro
|
||||
? '<span style="font-family:monospace;font-size:13px;color:var(--muted)">'+val+'</span>'
|
||||
: '<input class="fru-uni-inp" style="'+_inp+'" value="'+val+'" data-original="'+val+'"'+attrs+' oninput="fruUniChanged(this)">')
|
||||
+ '</td>'
|
||||
+ '<td class="fru-uni-act" style="display:none;white-space:nowrap;padding-left:6px;vertical-align:middle">'
|
||||
+ '<button style="'+_actBtn+'color:var(--ok-fg,green);margin-right:3px" title="Save" onclick="fruUniSave(this)">✓</button>'
|
||||
+ '<button style="'+_actBtn+'color:var(--crit-fg,#9f3a38)" title="Cancel" onclick="fruUniCancel(this)">✗</button>'
|
||||
+ '<span class="fru-uni-msg" style="font-size:11px;margin-left:5px;color:var(--muted)"></span>'
|
||||
+ '</td>'
|
||||
+ '</tr>';
|
||||
totalFields++;
|
||||
});
|
||||
});
|
||||
|
||||
renderSrcStatus(perSource);
|
||||
|
||||
if (totalFields === 0) {
|
||||
status.textContent = 'No editable fields available — see per-source status below.';
|
||||
status.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
table.innerHTML = '';
|
||||
return;
|
||||
}
|
||||
|
||||
table.innerHTML = '<table style="width:100%;border-collapse:collapse">'+rows+'</table>';
|
||||
status.textContent = totalFields + ' field(s) loaded';
|
||||
status.style.color = 'var(--muted)';
|
||||
});
|
||||
};
|
||||
|
||||
window.fruUniChanged = function(inp) {
|
||||
var row = inp.closest('tr');
|
||||
row.querySelector('.fru-uni-act').style.display = inp.value !== inp.dataset.original ? '' : 'none';
|
||||
row.querySelector('.fru-uni-msg').textContent = '';
|
||||
};
|
||||
|
||||
window.fruUniCancel = function(btn) {
|
||||
var row = btn.closest('tr');
|
||||
var inp = row.querySelector('.fru-uni-inp');
|
||||
inp.value = inp.dataset.original;
|
||||
row.querySelector('.fru-uni-act').style.display = 'none';
|
||||
row.querySelector('.fru-uni-msg').textContent = '';
|
||||
};
|
||||
|
||||
window.fruUniSave = function(btn) {
|
||||
var row = btn.closest('tr');
|
||||
var inp = row.querySelector('.fru-uni-inp');
|
||||
var msg = row.querySelector('.fru-uni-msg');
|
||||
var cancelBtn = row.querySelectorAll('.fru-uni-act button')[1];
|
||||
var src = SOURCES.find(function(s){ return s.id === inp.dataset.source; });
|
||||
if (!src) { msg.textContent = 'Unknown source'; msg.style.color='var(--crit-fg)'; return; }
|
||||
|
||||
btn.disabled = true; cancelBtn.disabled = true;
|
||||
msg.textContent = '…'; msg.style.color = 'var(--muted)';
|
||||
|
||||
fetch(src.writeUrl, {method:'POST', headers:{'Content-Type':'application/json'}, body:src.writeBody(inp)})
|
||||
.then(function(r){ return r.json().then(function(d){ if(!r.ok) throw new Error(d.error||r.statusText); return d; }); })
|
||||
.then(function(d) {
|
||||
var poll = setInterval(function() {
|
||||
fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(tasks){
|
||||
var t = Array.isArray(tasks) ? tasks.find(function(x){return x.id===d.task_id;}) : null;
|
||||
if (!t) return;
|
||||
if (t.status==='done') {
|
||||
clearInterval(poll);
|
||||
inp.dataset.original = inp.value;
|
||||
row.querySelector('.fru-uni-act').style.display = 'none';
|
||||
msg.textContent = ''; msg.style.color = '';
|
||||
} else if (t.status==='failed'||t.status==='cancelled') {
|
||||
clearInterval(poll);
|
||||
msg.textContent = t.error||t.status; msg.style.color = 'var(--crit-fg)';
|
||||
btn.disabled = false; cancelBtn.disabled = false;
|
||||
}
|
||||
});
|
||||
}, 1500);
|
||||
})
|
||||
.catch(function(e) {
|
||||
msg.textContent = 'Error: '+e.message; msg.style.color = 'var(--crit-fg)';
|
||||
btn.disabled = false; cancelBtn.disabled = false;
|
||||
});
|
||||
};
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
|
||||
|
||||
@@ -7,34 +7,76 @@ func renderSettings(opts HandlerOptions) string {
|
||||
if version == "" {
|
||||
version = "dev"
|
||||
}
|
||||
return `<div class="grid2">
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Blackbox Logging</div>
|
||||
return `<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">System Install</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:14px">Continuous hardware monitoring that writes a rolling log of sensor readings to the export directory. Useful for capturing thermal or power anomalies during long runs.</p>
|
||||
<div style="display:flex;gap:8px;align-items:center">
|
||||
<button class="btn btn-primary btn-sm" onclick="blackboxToggle('enable')">Enable</button>
|
||||
<button class="btn btn-secondary btn-sm" onclick="blackboxToggle('disable')">Disable</button>
|
||||
<span id="blackbox-status" style="font-size:12px;color:var(--muted)">Loading...</span>
|
||||
<div style="margin-bottom:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to RAM</div>
|
||||
<p id="boot-source-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Detecting boot source...</p>
|
||||
<p id="ram-status-text" style="color:var(--muted);font-size:13px;margin-bottom:8px">Checking...</p>
|
||||
<button id="ram-install-btn" class="btn btn-primary" onclick="installToRAM()" style="display:none">▶ Copy to RAM</button>
|
||||
</div>
|
||||
<div style="border-top:1px solid var(--line);padding-top:20px">
|
||||
<div style="font-weight:600;margin-bottom:8px">Install to Disk</div>` +
|
||||
renderInstallInline() + `
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
fetch('/api/system/ram-status').then(r=>r.json()).then(d=>{
|
||||
const boot = document.getElementById('boot-source-text');
|
||||
const txt = document.getElementById('ram-status-text');
|
||||
const btn = document.getElementById('ram-install-btn');
|
||||
let kind = d.kind || 'unknown';
|
||||
let source = d.device || d.source || 'unknown source';
|
||||
let label = kind==='ram'?'RAM':kind==='usb'?'USB ('+source+')':kind==='cdrom'?'CD-ROM ('+source+')':kind==='disk'?'disk ('+source+')':source;
|
||||
boot.textContent = 'Current boot source: ' + label + '.';
|
||||
txt.textContent = d.blocked_reason || d.message || 'Checking...';
|
||||
txt.style.color = (d.status==='ok'||d.in_ram)?'var(--ok,green)':d.status==='failed'?'var(--err,#b91c1c)':'var(--muted)';
|
||||
if (d.can_start_task) { btn.style.display=''; btn.disabled=false; } else { btn.style.display='none'; }
|
||||
});
|
||||
function installToRAM() {
|
||||
document.getElementById('ram-install-btn').disabled = true;
|
||||
fetch('/api/system/install-to-ram', {method:'POST'}).then(r=>r.json()).then(d=>{
|
||||
window.location.href = '/tasks#' + d.task_id;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">Support Bundle</div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Downloads a tar.gz archive of all audit files, SAT results, and logs.</p>
|
||||
` + renderSupportBundleInline() + `
|
||||
<div style="border-top:1px solid var(--border);margin-top:16px;padding-top:16px">
|
||||
<div style="font-weight:600;margin-bottom:8px">USB Black-Box</div>
|
||||
` + renderUSBExportInline() + `
|
||||
</div>
|
||||
</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Tool Check <button class="btn btn-sm btn-secondary" onclick="checkTools()" style="margin-left:auto">↻ Check</button></div>
|
||||
<div class="card-body"><div id="tools-table"><p style="color:var(--muted);font-size:13px">Checking...</p></div></div></div>
|
||||
<script>
|
||||
function checkTools() {
|
||||
document.getElementById('tools-table').innerHTML = '<p style="color:var(--muted);font-size:13px">Checking...</p>';
|
||||
fetch('/api/tools/check').then(r=>r.json()).then(tools => {
|
||||
const rows = tools.map(t =>
|
||||
'<tr><td>'+t.Name+'</td><td><span class="badge '+(t.OK?'badge-ok':'badge-err')+'">'+(t.OK?'✓ '+t.Path:'✗ missing')+'</span></td></tr>'
|
||||
).join('');
|
||||
document.getElementById('tools-table').innerHTML = '<table><tr><th>Tool</th><th>Status</th></tr>'+rows+'</table>';
|
||||
});
|
||||
}
|
||||
checkTools();
|
||||
</script>
|
||||
|
||||
<div class="card"><div class="card-head">NVIDIA Self Heal</div><div class="card-body">` +
|
||||
renderNvidiaSelfHealInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Network</div><div class="card-body">` +
|
||||
renderNetworkInline() + `</div></div>
|
||||
|
||||
<div class="card"><div class="card-head">Services</div><div class="card-body">` +
|
||||
renderServicesInline() + `</div></div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">NVIDIA Recovery</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:14px">Reset NVIDIA GPU driver state. Use when <code>nvidia-smi</code> reports errors or GPUs appear stuck after a failed test.</p>
|
||||
<div style="display:flex;gap:8px;align-items:center">
|
||||
<button class="btn btn-danger btn-sm" onclick="nvidiaReset()">Reset NVIDIA Driver</button>
|
||||
<span id="nvidia-reset-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="card" style="margin-top:0">
|
||||
<div class="card-head">Build Info</div>
|
||||
<div class="card-body">
|
||||
<table style="width:auto">
|
||||
@@ -46,32 +88,28 @@ func renderSettings(opts HandlerOptions) string {
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="card">
|
||||
<div class="card-head">Power</div>
|
||||
<div class="card-body">
|
||||
<div style="display:flex;gap:8px;align-items:center">
|
||||
<button class="btn btn-secondary btn-sm" onclick="systemPower('reboot')">Reboot</button>
|
||||
<button class="btn btn-secondary btn-sm" onclick="systemPower('shutdown')">Shutdown</button>
|
||||
<span id="power-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
(function() {
|
||||
fetch('/api/blackbox/status', {cache:'no-store'}).then(r => r.json()).then(d => {
|
||||
var el = document.getElementById('blackbox-status');
|
||||
if (el) el.textContent = d.enabled ? 'Enabled' : 'Disabled';
|
||||
}).catch(() => {
|
||||
var el = document.getElementById('blackbox-status');
|
||||
if (el) el.textContent = 'Status unavailable';
|
||||
});
|
||||
})();
|
||||
function blackboxToggle(action) {
|
||||
var el = document.getElementById('blackbox-status');
|
||||
if (el) el.textContent = 'Updating...';
|
||||
fetch('/api/blackbox/' + action, {method:'POST', cache:'no-store'})
|
||||
.then(r => r.json())
|
||||
.then(d => { if (el) el.textContent = d.enabled ? 'Enabled' : 'Disabled'; })
|
||||
.catch(err => { if (el) el.textContent = 'Error: ' + err.message; });
|
||||
function systemPower(action) {
|
||||
var label = action === 'reboot' ? 'reboot' : 'shut down';
|
||||
if (!confirm('Are you sure you want to ' + label + ' the server?')) return;
|
||||
var el = document.getElementById('power-status');
|
||||
if (el) el.textContent = action === 'reboot' ? 'Rebooting...' : 'Shutting down...';
|
||||
fetch('/api/system/' + action, {method: 'POST'})
|
||||
.then(function(r) { return r.json(); })
|
||||
.catch(function(e) { if (el) el.textContent = 'Error: ' + e.message; });
|
||||
}
|
||||
function nvidiaReset() {
|
||||
var el = document.getElementById('nvidia-reset-status');
|
||||
if (!confirm('Reset NVIDIA driver? This will interrupt any running GPU tasks.')) return;
|
||||
if (el) el.textContent = 'Resetting...';
|
||||
fetch('/api/gpu/nvidia-reset', {method:'POST', cache:'no-store'})
|
||||
.then(r => r.json())
|
||||
.then(d => { if (el) el.textContent = d.error ? ('Error: ' + d.error) : 'Done — driver reset.'; })
|
||||
.catch(err => { if (el) el.textContent = 'Error: ' + err.message; });
|
||||
}
|
||||
</script>`
|
||||
</script>
|
||||
|
||||
`
|
||||
}
|
||||
|
||||
@@ -68,6 +68,14 @@ func validateTotalStressSec(n int) int {
|
||||
}
|
||||
|
||||
func renderValidate(opts HandlerOptions) string {
|
||||
return renderValidateMode(opts, false)
|
||||
}
|
||||
|
||||
func renderValidateStress(opts HandlerOptions) string {
|
||||
return renderValidateMode(opts, true)
|
||||
}
|
||||
|
||||
func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
|
||||
inv := loadValidateInventory(opts)
|
||||
n := inv.NvidiaGPUCount
|
||||
validateTotalStr := validateFmtDur(validateTotalValidateSec(n))
|
||||
@@ -76,26 +84,49 @@ func renderValidate(opts HandlerOptions) string {
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
estStr := validateTotalStr
|
||||
if stressDefault {
|
||||
estStr = stressTotalStr
|
||||
}
|
||||
alert := `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Validate tests collect diagnostics only. They do not write to disks, do not run sustained load, and do not increment hardware wear counters.</div>`
|
||||
if stressDefault {
|
||||
alert = `<div class="alert alert-warn" style="margin-bottom:16px"><strong>⚠ Stress mode:</strong> Runs extended load tests — CPU stress-ng, memory passes, DCGM targeted diagnostics. Higher wear than Validate.</div>`
|
||||
}
|
||||
|
||||
<div class="card" style="margin-bottom:16px">
|
||||
<div class="card-head">Validate Profile</div>
|
||||
<div class="card-body validate-profile-body">
|
||||
<div class="validate-profile-col">
|
||||
<div class="form-row" style="margin:12px 0 0"><label>Mode</label></div>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-validate" value="validate" checked onchange="satModeChanged()"><span>Validate — quick non-destructive check</span></label>
|
||||
<label class="cb-row"><input type="radio" name="sat-mode" id="sat-mode-stress" value="stress" onchange="satModeChanged()"><span>Stress — thorough load test (` + stressTotalStr + gpuNote + `)</span></label>
|
||||
</div>
|
||||
<div class="validate-profile-col validate-profile-action">
|
||||
<p style="color:var(--muted);font-size:12px;margin:0 0 10px">Runs validate modules sequentially. Validate: ` + validateTotalStr + gpuNote + `; Stress: ` + stressTotalStr + gpuNote + `. Estimates are based on real log data and scale with GPU count.</p>
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Validate one by one</button>
|
||||
<div style="margin-top:12px">
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
stressOnlyCards := ""
|
||||
if stressDefault {
|
||||
stressOnlyCards = renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec)+` (all GPUs simultaneously).`,
|
||||
)) +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`,
|
||||
))
|
||||
}
|
||||
|
||||
satStressModeJS := "function satStressMode() { return false; }"
|
||||
if stressDefault {
|
||||
satStressModeJS = "function satStressMode() { return true; }"
|
||||
}
|
||||
|
||||
return alert + `
|
||||
<p style="color:var(--muted);font-size:13px;margin-bottom:16px">Tasks continue in the background — view progress in <a href="/tasks">Tasks</a>.</p>
|
||||
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||
<button type="button" class="btn btn-primary" onclick="runAllSAT()">Run All</button>
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
<span style="font-size:12px;color:var(--muted)">est. ` + estStr + gpuNote + `</span>
|
||||
</div>
|
||||
|
||||
<div class="grid3">
|
||||
` + renderSATCard("cpu", "CPU", "runSAT('cpu')", "", renderValidateCardBody(
|
||||
@@ -112,9 +143,9 @@ func renderValidate(opts HandlerOptions) string {
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||
`Collects SMART data and runs a short self-test on each storage device.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme id-ctrl</code>, <code>nvme smart-log</code>, <code>nvme device-self-test -s 1</code>; SATA/SAS: <code>smartctl -H -A</code>, <code>smartctl -t short</code>`,
|
||||
`~2 min per device (NVMe short self-test; SATA/SAS short self-test — duration device-dependent).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
@@ -122,7 +153,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
<div class="card-head">NVIDIA GPU Selection</div>
|
||||
<div class="card-body">
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 8px">` + inv.NVIDIA + `</p>
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Validate one by one.</p>
|
||||
<p style="font-size:12px;color:var(--muted);margin:0 0 10px">All NVIDIA validate tasks use only the GPUs selected here. The same selection is used by Run All.</p>
|
||||
<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectAllGPUs()">Select All</button>
|
||||
<button class="btn btn-sm btn-secondary" type="button" onclick="satSelectNoGPUs()">Clear</button>
|
||||
@@ -143,46 +174,19 @@ func renderValidate(opts HandlerOptions) string {
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUValidateSec),
|
||||
validateFmtDur(platform.SATEstimatedNvidiaGPUStressSec)),
|
||||
)) +
|
||||
`<div id="sat-card-nvidia-targeted-stress">` +
|
||||
renderSATCard("nvidia-targeted-stress", "NVIDIA GPU Targeted Stress", "runNvidiaValidateSet('nvidia-targeted-stress')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Runs a controlled NVIDIA DCGM load to check stability under moderate stress.`,
|
||||
`<code>dcgmi diag targeted_stress</code>`,
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedStressSec) + ` (all GPUs simultaneously).<p id="sat-ts-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-targeted-power">` +
|
||||
renderSATCard("nvidia-targeted-power", "NVIDIA Targeted Power", "runNvidiaValidateSet('nvidia-targeted-power')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks that the GPU can sustain its declared power delivery envelope. Pass/fail determined by DCGM.`,
|
||||
`<code>dcgmi diag targeted_power</code>`,
|
||||
"Skipped in Validate. Stress: " + validateFmtDur(platform.SATEstimatedNvidiaTargetedPowerSec) + ` (all GPUs simultaneously).<p id="sat-tp-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-pulse">` +
|
||||
renderSATCard("nvidia-pulse", "NVIDIA PSU Pulse Test", "runNvidiaFabricValidate('nvidia-pulse')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Tests power supply transient response by pulsing all GPUs simultaneously between idle and full load. Synchronous pulses across all GPUs create worst-case PSU load spikes — running per-GPU would miss PSU-level failures.`,
|
||||
`<code>dcgmi diag pulse_test</code>`,
|
||||
`Skipped in Validate. Stress: `+validateFmtDur(platform.SATEstimatedNvidiaPulseTestSec)+` (all GPUs simultaneously; measured on 8-GPU system).`+`<p id="sat-pt-mode-hint" style="color:var(--warn-fg);font-size:12px;margin:8px 0 0">Only runs in Stress mode. Switch mode above to enable in Run All.</p>`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-interconnect">` +
|
||||
stressOnlyCards +
|
||||
renderSATCard("nvidia-interconnect", "NVIDIA Interconnect (NCCL)", "runNvidiaFabricValidate('nvidia-interconnect')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Verifies NVLink/NVSwitch fabric bandwidth using NCCL all_reduce_perf across all selected GPUs. Pass/fail based on achieved bandwidth vs. theoretical.`,
|
||||
`<code>all_reduce_perf</code> (NCCL tests)`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaInterconnectSec)+` (all GPUs simultaneously, requires ≥2).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`<div id="sat-card-nvidia-bandwidth">` +
|
||||
renderSATCard("nvidia-bandwidth", "NVIDIA Bandwidth (NVBandwidth)", "runNvidiaFabricValidate('nvidia-bandwidth')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Validates GPU memory copy and peer-to-peer bandwidth paths using NVBandwidth.`,
|
||||
`<code>nvbandwidth</code>`,
|
||||
`Validate and Stress: `+validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||
validateFmtDur(platform.SATEstimatedNvidiaBandwidthSec)+` (all GPUs simultaneously; nvbandwidth runs all built-in tests without a time limit — duration set by the tool).`,
|
||||
)) +
|
||||
`</div>` +
|
||||
`</div>
|
||||
<div class="grid3" style="margin-top:16px">
|
||||
` + renderSATCard("amd", "AMD GPU", "runAMDValidateSet()", "", renderValidateCardBody(
|
||||
@@ -197,36 +201,15 @@ func renderValidate(opts HandlerOptions) string {
|
||||
<div class="card-body"><div id="sat-terminal" class="terminal"></div></div>
|
||||
</div>
|
||||
<style>
|
||||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
.validate-card-section:last-child { padding-bottom:16px; }
|
||||
.sat-gpu-row { display:flex; align-items:flex-start; gap:8px; padding:6px 0; cursor:pointer; font-size:13px; }
|
||||
.sat-gpu-row input[type=checkbox] { width:16px; height:16px; margin-top:2px; flex-shrink:0; }
|
||||
@media(max-width:900px){ .validate-profile-body { grid-template-columns:1fr; } }
|
||||
</style>
|
||||
<script>
|
||||
let satES = null;
|
||||
function satStressMode() {
|
||||
return document.querySelector('input[name="sat-mode"]:checked')?.value === 'stress';
|
||||
}
|
||||
function satModeChanged() {
|
||||
const stress = satStressMode();
|
||||
[
|
||||
{card: 'sat-card-nvidia-targeted-stress', hint: 'sat-ts-mode-hint'},
|
||||
{card: 'sat-card-nvidia-targeted-power', hint: 'sat-tp-mode-hint'},
|
||||
{card: 'sat-card-nvidia-pulse', hint: 'sat-pt-mode-hint'},
|
||||
].forEach(function(item) {
|
||||
const card = document.getElementById(item.card);
|
||||
if (card) {
|
||||
card.style.opacity = stress ? '1' : '0.5';
|
||||
const hint = document.getElementById(item.hint);
|
||||
if (hint) hint.style.display = stress ? 'none' : '';
|
||||
}
|
||||
});
|
||||
}
|
||||
` + satStressModeJS + `
|
||||
function satLabels() {
|
||||
return {nvidia:'Validate GPU', 'nvidia-targeted-stress':'NVIDIA Targeted Stress (dcgmi diag targeted_stress)', 'nvidia-targeted-power':'NVIDIA Targeted Power (dcgmi diag targeted_power)', 'nvidia-pulse':'NVIDIA PSU Pulse Test (dcgmi diag pulse_test)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL all_reduce_perf)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
}
|
||||
@@ -667,7 +650,7 @@ func renderCheck(opts HandlerOptions) string {
|
||||
if n > 0 {
|
||||
gpuNote = fmt.Sprintf(" (%d GPU)", n)
|
||||
}
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Check tests collect diagnostics only — no writes to disks, no sustained load, no hardware wear counters incremented. For stress testing, go to <a href="/load">3. Load</a>.</div>
|
||||
return `<div class="alert alert-info" style="margin-bottom:16px"><strong>Non-destructive:</strong> Check tests collect diagnostics only — no writes to disks, no sustained load, no hardware wear counters incremented. For stress testing, go to <a href="/burn">4. Burn</a>.</div>
|
||||
<div style="display:flex;align-items:center;gap:12px;margin-bottom:16px">
|
||||
<button type="button" class="btn btn-primary" onclick="runAllCheckSAT()">Run All Checks</button>
|
||||
<span id="sat-all-status" style="font-size:12px;color:var(--muted)"></span>
|
||||
@@ -689,9 +672,15 @@ func renderCheck(opts HandlerOptions) string {
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds (NVMe: instant device query; SATA/SAS: short self-test).`,
|
||||
`Collects SMART health and attributes for each storage device. No self-test is triggered — read-only query only.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme id-ctrl</code>, <code>nvme smart-log</code>; SATA/SAS: <code>smartctl -H -A</code>`,
|
||||
`Seconds — instantaneous device query, no wear counters incremented.`,
|
||||
)) +
|
||||
renderSATCard("confidential-computing", "Confidential Computing", "runSAT('confidential-computing')", "", renderValidateCardBody(
|
||||
inv.NVIDIA,
|
||||
`Checks whether this server can run NVIDIA Confidential Computing: CPU TEE support (Intel TDX / AMD SEV-SNP) and GPU firmware CC capability. Read-only — changes nothing.`,
|
||||
`<code>nvidia-smi conf-compute -q</code>, <code>dmesg</code>, <code>/sys/module/kvm_amd/parameters/*</code>`,
|
||||
`Seconds — read-only query only.`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
@@ -754,7 +743,7 @@ func renderCheck(opts HandlerOptions) string {
|
||||
<script>
|
||||
let satES = null;
|
||||
function satLabels() {
|
||||
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
return {nvidia:'Check GPU (DCGM L2)', 'nvidia-interconnect':'NVIDIA Interconnect (NCCL)', 'nvidia-bandwidth':'NVIDIA Bandwidth (NVBandwidth)', memory:'Check Memory', storage:'Check Storage', cpu:'Check CPU', amd:'Check AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth', 'confidential-computing':'Check Confidential Computing'};
|
||||
}
|
||||
let satNvidiaGPUsPromise = null;
|
||||
function loadSatNvidiaGPUs() {
|
||||
@@ -890,7 +879,7 @@ function runAllCheckSAT() {
|
||||
status.textContent = 'Enqueuing...';
|
||||
const nvidiaIndices = satSelectedGPUIndices();
|
||||
const nvidiaAllTargets = ['nvidia', 'nvidia-interconnect', 'nvidia-bandwidth'];
|
||||
const baseTargets = ['cpu', 'memory', 'storage'];
|
||||
const baseTargets = ['cpu', 'memory', 'storage', 'confidential-computing'];
|
||||
const amdTargets = selectedAMDValidateTargets();
|
||||
const expanded = [];
|
||||
baseTargets.forEach(t => expanded.push({target: t}));
|
||||
|
||||
@@ -33,36 +33,36 @@ func renderPage(page string, opts HandlerOptions) string {
|
||||
case "load":
|
||||
pageID = "load"
|
||||
title = "3. Load"
|
||||
body = renderLoad()
|
||||
case "speed":
|
||||
pageID = "speed"
|
||||
title = "4. Speed"
|
||||
body = renderSpeed(opts)
|
||||
case "endurance":
|
||||
pageID = "endurance"
|
||||
title = "5. Endurance"
|
||||
body = renderEndurance(opts)
|
||||
body = renderValidateStress(opts)
|
||||
case "burn":
|
||||
pageID = "burn"
|
||||
title = "4. Burn"
|
||||
body = renderBurn()
|
||||
case "benchmark":
|
||||
pageID = "benchmark"
|
||||
title = "5. Benchmark"
|
||||
body = renderBenchmark(opts)
|
||||
case "tools":
|
||||
pageID = "tools"
|
||||
title = "6. Tools"
|
||||
title = "Tools"
|
||||
body = renderTools()
|
||||
case "settings":
|
||||
pageID = "settings"
|
||||
title = "7. Settings"
|
||||
title = "Settings"
|
||||
body = renderSettings(opts)
|
||||
// Legacy routes (redirected at HTTP level in handlePage; these are fallbacks)
|
||||
case "validate", "tests":
|
||||
pageID = "check"
|
||||
title = "2. Check"
|
||||
body = renderCheck(opts)
|
||||
case "burn", "burn-in":
|
||||
pageID = "load"
|
||||
title = "3. Load"
|
||||
body = renderLoad()
|
||||
case "benchmark":
|
||||
pageID = "speed"
|
||||
title = "4. Speed"
|
||||
body = renderSpeed(opts)
|
||||
body = renderValidate(opts)
|
||||
case "burn-in":
|
||||
pageID = "burn"
|
||||
title = "4. Burn"
|
||||
body = renderBurn()
|
||||
case "speed", "endurance":
|
||||
pageID = "benchmark"
|
||||
title = "5. Benchmark"
|
||||
body = renderBenchmark(opts)
|
||||
case "tasks":
|
||||
pageID = "tasks"
|
||||
title = "Tasks"
|
||||
|
||||
@@ -0,0 +1,857 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// --- Response types ---
|
||||
|
||||
// raidDriveInfo describes one physical drive in the RAID status response.
//
// LSI drives are identified by Slot (the storcli "EID:Slt" value) and carry
// the raw storcli state (for example "Frgn", "UGood" or "JBOD"). VROC drives
// are identified by Device (a /dev path) and use the states "available" or
// "member". Empty fields are omitted from the JSON encoding.
type raidDriveInfo struct {
	Slot   string  `json:"slot,omitempty"`
	Device string  `json:"device,omitempty"`
	Model  string  `json:"model,omitempty"`
	SizeGB float64 `json:"size_gb,omitempty"`
	Serial string  `json:"serial,omitempty"`
	State  string  `json:"state,omitempty"`
}

// raidArrayInfo describes one md array: its name, RAID level, member device
// names and whether its status map reports a missing member.
type raidArrayInfo struct {
	Name     string   `json:"name"`
	Level    string   `json:"level,omitempty"`
	Members  []string `json:"members"`
	Degraded bool     `json:"degraded"`
}

// raidControllerInfo describes one RAID controller and the drives behind it.
//
// ID has the form "lsi-<index>" for LSI controllers and "vroc-0" for VROC;
// Type is "lsi" or "vroc". The three drive lists are always encoded (never
// omitted), while Arrays is omitted when empty.
type raidControllerInfo struct {
	ID            string          `json:"id"`
	Type          string          `json:"type"`
	Index         int             `json:"index"`
	Model         string          `json:"model"`
	ForeignDrives []raidDriveInfo `json:"foreign_drives"`
	FreeDrives    []raidDriveInfo `json:"free_drives"`
	AllDrives     []raidDriveInfo `json:"all_drives"`
	Arrays        []raidArrayInfo `json:"arrays,omitempty"`
}

// raidStatusResp is the JSON body returned by the RAID status endpoint.
type raidStatusResp struct {
	Controllers []raidControllerInfo `json:"controllers"`
}
|
||||
|
||||
// --- LSI/storcli detection ---
|
||||
|
||||
func detectLSIControllers() []raidControllerInfo {
|
||||
ctrlOut, err := exec.Command("storcli64", "/call", "show", "J").Output()
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
var ctrlDoc struct {
|
||||
Controllers []struct {
|
||||
ResponseData struct {
|
||||
Basics struct {
|
||||
Controller int `json:"Controller"`
|
||||
Model string `json:"Model"`
|
||||
} `json:"Basics"`
|
||||
} `json:"Response Data"`
|
||||
} `json:"Controllers"`
|
||||
}
|
||||
if err := json.Unmarshal(ctrlOut, &ctrlDoc); err != nil || len(ctrlDoc.Controllers) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
driveOut, _ := exec.Command("storcli64", "/call/eall/sall", "show", "all", "J").Output()
|
||||
|
||||
var driveDoc struct {
|
||||
Controllers []struct {
|
||||
ResponseData struct {
|
||||
DriveInformation []struct {
|
||||
EIDSlt string `json:"EID:Slt"`
|
||||
State string `json:"State"`
|
||||
Size string `json:"Size"`
|
||||
Intf string `json:"Intf"`
|
||||
Med string `json:"Med"`
|
||||
Model string `json:"Model"`
|
||||
SN string `json:"SN"`
|
||||
} `json:"Drive Information"`
|
||||
} `json:"Response Data"`
|
||||
} `json:"Controllers"`
|
||||
}
|
||||
if len(driveOut) > 0 {
|
||||
json.Unmarshal(driveOut, &driveDoc) //nolint:errcheck
|
||||
}
|
||||
|
||||
var controllers []raidControllerInfo
|
||||
for i, c := range ctrlDoc.Controllers {
|
||||
ctrl := raidControllerInfo{
|
||||
ID: fmt.Sprintf("lsi-%d", c.ResponseData.Basics.Controller),
|
||||
Type: "lsi",
|
||||
Index: c.ResponseData.Basics.Controller,
|
||||
Model: c.ResponseData.Basics.Model,
|
||||
ForeignDrives: []raidDriveInfo{},
|
||||
FreeDrives: []raidDriveInfo{},
|
||||
AllDrives: []raidDriveInfo{},
|
||||
}
|
||||
if ctrl.Model == "" {
|
||||
ctrl.Model = fmt.Sprintf("LSI Controller %d", ctrl.Index)
|
||||
}
|
||||
|
||||
if i < len(driveDoc.Controllers) {
|
||||
for _, d := range driveDoc.Controllers[i].ResponseData.DriveInformation {
|
||||
info := raidDriveInfo{
|
||||
Slot: strings.TrimSpace(d.EIDSlt),
|
||||
Model: strings.TrimSpace(d.Model),
|
||||
State: strings.TrimSpace(d.State),
|
||||
SizeGB: raidParseHumanSizeGB(d.Size),
|
||||
Serial: strings.TrimSpace(d.SN),
|
||||
}
|
||||
ctrl.AllDrives = append(ctrl.AllDrives, info)
|
||||
switch strings.TrimSpace(d.State) {
|
||||
case "Frgn":
|
||||
ctrl.ForeignDrives = append(ctrl.ForeignDrives, info)
|
||||
case "UGood", "JBOD":
|
||||
ctrl.FreeDrives = append(ctrl.FreeDrives, info)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
controllers = append(controllers, ctrl)
|
||||
}
|
||||
return controllers
|
||||
}
|
||||
|
||||
// --- VROC/mdadm detection ---
|
||||
|
||||
// raidMDStatDegradedRx matches the per-member status map that /proc/mdstat
// prints for an array, e.g. "[UU]" or "[U_]".
var raidMDStatDegradedRx = regexp.MustCompile(`\[[U_]+\]`)

// mdStatEntry is one array parsed from /proc/mdstat.
type mdStatEntry struct {
	Name     string   // text before " : " on the array header line, e.g. "md0"
	Level    string   // last header token starting with "raid" or "linear"; empty if none
	Members  []string // member device names with the "[n]" index and state flags removed
	Degraded bool     // true when the status map contains "_"
}

// mdStatMemberName extracts the device name from an mdstat member token such
// as "sda1[0]". The kernel appends state flags to some members ("(F)" faulty,
// "(S)" spare, "(W)" write-mostly, "(J)" journal, "(R)" replacement), possibly
// several in a row; those are stripped first so flagged devices are still
// recognised as members. ok is false when tok is not a member token.
func mdStatMemberName(tok string) (name string, ok bool) {
	base := tok
	for len(base) >= 3 && base[len(base)-3] == '(' && base[len(base)-1] == ')' {
		base = base[:len(base)-3]
	}
	bracket := strings.Index(base, "[")
	if bracket <= 0 || !strings.HasSuffix(base, "]") {
		return "", false
	}
	return base[:bracket], true
}

// parseRAIDMDStat parses the text of /proc/mdstat into one entry per array.
//
// A line containing " : " (other than the "Personalities" and "unused
// devices" lines) starts a new array. Its tokens supply the RAID level and
// the member devices. Members carrying state flags such as "(F)" or "(S)" are
// included, so failed and spare devices count as belonging to the array.
// Following lines are scanned for the status map; a "_" in it marks the most
// recently started array as degraded. Returns nil when raw has no arrays.
func parseRAIDMDStat(raw string) []mdStatEntry {
	var entries []mdStatEntry
	for _, line := range strings.Split(raw, "\n") {
		if strings.HasPrefix(line, "Personalities") || strings.HasPrefix(line, "unused devices") {
			continue
		}
		if idx := strings.Index(line, " : "); idx > 0 {
			entry := mdStatEntry{Name: strings.TrimSpace(line[:idx])}
			for _, tok := range strings.Fields(line[idx+3:]) {
				if strings.HasPrefix(tok, "raid") || strings.HasPrefix(tok, "linear") {
					entry.Level = tok
				}
				if member, ok := mdStatMemberName(tok); ok {
					entry.Members = append(entry.Members, member)
				}
			}
			entries = append(entries, entry)
			continue
		}
		if len(entries) == 0 {
			continue
		}
		// Detail line of the current (last) array: look for a status map
		// with at least one missing member.
		if m := raidMDStatDegradedRx.FindString(line); strings.Contains(m, "_") {
			entries[len(entries)-1].Degraded = true
		}
	}
	return entries
}
|
||||
|
||||
// raidVROCPortRx captures the first token after the colon on "PortN" lines of
// `mdadm --detail-platform` output, for example
// "  Port2 : /dev/sda (SERIAL123)" (captures "/dev/sda") or
// "  Port3 : - no device attached -" (captures "-").
var raidVROCPortRx = regexp.MustCompile(`^\s*Port\d+\s*:\s*(\S+)`)

// parseVROCPorts scans `mdadm --detail-platform` output and returns the set
// of block device basenames (e.g. "sda") listed on the controller's "PortN"
// lines. Port lines whose value is not a /dev/ path, such as empty ports, are
// ignored, so drives that are not attached through these ports never appear
// in the result. The returned map is empty, not nil, when nothing matches.
func parseVROCPorts(raw string) map[string]bool {
	const devPrefix = "/dev/"

	attached := make(map[string]bool)
	for _, row := range strings.Split(raw, "\n") {
		match := raidVROCPortRx.FindStringSubmatch(row)
		if match == nil || !strings.HasPrefix(match[1], devPrefix) {
			continue
		}
		attached[match[1][len(devPrefix):]] = true
	}
	return attached
}
|
||||
|
||||
func detectVROCController() *raidControllerInfo {
|
||||
out, err := exec.Command("mdadm", "--detail-platform").CombinedOutput()
|
||||
if err != nil && len(out) == 0 {
|
||||
return nil
|
||||
}
|
||||
hasVROC := false
|
||||
for _, line := range strings.Split(string(out), "\n") {
|
||||
lower := strings.ToLower(line)
|
||||
if strings.Contains(lower, "license") || strings.Contains(lower, "intel") || strings.Contains(lower, "platform") {
|
||||
hasVROC = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasVROC {
|
||||
return nil
|
||||
}
|
||||
|
||||
ctrl := &raidControllerInfo{
|
||||
ID: "vroc-0",
|
||||
Type: "vroc",
|
||||
Model: "Intel VROC",
|
||||
ForeignDrives: []raidDriveInfo{},
|
||||
FreeDrives: []raidDriveInfo{},
|
||||
AllDrives: []raidDriveInfo{},
|
||||
}
|
||||
|
||||
ports := parseVROCPorts(string(out))
|
||||
// Some mdadm builds omit the "Port" lines from --detail-platform. When
|
||||
// we can't determine which drives are actually wired to this
|
||||
// controller, fall back to showing every disk not already in an array
|
||||
// rather than hiding everything.
|
||||
portsKnown := len(ports) > 0
|
||||
|
||||
inArray := map[string]bool{}
|
||||
raw, err := os.ReadFile("/proc/mdstat")
|
||||
if err == nil {
|
||||
for _, arr := range parseRAIDMDStat(string(raw)) {
|
||||
ctrl.Arrays = append(ctrl.Arrays, raidArrayInfo{
|
||||
Name: arr.Name,
|
||||
Level: arr.Level,
|
||||
Members: arr.Members,
|
||||
Degraded: arr.Degraded,
|
||||
})
|
||||
for _, m := range arr.Members {
|
||||
inArray[m] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
lsblkOut, err := exec.Command("lsblk", "-J", "-d", "-o", "NAME,SIZE,TYPE,MODEL,SERIAL").Output()
|
||||
if err == nil {
|
||||
var lsblkDoc struct {
|
||||
BlockDevices []struct {
|
||||
Name string `json:"name"`
|
||||
Size string `json:"size"`
|
||||
Type string `json:"type"`
|
||||
Model string `json:"model"`
|
||||
Serial string `json:"serial"`
|
||||
} `json:"blockdevices"`
|
||||
}
|
||||
if json.Unmarshal(lsblkOut, &lsblkDoc) == nil {
|
||||
for _, d := range lsblkDoc.BlockDevices {
|
||||
// Only consider disks wired to this controller's ports -
|
||||
// drives attached directly to the CPU (or another
|
||||
// controller) never show up as VROC ports and are skipped.
|
||||
if d.Type != "disk" || (portsKnown && !ports[d.Name]) {
|
||||
continue
|
||||
}
|
||||
info := raidDriveInfo{
|
||||
Device: "/dev/" + d.Name,
|
||||
Model: strings.TrimSpace(d.Model),
|
||||
Serial: strings.TrimSpace(d.Serial),
|
||||
State: "available",
|
||||
}
|
||||
if inArray[d.Name] {
|
||||
info.State = "member"
|
||||
}
|
||||
ctrl.AllDrives = append(ctrl.AllDrives, info)
|
||||
if info.State == "available" {
|
||||
ctrl.FreeDrives = append(ctrl.FreeDrives, info)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ctrl
|
||||
}
|
||||
|
||||
// --- API handlers ---
|
||||
|
||||
func (h *handler) handleAPIRAIDStatus(w http.ResponseWriter, r *http.Request) {
|
||||
resp := raidStatusResp{Controllers: []raidControllerInfo{}}
|
||||
|
||||
if lsi := detectLSIControllers(); len(lsi) > 0 {
|
||||
resp.Controllers = append(resp.Controllers, lsi...)
|
||||
}
|
||||
if vroc := detectVROCController(); vroc != nil {
|
||||
resp.Controllers = append(resp.Controllers, *vroc)
|
||||
}
|
||||
|
||||
writeJSON(w, resp)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIRAIDForeignAction(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
ControllerID string `json:"controller_id"`
|
||||
Action string `json:"action"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if req.Action != "import" && req.Action != "clear" {
|
||||
writeError(w, http.StatusBadRequest, "action must be 'import' or 'clear'")
|
||||
return
|
||||
}
|
||||
ctrlIdx, ok := parseLSIControllerIndex(req.ControllerID)
|
||||
if !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid controller_id")
|
||||
return
|
||||
}
|
||||
|
||||
target := "raid-foreign-clear"
|
||||
name := fmt.Sprintf("RAID Foreign Clear (ctrl %d)", ctrlIdx)
|
||||
if req.Action == "import" {
|
||||
target = "raid-foreign-import"
|
||||
name = fmt.Sprintf("RAID Foreign Import (ctrl %d)", ctrlIdx)
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID(target),
|
||||
Name: name,
|
||||
Target: target,
|
||||
Priority: defaultTaskPriority(target, taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{RAIDController: ctrlIdx},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIRAIDCreateMirror(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
ControllerID string `json:"controller_id"`
|
||||
Devices []string `json:"devices"`
|
||||
ArrayName string `json:"array_name"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Devices) < 2 {
|
||||
writeError(w, http.StatusBadRequest, "at least 2 devices required")
|
||||
return
|
||||
}
|
||||
|
||||
var target, name string
|
||||
var params taskParams
|
||||
|
||||
switch {
|
||||
case strings.HasPrefix(req.ControllerID, "lsi-"):
|
||||
ctrlIdx, ok := parseLSIControllerIndex(req.ControllerID)
|
||||
if !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid controller_id")
|
||||
return
|
||||
}
|
||||
target = "raid-lsi-create-mirror"
|
||||
name = fmt.Sprintf("Create RAID 1 Mirror (LSI ctrl %d)", ctrlIdx)
|
||||
params = taskParams{RAIDController: ctrlIdx, RAIDDevices: req.Devices}
|
||||
|
||||
case req.ControllerID == "vroc-0":
|
||||
arrayName := strings.TrimSpace(req.ArrayName)
|
||||
if arrayName == "" {
|
||||
arrayName = "bee-mirror0"
|
||||
}
|
||||
target = "raid-vroc-create-mirror"
|
||||
name = fmt.Sprintf("Create VROC RAID 1 (%s)", arrayName)
|
||||
params = taskParams{RAIDDevices: req.Devices, RAIDArrayName: arrayName}
|
||||
|
||||
default:
|
||||
writeError(w, http.StatusBadRequest, "unknown controller_id")
|
||||
return
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID(target),
|
||||
Name: name,
|
||||
Target: target,
|
||||
Priority: defaultTaskPriority(target, taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: params,
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIRAIDPrepareDrive(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
ControllerID string `json:"controller_id"`
|
||||
Slot string `json:"slot"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
ctrlIdx, ok := parseLSIControllerIndex(req.ControllerID)
|
||||
if !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid controller_id")
|
||||
return
|
||||
}
|
||||
if _, _, ok := parseRAIDSlot(req.Slot); !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid slot")
|
||||
return
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("raid-lsi-prepare-drive"),
|
||||
Name: fmt.Sprintf("Prepare drive %s (LSI ctrl %d)", req.Slot, ctrlIdx),
|
||||
Target: "raid-lsi-prepare-drive",
|
||||
Priority: defaultTaskPriority("raid-lsi-prepare-drive", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{RAIDController: ctrlIdx, RAIDSlot: req.Slot},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
// parseLSIControllerIndex extracts N from a controller id of the form "lsi-N".
//
// It returns (N, true) when the id carries the "lsi-" prefix and the remainder
// parses as a non-negative integer, and (0, false) in every other case.
func parseLSIControllerIndex(id string) (int, bool) {
	digits, hasPrefix := strings.CutPrefix(id, "lsi-")
	if !hasPrefix {
		return 0, false
	}
	idx, err := strconv.Atoi(digits)
	if err == nil && idx >= 0 {
		return idx, true
	}
	return 0, false
}
|
||||
|
||||
// --- Task runner functions ---
|
||||
|
||||
func runRAIDForeignClearTask(ctx context.Context, j *jobState, ctrl int) error {
|
||||
j.append(fmt.Sprintf("Clearing foreign configuration on controller %d...", ctrl))
|
||||
cmd := exec.CommandContext(ctx, "storcli64", fmt.Sprintf("/c%d/fall", ctrl), "del", "noprompt")
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func runRAIDForeignImportTask(ctx context.Context, j *jobState, ctrl int) error {
|
||||
j.append(fmt.Sprintf("Importing foreign configuration on controller %d...", ctrl))
|
||||
cmd := exec.CommandContext(ctx, "storcli64", fmt.Sprintf("/c%d/fall", ctrl), "import", "noprompt")
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func runRAIDLSICreateMirrorTask(ctx context.Context, j *jobState, ctrl int, drives []string) error {
|
||||
driveList := strings.Join(drives, ",")
|
||||
j.append(fmt.Sprintf("Creating RAID 1 on controller %d with drives: %s", ctrl, driveList))
|
||||
cmd := exec.CommandContext(ctx, "storcli64",
|
||||
fmt.Sprintf("/c%d", ctrl),
|
||||
"add", "vd", "type=raid1",
|
||||
fmt.Sprintf("drives=%s", driveList),
|
||||
"pdperarray=2",
|
||||
)
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
// parseRAIDSlot splits a storcli "EID:Slt" identifier (e.g. "252:0") into
// enclosure and slot numbers.
//
// Whitespace around the whole string and around each half is ignored. ok is
// false, and both numbers are 0, when the colon is missing, when either half
// is not a decimal integer, or when either number is negative. Negative values
// are rejected because callers format the result into storcli object paths of
// the form "/cN/eE/sS", and to stay consistent with parseLSIControllerIndex,
// which refuses negative controller indexes.
func parseRAIDSlot(slot string) (eid int, slt int, ok bool) {
	enclosure, bay, found := strings.Cut(strings.TrimSpace(slot), ":")
	if !found {
		return 0, 0, false
	}
	eid, errEID := strconv.Atoi(strings.TrimSpace(enclosure))
	slt, errSlt := strconv.Atoi(strings.TrimSpace(bay))
	if errEID != nil || errSlt != nil {
		return 0, 0, false
	}
	if eid < 0 || slt < 0 {
		return 0, 0, false
	}
	return eid, slt, true
}
|
||||
|
||||
func runRAIDPrepareDriveTask(ctx context.Context, j *jobState, ctrl int, slot string) error {
|
||||
eid, slt, ok := parseRAIDSlot(slot)
|
||||
if !ok {
|
||||
return fmt.Errorf("invalid slot %q", slot)
|
||||
}
|
||||
j.append(fmt.Sprintf("Preparing drive %s on controller %d (set good, force)...", slot, ctrl))
|
||||
cmd := exec.CommandContext(ctx, "storcli64",
|
||||
fmt.Sprintf("/c%d/e%d/s%d", ctrl, eid, slt),
|
||||
"set", "good", "force",
|
||||
)
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func runRAIDVROCCreateMirrorTask(ctx context.Context, j *jobState, devices []string, arrayName string) error {
|
||||
if arrayName == "" {
|
||||
arrayName = "bee-mirror0"
|
||||
}
|
||||
devPath := "/dev/md/" + arrayName
|
||||
args := []string{
|
||||
"--create", devPath,
|
||||
"--level=1",
|
||||
fmt.Sprintf("--raid-devices=%d", len(devices)),
|
||||
"--run",
|
||||
}
|
||||
args = append(args, devices...)
|
||||
j.append(fmt.Sprintf("Creating VROC RAID 1 array %s with: %s", devPath, strings.Join(devices, " ")))
|
||||
cmd := exec.CommandContext(ctx, "mdadm", args...)
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
// raidParseHumanSizeGB converts a storcli size string such as "1.818 TB" or
// "745.211 GB" into a number of gigabytes.
//
// Matching is case-insensitive and looks for the " TB", " GB" and " MB" unit
// markers, in that order, scaling the leading number by 1024, 1 and 1/1024
// respectively. An empty string, a string without one of those markers, or a
// numeric part that does not parse as a float yields 0.
func raidParseHumanSizeGB(s string) float64 {
	text := strings.ToUpper(strings.TrimSpace(s))
	if text == "" {
		return 0
	}

	units := [...]struct {
		marker string  // substring that identifies the unit
		cut    string  // separator; everything before it is the number
		factor float64 // multiplier that converts the number to GB
	}{
		{" TB", " T", 1024},
		{" GB", " G", 1},
		{" MB", " M", 1.0 / 1024},
	}

	for _, u := range units {
		if !strings.Contains(text, u.marker) {
			continue
		}
		number := strings.TrimSpace(strings.SplitN(text, u.cut, 2)[0])
		value, err := strconv.ParseFloat(number, 64)
		if err != nil {
			return 0
		}
		return value * u.factor
	}
	return 0
}
|
||||
|
||||
// --- UI card ---
|
||||
|
||||
func renderRAIDMgmtCard() string {
|
||||
return `<div class="card"><div class="card-head card-head-actions">RAID Controller Management<div class="card-head-buttons"><button class="btn btn-sm btn-secondary" onclick="raidLoad()">↻ Refresh</button></div></div><div class="card-body">
|
||||
<div id="raid-status" style="font-size:13px;color:var(--muted);margin-bottom:8px">Loading...</div>
|
||||
<div id="raid-content"></div>
|
||||
<div id="raid-out-wrap" style="display:none;margin-top:14px">
|
||||
<div style="display:flex;align-items:center;justify-content:space-between;margin-bottom:4px">
|
||||
<span id="raid-out-label" style="font-size:12px;font-weight:600;color:var(--muted)">Output</span>
|
||||
<span id="raid-out-status" style="font-size:12px"></span>
|
||||
</div>
|
||||
<div id="raid-terminal" class="terminal" style="max-height:260px;width:100%;box-sizing:border-box"></div>
|
||||
</div>
|
||||
</div></div>
|
||||
<script>
|
||||
(function(){
|
||||
// escHtml converts s to a string (falsy values become '') and replaces the
// characters &, <, > and " with their HTML entities so the result can be
// concatenated into markup that is assigned to innerHTML. The ampersand is
// replaced first so the entities produced by the later steps are not
// escaped a second time.
function escHtml(s) {
  return String(s||'')
    .replace(/&/g,'&amp;')
    .replace(/</g,'&lt;')
    .replace(/>/g,'&gt;')
    .replace(/"/g,'&quot;');
}
|
||||
|
||||
var _raidControllers = [];
|
||||
|
||||
function raidLoad() {
|
||||
var status = document.getElementById('raid-status');
|
||||
var content = document.getElementById('raid-content');
|
||||
status.textContent = 'Detecting RAID controllers...';
|
||||
status.style.color = 'var(--muted)';
|
||||
content.innerHTML = '';
|
||||
fetch('/api/tools/raid/status', {cache:'no-store'})
|
||||
.then(function(r) {
|
||||
if (!r.ok) return r.json().then(function(e) { throw new Error(e.error || r.statusText); });
|
||||
return r.json();
|
||||
})
|
||||
.then(function(data) {
|
||||
_raidControllers = data.controllers || [];
|
||||
if (_raidControllers.length === 0) {
|
||||
status.textContent = 'No RAID controllers detected.';
|
||||
return;
|
||||
}
|
||||
status.textContent = _raidControllers.length + ' controller(s) detected.';
|
||||
content.innerHTML = _raidControllers.map(function(c, i) {
|
||||
return raidRenderController(c, i);
|
||||
}).join('<hr style="margin:16px 0;border:none;border-top:1px solid var(--border)">');
|
||||
})
|
||||
.catch(function(e) {
|
||||
status.textContent = 'Error: ' + e.message;
|
||||
status.style.color = 'var(--crit-fg)';
|
||||
});
|
||||
}
|
||||
|
||||
function raidRenderController(c, idx) {
|
||||
var html = '';
|
||||
var typeLabel = c.type === 'lsi' ? 'LSI / Broadcom' : 'Intel VROC';
|
||||
html += '<div style="font-weight:600;font-size:13px;margin-bottom:10px">' + typeLabel + ' — ' + escHtml(c.model) + '</div>';
|
||||
|
||||
if (c.type === 'lsi') {
|
||||
var foreign = c.foreign_drives || [];
|
||||
if (foreign.length > 0) {
|
||||
html += '<div style="background:var(--warn-bg,rgba(240,192,0,0.1));border:1px solid var(--warn-border,#c8a800);border-radius:4px;padding:10px 12px;margin-bottom:12px">';
|
||||
html += '<div style="font-weight:600;font-size:13px;margin-bottom:6px">⚠︎ Foreign Configuration Detected (' + foreign.length + ' drive(s))</div>';
|
||||
html += '<table style="margin-bottom:10px"><tr><th>Slot</th><th>Model</th><th>Size</th><th>State</th></tr>';
|
||||
foreign.forEach(function(d) {
|
||||
html += '<tr>'
|
||||
+ '<td style="font-family:monospace">' + escHtml(d.slot) + '</td>'
|
||||
+ '<td>' + escHtml(d.model||'—') + '</td>'
|
||||
+ '<td>' + (d.size_gb > 0 ? Math.round(d.size_gb) + ' GB' : '—') + '</td>'
|
||||
+ '<td><span class="badge badge-warn">' + escHtml(d.state) + '</span></td>'
|
||||
+ '</tr>';
|
||||
});
|
||||
html += '</table>';
|
||||
html += '<div style="display:flex;gap:8px;flex-wrap:wrap">';
|
||||
html += '<button class="btn btn-sm btn-primary" onclick="raidForeignAction(\'' + escHtml(c.id) + '\',\'import\',this)">Import Foreign Config</button>';
|
||||
html += '<button class="btn btn-sm btn-secondary" style="color:var(--crit-fg)" onclick="raidForeignAction(\'' + escHtml(c.id) + '\',\'clear\',this)">Clear Foreign Config</button>';
|
||||
html += '</div></div>';
|
||||
}
|
||||
|
||||
html += raidRenderAllDrives(c, idx);
|
||||
html += raidRenderMirrorSection(c, idx, 'lsi');
|
||||
}
|
||||
|
||||
if (c.type === 'vroc') {
|
||||
var arrays = c.arrays || [];
|
||||
if (arrays.length > 0) {
|
||||
html += '<div style="font-size:12px;font-weight:600;color:var(--muted);margin-bottom:6px;text-transform:uppercase;letter-spacing:.04em">Active Arrays</div>';
|
||||
html += '<table style="margin-bottom:14px"><tr><th>Name</th><th>Level</th><th>Members</th><th>Status</th></tr>';
|
||||
arrays.forEach(function(a) {
|
||||
var badge = a.degraded
|
||||
? '<span class="badge badge-err">Degraded</span>'
|
||||
: '<span class="badge badge-ok">OK</span>';
|
||||
html += '<tr>'
|
||||
+ '<td style="font-family:monospace">' + escHtml(a.name) + '</td>'
|
||||
+ '<td>' + escHtml(a.level||'—') + '</td>'
|
||||
+ '<td style="font-family:monospace;font-size:12px">' + (a.members||[]).map(escHtml).join(', ') + '</td>'
|
||||
+ '<td>' + badge + '</td>'
|
||||
+ '</tr>';
|
||||
});
|
||||
html += '</table>';
|
||||
}
|
||||
|
||||
html += raidRenderAllDrives(c, idx);
|
||||
html += raidRenderMirrorSection(c, idx, 'vroc');
|
||||
}
|
||||
|
||||
return html;
|
||||
}
|
||||
|
||||
var RAID_READY_STATES = {'UGood': true, 'JBOD': true, 'available': true};
|
||||
var RAID_NO_PREPARE_STATES = {'UGood': true, 'JBOD': true, 'Frgn': true, 'Onln': true, 'Msng': true};
|
||||
|
||||
function raidRenderAllDrives(c, idx) {
|
||||
var drives = c.all_drives || [];
|
||||
var isLSI = c.type === 'lsi';
|
||||
if (drives.length === 0) {
|
||||
return '<p style="font-size:13px;color:var(--muted);margin-bottom:12px">No drives detected on this controller.</p>';
|
||||
}
|
||||
var html = '<div style="font-size:12px;font-weight:600;color:var(--muted);margin-bottom:6px;text-transform:uppercase;letter-spacing:.04em">All Drives on This Controller</div>';
|
||||
html += '<table style="margin-bottom:14px"><tr><th>' + (isLSI ? 'Slot' : 'Device') + '</th><th>Model</th><th>Size</th><th>State</th>' + (isLSI ? '<th></th>' : '') + '</tr>';
|
||||
drives.forEach(function(d) {
|
||||
var ready = !!RAID_READY_STATES[d.state];
|
||||
var badgeClass = ready ? 'badge-ok' : 'badge-warn';
|
||||
var actionCell = '';
|
||||
if (isLSI && !RAID_NO_PREPARE_STATES[d.state]) {
|
||||
actionCell = '<td><button class="btn btn-sm btn-secondary" onclick="raidPrepareDrive(\'' + escHtml(c.id) + '\',\'' + escHtml(d.slot) + '\',this)">Prepare</button></td>';
|
||||
} else if (isLSI) {
|
||||
actionCell = '<td></td>';
|
||||
}
|
||||
html += '<tr>'
|
||||
+ '<td style="font-family:monospace">' + escHtml(isLSI ? d.slot : d.device) + '</td>'
|
||||
+ '<td>' + escHtml(d.model||'—') + (d.serial ? ' [' + escHtml(d.serial) + ']' : '') + '</td>'
|
||||
+ '<td>' + (d.size_gb > 0 ? Math.round(d.size_gb) + ' GB' : '—') + '</td>'
|
||||
+ '<td><span class="badge ' + badgeClass + '">' + escHtml(d.state||'—') + '</span></td>'
|
||||
+ actionCell
|
||||
+ '</tr>';
|
||||
});
|
||||
html += '</table>';
|
||||
return html;
|
||||
}
|
||||
|
||||
function raidPrepareDrive(ctrlID, slot, btn) {
|
||||
if (!confirm('Prepare drive ' + slot + ' on ' + ctrlID + ' for array creation?\n\nThis forces the drive into Unconfigured Good state. If it currently belongs to a virtual drive or holds data, that data will become inaccessible.')) {
|
||||
return;
|
||||
}
|
||||
var original = btn ? btn.textContent : '';
|
||||
if (btn) { btn.disabled = true; btn.textContent = 'Preparing...'; }
|
||||
raidShowOutput('Prepare drive ' + slot, '', '');
|
||||
fetch('/api/tools/raid/prepare-drive', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type': 'application/json'},
|
||||
body: JSON.stringify({controller_id: ctrlID, slot: slot})
|
||||
})
|
||||
.then(function(r) { return r.json(); })
|
||||
.then(function(d) {
|
||||
if (d.error) throw new Error(d.error);
|
||||
raidStreamTask(d.task_id, 'Prepare drive ' + slot, function() {
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
raidLoad();
|
||||
});
|
||||
})
|
||||
.catch(function(e) {
|
||||
raidShowOutput('Error', 'failed', e.message);
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
});
|
||||
}
|
||||
|
||||
function raidRenderMirrorSection(c, idx, kind) {
|
||||
var free = c.free_drives || [];
|
||||
var html = '<div style="font-size:12px;font-weight:600;color:var(--muted);margin-bottom:6px;text-transform:uppercase;letter-spacing:.04em">Create RAID 1 Mirror</div>';
|
||||
|
||||
if (free.length < 2) {
|
||||
html += '<p style="font-size:13px;color:var(--muted)">No unconfigured drives available (need at least 2).</p>';
|
||||
return html;
|
||||
}
|
||||
|
||||
html += '<p style="font-size:13px;color:var(--muted);margin-bottom:8px">Select exactly 2 drives:</p>';
|
||||
html += '<div>';
|
||||
free.forEach(function(d) {
|
||||
var val = kind === 'lsi' ? d.slot : d.device;
|
||||
var label = kind === 'lsi'
|
||||
? escHtml(d.slot) + (d.model ? ' — ' + escHtml(d.model) : '') + (d.size_gb > 0 ? ' (' + Math.round(d.size_gb) + ' GB)' : '')
|
||||
: escHtml(d.device) + (d.model ? ' — ' + escHtml(d.model) : '') + (d.serial ? ' [' + escHtml(d.serial) + ']' : '');
|
||||
html += '<label style="display:block;margin-bottom:4px;font-size:13px;cursor:pointer">'
|
||||
+ '<input type="checkbox" class="raid-mirror-check-' + idx + '" value="' + escHtml(val) + '"> '
|
||||
+ label + '</label>';
|
||||
});
|
||||
html += '</div>';
|
||||
|
||||
if (kind === 'vroc') {
|
||||
html += '<div style="margin-top:10px;display:flex;align-items:center;gap:8px;flex-wrap:wrap">'
|
||||
+ '<label style="font-size:13px">Array name: <input type="text" id="vroc-arrayname-' + idx + '" value="bee-mirror0" style="font-family:monospace;padding:2px 6px;width:140px"></label>';
|
||||
} else {
|
||||
html += '<div style="margin-top:10px;display:flex;gap:8px">';
|
||||
}
|
||||
|
||||
html += '<button class="btn btn-sm btn-primary raid-mirror-btn-' + idx + '" onclick="raidCreateMirror(\'' + escHtml(c.id) + '\',' + idx + ',\'' + kind + '\',this)">Create Mirror</button>';
|
||||
html += '</div>';
|
||||
|
||||
return html;
|
||||
}
|
||||
|
||||
function raidForeignAction(ctrlID, action, btn) {
|
||||
if (action === 'clear' && !confirm('Clear foreign configuration on ' + ctrlID + '?\n\nThis will DELETE the foreign RAID metadata. Data on those drives may become inaccessible.')) {
|
||||
return;
|
||||
}
|
||||
var original = btn ? btn.textContent : '';
|
||||
if (btn) { btn.disabled = true; btn.textContent = action === 'import' ? 'Importing...' : 'Clearing...'; }
|
||||
raidShowOutput('RAID foreign ' + action, '', '');
|
||||
fetch('/api/tools/raid/foreign', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type': 'application/json'},
|
||||
body: JSON.stringify({controller_id: ctrlID, action: action})
|
||||
})
|
||||
.then(function(r) { return r.json(); })
|
||||
.then(function(d) {
|
||||
if (d.error) throw new Error(d.error);
|
||||
var actionLabel = action === 'import' ? 'Import foreign config' : 'Clear foreign config';
|
||||
raidStreamTask(d.task_id, actionLabel, function() {
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
raidLoad();
|
||||
});
|
||||
})
|
||||
.catch(function(e) {
|
||||
raidShowOutput('Error', 'failed', e.message);
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
});
|
||||
}
|
||||
|
||||
function raidCreateMirror(ctrlID, idx, kind, btn) {
|
||||
var checks = document.querySelectorAll('.raid-mirror-check-' + idx + ':checked');
|
||||
if (checks.length !== 2) {
|
||||
alert('Select exactly 2 drives.');
|
||||
return;
|
||||
}
|
||||
var devices = Array.from(checks).map(function(c) { return c.value; });
|
||||
var arrayName = '';
|
||||
if (kind === 'vroc') {
|
||||
var nameEl = document.getElementById('vroc-arrayname-' + idx);
|
||||
arrayName = nameEl ? nameEl.value.trim() : 'bee-mirror0';
|
||||
if (!arrayName) arrayName = 'bee-mirror0';
|
||||
}
|
||||
var original = btn ? btn.textContent : '';
|
||||
if (btn) { btn.disabled = true; btn.textContent = 'Creating...'; }
|
||||
raidShowOutput('Create RAID 1', '', '');
|
||||
fetch('/api/tools/raid/create-mirror', {
|
||||
method: 'POST',
|
||||
headers: {'Content-Type': 'application/json'},
|
||||
body: JSON.stringify({controller_id: ctrlID, devices: devices, array_name: arrayName})
|
||||
})
|
||||
.then(function(r) { return r.json(); })
|
||||
.then(function(d) {
|
||||
if (d.error) throw new Error(d.error);
|
||||
raidStreamTask(d.task_id, 'Create RAID 1 mirror', function() {
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
raidLoad();
|
||||
});
|
||||
})
|
||||
.catch(function(e) {
|
||||
raidShowOutput('Error', 'failed', e.message);
|
||||
if (btn) { btn.disabled = false; btn.textContent = original; }
|
||||
});
|
||||
}
|
||||
|
||||
function raidShowOutput(label, status, text) {
|
||||
var wrap = document.getElementById('raid-out-wrap');
|
||||
var labelEl = document.getElementById('raid-out-label');
|
||||
var statusEl = document.getElementById('raid-out-status');
|
||||
var term = document.getElementById('raid-terminal');
|
||||
wrap.style.display = 'block';
|
||||
labelEl.textContent = label;
|
||||
if (status === 'ok') {
|
||||
statusEl.textContent = '✓ done';
|
||||
statusEl.style.color = 'var(--ok-fg)';
|
||||
} else if (status === 'failed') {
|
||||
statusEl.textContent = '✗ failed';
|
||||
statusEl.style.color = 'var(--crit-fg)';
|
||||
} else {
|
||||
statusEl.textContent = status;
|
||||
statusEl.style.color = 'var(--muted)';
|
||||
}
|
||||
if (text !== undefined) {
|
||||
term.textContent = text;
|
||||
term.scrollTop = term.scrollHeight;
|
||||
}
|
||||
}
|
||||
|
||||
function raidStreamTask(taskID, taskName, onDone) {
|
||||
var term = document.getElementById('raid-terminal');
|
||||
term.textContent = '';
|
||||
raidShowOutput(taskName || 'Running…', 'running…', undefined);
|
||||
var es = new EventSource('/api/tasks/' + taskID + '/stream');
|
||||
es.onmessage = function(e) {
|
||||
term.textContent += e.data + '\n';
|
||||
term.scrollTop = term.scrollHeight;
|
||||
};
|
||||
es.addEventListener('done', function(e) {
|
||||
es.close();
|
||||
if (!e.data) {
|
||||
raidShowOutput(taskName, 'ok', undefined);
|
||||
} else {
|
||||
raidShowOutput(taskName, 'failed', undefined);
|
||||
term.textContent += '\nFailed: ' + e.data;
|
||||
term.scrollTop = term.scrollHeight;
|
||||
}
|
||||
if (onDone) onDone();
|
||||
});
|
||||
es.onerror = function() {
|
||||
es.close();
|
||||
raidShowOutput(taskName, 'failed', undefined);
|
||||
if (onDone) onDone();
|
||||
};
|
||||
}
|
||||
|
||||
window.raidLoad = raidLoad;
|
||||
window.raidForeignAction = raidForeignAction;
|
||||
window.raidCreateMirror = raidCreateMirror;
|
||||
window.raidPrepareDrive = raidPrepareDrive;
|
||||
raidLoad();
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
+13
-100
@@ -28,10 +28,12 @@ var (
|
||||
shnRE = regexp.MustCompile(`^[A-Za-z0-9_]{1,16}$`)
|
||||
dmiSectionRE = regexp.MustCompile(`^\[(.+?)\]$`)
|
||||
// Item Name {SHN} = value // comment
|
||||
dmiItemRE = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9]{1,16})\}\s*=\s*(.*)$`)
|
||||
// SHN may contain parentheses, e.g. {PS(4)LC} for power supply fields
|
||||
dmiItemRE = regexp.MustCompile(`^(.+?)\s+\{([A-Za-z0-9_()\-]{1,24})\}\s*=\s*(.*)$`)
|
||||
dmiVersionRE = regexp.MustCompile(`(?i)^version\s*=`)
|
||||
)
|
||||
|
||||
|
||||
// parseDMIFile parses the DMI.txt produced by "saa GetDmiInfo".
|
||||
// Real format (from SAA User Guide 4.8.1):
|
||||
//
|
||||
@@ -90,7 +92,9 @@ func (h *handler) handleAPISAADMIRead(w http.ResponseWriter, r *http.Request) {
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
dmiFile := filepath.Join(tmpDir, "DMI.txt")
|
||||
out, err := exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite").CombinedOutput()
|
||||
cmd := exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite")
|
||||
cmd.Dir = "/usr/local/bin"
|
||||
out, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
msg := strings.TrimSpace(string(out))
|
||||
if msg == "" {
|
||||
@@ -167,7 +171,9 @@ func runSAADMIWriteTask(ctx context.Context, j *jobState, exportDir string, p ta
|
||||
dmiFile := filepath.Join(tmpDir, "DMI.txt")
|
||||
|
||||
j.append("Reading current DMI configuration...")
|
||||
if err := streamCmdJob(j, exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite")); err != nil {
|
||||
getCmd := exec.CommandContext(ctx, "saa", "-c", "GetDmiInfo", "--file", dmiFile, "--overwrite")
|
||||
getCmd.Dir = "/usr/local/bin"
|
||||
if err := streamCmdJob(j, getCmd); err != nil {
|
||||
return fmt.Errorf("GetDmiInfo: %w", err)
|
||||
}
|
||||
|
||||
@@ -189,13 +195,16 @@ func runSAADMIWriteTask(ctx context.Context, j *jobState, exportDir string, p ta
|
||||
for _, c := range p.SAADmiChanges {
|
||||
j.append("Setting " + c.Shn + " = " + c.Value)
|
||||
cmd := exec.CommandContext(ctx, "saa", "-c", "EditDmiInfo", "--file", dmiFile, "--shn", c.Shn, "--value", c.Value)
|
||||
cmd.Dir = "/usr/local/bin"
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("EditDmiInfo %s: %w", c.Shn, err)
|
||||
}
|
||||
}
|
||||
|
||||
j.append("Applying changes to hardware...")
|
||||
if err := streamCmdJob(j, exec.CommandContext(ctx, "saa", "-c", "ChangeDmiInfo", "--file", dmiFile)); err != nil {
|
||||
changeCmd := exec.CommandContext(ctx, "saa", "-c", "ChangeDmiInfo", "--file", dmiFile)
|
||||
changeCmd.Dir = "/usr/local/bin"
|
||||
if err := streamCmdJob(j, changeCmd); err != nil {
|
||||
return fmt.Errorf("ChangeDmiInfo: %w", err)
|
||||
}
|
||||
|
||||
@@ -203,99 +212,3 @@ func runSAADMIWriteTask(ctx context.Context, j *jobState, exportDir string, p ta
|
||||
return nil
|
||||
}
|
||||
|
||||
func renderSAADMICard() string {
|
||||
return `<div class="card"><div class="card-head">SAA — DMI <button class="btn btn-sm btn-secondary" onclick="saaDMIRead()" style="margin-left:auto">Read</button></div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits DMI fields via SAA (In-Band). Requires <code>saa</code> on PATH.</p>
|
||||
<div id="saa-dmi-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
|
||||
<div id="saa-dmi-table"></div>
|
||||
<div id="saa-dmi-save-row" style="display:none;margin-top:12px">
|
||||
<button class="btn btn-primary" id="saa-dmi-save-btn" onclick="saaDMISave()">Save</button>
|
||||
<span id="saa-dmi-save-msg" style="font-size:13px;color:var(--muted);margin-left:10px"></span>
|
||||
</div>
|
||||
<script>
|
||||
// saaDMIEsc converts s to a string (null and undefined become '') and replaces
// the characters &, <, >, " and ' with their HTML entities, so the result can
// be concatenated into element content or a quoted attribute value.
function saaDMIEsc(s) {
  var entities = {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'};
  return String(s==null?'':s).replace(/[&<>"']/g,function(c){return entities[c];});
}
|
||||
function saaDMIUpdateSaveBtn() {
|
||||
var inputs = document.querySelectorAll('#saa-dmi-table input[data-original]');
|
||||
var dirty = [];
|
||||
inputs.forEach(function(inp){if(inp.value!==inp.dataset.original)dirty.push(inp);});
|
||||
var row = document.getElementById('saa-dmi-save-row');
|
||||
var btn = document.getElementById('saa-dmi-save-btn');
|
||||
if(dirty.length>0){row.style.display='';btn.textContent='Save ('+dirty.length+' changed)';}
|
||||
else{row.style.display='none';}
|
||||
}
|
||||
function saaDMIRead() {
|
||||
var status = document.getElementById('saa-dmi-status');
|
||||
var table = document.getElementById('saa-dmi-table');
|
||||
var saveRow = document.getElementById('saa-dmi-save-row');
|
||||
status.textContent = 'Reading...';
|
||||
status.style.color = 'var(--muted)';
|
||||
table.innerHTML = '';
|
||||
saveRow.style.display = 'none';
|
||||
fetch('/api/tools/saa-dmi').then(function(r){return r.json().then(function(d){if(!r.ok)throw new Error(d.error||('HTTP '+r.status));return d;});}).then(function(fields){
|
||||
status.textContent = fields.length+' field(s) loaded.';
|
||||
var rows = fields.map(function(f){
|
||||
return '<tr>'
|
||||
+'<td style="font-size:13px;white-space:nowrap;padding-right:8px">'+saaDMIEsc(f.name)+'</td>'
|
||||
+'<td style="font-family:monospace;font-size:13px;white-space:nowrap;padding-right:8px">'+saaDMIEsc(f.shn)+'</td>'
|
||||
+'<td><input type="text" value="'+saaDMIEsc(f.value)+'" data-shn="'+saaDMIEsc(f.shn)+'" data-original="'+saaDMIEsc(f.value)+'" oninput="saaDMIMarkDirty(this)" style="width:100%;font-family:monospace;font-size:13px;border:1px solid var(--line);padding:3px 6px;border-radius:3px"></td>'
|
||||
+'<td id="saa-dmi-dirty-'+saaDMIEsc(f.shn)+'" style="font-size:12px;color:var(--warn,#b45309);width:50px;padding-left:6px"></td>'
|
||||
+'</tr>';
|
||||
}).join('');
|
||||
table.innerHTML = '<table style="width:100%;border-collapse:collapse"><tr><th style="text-align:left;font-size:13px;padding-bottom:6px">Field</th><th style="text-align:left;font-size:13px;padding-bottom:6px">Shn</th><th style="text-align:left;font-size:13px;padding-bottom:6px">Value</th><th></th></tr>'+rows+'</table>';
|
||||
}).catch(function(e){
|
||||
status.textContent = 'Error: '+e.message;
|
||||
status.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
});
|
||||
}
|
||||
function saaDMIMarkDirty(inp) {
|
||||
var shn = inp.dataset.shn;
|
||||
var cell = document.getElementById('saa-dmi-dirty-'+shn);
|
||||
if(cell)cell.textContent = inp.value!==inp.dataset.original?'changed':'';
|
||||
saaDMIUpdateSaveBtn();
|
||||
}
|
||||
function saaDMIWaitTask(taskID) {
|
||||
var msg = document.getElementById('saa-dmi-save-msg');
|
||||
msg.textContent = 'Task '+taskID+' queued...';
|
||||
msg.style.color = 'var(--muted)';
|
||||
var timer = setInterval(function(){
|
||||
fetch('/api/tasks').then(function(r){return r.json();}).then(function(tasks){
|
||||
var task = (tasks||[]).find(function(t){return t.id===taskID;});
|
||||
if(!task)return;
|
||||
if(task.status==='done'||task.status==='failed'||task.status==='cancelled'){
|
||||
clearInterval(timer);
|
||||
msg.textContent = task.status==='done'?'Saved. Reboot to apply.':'Failed: '+(task.error||task.status);
|
||||
msg.style.color = task.status==='done'?'var(--ok,green)':'var(--crit-fg,#9f3a38)';
|
||||
document.getElementById('saa-dmi-save-btn').disabled = false;
|
||||
}
|
||||
}).catch(function(){});
|
||||
}, 1500);
|
||||
}
|
||||
function saaDMISave() {
|
||||
var inputs = document.querySelectorAll('#saa-dmi-table input[data-original]');
|
||||
var changes = [];
|
||||
inputs.forEach(function(inp){if(inp.value!==inp.dataset.original)changes.push({shn:inp.dataset.shn,value:inp.value});});
|
||||
if(!changes.length)return;
|
||||
var names = changes.map(function(c){return c.shn;}).join(', ');
|
||||
if(!window.confirm('Apply DMI changes for: '+names+'?\n\nThe server will need to be rebooted for changes to take effect.'))return;
|
||||
var btn = document.getElementById('saa-dmi-save-btn');
|
||||
var msg = document.getElementById('saa-dmi-save-msg');
|
||||
btn.disabled = true;
|
||||
msg.textContent = 'Submitting...';
|
||||
msg.style.color = 'var(--muted)';
|
||||
fetch('/api/tools/saa-dmi/write',{
|
||||
method:'POST',
|
||||
headers:{'Content-Type':'application/json'},
|
||||
body:JSON.stringify({changes:changes})
|
||||
}).then(function(r){return r.json().then(function(d){if(!r.ok)throw new Error(d.error||('HTTP '+r.status));return d;});}).then(function(d){
|
||||
saaDMIWaitTask(d.task_id);
|
||||
}).catch(function(e){
|
||||
msg.textContent = 'Error: '+e.message;
|
||||
msg.style.color = 'var(--crit-fg,#9f3a38)';
|
||||
btn.disabled = false;
|
||||
});
|
||||
}
|
||||
</script>
|
||||
</div></div>`
|
||||
}
|
||||
|
||||
@@ -264,6 +264,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/sat/nvidia-stress/run", h.handleAPISATRun("nvidia-stress"))
|
||||
mux.HandleFunc("POST /api/sat/memory/run", h.handleAPISATRun("memory"))
|
||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||
mux.HandleFunc("POST /api/sat/confidential-computing/run", h.handleAPISATRun("confidential-computing"))
|
||||
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
||||
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
|
||||
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
|
||||
@@ -316,6 +317,14 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/tools/nvme-format/run", h.handleAPINVMeFormatRun)
|
||||
mux.HandleFunc("GET /api/tools/saa-dmi", h.handleAPISAADMIRead)
|
||||
mux.HandleFunc("POST /api/tools/saa-dmi/write", h.handleAPISAADMIWrite)
|
||||
mux.HandleFunc("GET /api/tools/ipmi-fru", h.handleAPIIPMIFRURead)
|
||||
mux.HandleFunc("POST /api/tools/ipmi-fru/write", h.handleAPIIPMIFRUWrite)
|
||||
mux.HandleFunc("GET /api/tools/huawei-elabel", h.handleAPIHuaweiElabelRead)
|
||||
mux.HandleFunc("POST /api/tools/huawei-elabel/write", h.handleAPIHuaweiElabelWrite)
|
||||
mux.HandleFunc("GET /api/tools/raid/status", h.handleAPIRAIDStatus)
|
||||
mux.HandleFunc("POST /api/tools/raid/foreign", h.handleAPIRAIDForeignAction)
|
||||
mux.HandleFunc("POST /api/tools/raid/create-mirror", h.handleAPIRAIDCreateMirror)
|
||||
mux.HandleFunc("POST /api/tools/raid/prepare-drive", h.handleAPIRAIDPrepareDrive)
|
||||
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
@@ -327,6 +336,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// System
|
||||
mux.HandleFunc("GET /api/system/ram-status", h.handleAPIRAMStatus)
|
||||
mux.HandleFunc("POST /api/system/install-to-ram", h.handleAPIInstallToRAM)
|
||||
mux.HandleFunc("POST /api/system/reboot", h.handleAPISystemReboot)
|
||||
mux.HandleFunc("POST /api/system/shutdown", h.handleAPISystemShutdown)
|
||||
|
||||
// Preflight
|
||||
mux.HandleFunc("GET /api/preflight", h.handleAPIPreflight)
|
||||
@@ -1424,13 +1435,13 @@ func (h *handler) handlePage(w http.ResponseWriter, r *http.Request) {
|
||||
// Redirect legacy routes to new named pages
|
||||
switch page {
|
||||
case "validate", "tests":
|
||||
http.Redirect(w, r, "/check", http.StatusMovedPermanently)
|
||||
return
|
||||
case "burn", "burn-in":
|
||||
http.Redirect(w, r, "/load", http.StatusMovedPermanently)
|
||||
return
|
||||
case "benchmark":
|
||||
http.Redirect(w, r, "/speed", http.StatusMovedPermanently)
|
||||
case "burn-in":
|
||||
http.Redirect(w, r, "/burn", http.StatusMovedPermanently)
|
||||
return
|
||||
case "speed", "endurance":
|
||||
http.Redirect(w, r, "/benchmark", http.StatusMovedPermanently)
|
||||
return
|
||||
}
|
||||
body := renderPage(page, h.opts)
|
||||
|
||||
@@ -666,54 +666,64 @@ func TestTasksPageRendersOpenLinksAndPaginationControls(t *testing.T) {
|
||||
|
||||
func TestToolsPageRendersNvidiaSelfHealSection(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
|
||||
// /tools: only NVMe Block Format and Supermicro DMI remain
|
||||
recTools := httptest.NewRecorder()
|
||||
handler.ServeHTTP(recTools, httptest.NewRequest(http.MethodGet, "/tools", nil))
|
||||
if recTools.Code != http.StatusOK {
|
||||
t.Fatalf("tools status=%d", recTools.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
if !strings.Contains(body, `NVIDIA Self Heal`) {
|
||||
t.Fatalf("tools page missing nvidia self heal section: %s", body)
|
||||
toolsBody := recTools.Body.String()
|
||||
if !strings.Contains(toolsBody, `NVMe Block Format`) {
|
||||
t.Fatalf("tools page missing nvme block format section: %s", toolsBody)
|
||||
}
|
||||
if !strings.Contains(body, `Restart GPU Drivers`) {
|
||||
t.Fatalf("tools page missing restart gpu drivers button: %s", body)
|
||||
if !strings.Contains(toolsBody, `/api/tools/nvme-formats`) || !strings.Contains(toolsBody, `/api/tools/nvme-format/run`) {
|
||||
t.Fatalf("tools page missing nvme format api usage: %s", toolsBody)
|
||||
}
|
||||
if !strings.Contains(body, `nvidiaRestartDrivers()`) {
|
||||
t.Fatalf("tools page missing nvidiaRestartDrivers action: %s", body)
|
||||
|
||||
// /settings: system install, support bundle, tool check, nvidia self heal, network, services
|
||||
recSettings := httptest.NewRecorder()
|
||||
handler.ServeHTTP(recSettings, httptest.NewRequest(http.MethodGet, "/settings", nil))
|
||||
if recSettings.Code != http.StatusOK {
|
||||
t.Fatalf("settings status=%d", recSettings.Code)
|
||||
}
|
||||
if !strings.Contains(body, `/api/gpu/nvidia-status`) {
|
||||
t.Fatalf("tools page missing nvidia status api usage: %s", body)
|
||||
settingsBody := recSettings.Body.String()
|
||||
if !strings.Contains(settingsBody, `NVIDIA Self Heal`) {
|
||||
t.Fatalf("settings page missing nvidia self heal section: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `nvidiaResetGPU(`) {
|
||||
t.Fatalf("tools page missing nvidiaResetGPU action: %s", body)
|
||||
if !strings.Contains(settingsBody, `Restart GPU Drivers`) {
|
||||
t.Fatalf("settings page missing restart gpu drivers button: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `id="boot-source-text"`) {
|
||||
t.Fatalf("tools page missing boot source field: %s", body)
|
||||
if !strings.Contains(settingsBody, `nvidiaRestartDrivers()`) {
|
||||
t.Fatalf("settings page missing nvidiaRestartDrivers action: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `USB Black-Box`) {
|
||||
t.Fatalf("tools page missing usb black-box section: %s", body)
|
||||
if !strings.Contains(settingsBody, `/api/gpu/nvidia-status`) {
|
||||
t.Fatalf("settings page missing nvidia status api usage: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `/api/blackbox/status`) {
|
||||
t.Fatalf("tools page missing black-box status api usage: %s", body)
|
||||
if !strings.Contains(settingsBody, `nvidiaResetGPU(`) {
|
||||
t.Fatalf("settings page missing nvidiaResetGPU action: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `NVMe Block Format`) {
|
||||
t.Fatalf("tools page missing nvme block format section: %s", body)
|
||||
if !strings.Contains(settingsBody, `id="boot-source-text"`) {
|
||||
t.Fatalf("settings page missing boot source field: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(body, `/api/tools/nvme-formats`) || !strings.Contains(body, `/api/tools/nvme-format/run`) {
|
||||
t.Fatalf("tools page missing nvme format api usage: %s", body)
|
||||
if !strings.Contains(settingsBody, `USB Black-Box`) {
|
||||
t.Fatalf("settings page missing usb black-box section: %s", settingsBody)
|
||||
}
|
||||
if !strings.Contains(settingsBody, `/api/blackbox/status`) {
|
||||
t.Fatalf("settings page missing black-box status api usage: %s", settingsBody)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBenchmarkPageRendersGPUSelectionControls(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/speed", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
`href="/speed"`,
|
||||
`href="/benchmark"`,
|
||||
`id="benchmark-gpu-list"`,
|
||||
`/api/gpu/nvidia`,
|
||||
`/api/bee-bench/nvidia/perf/run`,
|
||||
@@ -769,7 +779,7 @@ func TestBenchmarkPageRendersSavedResultsTable(t *testing.T) {
|
||||
|
||||
handler := NewHandler(HandlerOptions{ExportDir: exportDir})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/speed", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/benchmark", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
@@ -834,10 +844,10 @@ func TestCheckPageRendersNvidiaFabricCards(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestLoadPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
func TestBurnPageRendersGoalBasedNVIDIACards(t *testing.T) {
|
||||
handler := NewHandler(HandlerOptions{})
|
||||
rec := httptest.NewRecorder()
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/load", nil))
|
||||
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/burn", nil))
|
||||
if rec.Code != http.StatusOK {
|
||||
t.Fatalf("status=%d", rec.Code)
|
||||
}
|
||||
@@ -1217,7 +1227,8 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
],
|
||||
"services":[
|
||||
{"name":"bee-web","status":"active"},
|
||||
{"name":"bee-nvidia","status":"inactive"}
|
||||
{"name":"bee-audit","status":"inactive"},
|
||||
{"name":"bee-nvidia","status":"failed"}
|
||||
]
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
|
||||
@@ -1271,7 +1282,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
`Bee Services`,
|
||||
`CUDA runtime is not ready for GPU SAT.`,
|
||||
`Missing: nvidia-smi`,
|
||||
`bee-nvidia=inactive`,
|
||||
`bee-nvidia=failed`,
|
||||
// Hardware Summary card — component health badges
|
||||
`Hardware Summary`,
|
||||
`>CPU<`,
|
||||
|
||||
@@ -232,6 +232,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
||||
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
|
||||
b.WriteString(powerCard)
|
||||
}
|
||||
if report.Target == "storage" {
|
||||
b.WriteString(renderStorageDiskReportCards(logText))
|
||||
}
|
||||
|
||||
if len(report.Charts) > 0 {
|
||||
for _, chart := range report.Charts {
|
||||
@@ -369,3 +372,60 @@ func formatTaskDuration(sec int) string {
|
||||
}
|
||||
return fmt.Sprintf("%dh %02dm %02ds", sec/3600, (sec%3600)/60, sec%60)
|
||||
}
|
||||
|
||||
// renderStorageDiskReportCards reads disk-*-report.txt files from the storage
|
||||
// SAT run directory and renders one card per disk.
|
||||
func renderStorageDiskReportCards(logText string) string {
|
||||
runDir := taskStorageRunDirFromLog(logText)
|
||||
if runDir == "" {
|
||||
return ""
|
||||
}
|
||||
entries, err := os.ReadDir(runDir)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
var cards []string
|
||||
for _, entry := range entries {
|
||||
name := entry.Name()
|
||||
if !strings.HasPrefix(name, "disk-") || !strings.HasSuffix(name, "-report.txt") {
|
||||
continue
|
||||
}
|
||||
data, err := os.ReadFile(filepath.Join(runDir, name))
|
||||
if err != nil || len(data) == 0 {
|
||||
continue
|
||||
}
|
||||
// Extract disk label from filename: "disk-01-nvme0n1-report.txt" → "Disk 01 — nvme0n1"
|
||||
stem := strings.TrimPrefix(strings.TrimSuffix(name, "-report.txt"), "disk-")
|
||||
// stem is like "01-nvme0n1"
|
||||
parts := strings.SplitN(stem, "-", 2)
|
||||
title := "Disk " + stem
|
||||
if len(parts) == 2 {
|
||||
title = "Disk " + parts[0] + " — " + parts[1]
|
||||
}
|
||||
card := `<div class="card">` +
|
||||
`<div class="card-head">` + html.EscapeString(title) + `</div>` +
|
||||
`<div class="card-body" style="padding:0">` +
|
||||
`<pre style="margin:0;padding:16px;font-size:12px;line-height:1.6;overflow-x:auto;white-space:pre">` +
|
||||
html.EscapeString(string(data)) +
|
||||
`</pre></div></div>`
|
||||
cards = append(cards, card)
|
||||
}
|
||||
return strings.Join(cards, "\n")
|
||||
}
|
||||
|
||||
// taskStorageRunDirFromLog scans logText line by line for an entry of the form
// "Archive: /path/to/storage-YYYYMMDD-HHMMSS" and returns the path of the first
// one whose base name contains "storage-" and which does not end in ".tar.gz".
// It returns an empty string when no such line exists.
func taskStorageRunDirFromLog(logText string) string {
	const marker = "Archive:"

	for _, raw := range strings.Split(logText, "\n") {
		trimmed := strings.TrimSpace(raw)
		if !strings.HasPrefix(trimmed, marker) {
			continue
		}
		candidate := strings.TrimSpace(trimmed[len(marker):])
		// Lines that point at the packed .tar.gz archive are not the run directory.
		if strings.HasSuffix(candidate, ".tar.gz") {
			continue
		}
		if !strings.Contains(filepath.Base(candidate), "storage-") {
			continue
		}
		return candidate
	}
	return ""
}
|
||||
|
||||
@@ -272,6 +272,12 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
|
||||
break
|
||||
}
|
||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||
case "confidential-computing":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runConfidentialComputingCheckPackCtx(a, ctx, "", j.append)
|
||||
case "cpu":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
@@ -388,6 +394,40 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
|
||||
break
|
||||
}
|
||||
err = runSAADMIWriteTask(ctx, j, opts.ExportDir, t.params)
|
||||
case "ipmi-fru-write":
|
||||
if len(t.params.FRUChanges) == 0 {
|
||||
err = fmt.Errorf("no changes provided")
|
||||
break
|
||||
}
|
||||
err = runIPMIFRUWriteTask(ctx, j, opts.ExportDir, t.params)
|
||||
case "huawei-elabel-write":
|
||||
if len(t.params.HuaweiElabelChanges) == 0 {
|
||||
err = fmt.Errorf("no changes provided")
|
||||
break
|
||||
}
|
||||
err = runHuaweiElabelWriteTask(ctx, j, t.params)
|
||||
case "raid-foreign-clear":
|
||||
err = runRAIDForeignClearTask(ctx, j, t.params.RAIDController)
|
||||
case "raid-foreign-import":
|
||||
err = runRAIDForeignImportTask(ctx, j, t.params.RAIDController)
|
||||
case "raid-lsi-create-mirror":
|
||||
if len(t.params.RAIDDevices) < 2 {
|
||||
err = fmt.Errorf("at least 2 drives required")
|
||||
break
|
||||
}
|
||||
err = runRAIDLSICreateMirrorTask(ctx, j, t.params.RAIDController, t.params.RAIDDevices)
|
||||
case "raid-lsi-prepare-drive":
|
||||
if strings.TrimSpace(t.params.RAIDSlot) == "" {
|
||||
err = fmt.Errorf("no drive slot provided")
|
||||
break
|
||||
}
|
||||
err = runRAIDPrepareDriveTask(ctx, j, t.params.RAIDController, t.params.RAIDSlot)
|
||||
case "raid-vroc-create-mirror":
|
||||
if len(t.params.RAIDDevices) < 2 {
|
||||
err = fmt.Errorf("at least 2 devices required")
|
||||
break
|
||||
}
|
||||
err = runRAIDVROCCreateMirrorTask(ctx, j, t.params.RAIDDevices, t.params.RAIDArrayName)
|
||||
default:
|
||||
j.append("ERROR: unknown target: " + t.Target)
|
||||
j.finish("unknown target")
|
||||
|
||||
@@ -45,6 +45,7 @@ var taskNames = map[string]string{
|
||||
"nvidia-stress": "NVIDIA GPU Stress",
|
||||
"memory": "Memory SAT",
|
||||
"storage": "Storage SAT",
|
||||
"confidential-computing": "Confidential Computing Check",
|
||||
"cpu": "CPU SAT",
|
||||
"amd": "AMD GPU SAT",
|
||||
"amd-mem": "AMD GPU MEM Integrity",
|
||||
@@ -140,7 +141,13 @@ type taskParams struct {
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
LBAF int `json:"lbaf,omitempty"`
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
SAADmiChanges []saaChange `json:"saa_dmi_changes,omitempty"`
|
||||
SAADmiChanges []saaChange `json:"saa_dmi_changes,omitempty"`
|
||||
FRUChanges []fruChange `json:"fru_changes,omitempty"`
|
||||
HuaweiElabelChanges []huaweiChange `json:"huawei_elabel_changes,omitempty"`
|
||||
RAIDController int `json:"raid_controller,omitempty"`
|
||||
RAIDDevices []string `json:"raid_devices,omitempty"`
|
||||
RAIDArrayName string `json:"raid_array_name,omitempty"`
|
||||
RAIDSlot string `json:"raid_slot,omitempty"`
|
||||
}
|
||||
|
||||
type persistedTask struct {
|
||||
@@ -306,6 +313,9 @@ var (
|
||||
runStorageAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, extended bool, logFunc func(string)) (string, error) {
|
||||
return a.RunStorageAcceptancePackCtx(ctx, baseDir, extended, logFunc)
|
||||
}
|
||||
runConfidentialComputingCheckPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunConfidentialComputingCheckPackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runCPUAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunCPUAcceptancePackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
@@ -1019,6 +1029,12 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
break
|
||||
}
|
||||
archive, err = runStorageAcceptancePackCtx(a, ctx, "", t.params.StressMode, j.append)
|
||||
case "confidential-computing":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
break
|
||||
}
|
||||
archive, err = runConfidentialComputingCheckPackCtx(a, ctx, "", j.append)
|
||||
case "cpu":
|
||||
if a == nil {
|
||||
err = fmt.Errorf("app not configured")
|
||||
|
||||
+1
-1
Submodule bible updated: 1977730d93...d2600f1279
@@ -13,6 +13,7 @@ Generic engineering rules live in `bible/rules/patterns/`.
|
||||
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
||||
| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
|
||||
| `decisions/` | Architectural decision log, including read-only submodule policy |
|
||||
| `proposals/` | RFCs and contract change proposals for Reanimator Core |
|
||||
|
||||
## Validate Test Matrix
|
||||
|
||||
|
||||
@@ -1,5 +1,103 @@
|
||||
# Backlog
|
||||
|
||||
## Сбор SFP-модулей
|
||||
|
||||
**Статус:** не реализовано.
|
||||
|
||||
### Источник данных
|
||||
|
||||
`ethtool -m <iface>` / `ethtool --module-info <iface>` — читает EEPROM SFP/SFP+/QSFP28/QSFP-DD по стандарту MSA (SFF-8472 / SFF-8636).
|
||||
|
||||
Доступные поля из EEPROM:
|
||||
- Идентификатор модуля: `Identifier` (SFP, SFP+, QSFP28, …)
|
||||
- Тип коннектора: `Connector`
|
||||
- Вендор: `Vendor name`, `Vendor OUI`, `Vendor PN`, `Vendor SN`, `Vendor rev`
|
||||
- Оптика: `Wavelength`, `Transceiver type` (10GBase-SR, LR, DAC, …)
|
||||
- Телеметрия DOM (если модуль поддерживает): `Laser tx bias current`, `Transmit avg optical power`, `Receive avg optical power`, `Module temperature`, `Module voltage`
|
||||
- Статус: `Rx power high alarm`, `Tx power low warning`, …
|
||||
|
||||
Для QSFP28 данные повторяются на 4 канала (lane 0–3).
|
||||
|
||||
Инструмент требует root. На bee ISO — доступен (`ethtool` входит в образ).
|
||||
|
||||
### Scope для bee
|
||||
|
||||
1. Собирать список сетевых интерфейсов через `ip -j link show` (только `ether`, без `lo`/VLAN/bond).
|
||||
2. Для каждого интерфейса пробовать `ethtool -m <iface>`. Если модуль отсутствует или не поддерживает EEPROM read — тихо пропускать.
|
||||
3. Связывать интерфейс с PCIe-устройством через `ethtool -i <iface>` → поле `bus-info` (BDF) → сопоставление с `pcie_devices[].slot`.
|
||||
|
||||
### Gap в контракте
|
||||
|
||||
Текущий контракт v2.10 имеет в `pcie_devices[]` скалярные поля:
|
||||
- `sfp_temperature_c`, `sfp_tx_power_dbm`, `sfp_rx_power_dbm`, `sfp_voltage_v`, `sfp_bias_ma`
|
||||
|
||||
Этого **недостаточно**:
|
||||
- Одна NIC-карта может иметь несколько портов — нужен массив, а не скаляр.
|
||||
- Нет полей идентификации модуля (vendor, part_number, serial_number, wavelength, connector).
|
||||
- Нет разбивки по каналам для QSFP28.
|
||||
|
||||
### Предлагаемое расширение контракта
|
||||
|
||||
Добавить в `pcie_devices[]` массив `sfp_modules[]`:
|
||||
|
||||
```json
|
||||
"pcie_devices": [
|
||||
{
|
||||
"slot": "0000:3b:00.0",
|
||||
"device_class": "EthernetController",
|
||||
"model": "ConnectX-6 Dx",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09999",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 36.4,
|
||||
"voltage_v": 3.29,
|
||||
"tx_power_dbm": -1.8,
|
||||
"rx_power_dbm": -2.1,
|
||||
"bias_ma": 7.2
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Поля `sfp_modules[]`:
|
||||
|
||||
| Поле | Тип | Описание |
|
||||
|---|---|---|
|
||||
| `port` | int | Номер порта на NIC (0-based) |
|
||||
| `identifier` | string | `SFP`, `SFP+`, `QSFP28`, `QSFP-DD`, … |
|
||||
| `connector` | string | `LC`, `MPO`, `DAC`, … |
|
||||
| `vendor` | string | Производитель модуля |
|
||||
| `part_number` | string | Партномер |
|
||||
| `serial_number` | string | Серийный номер |
|
||||
| `revision` | string | Ревизия |
|
||||
| `wavelength_nm` | int | Длина волны, нм |
|
||||
| `transceiver_type` | string | `10GBase-SR`, `100GBase-SR4`, `DAC`, … |
|
||||
| `temperature_c` | float | Температура модуля, °C |
|
||||
| `voltage_v` | float | Напряжение, В |
|
||||
| `tx_power_dbm` | float | TX оптическая мощность, dBm |
|
||||
| `rx_power_dbm` | float | RX оптическая мощность, dBm |
|
||||
| `bias_ma` | float | Bias current, мА |
|
||||
|
||||
Старые скалярные поля `sfp_temperature_c` / `sfp_tx_power_dbm` / `sfp_rx_power_dbm` / `sfp_voltage_v` / `sfp_bias_ma` на уровне `pcie_devices[]` — **вывести из контракта** (deprecated), заменить на `sfp_modules[]`.
|
||||
|
||||
### Порядок реализации
|
||||
|
||||
1. Согласовать расширение контракта с Reanimator Core (bump до v2.11).
|
||||
2. Добавить `ethtool` parser в `audit/internal/collector/` — новый файл `sfp.go`.
|
||||
3. Дополнить schema в `audit/internal/schema/` типом `SFPModule`.
|
||||
4. Добавить `sfp_modules` в `PCIeDevice` в schema.
|
||||
5. Заполнять в NIC-коллекторе: связь интерфейс → BDF → `pcie_devices[].sfp_modules`.
|
||||
6. Показывать в TUI и web UI в разделе PCIe/NIC.
|
||||
|
||||
## BMC версия через IPMI
|
||||
|
||||
**Статус:** реализовано.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
title: Hardware Ingest JSON Contract
|
||||
version: "2.10"
|
||||
updated: "2026-04-29"
|
||||
version: "2.11"
|
||||
updated: "2026-06-19"
|
||||
maintainer: Reanimator Core
|
||||
audience: external-integrators, ai-agents
|
||||
language: ru
|
||||
@@ -9,7 +9,7 @@ language: ru
|
||||
|
||||
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
|
||||
|
||||
Версия: **2.10** · Дата: **2026-04-29**
|
||||
Версия: **2.11** · Дата: **2026-06-19**
|
||||
|
||||
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
|
||||
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
|
||||
@@ -22,6 +22,7 @@ language: ru
|
||||
|
||||
| Версия | Дата | Изменения |
|
||||
|--------|------|-----------|
|
||||
| 2.11 | 2026-06-19 | В `pcie_devices[]` добавлен необязательный массив `sfp_modules[]` с идентификацией и DOM telemetry SFP/QSFP-модулей. Скалярные поля `sfp_temperature_c` / `sfp_tx_power_dbm` / `sfp_rx_power_dbm` / `sfp_voltage_v` / `sfp_bias_ma` помечены как deprecated (принимаются, но `sfp_modules[]` имеет приоритет) |
|
||||
| 2.10 | 2026-04-29 | Для `hardware.storage[]` добавлены необязательные числовые поля `logical_block_size_bytes`, `physical_block_size_bytes`, `metadata_bytes_per_block` для нормализованного описания формата блока накопителя |
|
||||
| 2.9 | 2026-03-19 | Добавлена необязательная секция `hardware.platform_config` — произвольный объект с настройками платформы (BIOS/Redfish); хранится как latest-snapshot per machine |
|
||||
| 2.8 | 2026-03-15 | Поле `location` удалено из всех `sensors.*`; сенсоры передаются только по `name` и измеренным значениям |
|
||||
@@ -422,11 +423,12 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
| `battery_temperature_c` | float | нет | Температура батареи / supercap, °C |
|
||||
| `battery_voltage_v` | float | нет | Напряжение батареи / supercap, В |
|
||||
| `battery_replace_required` | bool | нет | Требуется замена батареи / supercap |
|
||||
| `sfp_temperature_c` | float | нет | Температура SFP/optic, °C |
|
||||
| `sfp_tx_power_dbm` | float | нет | TX optical power, dBm |
|
||||
| `sfp_rx_power_dbm` | float | нет | RX optical power, dBm |
|
||||
| `sfp_voltage_v` | float | нет | Напряжение SFP, В |
|
||||
| `sfp_bias_ma` | float | нет | Bias current SFP, мА |
|
||||
| `sfp_temperature_c` | float | нет | Температура SFP/optic, °C *(deprecated since 2.11)* |
|
||||
| `sfp_tx_power_dbm` | float | нет | TX optical power, dBm *(deprecated since 2.11)* |
|
||||
| `sfp_rx_power_dbm` | float | нет | RX optical power, dBm *(deprecated since 2.11)* |
|
||||
| `sfp_voltage_v` | float | нет | Напряжение SFP, В *(deprecated since 2.11)* |
|
||||
| `sfp_bias_ma` | float | нет | Bias current SFP, мА *(deprecated since 2.11)* |
|
||||
| `sfp_modules` | array | нет | Установленные SFP/QSFP-модули по портам (см. sfp_modules[]) |
|
||||
| `bdf` | string | нет | Deprecated alias для `slot`; при наличии ingest нормализует его в `slot` |
|
||||
| `device_class` | string | нет | Класс устройства (см. список ниже) |
|
||||
| `manufacturer` | string | нет | Производитель |
|
||||
@@ -444,10 +446,43 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
`numa_node` передавайте для NIC / InfiniBand / RAID / GPU, когда источник знает CPU/NUMA affinity. Поле сохраняется в snapshot-атрибутах PCIe-компонента и дублируется в telemetry для topology use cases.
|
||||
Поля `temperature_c` и `power_w` используйте для device-level telemetry GPU / accelerator / smart PCIe devices. Они не влияют на идентификацию компонента.
|
||||
|
||||
**Deprecated поля sfp_\*:** Скалярные поля `sfp_temperature_c`, `sfp_tx_power_dbm`, `sfp_rx_power_dbm`, `sfp_voltage_v`, `sfp_bias_ma` продолжают приниматься, но помечены как deprecated since 2.11. Если в payload одновременно присутствуют `sfp_modules[]` и deprecated sfp_-скаляры — приоритет у `sfp_modules[]`, скаляры игнорируются. Deprecated поля будут удалены в версии 3.0.
|
||||
|
||||
**Генерация serial_number при отсутствии или `"N/A"`:** `{board_serial}-PCIE-{slot}`, где `slot` для PCIe равен BDF.
|
||||
|
||||
`slot` — единственный канонический адрес компонента. Для PCIe в `slot` передавайте BDF. Поле `bdf` сохраняется только как переходный alias на входе и не должно использоваться как отдельная координата рядом со `slot`.
|
||||
|
||||
#### pcie_devices[].sfp_modules[]
|
||||
|
||||
Необязательный массив установленных SFP/QSFP-модулей для данного PCIe-устройства. Один элемент — один порт. Используйте для многопортовых NIC (ConnectX-6 Dx, Intel X710, Mellanox HDR и др.).
|
||||
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `port` | int | **да** | Номер порта на NIC (0-based). Ключ дедупликации внутри устройства |
|
||||
| `identifier` | string | нет | Тип модуля: `SFP`, `SFP+`, `SFP28`, `QSFP+`, `QSFP28`, `QSFP-DD`, `DAC` |
|
||||
| `connector` | string | нет | Тип разъёма: `LC`, `MPO`, `RJ45`, `DAC`, `AOC`, `No separable connector` |
|
||||
| `vendor` | string | нет | Производитель модуля из EEPROM |
|
||||
| `part_number` | string | нет | Партномер из EEPROM |
|
||||
| `serial_number` | string | нет | Серийный номер из EEPROM |
|
||||
| `revision` | string | нет | Ревизия из EEPROM |
|
||||
| `wavelength_nm` | int | нет | Длина волны, нм (0 для DAC/медных кабелей) |
|
||||
| `transceiver_type` | string | нет | `10GBase-SR`, `10GBase-LR`, `25GBase-SR`, `100GBase-SR4`, `DAC`, … |
|
||||
| `temperature_c` | float | нет | Температура модуля, °C (DOM telemetry) |
|
||||
| `voltage_v` | float | нет | Напряжение питания, В (DOM telemetry) |
|
||||
| `tx_power_dbm` | float | нет | TX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `rx_power_dbm` | float | нет | RX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `bias_ma` | float | нет | Bias current, мА (DOM telemetry) |
|
||||
|
||||
**Ключ дедупликации:** `(pcie_devices[].slot, sfp_modules[].port)`.
|
||||
|
||||
**Правила ingest:**
|
||||
- При каждом импорте — полная замена `sfp_modules[]` для данного `pcie_devices[].slot` (upsert всего массива целиком).
|
||||
- Если `sfp_modules` отсутствует или `null` — существующие данные SFP не трогать.
|
||||
- Если `sfp_modules: []` (пустой массив) — трактовать как «модули не обнаружены», очистить сохранённые данные.
|
||||
- Дубли по `port` внутри одного `pcie_devices[]` — невалидны, endpoint возвращает `400` с описанием поля.
|
||||
- Модули без `serial_number` допустимы (многие DAC-кабели не имеют SN); сохраняются по ключу `(slot, port)`.
|
||||
- Изменение `serial_number` или `part_number` модуля на порту создаёт событие `COMPONENT_CHANGED` для PCIe-устройства с описанием «SFP module replaced on port N».
|
||||
|
||||
**Значения `device_class`:**
|
||||
|
||||
| Значение | Назначение |
|
||||
@@ -472,16 +507,47 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
"numa_node": 0,
|
||||
"temperature_c": 48.5,
|
||||
"power_w": 18.2,
|
||||
"sfp_temperature_c": 36.2,
|
||||
"sfp_tx_power_dbm": -1.8,
|
||||
"sfp_rx_power_dbm": -2.1,
|
||||
"device_class": "EthernetController",
|
||||
"manufacturer": "Intel",
|
||||
"model": "X710 10GbE",
|
||||
"serial_number": "K65472-003",
|
||||
"firmware": "9.20 0x8000d4ae",
|
||||
"manufacturer": "Mellanox",
|
||||
"model": "ConnectX-6 Dx",
|
||||
"serial_number": "MT2012X12345",
|
||||
"firmware": "22.35.2010",
|
||||
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
|
||||
"status": "OK"
|
||||
"status": "OK",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09999",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 36.4,
|
||||
"voltage_v": 3.29,
|
||||
"tx_power_dbm": -1.8,
|
||||
"rx_power_dbm": -2.1,
|
||||
"bias_ma": 7.2
|
||||
},
|
||||
{
|
||||
"port": 1,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09998",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 35.9,
|
||||
"voltage_v": 3.28,
|
||||
"tx_power_dbm": -1.9,
|
||||
"rx_power_dbm": -2.3,
|
||||
"bias_ma": 7.1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
@@ -793,7 +859,24 @@ PSU без `serial_number` игнорируется.
|
||||
"model": "X710 10GbE",
|
||||
"serial_number": "K65472-003",
|
||||
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
|
||||
"status": "OK"
|
||||
"status": "OK",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "SFP+",
|
||||
"connector": "LC",
|
||||
"vendor": "Intel",
|
||||
"part_number": "FTLX8574D3BCV-IT",
|
||||
"serial_number": "FNS123456789",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "10GBase-SR",
|
||||
"temperature_c": 34.1,
|
||||
"voltage_v": 3.30,
|
||||
"tx_power_dbm": -2.5,
|
||||
"rx_power_dbm": -3.0,
|
||||
"bias_ma": 6.8
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"power_supplies": [
|
||||
|
||||
@@ -0,0 +1,201 @@
|
||||
# RFC: SFP Module Identity and Telemetry in Hardware Ingest Contract
|
||||
|
||||
**Date:** 2026-06-19
|
||||
**Status:** proposal
|
||||
**Target:** Reanimator Core — ingest schema, validation, and storage
|
||||
**Requested by:** bee (hardware audit LiveCD)
|
||||
**Contract version affected:** 2.10 → 2.11
|
||||
|
||||
---
|
||||
|
||||
## Задача для агента
|
||||
|
||||
Внести изменения в контракт ingest и реализовать их в Reanimator Core:
|
||||
|
||||
1. Добавить массив `sfp_modules[]` внутри `pcie_devices[]`.
|
||||
2. Обновить JSON-схему и валидатор ingest endpoint.
|
||||
3. Реализовать хранение/upsert данных SFP-модулей.
|
||||
4. Пометить скалярные поля `sfp_*` на уровне `pcie_devices[]` как deprecated.
|
||||
5. Обновить документ контракта (`hardware-ingest-contract.md`) до версии 2.11.
|
||||
|
||||
Ниже — всё необходимое для принятия решений без дополнительных вопросов.
|
||||
|
||||
---
|
||||
|
||||
## Контекст: откуда берутся данные
|
||||
|
||||
**Источник в bee:** `ethtool -m <iface>` (читает EEPROM SFP/SFP+/QSFP28/QSFP-DD по стандартам MSA SFF-8472 / SFF-8636).
|
||||
|
||||
**Связь с PCIe:** `ethtool -i <iface>` возвращает `bus-info` = BDF (`0000:3b:00.0`), который совпадает с `pcie_devices[].slot`. Так bee связывает SFP-данные конкретного интерфейса с PCIe-устройством.
|
||||
|
||||
**Один NIC — несколько модулей:** карта ConnectX-6 Dx (2 порта), Intel X710 (4 порта), Mellanox HDR (2 порта). Каждый порт — отдельный `ethtool -m`, отдельный SFP-модуль. Одного скаляра на устройство недостаточно.
|
||||
|
||||
**QSFP28/QSFP-DD:** 4-канальные модули возвращают telemetry отдельно по каждому каналу (lane). В предложенной схеме lane-уровень не включён в первую версию — только агрегированные значения модуля в целом. Расширение до lane-уровня — отдельный RFC, если понадобится.
|
||||
|
||||
---
|
||||
|
||||
## Проблема с текущим контрактом v2.10
|
||||
|
||||
В `pcie_devices[]` есть пять скалярных полей:
|
||||
|
||||
```
|
||||
sfp_temperature_c float
|
||||
sfp_tx_power_dbm float
|
||||
sfp_rx_power_dbm float
|
||||
sfp_voltage_v float
|
||||
sfp_bias_ma float
|
||||
```
|
||||
|
||||
Ограничения:
|
||||
- **Нет идентификации модуля** — vendor, part_number, serial_number, wavelength отсутствуют; модуль нельзя инвентаризировать как самостоятельный компонент.
|
||||
- **Только один набор значений на устройство** — невозможно описать 4-портовый NIC.
|
||||
- **Нет типа модуля** — SFP, QSFP28, DAC-кабель не различаются.
|
||||
- **Нет connector/transceiver_type** — невозможно понять, оптика это или медь.
|
||||
|
||||
---
|
||||
|
||||
## Предлагаемое изменение схемы
|
||||
|
||||
### Новая структура `sfp_modules[]`
|
||||
|
||||
Добавляется как необязательное поле внутри каждого объекта `pcie_devices[]`.
|
||||
|
||||
```json
|
||||
"pcie_devices": [
|
||||
{
|
||||
"slot": "0000:3b:00.0",
|
||||
"device_class": "EthernetController",
|
||||
"model": "ConnectX-6 Dx",
|
||||
"manufacturer": "Mellanox",
|
||||
"serial_number": "MT2012X12345",
|
||||
"status": "OK",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09999",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 36.4,
|
||||
"voltage_v": 3.29,
|
||||
"tx_power_dbm": -1.8,
|
||||
"rx_power_dbm": -2.1,
|
||||
"bias_ma": 7.2
|
||||
},
|
||||
{
|
||||
"port": 1,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09998",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 35.9,
|
||||
"voltage_v": 3.28,
|
||||
"tx_power_dbm": -1.9,
|
||||
"rx_power_dbm": -2.3,
|
||||
"bias_ma": 7.1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### Поля `sfp_modules[]`
|
||||
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|---|---|---|---|
|
||||
| `port` | int | **да** | Номер порта на NIC (0-based). Ключ дедупликации внутри устройства. |
|
||||
| `identifier` | string | нет | Тип модуля: `SFP`, `SFP+`, `SFP28`, `QSFP+`, `QSFP28`, `QSFP-DD`, `DAC` |
|
||||
| `connector` | string | нет | Тип разъёма: `LC`, `MPO`, `RJ45`, `DAC`, `AOC`, `No separable connector` |
|
||||
| `vendor` | string | нет | Производитель модуля из EEPROM |
|
||||
| `part_number` | string | нет | Партномер из EEPROM |
|
||||
| `serial_number` | string | нет | Серийный номер из EEPROM |
|
||||
| `revision` | string | нет | Ревизия из EEPROM |
|
||||
| `wavelength_nm` | int | нет | Длина волны, нм (0 для DAC/медных кабелей) |
|
||||
| `transceiver_type` | string | нет | `10GBase-SR`, `10GBase-LR`, `25GBase-SR`, `100GBase-SR4`, `DAC`, … |
|
||||
| `temperature_c` | float | нет | Температура модуля, °C (DOM telemetry) |
|
||||
| `voltage_v` | float | нет | Напряжение питания, В (DOM telemetry) |
|
||||
| `tx_power_dbm` | float | нет | TX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `rx_power_dbm` | float | нет | RX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `bias_ma` | float | нет | Bias current, мА (DOM telemetry) |
|
||||
|
||||
**Ключ дедупликации:** `(pcie_device.slot, sfp_modules[].port)`.
|
||||
|
||||
**Модули без серийного номера** — допустимы; многие DAC-кабели не имеют SN. Не игнорировать, сохранять по ключу `(slot, port)`.
|
||||
|
||||
---
|
||||
|
||||
## Deprecated поля
|
||||
|
||||
Следующие поля на уровне `pcie_devices[]` помечаются как **deprecated** начиная с v2.11:
|
||||
|
||||
```
|
||||
sfp_temperature_c
|
||||
sfp_tx_power_dbm
|
||||
sfp_rx_power_dbm
|
||||
sfp_voltage_v
|
||||
sfp_bias_ma
|
||||
```
|
||||
|
||||
**Поведение при получении deprecated полей:**
|
||||
- Продолжать принимать и сохранять, когда `sfp_modules[]` не передан (не ломать существующих интеграторов).
|
||||
- Если одновременно присутствуют `sfp_modules[]` и deprecated скаляры — приоритет у `sfp_modules[]`; скаляры игнорируются.
|
||||
- В документации пометить как `deprecated since 2.11, will be removed in 3.0`.
|
||||
|
||||
**Не удалять** deprecated поля из валидации в этом PR — только пометить в документации и changelog.
|
||||
|
||||
---
|
||||
|
||||
## Правила ingest для `sfp_modules[]`
|
||||
|
||||
- `sfp_modules[]` хранится как snapshot-атрибут PCIe-компонента (аналогично `mac_addresses`).
|
||||
- При каждом импорте — полная замена `sfp_modules[]` для данного `pcie_devices[].slot` (upsert всего массива целиком, не merge по портам).
|
||||
- Если `sfp_modules` отсутствует или `null` — существующие данные SFP не трогать (не затирать).
|
||||
- Если `sfp_modules: []` (пустой массив) — трактовать как «модули не обнаружены», очистить сохранённые данные.
|
||||
- При изменении `serial_number` или `part_number` модуля на порту — создавать событие `COMPONENT_CHANGED` для PCIe-устройства с описанием «SFP module replaced on port N».
|
||||
|
||||
---
|
||||
|
||||
## Изменения в документе контракта
|
||||
|
||||
Файл: `bible-local/docs/hardware-ingest-contract.md`
|
||||
|
||||
1. Заголовок версии: `2.10` → `2.11`, дата → `2026-06-19`.
|
||||
2. Добавить в changelog:
|
||||
```
|
||||
| 2.11 | 2026-06-19 | В `pcie_devices[]` добавлен необязательный массив `sfp_modules[]`
|
||||
с идентификацией и DOM telemetry SFP/QSFP-модулей. Скалярные поля
|
||||
sfp_temperature_c / sfp_tx_power_dbm / sfp_rx_power_dbm / sfp_voltage_v /
|
||||
sfp_bias_ma помечены как deprecated (принимаются, но sfp_modules[] имеет приоритет). |
|
||||
```
|
||||
3. В секции `pcie_devices` добавить строку в таблицу полей:
|
||||
```
|
||||
| `sfp_modules` | array | нет | Установленные SFP/QSFP-модули по портам (см. sfp_modules[]) |
|
||||
```
|
||||
4. Добавить подсекцию `#### pcie_devices[].sfp_modules[]` с таблицей полей и примером JSON (из раздела выше).
|
||||
5. Пометить deprecated поля в таблице: добавить суффикс `*(deprecated since 2.11)*`.
|
||||
6. Обновить полный пример JSON — добавить `sfp_modules` к NIC-записи в `pcie_devices`.
|
||||
|
||||
---
|
||||
|
||||
## Что не нужно делать в этом PR
|
||||
|
||||
- Не добавлять lane-level данные QSFP (tx_power_dbm_lane_0 и т.п.) — отдельный RFC.
|
||||
- Не удалять deprecated поля — только пометить.
|
||||
- Не создавать отдельную top-level секцию `network_ports` — данные остаются вложенными в `pcie_devices`.
|
||||
- Не менять логику идентификации PCIe-компонента — `serial_number` SFP-модуля не является ключом для самостоятельного компонента.
|
||||
|
||||
---
|
||||
|
||||
## Валидация
|
||||
|
||||
Единственное обязательное поле в `sfp_modules[]` — `port` (int, >= 0).
|
||||
Все остальные поля опциональны.
|
||||
Дубли по `port` внутри одного элемента `pcie_devices[]` — невалидны; возвращать `400` с указанием невалидного поля.
|
||||
+1
-1
Submodule internal/chart updated: 8105c7ec08...2a15bc87f1
+19
-1
@@ -1419,6 +1419,13 @@ rm -rf \
|
||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-recover"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-dcgmproftester-staggered"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-check-nvswitch"
|
||||
rm -rf "${OVERLAY_STAGE_DIR}/etc/systemd/system/nvidia-fabricmanager.service.d"
|
||||
fi
|
||||
|
||||
# --- inject authorized_keys for SSH access ---
|
||||
@@ -1473,7 +1480,7 @@ cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smokete
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||
|
||||
# --- vendor utilities (optional pre-fetched binaries) ---
|
||||
for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
||||
for tool in storcli64 sas2ircu sas3ircu arcconf ssacli saa; do
|
||||
if [ -f "${VENDOR_DIR}/${tool}" ]; then
|
||||
cp "${VENDOR_DIR}/${tool}" "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}" || true
|
||||
@@ -1483,6 +1490,17 @@ for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
||||
fi
|
||||
done
|
||||
|
||||
# saa companion directories — saa searches for these relative to CWD (/usr/local/bin)
|
||||
for saa_subdir in acpica_bin ExternalData tool stunnel GO_SNMP; do
|
||||
if [ -d "${VENDOR_DIR}/${saa_subdir}" ]; then
|
||||
cp -r "${VENDOR_DIR}/${saa_subdir}" "${OVERLAY_STAGE_DIR}/usr/local/bin/"
|
||||
find "${OVERLAY_STAGE_DIR}/usr/local/bin/${saa_subdir}" -type f -exec chmod +x {} \; 2>/dev/null || true
|
||||
echo "vendor saa: ${saa_subdir}/ (included)"
|
||||
else
|
||||
echo "vendor saa: ${saa_subdir}/ (not found, skipped)"
|
||||
fi
|
||||
done
|
||||
|
||||
# --- NVIDIA kernel modules and userspace libs ---
|
||||
if [ "$BEE_GPU_VENDOR" = "nvidia" ]; then
|
||||
run_step "build NVIDIA ${NVIDIA_DRIVER_VERSION} modules" "40-nvidia-module" \
|
||||
|
||||
@@ -67,7 +67,8 @@ if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
|
||||
fi
|
||||
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
|
||||
|
||||
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
|
||||
GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo "")
|
||||
if [ "$GPU_VENDOR" = "nvidia" ] && have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
|
||||
log_event "NVIDIA GPU detected but /dev/nvidia0 is missing"
|
||||
restart_service bee-nvidia.service || true
|
||||
fi
|
||||
|
||||
Vendored
+1131
File diff suppressed because it is too large
Load Diff
Vendored
+2333
File diff suppressed because it is too large
Load Diff
+37
@@ -0,0 +1,37 @@
|
||||
(UTC-10:00) Aleutian Islands
|
||||
(UTC-09:00) Alaska
|
||||
(UTC-08:00) Baja California
|
||||
(UTC-08:00) Pacific Time (US & Canada)
|
||||
(UTC-07:00) Mountain Time (US & Canada)
|
||||
(UTC-06:00) Central Time (US & Canada)
|
||||
(UTC-06:00) Easter Island
|
||||
(UTC-05:00) Eastern Time (US & Canada)
|
||||
(UTC-05:00) Haiti
|
||||
(UTC-05:00) Havana
|
||||
(UTC-05:00) Indiana (East)
|
||||
(UTC-05:00) Turks and Caicos
|
||||
(UTC-04:00) Asuncion
|
||||
(UTC-04:00) Atlantic Time (Canada)
|
||||
(UTC-04:00) Santiago
|
||||
(UTC-03:30) Newfoundland
|
||||
(UTC-03:00) Saint Pierre and Miquelon
|
||||
(UTC-01:00) Azores
|
||||
(UTC+00:00) Dublin, Edinburgh, Lisbon, London
|
||||
(UTC+01:00) Casablanca
|
||||
(UTC+01:00) Amsterdam, Berlin, Bern, Rome, Stockholm, Vienna
|
||||
(UTC+01:00) Belgrade, Bratislava, Budapest, Ljubljana, Prague
|
||||
(UTC+01:00) Brussels, Copenhagen, Madrid, Paris
|
||||
(UTC+01:00) Sarajevo, Skopje, Warsaw, Zagreb
|
||||
(UTC+02:00) Athens, Bucharest
|
||||
(UTC+02:00) Beirut
|
||||
(UTC+02:00) Chisinau
|
||||
(UTC+02:00) Gaza, Hebron
|
||||
(UTC+02:00) Helsinki, Kyiv, Riga, Sofia, Tallinn, Vilnius
|
||||
(UTC+02:00) Jerusalem
|
||||
(UTC+09:30) Adelaide
|
||||
(UTC+10:00) Canberra, Melbourne, Sydney
|
||||
(UTC+10:00) Hobart
|
||||
(UTC+10:30) Lord Howe Island
|
||||
(UTC+11:00) Norfolk Island
|
||||
(UTC+12:00) Auckland, Wellington
|
||||
(UTC+12:45) Chatham Islands
|
||||
Vendored
+139
@@ -0,0 +1,139 @@
|
||||
(UTC-12:00) International Date Line West
|
||||
(UTC-11:00) Coordinated Universal Time-11
|
||||
(UTC-10:00) Aleutian Islands
|
||||
(UTC-10:00) Hawaii
|
||||
(UTC-09:30) Marquesas Islands
|
||||
(UTC-09:00) Alaska
|
||||
(UTC-09:00) Coordinated Universal Time-09
|
||||
(UTC-08:00) Baja California
|
||||
(UTC-08:00) Coordinated Universal Time-08
|
||||
(UTC-08:00) Pacific Time (US & Canada)
|
||||
(UTC-07:00) Arizona
|
||||
(UTC-07:00) Chihuahua, La Paz, Mazatlan
|
||||
(UTC-07:00) Mountain Time (US & Canada)
|
||||
(UTC-07:00) Yukon
|
||||
(UTC-06:00) Central America
|
||||
(UTC-06:00) Central Time (US & Canada)
|
||||
(UTC-06:00) Easter Island
|
||||
(UTC-06:00) Guadalajara, Mexico City, Monterrey
|
||||
(UTC-06:00) Saskatchewan
|
||||
(UTC-05:00) Bogota, Lima, Quito, Rio Branco
|
||||
(UTC-05:00) Chetumal
|
||||
(UTC-05:00) Eastern Time (US & Canada)
|
||||
(UTC-05:00) Haiti
|
||||
(UTC-05:00) Havana
|
||||
(UTC-05:00) Indiana (East)
|
||||
(UTC-05:00) Turks and Caicos
|
||||
(UTC-04:00) Atlantic Time (Canada)
|
||||
(UTC-04:00) Caracas
|
||||
(UTC-04:00) Cuiaba
|
||||
(UTC-04:00) Georgetown, La Paz, Manaus, San Juan
|
||||
(UTC-04:00) Santiago
|
||||
(UTC-03:30) Newfoundland
|
||||
(UTC-03:00) Asuncion
|
||||
(UTC-03:00) Araguaina
|
||||
(UTC-03:00) Brasilia
|
||||
(UTC-03:00) Cayenne, Fortaleza
|
||||
(UTC-03:00) City of Buenos Aires
|
||||
(UTC-03:00) Greenland
|
||||
(UTC-03:00) Montevideo
|
||||
(UTC-03:00) Punta Arenas
|
||||
(UTC-03:00) Saint Pierre and Miquelon
|
||||
(UTC-03:00) Salvador
|
||||
(UTC-02:00) Coordinated Universal Time-02
|
||||
(UTC-01:00) Azores
|
||||
(UTC-01:00) Cabo Verde Is.
|
||||
(UTC+00:00) Coordinated Universal Time
|
||||
(UTC+00:00) Dublin, Edinburgh, Lisbon, London
|
||||
(UTC+00:00) Monrovia, Reykjavik
|
||||
(UTC+00:00) Sao Tome
|
||||
(UTC+01:00) Casablanca
|
||||
(UTC+01:00) Amsterdam, Berlin, Bern, Rome, Stockholm, Vienna
|
||||
(UTC+01:00) Belgrade, Bratislava, Budapest, Ljubljana, Prague
|
||||
(UTC+01:00) Brussels, Copenhagen, Madrid, Paris
|
||||
(UTC+01:00) Sarajevo, Skopje, Warsaw, Zagreb
|
||||
(UTC+01:00) West Central Africa
|
||||
(UTC+02:00) Amman
|
||||
(UTC+02:00) Athens, Bucharest
|
||||
(UTC+02:00) Beirut
|
||||
(UTC+02:00) Cairo
|
||||
(UTC+02:00) Chisinau
|
||||
(UTC+02:00) Damascus
|
||||
(UTC+02:00) Gaza, Hebron
|
||||
(UTC+02:00) Harare, Pretoria
|
||||
(UTC+02:00) Helsinki, Kyiv, Riga, Sofia, Tallinn, Vilnius
|
||||
(UTC+02:00) Jerusalem
|
||||
(UTC+02:00) Juba
|
||||
(UTC+02:00) Kaliningrad
|
||||
(UTC+02:00) Khartoum
|
||||
(UTC+02:00) Tripoli
|
||||
(UTC+02:00) Windhoek
|
||||
(UTC+03:00) Baghdad
|
||||
(UTC+03:00) Istanbul
|
||||
(UTC+03:00) Kuwait, Riyadh
|
||||
(UTC+03:00) Minsk
|
||||
(UTC+03:00) Moscow, St. Petersburg
|
||||
(UTC+03:00) Nairobi
|
||||
(UTC+03:00) Volgograd
|
||||
(UTC+03:30) Tehran
|
||||
(UTC+04:00) Abu Dhabi, Muscat
|
||||
(UTC+04:00) Astrakhan, Ulyanovsk
|
||||
(UTC+04:00) Baku
|
||||
(UTC+04:00) Izhevsk, Samara
|
||||
(UTC+04:00) Port Louis
|
||||
(UTC+04:00) Saratov
|
||||
(UTC+04:00) Tbilisi
|
||||
(UTC+04:00) Yerevan
|
||||
(UTC+04:30) Kabul
|
||||
(UTC+05:00) Ashgabat, Tashkent
|
||||
(UTC+05:00) Astana
|
||||
(UTC+05:00) Ekaterinburg
|
||||
(UTC+05:00) Islamabad, Karachi
|
||||
(UTC+05:00) Qyzylorda
|
||||
(UTC+05:30) Chennai, Kolkata, Mumbai, New Delhi
|
||||
(UTC+05:30) Sri Jayawardenepura
|
||||
(UTC+05:45) Kathmandu
|
||||
(UTC+06:00) Dhaka
|
||||
(UTC+06:00) Omsk
|
||||
(UTC+06:30) Yangon (Rangoon)
|
||||
(UTC+07:00) Bangkok, Hanoi, Jakarta
|
||||
(UTC+07:00) Barnaul, Gorno-Altaysk
|
||||
(UTC+07:00) Hovd
|
||||
(UTC+07:00) Krasnoyarsk
|
||||
(UTC+07:00) Novosibirsk
|
||||
(UTC+07:00) Tomsk
|
||||
(UTC+08:00) Beijing, Chongqing, Hong Kong, Urumqi
|
||||
(UTC+08:00) Irkutsk
|
||||
(UTC+08:00) Kuala Lumpur, Singapore
|
||||
(UTC+08:00) Perth
|
||||
(UTC+08:00) Taipei
|
||||
(UTC+08:00) Ulaanbaatar
|
||||
(UTC+08:45) Eucla
|
||||
(UTC+09:00) Chita
|
||||
(UTC+09:00) Osaka, Sapporo, Tokyo
|
||||
(UTC+09:00) Pyongyang
|
||||
(UTC+09:00) Seoul
|
||||
(UTC+09:00) Yakutsk
|
||||
(UTC+09:30) Adelaide
|
||||
(UTC+09:30) Darwin
|
||||
(UTC+10:00) Brisbane
|
||||
(UTC+10:00) Canberra, Melbourne, Sydney
|
||||
(UTC+10:00) Guam, Port Moresby
|
||||
(UTC+10:00) Hobart
|
||||
(UTC+10:00) Vladivostok
|
||||
(UTC+10:30) Lord Howe Island
|
||||
(UTC+11:00) Bougainville Island
|
||||
(UTC+11:00) Chokurdakh
|
||||
(UTC+11:00) Magadan
|
||||
(UTC+11:00) Norfolk Island
|
||||
(UTC+11:00) Sakhalin
|
||||
(UTC+11:00) Solomon Is., New Caledonia
|
||||
(UTC+12:00) Anadyr, Petropavlovsk-Kamchatsky
|
||||
(UTC+12:00) Auckland, Wellington
|
||||
(UTC+12:00) Coordinated Universal Time+12
|
||||
(UTC+12:00) Fiji
|
||||
(UTC+12:45) Chatham Islands
|
||||
(UTC+13:00) Coordinated Universal Time+13
|
||||
(UTC+13:00) Nuku'alofa
|
||||
(UTC+13:00) Samoa
|
||||
(UTC+14:00) Kiritimati Island
|
||||
Vendored
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
Reference in New Issue
Block a user