Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
11ea640626 | ||
|
|
796acdfec1 | ||
|
|
2a7d366e50 | ||
|
|
5bfaecd417 | ||
|
|
8575cf06f8 | ||
|
|
d1d5f63257 | ||
| fc9b446d2e | |||
|
|
ea68318744 | ||
|
|
518082c2e2 | ||
|
|
056dce0b98 | ||
|
|
24f2e65b6e | ||
|
|
7f27b9aa38 |
@@ -365,7 +365,6 @@ func (w *blackboxWorker) currentFlushPeriod() time.Duration {
|
||||
|
||||
func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||
w.mu.Lock()
|
||||
defer w.mu.Unlock()
|
||||
w.lastDuration = duration
|
||||
if err != nil {
|
||||
w.status = "degraded"
|
||||
@@ -383,6 +382,10 @@ func (w *blackboxWorker) finishCycle(duration time.Duration, err error) {
|
||||
}
|
||||
w.flushPeriod = adjustFlushPeriod(w.flushPeriod, duration, true, w.fastCycles)
|
||||
}
|
||||
w.mu.Unlock()
|
||||
// persistState must be called without w.mu held: it acquires rt.mu then
|
||||
// each worker.mu inside persistStateLocked, so holding w.mu here would
|
||||
// cause a deadlock (w.mu → rt.mu → w.mu).
|
||||
w.runtime.persistState()
|
||||
}
|
||||
|
||||
|
||||
@@ -4008,14 +4008,23 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
idleW = result.ServerPower.IdleW
|
||||
}
|
||||
|
||||
// Build header: Run | GPU 0 | GPU 1 | ... | Server wall W | Per GPU wall W | Platform eff.
|
||||
// Build header: Run | GPU 0 | GPU 1 | ... | GPU total W | Server itself W | Server wall W | Per GPU wall W | Platform eff.
|
||||
headers := []string{"Run"}
|
||||
for _, idx := range allGPUIndices {
|
||||
headers = append(headers, fmt.Sprintf("GPU %d W", idx))
|
||||
}
|
||||
headers = append(headers, "Server wall W", "Per GPU wall W", "Platform eff.")
|
||||
headers = append(headers, "GPU total W", "Server itself W", "Server wall W", "Per GPU wall W", "Platform eff.")
|
||||
|
||||
var rampRows [][]string
|
||||
if idleW > 0 {
|
||||
idleRow := []string{"0 (idle)"}
|
||||
for range allGPUIndices {
|
||||
idleRow = append(idleRow, "—")
|
||||
}
|
||||
// No load: GPU total is negligible, all draw is the server's own baseline.
|
||||
idleRow = append(idleRow, "—", fmt.Sprintf("%.0f", idleW), fmt.Sprintf("%.0f", idleW), "—", "—")
|
||||
rampRows = append(rampRows, idleRow)
|
||||
}
|
||||
for _, step := range result.RampSteps {
|
||||
row := []string{fmt.Sprintf("%d", step.StepIndex)}
|
||||
for _, idx := range allGPUIndices {
|
||||
@@ -4036,6 +4045,16 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
row = append(row, gpuPwr)
|
||||
}
|
||||
// GPU total W = sum of observed GPU power (nvidia-smi)
|
||||
gpuTotal := "—"
|
||||
if step.TotalObservedPowerW > 0 {
|
||||
gpuTotal = fmt.Sprintf("%.0f", step.TotalObservedPowerW)
|
||||
}
|
||||
// Server itself W = server wall power minus GPU total (non-GPU baseline draw)
|
||||
serverItself := "—"
|
||||
if step.ServerLoadedW > 0 && step.TotalObservedPowerW > 0 {
|
||||
serverItself = fmt.Sprintf("%.0f", step.ServerLoadedW-step.TotalObservedPowerW)
|
||||
}
|
||||
// Server wall W
|
||||
serverWall := "—"
|
||||
if step.ServerLoadedW > 0 {
|
||||
@@ -4055,7 +4074,7 @@ func renderPowerBenchReport(result NvidiaPowerBenchResult) string {
|
||||
}
|
||||
platEff = fmt.Sprintf("%.2f", eff)
|
||||
}
|
||||
row = append(row, serverWall, perGPUWall, platEff)
|
||||
row = append(row, gpuTotal, serverItself, serverWall, perGPUWall, platEff)
|
||||
rampRows = append(rampRows, row)
|
||||
}
|
||||
b.WriteString(fmtMDTable(headers, rampRows))
|
||||
@@ -4617,6 +4636,8 @@ func (s *System) RunNvidiaPowerBench(ctx context.Context, baseDir string, opts N
|
||||
ramp.AvgFanRPM = singleRun.AvgFanRPM
|
||||
ramp.AvgFanDutyCyclePct = singleRun.AvgFanDutyCyclePct
|
||||
}
|
||||
firstSummary := firstCalib.Summary
|
||||
ramp.PerGPUTelemetry = map[int]*BenchmarkTelemetrySummary{firstIdx: &firstSummary}
|
||||
if !firstCalib.Completed {
|
||||
ramp.Status = "FAILED"
|
||||
ramp.Notes = append(ramp.Notes, fmt.Sprintf("GPU %d did not complete single-card %s", firstIdx, benchmarkPowerEngineLabel(benchmarkPowerEngine())))
|
||||
|
||||
@@ -182,9 +182,16 @@ func (s *System) DetectGPUVendor() string {
|
||||
return "amd"
|
||||
}
|
||||
if raw, err := exec.Command("lspci", "-nn").Output(); err == nil {
|
||||
text := strings.ToLower(string(raw))
|
||||
if strings.Contains(text, "advanced micro devices") || strings.Contains(text, "amd/ati") {
|
||||
return "amd"
|
||||
// Only match AMD GPU device classes [0300]=VGA, [0302]=3D controller, [0380]=Display.
|
||||
// AMD CPUs also appear in lspci as "Advanced Micro Devices" (Root Complex, IOMMU, etc.)
|
||||
// so matching vendor alone causes false positives on AMD CPU servers without GPUs.
|
||||
for _, line := range strings.Split(strings.ToLower(string(raw)), "\n") {
|
||||
if !strings.Contains(line, "advanced micro devices") && !strings.Contains(line, "amd/ati") {
|
||||
continue
|
||||
}
|
||||
if strings.Contains(line, "[0300]") || strings.Contains(line, "[0302]") || strings.Contains(line, "[0380]") {
|
||||
return "amd"
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
@@ -723,12 +730,14 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
|
||||
}
|
||||
prefix := fmt.Sprintf("%02d-%s", index+1, filepath.Base(devPath))
|
||||
commands := storageSATCommands(devPath, extended)
|
||||
deviceOutputs := make(map[string][]byte, len(commands))
|
||||
for cmdIndex, job := range commands {
|
||||
if ctx.Err() != nil {
|
||||
break
|
||||
}
|
||||
name := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+1, job.name)
|
||||
out, err := runSATCommandCtx(ctx, verboseLog, job.name, job.cmd, nil, logFunc)
|
||||
deviceOutputs[job.name] = out
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, name), out, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
@@ -737,7 +746,28 @@ func (s *System) RunStorageAcceptancePack(ctx context.Context, baseDir string, e
|
||||
key := filepath.Base(devPath) + "_" + strings.ReplaceAll(job.name, "-", "_")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", key, rc)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", key, status)
|
||||
|
||||
// smartctl -t short only launches the self-test on the drive firmware and
|
||||
// returns immediately ("Testing has begun"); unlike `nvme device-self-test
|
||||
// --wait`, smartctl has no blocking mode, so we must poll the drive
|
||||
// ourselves until the self-test actually finishes.
|
||||
if job.name == "smartctl-self-test-short" && err == nil {
|
||||
statusName := "smartctl-self-test-status"
|
||||
statusOut := waitForSmartctlSelfTest(ctx, verboseLog, devPath, logFunc)
|
||||
deviceOutputs[statusName] = statusOut
|
||||
statusFile := fmt.Sprintf("%s-%02d-%s.log", prefix, cmdIndex+2, statusName)
|
||||
if writeErr := os.WriteFile(filepath.Join(runDir, statusFile), statusOut, 0644); writeErr != nil {
|
||||
return "", writeErr
|
||||
}
|
||||
sStatus, sRC := classifySATResult(statusName, statusOut, nil)
|
||||
stats.Add(sStatus)
|
||||
sKey := filepath.Base(devPath) + "_" + strings.ReplaceAll(statusName, "-", "_")
|
||||
fmt.Fprintf(&summary, "%s_rc=%d\n", sKey, sRC)
|
||||
fmt.Fprintf(&summary, "%s_status=%s\n", sKey, sStatus)
|
||||
}
|
||||
}
|
||||
reportText := GenerateDiskReportText(index+1, devPath, deviceOutputs, time.Now().UTC())
|
||||
_ = os.WriteFile(filepath.Join(runDir, "disk-"+prefix+"-report.txt"), []byte(reportText), 0644)
|
||||
}
|
||||
|
||||
writeSATStats(&summary, stats)
|
||||
@@ -1170,6 +1200,42 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
||||
return out, err
|
||||
}
|
||||
|
||||
// smartctlSelfTestPollInterval and smartctlSelfTestTimeout bound the polling
// that follows `smartctl -t short`: the drive is re-queried once per interval,
// and polling is abandoned once the timeout has elapsed. The SMART/ATA specs
// put a short self-test at roughly two minutes, so the timeout leaves headroom
// over that figure.
const (
	smartctlSelfTestPollInterval = 5 * time.Second
	smartctlSelfTestTimeout      = 4 * time.Minute
)
|
||||
|
||||
// waitForSmartctlSelfTest polls `smartctl -a` until the short self-test
|
||||
// started on devPath finishes (or the timeout/context elapses) and returns
|
||||
// the final output, which reflects the actual test result rather than the
|
||||
// "Testing has begun" launch acknowledgement.
|
||||
func waitForSmartctlSelfTest(ctx context.Context, verboseLog, devPath string, logFunc func(string)) []byte {
|
||||
deadline := time.Now().Add(smartctlSelfTestTimeout)
|
||||
var last []byte
|
||||
for {
|
||||
out, _ := runSATCommandCtx(ctx, verboseLog, "smartctl-self-test-status", []string{"smartctl", "-a", devPath}, nil, nil)
|
||||
last = out
|
||||
if ctx.Err() != nil {
|
||||
return last
|
||||
}
|
||||
lower := bytes.ToLower(out)
|
||||
if !bytes.Contains(lower, []byte("self-test routine in progress")) &&
|
||||
!bytes.Contains(lower, []byte("% of test remaining")) {
|
||||
return last
|
||||
}
|
||||
if time.Now().After(deadline) {
|
||||
return last
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return last
|
||||
case <-time.After(smartctlSelfTestPollInterval):
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func listStorageDevices() ([]string, error) {
|
||||
out, err := satExecCommand("lsblk", "-dn", "-o", "NAME,TYPE,TRAN").Output()
|
||||
if err != nil {
|
||||
@@ -1178,26 +1244,27 @@ func listStorageDevices() ([]string, error) {
|
||||
return parseStorageDevices(string(out)), nil
|
||||
}
|
||||
|
||||
// storageSATCommands returns the commands to run for a single storage device.
|
||||
// extended=false (Check): read-only SMART/NVMe data collection, no self-test.
|
||||
// extended=true (Load): data collection + short self-test.
|
||||
func storageSATCommands(devPath string, extended bool) []satJob {
|
||||
if strings.Contains(filepath.Base(devPath), "nvme") {
|
||||
selfTestLevel := "1"
|
||||
if extended {
|
||||
selfTestLevel = "2"
|
||||
}
|
||||
return []satJob{
|
||||
jobs := []satJob{
|
||||
{name: "nvme-id-ctrl", cmd: []string{"nvme", "id-ctrl", devPath, "-o", "json"}},
|
||||
{name: "nvme-smart-log", cmd: []string{"nvme", "smart-log", devPath, "-o", "json"}},
|
||||
{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", selfTestLevel, "--wait"}},
|
||||
}
|
||||
if extended {
|
||||
jobs = append(jobs, satJob{name: "nvme-device-self-test", cmd: []string{"nvme", "device-self-test", devPath, "-s", "1", "--wait"}})
|
||||
}
|
||||
return jobs
|
||||
}
|
||||
smartTestType := "short"
|
||||
if extended {
|
||||
smartTestType = "long"
|
||||
}
|
||||
return []satJob{
|
||||
jobs := []satJob{
|
||||
{name: "smartctl-health", cmd: []string{"smartctl", "-H", "-A", devPath}},
|
||||
{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", smartTestType, devPath}},
|
||||
}
|
||||
if extended {
|
||||
jobs = append(jobs, satJob{name: "smartctl-self-test-short", cmd: []string{"smartctl", "-t", "short", devPath}})
|
||||
}
|
||||
return jobs
|
||||
}
|
||||
|
||||
func (s *satStats) Add(status string) {
|
||||
|
||||
@@ -14,14 +14,42 @@ import (
|
||||
func TestStorageSATCommands(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
nvme := storageSATCommands("/dev/nvme0n1", false)
|
||||
if len(nvme) != 3 || nvme[2].cmd[0] != "nvme" {
|
||||
t.Fatalf("unexpected nvme commands: %#v", nvme)
|
||||
// Check mode (extended=false): read-only collection, no self-test.
|
||||
nvmeCheck := storageSATCommands("/dev/nvme0n1", false)
|
||||
if len(nvmeCheck) != 2 {
|
||||
t.Fatalf("check nvme: want 2 commands, got %d: %#v", len(nvmeCheck), nvmeCheck)
|
||||
}
|
||||
if nvmeCheck[0].name != "nvme-id-ctrl" || nvmeCheck[1].name != "nvme-smart-log" {
|
||||
t.Fatalf("check nvme: unexpected command names: %#v", nvmeCheck)
|
||||
}
|
||||
|
||||
sata := storageSATCommands("/dev/sda", false)
|
||||
if len(sata) != 2 || sata[0].cmd[0] != "smartctl" {
|
||||
t.Fatalf("unexpected sata commands: %#v", sata)
|
||||
sataCheck := storageSATCommands("/dev/sda", false)
|
||||
if len(sataCheck) != 1 || sataCheck[0].cmd[0] != "smartctl" {
|
||||
t.Fatalf("check sata: want 1 smartctl command, got %#v", sataCheck)
|
||||
}
|
||||
|
||||
// Load mode (extended=true): collection + short self-test.
|
||||
nvmeLoad := storageSATCommands("/dev/nvme0n1", true)
|
||||
if len(nvmeLoad) != 3 || nvmeLoad[2].name != "nvme-device-self-test" {
|
||||
t.Fatalf("load nvme: want 3 commands with self-test last, got %#v", nvmeLoad)
|
||||
}
|
||||
if got := nvmeLoad[2].cmd[len(nvmeLoad[2].cmd)-3]; got != "-s" {
|
||||
t.Fatalf("load nvme: want -s flag, got %q", got)
|
||||
}
|
||||
if got := nvmeLoad[2].cmd[len(nvmeLoad[2].cmd)-2]; got != "1" {
|
||||
t.Fatalf("load nvme: want self-test level 1, got %q", got)
|
||||
}
|
||||
|
||||
sataLoad := storageSATCommands("/dev/sda", true)
|
||||
if len(sataLoad) != 2 || sataLoad[1].name != "smartctl-self-test-short" {
|
||||
t.Fatalf("load sata: want 2 commands with short self-test last, got %#v", sataLoad)
|
||||
}
|
||||
// cmd is: smartctl -t short /dev/sda
|
||||
if got := sataLoad[1].cmd[1]; got != "-t" {
|
||||
t.Fatalf("load sata: want -t flag at index 1, got %q", got)
|
||||
}
|
||||
if got := sataLoad[1].cmd[2]; got != "short" {
|
||||
t.Fatalf("load sata: want short at index 2, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
517
audit/internal/platform/storage_report.go
Normal file
517
audit/internal/platform/storage_report.go
Normal file
@@ -0,0 +1,517 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// GenerateDiskReportText builds a human-readable text report for one storage
|
||||
// device from the raw command outputs collected during storage SAT.
|
||||
//
|
||||
// outputs keys match satJob.name: "nvme-id-ctrl", "nvme-smart-log",
|
||||
// "smartctl-health", "smartctl-self-test-short".
|
||||
func GenerateDiskReportText(index int, devPath string, outputs map[string][]byte, ts time.Time) string {
|
||||
var b strings.Builder
|
||||
devName := filepath.Base(devPath)
|
||||
line := strings.Repeat("=", 80)
|
||||
b.WriteString(line + "\n")
|
||||
fmt.Fprintf(&b, "Disk %-3d %s\n", index, devPath)
|
||||
b.WriteString(line + "\n")
|
||||
|
||||
isNVMe := strings.Contains(devName, "nvme")
|
||||
if isNVMe {
|
||||
writeNVMeReport(&b, outputs)
|
||||
} else {
|
||||
writeSATAReport(&b, outputs)
|
||||
}
|
||||
|
||||
b.WriteString("\n")
|
||||
fmt.Fprintf(&b, "Collected : %s\n", ts.UTC().Format("2006-01-02 15:04:05 UTC"))
|
||||
b.WriteString(line + "\n")
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// ── NVMe ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
// nvmeIdCtrl holds the subset of the "nvme-id-ctrl" JSON output that the disk
// report uses: model, serial and firmware strings plus two capacity fields.
// writeNVMeReport prefers TotalCap and falls back to NVMCap when TotalCap is
// zero; both are passed to formatCapacityGB, which treats them as byte counts.
type nvmeIdCtrl struct {
	ModelNumber  string `json:"mn"`
	SerialNumber string `json:"sn"`
	Firmware     string `json:"fr"`
	TotalCap     uint64 `json:"tnvmcap"`
	NVMCap       uint64 `json:"nvmcap"`
}
|
||||
|
||||
// nvmeU64 decodes a counter that nvme-cli may emit either as a plain JSON
// number or, in some versions, as a {"lo":n,"hi":n} object for 128-bit
// counters. For the object form only the "lo" half is returned; "hi" is
// decoded but ignored. Empty input and anything that matches neither shape
// yield 0.
func nvmeU64(raw json.RawMessage) uint64 {
	if len(raw) == 0 {
		return 0
	}

	var plain uint64
	if err := json.Unmarshal(raw, &plain); err == nil {
		return plain
	}

	var wide struct {
		Lo uint64 `json:"lo"`
		Hi uint64 `json:"hi"`
	}
	if err := json.Unmarshal(raw, &wide); err != nil {
		return 0
	}
	return wide.Lo
}
|
||||
|
||||
// nvmeSmartLogRaw mirrors the fields of the "nvme-smart-log" JSON output that
// the disk report consumes. Fields typed json.RawMessage are kept undecoded
// and later passed through nvmeU64, which accepts either a plain number or a
// {"lo","hi"} object; the uint64 fields are decoded directly.
type nvmeSmartLogRaw struct {
	CriticalWarning  uint64          `json:"critical_warning"`
	Temperature      json.RawMessage `json:"temperature"`
	AvailSpare       uint64          `json:"avail_spare"`
	SpareThresh      uint64          `json:"spare_thresh"`
	PercentUsed      uint64          `json:"percent_used"`
	DataUnitsRead    json.RawMessage `json:"data_units_read"`
	DataUnitsWritten json.RawMessage `json:"data_units_written"`
	PowerCycles      json.RawMessage `json:"power_cycles"`
	PowerOnHours     json.RawMessage `json:"power_on_hours"`
	UnsafeShutdowns  json.RawMessage `json:"unsafe_shutdowns"`
	MediaErrors      json.RawMessage `json:"media_errors"`
	NumErrLogEntries json.RawMessage `json:"num_err_log_entries"`
}
|
||||
|
||||
func writeNVMeReport(b *strings.Builder, outputs map[string][]byte) {
|
||||
// id-ctrl
|
||||
var ctrl nvmeIdCtrl
|
||||
if data := outputs["nvme-id-ctrl"]; len(data) > 0 {
|
||||
_ = json.Unmarshal(data, &ctrl)
|
||||
}
|
||||
|
||||
model := strings.TrimSpace(ctrl.ModelNumber)
|
||||
serial := strings.TrimSpace(ctrl.SerialNumber)
|
||||
firmware := strings.TrimSpace(ctrl.Firmware)
|
||||
|
||||
capacityGB := ""
|
||||
if ctrl.TotalCap > 0 {
|
||||
capacityGB = formatCapacityGB(ctrl.TotalCap)
|
||||
} else if ctrl.NVMCap > 0 {
|
||||
capacityGB = formatCapacityGB(ctrl.NVMCap)
|
||||
}
|
||||
|
||||
writeField(b, "Model", model)
|
||||
writeField(b, "Serial", serial)
|
||||
writeField(b, "Firmware", firmware)
|
||||
if capacityGB != "" {
|
||||
writeField(b, "Capacity", capacityGB)
|
||||
}
|
||||
|
||||
// smart-log
|
||||
data := outputs["nvme-smart-log"]
|
||||
if len(data) == 0 {
|
||||
b.WriteString("\n(no SMART data)\n")
|
||||
return
|
||||
}
|
||||
var sl nvmeSmartLogRaw
|
||||
if err := json.Unmarshal(data, &sl); err != nil {
|
||||
fmt.Fprintf(b, "\n(SMART parse error: %v)\n", err)
|
||||
return
|
||||
}
|
||||
|
||||
tempK := nvmeU64(sl.Temperature)
|
||||
tempC := int(tempK) - 273
|
||||
if tempC < 0 {
|
||||
tempC = 0
|
||||
}
|
||||
|
||||
critWarn := sl.CriticalWarning
|
||||
critWarnStr := "OK"
|
||||
if critWarn != 0 {
|
||||
critWarnStr = fmt.Sprintf("0x%02X", critWarn)
|
||||
}
|
||||
|
||||
poh := nvmeU64(sl.PowerOnHours)
|
||||
pc := nvmeU64(sl.PowerCycles)
|
||||
us := nvmeU64(sl.UnsafeShutdowns)
|
||||
me := nvmeU64(sl.MediaErrors)
|
||||
nel := nvmeU64(sl.NumErrLogEntries)
|
||||
|
||||
// data_units are in 1000 × 512-byte sectors = 512,000 bytes each
|
||||
dataRead := float64(nvmeU64(sl.DataUnitsRead)) * 512000 / 1e9
|
||||
dataWritten := float64(nvmeU64(sl.DataUnitsWritten)) * 512000 / 1e9
|
||||
|
||||
writeSectionHeader(b, "Health")
|
||||
writeField(b, "Temperature", fmt.Sprintf("%d °C", tempC))
|
||||
writeField(b, "Critical Warning", critWarnStr)
|
||||
writeField(b, "Percentage Used", fmt.Sprintf("%d %%", sl.PercentUsed))
|
||||
writeField(b, "Available Spare", fmt.Sprintf("%d %% (threshold: %d %%)", sl.AvailSpare, sl.SpareThresh))
|
||||
|
||||
writeSectionHeader(b, "Usage")
|
||||
writeField(b, "Power On Hours", fmt.Sprintf("%s h", formatUint(poh)))
|
||||
writeField(b, "Power Cycles", formatUint(pc))
|
||||
writeField(b, "Unsafe Shutdowns", formatUint(us))
|
||||
writeField(b, "Data Written", fmt.Sprintf("%.1f GB", dataWritten))
|
||||
writeField(b, "Data Read", fmt.Sprintf("%.1f GB", dataRead))
|
||||
|
||||
writeSectionHeader(b, "Errors")
|
||||
writeField(b, "Media Errors", formatUint(me))
|
||||
writeField(b, "Error Log Entries", formatUint(nel))
|
||||
|
||||
capacityBytes := ctrl.TotalCap
|
||||
if capacityBytes == 0 {
|
||||
capacityBytes = ctrl.NVMCap
|
||||
}
|
||||
writeResourceSection(b, resourceInfo{
|
||||
powerOnHours: poh,
|
||||
writtenBytes: uint64(nvmeU64(sl.DataUnitsWritten)) * 512000,
|
||||
readBytes: uint64(nvmeU64(sl.DataUnitsRead)) * 512000,
|
||||
capacityBytes: capacityBytes,
|
||||
})
|
||||
|
||||
if selfTest := outputs["nvme-device-self-test"]; len(selfTest) > 0 {
|
||||
writeSectionHeader(b, "Self-Test")
|
||||
result := parseSelfTestResult(string(selfTest))
|
||||
writeField(b, "Result", result)
|
||||
}
|
||||
}
|
||||
|
||||
// ── SATA / SAS (smartctl) ────────────────────────────────────────────────────
|
||||
|
||||
// Patterns used to pull fields out of smartctl's text output. All are
// compiled once at package scope.
var (
	// smartHealthRE captures the verdict word that follows the
	// overall-health line (case-insensitive).
	smartHealthRE = regexp.MustCompile(`(?i)SMART overall-health self-assessment test result:\s*(\S+)`)
	// smartAttrLineRE matches one row of the vendor attribute table and
	// captures, in order: ID, attribute name, VALUE, WORST, THRESH and the
	// trailing raw value. The hex FLAG column and the three columns that
	// precede the raw value are matched but not captured.
	smartAttrLineRE = regexp.MustCompile(
		`^\s*(\d{1,3})\s+(\S+)\s+0x[0-9a-fA-F]+\s+(\d{1,3})\s+(\d{1,3})\s+(\d{1,3})\s+\S+\s+\S+\s+\S+\s+(.+?)\s*$`,
	)
	// The identity patterns below are multi-line and case-insensitive; each
	// captures the rest of the line after its label.
	smartModelRE    = regexp.MustCompile(`(?im)^Device Model:\s*(.+)$`)
	smartSerialRE   = regexp.MustCompile(`(?im)^Serial Number:\s*(.+)$`)
	smartFirmwareRE = regexp.MustCompile(`(?im)^Firmware Version:\s*(.+)$`)
	smartCapacityRE = regexp.MustCompile(`(?im)^User Capacity:\s*(.+)$`)
)
|
||||
|
||||
// smartAttr is one parsed row of the smartctl attribute table as produced by
// parseSMARTAttrs: the numeric ID, the attribute name, the VALUE, WORST and
// THRESH columns, and the raw value kept as trimmed text.
type smartAttr struct {
	ID        int
	Name      string
	Value     int
	Worst     int
	Threshold int
	Raw       string
}
|
||||
|
||||
func writeSATAReport(b *strings.Builder, outputs map[string][]byte) {
|
||||
data := outputs["smartctl-health"]
|
||||
if len(data) == 0 {
|
||||
b.WriteString("\n(no SMART data)\n")
|
||||
return
|
||||
}
|
||||
text := string(data)
|
||||
|
||||
// Identity
|
||||
if m := smartModelRE.FindStringSubmatch(text); m != nil {
|
||||
writeField(b, "Model", strings.TrimSpace(m[1]))
|
||||
}
|
||||
if m := smartSerialRE.FindStringSubmatch(text); m != nil {
|
||||
writeField(b, "Serial", strings.TrimSpace(m[1]))
|
||||
}
|
||||
if m := smartFirmwareRE.FindStringSubmatch(text); m != nil {
|
||||
writeField(b, "Firmware", strings.TrimSpace(m[1]))
|
||||
}
|
||||
var capacityBytes uint64
|
||||
if m := smartCapacityRE.FindStringSubmatch(text); m != nil {
|
||||
cap := strings.TrimSpace(m[1])
|
||||
capacityBytes = parseLeadingUint(cap)
|
||||
// trim everything after "[" if present (e.g. "500,107,862,016 bytes [500 GB]")
|
||||
if idx := strings.Index(cap, "["); idx > 0 {
|
||||
cap = strings.TrimSpace(cap[idx+1:])
|
||||
cap = strings.TrimSuffix(cap, "]")
|
||||
}
|
||||
writeField(b, "Capacity", cap)
|
||||
}
|
||||
|
||||
writeSectionHeader(b, "Health")
|
||||
health := "unknown"
|
||||
if m := smartHealthRE.FindStringSubmatch(text); m != nil {
|
||||
health = strings.TrimSpace(m[1])
|
||||
}
|
||||
writeField(b, "SMART Overall Health", health)
|
||||
|
||||
attrs := parseSMARTAttrs(text)
|
||||
if len(attrs) > 0 {
|
||||
writeSectionHeader(b, "SMART Attributes")
|
||||
fmt.Fprintf(b, " %-4s %-32s %5s %5s %5s %s\n", "ID", "Attribute", "Value", "Worst", "Thresh", "Raw")
|
||||
b.WriteString(" " + strings.Repeat("-", 72) + "\n")
|
||||
for _, a := range attrs {
|
||||
fmt.Fprintf(b, " %-4d %-32s %5d %5d %5d %s\n",
|
||||
a.ID, a.Name, a.Value, a.Worst, a.Threshold, a.Raw)
|
||||
}
|
||||
}
|
||||
|
||||
var poh, writtenLBAs, readLBAs uint64
|
||||
var readValue int
|
||||
hasReadValue := false
|
||||
for _, a := range attrs {
|
||||
switch a.ID {
|
||||
case 9: // Power_On_Hours
|
||||
poh = parseLeadingUint(a.Raw)
|
||||
case 241: // Total_LBAs_Written
|
||||
writtenLBAs = parseLeadingUint(a.Raw)
|
||||
case 242: // Total_LBAs_Read
|
||||
readLBAs = parseLeadingUint(a.Raw)
|
||||
readValue = a.Value
|
||||
hasReadValue = true
|
||||
}
|
||||
}
|
||||
const sataSectorBytes = 512
|
||||
writeResourceSection(b, resourceInfo{
|
||||
powerOnHours: poh,
|
||||
writtenBytes: writtenLBAs * sataSectorBytes,
|
||||
readBytes: readLBAs * sataSectorBytes,
|
||||
capacityBytes: capacityBytes,
|
||||
readPercent: 100 - readValue,
|
||||
hasReadPercent: hasReadValue,
|
||||
})
|
||||
|
||||
selfTest := outputs["smartctl-self-test-status"]
|
||||
if len(selfTest) == 0 {
|
||||
selfTest = outputs["smartctl-self-test-short"]
|
||||
}
|
||||
if len(selfTest) > 0 {
|
||||
writeSectionHeader(b, "Self-Test")
|
||||
result := parseSelfTestResult(string(selfTest))
|
||||
writeField(b, "Result", result)
|
||||
}
|
||||
}
|
||||
|
||||
func parseSMARTAttrs(text string) []smartAttr {
|
||||
var attrs []smartAttr
|
||||
inTable := false
|
||||
for _, line := range strings.Split(text, "\n") {
|
||||
if strings.Contains(line, "ATTRIBUTE_NAME") {
|
||||
inTable = true
|
||||
continue
|
||||
}
|
||||
if !inTable {
|
||||
continue
|
||||
}
|
||||
m := smartAttrLineRE.FindStringSubmatch(line)
|
||||
if m == nil {
|
||||
if strings.TrimSpace(line) == "" {
|
||||
inTable = false
|
||||
}
|
||||
continue
|
||||
}
|
||||
id, _ := strconv.Atoi(m[1])
|
||||
val, _ := strconv.Atoi(m[3])
|
||||
worst, _ := strconv.Atoi(m[4])
|
||||
thresh, _ := strconv.Atoi(m[5])
|
||||
attrs = append(attrs, smartAttr{
|
||||
ID: id,
|
||||
Name: m[2],
|
||||
Value: val,
|
||||
Worst: worst,
|
||||
Threshold: thresh,
|
||||
Raw: strings.TrimSpace(m[6]),
|
||||
})
|
||||
}
|
||||
return attrs
|
||||
}
|
||||
|
||||
// parseSelfTestResult condenses self-test output into a one-line summary. It
// accepts nvme device-self-test output, smartctl -a output (post-completion
// status) or smartctl -t short output (launch acknowledgement).
//
// The first rule that matches wins, all matching case-insensitively:
//  1. a "Self-test execution status" line, joined by single spaces with up
//     to three following lines that are non-blank and contain no colon
//     (smartctl -a wraps the description onto continuation lines);
//  2. the first line mentioning "self-test status" or "self test status";
//  3. the first line mentioning "testing has begun", "started" or "complete";
//  4. the last non-empty line.
//
// Blank input yields "no output".
func parseSelfTestResult(text string) string {
	trimmed := strings.TrimSpace(text)
	if trimmed == "" {
		return "no output"
	}

	rows := strings.Split(trimmed, "\n")
	folded := make([]string, len(rows))
	for i, row := range rows {
		folded[i] = strings.ToLower(row)
	}
	// firstMatch returns the index of the first row whose lower-cased form
	// contains any of the needles, or -1 when none does.
	firstMatch := func(needles ...string) int {
		for i, low := range folded {
			for _, needle := range needles {
				if strings.Contains(low, needle) {
					return i
				}
			}
		}
		return -1
	}

	if at := firstMatch("self-test execution status"); at >= 0 {
		summary := strings.TrimSpace(rows[at])
		limit := at + 4
		if limit > len(rows) {
			limit = len(rows)
		}
		for _, next := range rows[at+1 : limit] {
			piece := strings.TrimSpace(next)
			if piece == "" || strings.Contains(piece, ":") {
				break
			}
			summary += " " + piece
		}
		return summary
	}

	// nvme device-self-test: "Short Device Self-Test Status : 0x0" or similar.
	if at := firstMatch("self-test status", "self test status"); at >= 0 {
		return strings.TrimSpace(rows[at])
	}

	// smartctl -t short: "Testing has begun" or "Short BGST started".
	if at := firstMatch("testing has begun", "started", "complete"); at >= 0 {
		return strings.TrimSpace(rows[at])
	}

	// Fallback: last non-empty line.
	for i := len(rows) - 1; i >= 0; i-- {
		if candidate := strings.TrimSpace(rows[i]); candidate != "" {
			return candidate
		}
	}
	return "done"
}
|
||||
|
||||
// ── Resource (pseudographic usage bars) ────────────────────────────────────────
|
||||
|
||||
// designLifeYears and dwpd model the drive's rated endurance as 1
// drive-write-per-day for 5 years. This is the baseline enterprise endurance
// spec used when the vendor's own TBW/DWPD rating isn't available from
// SMART/NVMe data. writeResourceSection derives both the uptime ceiling and
// the rated written-bytes ceiling from these two values.
const (
	designLifeYears = 5
	dwpd            = 1.0
)
|
||||
|
||||
// resourceInfo carries the counters that writeResourceSection turns into the
// "Resource" usage bars. A zero capacityBytes makes the Written line fall back
// to a plain byte count without a bar.
type resourceInfo struct {
	powerOnHours   uint64
	writtenBytes   uint64
	readBytes      uint64
	capacityBytes  uint64
	readPercent    int  // only meaningful when hasReadPercent
	hasReadPercent bool // true when the source SMART attribute exposes a normalized read-wear value
}
|
||||
|
||||
func writeResourceSection(b *strings.Builder, r resourceInfo) {
|
||||
writeSectionHeader(b, "Resource")
|
||||
|
||||
const maxLifeHours = designLifeYears * 365 * 24
|
||||
upFrac := float64(r.powerOnHours) / float64(maxLifeHours)
|
||||
fmt.Fprintf(b, " %-9s %s %s / %s (%s)\n",
|
||||
"Uptime", progressBar(upFrac, 24), formatHoursHuman(r.powerOnHours), formatHoursHuman(maxLifeHours), formatPercent(upFrac*100))
|
||||
|
||||
if r.capacityBytes > 0 {
|
||||
maxWritten := float64(r.capacityBytes) * dwpd * designLifeYears * 365
|
||||
wFrac := float64(r.writtenBytes) / maxWritten
|
||||
fmt.Fprintf(b, " %-9s %s %s / %s (%s, %g DWPD×%dy)\n",
|
||||
"Written", progressBar(wFrac, 24), formatBytesHuman(float64(r.writtenBytes)), formatBytesHuman(maxWritten), formatPercent(wFrac*100), dwpd, designLifeYears)
|
||||
} else {
|
||||
fmt.Fprintf(b, " %-9s %s\n", "Written", formatBytesHuman(float64(r.writtenBytes)))
|
||||
}
|
||||
|
||||
if r.hasReadPercent {
|
||||
fmt.Fprintf(b, " %-9s %s %s (%d%%)\n",
|
||||
"Read", progressBar(float64(r.readPercent)/100, 24), formatBytesHuman(float64(r.readBytes)), r.readPercent)
|
||||
} else {
|
||||
fmt.Fprintf(b, " %-9s %s\n", "Read", formatBytesHuman(float64(r.readBytes)))
|
||||
}
|
||||
}
|
||||
|
||||
// progressBar draws frac as a bracketed bar of width cells, e.g.
// "[######------]": '#' for the filled share and '-' for the rest. frac is
// clamped to [0, 1], with NaN treated as 0, and the filled cell count is
// frac×width rounded to the nearest integer.
func progressBar(frac float64, width int) string {
	switch {
	case math.IsNaN(frac), frac < 0:
		frac = 0
	case frac > 1:
		frac = 1
	}
	done := int(math.Round(frac * float64(width)))

	var bar strings.Builder
	bar.WriteByte('[')
	bar.WriteString(strings.Repeat("#", done))
	bar.WriteString(strings.Repeat("-", width-done))
	bar.WriteByte(']')
	return bar.String()
}
|
||||
|
||||
// formatBytesHuman formats a byte count with decimal (SI, powers of 1000)
// units from B up to PB, e.g. "1.23 TB". Plain bytes are printed without
// decimals and every larger unit with two; values beyond the PB range stay in
// PB.
func formatBytesHuman(n float64) string {
	suffixes := [...]string{"B", "KB", "MB", "GB", "TB", "PB"}

	tier := 0
	for ; n >= 1000 && tier < len(suffixes)-1; tier++ {
		n /= 1000
	}

	if tier > 0 {
		return fmt.Sprintf("%.2f %s", n, suffixes[tier])
	}
	return fmt.Sprintf("%.0f %s", n, suffixes[tier])
}
|
||||
|
||||
// formatHoursHuman scales an hour count to a readable unit so uptimes do not
// show as raw four- or five-digit hour counts: below 48 hours it prints whole
// hours ("47 h"), below 365 days whole days ("51 d"), and beyond that years,
// with one decimal unless the value is a whole number of years ("1.5 y",
// "5 y").
func formatHoursHuman(hours uint64) string {
	const (
		hoursPerDay = 24
		daysPerYear = 365
	)
	if hours < 2*hoursPerDay {
		return fmt.Sprintf("%d h", hours)
	}

	days := float64(hours) / hoursPerDay
	if days < daysPerYear {
		return fmt.Sprintf("%.0f d", days)
	}

	years := days / daysPerYear
	if math.Trunc(years) != years {
		return fmt.Sprintf("%.1f y", years)
	}
	return fmt.Sprintf("%.0f y", years)
}
|
||||
|
||||
// formatPercent formats pct followed by a "%" sign. Values strictly between 0
// and 1 keep two decimals (e.g. "0.03%") so that small but non-zero usage is
// not rounded away to "0%"; every other value is printed without decimals.
func formatPercent(pct float64) string {
	subPercent := pct > 0 && pct < 1
	if !subPercent {
		return fmt.Sprintf("%.0f%%", pct)
	}
	return fmt.Sprintf("%.2f%%", pct)
}
|
||||
|
||||
// parseLeadingUint reads the unsigned integer at the start of s, e.g. from a
// SMART raw value or "500,107,862,016 bytes". Surrounding whitespace is
// trimmed, the leading run of ASCII digits and commas is taken, commas are
// dropped, and everything after the run is ignored. The parse error is
// discarded: with no leading digits the result is 0, and a value too large for
// uint64 yields whatever strconv.ParseUint returns for an out-of-range input.
func parseLeadingUint(s string) uint64 {
	s = strings.TrimSpace(s)

	// Whatever TrimLeft leaves behind starts at the first byte that is
	// neither a digit nor a comma, which marks the end of the numeric prefix.
	rest := strings.TrimLeft(s, "0123456789,")
	prefix := s[:len(s)-len(rest)]

	value, _ := strconv.ParseUint(strings.ReplaceAll(prefix, ",", ""), 10, 64)
	return value
}
|
||||
|
||||
// ── Formatting helpers ────────────────────────────────────────────────────────
|
||||
|
||||
// writeSectionHeader appends a blank line followed by a section rule of the
// form "-- <title> ---…" to b. The rule is padded with dashes to 76 bytes;
// titles long enough to reach that width on their own get no padding.
func writeSectionHeader(b *strings.Builder, title string) {
	label := "-- " + title + " "

	b.WriteByte('\n')
	b.WriteString(label)
	if pad := 76 - len(label); pad > 0 {
		b.WriteString(strings.Repeat("-", pad))
	}
	b.WriteByte('\n')
}
|
||||
|
||||
// writeField appends one "label : value" line to b, with the label
// left-aligned and padded to 20 columns so the values line up across fields.
func writeField(b *strings.Builder, label, value string) {
	row := fmt.Sprintf(" %-20s : %s\n", label, value)
	b.WriteString(row)
}
|
||||
|
||||
// formatCapacityGB renders a byte count as a decimal (SI) capacity: whole
// gigabytes below 1 TB ("960 GB"), terabytes with two decimals from there on
// ("1.92 TB").
//
// The unit is chosen from the rounded gigabyte value, so a size that rounds up
// to 1000 GB is shown as "1.00 TB" rather than "1000 GB". Terabytes use a
// fixed two-decimal format: the %g verb would keep only two significant digits
// ("1.9 TB", "15 TB") and switch to exponent notation at 100 TB and above.
func formatCapacityGB(bytes uint64) string {
	gb := math.Round(float64(bytes) / 1e9)
	if gb >= 1000 {
		return fmt.Sprintf("%.2f TB", float64(bytes)/1e12)
	}
	return fmt.Sprintf("%.0f GB", gb)
}
|
||||
|
||||
// formatUint renders n in base 10 with a comma between each group of three
// digits, e.g. 1234567 -> "1,234,567".
func formatUint(n uint64) string {
	digits := strconv.FormatUint(n, 10)

	// The leading group holds whatever is left over after splitting the
	// remaining digits into full groups of three (1 to 3 digits).
	lead := len(digits) % 3
	if lead == 0 {
		lead = 3
	}

	grouped := make([]byte, 0, len(digits)+len(digits)/3)
	grouped = append(grouped, digits[:lead]...)
	for pos := lead; pos < len(digits); pos += 3 {
		grouped = append(grouped, ',')
		grouped = append(grouped, digits[pos:pos+3]...)
	}
	return string(grouped)
}
|
||||
|
||||
// max returns the larger of a and b. Declaring it at package scope shadows the
// built-in max that Go 1.21 introduced.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|
||||
122
audit/internal/platform/storage_report_test.go
Normal file
122
audit/internal/platform/storage_report_test.go
Normal file
@@ -0,0 +1,122 @@
|
||||
package platform
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// testNVMeIdCtrl is a JSON fixture whose keys match the nvmeIdCtrl tags (mn,
// sn, fr, tnvmcap). The model string carries a trailing space, presumably to
// exercise the trimming done by the report writer.
var testNVMeIdCtrl = []byte(`{
	"mn": "SAMSUNG MZ1L2960HCJR-00A07 ",
	"sn": "S665NN0X415495",
	"fr": "GDC7602Q",
	"tnvmcap": 960197124096
}`)
|
||||
|
||||
var testNVMeSmartLog = []byte(`{
|
||||
"critical_warning": 0,
|
||||
"temperature": 311,
|
||||
"avail_spare": 100,
|
||||
"spare_thresh": 10,
|
||||
"percent_used": 0,
|
||||
"data_units_read": 1023456,
|
||||
"data_units_written": 738281,
|
||||
"power_cycles": 32,
|
||||
"power_on_hours": 1234,
|
||||
"unsafe_shutdowns": 3,
|
||||
"media_errors": 0,
|
||||
"num_err_log_entries": 0
|
||||
}`)
|
||||
|
||||
// lo/hi variant emitted by some nvme-cli versions
|
||||
var testNVMeSmartLogLoHi = []byte(`{
|
||||
"critical_warning": 0,
|
||||
"temperature": {"lo": 311, "hi": 0},
|
||||
"avail_spare": 100,
|
||||
"spare_thresh": 10,
|
||||
"percent_used": 0,
|
||||
"data_units_read": {"lo": 1023456, "hi": 0},
|
||||
"data_units_written": {"lo": 738281, "hi": 0},
|
||||
"power_cycles": {"lo": 32, "hi": 0},
|
||||
"power_on_hours": {"lo": 1234, "hi": 0},
|
||||
"unsafe_shutdowns": {"lo": 3, "hi": 0},
|
||||
"media_errors": {"lo": 0, "hi": 0},
|
||||
"num_err_log_entries": {"lo": 0, "hi": 0}
|
||||
}`)
|
||||
|
||||
var testSmartCtlHealth = []byte(`
|
||||
smartctl 7.3 2022-02-28 r5338 [x86_64-linux-5.15.0] (local build)
|
||||
Copyright (C) 2002-22, Bruce Allen, Christian Franke, www.smartmontools.org
|
||||
|
||||
=== START OF INFORMATION SECTION ===
|
||||
Device Model: SAMSUNG MZ1L2960HCJR-00A07
|
||||
Serial Number: S665NN0X415495
|
||||
Firmware Version: GDC7602Q
|
||||
User Capacity: 960,197,124,096 bytes [960 GB]
|
||||
|
||||
=== START OF READ SMART DATA SECTION ===
|
||||
SMART overall-health self-assessment test result: PASSED
|
||||
|
||||
SMART Attributes Data Structure revision number: 1
|
||||
Vendor Specific SMART Attributes with Thresholds:
|
||||
ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
|
||||
5 Reallocated_Sector_Ct 0x0032 100 100 000 Old_age Always - 0
|
||||
9 Power_On_Hours 0x0032 100 100 000 Old_age Always - 1234
|
||||
12 Power_Cycle_Count 0x0032 100 100 000 Old_age Always - 45
|
||||
177 Wear_Leveling_Count 0x0013 097 097 000 Pre-fail Always - 30
|
||||
190 Airflow_Temperature_Cel 0x0032 063 045 000 Old_age Always - 37
|
||||
`)
|
||||
|
||||
func TestGenerateDiskReportNVMe(t *testing.T) {
|
||||
t.Parallel()
|
||||
outputs := map[string][]byte{
|
||||
"nvme-id-ctrl": testNVMeIdCtrl,
|
||||
"nvme-smart-log": testNVMeSmartLog,
|
||||
}
|
||||
report := GenerateDiskReportText(1, "/dev/nvme0n1", outputs, time.Unix(0, 0).UTC())
|
||||
|
||||
assertContains(t, report, "Disk 1", "/dev/nvme0n1")
|
||||
assertContains(t, report, "SAMSUNG MZ1L2960HCJR-00A07")
|
||||
assertContains(t, report, "S665NN0X415495")
|
||||
assertContains(t, report, "GDC7602Q")
|
||||
assertContains(t, report, "38 °C") // 311 K - 273
|
||||
assertContains(t, report, "1,234 h") // power_on_hours with separator
|
||||
assertContains(t, report, "32") // power_cycles
|
||||
assertContains(t, report, "3") // unsafe_shutdowns
|
||||
assertContains(t, report, "378.0 GB") // data_units_written * 512000 / 1e9
|
||||
}
|
||||
|
||||
func TestGenerateDiskReportNVMeLoHi(t *testing.T) {
|
||||
t.Parallel()
|
||||
outputs := map[string][]byte{
|
||||
"nvme-id-ctrl": testNVMeIdCtrl,
|
||||
"nvme-smart-log": testNVMeSmartLogLoHi,
|
||||
}
|
||||
report := GenerateDiskReportText(1, "/dev/nvme0n1", outputs, time.Unix(0, 0).UTC())
|
||||
assertContains(t, report, "38 °C")
|
||||
assertContains(t, report, "1,234 h")
|
||||
}
|
||||
|
||||
func TestGenerateDiskReportSATA(t *testing.T) {
|
||||
t.Parallel()
|
||||
outputs := map[string][]byte{
|
||||
"smartctl-health": testSmartCtlHealth,
|
||||
}
|
||||
report := GenerateDiskReportText(2, "/dev/sda", outputs, time.Unix(0, 0).UTC())
|
||||
|
||||
assertContains(t, report, "Disk 2", "/dev/sda")
|
||||
assertContains(t, report, "SAMSUNG MZ1L2960HCJR-00A07")
|
||||
assertContains(t, report, "S665NN0X415495")
|
||||
assertContains(t, report, "PASSED")
|
||||
assertContains(t, report, "Reallocated_Sector_Ct")
|
||||
assertContains(t, report, "Power_On_Hours")
|
||||
}
|
||||
|
||||
// assertContains reports a test error for every needle that does not occur
// in text. It keeps going after a miss so that one run lists all of them.
func assertContains(t *testing.T, text string, needles ...string) {
	t.Helper()
	for i := range needles {
		if strings.Contains(text, needles[i]) {
			continue
		}
		t.Errorf("report missing %q\nreport:\n%s", needles[i], text)
	}
}
|
||||
280
audit/internal/webui/huawei_elabel.go
Normal file
280
audit/internal/webui/huawei_elabel.go
Normal file
@@ -0,0 +1,280 @@
|
||||
package webui
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// huaweiField is one elabel entry as returned by the read endpoint.
type huaweiField struct {
	// Name is the human-readable label.
	Name string `json:"name"`
	// Key identifies the field in write requests.
	Key string `json:"key"`
	// Value is the decoded current value; empty when the read failed.
	Value string `json:"value"`
	// ReadOnly marks fields that the write endpoint rejects.
	ReadOnly bool `json:"read_only,omitempty"`
}
|
||||
|
||||
// huaweiChange is a single requested edit: the field key and its new value.
type huaweiChange struct {
	Key   string `json:"key"`
	Value string `json:"value"`
}
|
||||
|
||||
// huaweiFieldDef describes how one elabel field is addressed over IPMI.
type huaweiFieldDef struct {
	// Name is the label shown to the user.
	Name string
	// Key is the identifier used in API requests.
	Key string
	// FruID, TypeID and FieldID are the three address bytes passed to the
	// OEM raw get/set commands.
	FruID   byte
	TypeID  byte
	FieldID byte
	// Special selects non-string handling: "chassis-type" | "guid".
	// Empty for ordinary string fields.
	Special string
}
|
||||
|
||||
var huaweiElabelDefs = []huaweiFieldDef{
|
||||
{"Device Name", "DeviceName", 0x00, 0x06, 0x01, ""},
|
||||
{"Device Serial Number", "DeviceSerialNumber", 0x00, 0x06, 0x03, ""},
|
||||
{"Product Name", "ProductName", 0x00, 0x03, 0x01, ""},
|
||||
{"Product Serial Number", "ProductSerialNumber", 0x00, 0x03, 0x04, ""},
|
||||
{"Product Asset Tag", "ProductAssetTag", 0x00, 0x03, 0x05, ""},
|
||||
{"Product Manufacturer", "ProductManufacturer", 0x00, 0x03, 0x00, ""},
|
||||
{"Mainboard Manufacturer", "MainboardManufacturer", 0x00, 0x02, 0x01, ""},
|
||||
{"Board Product Name", "BoardProductName", 0x00, 0x02, 0x02, ""},
|
||||
{"Chassis Part Number", "ChassisPartnumber", 0x00, 0x01, 0x01, ""},
|
||||
{"Chassis Type", "ChassisType", 0x00, 0x01, 0x00, "chassis-type"},
|
||||
{"IO Chassis Serial", "IOChassisSerialNumber", 0x01, 0x03, 0x04, ""},
|
||||
{"IO Chassis Asset Tag", "IOChassisAssetTag", 0x01, 0x03, 0x05, ""},
|
||||
{"GUID", "GUID", 0x00, 0x00, 0x00, "guid"},
|
||||
}
|
||||
|
||||
// huaweiGetRaw reads a string elabel field via OEM IPMI raw command.
|
||||
// Protocol: ipmitool raw 0x30 0x90 0x05 <fru_id> <type_id> <field_id> 0x00 0x30
|
||||
// Response: <length_byte> <ascii_byte1> ... (null-terminated)
|
||||
func huaweiGetRaw(ctx context.Context, def huaweiFieldDef) (string, error) {
|
||||
if def.Special == "guid" {
|
||||
return huaweiGetGUID(ctx)
|
||||
}
|
||||
args := []string{
|
||||
"0x30", "0x90", "0x05",
|
||||
fmt.Sprintf("0x%02x", def.FruID),
|
||||
fmt.Sprintf("0x%02x", def.TypeID),
|
||||
fmt.Sprintf("0x%02x", def.FieldID),
|
||||
"0x00", "0x30",
|
||||
}
|
||||
out, err := exec.CommandContext(ctx, "ipmitool", append([]string{"raw"}, args...)...).CombinedOutput()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return huaweiParseStringResponse(strings.TrimSpace(string(out)), def.Special), nil
|
||||
}
|
||||
|
||||
// huaweiParseStringResponse turns the whitespace-separated hex dump of an OEM
// IPMI response into a string value.
//
// Layout: <length_byte> <byte1> <byte2> ... The length byte is skipped, not
// interpreted. For special == "chassis-type" the first payload byte is
// returned formatted as "0x%02x". Otherwise payload bytes are collected as
// characters until a zero byte or an unparsable token is reached. Fewer than
// two tokens yields "".
func huaweiParseStringResponse(hexOut, special string) string {
	tokens := strings.Fields(hexOut)
	if len(tokens) < 2 {
		return ""
	}
	payload := tokens[1:] // tokens[0] is the length byte

	if special == "chassis-type" {
		// Response: <length=1> <type_byte>
		typeByte, err := strconv.ParseUint(payload[0], 16, 8)
		if err != nil {
			return ""
		}
		return fmt.Sprintf("0x%02x", typeByte)
	}

	decoded := make([]byte, 0, len(payload))
	for _, tok := range payload {
		v, err := strconv.ParseUint(tok, 16, 8)
		if err != nil || v == 0 {
			break
		}
		decoded = append(decoded, byte(v))
	}
	return string(decoded)
}
|
||||
|
||||
// huaweiGetGUID reads the system GUID via the standard IPMI Get System GUID
// command (ipmitool raw 0x06 0x08) and formats it as a UUID string.
//
// A failing ipmitool invocation is returned as an error that wraps the exec
// error and includes ipmitool's combined output. A response that does not
// consist of exactly 16 byte tokens yields ("", nil), so the caller shows an
// empty value rather than failing the whole read.
func huaweiGetGUID(ctx context.Context) (string, error) {
	out, err := exec.CommandContext(ctx, "ipmitool", "raw", "0x06", "0x08").CombinedOutput()
	if err != nil {
		return "", fmt.Errorf("ipmitool get system GUID: %w (output: %s)", err, strings.TrimSpace(string(out)))
	}
	parts := strings.Fields(string(out))
	if len(parts) != 16 {
		return "", nil
	}

	// iBMC returns bytes in reversed order; re-reverse to get canonical UUID.
	var rev [16]string
	for i, p := range parts {
		rev[15-i] = p
	}

	// Format as UUID: 4-2-2-2-6 byte groups.
	groups := []string{
		strings.Join(rev[0:4], ""),
		strings.Join(rev[4:6], ""),
		strings.Join(rev[6:8], ""),
		strings.Join(rev[8:10], ""),
		strings.Join(rev[10:16], ""),
	}
	return strings.Join(groups, "-"), nil
}
|
||||
|
||||
// huaweiChunks splits a value into 19-byte chunks for the OEM IPMI SET
// protocol and returns, per chunk, the argument strings
// <key> <length> <data bytes...> formatted as "0x%02x".
//
// Key byte: bit7=1 means more chunks follow; bits 0-6 = offset into string.
// Values longer than 63 bytes are truncated to 63. An empty value produces
// the single fixed chunk {"0x00", "0x01", "0x00"}.
func huaweiChunks(value string) [][]string {
	const (
		maxLen    = 63
		chunkSize = 19
	)

	if value == "" {
		return [][]string{{"0x00", "0x01", "0x00"}}
	}
	if len(value) > maxLen {
		value = value[:maxLen]
	}

	data := []byte(value)
	chunks := make([][]string, 0, (len(data)+chunkSize-1)/chunkSize)
	for start := 0; start < len(data); start += chunkSize {
		stop := start + chunkSize
		if stop > len(data) {
			stop = len(data)
		}

		key := byte(start)
		if stop < len(data) {
			key |= 0x80 // more chunks follow
		}

		piece := data[start:stop]
		args := make([]string, 0, 2+len(piece))
		args = append(args, fmt.Sprintf("0x%02x", key), fmt.Sprintf("0x%02x", len(piece)))
		for _, b := range piece {
			args = append(args, fmt.Sprintf("0x%02x", b))
		}
		chunks = append(chunks, args)
	}
	return chunks
}
|
||||
|
||||
func (h *handler) handleAPIHuaweiElabelRead(w http.ResponseWriter, r *http.Request) {
|
||||
ctx, cancel := context.WithTimeout(r.Context(), 60*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var fields []huaweiField
|
||||
for _, def := range huaweiElabelDefs {
|
||||
val, err := huaweiGetRaw(ctx, def)
|
||||
if err != nil {
|
||||
// First field failure likely means no Huawei BMC — abort with error.
|
||||
if len(fields) == 0 {
|
||||
msg := strings.TrimSpace(err.Error())
|
||||
writeError(w, http.StatusInternalServerError, "huawei elabel not available: "+msg)
|
||||
return
|
||||
}
|
||||
val = ""
|
||||
}
|
||||
fields = append(fields, huaweiField{
|
||||
Name: def.Name,
|
||||
Key: def.Key,
|
||||
Value: val,
|
||||
ReadOnly: def.Special == "guid" || def.Special == "chassis-type",
|
||||
})
|
||||
}
|
||||
writeJSON(w, fields)
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIHuaweiElabelWrite(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
Changes []huaweiChange `json:"changes"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
if len(req.Changes) == 0 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "no changes provided")
|
||||
return
|
||||
}
|
||||
|
||||
defByKey := make(map[string]huaweiFieldDef, len(huaweiElabelDefs))
|
||||
for _, d := range huaweiElabelDefs {
|
||||
defByKey[d.Key] = d
|
||||
}
|
||||
|
||||
for _, c := range req.Changes {
|
||||
def, ok := defByKey[c.Key]
|
||||
if !ok {
|
||||
writeError(w, http.StatusUnprocessableEntity, "unknown field key: "+c.Key)
|
||||
return
|
||||
}
|
||||
if def.Special == "guid" || def.Special == "chassis-type" {
|
||||
writeError(w, http.StatusUnprocessableEntity, "field is read-only: "+c.Key)
|
||||
return
|
||||
}
|
||||
if len(c.Value) > 63 {
|
||||
writeError(w, http.StatusUnprocessableEntity, "value too long (max 63 chars): "+c.Key)
|
||||
return
|
||||
}
|
||||
for _, ch := range c.Value {
|
||||
if ch < 0x20 || ch > 0x7E {
|
||||
writeError(w, http.StatusUnprocessableEntity, "non-printable character in value for: "+c.Key)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("huawei-elabel-write"),
|
||||
Name: fmt.Sprintf("Huawei Elabel Write (%d field(s))", len(req.Changes)),
|
||||
Target: "huawei-elabel-write",
|
||||
Priority: defaultTaskPriority("huawei-elabel-write", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{HuaweiElabelChanges: req.Changes},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func runHuaweiElabelWriteTask(ctx context.Context, j *jobState, p taskParams) error {
|
||||
defByKey := make(map[string]huaweiFieldDef, len(huaweiElabelDefs))
|
||||
for _, d := range huaweiElabelDefs {
|
||||
defByKey[d.Key] = d
|
||||
}
|
||||
|
||||
// Enable device name effective flag before writing.
|
||||
enableCmd := exec.CommandContext(ctx, "ipmitool", "raw", "0x30", "0x90", "0x21", "0x04", "0x01")
|
||||
if out, err := enableCmd.CombinedOutput(); err != nil {
|
||||
j.append("Warning: enable flag: " + strings.TrimSpace(string(out)))
|
||||
}
|
||||
|
||||
for _, c := range p.HuaweiElabelChanges {
|
||||
def := defByKey[c.Key]
|
||||
setPrefix := []string{
|
||||
"0x30", "0x90", "0x04",
|
||||
fmt.Sprintf("0x%02x", def.FruID),
|
||||
fmt.Sprintf("0x%02x", def.TypeID),
|
||||
fmt.Sprintf("0x%02x", def.FieldID),
|
||||
}
|
||||
|
||||
chunks := huaweiChunks(c.Value)
|
||||
j.append(fmt.Sprintf("Setting %s = %q (%d chunk(s))", c.Key, c.Value, len(chunks)))
|
||||
|
||||
for _, chunk := range chunks {
|
||||
args := append([]string{"raw"}, setPrefix...)
|
||||
args = append(args, chunk...)
|
||||
cmd := exec.CommandContext(ctx, "ipmitool", args...)
|
||||
if err := streamCmdJob(j, cmd); err != nil {
|
||||
return fmt.Errorf("set %s: %w", c.Key, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Commit after each field.
|
||||
commitCmd := exec.CommandContext(ctx, "ipmitool", "raw", "0x30", "0x90", "0x06", "0x00", "0xAA")
|
||||
if out, err := commitCmd.CombinedOutput(); err != nil {
|
||||
return fmt.Errorf("commit after %s: %w (output: %s)", c.Key, err, strings.TrimSpace(string(out)))
|
||||
}
|
||||
j.append("Committed " + c.Key)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
@@ -37,26 +37,41 @@ var fruEditableFields = map[string]struct {
|
||||
"Chassis Part Number": {"c", 0},
|
||||
"Chassis Serial Number": {"c", 1},
|
||||
"Chassis Serial": {"c", 1},
|
||||
"Chassis Extra": {"c", 2},
|
||||
// Board — vendor doc names and ipmitool abbreviated names
|
||||
"Board Manufacturer": {"b", 0},
|
||||
"Board Mfg": {"b", 0},
|
||||
"Board Product Name": {"b", 1},
|
||||
"Board Product": {"b", 1},
|
||||
"Board Manufacturer": {"b", 0},
|
||||
"Board Mfg": {"b", 0},
|
||||
"Board Product Name": {"b", 1},
|
||||
"Board Product": {"b", 1},
|
||||
"Board Serial Number": {"b", 2},
|
||||
"Board Serial": {"b", 2},
|
||||
"Board Part Number": {"b", 3},
|
||||
"Board Serial": {"b", 2},
|
||||
"Board Part Number": {"b", 3},
|
||||
// Product — vendor doc names and ipmitool abbreviated names
|
||||
"Product Manufacturer": {"p", 0},
|
||||
"Product Name": {"p", 1},
|
||||
"Product Part Number": {"p", 2},
|
||||
"Product Version": {"p", 3},
|
||||
"Product Manufacturer": {"p", 0},
|
||||
"Product Name": {"p", 1},
|
||||
"Product Part Number": {"p", 2},
|
||||
"Product Version": {"p", 3},
|
||||
"Product Serial Number": {"p", 4},
|
||||
"Product Serial": {"p", 4},
|
||||
"Product Serial": {"p", 4},
|
||||
"Product Asset Tag": {"p", 5},
|
||||
}
|
||||
|
||||
// fruExtraBaseIndex gives the starting ipmitool field index for each area's
|
||||
// repeated "<Area> Extra" custom fields, per the vendor FRU field doc (Chassis
|
||||
// extra fields start at 2, Board at 5, Product at 7). ipmitool fru print
|
||||
// emits one identically-named line per custom field, so parseFRUOutput
|
||||
// counts occurrences to recover the real index for each one.
|
||||
var fruExtraBaseIndex = map[string]struct {
|
||||
Area string
|
||||
Base int
|
||||
}{
|
||||
"Chassis Extra": {"c", 2},
|
||||
"Board Extra": {"b", 5},
|
||||
"Product Extra": {"p", 7},
|
||||
}
|
||||
|
||||
func parseFRUOutput(output string) []fruField {
|
||||
var fields []fruField
|
||||
extraSeen := map[string]int{}
|
||||
for _, line := range strings.Split(output, "\n") {
|
||||
// Lines look like: " Field Name : value"
|
||||
trimmed := strings.TrimLeft(line, " \t")
|
||||
@@ -64,33 +79,32 @@ func parseFRUOutput(output string) []fruField {
|
||||
continue
|
||||
}
|
||||
colon := strings.Index(trimmed, " : ")
|
||||
valueOffset := 3
|
||||
if colon < 0 {
|
||||
// try ": " with no leading space before colon
|
||||
colon = strings.Index(trimmed, ": ")
|
||||
valueOffset = 2
|
||||
if colon < 0 {
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(trimmed[:colon])
|
||||
value := strings.TrimSpace(trimmed[colon+2:])
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
editable, area, idx := fruFieldMeta(name)
|
||||
fields = append(fields, fruField{Name: name, Value: value, Editable: editable, Area: area, Index: idx})
|
||||
continue
|
||||
}
|
||||
name := strings.TrimSpace(trimmed[:colon])
|
||||
value := strings.TrimSpace(trimmed[colon+3:])
|
||||
value := strings.TrimSpace(trimmed[colon+valueOffset:])
|
||||
if name == "" {
|
||||
continue
|
||||
}
|
||||
editable, area, idx := fruFieldMeta(name)
|
||||
editable, area, idx := fruFieldMeta(name, extraSeen)
|
||||
fields = append(fields, fruField{Name: name, Value: value, Editable: editable, Area: area, Index: idx})
|
||||
}
|
||||
return fields
|
||||
}
|
||||
|
||||
func fruFieldMeta(name string) (editable bool, area string, index int) {
|
||||
func fruFieldMeta(name string, extraSeen map[string]int) (editable bool, area string, index int) {
|
||||
if e, ok := fruExtraBaseIndex[name]; ok {
|
||||
idx := e.Base + extraSeen[name]
|
||||
extraSeen[name]++
|
||||
return true, e.Area, idx
|
||||
}
|
||||
if e, ok := fruEditableFields[name]; ok {
|
||||
return true, e.Area, e.Index
|
||||
}
|
||||
@@ -201,84 +215,3 @@ func runIPMIFRUWriteTask(ctx context.Context, j *jobState, exportDir string, p t
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func renderIPMIFRUCard() string {
|
||||
return `<div class="card"><div class="card-head card-head-actions">IPMI — FRU<div class="card-head-buttons"><button class="btn btn-sm btn-secondary" onclick="fruRead()">Read</button></div></div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits FRU fields via ipmitool (In-Band, device 0). Works on any server with IPMI support.</p>
|
||||
<div id="fru-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
|
||||
<div id="fru-table"></div>
|
||||
</div></div>
|
||||
<script>
|
||||
var _fruActBtnStyle = 'width:22px;height:22px;padding:0;font-size:13px;line-height:1;border:1px solid var(--line);border-radius:3px;background:var(--surface);cursor:pointer;vertical-align:middle;';
|
||||
var _fruInputStyle = 'width:100%;padding:3px 6px;border:1.5px solid #888;border-radius:3px;font-size:13px;font-family:monospace;background:var(--surface);color:var(--ink);';
|
||||
function fruRead() {
|
||||
var status = document.getElementById('fru-status');
|
||||
status.textContent = 'Reading...'; status.style.color = 'var(--muted)';
|
||||
document.getElementById('fru-table').innerHTML = '';
|
||||
fetch('/api/tools/ipmi-fru', {cache:'no-store'})
|
||||
.then(function(r) { return r.json().then(function(d){if(!r.ok)throw new Error(d.error||r.statusText);return d;}); })
|
||||
.then(function(fields) {
|
||||
if (!fields || !fields.length) { status.textContent = 'No FRU fields returned.'; return; }
|
||||
status.textContent = '';
|
||||
var rows = fields.map(function(f) {
|
||||
var val = escHtml(f.value || '');
|
||||
return '<tr>'
|
||||
+ '<td style="color:var(--muted);white-space:nowrap;padding-right:16px;vertical-align:middle;font-size:13px">' + escHtml(f.name) + '</td>'
|
||||
+ '<td style="vertical-align:middle"><input class="fru-inp" style="' + _fruInputStyle + '"'
|
||||
+ ' data-area="' + escHtml(f.area||'') + '" data-index="' + (f.index||0) + '" data-name="' + escHtml(f.name) + '"'
|
||||
+ ' data-original="' + val + '" value="' + val + '" oninput="fruChanged(this)"></td>'
|
||||
+ '<td class="fru-act" style="display:none;white-space:nowrap;padding-left:6px;vertical-align:middle">'
|
||||
+ '<button style="' + _fruActBtnStyle + 'color:var(--ok-fg,green);margin-right:3px" title="Save" onclick="fruSave(this)">✓</button>'
|
||||
+ '<button style="' + _fruActBtnStyle + 'color:var(--crit-fg,#9f3a38)" title="Cancel" onclick="fruCancel(this)">✗</button>'
|
||||
+ '<span class="fru-msg" style="font-size:11px;margin-left:5px;color:var(--muted)"></span>'
|
||||
+ '</td></tr>';
|
||||
}).join('');
|
||||
document.getElementById('fru-table').innerHTML = '<table style="width:100%;border-collapse:collapse">' + rows + '</table>';
|
||||
})
|
||||
.catch(function(e) { status.textContent = 'Error: '+e.message; status.style.color='var(--crit-fg)'; });
|
||||
}
|
||||
function escHtml(s) {
|
||||
return String(s==null?'':s).replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>').replace(/"/g,'"');
|
||||
}
|
||||
function fruChanged(inp) {
|
||||
inp.closest('tr').querySelector('.fru-act').style.display = inp.value !== inp.dataset.original ? '' : 'none';
|
||||
}
|
||||
function fruCancel(btn) {
|
||||
var row = btn.closest('tr');
|
||||
var inp = row.querySelector('.fru-inp');
|
||||
inp.value = inp.dataset.original;
|
||||
row.querySelector('.fru-act').style.display = 'none';
|
||||
row.querySelector('.fru-msg').textContent = '';
|
||||
}
|
||||
function fruSave(btn) {
|
||||
var row = btn.closest('tr');
|
||||
var inp = row.querySelector('.fru-inp');
|
||||
var msg = row.querySelector('.fru-msg');
|
||||
var cancelBtn = row.querySelectorAll('.fru-act button')[1];
|
||||
btn.disabled = true; cancelBtn.disabled = true;
|
||||
msg.textContent = '…'; msg.style.color = 'var(--muted)';
|
||||
fetch('/api/tools/ipmi-fru/write', {method:'POST', headers:{'Content-Type':'application/json'},
|
||||
body: JSON.stringify({changes:[{area:inp.dataset.area, index:parseInt(inp.dataset.index,10), name:inp.dataset.name, value:inp.value}]})})
|
||||
.then(function(r){return r.json().then(function(d){if(!r.ok)throw new Error(d.error||r.statusText);return d;});})
|
||||
.then(function(d){
|
||||
var poll = setInterval(function(){
|
||||
fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(tasks){
|
||||
var t = Array.isArray(tasks)?tasks.find(function(x){return x.id===d.task_id;}):null;
|
||||
if(!t) return;
|
||||
if(t.status==='done'){
|
||||
clearInterval(poll);
|
||||
inp.dataset.original = inp.value;
|
||||
row.querySelector('.fru-act').style.display = 'none';
|
||||
msg.textContent = '';
|
||||
} else if(t.status==='failed'||t.status==='cancelled'){
|
||||
clearInterval(poll);
|
||||
msg.textContent = t.error||t.status; msg.style.color='var(--crit-fg)';
|
||||
btn.disabled=false; cancelBtn.disabled=false;
|
||||
}
|
||||
});
|
||||
},1500);
|
||||
})
|
||||
.catch(function(e){ msg.textContent='Error: '+e.message; msg.style.color='var(--crit-fg)'; btn.disabled=false; cancelBtn.disabled=false; });
|
||||
}
|
||||
</script>`
|
||||
}
|
||||
|
||||
59
audit/internal/webui/ipmi_fru_test.go
Normal file
59
audit/internal/webui/ipmi_fru_test.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package webui
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestParseFRUOutputExtraFields(t *testing.T) {
|
||||
// Realistic ipmitool fru print output: repeated "<Area> Extra" lines
|
||||
// (one per custom field) must resolve to sequential indices per the
|
||||
// vendor FRU doc (Chassis Extra starts at 2, Board Extra at 5, Product
|
||||
// Extra at 7), not all collapse onto the same index.
|
||||
out := `
|
||||
Product Manufacturer : Inspur
|
||||
Product Name : NF5280M6
|
||||
Product Part Number : PN123
|
||||
Product Version : 1.0
|
||||
Product Serial : SN123
|
||||
Product Asset Tag : ASSET01
|
||||
Product Extra : custom-p1
|
||||
Board Mfg : Inspur
|
||||
Board Product : BoardX
|
||||
Board Serial : BSN1
|
||||
Board Part Number : BPN1
|
||||
Board Extra : custom-b1
|
||||
Board Extra : custom-b2
|
||||
Board Extra : custom-b3
|
||||
Chassis Part Number : CPN1
|
||||
Chassis Serial : CSN1
|
||||
Chassis Extra : front-half
|
||||
Chassis Extra : back-half
|
||||
`
|
||||
fields := parseFRUOutput(out)
|
||||
|
||||
byName := map[string][]fruField{}
|
||||
for _, f := range fields {
|
||||
byName[f.Name] = append(byName[f.Name], f)
|
||||
}
|
||||
|
||||
assertMeta := func(name string, occurrence int, wantArea string, wantIndex int) {
|
||||
t.Helper()
|
||||
list := byName[name]
|
||||
if occurrence >= len(list) {
|
||||
t.Fatalf("expected occurrence %d of %q, got %d entries", occurrence, name, len(list))
|
||||
}
|
||||
f := list[occurrence]
|
||||
if f.Area != wantArea || f.Index != wantIndex {
|
||||
t.Errorf("%s[%d] = area:%q index:%d, want area:%q index:%d", name, occurrence, f.Area, f.Index, wantArea, wantIndex)
|
||||
}
|
||||
if !f.Editable {
|
||||
t.Errorf("%s[%d] expected editable", name, occurrence)
|
||||
}
|
||||
}
|
||||
|
||||
assertMeta("Product Asset Tag", 0, "p", 5)
|
||||
assertMeta("Product Extra", 0, "p", 7)
|
||||
assertMeta("Board Extra", 0, "b", 5)
|
||||
assertMeta("Board Extra", 1, "b", 6)
|
||||
assertMeta("Board Extra", 2, "b", 7)
|
||||
assertMeta("Chassis Extra", 0, "c", 2)
|
||||
assertMeta("Chassis Extra", 1, "c", 3)
|
||||
}
|
||||
@@ -404,13 +404,227 @@ loadNvidiaSelfHeal();
|
||||
func renderTools() string {
|
||||
return renderNVMeFormatCard() + `
|
||||
|
||||
` + renderSAADMICard() + `
|
||||
|
||||
` + renderIPMIFRUCard() + `
|
||||
` + renderFRUEditorCard() + `
|
||||
|
||||
` + renderRAIDMgmtCard()
|
||||
}
|
||||
|
||||
func renderFRUEditorCard() string {
|
||||
return `<div class="card"><div class="card-head card-head-actions">FRU / Elabel<div class="card-head-buttons"><button class="btn btn-sm btn-secondary" onclick="fruAllRead()">Read All</button></div></div><div class="card-body">
|
||||
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits hardware identity fields from all available sources. Each field shows its source method.</p>
|
||||
<div id="fru-all-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
|
||||
<div id="fru-src-status" style="display:none;margin-bottom:10px"></div>
|
||||
<div id="fru-all-table"></div>
|
||||
</div></div>
|
||||
<style>
|
||||
.fru-chip{display:inline-block;font-size:10px;font-weight:600;letter-spacing:.02em;padding:1px 6px;border-radius:3px;vertical-align:middle;white-space:nowrap;margin-right:8px;flex-shrink:0}
|
||||
.fru-chip-ipmi{background:#e8e8e8;color:#555}
|
||||
.fru-chip-huawei{background:#fff0e6;color:#b83}
|
||||
.fru-chip-saa{background:#e6f0ff;color:#557}
|
||||
.fru-inp-wrap{display:flex;align-items:center;gap:0}
|
||||
</style>
|
||||
<script>
|
||||
(function(){
|
||||
var _actBtn='width:22px;height:22px;padding:0;font-size:13px;line-height:1;border:1px solid var(--line);border-radius:3px;background:var(--surface);cursor:pointer;vertical-align:middle;';
|
||||
var _inp='width:100%;padding:3px 6px;border:1.5px solid #888;border-radius:3px;font-size:13px;font-family:monospace;background:var(--surface);color:var(--ink);';
|
||||
|
||||
var SOURCES = [
|
||||
{
|
||||
id: 'ipmi-fru',
|
||||
label: 'IPMI FRU',
|
||||
chipClass: 'fru-chip-ipmi',
|
||||
url: '/api/tools/ipmi-fru',
|
||||
writeUrl: '/api/tools/ipmi-fru/write',
|
||||
rowAttrs: function(f) {
|
||||
return 'data-source="ipmi-fru" data-area="'+esc(f.area||'')+'" data-index="'+(f.index||0)+'" data-name="'+esc(f.name)+'"';
|
||||
},
|
||||
writeBody: function(inp) {
|
||||
return JSON.stringify({changes:[{area:inp.dataset.area,index:parseInt(inp.dataset.index,10),name:inp.dataset.name,value:inp.value}]});
|
||||
},
|
||||
fieldName: function(f) { return f.name; },
|
||||
fieldValue: function(f) { return f.value||''; },
|
||||
readOnly: function(f) { return false; },
|
||||
},
|
||||
{
|
||||
id: 'huawei',
|
||||
label: 'Huawei iBMC',
|
||||
chipClass: 'fru-chip-huawei',
|
||||
url: '/api/tools/huawei-elabel',
|
||||
writeUrl: '/api/tools/huawei-elabel/write',
|
||||
rowAttrs: function(f) {
|
||||
return 'data-source="huawei" data-key="'+esc(f.key)+'"';
|
||||
},
|
||||
writeBody: function(inp) {
|
||||
return JSON.stringify({changes:[{key:inp.dataset.key,value:inp.value}]});
|
||||
},
|
||||
fieldName: function(f) { return f.name; },
|
||||
fieldValue: function(f) { return f.value||''; },
|
||||
readOnly: function(f) { return !!f.read_only; },
|
||||
},
|
||||
{
|
||||
id: 'saa-dmi',
|
||||
label: 'SAA DMI',
|
||||
chipClass: 'fru-chip-saa',
|
||||
url: '/api/tools/saa-dmi',
|
||||
writeUrl: '/api/tools/saa-dmi/write',
|
||||
rowAttrs: function(f) {
|
||||
return 'data-source="saa-dmi" data-shn="'+esc(f.shn)+'"';
|
||||
},
|
||||
writeBody: function(inp) {
|
||||
return JSON.stringify({changes:[{shn:inp.dataset.shn,value:inp.value}]});
|
||||
},
|
||||
fieldName: function(f) { return f.name; },
|
||||
fieldValue: function(f) { return f.value||''; },
|
||||
readOnly: function(f) { return false; },
|
||||
},
|
||||
];
|
||||
|
||||
function esc(s){return String(s==null?'':s).replace(/&/g,'&').replace(/</g,'<').replace(/>/g,'>').replace(/"/g,'"');}
|
||||
|
||||
function renderSrcStatus(perSource) {
|
||||
var bar = document.getElementById('fru-src-status');
|
||||
if (!perSource.length) { bar.style.display = 'none'; bar.innerHTML = ''; return; }
|
||||
var html = '';
|
||||
perSource.forEach(function(p) {
|
||||
var state, color;
|
||||
if (p.ok) {
|
||||
state = p.count + ' field(s) available';
|
||||
color = 'var(--ok-fg,green)';
|
||||
} else if (/not activated|product key|SFT-DCMS|SFT-OOB/i.test(p.reason)) {
|
||||
state = 'requires Supermicro license (SFT-OOB-LIC / SFT-DCMS-SINGLE) — activate on BMC';
|
||||
color = 'var(--crit-fg,#9f3a38)';
|
||||
} else {
|
||||
state = p.reason || 'unavailable';
|
||||
color = 'var(--muted)';
|
||||
}
|
||||
html += '<div style="display:flex;align-items:center;gap:8px;font-size:12px;margin:3px 0">'
|
||||
+ '<span class="fru-chip '+p.src.chipClass+'">'+p.src.label+'</span>'
|
||||
+ '<span style="color:'+color+'">'+esc(state)+'</span>'
|
||||
+ '</div>';
|
||||
});
|
||||
bar.innerHTML = html;
|
||||
bar.style.display = '';
|
||||
}
|
||||
|
||||
// fruAllRead queries every entry of SOURCES in parallel, renders one table row
// per returned field (an input unless the source marks the field read-only)
// and hands a per-source summary to renderSrcStatus.
window.fruAllRead = function() {
  var statusEl = document.getElementById('fru-all-status');
  var tableEl = document.getElementById('fru-all-table');
  statusEl.textContent = 'Reading…';
  statusEl.style.color = 'var(--muted)';
  tableEl.innerHTML = '';

  // Resolves with the parsed JSON body; rejects with the server-supplied
  // error text (or the HTTP status text) on a non-OK response.
  function load(src) {
    return fetch(src.url, {cache:'no-store'}).then(function(resp) {
      return resp.json().then(function(body) {
        if (!resp.ok) throw new Error(body.error||resp.statusText);
        return body;
      });
    });
  }

  // Builds the <tr> markup for a single field of the given source.
  function rowFor(src, field) {
    var val = esc(src.fieldValue(field));
    var ro = src.readOnly(field);
    var attrs = ro ? '' : (' '+src.rowAttrs(field));
    var valueCell;
    if (ro) {
      valueCell = '<span style="font-family:monospace;font-size:13px;color:var(--muted)">'+val+'</span>';
    } else {
      valueCell = '<input class="fru-uni-inp" style="'+_inp+'" value="'+val+'" data-original="'+val+'"'+attrs+' oninput="fruUniChanged(this)">';
    }
    return [
      '<tr>',
      '<td style="white-space:nowrap;padding-right:4px;vertical-align:middle">',
      '<span class="fru-chip '+src.chipClass+'">'+src.label+'</span>',
      '</td>',
      '<td style="color:var(--muted);white-space:nowrap;padding-right:16px;vertical-align:middle;font-size:13px">'+esc(src.fieldName(field))+'</td>',
      '<td style="vertical-align:middle">',
      valueCell,
      '</td>',
      '<td class="fru-uni-act" style="display:none;white-space:nowrap;padding-left:6px;vertical-align:middle">',
      '<button style="'+_actBtn+'color:var(--ok-fg,green);margin-right:3px" title="Save" onclick="fruUniSave(this)">✓</button>',
      '<button style="'+_actBtn+'color:var(--crit-fg,#9f3a38)" title="Cancel" onclick="fruUniCancel(this)">✗</button>',
      '<span class="fru-uni-msg" style="font-size:11px;margin-left:5px;color:var(--muted)"></span>',
      '</td>',
      '</tr>'
    ].join('');
  }

  Promise.allSettled(SOURCES.map(load)).then(function(results) {
    var rowHtml = [];
    var summary = [];

    results.forEach(function(res, i) {
      var src = SOURCES[i];
      var usable = res.status !== 'rejected' && Array.isArray(res.value) && res.value.length !== 0;
      if (!usable) {
        // A rejected fetch reports its own message; anything else means the
        // source answered but offered nothing to edit.
        var why = (res.status === 'rejected' && res.reason)
          ? res.reason.message
          : 'no editable fields returned';
        summary.push({src:src, ok:false, count:0, reason:why});
        return;
      }
      summary.push({src:src, ok:true, count:res.value.length, reason:''});
      res.value.forEach(function(field) {
        rowHtml.push(rowFor(src, field));
      });
    });

    renderSrcStatus(summary);

    var total = rowHtml.length;
    if (total === 0) {
      statusEl.textContent = 'No editable fields available — see per-source status below.';
      statusEl.style.color = 'var(--crit-fg,#9f3a38)';
      tableEl.innerHTML = '';
      return;
    }

    tableEl.innerHTML = '<table style="width:100%;border-collapse:collapse">'+rowHtml.join('')+'</table>';
    statusEl.textContent = total + ' field(s) loaded';
    statusEl.style.color = 'var(--muted)';
  });
};
|
||||
|
||||
// fruUniChanged shows the row's Save/Cancel cell only while the input differs
// from its last known value, and clears any earlier status message.
window.fruUniChanged = function(inp) {
  var tr = inp.closest('tr');
  var dirty = inp.value !== inp.dataset.original;
  tr.querySelector('.fru-uni-act').style.display = dirty ? '' : 'none';
  tr.querySelector('.fru-uni-msg').textContent = '';
};
|
||||
|
||||
// fruUniCancel reverts the row's input to its stored original value and hides
// the Save/Cancel cell together with any status message.
window.fruUniCancel = function(btn) {
  var tr = btn.closest('tr');
  var field = tr.querySelector('.fru-uni-inp');
  field.value = field.dataset.original;
  tr.querySelector('.fru-uni-act').style.display = 'none';
  tr.querySelector('.fru-uni-msg').textContent = '';
};
|
||||
|
||||
// fruUniSave posts the row's edited value to the write endpoint of the source
// named by the input's data-source attribute, then polls /api/tasks every
// 1.5 s until the returned task is done, failed or cancelled.
//
// The Save/Cancel buttons are disabled while the request is in flight and are
// re-enabled on every terminal outcome, including success, so that a later
// edit of the same field shows working buttons again.
window.fruUniSave = function(btn) {
  var row = btn.closest('tr');
  var inp = row.querySelector('.fru-uni-inp');
  var msg = row.querySelector('.fru-uni-msg');
  var cancelBtn = row.querySelectorAll('.fru-uni-act button')[1];
  var src = SOURCES.find(function(s){ return s.id === inp.dataset.source; });
  if (!src) { msg.textContent = 'Unknown source'; msg.style.color='var(--crit-fg)'; return; }

  function setBusy(busy) { btn.disabled = busy; cancelBtn.disabled = busy; }

  setBusy(true);
  msg.textContent = '…'; msg.style.color = 'var(--muted)';

  fetch(src.writeUrl, {method:'POST', headers:{'Content-Type':'application/json'}, body:src.writeBody(inp)})
    .then(function(r){ return r.json().then(function(d){ if(!r.ok) throw new Error(d.error||r.statusText); return d; }); })
    .then(function(d) {
      var poll = setInterval(function() {
        fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(tasks){
          var t = Array.isArray(tasks) ? tasks.find(function(x){return x.id===d.task_id;}) : null;
          if (!t) return;
          if (t.status==='done') {
            clearInterval(poll);
            inp.dataset.original = inp.value;
            row.querySelector('.fru-uni-act').style.display = 'none';
            msg.textContent = ''; msg.style.color = '';
            // Without this the hidden buttons stay disabled and the next edit
            // of this field cannot be saved or cancelled.
            setBusy(false);
          } else if (t.status==='failed'||t.status==='cancelled') {
            clearInterval(poll);
            msg.textContent = t.error||t.status; msg.style.color = 'var(--crit-fg)';
            setBusy(false);
          }
        }).catch(function() {
          // A single failed poll (network blip, non-JSON reply) is ignored;
          // the next tick retries.
        });
      }, 1500);
    })
    .catch(function(e) {
      msg.textContent = 'Error: '+e.message; msg.style.color = 'var(--crit-fg)';
      setBusy(false);
    });
};
|
||||
})();
|
||||
</script>`
|
||||
}
|
||||
|
||||
func renderExportIndex(exportDir string) (string, error) {
|
||||
entries, err := listExportFiles(exportDir)
|
||||
if err != nil {
|
||||
|
||||
@@ -143,9 +143,9 @@ func renderValidateMode(opts HandlerOptions, stressDefault bool) string {
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each device type.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds in Validate (NVMe: instant device query; SATA/SAS: short self-test). Up to ~1 h per device in Stress (extended self-test, device-dependent).`,
|
||||
`Collects SMART data and runs a short self-test on each storage device.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme id-ctrl</code>, <code>nvme smart-log</code>, <code>nvme device-self-test -s 1</code>; SATA/SAS: <code>smartctl -H -A</code>, <code>smartctl -t short</code>`,
|
||||
`~2 min per device (NVMe short self-test; SATA/SAS short self-test — duration device-dependent).`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
@@ -672,9 +672,9 @@ func renderCheck(opts HandlerOptions) string {
|
||||
)) +
|
||||
renderSATCard("storage", "Storage", "runSAT('storage')", "", renderValidateCardBody(
|
||||
inv.Storage,
|
||||
`Scans all storage devices and runs the matching health or self-test path for each.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme</code>; SATA/SAS: <code>smartctl</code>`,
|
||||
`Seconds (NVMe: instant device query; SATA/SAS: short self-test).`,
|
||||
`Collects SMART health and attributes for each storage device. No self-test is triggered — read-only query only.`,
|
||||
`<code>lsblk</code>; NVMe: <code>nvme id-ctrl</code>, <code>nvme smart-log</code>; SATA/SAS: <code>smartctl -H -A</code>`,
|
||||
`Seconds — instantaneous device query, no wear counters incremented.`,
|
||||
)) +
|
||||
`</div>
|
||||
<div style="height:1px;background:var(--border);margin:16px 0"></div>
|
||||
|
||||
@@ -38,6 +38,7 @@ type raidControllerInfo struct {
|
||||
Model string `json:"model"`
|
||||
ForeignDrives []raidDriveInfo `json:"foreign_drives"`
|
||||
FreeDrives []raidDriveInfo `json:"free_drives"`
|
||||
AllDrives []raidDriveInfo `json:"all_drives"`
|
||||
Arrays []raidArrayInfo `json:"arrays,omitempty"`
|
||||
}
|
||||
|
||||
@@ -97,6 +98,7 @@ func detectLSIControllers() []raidControllerInfo {
|
||||
Model: c.ResponseData.Basics.Model,
|
||||
ForeignDrives: []raidDriveInfo{},
|
||||
FreeDrives: []raidDriveInfo{},
|
||||
AllDrives: []raidDriveInfo{},
|
||||
}
|
||||
if ctrl.Model == "" {
|
||||
ctrl.Model = fmt.Sprintf("LSI Controller %d", ctrl.Index)
|
||||
@@ -111,6 +113,7 @@ func detectLSIControllers() []raidControllerInfo {
|
||||
SizeGB: raidParseHumanSizeGB(d.Size),
|
||||
Serial: strings.TrimSpace(d.SN),
|
||||
}
|
||||
ctrl.AllDrives = append(ctrl.AllDrives, info)
|
||||
switch strings.TrimSpace(d.State) {
|
||||
case "Frgn":
|
||||
ctrl.ForeignDrives = append(ctrl.ForeignDrives, info)
|
||||
@@ -168,6 +171,30 @@ func parseRAIDMDStat(raw string) []mdStatEntry {
|
||||
return entries
|
||||
}
|
||||
|
||||
// raidVROCPortRx matches lines like "  Port2 : /dev/sda (SERIAL123)"
// or "  Port3 : - no device attached -" from `mdadm --detail-platform`.
// The single capture group is the first whitespace-delimited token after the
// colon.
var raidVROCPortRx = regexp.MustCompile(`^\s*Port\d+\s*:\s*(\S+)`)

// parseVROCPorts returns the block device basenames (e.g. "sda") that are
// physically wired to the VROC I/O controller's ports, per `mdadm
// --detail-platform` output. Drives attached directly to the CPU (or to a
// separate HBA) rather than through this controller's ports are excluded.
//
// Port lines whose value is not a /dev/ path (for example "- no device
// attached -") are skipped, as is a bare "/dev/" with no device name, so the
// returned map never contains an empty key. The result is always non-nil.
func parseVROCPorts(raw string) map[string]bool {
	ports := map[string]bool{}
	for _, line := range strings.Split(raw, "\n") {
		m := raidVROCPortRx.FindStringSubmatch(line)
		if m == nil {
			continue
		}
		name, isDev := strings.CutPrefix(m[1], "/dev/")
		if !isDev || name == "" {
			continue
		}
		ports[name] = true
	}
	return ports
}
|
||||
|
||||
func detectVROCController() *raidControllerInfo {
|
||||
out, err := exec.Command("mdadm", "--detail-platform").CombinedOutput()
|
||||
if err != nil && len(out) == 0 {
|
||||
@@ -191,8 +218,16 @@ func detectVROCController() *raidControllerInfo {
|
||||
Model: "Intel VROC",
|
||||
ForeignDrives: []raidDriveInfo{},
|
||||
FreeDrives: []raidDriveInfo{},
|
||||
AllDrives: []raidDriveInfo{},
|
||||
}
|
||||
|
||||
ports := parseVROCPorts(string(out))
|
||||
// Some mdadm builds omit the "Port" lines from --detail-platform. When
|
||||
// we can't determine which drives are actually wired to this
|
||||
// controller, fall back to showing every disk not already in an array
|
||||
// rather than hiding everything.
|
||||
portsKnown := len(ports) > 0
|
||||
|
||||
inArray := map[string]bool{}
|
||||
raw, err := os.ReadFile("/proc/mdstat")
|
||||
if err == nil {
|
||||
@@ -222,15 +257,25 @@ func detectVROCController() *raidControllerInfo {
|
||||
}
|
||||
if json.Unmarshal(lsblkOut, &lsblkDoc) == nil {
|
||||
for _, d := range lsblkDoc.BlockDevices {
|
||||
if d.Type != "disk" || inArray[d.Name] {
|
||||
// Only consider disks wired to this controller's ports -
|
||||
// drives attached directly to the CPU (or another
|
||||
// controller) never show up as VROC ports and are skipped.
|
||||
if d.Type != "disk" || (portsKnown && !ports[d.Name]) {
|
||||
continue
|
||||
}
|
||||
ctrl.FreeDrives = append(ctrl.FreeDrives, raidDriveInfo{
|
||||
info := raidDriveInfo{
|
||||
Device: "/dev/" + d.Name,
|
||||
Model: strings.TrimSpace(d.Model),
|
||||
Serial: strings.TrimSpace(d.Serial),
|
||||
State: "available",
|
||||
})
|
||||
}
|
||||
if inArray[d.Name] {
|
||||
info.State = "member"
|
||||
}
|
||||
ctrl.AllDrives = append(ctrl.AllDrives, info)
|
||||
if info.State == "available" {
|
||||
ctrl.FreeDrives = append(ctrl.FreeDrives, info)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -348,6 +393,38 @@ func (h *handler) handleAPIRAIDCreateMirror(w http.ResponseWriter, r *http.Reque
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func (h *handler) handleAPIRAIDPrepareDrive(w http.ResponseWriter, r *http.Request) {
|
||||
var req struct {
|
||||
ControllerID string `json:"controller_id"`
|
||||
Slot string `json:"slot"`
|
||||
}
|
||||
if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
|
||||
writeError(w, http.StatusBadRequest, "invalid JSON")
|
||||
return
|
||||
}
|
||||
ctrlIdx, ok := parseLSIControllerIndex(req.ControllerID)
|
||||
if !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid controller_id")
|
||||
return
|
||||
}
|
||||
if _, _, ok := parseRAIDSlot(req.Slot); !ok {
|
||||
writeError(w, http.StatusBadRequest, "invalid slot")
|
||||
return
|
||||
}
|
||||
|
||||
t := &Task{
|
||||
ID: newJobID("raid-lsi-prepare-drive"),
|
||||
Name: fmt.Sprintf("Prepare drive %s (LSI ctrl %d)", req.Slot, ctrlIdx),
|
||||
Target: "raid-lsi-prepare-drive",
|
||||
Priority: defaultTaskPriority("raid-lsi-prepare-drive", taskParams{}),
|
||||
Status: TaskPending,
|
||||
CreatedAt: time.Now(),
|
||||
params: taskParams{RAIDController: ctrlIdx, RAIDSlot: req.Slot},
|
||||
}
|
||||
globalQueue.enqueue(t)
|
||||
writeJSON(w, map[string]string{"task_id": t.ID})
|
||||
}
|
||||
|
||||
func parseLSIControllerIndex(id string) (int, bool) {
|
||||
if !strings.HasPrefix(id, "lsi-") {
|
||||
return 0, false
|
||||
@@ -385,6 +462,34 @@ func runRAIDLSICreateMirrorTask(ctx context.Context, j *jobState, ctrl int, driv
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
// parseRAIDSlot splits a storcli "EID:Slt" identifier (e.g. "252:0") into
// enclosure and slot numbers.
//
// Whitespace around the whole string and around each half is ignored. ok is
// false, and both numbers are zero, when the separator is missing, either half
// is not a decimal integer, or either number is negative. The value is later
// formatted into a storcli object path, so negative numbers are rejected.
func parseRAIDSlot(slot string) (eid int, slt int, ok bool) {
	eidText, sltText, found := strings.Cut(strings.TrimSpace(slot), ":")
	if !found {
		return 0, 0, false
	}
	var err error
	if eid, err = strconv.Atoi(strings.TrimSpace(eidText)); err != nil || eid < 0 {
		return 0, 0, false
	}
	if slt, err = strconv.Atoi(strings.TrimSpace(sltText)); err != nil || slt < 0 {
		return 0, 0, false
	}
	return eid, slt, true
}
|
||||
|
||||
func runRAIDPrepareDriveTask(ctx context.Context, j *jobState, ctrl int, slot string) error {
|
||||
eid, slt, ok := parseRAIDSlot(slot)
|
||||
if !ok {
|
||||
return fmt.Errorf("invalid slot %q", slot)
|
||||
}
|
||||
j.append(fmt.Sprintf("Preparing drive %s on controller %d (set good, force)...", slot, ctrl))
|
||||
cmd := exec.CommandContext(ctx, "storcli64",
|
||||
fmt.Sprintf("/c%d/e%d/s%d", ctrl, eid, slt),
|
||||
"set", "good", "force",
|
||||
)
|
||||
return streamCmdJob(j, cmd)
|
||||
}
|
||||
|
||||
func runRAIDVROCCreateMirrorTask(ctx context.Context, j *jobState, devices []string, arrayName string) error {
|
||||
if arrayName == "" {
|
||||
arrayName = "bee-mirror0"
|
||||
@@ -507,6 +612,7 @@ function raidRenderController(c, idx) {
|
||||
html += '</div></div>';
|
||||
}
|
||||
|
||||
html += raidRenderAllDrives(c, idx);
|
||||
html += raidRenderMirrorSection(c, idx, 'lsi');
|
||||
}
|
||||
|
||||
@@ -529,12 +635,71 @@ function raidRenderController(c, idx) {
|
||||
html += '</table>';
|
||||
}
|
||||
|
||||
html += raidRenderAllDrives(c, idx);
|
||||
html += raidRenderMirrorSection(c, idx, 'vroc');
|
||||
}
|
||||
|
||||
return html;
|
||||
}
|
||||
|
||||
var RAID_READY_STATES = {'UGood': true, 'JBOD': true, 'available': true};
|
||||
var RAID_NO_PREPARE_STATES = {'UGood': true, 'JBOD': true, 'Frgn': true, 'Onln': true, 'Msng': true};
|
||||
|
||||
// raidRenderAllDrives returns the "All Drives on This Controller" table for
// controller c. LSI controllers list drives by slot and get an extra column
// holding a Prepare button for every drive whose state is not listed in
// RAID_NO_PREPARE_STATES; other controllers list drives by device path.
// idx is accepted for signature parity with the other render helpers and is
// not used here.
function raidRenderAllDrives(c, idx) {
  var list = c.all_drives || [];
  if (list.length === 0) {
    return '<p style="font-size:13px;color:var(--muted);margin-bottom:12px">No drives detected on this controller.</p>';
  }

  var lsi = c.type === 'lsi';
  var out = [
    '<div style="font-size:12px;font-weight:600;color:var(--muted);margin-bottom:6px;text-transform:uppercase;letter-spacing:.04em">All Drives on This Controller</div>',
    '<table style="margin-bottom:14px"><tr><th>' + (lsi ? 'Slot' : 'Device') + '</th><th>Model</th><th>Size</th><th>State</th>' + (lsi ? '<th></th>' : '') + '</tr>'
  ];

  list.forEach(function(d) {
    // Green badge for states usable for array creation, amber otherwise.
    var badge = RAID_READY_STATES[d.state] ? 'badge-ok' : 'badge-warn';

    var action = '';
    if (lsi) {
      action = RAID_NO_PREPARE_STATES[d.state]
        ? '<td></td>'
        : '<td><button class="btn btn-sm btn-secondary" onclick="raidPrepareDrive(\'' + escHtml(c.id) + '\',\'' + escHtml(d.slot) + '\',this)">Prepare</button></td>';
    }

    var serial = d.serial ? ' [' + escHtml(d.serial) + ']' : '';
    var size = d.size_gb > 0 ? Math.round(d.size_gb) + ' GB' : '—';

    out.push(
      '<tr>'
      + '<td style="font-family:monospace">' + escHtml(lsi ? d.slot : d.device) + '</td>'
      + '<td>' + escHtml(d.model||'—') + serial + '</td>'
      + '<td>' + size + '</td>'
      + '<td><span class="badge ' + badge + '">' + escHtml(d.state||'—') + '</span></td>'
      + action
      + '</tr>'
    );
  });

  out.push('</table>');
  return out.join('');
}
|
||||
|
||||
// raidPrepareDrive asks for confirmation, then POSTs the controller id and
// slot to /api/tools/raid/prepare-drive and follows the returned task with
// raidStreamTask. The optional btn is disabled and relabelled while the
// request runs and restored when the task stream ends or the request fails.
function raidPrepareDrive(ctrlID, slot, btn) {
  var question = 'Prepare drive ' + slot + ' on ' + ctrlID + ' for array creation?\n\nThis forces the drive into Unconfigured Good state. If it currently belongs to a virtual drive or holds data, that data will become inaccessible.';
  if (!confirm(question)) {
    return;
  }

  var label = btn ? btn.textContent : '';
  function restoreButton() {
    if (btn) { btn.disabled = false; btn.textContent = label; }
  }

  if (btn) { btn.disabled = true; btn.textContent = 'Preparing...'; }
  raidShowOutput('Prepare drive ' + slot, '', '');

  var request = {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({controller_id: ctrlID, slot: slot})
  };

  fetch('/api/tools/raid/prepare-drive', request)
    .then(function(resp) { return resp.json(); })
    .then(function(reply) {
      if (reply.error) throw new Error(reply.error);
      raidStreamTask(reply.task_id, 'Prepare drive ' + slot, function() {
        restoreButton();
        raidLoad();
      });
    })
    .catch(function(err) {
      raidShowOutput('Error', 'failed', err.message);
      restoreButton();
    });
}
|
||||
|
||||
function raidRenderMirrorSection(c, idx, kind) {
|
||||
var free = c.free_drives || [];
|
||||
var html = '<div style="font-size:12px;font-weight:600;color:var(--muted);margin-bottom:6px;text-transform:uppercase;letter-spacing:.04em">Create RAID 1 Mirror</div>';
|
||||
@@ -683,6 +848,9 @@ function raidStreamTask(taskID, taskName, onDone) {
|
||||
}
|
||||
|
||||
window.raidLoad = raidLoad;
|
||||
window.raidForeignAction = raidForeignAction;
|
||||
window.raidCreateMirror = raidCreateMirror;
|
||||
window.raidPrepareDrive = raidPrepareDrive;
|
||||
raidLoad();
|
||||
})();
|
||||
</script>`
|
||||
|
||||
@@ -212,86 +212,3 @@ func runSAADMIWriteTask(ctx context.Context, j *jobState, exportDir string, p ta
|
||||
return nil
|
||||
}
|
||||
|
||||
// renderSAADMICard returns the static HTML and inline JavaScript for the
// "Supermicro — DMI" card. The script reads fields from /api/tools/saa-dmi,
// renders one editable row per field and writes single-field changes through
// /api/tools/saa-dmi/write, polling /api/tasks until the write task finishes.
//
// dmiEsc must emit real HTML entities: field names and values are spliced
// into markup and attribute values, so an identity replacement would let a
// value containing quotes or angle brackets break out of the attribute.
// dmiSave re-enables the Save/Cancel buttons after a successful write so the
// same field can be edited again.
func renderSAADMICard() string {
	return `<div class="card"><div class="card-head card-head-actions">Supermicro — DMI<div class="card-head-buttons"><button class="btn btn-sm btn-secondary" onclick="saaDMIRead()">Read</button></div></div><div class="card-body">
<p style="font-size:13px;color:var(--muted);margin-bottom:12px">Reads and edits DMI fields via SAA (In-Band).</p>
<div id="saa-dmi-status" style="font-size:13px;color:var(--muted);margin-bottom:8px"></div>
<div id="saa-dmi-table"></div>
</div></div>
<script>
var _dmiActBtnStyle = 'width:22px;height:22px;padding:0;font-size:13px;line-height:1;border:1px solid var(--line);border-radius:3px;background:var(--surface);cursor:pointer;vertical-align:middle;';
var _dmiInputStyle = 'width:100%;padding:3px 6px;border:1.5px solid #888;border-radius:3px;font-size:13px;font-family:monospace;background:var(--surface);color:var(--ink);';
function dmiEsc(s){return String(s==null?'':s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');}
function saaDMIRead() {
var status = document.getElementById('saa-dmi-status');
status.textContent = 'Reading...'; status.style.color = 'var(--muted)';
document.getElementById('saa-dmi-table').innerHTML = '';
fetch('/api/tools/saa-dmi', {cache:'no-store'})
.then(function(r){return r.json().then(function(d){if(!r.ok)throw new Error(d.error||('HTTP '+r.status));return d;});})
.then(function(fields){
status.textContent = fields.length + ' field(s) loaded.';
var rows = fields.map(function(f){
var val = dmiEsc(f.value||'');
return '<tr>'
+ '<td style="font-size:13px;color:var(--muted);white-space:nowrap;padding-right:8px;vertical-align:middle">'+dmiEsc(f.name)+'</td>'
+ '<td style="font-family:monospace;font-size:12px;color:var(--muted);white-space:nowrap;padding-right:8px;vertical-align:middle">'+dmiEsc(f.shn)+'</td>'
+ '<td style="vertical-align:middle"><input class="dmi-inp" type="text" style="'+_dmiInputStyle+'"'
+ ' data-shn="'+dmiEsc(f.shn)+'" data-original="'+val+'" value="'+val+'" oninput="dmiChanged(this)"></td>'
+ '<td class="dmi-act" style="display:none;white-space:nowrap;padding-left:6px;vertical-align:middle">'
+ '<button style="'+_dmiActBtnStyle+'color:var(--ok-fg,green);margin-right:3px" title="Save" onclick="dmiSave(this)">✓</button>'
+ '<button style="'+_dmiActBtnStyle+'color:var(--crit-fg,#9f3a38)" title="Cancel" onclick="dmiCancel(this)">✗</button>'
+ '<span class="dmi-msg" style="font-size:11px;margin-left:5px;color:var(--muted)"></span>'
+ '</td></tr>';
}).join('');
document.getElementById('saa-dmi-table').innerHTML =
'<table style="width:100%;border-collapse:collapse">'
+ '<tr><th style="text-align:left;font-size:12px;color:var(--muted);padding-bottom:6px;font-weight:normal">Field</th>'
+ '<th style="text-align:left;font-size:12px;color:var(--muted);padding-bottom:6px;font-weight:normal">SHN</th>'
+ '<th style="text-align:left;font-size:12px;color:var(--muted);padding-bottom:6px;font-weight:normal">Value</th><th></th></tr>'
+ rows + '</table>';
})
.catch(function(e){ status.textContent='Error: '+e.message; status.style.color='var(--crit-fg,#9f3a38)'; });
}
function dmiChanged(inp) {
inp.closest('tr').querySelector('.dmi-act').style.display = inp.value !== inp.dataset.original ? '' : 'none';
}
function dmiCancel(btn) {
var row = btn.closest('tr');
var inp = row.querySelector('.dmi-inp');
inp.value = inp.dataset.original;
row.querySelector('.dmi-act').style.display = 'none';
row.querySelector('.dmi-msg').textContent = '';
}
function dmiSave(btn) {
var row = btn.closest('tr');
var inp = row.querySelector('.dmi-inp');
var msg = row.querySelector('.dmi-msg');
var cancelBtn = row.querySelectorAll('.dmi-act button')[1];
if(!window.confirm('Apply DMI change for '+inp.dataset.shn+'?\nServer will need to reboot for changes to take effect.'))return;
btn.disabled=true; cancelBtn.disabled=true;
msg.textContent='…'; msg.style.color='var(--muted)';
fetch('/api/tools/saa-dmi/write',{method:'POST',headers:{'Content-Type':'application/json'},
body:JSON.stringify({changes:[{shn:inp.dataset.shn,value:inp.value}]})})
.then(function(r){return r.json().then(function(d){if(!r.ok)throw new Error(d.error||('HTTP '+r.status));return d;});})
.then(function(d){
var poll=setInterval(function(){
fetch('/api/tasks',{cache:'no-store'}).then(function(r){return r.json();}).then(function(tasks){
var t=(tasks||[]).find(function(x){return x.id===d.task_id;});
if(!t)return;
if(t.status==='done'){
clearInterval(poll);
inp.dataset.original=inp.value;
row.querySelector('.dmi-act').style.display='none';
msg.textContent='Saved. Reboot to apply.'; msg.style.color='var(--ok-fg,green)';
btn.disabled=false; cancelBtn.disabled=false;
} else if(t.status==='failed'||t.status==='cancelled'){
clearInterval(poll);
msg.textContent=t.error||t.status; msg.style.color='var(--crit-fg)';
btn.disabled=false; cancelBtn.disabled=false;
}
});
},1500);
})
.catch(function(e){msg.textContent='Error: '+e.message; msg.style.color='var(--crit-fg)'; btn.disabled=false; cancelBtn.disabled=false;});
}
</script>`
}
|
||||
|
||||
@@ -318,9 +318,12 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/tools/saa-dmi/write", h.handleAPISAADMIWrite)
|
||||
mux.HandleFunc("GET /api/tools/ipmi-fru", h.handleAPIIPMIFRURead)
|
||||
mux.HandleFunc("POST /api/tools/ipmi-fru/write", h.handleAPIIPMIFRUWrite)
|
||||
mux.HandleFunc("GET /api/tools/huawei-elabel", h.handleAPIHuaweiElabelRead)
|
||||
mux.HandleFunc("POST /api/tools/huawei-elabel/write", h.handleAPIHuaweiElabelWrite)
|
||||
mux.HandleFunc("GET /api/tools/raid/status", h.handleAPIRAIDStatus)
|
||||
mux.HandleFunc("POST /api/tools/raid/foreign", h.handleAPIRAIDForeignAction)
|
||||
mux.HandleFunc("POST /api/tools/raid/create-mirror", h.handleAPIRAIDCreateMirror)
|
||||
mux.HandleFunc("POST /api/tools/raid/prepare-drive", h.handleAPIRAIDPrepareDrive)
|
||||
|
||||
// GPU presence / tools
|
||||
mux.HandleFunc("GET /api/gpu/presence", h.handleAPIGPUPresence)
|
||||
|
||||
@@ -1227,7 +1227,8 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
],
|
||||
"services":[
|
||||
{"name":"bee-web","status":"active"},
|
||||
{"name":"bee-nvidia","status":"inactive"}
|
||||
{"name":"bee-audit","status":"inactive"},
|
||||
{"name":"bee-nvidia","status":"failed"}
|
||||
]
|
||||
}`
|
||||
if err := os.WriteFile(filepath.Join(exportDir, "runtime-health.json"), []byte(health), 0644); err != nil {
|
||||
@@ -1281,7 +1282,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
`Bee Services`,
|
||||
`CUDA runtime is not ready for GPU SAT.`,
|
||||
`Missing: nvidia-smi`,
|
||||
`bee-nvidia=inactive`,
|
||||
`bee-nvidia=failed`,
|
||||
// Hardware Summary card — component health badges
|
||||
`Hardware Summary`,
|
||||
`>CPU<`,
|
||||
|
||||
@@ -232,6 +232,9 @@ func renderTaskReportFragment(report taskReport, charts map[string]string, logTe
|
||||
if powerCard := renderTaskPowerResultsCard(report.Target, logText); powerCard != "" {
|
||||
b.WriteString(powerCard)
|
||||
}
|
||||
if report.Target == "storage" {
|
||||
b.WriteString(renderStorageDiskReportCards(logText))
|
||||
}
|
||||
|
||||
if len(report.Charts) > 0 {
|
||||
for _, chart := range report.Charts {
|
||||
@@ -369,3 +372,60 @@ func formatTaskDuration(sec int) string {
|
||||
}
|
||||
return fmt.Sprintf("%dh %02dm %02ds", sec/3600, (sec%3600)/60, sec%60)
|
||||
}
|
||||
|
||||
// renderStorageDiskReportCards reads disk-*-report.txt files from the storage
|
||||
// SAT run directory and renders one card per disk.
|
||||
func renderStorageDiskReportCards(logText string) string {
|
||||
runDir := taskStorageRunDirFromLog(logText)
|
||||
if runDir == "" {
|
||||
return ""
|
||||
}
|
||||
entries, err := os.ReadDir(runDir)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
var cards []string
|
||||
for _, entry := range entries {
|
||||
name := entry.Name()
|
||||
if !strings.HasPrefix(name, "disk-") || !strings.HasSuffix(name, "-report.txt") {
|
||||
continue
|
||||
}
|
||||
data, err := os.ReadFile(filepath.Join(runDir, name))
|
||||
if err != nil || len(data) == 0 {
|
||||
continue
|
||||
}
|
||||
// Extract disk label from filename: "disk-01-nvme0n1-report.txt" → "Disk 01 — nvme0n1"
|
||||
stem := strings.TrimPrefix(strings.TrimSuffix(name, "-report.txt"), "disk-")
|
||||
// stem is like "01-nvme0n1"
|
||||
parts := strings.SplitN(stem, "-", 2)
|
||||
title := "Disk " + stem
|
||||
if len(parts) == 2 {
|
||||
title = "Disk " + parts[0] + " — " + parts[1]
|
||||
}
|
||||
card := `<div class="card">` +
|
||||
`<div class="card-head">` + html.EscapeString(title) + `</div>` +
|
||||
`<div class="card-body" style="padding:0">` +
|
||||
`<pre style="margin:0;padding:16px;font-size:12px;line-height:1.6;overflow-x:auto;white-space:pre">` +
|
||||
html.EscapeString(string(data)) +
|
||||
`</pre></div></div>`
|
||||
cards = append(cards, card)
|
||||
}
|
||||
return strings.Join(cards, "\n")
|
||||
}
|
||||
|
||||
// taskStorageRunDirFromLog scans logText for the first line of the form
// "Archive: <path>" whose final path element contains "storage-" and whose
// path does not end in ".tar.gz", and returns that path. It returns "" when
// no such line exists.
func taskStorageRunDirFromLog(logText string) string {
	for _, raw := range strings.Split(logText, "\n") {
		rest, isArchive := strings.CutPrefix(strings.TrimSpace(raw), "Archive:")
		if !isArchive {
			continue
		}
		candidate := strings.TrimSpace(rest)
		if strings.HasSuffix(candidate, ".tar.gz") {
			// The packed archive is logged with the same prefix; only the
			// unpacked run directory is wanted.
			continue
		}
		if strings.Contains(filepath.Base(candidate), "storage-") {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
@@ -394,6 +394,12 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
|
||||
break
|
||||
}
|
||||
err = runIPMIFRUWriteTask(ctx, j, opts.ExportDir, t.params)
|
||||
case "huawei-elabel-write":
|
||||
if len(t.params.HuaweiElabelChanges) == 0 {
|
||||
err = fmt.Errorf("no changes provided")
|
||||
break
|
||||
}
|
||||
err = runHuaweiElabelWriteTask(ctx, j, t.params)
|
||||
case "raid-foreign-clear":
|
||||
err = runRAIDForeignClearTask(ctx, j, t.params.RAIDController)
|
||||
case "raid-foreign-import":
|
||||
@@ -404,6 +410,12 @@ func executeTaskWithOptions(opts *HandlerOptions, t *Task, j *jobState, ctx cont
|
||||
break
|
||||
}
|
||||
err = runRAIDLSICreateMirrorTask(ctx, j, t.params.RAIDController, t.params.RAIDDevices)
|
||||
case "raid-lsi-prepare-drive":
|
||||
if strings.TrimSpace(t.params.RAIDSlot) == "" {
|
||||
err = fmt.Errorf("no drive slot provided")
|
||||
break
|
||||
}
|
||||
err = runRAIDPrepareDriveTask(ctx, j, t.params.RAIDController, t.params.RAIDSlot)
|
||||
case "raid-vroc-create-mirror":
|
||||
if len(t.params.RAIDDevices) < 2 {
|
||||
err = fmt.Errorf("at least 2 devices required")
|
||||
|
||||
@@ -140,11 +140,13 @@ type taskParams struct {
|
||||
Device string `json:"device,omitempty"` // for install
|
||||
LBAF int `json:"lbaf,omitempty"`
|
||||
PlatformComponents []string `json:"platform_components,omitempty"`
|
||||
SAADmiChanges []saaChange `json:"saa_dmi_changes,omitempty"`
|
||||
FRUChanges []fruChange `json:"fru_changes,omitempty"`
|
||||
SAADmiChanges []saaChange `json:"saa_dmi_changes,omitempty"`
|
||||
FRUChanges []fruChange `json:"fru_changes,omitempty"`
|
||||
HuaweiElabelChanges []huaweiChange `json:"huawei_elabel_changes,omitempty"`
|
||||
RAIDController int `json:"raid_controller,omitempty"`
|
||||
RAIDDevices []string `json:"raid_devices,omitempty"`
|
||||
RAIDArrayName string `json:"raid_array_name,omitempty"`
|
||||
RAIDSlot string `json:"raid_slot,omitempty"`
|
||||
}
|
||||
|
||||
type persistedTask struct {
|
||||
|
||||
2
bible
2
bible
Submodule bible updated: 1977730d93...d2600f1279
@@ -13,6 +13,7 @@ Generic engineering rules live in `bible/rules/patterns/`.
|
||||
| `docs/hardware-ingest-contract.md` | Current Reanimator hardware ingest JSON contract |
|
||||
| `docs/validate-vs-burn.md` | Validate and Validate -> Stress hardware test policy |
|
||||
| `decisions/` | Architectural decision log, including read-only submodule policy |
|
||||
| `proposals/` | RFCs and contract change proposals for Reanimator Core |
|
||||
|
||||
## Validate Test Matrix
|
||||
|
||||
|
||||
@@ -1,5 +1,103 @@
|
||||
# Backlog
|
||||
|
||||
## Сбор SFP-модулей
|
||||
|
||||
**Статус:** не реализовано.
|
||||
|
||||
### Источник данных
|
||||
|
||||
`ethtool -m <iface>` / `ethtool --module-info <iface>` — читает EEPROM SFP/SFP+/QSFP28/QSFP-DD по стандарту MSA (SFF-8472 / SFF-8636).
|
||||
|
||||
Доступные поля из EEPROM:
|
||||
- Идентификатор модуля: `Identifier` (SFP, SFP+, QSFP28, …)
|
||||
- Тип коннектора: `Connector`
|
||||
- Вендор: `Vendor name`, `Vendor OUI`, `Vendor PN`, `Vendor SN`, `Vendor rev`
|
||||
- Оптика: `Wavelength`, `Transceiver type` (10GBase-SR, LR, DAC, …)
|
||||
- Телеметрия DOM (если модуль поддерживает): `Laser tx bias current`, `Transmit avg optical power`, `Receive avg optical power`, `Module temperature`, `Module voltage`
|
||||
- Статус: `Rx power high alarm`, `Tx power low warning`, …
|
||||
|
||||
Для QSFP28 телеметрия DOM приводится отдельно по каждому из 4 каналов (lane 0–3).
|
||||
|
||||
Инструмент требует root. На bee ISO — доступен (`ethtool` входит в образ).
|
||||
|
||||
### Scope для bee
|
||||
|
||||
1. Собирать список сетевых интерфейсов через `ip -j link show` (только `ether`, без `lo`/VLAN/bond).
|
||||
2. Для каждого интерфейса пробовать `ethtool -m <iface>`. Если модуль отсутствует или не поддерживает EEPROM read — тихо пропускать.
|
||||
3. Связывать интерфейс с PCIe-устройством через `ethtool -i <iface>` → поле `bus-info` (BDF) → сопоставление с `pcie_devices[].slot`.
|
||||
|
||||
### Gap в контракте
|
||||
|
||||
Текущий контракт v2.10 имеет в `pcie_devices[]` скалярные поля:
|
||||
- `sfp_temperature_c`, `sfp_tx_power_dbm`, `sfp_rx_power_dbm`, `sfp_voltage_v`, `sfp_bias_ma`
|
||||
|
||||
Этого **недостаточно**:
|
||||
- Одна NIC-карта может иметь несколько портов — нужен массив, а не скаляр.
|
||||
- Нет полей идентификации модуля (vendor, part_number, serial_number, wavelength, connector).
|
||||
- Нет разбивки по каналам для QSFP28.
|
||||
|
||||
### Предлагаемое расширение контракта
|
||||
|
||||
Добавить в `pcie_devices[]` массив `sfp_modules[]`:
|
||||
|
||||
```json
|
||||
"pcie_devices": [
|
||||
{
|
||||
"slot": "0000:3b:00.0",
|
||||
"device_class": "EthernetController",
|
||||
"model": "ConnectX-6 Dx",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09999",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 36.4,
|
||||
"voltage_v": 3.29,
|
||||
"tx_power_dbm": -1.8,
|
||||
"rx_power_dbm": -2.1,
|
||||
"bias_ma": 7.2
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
Поля `sfp_modules[]`:
|
||||
|
||||
| Поле | Тип | Описание |
|
||||
|---|---|---|
|
||||
| `port` | int | Номер порта на NIC (0-based) |
|
||||
| `identifier` | string | `SFP`, `SFP+`, `QSFP28`, `QSFP-DD`, … |
|
||||
| `connector` | string | `LC`, `MPO`, `DAC`, … |
|
||||
| `vendor` | string | Производитель модуля |
|
||||
| `part_number` | string | Партномер |
|
||||
| `serial_number` | string | Серийный номер |
|
||||
| `revision` | string | Ревизия |
|
||||
| `wavelength_nm` | int | Длина волны, нм |
|
||||
| `transceiver_type` | string | `10GBase-SR`, `100GBase-SR4`, `DAC`, … |
|
||||
| `temperature_c` | float | Температура модуля, °C |
|
||||
| `voltage_v` | float | Напряжение, В |
|
||||
| `tx_power_dbm` | float | TX оптическая мощность, dBm |
|
||||
| `rx_power_dbm` | float | RX оптическая мощность, dBm |
|
||||
| `bias_ma` | float | Bias current, мА |
|
||||
|
||||
Старые скалярные поля `sfp_temperature_c` / `sfp_tx_power_dbm` / `sfp_rx_power_dbm` / `sfp_voltage_v` / `sfp_bias_ma` на уровне `pcie_devices[]` — **вывести из контракта** (deprecated), заменить на `sfp_modules[]`.
|
||||
|
||||
### Порядок реализации
|
||||
|
||||
1. Согласовать расширение контракта с Reanimator Core (bump до v2.11).
|
||||
2. Добавить `ethtool` parser в `audit/internal/collector/` — новый файл `sfp.go`.
|
||||
3. Дополнить schema в `audit/internal/schema/` типом `SFPModule`.
|
||||
4. Добавить `sfp_modules` в `PCIeDevice` в schema.
|
||||
5. Заполнять в NIC-коллекторе: связь интерфейс → BDF → `pcie_devices[].sfp_modules`.
|
||||
6. Показывать в TUI и web UI в разделе PCIe/NIC.
|
||||
|
||||
## BMC версия через IPMI
|
||||
|
||||
**Статус:** реализовано.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
---
|
||||
title: Hardware Ingest JSON Contract
|
||||
version: "2.10"
|
||||
updated: "2026-04-29"
|
||||
version: "2.11"
|
||||
updated: "2026-06-19"
|
||||
maintainer: Reanimator Core
|
||||
audience: external-integrators, ai-agents
|
||||
language: ru
|
||||
@@ -9,7 +9,7 @@ language: ru
|
||||
|
||||
# Интеграция с Reanimator: контракт JSON-импорта аппаратного обеспечения
|
||||
|
||||
Версия: **2.10** · Дата: **2026-04-29**
|
||||
Версия: **2.11** · Дата: **2026-06-19**
|
||||
|
||||
Документ описывает формат JSON для передачи данных об аппаратном обеспечении серверов в систему **Reanimator** (управление жизненным циклом аппаратного обеспечения).
|
||||
Предназначен для разработчиков смежных систем (Redfish-коллекторов, агентов мониторинга, CMDB-экспортёров) и может быть включён в документацию интегрируемых проектов.
|
||||
@@ -22,6 +22,7 @@ language: ru
|
||||
|
||||
| Версия | Дата | Изменения |
|
||||
|--------|------|-----------|
|
||||
| 2.11 | 2026-06-19 | В `pcie_devices[]` добавлен необязательный массив `sfp_modules[]` с идентификацией и DOM telemetry SFP/QSFP-модулей. Скалярные поля `sfp_temperature_c` / `sfp_tx_power_dbm` / `sfp_rx_power_dbm` / `sfp_voltage_v` / `sfp_bias_ma` помечены как deprecated (принимаются, но `sfp_modules[]` имеет приоритет) |
|
||||
| 2.10 | 2026-04-29 | Для `hardware.storage[]` добавлены необязательные числовые поля `logical_block_size_bytes`, `physical_block_size_bytes`, `metadata_bytes_per_block` для нормализованного описания формата блока накопителя |
|
||||
| 2.9 | 2026-03-19 | Добавлена необязательная секция `hardware.platform_config` — произвольный объект с настройками платформы (BIOS/Redfish); хранится как latest-snapshot per machine |
|
||||
| 2.8 | 2026-03-15 | Поле `location` удалено из всех `sensors.*`; сенсоры передаются только по `name` и измеренным значениям |
|
||||
@@ -422,11 +423,12 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
| `battery_temperature_c` | float | нет | Температура батареи / supercap, °C |
|
||||
| `battery_voltage_v` | float | нет | Напряжение батареи / supercap, В |
|
||||
| `battery_replace_required` | bool | нет | Требуется замена батареи / supercap |
|
||||
| `sfp_temperature_c` | float | нет | Температура SFP/optic, °C |
|
||||
| `sfp_tx_power_dbm` | float | нет | TX optical power, dBm |
|
||||
| `sfp_rx_power_dbm` | float | нет | RX optical power, dBm |
|
||||
| `sfp_voltage_v` | float | нет | Напряжение SFP, В |
|
||||
| `sfp_bias_ma` | float | нет | Bias current SFP, мА |
|
||||
| `sfp_temperature_c` | float | нет | Температура SFP/optic, °C *(deprecated since 2.11)* |
|
||||
| `sfp_tx_power_dbm` | float | нет | TX optical power, dBm *(deprecated since 2.11)* |
|
||||
| `sfp_rx_power_dbm` | float | нет | RX optical power, dBm *(deprecated since 2.11)* |
|
||||
| `sfp_voltage_v` | float | нет | Напряжение SFP, В *(deprecated since 2.11)* |
|
||||
| `sfp_bias_ma` | float | нет | Bias current SFP, мА *(deprecated since 2.11)* |
|
||||
| `sfp_modules` | array | нет | Установленные SFP/QSFP-модули по портам (см. sfp_modules[]) |
|
||||
| `bdf` | string | нет | Deprecated alias для `slot`; при наличии ingest нормализует его в `slot` |
|
||||
| `device_class` | string | нет | Класс устройства (см. список ниже) |
|
||||
| `manufacturer` | string | нет | Производитель |
|
||||
@@ -444,10 +446,43 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
`numa_node` передавайте для NIC / InfiniBand / RAID / GPU, когда источник знает CPU/NUMA affinity. Поле сохраняется в snapshot-атрибутах PCIe-компонента и дублируется в telemetry для topology use cases.
|
||||
Поля `temperature_c` и `power_w` используйте для device-level telemetry GPU / accelerator / smart PCIe devices. Они не влияют на идентификацию компонента.
|
||||
|
||||
**Deprecated поля sfp_\*:** Скалярные поля `sfp_temperature_c`, `sfp_tx_power_dbm`, `sfp_rx_power_dbm`, `sfp_voltage_v`, `sfp_bias_ma` продолжают приниматься, но помечены как deprecated since 2.11. Если в payload одновременно присутствуют `sfp_modules[]` и deprecated sfp_-скаляры — приоритет у `sfp_modules[]`, скаляры игнорируются. Deprecated поля будут удалены в версии 3.0.
|
||||
|
||||
**Генерация serial_number при отсутствии или `"N/A"`:** `{board_serial}-PCIE-{slot}`, где `slot` для PCIe равен BDF.
|
||||
|
||||
`slot` — единственный канонический адрес компонента. Для PCIe в `slot` передавайте BDF. Поле `bdf` сохраняется только как переходный alias на входе и не должно использоваться как отдельная координата рядом со `slot`.
|
||||
|
||||
#### pcie_devices[].sfp_modules[]
|
||||
|
||||
Необязательный массив установленных SFP/QSFP-модулей для данного PCIe-устройства. Один элемент — один порт. Используйте для многопортовых NIC (ConnectX-6 Dx, Intel X710, Mellanox HDR и др.).
|
||||
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|------|-----|-------------|----------|
|
||||
| `port` | int | **да** | Номер порта на NIC (0-based). Ключ дедупликации внутри устройства |
|
||||
| `identifier` | string | нет | Тип модуля: `SFP`, `SFP+`, `SFP28`, `QSFP+`, `QSFP28`, `QSFP-DD`, `DAC` |
|
||||
| `connector` | string | нет | Тип разъёма: `LC`, `MPO`, `RJ45`, `DAC`, `AOC`, `No separable connector` |
|
||||
| `vendor` | string | нет | Производитель модуля из EEPROM |
|
||||
| `part_number` | string | нет | Партномер из EEPROM |
|
||||
| `serial_number` | string | нет | Серийный номер из EEPROM |
|
||||
| `revision` | string | нет | Ревизия из EEPROM |
|
||||
| `wavelength_nm` | int | нет | Длина волны, нм (0 для DAC/медных кабелей) |
|
||||
| `transceiver_type` | string | нет | `10GBase-SR`, `10GBase-LR`, `25GBase-SR`, `100GBase-SR4`, `DAC`, … |
|
||||
| `temperature_c` | float | нет | Температура модуля, °C (DOM telemetry) |
|
||||
| `voltage_v` | float | нет | Напряжение питания, В (DOM telemetry) |
|
||||
| `tx_power_dbm` | float | нет | TX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `rx_power_dbm` | float | нет | RX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `bias_ma` | float | нет | Bias current, мА (DOM telemetry) |
|
||||
|
||||
**Ключ дедупликации:** `(pcie_devices[].slot, sfp_modules[].port)`.
|
||||
|
||||
**Правила ingest:**
|
||||
- При каждом импорте — полная замена `sfp_modules[]` для данного `pcie_devices[].slot` (upsert всего массива целиком).
|
||||
- Если `sfp_modules` отсутствует или `null` — существующие данные SFP не трогать.
|
||||
- Если `sfp_modules: []` (пустой массив) — трактовать как «модули не обнаружены», очистить сохранённые данные.
|
||||
- Дубли по `port` внутри одного `pcie_devices[]` — невалидны, endpoint возвращает `400` с описанием поля.
|
||||
- Модули без `serial_number` допустимы (многие DAC-кабели не имеют SN); сохраняются по ключу `(slot, port)`.
|
||||
- Изменение `serial_number` или `part_number` модуля на порту создаёт событие `COMPONENT_CHANGED` для PCIe-устройства с описанием «SFP module replaced on port N».
|
||||
|
||||
**Значения `device_class`:**
|
||||
|
||||
| Значение | Назначение |
|
||||
@@ -472,16 +507,47 @@ GET /ingest/hardware/jobs/{job_id}
|
||||
"numa_node": 0,
|
||||
"temperature_c": 48.5,
|
||||
"power_w": 18.2,
|
||||
"sfp_temperature_c": 36.2,
|
||||
"sfp_tx_power_dbm": -1.8,
|
||||
"sfp_rx_power_dbm": -2.1,
|
||||
"device_class": "EthernetController",
|
||||
"manufacturer": "Intel",
|
||||
"model": "X710 10GbE",
|
||||
"serial_number": "K65472-003",
|
||||
"firmware": "9.20 0x8000d4ae",
|
||||
"manufacturer": "Mellanox",
|
||||
"model": "ConnectX-6 Dx",
|
||||
"serial_number": "MT2012X12345",
|
||||
"firmware": "22.35.2010",
|
||||
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
|
||||
"status": "OK"
|
||||
"status": "OK",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09999",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 36.4,
|
||||
"voltage_v": 3.29,
|
||||
"tx_power_dbm": -1.8,
|
||||
"rx_power_dbm": -2.1,
|
||||
"bias_ma": 7.2
|
||||
},
|
||||
{
|
||||
"port": 1,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09998",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 35.9,
|
||||
"voltage_v": 3.28,
|
||||
"tx_power_dbm": -1.9,
|
||||
"rx_power_dbm": -2.3,
|
||||
"bias_ma": 7.1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
@@ -793,7 +859,24 @@ PSU без `serial_number` игнорируется.
|
||||
"model": "X710 10GbE",
|
||||
"serial_number": "K65472-003",
|
||||
"mac_addresses": ["3c:fd:fe:aa:bb:cc", "3c:fd:fe:aa:bb:cd"],
|
||||
"status": "OK"
|
||||
"status": "OK",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "SFP+",
|
||||
"connector": "LC",
|
||||
"vendor": "Intel",
|
||||
"part_number": "FTLX8574D3BCV-IT",
|
||||
"serial_number": "FNS123456789",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "10GBase-SR",
|
||||
"temperature_c": 34.1,
|
||||
"voltage_v": 3.30,
|
||||
"tx_power_dbm": -2.5,
|
||||
"rx_power_dbm": -3.0,
|
||||
"bias_ma": 6.8
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"power_supplies": [
|
||||
|
||||
201
bible-local/proposals/2026-06-19-sfp-modules-contract-rfc.md
Normal file
201
bible-local/proposals/2026-06-19-sfp-modules-contract-rfc.md
Normal file
@@ -0,0 +1,201 @@
|
||||
# RFC: SFP Module Identity and Telemetry in Hardware Ingest Contract
|
||||
|
||||
**Date:** 2026-06-19
|
||||
**Status:** proposal
|
||||
**Target:** Reanimator Core — ingest schema, validation, and storage
|
||||
**Requested by:** bee (hardware audit LiveCD)
|
||||
**Contract version affected:** 2.10 → 2.11
|
||||
|
||||
---
|
||||
|
||||
## Задача для агента
|
||||
|
||||
Внести изменения в контракт ingest и реализовать их в Reanimator Core:
|
||||
|
||||
1. Добавить массив `sfp_modules[]` внутри `pcie_devices[]`.
|
||||
2. Обновить JSON-схему и валидатор ingest endpoint.
|
||||
3. Реализовать хранение/upsert данных SFP-модулей.
|
||||
4. Пометить скалярные поля `sfp_*` на уровне `pcie_devices[]` как deprecated.
|
||||
5. Обновить документ контракта (`hardware-ingest-contract.md`) до версии 2.11.
|
||||
|
||||
Ниже — всё необходимое для принятия решений без дополнительных вопросов.
|
||||
|
||||
---
|
||||
|
||||
## Контекст: откуда берутся данные
|
||||
|
||||
**Источник в bee:** `ethtool -m <iface>` (читает EEPROM SFP/SFP+/QSFP28/QSFP-DD по стандарту MSA SFF-8472 / SFF-8636).
|
||||
|
||||
**Связь с PCIe:** `ethtool -i <iface>` возвращает `bus-info` = BDF (`0000:3b:00.0`), который совпадает с `pcie_devices[].slot`. Так bee связывает SFP-данные конкретного интерфейса с PCIe-устройством.
|
||||
|
||||
**Один NIC — несколько модулей:** карта ConnectX-6 Dx (2 порта), Intel X710 (4 порта), Mellanox HDR (2 порта). Каждый порт — отдельный `ethtool -m`, отдельный SFP-модуль. Одного скаляра на устройство недостаточно.
|
||||
|
||||
**QSFP28/QSFP-DD:** 4-канальные модули возвращают telemetry отдельно по каждому каналу (lane). В предложенной схеме lane-уровень не включён в первую версию — только агрегированные значения модуля в целом. Расширение до lane-уровня — отдельный RFC, если понадобится.
|
||||
|
||||
---
|
||||
|
||||
## Проблема с текущим контрактом v2.10
|
||||
|
||||
В `pcie_devices[]` есть пять скалярных полей:
|
||||
|
||||
```
|
||||
sfp_temperature_c float
|
||||
sfp_tx_power_dbm float
|
||||
sfp_rx_power_dbm float
|
||||
sfp_voltage_v float
|
||||
sfp_bias_ma float
|
||||
```
|
||||
|
||||
Ограничения:
|
||||
- **Нет идентификации модуля** — vendor, part_number, serial_number, wavelength отсутствуют; модуль нельзя инвентаризировать как самостоятельный компонент.
|
||||
- **Только один набор значений на устройство** — невозможно описать 4-портовый NIC.
|
||||
- **Нет типа модуля** — SFP, QSFP28, DAC-кабель не различаются.
|
||||
- **Нет connector/transceiver_type** — невозможно понять, оптика это или медь.
|
||||
|
||||
---
|
||||
|
||||
## Предлагаемое изменение схемы
|
||||
|
||||
### Новая структура `sfp_modules[]`
|
||||
|
||||
Добавляется как необязательное поле внутри каждого объекта `pcie_devices[]`.
|
||||
|
||||
```json
|
||||
"pcie_devices": [
|
||||
{
|
||||
"slot": "0000:3b:00.0",
|
||||
"device_class": "EthernetController",
|
||||
"model": "ConnectX-6 Dx",
|
||||
"manufacturer": "Mellanox",
|
||||
"serial_number": "MT2012X12345",
|
||||
"status": "OK",
|
||||
"sfp_modules": [
|
||||
{
|
||||
"port": 0,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09999",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 36.4,
|
||||
"voltage_v": 3.29,
|
||||
"tx_power_dbm": -1.8,
|
||||
"rx_power_dbm": -2.1,
|
||||
"bias_ma": 7.2
|
||||
},
|
||||
{
|
||||
"port": 1,
|
||||
"identifier": "QSFP28",
|
||||
"connector": "LC",
|
||||
"vendor": "Mellanox",
|
||||
"part_number": "MFA1A00-C003",
|
||||
"serial_number": "MT2124VS09998",
|
||||
"revision": "A",
|
||||
"wavelength_nm": 850,
|
||||
"transceiver_type": "100GBase-SR4",
|
||||
"temperature_c": 35.9,
|
||||
"voltage_v": 3.28,
|
||||
"tx_power_dbm": -1.9,
|
||||
"rx_power_dbm": -2.3,
|
||||
"bias_ma": 7.1
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### Поля `sfp_modules[]`
|
||||
|
||||
| Поле | Тип | Обязательно | Описание |
|
||||
|---|---|---|---|
|
||||
| `port` | int | **да** | Номер порта на NIC (0-based). Ключ дедупликации внутри устройства. |
|
||||
| `identifier` | string | нет | Тип модуля: `SFP`, `SFP+`, `SFP28`, `QSFP+`, `QSFP28`, `QSFP-DD`, `DAC` |
|
||||
| `connector` | string | нет | Тип разъёма: `LC`, `MPO`, `RJ45`, `DAC`, `AOC`, `No separable connector` |
|
||||
| `vendor` | string | нет | Производитель модуля из EEPROM |
|
||||
| `part_number` | string | нет | Партномер из EEPROM |
|
||||
| `serial_number` | string | нет | Серийный номер из EEPROM |
|
||||
| `revision` | string | нет | Ревизия из EEPROM |
|
||||
| `wavelength_nm` | int | нет | Длина волны, нм (0 для DAC/медных кабелей) |
|
||||
| `transceiver_type` | string | нет | `10GBase-SR`, `10GBase-LR`, `25GBase-SR`, `100GBase-SR4`, `DAC`, … |
|
||||
| `temperature_c` | float | нет | Температура модуля, °C (DOM telemetry) |
|
||||
| `voltage_v` | float | нет | Напряжение питания, В (DOM telemetry) |
|
||||
| `tx_power_dbm` | float | нет | TX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `rx_power_dbm` | float | нет | RX оптическая мощность, dBm (DOM telemetry) |
|
||||
| `bias_ma` | float | нет | Bias current, мА (DOM telemetry) |
|
||||
|
||||
**Ключ дедупликации:** `(pcie_devices[].slot, sfp_modules[].port)`.
|
||||
|
||||
**Модули без серийного номера** — допустимы; многие DAC-кабели не имеют SN. Не игнорировать, сохранять по ключу `(slot, port)`.
|
||||
|
||||
---
|
||||
|
||||
## Deprecated поля
|
||||
|
||||
Следующие поля на уровне `pcie_devices[]` помечаются как **deprecated** начиная с v2.11:
|
||||
|
||||
```
|
||||
sfp_temperature_c
|
||||
sfp_tx_power_dbm
|
||||
sfp_rx_power_dbm
|
||||
sfp_voltage_v
|
||||
sfp_bias_ma
|
||||
```
|
||||
|
||||
**Поведение при получении deprecated полей:**
|
||||
- Продолжать принимать и сохранять (не ломать существующих интеграторов).
|
||||
- Если одновременно присутствуют `sfp_modules[]` и deprecated скаляры — приоритет у `sfp_modules[]`; скаляры игнорируются.
|
||||
- В документации пометить как `deprecated since 2.11, will be removed in 3.0`.
|
||||
|
||||
**Не удалять** deprecated поля из валидации в этом PR — только пометить в документации и changelog.
|
||||
|
||||
---
|
||||
|
||||
## Правила ingest для `sfp_modules[]`
|
||||
|
||||
- `sfp_modules[]` хранится как snapshot-атрибут PCIe-компонента (аналогично `mac_addresses`).
|
||||
- При каждом импорте — полная замена `sfp_modules[]` для данного `pcie_devices[].slot` (upsert всего массива целиком, не merge по портам).
|
||||
- Если `sfp_modules` отсутствует или `null` — существующие данные SFP не трогать (не затирать).
|
||||
- Если `sfp_modules: []` (пустой массив) — трактовать как «модули не обнаружены», очистить сохранённые данные.
|
||||
- Изменение `serial_number` или `part_number` модуля на порту — создавать событие `COMPONENT_CHANGED` для PCIe-устройства с описанием «SFP module replaced on port N».
|
||||
|
||||
---
|
||||
|
||||
## Изменения в документе контракта
|
||||
|
||||
Файл: `bible-local/docs/hardware-ingest-contract.md`
|
||||
|
||||
1. Заголовок версии: `2.10` → `2.11`, дата → `2026-06-19`.
|
||||
2. Добавить в changelog:
|
||||
```
|
||||
| 2.11 | 2026-06-19 | В `pcie_devices[]` добавлен необязательный массив `sfp_modules[]`
|
||||
с идентификацией и DOM telemetry SFP/QSFP-модулей. Скалярные поля
|
||||
sfp_temperature_c / sfp_tx_power_dbm / sfp_rx_power_dbm / sfp_voltage_v /
|
||||
sfp_bias_ma помечены как deprecated (принимаются, но sfp_modules[] имеет приоритет). |
|
||||
```
|
||||
3. В секции `pcie_devices` добавить строку в таблицу полей:
|
||||
```
|
||||
| `sfp_modules` | array | нет | Установленные SFP/QSFP-модули по портам (см. sfp_modules[]) |
|
||||
```
|
||||
4. Добавить подсекцию `#### pcie_devices[].sfp_modules[]` с таблицей полей и примером JSON (из раздела выше).
|
||||
5. Пометить deprecated поля в таблице: добавить суффикс `*(deprecated since 2.11)*`.
|
||||
6. Обновить полный пример JSON — добавить `sfp_modules` к NIC-записи в `pcie_devices`.
|
||||
|
||||
---
|
||||
|
||||
## Что не нужно делать в этом PR
|
||||
|
||||
- Не добавлять lane-level данные QSFP (tx_power_dbm_lane_0 и т.п.) — отдельный RFC.
|
||||
- Не удалять deprecated поля — только пометить.
|
||||
- Не создавать отдельную top-level секцию `network_ports` — данные остаются вложенными в `pcie_devices`.
|
||||
- Не менять логику идентификации PCIe-компонента — `serial_number` SFP-модуля не является ключом для самостоятельного компонента.
|
||||
|
||||
---
|
||||
|
||||
## Валидация
|
||||
|
||||
Единственное обязательное поле в `sfp_modules[]` — `port` (int, >= 0).
|
||||
Все остальные поля опциональны.
|
||||
Дубли по `port` внутри одного `pcie_devices[]` — невалидны, возвращать `400` с описанием поля.
|
||||
Submodule internal/chart updated: 8105c7ec08...2a15bc87f1
@@ -1419,6 +1419,13 @@ rm -rf \
|
||||
if [ "$BEE_GPU_VENDOR" != "nvidia" ]; then
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-load"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/etc/systemd/system/bee-nvidia.service"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-burn"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-john-gpu-stress"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nvidia-recover"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-dcgmproftester-staggered"
|
||||
rm -f "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-check-nvswitch"
|
||||
rm -rf "${OVERLAY_STAGE_DIR}/etc/systemd/system/nvidia-fabricmanager.service.d"
|
||||
fi
|
||||
|
||||
# --- inject authorized_keys for SSH access ---
|
||||
|
||||
@@ -67,7 +67,8 @@ if ! mkdir "${LOCK_DIR}" 2>/dev/null; then
|
||||
fi
|
||||
trap 'rmdir "${LOCK_DIR}" >/dev/null 2>&1 || true' EXIT
|
||||
|
||||
if have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
|
||||
GPU_VENDOR=$(cat /etc/bee-gpu-vendor 2>/dev/null || echo "")
|
||||
if [ "$GPU_VENDOR" = "nvidia" ] && have_nvidia_gpu && [ ! -e /dev/nvidia0 ]; then
|
||||
log_event "NVIDIA GPU detected but /dev/nvidia0 is missing"
|
||||
restart_service bee-nvidia.service || true
|
||||
fi
|
||||
|
||||
Reference in New Issue
Block a user