Redesign dashboard: split Runtime Health and Hardware Summary
- Runtime Health now shows only LiveCD system status (services, tools, drivers, network, CUDA/ROCm) — hardware component rows removed - Hardware Summary now shows server components with readable descriptions (model, count×size) and component-status.json health badges - Add Network Adapters row to Hardware Summary - SFP module static info (vendor, PN, SN, connector, type, wavelength) now collected via ethtool -m regardless of carrier state - PSU statuses from IPMI audit written to component-status.json so PSU badge shows actual status after first audit instead of UNKNOWN Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -317,106 +317,271 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
|
||||
if err != nil {
|
||||
return `<div class="card"><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
|
||||
}
|
||||
// Parse just enough fields for the summary banner
|
||||
var snap struct {
|
||||
Summary struct {
|
||||
CPU struct{ Model string }
|
||||
Memory struct{ TotalGB float64 }
|
||||
Storage []struct{ Device, Model, Size string }
|
||||
GPUs []struct{ Model string }
|
||||
PSUs []struct{ Model string }
|
||||
}
|
||||
Network struct {
|
||||
Interfaces []struct {
|
||||
Name string
|
||||
IPv4 []string
|
||||
State string
|
||||
}
|
||||
}
|
||||
}
|
||||
// Try to extract top-level fields loosely
|
||||
var raw map[string]json.RawMessage
|
||||
if err := json.Unmarshal(data, &raw); err != nil {
|
||||
var ingest schema.HardwareIngestRequest
|
||||
if err := json.Unmarshal(data, &ingest); err != nil {
|
||||
return `<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
|
||||
}
|
||||
_ = snap
|
||||
hw := ingest.Hardware
|
||||
|
||||
// Also load runtime-health for badges
|
||||
type componentHealth struct {
|
||||
FailCount int `json:"fail_count"`
|
||||
WarnCount int `json:"warn_count"`
|
||||
var records []app.ComponentStatusRecord
|
||||
if db, err := app.OpenComponentStatusDB(filepath.Join(opts.ExportDir, "component-status.json")); err == nil {
|
||||
records = db.All()
|
||||
}
|
||||
type healthSummary struct {
|
||||
CPU componentHealth `json:"cpu"`
|
||||
Memory componentHealth `json:"memory"`
|
||||
Storage componentHealth `json:"storage"`
|
||||
GPU componentHealth `json:"gpu"`
|
||||
PSU componentHealth `json:"psu"`
|
||||
Network componentHealth `json:"network"`
|
||||
}
|
||||
var health struct {
|
||||
HardwareHealth healthSummary `json:"hardware_health"`
|
||||
}
|
||||
if hdata, herr := loadSnapshot(filepath.Join(opts.ExportDir, "runtime-health.json")); herr == nil {
|
||||
_ = json.Unmarshal(hdata, &health)
|
||||
}
|
||||
|
||||
badge := func(h componentHealth) string {
|
||||
if h.FailCount > 0 {
|
||||
return `<span class="badge badge-err">FAIL</span>`
|
||||
}
|
||||
if h.WarnCount > 0 {
|
||||
return `<span class="badge badge-warn">WARN</span>`
|
||||
}
|
||||
return `<span class="badge badge-ok">OK</span>`
|
||||
}
|
||||
|
||||
// Extract readable strings from raw JSON
|
||||
getString := func(key string) string {
|
||||
v, ok := raw[key]
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
var s string
|
||||
if err := json.Unmarshal(v, &s); err == nil {
|
||||
return s
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
cpuModel := getString("cpu_model")
|
||||
memStr := getString("memory_summary")
|
||||
gpuSummary := getString("gpu_summary")
|
||||
|
||||
var b strings.Builder
|
||||
b.WriteString(`<div class="card"><div class="card-head">Hardware Summary</div><div class="card-body">`)
|
||||
b.WriteString(`<table style="width:auto">`)
|
||||
writeRow := func(label, value, badgeHTML string) {
|
||||
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||
b.WriteString(fmt.Sprintf(`<tr><td style="padding:6px 14px 6px 0;font-weight:700;white-space:nowrap">%s</td><td style="padding:6px 0;color:var(--muted);font-size:13px">%s</td><td style="padding:6px 0 6px 12px">%s</td></tr>`,
|
||||
html.EscapeString(label), html.EscapeString(value), badgeHTML))
|
||||
}
|
||||
if cpuModel != "" {
|
||||
writeRow("CPU", cpuModel, badge(health.HardwareHealth.CPU))
|
||||
} else {
|
||||
writeRow("CPU", "—", badge(health.HardwareHealth.CPU))
|
||||
|
||||
cpuRow := aggregateComponentStatus("CPU", records, []string{"cpu:all"}, nil)
|
||||
writeRow("CPU", hwDescribeCPU(hw), runtimeStatusBadge(cpuRow.Status))
|
||||
|
||||
memRow := aggregateComponentStatus("Memory", records, []string{"memory:all"}, []string{"memory:"})
|
||||
writeRow("Memory", hwDescribeMemory(hw), runtimeStatusBadge(memRow.Status))
|
||||
|
||||
storageRow := aggregateComponentStatus("Storage", records, []string{"storage:all"}, []string{"storage:"})
|
||||
writeRow("Storage", hwDescribeStorage(hw), runtimeStatusBadge(storageRow.Status))
|
||||
|
||||
gpuRow := aggregateComponentStatus("GPU", records, nil, []string{"pcie:gpu:"})
|
||||
writeRow("GPU", hwDescribeGPU(hw), runtimeStatusBadge(gpuRow.Status))
|
||||
|
||||
psuRow := aggregateComponentStatus("PSU", records, nil, []string{"psu:"})
|
||||
writeRow("PSU", hwDescribePSU(hw), runtimeStatusBadge(psuRow.Status))
|
||||
|
||||
if nicDesc := hwDescribeNIC(hw); nicDesc != "" {
|
||||
writeRow("Network", nicDesc, "")
|
||||
}
|
||||
if memStr != "" {
|
||||
writeRow("Memory", memStr, badge(health.HardwareHealth.Memory))
|
||||
} else {
|
||||
writeRow("Memory", "—", badge(health.HardwareHealth.Memory))
|
||||
}
|
||||
if gpuSummary != "" {
|
||||
writeRow("GPU", gpuSummary, badge(health.HardwareHealth.GPU))
|
||||
} else {
|
||||
writeRow("GPU", "—", badge(health.HardwareHealth.GPU))
|
||||
}
|
||||
writeRow("Storage", "—", badge(health.HardwareHealth.Storage))
|
||||
writeRow("PSU", "—", badge(health.HardwareHealth.PSU))
|
||||
|
||||
b.WriteString(`</table>`)
|
||||
b.WriteString(`</div></div>`)
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// hwDescribeCPU returns a human-readable CPU summary, e.g. "2× Intel Xeon Gold 6338".
|
||||
func hwDescribeCPU(hw schema.HardwareSnapshot) string {
|
||||
counts := map[string]int{}
|
||||
order := []string{}
|
||||
for _, cpu := range hw.CPUs {
|
||||
model := "Unknown CPU"
|
||||
if cpu.Model != nil && *cpu.Model != "" {
|
||||
model = *cpu.Model
|
||||
}
|
||||
if counts[model] == 0 {
|
||||
order = append(order, model)
|
||||
}
|
||||
counts[model]++
|
||||
}
|
||||
if len(order) == 0 {
|
||||
return "—"
|
||||
}
|
||||
parts := make([]string, 0, len(order))
|
||||
for _, m := range order {
|
||||
if counts[m] > 1 {
|
||||
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
|
||||
} else {
|
||||
parts = append(parts, m)
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// hwDescribeMemory returns a summary like "16× 32 GB DDR4".
|
||||
func hwDescribeMemory(hw schema.HardwareSnapshot) string {
|
||||
type key struct {
|
||||
sizeMB int
|
||||
typ string
|
||||
}
|
||||
counts := map[key]int{}
|
||||
order := []key{}
|
||||
for _, dimm := range hw.Memory {
|
||||
if dimm.SizeMB == nil || *dimm.SizeMB == 0 {
|
||||
continue
|
||||
}
|
||||
t := ""
|
||||
if dimm.Type != nil {
|
||||
t = *dimm.Type
|
||||
}
|
||||
k := key{*dimm.SizeMB, t}
|
||||
if counts[k] == 0 {
|
||||
order = append(order, k)
|
||||
}
|
||||
counts[k]++
|
||||
}
|
||||
if len(order) == 0 {
|
||||
return "—"
|
||||
}
|
||||
parts := make([]string, 0, len(order))
|
||||
for _, k := range order {
|
||||
gb := k.sizeMB / 1024
|
||||
desc := fmt.Sprintf("%d× %d GB", counts[k], gb)
|
||||
if k.typ != "" {
|
||||
desc += " " + k.typ
|
||||
}
|
||||
parts = append(parts, desc)
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// hwDescribeStorage returns a summary like "4× 3.84 TB NVMe, 2× 1.92 TB SATA".
|
||||
func hwDescribeStorage(hw schema.HardwareSnapshot) string {
|
||||
type key struct {
|
||||
sizeGB int
|
||||
iface string
|
||||
}
|
||||
counts := map[key]int{}
|
||||
order := []key{}
|
||||
for _, disk := range hw.Storage {
|
||||
sz := 0
|
||||
if disk.SizeGB != nil {
|
||||
sz = *disk.SizeGB
|
||||
}
|
||||
iface := ""
|
||||
if disk.Interface != nil {
|
||||
iface = *disk.Interface
|
||||
} else if disk.Type != nil {
|
||||
iface = *disk.Type
|
||||
}
|
||||
k := key{sz, iface}
|
||||
if counts[k] == 0 {
|
||||
order = append(order, k)
|
||||
}
|
||||
counts[k]++
|
||||
}
|
||||
if len(order) == 0 {
|
||||
return "—"
|
||||
}
|
||||
parts := make([]string, 0, len(order))
|
||||
for _, k := range order {
|
||||
var sizeStr string
|
||||
if k.sizeGB >= 1000 {
|
||||
sizeStr = fmt.Sprintf("%.2g TB", float64(k.sizeGB)/1000)
|
||||
} else if k.sizeGB > 0 {
|
||||
sizeStr = fmt.Sprintf("%d GB", k.sizeGB)
|
||||
} else {
|
||||
sizeStr = "?"
|
||||
}
|
||||
desc := fmt.Sprintf("%d× %s", counts[k], sizeStr)
|
||||
if k.iface != "" {
|
||||
desc += " " + k.iface
|
||||
}
|
||||
parts = append(parts, desc)
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// hwDescribeGPU returns a summary like "8× NVIDIA H100 80GB".
|
||||
func hwDescribeGPU(hw schema.HardwareSnapshot) string {
|
||||
counts := map[string]int{}
|
||||
order := []string{}
|
||||
for _, dev := range hw.PCIeDevices {
|
||||
if dev.DeviceClass == nil {
|
||||
continue
|
||||
}
|
||||
if !isGPUDeviceClass(*dev.DeviceClass) {
|
||||
continue
|
||||
}
|
||||
model := "Unknown GPU"
|
||||
if dev.Model != nil && *dev.Model != "" {
|
||||
model = *dev.Model
|
||||
}
|
||||
if counts[model] == 0 {
|
||||
order = append(order, model)
|
||||
}
|
||||
counts[model]++
|
||||
}
|
||||
if len(order) == 0 {
|
||||
return "—"
|
||||
}
|
||||
parts := make([]string, 0, len(order))
|
||||
for _, m := range order {
|
||||
if counts[m] > 1 {
|
||||
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
|
||||
} else {
|
||||
parts = append(parts, m)
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// hwDescribePSU returns a summary like "2× 1600 W" or "2× PSU".
|
||||
func hwDescribePSU(hw schema.HardwareSnapshot) string {
|
||||
n := len(hw.PowerSupplies)
|
||||
if n == 0 {
|
||||
return "—"
|
||||
}
|
||||
// Try to get a consistent wattage
|
||||
watt := 0
|
||||
consistent := true
|
||||
for _, psu := range hw.PowerSupplies {
|
||||
if psu.WattageW == nil {
|
||||
consistent = false
|
||||
break
|
||||
}
|
||||
if watt == 0 {
|
||||
watt = *psu.WattageW
|
||||
} else if *psu.WattageW != watt {
|
||||
consistent = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if consistent && watt > 0 {
|
||||
return fmt.Sprintf("%d× %d W", n, watt)
|
||||
}
|
||||
return fmt.Sprintf("%d× PSU", n)
|
||||
}
|
||||
|
||||
// hwDescribeNIC returns a summary like "2× Mellanox ConnectX-6".
|
||||
func hwDescribeNIC(hw schema.HardwareSnapshot) string {
|
||||
counts := map[string]int{}
|
||||
order := []string{}
|
||||
for _, dev := range hw.PCIeDevices {
|
||||
isNIC := false
|
||||
if dev.DeviceClass != nil {
|
||||
c := strings.ToLower(strings.TrimSpace(*dev.DeviceClass))
|
||||
isNIC = c == "ethernetcontroller" || c == "networkcontroller" || strings.Contains(c, "fibrechannel")
|
||||
}
|
||||
if !isNIC && len(dev.MacAddresses) == 0 {
|
||||
continue
|
||||
}
|
||||
model := ""
|
||||
if dev.Model != nil && *dev.Model != "" {
|
||||
model = *dev.Model
|
||||
} else if dev.Manufacturer != nil && *dev.Manufacturer != "" {
|
||||
model = *dev.Manufacturer + " NIC"
|
||||
} else {
|
||||
model = "NIC"
|
||||
}
|
||||
if counts[model] == 0 {
|
||||
order = append(order, model)
|
||||
}
|
||||
counts[model]++
|
||||
}
|
||||
if len(order) == 0 {
|
||||
return ""
|
||||
}
|
||||
parts := make([]string, 0, len(order))
|
||||
for _, m := range order {
|
||||
if counts[m] > 1 {
|
||||
parts = append(parts, fmt.Sprintf("%d× %s", counts[m], m))
|
||||
} else {
|
||||
parts = append(parts, m)
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// isGPUDeviceClass reports whether class, after trimming surrounding
// whitespace, names one of the PCIe device classes treated as a GPU here:
// "VideoController", "DisplayController" or "ProcessingAccelerator".
// Matching ignores letter case, in line with the case-insensitive class
// comparison hwDescribeNIC applies to the same field.
func isGPUDeviceClass(class string) bool {
	trimmed := strings.TrimSpace(class)
	for _, known := range [...]string{
		"VideoController",
		"DisplayController",
		"ProcessingAccelerator",
	} {
		if strings.EqualFold(trimmed, known) {
			return true
		}
	}
	return false
}
|
||||
|
||||
func renderAuditModal() string {
|
||||
return `<div id="audit-modal-overlay" style="display:none;position:fixed;inset:0;background:rgba(0,0,0,.5);z-index:100;align-items:center;justify-content:center">
|
||||
<div style="background:#fff;border-radius:6px;padding:24px;min-width:480px;max-width:1100px;width:min(1100px,92vw);max-height:92vh;overflow:auto;position:relative">
|
||||
@@ -482,7 +647,6 @@ func renderHealthCard(opts HandlerOptions) string {
|
||||
buildRuntimeToolsRow(health),
|
||||
buildRuntimeServicesRow(health),
|
||||
}
|
||||
rows = append(rows, buildHardwareComponentRows(opts.ExportDir)...)
|
||||
b.WriteString(`<table><thead><tr><th>Check</th><th>Status</th><th>Source</th><th>Issue</th></tr></thead><tbody>`)
|
||||
for _, row := range rows {
|
||||
b.WriteString(`<tr><td>` + html.EscapeString(row.Title) + `</td><td>` + runtimeStatusBadge(row.Status) + `</td><td>` + html.EscapeString(row.Source) + `</td><td>` + rowIssueHTML(row.Issue) + `</td></tr>`)
|
||||
@@ -1157,7 +1321,7 @@ func renderValidate(opts HandlerOptions) string {
|
||||
</div>
|
||||
<style>
|
||||
.validate-profile-body { display:grid; grid-template-columns:1fr 1fr 1fr; gap:24px; align-items:stretch; }
|
||||
.validate-profile-col { min-width:0; }
|
||||
.validate-profile-col { min-width:0; display:flex; flex-direction:column; }
|
||||
.validate-profile-action { display:flex; flex-direction:column; align-items:center; justify-content:center; }
|
||||
.validate-card-body { padding:0; }
|
||||
.validate-card-section { padding:12px 16px 0; }
|
||||
|
||||
@@ -1094,6 +1094,7 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
}
|
||||
body := rec.Body.String()
|
||||
for _, needle := range []string{
|
||||
// Runtime Health card — LiveCD checks only
|
||||
`Runtime Health`,
|
||||
`<th>Check</th><th>Status</th><th>Source</th><th>Issue</th>`,
|
||||
`Export Directory`,
|
||||
@@ -1102,16 +1103,18 @@ func TestDashboardRendersRuntimeHealthTable(t *testing.T) {
|
||||
`CUDA / ROCm`,
|
||||
`Required Utilities`,
|
||||
`Bee Services`,
|
||||
`<td>CPU</td>`,
|
||||
`<td>Memory</td>`,
|
||||
`<td>Storage</td>`,
|
||||
`<td>GPU</td>`,
|
||||
`CUDA runtime is not ready for GPU SAT.`,
|
||||
`Missing: nvidia-smi`,
|
||||
`bee-nvidia=inactive`,
|
||||
`cpu SAT: FAILED`,
|
||||
`storage SAT: FAILED`,
|
||||
`sat:nvidia`,
|
||||
// Hardware Summary card — component health badges
|
||||
`Hardware Summary`,
|
||||
`>CPU<`,
|
||||
`>Memory<`,
|
||||
`>Storage<`,
|
||||
`>GPU<`,
|
||||
`>PSU<`,
|
||||
`badge-warn`, // cpu Warning badge
|
||||
`badge-err`, // storage Critical badge
|
||||
} {
|
||||
if !strings.Contains(body, needle) {
|
||||
t.Fatalf("dashboard missing %q: %s", needle, body)
|
||||
|
||||
Reference in New Issue
Block a user