Compare commits

...

5 Commits

Author SHA1 Message Date
dc07580adc Add AER decode, event counter, and sparkline to component detail modal
- decodeAERStatus: parses aer_status hex from kernel error strings and
  maps PCIe AER register bits to human-readable names with correctable/
  uncorrectable classification (e.g. "Receiver Error, Replay Timer Timeout (correctable)")
- renderSparkline: 100px inline SVG showing non-OK events over time,
  bars positioned proportionally to timestamp; evenly spaced when timestamps coincide
- renderComponentDetail: shows event count badge and sparkline in the
  component header row; decoded AER line appears below the raw error summary

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-13 23:54:54 +03:00
Mikhail Chusavitin
87e78e230e Fix ISO build: truncate volume ID to 32 chars (xorriso limit)
EASY_BEE_NVIDIA_LEGACY_V<date> is 33 characters; ISO 9660 volid is
limited to 32. Compute the maximum token length dynamically from the
prefix length and trim ISO_VERSION_LABEL_TOKEN with cut before
assembling BEE_ISO_VOLUME. All four variants now fit within the limit.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-12 18:28:54 +03:00
Mikhail Chusavitin
805a3b277d Track PCIe AER correctable errors; fix GPU status key routing
Add nvidia-aer-correctable and pcie-aer-correctable patterns to catch
"bus correctable error" events seen in SEL (Critical Interrupt / offset 7).
Both patterns carry severity "warning" — correctable errors are
hardware-recovered and should not flag a card as failed.

Fix kmsg_watcher routing: GPU-category events were keyed as pcie:<BDF>
but the UI queries for pcie:gpu: prefix. Split the switch so "gpu" →
pcie:gpu:<BDF> and "pcie" → pcie:<BDF>. This applies to both
flushWindow (SAT-window path) and flushImmediate (always-on path).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-08 12:50:14 +03:00
Mikhail Chusavitin
5bc9bd7fb3 Fix deploy.sh unbound variable on line 51
\\$1 in a double-quoted string expands as literal backslash + $1 (the
script's first positional arg). With set -u and no CLI args (IP entered
via read), this fails. \$1 correctly escapes the dollar sign, producing
a literal $1 for awk on the remote host.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-08 11:58:15 +03:00
Mikhail Chusavitin
0939a647ea Fix component detail modal: replace dead hx-* with fetch-based JS
HTMX was never loaded on the page, so hx-get on the component label
spans was dead code — the dialog opened empty. Replace with a plain
openComponentDetail() fetch call. Also fix dialog positioning broken
by the CSS reset (*{margin:0} overrode the UA margin:auto that centers
<dialog>). Replace card hx-trigger polling with a setInterval.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-08 10:53:20 +03:00
6 changed files with 193 additions and 11 deletions

View File

@@ -38,6 +38,15 @@ var HardwareErrorPatterns = []ErrorPattern{
Category: "gpu",
Severity: "warning",
},
// PCIe AER correctable from the NVIDIA driver — "bus correctable error" in SEL.
// Severity is warning (not critical): correctable errors are hardware-recovered.
{
Name: "nvidia-aer-correctable",
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER.*[Cc]orrect`),
Category: "gpu",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "nvidia-aer",
Re: mustPat(`(?i)nvidia\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),
@@ -54,6 +63,15 @@ var HardwareErrorPatterns = []ErrorPattern{
},
// ── PCIe AER (generic) ──────────────────────────────────────────────────────
// PCIe AER correctable from the root port — captures the reported device BDF
// (second BDF in "pcieport X: AER: Correctable error received: Y").
{
Name: "pcie-aer-correctable",
Re: mustPat(`(?i)pcieport.*AER:.*[Cc]orrect.*:\s*([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d)`),
Category: "pcie",
Severity: "warning",
BDFGroup: 1,
},
{
Name: "pcie-aer",
Re: mustPat(`(?i)pcieport\s+([\da-f]{4}:[\da-f]{2}:[\da-f]{2}\.\d).*AER`),

View File

@@ -165,7 +165,9 @@ func (w *kmsgWatcher) flushWindow(window *kmsgWindow) {
for _, id := range evt.ids {
var key string
switch evt.category {
case "gpu", "pcie":
case "gpu":
key = "pcie:gpu:" + normalizeBDF(id)
case "pcie":
key = "pcie:" + normalizeBDF(id)
case "storage":
key = "storage:" + id
@@ -218,7 +220,9 @@ func (w *kmsgWatcher) flushImmediate(evt kmsgEvent) {
for _, id := range evt.ids {
var key string
switch evt.category {
case "gpu", "pcie":
case "gpu":
key = "pcie:gpu:" + normalizeBDF(id)
case "pcie":
key = "pcie:" + normalizeBDF(id)
case "storage":
key = "storage:" + id

View File

@@ -17,6 +17,7 @@ func layoutHead(title string) string {
<style>
:root{--bg:#fff;--surface:#fff;--surface-2:#f9fafb;--border:rgba(34,36,38,.15);--border-lite:rgba(34,36,38,.1);--ink:rgba(0,0,0,.87);--muted:rgba(0,0,0,.6);--accent:#2185d0;--accent-dark:#1678c2;--crit-bg:#fff6f6;--crit-fg:#9f3a38;--crit-border:#e0b4b4;--ok-bg:#fcfff5;--ok-fg:#2c662d;--warn-bg:#fffaf3;--warn-fg:#573a08}
*{box-sizing:border-box;margin:0;padding:0}
dialog{margin:auto}
body{font:14px/1.5 Lato,"Helvetica Neue",Arial,Helvetica,sans-serif;background:var(--bg);color:var(--ink);display:flex;min-height:100vh}
a{color:var(--accent);text-decoration:none}
/* Sidebar */

View File

@@ -5,7 +5,9 @@ import (
"fmt"
"html"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
"bee/audit/internal/app"
@@ -95,6 +97,17 @@ document.querySelectorAll('.terminal').forEach(function(t){
btn.onclick=function(){navigator.clipboard.writeText(t.textContent).then(function(){btn.textContent='Copied!';setTimeout(function(){btn.textContent='Copy';},1500);});};
w.appendChild(btn);
});
// openComponentDetail opens #component-detail-dialog and fills
// #component-detail-body with the HTML fragment served by
// /api/components/<type>. A placeholder is shown while the request is in
// flight and an error message if the request fails.
function openComponentDetail(type) {
  var dlg = document.getElementById('component-detail-dialog');
  var body = document.getElementById('component-detail-body');
  body.innerHTML = '<div style="padding:20px;color:var(--muted)">Loading…</div>';
  dlg.showModal();
  fetch('/api/components/' + type).then(function(r){
    // fetch() only rejects on network failure; HTTP error statuses resolve
    // normally. Reject them here so an error page is never injected into the
    // dialog as if it were the component fragment.
    if (!r.ok) { throw new Error('HTTP ' + r.status); }
    return r.text();
  }).then(function(html){
    body.innerHTML = html;
  }).catch(function(){
    body.innerHTML = '<div style="padding:20px;color:var(--crit-fg)">Error loading details.</div>';
  });
}
</script>` +
`</body></html>`
}
@@ -107,6 +120,14 @@ func renderDashboard(opts HandlerOptions) string {
b.WriteString(renderHardwareSummaryCard(opts))
b.WriteString(renderHealthCard(opts))
b.WriteString(renderMetrics())
b.WriteString(`<script>
// Refresh the hardware summary card every 30 seconds by swapping the whole
// #hw-summary-card element for the fragment served by /api/hardware-summary.
setInterval(function(){
  fetch('/api/hardware-summary').then(function(r){
    // fetch() resolves on HTTP error statuses too. Swapping an error body in
    // via outerHTML would drop the element id and stop every later refresh,
    // so skip this tick and keep the current card instead.
    if(!r.ok){throw new Error('HTTP '+r.status);}
    return r.text();
  }).then(function(html){
    var el=document.getElementById('hw-summary-card');
    if(el){el.outerHTML=html;}
  }).catch(function(){});
},30000);
</script>`)
return b.String()
}
@@ -185,14 +206,14 @@ func renderAudit() string {
}
func renderHardwareSummaryCard(opts HandlerOptions) string {
const cardAttrs = ` hx-get="/api/hardware-summary" hx-trigger="every 30s" hx-swap="outerHTML"`
const cardID = ` id="hw-summary-card"`
data, err := loadSnapshot(opts.AuditPath)
if err != nil {
return `<div class="card"` + cardAttrs + `><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
return `<div class="card"` + cardID + `><div class="card-head card-head-actions"><span>Hardware Summary</span><div class="card-head-buttons"><button class="btn btn-primary btn-sm" onclick="auditModalRun()">Run audit</button></div></div><div class="card-body"></div></div>`
}
var ingest schema.HardwareIngestRequest
if err := json.Unmarshal(data, &ingest); err != nil {
return `<div class="card"` + cardAttrs + `><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
return `<div class="card"` + cardID + `><div class="card-head">Hardware Summary</div><div class="card-body"><span class="badge badge-err">Parse error</span></div></div>`
}
hw := ingest.Hardware
@@ -202,7 +223,7 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
}
var b strings.Builder
b.WriteString(`<div class="card"` + cardAttrs + `><div class="card-head">Hardware Summary</div><div class="card-body">`)
b.WriteString(`<div class="card"` + cardID + `><div class="card-head">Hardware Summary</div><div class="card-body">`)
// Server identity block above the component table.
{
@@ -237,7 +258,7 @@ func renderHardwareSummaryCard(opts HandlerOptions) string {
var labelHTML string
if compType != "" {
labelHTML = fmt.Sprintf(
`<span style="cursor:pointer;text-decoration:underline dotted;text-underline-offset:3px" hx-get="/api/components/%s" hx-target="#component-detail-body" hx-swap="innerHTML" onclick="document.getElementById('component-detail-dialog').showModal()">%s</span>`,
`<span style="cursor:pointer;text-decoration:underline dotted;text-underline-offset:3px" onclick="openComponentDetail('%s')">%s</span>`,
compType, html.EscapeString(label))
} else {
labelHTML = html.EscapeString(label)
@@ -1012,6 +1033,114 @@ func rowIssueHTML(issue string) string {
return html.EscapeString(issue)
}
// aerStatusRe captures the hexadecimal value (1 to 8 digits) that follows
// "aer_status:" in a kernel error detail string.
var aerStatusRe = regexp.MustCompile(`aer_status:\s*0x([0-9a-fA-F]{1,8})`)

// decodeAERStatus parses an AER status hex value from a kernel error detail string
// and returns a human-readable, comma-separated list of the set bit names followed
// by a "(correctable)" or "(uncorrectable)" label.
//
// It returns "" if detail carries no aer_status value, and
// "unknown bits: 0x…" if a value is present but none of its bits is defined in
// either register layout.
//
// The detail string does not say which AER register the value came from, and
// bits 12-15 are defined in both the Correctable and the Uncorrectable Error
// Status registers. The value is therefore decoded against both layouts and the
// layout that explains more bits wins; a tie is reported as correctable.
func decodeAERStatus(detail string) string {
	m := aerStatusRe.FindStringSubmatch(detail)
	if m == nil {
		return ""
	}
	v64, err := strconv.ParseUint(m[1], 16, 32)
	if err != nil {
		return ""
	}
	val := uint32(v64)

	type bitDef struct {
		bit  uint32
		name string
	}
	// PCIe AER Correctable Error Status register layout.
	corrBits := []bitDef{
		{0, "Receiver Error"}, {6, "Bad TLP"}, {7, "Bad DLLP"},
		{8, "Replay Num Rollover"}, {12, "Replay Timer Timeout"},
		{13, "Advisory Non-Fatal"}, {14, "Corrected Internal Error"},
		{15, "Header Log Overflow"},
	}
	// PCIe AER Uncorrectable Error Status register layout.
	uncorrBits := []bitDef{
		{4, "Data Link Protocol Error"}, {5, "Surprise Down Error"},
		{12, "Poisoned TLP Received"}, {13, "Flow Control Protocol Error"},
		{14, "Completion Timeout"}, {15, "Completer Abort"}, {16, "Unexpected Completion"},
		{17, "Receiver Overflow"}, {18, "Malformed TLP"}, {19, "ECRC Error"},
		{20, "Unsupported Request Error"}, {21, "ACS Violation"}, {22, "Uncorrectable Internal Error"},
		{23, "MC Blocked TLP"}, {24, "AtomicOp Egress Blocked"},
		{25, "TLP Prefix Blocked Error"}, {26, "Poisoned TLP Egress Blocked"},
	}

	// setNames lists the names of every bit in defs that is set in val,
	// in table order.
	setNames := func(defs []bitDef) []string {
		var names []string
		for _, d := range defs {
			if val&(1<<d.bit) != 0 {
				names = append(names, d.name)
			}
		}
		return names
	}
	corrNames := setNames(corrBits)
	uncorrNames := setNames(uncorrBits)

	if len(corrNames) > 0 && len(corrNames) >= len(uncorrNames) {
		return strings.Join(corrNames, ", ") + " (correctable)"
	}
	if len(uncorrNames) > 0 {
		return strings.Join(uncorrNames, ", ") + " (uncorrectable)"
	}
	return fmt.Sprintf("unknown bits: 0x%08x", val)
}
// renderSparkline returns a small inline SVG showing non-OK events over time,
// or "" when history contains no non-OK entry.
//
// Each non-OK entry becomes one bar: red for status "Critical", amber for
// anything else. Bars are positioned proportionally between the earliest and
// latest event timestamps; if all events share the same timestamp (including
// the single-event case) they are spaced evenly. Width is always 100px.
//
// history may be in any order: the time range is computed from the actual
// minimum and maximum timestamps rather than from the first and last entry.
func renderSparkline(history []app.ComponentStatusEntry) string {
	const (
		svgW = 100
		svgH = 20
		barW = 3
		barH = 14
	)

	var events []app.ComponentStatusEntry
	for _, e := range history {
		if e.Status != "OK" {
			events = append(events, e)
		}
	}
	if len(events) == 0 {
		return ""
	}
	n := len(events)

	barColor := func(status string) string {
		if status == "Critical" {
			return "#c0392b"
		}
		return "#d97706"
	}

	// Scan for the true time bounds so every proportional position falls
	// inside the viewport regardless of how the caller ordered history.
	minT, maxT := events[0].At, events[0].At
	for _, e := range events[1:] {
		if e.At.Before(minT) {
			minT = e.At
		}
		if e.At.After(maxT) {
			maxT = e.At
		}
	}
	dur := maxT.Sub(minT).Seconds()

	yTop := (svgH - barH) / 2
	var bars strings.Builder
	for i, e := range events {
		var x int
		switch {
		case dur > 0:
			// Proportional placement along the time axis.
			frac := e.At.Sub(minT).Seconds() / dur
			x = int(frac * float64(svgW-barW))
		case n*barW <= svgW:
			// Coincident timestamps: centre each bar in its own equal slot.
			step := svgW / n
			x = i*step + (step-barW)/2
		default:
			// Too many coincident events for one slot each: spread the bars
			// across the full width and let them overlap instead of letting
			// the integer slot width collapse.
			x = i * (svgW - barW) / (n - 1)
		}
		fmt.Fprintf(&bars, `<rect x="%d" y="%d" width="%d" height="%d" fill="%s" rx="1"/>`,
			x, yTop, barW, barH, barColor(e.Status))
	}

	return fmt.Sprintf(
		`<svg width="%d" height="%d" style="display:inline-block;vertical-align:middle;margin-left:6px;flex-shrink:0" xmlns="http://www.w3.org/2000/svg">`+
			`<rect x="0" y="0" width="%d" height="%d" fill="var(--surface-alt,#ebebeb)" rx="3"/>%s</svg>`,
		svgW, svgH, svgW, svgH, bars.String())
}
// renderComponentDetail renders a modal content fragment for one component type.
// Called by handleAPIComponentDetail and displayed inside #component-detail-dialog.
func renderComponentDetail(title string, records []app.ComponentStatusRecord) string {
@@ -1034,16 +1163,41 @@ func renderComponentDetail(title string, records []app.ComponentStatusRecord) st
for _, rec := range records {
letter, cls := chipLetterClass(rec.Status)
// Count non-OK events across the full history for the badge + sparkline.
warnCount := 0
for _, e := range rec.History {
if e.Status != "OK" {
warnCount++
}
}
fmt.Fprintf(&b, `<div style="margin-bottom:20px">`)
fmt.Fprintf(&b, `<div style="display:flex;align-items:center;gap:8px;margin-bottom:8px">`)
fmt.Fprintf(&b, `<div style="display:flex;align-items:center;gap:8px;margin-bottom:8px;flex-wrap:wrap">`)
fmt.Fprintf(&b, `<span class="chip %s">%s</span>`, cls, letter)
fmt.Fprintf(&b, `<span style="font-weight:700;font-size:13px">%s</span>`, html.EscapeString(rec.ComponentKey))
if !rec.LastCheckedAt.IsZero() {
fmt.Fprintf(&b, `<span style="color:var(--muted);font-size:12px">checked %s</span>`, rec.LastCheckedAt.Format("2006-01-02 15:04:05"))
}
if warnCount > 0 {
noun := "events"
if warnCount == 1 {
noun = "event"
}
fmt.Fprintf(&b,
`<span style="font-size:11px;background:var(--warn-bg,#fffbeb);color:var(--warn-fg,#92400e);border:1px solid var(--warn-border,#fde68a);border-radius:10px;padding:1px 7px;white-space:nowrap">%d %s</span>`,
warnCount, noun)
b.WriteString(renderSparkline(rec.History))
}
b.WriteString(`</div>`)
if rec.ErrorSummary != "" {
fmt.Fprintf(&b, `<div style="font-size:12px;margin-bottom:8px;color:var(--muted)">%s</div>`, html.EscapeString(rec.ErrorSummary))
fmt.Fprintf(&b, `<div style="font-size:12px;margin-bottom:4px;color:var(--muted)">%s</div>`, html.EscapeString(rec.ErrorSummary))
if decoded := decodeAERStatus(rec.ErrorSummary); decoded != "" {
fmt.Fprintf(&b,
`<div style="font-size:12px;margin-bottom:8px;color:var(--muted)"><span style="background:var(--surface-alt,#f5f5f5);border-radius:4px;padding:1px 6px;font-family:monospace">AER: %s</span></div>`,
html.EscapeString(decoded))
}
}
// History table — newest first, cap at 20 entries.

View File

@@ -1688,7 +1688,12 @@ echo "=== building ISO (variant: ${BUILD_VARIANT}) ==="
# Export for auto/config
BEE_GPU_VENDOR_UPPER="$(echo "${BUILD_VARIANT}" | tr 'a-z-' 'A-Z_')"
BEE_ISO_VOLUME="EASY_BEE_${BEE_GPU_VENDOR_UPPER}_V${ISO_VERSION_LABEL_TOKEN}"
# ISO 9660 volume ID is limited to 32 characters; truncate the version token to fit.
_vol_prefix="EASY_BEE_${BEE_GPU_VENDOR_UPPER}_V"
_max_token=$(( 32 - ${#_vol_prefix} ))
_vol_token="$(printf '%s' "${ISO_VERSION_LABEL_TOKEN}" | cut -c1-${_max_token})"
BEE_ISO_VOLUME="${_vol_prefix}${_vol_token}"
unset _vol_prefix _max_token _vol_token
export BEE_GPU_VENDOR_UPPER BEE_ISO_VOLUME
cd "${LB_DIR}"

View File

@@ -48,7 +48,7 @@ echo "==> Сборка бинарника..."
echo " OK: $(ls -lh "${LOCAL_BIN}" | awk '{print $5, $9}')"
LOCAL_SHA="$(shasum -a 256 "${LOCAL_BIN}" | awk '{print $1}')"
REMOTE_SHA="$("${SSH_CMD[@]}" "$REMOTE" "if [ -f '${REMOTE_BIN}' ] && command -v sha256sum >/dev/null 2>&1; then sha256sum '${REMOTE_BIN}' | awk '{print \\$1}'; fi" 2>/dev/null || true)"
REMOTE_SHA="$("${SSH_CMD[@]}" "$REMOTE" "if [ -f '${REMOTE_BIN}' ] && command -v sha256sum >/dev/null 2>&1; then sha256sum '${REMOTE_BIN}' | awk '{print \$1}'; fi" 2>/dev/null || true)"
if [[ -n "${REMOTE_SHA}" && "${LOCAL_SHA}" == "${REMOTE_SHA}" ]]; then
echo "==> Бинарник не изменился (${LOCAL_SHA}); копирование и перезапуск сервисов пропущены."
exit 0