Compare commits

..

12 Commits
v3.21 ... v4

Author SHA1 Message Date
dbab43db90 Fix full-history metrics range loading 2026-04-01 23:55:28 +03:00
bcb7fe5fe9 Render charts from full SQLite history 2026-04-01 23:52:54 +03:00
d21d9d191b fix(build): bump DCGM to 4.5.3-1 — core package updated in CUDA repo
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 23:49:57 +03:00
ef45246ea0 fix(sat): kill entire process group on task cancel
exec.CommandContext only kills the direct child (the shell script), leaving
grandchildren (john, gpu-burn, etc.) as orphans. Set Setpgid so each SAT
job runs in its own process group, then send SIGKILL to the whole group
(-pgid) in the Cancel hook.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 23:46:33 +03:00
348db35119 fix(stress): stagger john GPU launches to prevent GWS tuning contention
When 8 john processes start simultaneously they race for GPU memory during
OpenCL GWS auto-tuning. Slower devices settle on a smaller work size (~594MiB
vs 762MiB) and run at 40% instead of 100% load. Add 3s sleep between launches
so each instance finishes memory allocation before the next one starts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 23:44:00 +03:00
1dd7f243f5 Keep chart series colors stable 2026-04-01 23:37:57 +03:00
938e499ac2 Serve charts from SQLite history only 2026-04-01 23:33:13 +03:00
964ab39656 fix: run john stress in parallel per GPU, fix chromium fullscreen, filter BMC virtual disks
- bee-john-gpu-stress: spawn one john process per OpenCL device in parallel
  so all GPUs are stressed simultaneously instead of only device 1
- bee-openbox-session: --start-fullscreen → --start-maximized to fix blank
  white page on first render in fbdev environment
- storage collector: skip Virtual HDisk* devices reported by BMC/iDRAC

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 23:14:21 +03:00
c2aecc6ce9 Fix fan chart gaps and task durations 2026-04-01 22:36:11 +03:00
439b86ce59 Unify live metrics chart rendering 2026-04-01 22:19:33 +03:00
eb60100297 fix: pcie gen, nccl binary, netconf sudo, boot noise, firmware cleanup
- nvidia collector: read pcie.link.gen.current/max from nvidia-smi instead
  of sysfs to avoid false Gen1 readings when GPU is in ASPM idle state
- build: remove bee-nccl-gpu-stress from rm -f list so shell script from
  overlay is not silently dropped from the ISO
- smoketest: add explicit checks for bee-gpu-burn, bee-john-gpu-stress,
  bee-nccl-gpu-stress, all_reduce_perf
- netconf: re-exec via sudo when not root to fix RTNETLINK/resolv.conf errors
- auto/config: reduce loglevel 7→3 to show clean systemd output on boot
- auto/config: blacklist snd_hda_intel and related audio modules (unused on servers)
- package-lists: remove firmware-intel-sound and firmware-amd-graphics from
  base list; move firmware-amd-graphics to bee-amd variant only
- bible-local: mark memtest ADR resolved, document working solution

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-01 21:25:23 +03:00
Mikhail Chusavitin
2baf3be640 Handle memtest recovery probe under set -e 2026-04-01 17:42:13 +03:00
24 changed files with 804 additions and 319 deletions

View File

@@ -13,14 +13,18 @@ import (
const nvidiaVendorID = 0x10de const nvidiaVendorID = 0x10de
type nvidiaGPUInfo struct { type nvidiaGPUInfo struct {
BDF string BDF string
Serial string Serial string
VBIOS string VBIOS string
TemperatureC *float64 TemperatureC *float64
PowerW *float64 PowerW *float64
ECCUncorrected *int64 ECCUncorrected *int64
ECCCorrected *int64 ECCCorrected *int64
HWSlowdown *bool HWSlowdown *bool
PCIeLinkGenCurrent *int
PCIeLinkGenMax *int
PCIeLinkWidthCur *int
PCIeLinkWidthMax *int
} }
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi. // enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) { func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
out, err := exec.Command( out, err := exec.Command(
"nvidia-smi", "nvidia-smi",
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown", "--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
"--format=csv,noheader,nounits", "--format=csv,noheader,nounits",
).Output() ).Output()
if err != nil { if err != nil {
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
if len(rec) == 0 { if len(rec) == 0 {
continue continue
} }
if len(rec) < 9 { if len(rec) < 13 {
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec)) return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
} }
bdf := normalizePCIeBDF(rec[1]) bdf := normalizePCIeBDF(rec[1])
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
} }
info := nvidiaGPUInfo{ info := nvidiaGPUInfo{
BDF: bdf, BDF: bdf,
Serial: strings.TrimSpace(rec[2]), Serial: strings.TrimSpace(rec[2]),
VBIOS: strings.TrimSpace(rec[3]), VBIOS: strings.TrimSpace(rec[3]),
TemperatureC: parseMaybeFloat(rec[4]), TemperatureC: parseMaybeFloat(rec[4]),
PowerW: parseMaybeFloat(rec[5]), PowerW: parseMaybeFloat(rec[5]),
ECCUncorrected: parseMaybeInt64(rec[6]), ECCUncorrected: parseMaybeInt64(rec[6]),
ECCCorrected: parseMaybeInt64(rec[7]), ECCCorrected: parseMaybeInt64(rec[7]),
HWSlowdown: parseMaybeBool(rec[8]), HWSlowdown: parseMaybeBool(rec[8]),
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
PCIeLinkGenMax: parseMaybeInt(rec[10]),
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
} }
result[bdf] = info result[bdf] = info
} }
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
return &n return &n
} }
// parseMaybeInt parses v as a base-10 integer. It returns nil for empty
// input, for the nvidia-smi placeholder strings "N/A" / "Not Supported" /
// "[Not Supported]" (any case), and for anything else strconv cannot parse.
func parseMaybeInt(v string) *int {
	s := strings.TrimSpace(v)
	switch strings.ToLower(s) {
	case "", "n/a", "not supported", "[not supported]":
		return nil
	}
	if n, err := strconv.Atoi(s); err == nil {
		return &n
	}
	return nil
}
// pcieLinkGenLabel formats a PCIe generation number as a "GenN" label
// (e.g. 4 -> "Gen4") for display in the device link-speed fields.
func pcieLinkGenLabel(gen int) string {
	return "Gen" + strconv.Itoa(gen)
}
func parseMaybeBool(v string) *bool { func parseMaybeBool(v string) *bool {
v = strings.TrimSpace(strings.ToLower(v)) v = strings.TrimSpace(strings.ToLower(v))
switch v { switch v {
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
if info.HWSlowdown != nil { if info.HWSlowdown != nil {
dev.HWSlowdown = info.HWSlowdown dev.HWSlowdown = info.HWSlowdown
} }
// Override PCIe link speed/width with nvidia-smi driver values.
// sysfs current_link_speed reflects the instantaneous physical link state and
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
// knows the negotiated speed regardless of the current power state.
if info.PCIeLinkGenCurrent != nil {
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
dev.LinkSpeed = &s
}
if info.PCIeLinkGenMax != nil {
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
dev.MaxLinkSpeed = &s
}
if info.PCIeLinkWidthCur != nil {
dev.LinkWidth = info.PCIeLinkWidthCur
}
if info.PCIeLinkWidthMax != nil {
dev.MaxLinkWidth = info.PCIeLinkWidthMax
}
} }

View File

@@ -6,7 +6,7 @@ import (
) )
func TestParseNVIDIASMIQuery(t *testing.T) { func TestParseNVIDIASMIQuery(t *testing.T) {
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n" raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
byBDF, err := parseNVIDIASMIQuery(raw) byBDF, err := parseNVIDIASMIQuery(raw)
if err != nil { if err != nil {
t.Fatalf("parse failed: %v", err) t.Fatalf("parse failed: %v", err)
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
if gpu.HWSlowdown == nil || *gpu.HWSlowdown { if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown) t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
} }
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
}
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
}
} }
func TestNormalizePCIeBDF(t *testing.T) { func TestNormalizePCIeBDF(t *testing.T) {

View File

@@ -77,11 +77,24 @@ func discoverStorageDevices() []lsblkDevice {
if dev.Type != "disk" { if dev.Type != "disk" {
continue continue
} }
if isVirtualBMCDisk(dev) {
slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
continue
}
disks = append(disks, dev) disks = append(disks, dev)
} }
return disks return disks
} }
// isVirtualBMCDisk reports whether dev is a BMC/IPMI virtual USB mass
// storage device (e.g. iDRAC "Virtual HDisk0") that lsblk lists as a disk
// but that is not real hardware. Detection is by model-name prefix only:
// a case-insensitive match on "Virtual HDisk". Such devices typically also
// report zero size and a generic serial, but the model prefix alone is what
// this check relies on.
func isVirtualBMCDisk(dev lsblkDevice) bool {
	model := strings.ToLower(strings.TrimSpace(dev.Model))
	return strings.HasPrefix(model, "virtual hdisk")
}
func lsblkDevices() []lsblkDevice { func lsblkDevices() []lsblkDevice {
out, err := exec.Command("lsblk", "-J", "-d", out, err := exec.Command("lsblk", "-J", "-d",
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output() "-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()

View File

@@ -12,6 +12,7 @@ import (
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"syscall"
"sort" "sort"
"strconv" "strconv"
"strings" "strings"
@@ -531,6 +532,13 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
} }
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...) c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
c.Cancel = func() error {
if c.Process != nil {
_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
}
return nil
}
if len(env) > 0 { if len(env) > 0 {
c.Env = append(os.Environ(), env...) c.Env = append(os.Environ(), env...)
} }

View File

@@ -346,8 +346,10 @@ func (h *handler) handleAPINetworkStatus(w http.ResponseWriter, r *http.Request)
return return
} }
writeJSON(w, map[string]any{ writeJSON(w, map[string]any{
"interfaces": ifaces, "interfaces": ifaces,
"default_route": h.opts.App.DefaultRoute(), "default_route": h.opts.App.DefaultRoute(),
"pending_change": h.hasPendingNetworkChange(),
"rollback_in": h.pendingNetworkRollbackIn(),
}) })
} }
@@ -744,13 +746,7 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
h.ringMemLoad.push(sample.MemLoadPct) h.ringMemLoad.push(sample.MemLoadPct)
h.ringsMu.Lock() h.ringsMu.Lock()
for i, fan := range sample.Fans { h.pushFanRings(sample.Fans)
for len(h.ringFans) <= i {
h.ringFans = append(h.ringFans, newMetricsRing(120))
h.fanNames = append(h.fanNames, fan.Name)
}
h.ringFans[i].push(float64(fan.RPM))
}
for _, gpu := range sample.GPUs { for _, gpu := range sample.GPUs {
idx := gpu.GPUIndex idx := gpu.GPUIndex
for len(h.gpuRings) <= idx { for len(h.gpuRings) <= idx {
@@ -769,6 +765,51 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
h.ringsMu.Unlock() h.ringsMu.Unlock()
} }
// pushFanRings records one RPM sample for every known fan ring.
//
// Fans are tracked by name: h.fanNames[i] pairs with h.ringFans[i], so a fan
// keeps its ring position across samples even when it is missing from one.
// A fan absent from this sample has its ring padded with the last pushed
// value (or 0 if it never reported), keeping all rings the same length so
// chart series do not shift between columns. feedRings calls this while
// holding h.ringsMu.
func (h *handler) pushFanRings(fans []platform.FanReading) {
	// Nothing sampled and nothing tracked: no rings to pad either.
	if len(fans) == 0 && len(h.ringFans) == 0 {
		return
	}
	// Index this sample's readings by fan name for the padding pass below.
	fanValues := make(map[string]float64, len(fans))
	for _, fan := range fans {
		if fan.Name == "" {
			continue
		}
		fanValues[fan.Name] = fan.RPM
		// Register the fan (name + ring) on first sight; keep existing slot otherwise.
		found := false
		for i, name := range h.fanNames {
			if name == fan.Name {
				found = true
				// Defensive: re-create the ring if names and rings somehow diverged.
				if i >= len(h.ringFans) {
					h.ringFans = append(h.ringFans, newMetricsRing(120))
				}
				break
			}
		}
		if !found {
			h.fanNames = append(h.fanNames, fan.Name)
			h.ringFans = append(h.ringFans, newMetricsRing(120))
		}
	}
	// Push exactly one value into every ring: the sampled RPM when present,
	// otherwise carry the previous value forward (or 0 for a fresh ring).
	for i, ring := range h.ringFans {
		if ring == nil {
			continue
		}
		name := ""
		if i < len(h.fanNames) {
			name = h.fanNames[i]
		}
		if rpm, ok := fanValues[name]; ok {
			ring.push(rpm)
			continue
		}
		if last, ok := ring.latest(); ok {
			ring.push(last)
			continue
		}
		ring.push(0)
	}
}
func (h *handler) pushNamedMetricRing(dst *[]*namedMetricsRing, name string, value float64) { func (h *handler) pushNamedMetricRing(dst *[]*namedMetricsRing, name string, value float64) {
if name == "" { if name == "" {
return return
@@ -847,7 +888,10 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
return result, err return result, err
} }
pnc := &pendingNetChange{snapshot: snapshot} pnc := &pendingNetChange{
snapshot: snapshot,
deadline: time.Now().Add(netRollbackTimeout),
}
pnc.timer = time.AfterFunc(netRollbackTimeout, func() { pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
_ = h.opts.App.RestoreNetworkSnapshot(snapshot) _ = h.opts.App.RestoreNetworkSnapshot(snapshot)
h.pendingNetMu.Lock() h.pendingNetMu.Lock()
@@ -864,6 +908,25 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
return result, nil return result, nil
} }
// hasPendingNetworkChange reports whether a network change is currently
// awaiting confirmation (i.e. an automatic rollback is still scheduled).
func (h *handler) hasPendingNetworkChange() bool {
	h.pendingNetMu.Lock()
	pending := h.pendingNet != nil
	h.pendingNetMu.Unlock()
	return pending
}
// pendingNetworkRollbackIn returns the number of whole seconds until the
// pending network change is automatically rolled back. It returns 0 when no
// change is pending, and never less than 1 while one is (the rollback timer
// has not fired yet, so some time must remain).
func (h *handler) pendingNetworkRollbackIn() int {
	h.pendingNetMu.Lock()
	defer h.pendingNetMu.Unlock()
	pnc := h.pendingNet
	if pnc == nil {
		return 0
	}
	secs := int(time.Until(pnc.deadline).Seconds())
	if secs < 1 {
		return 1
	}
	return secs
}
func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) { func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
h.pendingNetMu.Lock() h.pendingNetMu.Lock()
pnc := h.pendingNet pnc := h.pendingNet

View File

@@ -7,6 +7,7 @@ import (
"testing" "testing"
"bee/audit/internal/app" "bee/audit/internal/app"
"bee/audit/internal/platform"
) )
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) { func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
@@ -100,3 +101,29 @@ func TestHandleAPIExportBundleQueuesTask(t *testing.T) {
t.Fatalf("target=%q want support-bundle", got) t.Fatalf("target=%q want support-bundle", got)
} }
} }
// TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples checks that
// pushFanRings keys rings by fan name and, when a fan is absent from a
// sample, pads its ring with the previously pushed value so every ring
// stays the same length and series remain aligned.
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
	h := &handler{}
	// First sample registers both fans.
	h.pushFanRings([]platform.FanReading{
		{Name: "FAN_A", RPM: 4200},
		{Name: "FAN_B", RPM: 5100},
	})
	// Second sample omits FAN_A entirely.
	h.pushFanRings([]platform.FanReading{
		{Name: "FAN_B", RPM: 5200},
	})
	if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
		t.Fatalf("fanNames=%v", h.fanNames)
	}
	aVals, _ := h.ringFans[0].snapshot()
	bVals, _ := h.ringFans[1].snapshot()
	// Both rings must have one value per sample, even for the missing fan.
	if len(aVals) != 2 || len(bVals) != 2 {
		t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
	}
	// FAN_A was missing from the second sample: last value carried forward.
	if aVals[1] != 4200 {
		t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
	}
	if bVals[1] != 5200 {
		t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
	}
}

View File

@@ -120,7 +120,7 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
// LoadRecent returns up to n samples in chronological order (oldest first). // LoadRecent returns up to n samples in chronological order (oldest first).
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) { func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n) return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
} }
// LoadAll returns all persisted samples in chronological order (oldest first). // LoadAll returns all persisted samples in chronological order (oldest first).
@@ -151,11 +151,6 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
if len(sysRows) == 0 { if len(sysRows) == 0 {
return nil, nil return nil, nil
} }
// Reverse to chronological order
for i, j := 0, len(sysRows)-1; i < j; i, j = i+1, j-1 {
sysRows[i], sysRows[j] = sysRows[j], sysRows[i]
}
// Collect min/max ts for range query // Collect min/max ts for range query
minTS := sysRows[0].ts minTS := sysRows[0].ts
maxTS := sysRows[len(sysRows)-1].ts maxTS := sysRows[len(sysRows)-1].ts

View File

@@ -0,0 +1,69 @@
package webui
import (
"path/filepath"
"testing"
"time"
"bee/audit/internal/platform"
)
// TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs verifies that both
// LoadAll and LoadRecent return samples in ascending timestamp order and
// re-attach the per-GPU rows (including non-contiguous GPU indexes 0 and 2)
// to each returned sample.
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
	db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
	if err != nil {
		t.Fatalf("openMetricsDB: %v", err)
	}
	defer db.Close()
	// Write three samples one second apart with values derived from the loop
	// index i, so the assertions below can recompute the expected numbers.
	base := time.Unix(1_700_000_000, 0).UTC()
	for i := 0; i < 3; i++ {
		err := db.Write(platform.LiveMetricSample{
			Timestamp:  base.Add(time.Duration(i) * time.Second),
			CPULoadPct: float64(10 + i),
			MemLoadPct: float64(20 + i),
			PowerW:     float64(300 + i),
			GPUs: []platform.GPUMetricRow{
				{GPUIndex: 0, PowerW: float64(100 + i)},
				{GPUIndex: 2, PowerW: float64(200 + i)},
			},
		})
		if err != nil {
			t.Fatalf("Write(%d): %v", i, err)
		}
	}
	all, err := db.LoadAll()
	if err != nil {
		t.Fatalf("LoadAll: %v", err)
	}
	if len(all) != 3 {
		t.Fatalf("LoadAll len=%d want 3", len(all))
	}
	// Each sample must carry both GPU rows, in GPU-index order, with the
	// power values written for that sample's position i.
	for i, sample := range all {
		if len(sample.GPUs) != 2 {
			t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
		}
		if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
			t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
		}
		if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
			t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
		}
	}
	// LoadRecent(2) should return the newest two samples, oldest first.
	recent, err := db.LoadRecent(2)
	if err != nil {
		t.Fatalf("LoadRecent: %v", err)
	}
	if len(recent) != 2 {
		t.Fatalf("LoadRecent len=%d want 2", len(recent))
	}
	if !recent[0].Timestamp.Before(recent[1].Timestamp) {
		t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
	}
	for i, sample := range recent {
		if len(sample.GPUs) != 2 {
			t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
		}
	}
}

View File

@@ -522,13 +522,30 @@ func renderMetrics() string {
</div> </div>
<script> <script>
// IDs of the <img> elements that display server and GPU metric charts.
const chartIds = [
  'chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
  'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'
];
// Refresh one chart image without flicker: pre-load the new SVG into an
// off-screen Image and swap el.src only after it loads successfully, so a
// failed fetch keeps the previous chart on screen. dataset.loading acts as
// a re-entrancy guard so overlapping timer ticks don't start a second probe
// for the same element.
function refreshChartImage(el) {
  if (!el || el.dataset.loading === '1') return;
  // Remember the un-versioned URL so repeated refreshes don't stack ?t params.
  const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
  const nextSrc = baseSrc + '?t=' + Date.now();
  const probe = new Image();
  el.dataset.baseSrc = baseSrc;
  el.dataset.loading = '1';
  probe.onload = function() {
    el.src = nextSrc;
    el.dataset.loading = '0';
  };
  probe.onerror = function() {
    // Keep the old image; clear the guard so the next tick can retry.
    el.dataset.loading = '0';
  };
  probe.src = nextSrc;
}
function refreshCharts() { function refreshCharts() {
const t = '?t=' + Date.now(); chartIds.forEach(id => refreshChartImage(document.getElementById(id)));
['chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'].forEach(id => {
const el = document.getElementById(id);
if (el) el.src = el.src.split('?')[0] + t;
});
} }
setInterval(refreshCharts, 3000); setInterval(refreshCharts, 3000);
@@ -892,6 +909,8 @@ func renderNetworkInline() string {
</div> </div>
<script> <script>
var _netCountdownTimer = null; var _netCountdownTimer = null;
var _netRefreshTimer = null;
const NET_ROLLBACK_SECS = 60;
function loadNetwork() { function loadNetwork() {
fetch('/api/network').then(r=>r.json()).then(d => { fetch('/api/network').then(r=>r.json()).then(d => {
const rows = (d.interfaces||[]).map(i => const rows = (d.interfaces||[]).map(i =>
@@ -902,21 +921,33 @@ function loadNetwork() {
document.getElementById('iface-table').innerHTML = document.getElementById('iface-table').innerHTML =
'<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' + '<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
(d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : ''); (d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
}); if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
else hideNetPending();
}).catch(function() {});
} }
function selectIface(iface) { function selectIface(iface) {
document.getElementById('dhcp-iface').value = iface; document.getElementById('dhcp-iface').value = iface;
document.getElementById('st-iface').value = iface; document.getElementById('st-iface').value = iface;
} }
function toggleIface(iface, currentState) { function toggleIface(iface, currentState) {
showNetPending(NET_ROLLBACK_SECS);
fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})}) fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
.then(r=>r.json()).then(d => { .then(r=>r.json()).then(d => {
if (d.error) { alert('Error: '+d.error); return; } if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
loadNetwork(); loadNetwork();
showNetPending(d.rollback_in || 60); showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
}).catch(function() {
setTimeout(loadNetwork, 1500);
}); });
} }
// Hide the pending-network-change banner and stop its countdown interval.
// Safe to call when no countdown is running or the banner element is absent:
// getElementById may return null, so guard before touching el.style
// (the original dereferenced el unconditionally and would throw).
function hideNetPending() {
  const el = document.getElementById('net-pending');
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  _netCountdownTimer = null;
  if (el) el.style.display = 'none';
}
function showNetPending(secs) { function showNetPending(secs) {
if (!secs || secs < 1) { hideNetPending(); return; }
const el = document.getElementById('net-pending'); const el = document.getElementById('net-pending');
el.style.display = 'block'; el.style.display = 'block';
if (_netCountdownTimer) clearInterval(_netCountdownTimer); if (_netCountdownTimer) clearInterval(_netCountdownTimer);
@@ -925,30 +956,33 @@ function showNetPending(secs) {
_netCountdownTimer = setInterval(function() { _netCountdownTimer = setInterval(function() {
remaining--; remaining--;
document.getElementById('net-countdown').textContent = remaining; document.getElementById('net-countdown').textContent = remaining;
if (remaining <= 0) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; el.style.display='none'; loadNetwork(); } if (remaining <= 0) { hideNetPending(); loadNetwork(); }
}, 1000); }, 1000);
} }
function confirmNetChange() { function confirmNetChange() {
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; } hideNetPending();
document.getElementById('net-pending').style.display='none'; fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
fetch('/api/network/confirm',{method:'POST'});
} }
function rollbackNetChange() { function rollbackNetChange() {
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; } hideNetPending();
document.getElementById('net-pending').style.display='none'; fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork());
} }
function runDHCP() { function runDHCP() {
const iface = document.getElementById('dhcp-iface').value.trim(); const iface = document.getElementById('dhcp-iface').value.trim();
showNetPending(NET_ROLLBACK_SECS);
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})}) fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
.then(r=>r.json()).then(d => { .then(r=>r.json()).then(d => {
document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.'; document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
if (!d.error) showNetPending(d.rollback_in || 60); if (d.error) { hideNetPending(); return; }
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
loadNetwork(); loadNetwork();
}).catch(function() {
setTimeout(loadNetwork, 1500);
}); });
} }
function setStatic() { function setStatic() {
const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean); const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
showNetPending(NET_ROLLBACK_SECS);
fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({ fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
interface: document.getElementById('st-iface').value, interface: document.getElementById('st-iface').value,
address: document.getElementById('st-addr').value, address: document.getElementById('st-addr').value,
@@ -957,11 +991,16 @@ function setStatic() {
dns: dns, dns: dns,
})}).then(r=>r.json()).then(d => { })}).then(r=>r.json()).then(d => {
document.getElementById('static-out').textContent = d.output || d.error || 'Done.'; document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
if (!d.error) showNetPending(d.rollback_in || 60); if (d.error) { hideNetPending(); return; }
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
loadNetwork(); loadNetwork();
}).catch(function() {
setTimeout(loadNetwork, 1500);
}); });
} }
loadNetwork(); loadNetwork();
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
_netRefreshTimer = setInterval(loadNetwork, 5000);
</script>` </script>`
} }
@@ -1562,7 +1601,7 @@ function loadTasks() {
return; return;
} }
const rows = tasks.map(t => { const rows = tasks.map(t => {
const dur = t.started_at ? formatDur(t.started_at, t.done_at) : ''; const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown'; const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
const statusLabel = {running:'&#9654; running',pending:'pending',done:'&#10003; done',failed:'&#10007; failed',cancelled:'cancelled'}[t.status]||t.status; const statusLabel = {running:'&#9654; running',pending:'pending',done:'&#10003; done',failed:'&#10007; failed',cancelled:'cancelled'}[t.status]||t.status;
let actions = '<button class="btn btn-sm btn-secondary" onclick="viewLog(\''+t.id+'\',\''+escHtml(t.name)+'\')">Logs</button>'; let actions = '<button class="btn btn-sm btn-secondary" onclick="viewLog(\''+t.id+'\',\''+escHtml(t.name)+'\')">Logs</button>';
@@ -1587,14 +1626,11 @@ function loadTasks() {
function escHtml(s) { return (s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); } function escHtml(s) { return (s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } } function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
function formatDur(start, end) { function formatDurSec(sec) {
try { sec = Math.max(0, Math.round(sec||0));
const s = new Date(start), e = end ? new Date(end) : new Date(); if (sec < 60) return sec+'s';
const sec = Math.round((e-s)/1000); const m = Math.floor(sec/60), ss = sec%60;
if (sec < 60) return sec+'s'; return m+'m '+ss+'s';
const m = Math.floor(sec/60), ss = sec%60;
return m+'m '+ss+'s';
} catch(e){ return ''; }
} }
function cancelTask(id) { function cancelTask(id) {

View File

@@ -10,6 +10,7 @@ import (
"net/http" "net/http"
"os" "os"
"path/filepath" "path/filepath"
"sort"
"strings" "strings"
"sync" "sync"
"time" "time"
@@ -84,6 +85,15 @@ func (r *metricsRing) snapshot() ([]float64, []string) {
return v, labels return v, labels
} }
// latest returns the most recently pushed value in the ring, or (0, false)
// when no samples have been recorded yet.
func (r *metricsRing) latest() (float64, bool) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if n := len(r.vals); n > 0 {
		return r.vals[n-1], true
	}
	return 0, false
}
func timestampsSameLocalDay(times []time.Time) bool { func timestampsSameLocalDay(times []time.Time) bool {
if len(times) == 0 { if len(times) == 0 {
return true return true
@@ -118,9 +128,12 @@ type namedMetricsRing struct {
Ring *metricsRing Ring *metricsRing
} }
const metricsChartWindow = 120
// pendingNetChange tracks a network state change awaiting confirmation. // pendingNetChange tracks a network state change awaiting confirmation.
type pendingNetChange struct { type pendingNetChange struct {
snapshot platform.NetworkSnapshot snapshot platform.NetworkSnapshot
deadline time.Time
timer *time.Timer timer *time.Timer
mu sync.Mutex mu sync.Mutex
} }
@@ -171,7 +184,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
// Open metrics DB and pre-fill ring buffers from history. // Open metrics DB and pre-fill ring buffers from history.
if db, err := openMetricsDB(metricsDBPath); err == nil { if db, err := openMetricsDB(metricsDBPath); err == nil {
h.metricsDB = db h.metricsDB = db
if samples, err := db.LoadRecent(120); err == nil { if samples, err := db.LoadRecent(metricsChartWindow); err == nil {
for _, s := range samples { for _, s := range samples {
h.feedRings(s) h.feedRings(s)
} }
@@ -292,11 +305,11 @@ func (h *handler) startMetricsCollector() {
defer ticker.Stop() defer ticker.Stop()
for range ticker.C { for range ticker.C {
sample := platform.SampleLiveMetrics() sample := platform.SampleLiveMetrics()
h.feedRings(sample)
h.setLatestMetric(sample)
if h.metricsDB != nil { if h.metricsDB != nil {
_ = h.metricsDB.Write(sample) _ = h.metricsDB.Write(sample)
} }
h.feedRings(sample)
h.setLatestMetric(sample)
} }
}() }()
} }
@@ -448,222 +461,13 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/") path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
path = strings.TrimSuffix(path, ".svg") path = strings.TrimSuffix(path, ".svg")
if h.metricsDB != nil { if h.metricsDB == nil {
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok { http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax) return
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
w.Header().Set("Content-Type", "image/svg+xml")
w.Header().Set("Cache-Control", "no-store")
_, _ = w.Write(buf)
return
}
} }
datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
var datasets [][]float64 if !ok {
var names []string http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
var labels []string
var title string
var yMin, yMax *float64 // nil = auto; for load charts fixed 0-100
switch {
// ── Server sub-charts ─────────────────────────────────────────────────
case path == "server-load":
title = "CPU / Memory Load"
vCPULoad, l := h.ringCPULoad.snapshot()
vMemLoad, _ := h.ringMemLoad.snapshot()
labels = l
datasets = [][]float64{vCPULoad, vMemLoad}
names = []string{"CPU Load %", "Mem Load %"}
yMin = floatPtr(0)
yMax = floatPtr(100)
case path == "server-temp", path == "server-temp-cpu":
title = "CPU Temperature"
h.ringsMu.Lock()
datasets, names, labels = snapshotNamedRings(h.cpuTempRings)
h.ringsMu.Unlock()
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case path == "server-temp-gpu":
title = "GPU Temperature"
h.ringsMu.Lock()
for idx, gr := range h.gpuRings {
if gr == nil {
continue
}
vTemp, l := gr.Temp.snapshot()
datasets = append(datasets, vTemp)
names = append(names, fmt.Sprintf("GPU %d", idx))
if len(labels) == 0 {
labels = l
}
}
h.ringsMu.Unlock()
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case path == "server-temp-ambient":
title = "Ambient / Other Sensors"
h.ringsMu.Lock()
datasets, names, labels = snapshotNamedRings(h.ambientTempRings)
h.ringsMu.Unlock()
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case path == "server-power":
title = "System Power"
vPower, l := h.ringPower.snapshot()
vPower = normalizePowerSeries(vPower)
labels = l
datasets = [][]float64{vPower}
names = []string{"Power W"}
yMin = floatPtr(0)
yMax = autoMax120(vPower)
case path == "server-fans":
title = "Fan RPM"
h.ringsMu.Lock()
for i, fr := range h.ringFans {
fv, _ := fr.snapshot()
datasets = append(datasets, fv)
name := "Fan"
if i < len(h.fanNames) {
name = h.fanNames[i]
}
names = append(names, name+" RPM")
}
h.ringsMu.Unlock()
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
// ── Combined GPU charts (all GPUs on one chart) ───────────────────────
case path == "gpu-all-load":
title = "GPU Compute Load"
h.ringsMu.Lock()
for idx, gr := range h.gpuRings {
if gr == nil {
continue
}
vUtil, l := gr.Util.snapshot()
datasets = append(datasets, vUtil)
names = append(names, fmt.Sprintf("GPU %d", idx))
if len(labels) == 0 {
labels = l
}
}
h.ringsMu.Unlock()
yMin = floatPtr(0)
yMax = floatPtr(100)
case path == "gpu-all-memload":
title = "GPU Memory Load"
h.ringsMu.Lock()
for idx, gr := range h.gpuRings {
if gr == nil {
continue
}
vMem, l := gr.MemUtil.snapshot()
datasets = append(datasets, vMem)
names = append(names, fmt.Sprintf("GPU %d", idx))
if len(labels) == 0 {
labels = l
}
}
h.ringsMu.Unlock()
yMin = floatPtr(0)
yMax = floatPtr(100)
case path == "gpu-all-power":
title = "GPU Power"
h.ringsMu.Lock()
for idx, gr := range h.gpuRings {
if gr == nil {
continue
}
vPow, l := gr.Power.snapshot()
datasets = append(datasets, vPow)
names = append(names, fmt.Sprintf("GPU %d", idx))
if len(labels) == 0 {
labels = l
}
}
h.ringsMu.Unlock()
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
case path == "gpu-all-temp":
title = "GPU Temperature"
h.ringsMu.Lock()
for idx, gr := range h.gpuRings {
if gr == nil {
continue
}
vTemp, l := gr.Temp.snapshot()
datasets = append(datasets, vTemp)
names = append(names, fmt.Sprintf("GPU %d", idx))
if len(labels) == 0 {
labels = l
}
}
h.ringsMu.Unlock()
yMin = floatPtr(0)
yMax = autoMax120(datasets...)
// ── Per-GPU sub-charts ────────────────────────────────────────────────
case strings.HasPrefix(path, "gpu/"):
rest := strings.TrimPrefix(path, "gpu/")
// rest is either "{idx}-load", "{idx}-temp", "{idx}-power", or legacy "{idx}"
sub := ""
if i := strings.LastIndex(rest, "-"); i > 0 {
sub = rest[i+1:]
rest = rest[:i]
}
idx := 0
fmt.Sscanf(rest, "%d", &idx)
h.ringsMu.Lock()
var gr *gpuRings
if idx < len(h.gpuRings) {
gr = h.gpuRings[idx]
}
h.ringsMu.Unlock()
if gr == nil {
http.NotFound(w, r)
return
}
switch sub {
case "load":
vUtil, l := gr.Util.snapshot()
vMemUtil, _ := gr.MemUtil.snapshot()
labels = l
title = fmt.Sprintf("GPU %d Load", idx)
datasets = [][]float64{vUtil, vMemUtil}
names = []string{"Load %", "Mem %"}
yMin = floatPtr(0)
yMax = floatPtr(100)
case "temp":
vTemp, l := gr.Temp.snapshot()
labels = l
title = fmt.Sprintf("GPU %d Temperature", idx)
datasets = [][]float64{vTemp}
names = []string{"Temp °C"}
yMin = floatPtr(0)
yMax = autoMax120(vTemp)
default: // "power" or legacy (no sub)
vPower, l := gr.Power.snapshot()
labels = l
title = fmt.Sprintf("GPU %d Power", idx)
datasets = [][]float64{vPower}
names = []string{"Power W"}
yMin = floatPtr(0)
yMax = autoMax120(vPower)
}
default:
http.NotFound(w, r)
return return
} }
@@ -840,6 +644,7 @@ func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]f
} }
} }
} }
sort.Strings(names)
datasets := make([][]float64, 0, len(names)) datasets := make([][]float64, 0, len(names))
for _, name := range names { for _, name := range names {
ds := make([]float64, len(samples)) ds := make([]float64, len(samples))
@@ -867,6 +672,7 @@ func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []strin
} }
} }
} }
sort.Strings(names)
datasets := make([][]float64, 0, len(names)) datasets := make([][]float64, 0, len(names))
for _, name := range names { for _, name := range names {
ds := make([]float64, len(samples)) ds := make([]float64, len(samples))
@@ -878,7 +684,7 @@ func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []strin
} }
} }
} }
datasets = append(datasets, ds) datasets = append(datasets, normalizeFanSeries(ds))
} }
return datasets, names return datasets, names
} }
@@ -894,6 +700,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
} }
} }
} }
sort.Ints(indices)
datasets := make([][]float64, 0, len(indices)) datasets := make([][]float64, 0, len(indices))
names := make([]string, 0, len(indices)) names := make([]string, 0, len(indices))
for _, idx := range indices { for _, idx := range indices {
@@ -953,6 +760,27 @@ func normalizePowerSeries(ds []float64) []float64 {
return out return out
} }
// normalizeFanSeries forward-fills gaps in a fan RPM series: any non-positive
// sample is replaced with the most recent positive reading, so momentary
// sensor dropouts do not render as dips to zero on the chart. Samples before
// the first positive reading stay at zero. An empty input yields nil.
func normalizeFanSeries(ds []float64) []float64 {
	if len(ds) == 0 {
		return nil
	}
	result := make([]float64, len(ds))
	held := 0.0
	for i, sample := range ds {
		if sample > 0 {
			held = sample
		}
		// held is 0 until the first positive sample, matching the
		// "leading gaps stay zero" behavior.
		result[i] = held
	}
	return result
}
// floatPtr returns a pointer to a float64 value. // floatPtr returns a pointer to a float64 value.
func floatPtr(v float64) *float64 { return &v } func floatPtr(v float64) *float64 { return &v }
@@ -1044,15 +872,17 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
opt.Title = gocharts.TitleOption{Text: title} opt.Title = gocharts.TitleOption{Text: title}
opt.XAxis.Labels = sparse opt.XAxis.Labels = sparse
opt.Legend = gocharts.LegendOption{SeriesNames: names} opt.Legend = gocharts.LegendOption{SeriesNames: names}
if chartLegendVisible(len(names)) {
opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
opt.Legend.OverlayChart = gocharts.Ptr(false)
} else {
opt.Legend.Show = gocharts.Ptr(false)
}
opt.Symbol = gocharts.SymbolNone opt.Symbol = gocharts.SymbolNone
// Right padding: reserve space for the MarkLine label (library recommendation). // Right padding: reserve space for the MarkLine label (library recommendation).
opt.Padding = gocharts.NewBox(20, 20, 80, 20) opt.Padding = gocharts.NewBox(20, 20, 80, 20)
if yMin != nil || yMax != nil { if yMin != nil || yMax != nil {
opt.YAxis = []gocharts.YAxisOption{{ opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
Min: yMin,
Max: yMax,
ValueFormatter: chartLegendNumber,
}}
} }
// Add a single peak mark line on the series that holds the global maximum. // Add a single peak mark line on the series that holds the global maximum.
@@ -1064,7 +894,7 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
p := gocharts.NewPainter(gocharts.PainterOptions{ p := gocharts.NewPainter(gocharts.PainterOptions{
OutputFormat: gocharts.ChartOutputSVG, OutputFormat: gocharts.ChartOutputSVG,
Width: 1400, Width: 1400,
Height: 240, Height: chartCanvasHeight(len(names)),
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana"))) }, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
if err := p.LineChart(opt); err != nil { if err := p.LineChart(opt); err != nil {
return nil, err return nil, err
@@ -1072,6 +902,26 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
return p.Bytes() return p.Bytes()
} }
// chartLegendVisible reports whether a chart with the given number of series
// should draw its legend. Per the live-charts UI policy, crowded charts
// (more than 8 series) hide the legend entirely.
func chartLegendVisible(seriesCount int) bool {
	const maxLegendSeries = 8
	return seriesCount <= maxLegendSeries
}
// chartCanvasHeight returns the SVG canvas height in pixels for a chart with
// the given series count: 360 when a legend row is drawn below the plot
// area, 288 when the legend is hidden (see chartLegendVisible).
func chartCanvasHeight(seriesCount int) int {
	if !chartLegendVisible(seriesCount) {
		return 288
	}
	return 360
}
// chartYAxisOption builds the shared Y-axis configuration used by all live
// charts: optional fixed min/max bounds (nil = auto), 11 tick labels
// (10 equal steps across the chart height), and the compact "к"-suffixed
// value formatter.
func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
	opt := gocharts.YAxisOption{
		ValueFormatter: chartYAxisNumber,
		LabelCount:     11,
	}
	opt.Min = yMin
	opt.Max = yMax
	return opt
}
// globalPeakSeries returns the index of the series containing the global maximum // globalPeakSeries returns the index of the series containing the global maximum
// value across all datasets, and that maximum value. // value across all datasets, and that maximum value.
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) { func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
@@ -1159,6 +1009,28 @@ func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []str
return datasets, names, labels return datasets, names, labels
} }
// snapshotFanRings copies the current contents of each fan RPM ring buffer
// into chart-ready series. Nil rings are skipped, every series is gap-filled
// via normalizeFanSeries, series names come from fanNames (falling back to
// "Fan" when no name is known), and the timeline labels are taken from the
// first usable ring's snapshot.
func snapshotFanRings(rings []*metricsRing, fanNames []string) ([][]float64, []string, []string) {
	var (
		datasets [][]float64
		names    []string
		labels   []string
	)
	for idx, ring := range rings {
		if ring == nil {
			continue
		}
		values, timeline := ring.snapshot()
		if len(labels) == 0 {
			labels = timeline
		}
		label := "Fan"
		if idx < len(fanNames) {
			label = fanNames[idx]
		}
		datasets = append(datasets, normalizeFanSeries(values))
		names = append(names, label+" RPM")
	}
	return datasets, names, labels
}
func chartLegendNumber(v float64) string { func chartLegendNumber(v float64) string {
neg := v < 0 neg := v < 0
if v < 0 { if v < 0 {
@@ -1181,6 +1053,23 @@ func chartLegendNumber(v float64) string {
return out return out
} }
// chartYAxisNumber formats a Y-axis tick value compactly: values of 1000 or
// more are rounded to the nearest thousand and suffixed with "к"
// (e.g. 1500 -> "2к"), smaller magnitudes are printed as plain integers.
// The sign is preserved for negative inputs.
func chartYAxisNumber(v float64) string {
	sign := ""
	if v < 0 {
		sign = "-"
		v = -v
	}
	if v >= 1000 {
		// +500 before integer division rounds to the nearest thousand.
		return sign + fmt.Sprintf("%dк", int((v+500)/1000))
	}
	return sign + fmt.Sprintf("%.0f", v)
}
func sparseLabels(labels []string, n int) []string { func sparseLabels(labels []string, n int) []string {
out := make([]string, len(labels)) out := make([]string, len(labels))
step := len(labels) / n step := len(labels) / n

View File

@@ -89,6 +89,53 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
} }
} }
// TestChartDataFromSamplesKeepsStableGPUSeriesOrder verifies that the
// combined "gpu-all-power" chart emits one series per GPU index, sorted in
// ascending index order (GPU 0, 2, 7) regardless of the order GPUs appear in
// each sample, and that each dataset's values track its own GPU across
// samples.
func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
	// Two samples listing the same three GPUs in different orders.
	samples := []platform.LiveMetricSample{
		{
			Timestamp: time.Now().Add(-2 * time.Minute),
			GPUs: []platform.GPUMetricRow{
				{GPUIndex: 7, PowerW: 170},
				{GPUIndex: 2, PowerW: 120},
				{GPUIndex: 0, PowerW: 100},
			},
		},
		{
			Timestamp: time.Now().Add(-1 * time.Minute),
			GPUs: []platform.GPUMetricRow{
				{GPUIndex: 0, PowerW: 101},
				{GPUIndex: 7, PowerW: 171},
				{GPUIndex: 2, PowerW: 121},
			},
		},
	}
	datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
	if !ok {
		t.Fatal("chartDataFromSamples returned ok=false")
	}
	if title != "GPU Power" {
		t.Fatalf("title=%q", title)
	}
	// Series names must follow sorted GPU index, not first-seen order.
	wantNames := []string{"GPU 0", "GPU 2", "GPU 7"}
	if len(names) != len(wantNames) {
		t.Fatalf("names len=%d want %d: %v", len(names), len(wantNames), names)
	}
	for i := range wantNames {
		if names[i] != wantNames[i] {
			t.Fatalf("names[%d]=%q want %q; full=%v", i, names[i], wantNames[i], names)
		}
	}
	// Each dataset carries that GPU's per-sample power values in sample order.
	if got := datasets[0]; len(got) != 2 || got[0] != 100 || got[1] != 101 {
		t.Fatalf("GPU 0 dataset=%v want [100 101]", got)
	}
	if got := datasets[1]; len(got) != 2 || got[0] != 120 || got[1] != 121 {
		t.Fatalf("GPU 2 dataset=%v want [120 121]", got)
	}
	if got := datasets[2]; len(got) != 2 || got[0] != 170 || got[1] != 171 {
		t.Fatalf("GPU 7 dataset=%v want [170 171]", got)
	}
}
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) { func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0}) got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
want := []float64{0, 480, 480, 480, 510, 510} want := []float64{0, 480, 480, 480, 510, 510}
@@ -102,6 +149,117 @@ func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
} }
} }
// TestRenderMetricsUsesBufferedChartRefresh checks the metrics page's inline
// JS: each chart image must be preloaded off-screen (new Image() probe)
// before being swapped in, and a per-element loading flag must prevent
// overlapping reloads of the same chart.
func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
	body := renderMetrics()
	if !strings.Contains(body, "const probe = new Image();") {
		t.Fatalf("metrics page should preload chart images before swap: %s", body)
	}
	if !strings.Contains(body, "el.dataset.loading === '1'") {
		t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
	}
}
// TestChartLegendVisible pins the legend visibility cutoff: shown for up to
// 8 series, hidden for more (the live-charts UI policy).
func TestChartLegendVisible(t *testing.T) {
	if !chartLegendVisible(8) {
		t.Fatal("legend should stay visible for charts with up to 8 series")
	}
	if chartLegendVisible(9) {
		t.Fatal("legend should be hidden for charts with more than 8 series")
	}
}
// TestChartYAxisNumber pins the compact Y-axis formatter: plain integers
// below 1000, nearest-thousand rounding with a "к" suffix from 1000 up, and
// sign preservation for negative values.
func TestChartYAxisNumber(t *testing.T) {
	tests := []struct {
		in   float64
		want string
	}{
		{in: 999, want: "999"},
		{in: 1000, want: "1к"},
		{in: 1370, want: "1к"}, // rounds down
		{in: 1500, want: "2к"}, // rounds half up
		{in: 10200, want: "10к"},
		{in: -1499, want: "-1к"}, // magnitude rounds, sign restored
	}
	for _, tc := range tests {
		if got := chartYAxisNumber(tc.in); got != tc.want {
			t.Fatalf("chartYAxisNumber(%v)=%q want %q", tc.in, got, tc.want)
		}
	}
}
// TestChartCanvasHeight pins the two canvas height classes: 360px when the
// legend is visible (8 series or fewer), 288px when it is hidden.
func TestChartCanvasHeight(t *testing.T) {
	if got := chartCanvasHeight(4); got != 360 {
		t.Fatalf("chartCanvasHeight(4)=%d want 360", got)
	}
	if got := chartCanvasHeight(12); got != 288 {
		t.Fatalf("chartCanvasHeight(12)=%d want 288", got)
	}
}
// TestNormalizeFanSeriesHoldsLastPositive verifies that zero fan readings
// (sensor dropouts) are forward-filled with the last positive RPM value so
// charts do not show spurious dips to zero.
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
	got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
	want := []float64{4200, 4200, 4200, 4300, 4300}
	if len(got) != len(want) {
		t.Fatalf("len=%d want %d", len(got), len(want))
	}
	for i := range want {
		if got[i] != want[i] {
			t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
		}
	}
}
// TestChartYAxisOption checks the shared Y-axis option builder: it must pass
// through the min/max pointers unchanged, request 11 tick labels (10 equal
// steps), and wire in the compact "к"-suffixed value formatter.
func TestChartYAxisOption(t *testing.T) {
	min := floatPtr(0)
	max := floatPtr(100)
	opt := chartYAxisOption(min, max)
	if opt.Min != min || opt.Max != max {
		t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
	}
	if opt.LabelCount != 11 {
		t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
	}
	if got := opt.ValueFormatter(1000); got != "1к" {
		t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
	}
}
// TestSnapshotFanRingsUsesTimelineLabels verifies that snapshotFanRings
// returns one dataset per ring, names each series "<fan name> RPM", and
// supplies non-empty timeline labels taken from a ring snapshot.
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
	// Two rings with two samples each.
	r1 := newMetricsRing(4)
	r2 := newMetricsRing(4)
	r1.push(1000)
	r1.push(1100)
	r2.push(1200)
	r2.push(1300)
	datasets, names, labels := snapshotFanRings([]*metricsRing{r1, r2}, []string{"FAN_A", "FAN_B"})
	if len(datasets) != 2 {
		t.Fatalf("datasets=%d want 2", len(datasets))
	}
	if len(names) != 2 || names[0] != "FAN_A RPM" || names[1] != "FAN_B RPM" {
		t.Fatalf("names=%v", names)
	}
	if len(labels) != 2 {
		t.Fatalf("labels=%v want 2 entries", labels)
	}
	if labels[0] == "" || labels[1] == "" {
		t.Fatalf("labels should contain timeline values, got %v", labels)
	}
}
// TestRenderNetworkInlineSyncsPendingState checks the inline network UI JS:
// it must read the pending-change flag from the API response, refresh
// network state every 5 seconds, and show the pending-confirmation countdown
// immediately when an apply is submitted.
func TestRenderNetworkInlineSyncsPendingState(t *testing.T) {
	body := renderNetworkInline()
	if !strings.Contains(body, "d.pending_change") {
		t.Fatalf("network UI should read pending network state from API: %s", body)
	}
	if !strings.Contains(body, "setInterval(loadNetwork, 5000)") {
		t.Fatalf("network UI should periodically refresh network state: %s", body)
	}
	if !strings.Contains(body, "showNetPending(NET_ROLLBACK_SECS)") {
		t.Fatalf("network UI should show pending confirmation immediately on apply: %s", body)
	}
}
func TestRootRendersDashboard(t *testing.T) { func TestRootRendersDashboard(t *testing.T) {
dir := t.TempDir() dir := t.TempDir()
path := filepath.Join(dir, "audit.json") path := filepath.Join(dir, "audit.json")

View File

@@ -83,16 +83,17 @@ func taskDisplayName(target, profile, loader string) string {
// Task represents one unit of work in the queue. // Task represents one unit of work in the queue.
type Task struct { type Task struct {
ID string `json:"id"` ID string `json:"id"`
Name string `json:"name"` Name string `json:"name"`
Target string `json:"target"` Target string `json:"target"`
Priority int `json:"priority"` Priority int `json:"priority"`
Status string `json:"status"` Status string `json:"status"`
CreatedAt time.Time `json:"created_at"` CreatedAt time.Time `json:"created_at"`
StartedAt *time.Time `json:"started_at,omitempty"` StartedAt *time.Time `json:"started_at,omitempty"`
DoneAt *time.Time `json:"done_at,omitempty"` DoneAt *time.Time `json:"done_at,omitempty"`
ErrMsg string `json:"error,omitempty"` ElapsedSec int `json:"elapsed_sec,omitempty"`
LogPath string `json:"log_path,omitempty"` ErrMsg string `json:"error,omitempty"`
LogPath string `json:"log_path,omitempty"`
// runtime fields (not serialised) // runtime fields (not serialised)
job *jobState job *jobState
@@ -101,11 +102,11 @@ type Task struct {
// taskParams holds optional parameters parsed from the run request. // taskParams holds optional parameters parsed from the run request.
type taskParams struct { type taskParams struct {
Duration int `json:"duration,omitempty"` Duration int `json:"duration,omitempty"`
DiagLevel int `json:"diag_level,omitempty"` DiagLevel int `json:"diag_level,omitempty"`
GPUIndices []int `json:"gpu_indices,omitempty"` GPUIndices []int `json:"gpu_indices,omitempty"`
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"` ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
Loader string `json:"loader,omitempty"` Loader string `json:"loader,omitempty"`
BurnProfile string `json:"burn_profile,omitempty"` BurnProfile string `json:"burn_profile,omitempty"`
DisplayName string `json:"display_name,omitempty"` DisplayName string `json:"display_name,omitempty"`
Device string `json:"device,omitempty"` // for install Device string `json:"device,omitempty"` // for install
@@ -311,6 +312,7 @@ func (q *taskQueue) snapshot() []Task {
out := make([]Task, len(q.tasks)) out := make([]Task, len(q.tasks))
for i, t := range q.tasks { for i, t := range q.tasks {
out[i] = *t out[i] = *t
out[i].ElapsedSec = taskElapsedSec(&out[i], time.Now())
} }
sort.SliceStable(out, func(i, j int) bool { sort.SliceStable(out, func(i, j int) bool {
si := statusOrder(out[i].Status) si := statusOrder(out[i].Status)
@@ -769,6 +771,7 @@ func (q *taskQueue) loadLocked() {
q.assignTaskLogPathLocked(t) q.assignTaskLogPathLocked(t)
if t.Status == TaskPending || t.Status == TaskRunning { if t.Status == TaskPending || t.Status == TaskRunning {
t.Status = TaskPending t.Status = TaskPending
t.StartedAt = nil
t.DoneAt = nil t.DoneAt = nil
t.ErrMsg = "" t.ErrMsg = ""
} }
@@ -808,3 +811,21 @@ func (q *taskQueue) persistLocked() {
} }
_ = os.Rename(tmp, q.statePath) _ = os.Rename(tmp, q.statePath)
} }
// taskElapsedSec computes how many whole seconds a task has been (or was)
// running at the given reference time. It returns 0 for a nil task or one
// that never started (nil or zero StartedAt). A StartedAt earlier than
// CreatedAt is clamped to CreatedAt (guards against stale persisted
// timestamps), and DoneAt — when set — caps the interval for finished tasks.
// Negative intervals collapse to 0.
func taskElapsedSec(t *Task, now time.Time) int {
	if t == nil || t.StartedAt == nil || t.StartedAt.IsZero() {
		return 0
	}
	begin := *t.StartedAt
	if !t.CreatedAt.IsZero() && begin.Before(t.CreatedAt) {
		begin = t.CreatedAt
	}
	finish := now
	if t.DoneAt != nil && !t.DoneAt.IsZero() {
		finish = *t.DoneAt
	}
	elapsed := finish.Sub(begin)
	if elapsed < 0 {
		return 0
	}
	return int(elapsed.Round(time.Second) / time.Second)
}

View File

@@ -55,6 +55,9 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
if got.Status != TaskPending { if got.Status != TaskPending {
t.Fatalf("status=%q want %q", got.Status, TaskPending) t.Fatalf("status=%q want %q", got.Status, TaskPending)
} }
if got.StartedAt != nil {
t.Fatalf("started_at=%v want nil for recovered pending task", got.StartedAt)
}
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" { if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
t.Fatalf("params=%+v", got.params) t.Fatalf("params=%+v", got.params)
} }
@@ -236,6 +239,26 @@ func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
} }
} }
// TestTaskElapsedSecClampsInvalidStartedAt covers the two StartedAt guard
// paths in taskElapsedSec: a zero StartedAt yields 0, and a StartedAt that
// predates CreatedAt is clamped to CreatedAt before measuring elapsed time.
func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
	now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
	created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
	started := time.Time{} // zero start time: task never really began
	task := &Task{
		Status:    TaskRunning,
		CreatedAt: created,
		StartedAt: &started,
	}
	if got := taskElapsedSec(task, now); got != 0 {
		t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
	}
	// A persisted start time 24h before creation must be clamped to CreatedAt.
	stale := created.Add(-24 * time.Hour)
	task.StartedAt = &stale
	if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
		t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
	}
}
func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) { func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
q := &taskQueue{ q := &taskQueue{
opts: &HandlerOptions{}, opts: &HandlerOptions{},

View File

@@ -9,6 +9,34 @@ All live metrics charts in the web UI are server-side SVG images served by Go
and polled by the browser every 2 seconds via `<img src="...?t=now">`. and polled by the browser every 2 seconds via `<img src="...?t=now">`.
There is no client-side canvas or JS chart library. There is no client-side canvas or JS chart library.
## Rule: live charts must be visually uniform
Live charts are a single UI family, not a set of one-off widgets. New charts and
changes to existing charts must keep the same rendering model and presentation
rules unless there is an explicit architectural decision to diverge.
Default expectations:
- same server-side SVG pipeline for all live metrics charts
- same refresh behaviour and failure handling in the browser
- same canvas size class and card layout
- same legend placement policy across charts
- same axis, title, and summary conventions
- no chart-specific visual exceptions added as a quick fix
Current default for live charts:
- legend below the plot area when a chart has 8 series or fewer
- legend hidden when a chart has more than 8 series
- 10 equal Y-axis steps across the chart height
- 1400 x 360 SVG canvas with legend
- 1400 x 288 SVG canvas without legend
- full-width card rendering in a single-column stack
If one chart needs a different layout or legend behaviour, treat that as a
design-level decision affecting the whole chart family, not as a local tweak to
just one endpoint.
### Why go-analyze/charts ### Why go-analyze/charts
- Pure Go, no CGO — builds cleanly inside the live-build container - Pure Go, no CGO — builds cleanly inside the live-build container
@@ -29,7 +57,8 @@ self-contained SVG renderer used **only** for completed SAT run reports
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs | | `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W | | `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
the legend is hidden. The page renders them at `width: 100%` in a
single-column layout so they always fill the viewport width. single-column layout so they always fill the viewport width.
### Ring buffers ### Ring buffers

View File

@@ -1,7 +1,7 @@
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic # Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
**Date:** 2026-04-01 **Date:** 2026-04-01
**Status:** active **Status:** resolved
## Context ## Context
@@ -58,6 +58,18 @@ Root cause of the false alarm:
- as a result, we re-entered the same memtest investigation loop even though - as a result, we re-entered the same memtest investigation loop even though
the real ISO was already correct the real ISO was already correct
Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
still carried live-build's default memtest layout (`live/memtest.bin`,
`live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
- that mismatch is expected to trigger project recovery, because `bee` requires
`boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
- however, `build.sh` exited before recovery because `set -e` treated a direct
`iso_memtest_present` return code of `1` as fatal
- so the next repeated loop was caused by shell control flow, not by proof that
the recovery design itself was wrong
## Known Failed Attempts ## Known Failed Attempts
These approaches were already tried and should not be repeated blindly: These approaches were already tried and should not be repeated blindly:
@@ -102,6 +114,8 @@ Any future memtest fix must explicitly identify:
- and a post-build proof from a real ISO, not only from intermediate workdir files - and a post-build proof from a real ISO, not only from intermediate workdir files
- whether the ISO inspection step itself succeeded, rather than merely whether - whether the ISO inspection step itself succeeded, rather than merely whether
the validator printed a memtest warning the validator printed a memtest warning
- whether a non-zero probe is intentionally handled inside an `if` / `case`
context rather than accidentally tripping `set -e`
## Decision ## Decision
@@ -134,6 +148,8 @@ Current implementation direction:
- install a stable ISO reader in the builder image - install a stable ISO reader in the builder image
- fail with an explicit reader error if ISO listing/extraction fails - fail with an explicit reader error if ISO listing/extraction fails
- do not treat reader failure as evidence that memtest is missing - do not treat reader failure as evidence that memtest is missing
- do not call a probe that may return "needs recovery" as a bare command under
`set -e`; wrap it in explicit control flow
## Consequences ## Consequences
@@ -144,3 +160,65 @@ Current implementation direction:
- But validation output is only trustworthy if ISO reading itself succeeded. A - But validation output is only trustworthy if ISO reading itself succeeded. A
"missing memtest" warning without a successful ISO read is not evidence. "missing memtest" warning without a successful ISO read is not evidence.
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change. - If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
### Components
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
those files may not exist yet. Instead:
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
If they do not exist, the hook warns and continues (does not fail).
Controlled by `BEE_REQUIRE_MEMTEST=1` env var to turn warnings into hard errors when needed.
**2. Post-`lb build` recovery step in `build.sh`**
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
contains all required memtest artifacts. If not:
- Copies/extracts memtest binaries into `binary/boot/`.
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
the ISO with the patched tree.
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
step handles the final `binary/` tree after live-build has written all bootloader configs.
**3. ISO validation hardening**
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
handled — it does not abort the build prematurely.
ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
This prevents the false-negative loop that burned 2026-04-01 across builds v3.14–v3.19.
### Why this works when earlier attempts did not
The earlier patterns all shared a single flaw: they assumed a single build-time point
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
In live-build on Debian Bookworm that assumption is false — live-build continues writing
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
There is no ordering dependency to get wrong.
### Do not revert
Do not remove the recovery step or the hook without a fresh real ISO build proving
live-build alone produces all four required artifacts:
- `boot/memtest86+x64.bin`
- `boot/memtest86+x64.efi`
- memtest entry in `boot/grub/grub.cfg`
- memtest entry in `isolinux/live.cfg`

View File

@@ -8,7 +8,7 @@ NCCL_TESTS_VERSION=2.13.10
NVCC_VERSION=12.8 NVCC_VERSION=12.8
CUBLAS_VERSION=13.0.2.14-1 CUBLAS_VERSION=13.0.2.14-1
CUDA_USERSPACE_VERSION=13.0.96-1 CUDA_USERSPACE_VERSION=13.0.96-1
DCGM_VERSION=4.5.2-1 DCGM_VERSION=4.5.3-1
JOHN_JUMBO_COMMIT=67fcf9fe5a JOHN_JUMBO_COMMIT=67fcf9fe5a
ROCM_VERSION=6.3.4 ROCM_VERSION=6.3.4
ROCM_SMI_VERSION=7.4.0.60304-76~22.04 ROCM_SMI_VERSION=7.4.0.60304-76~22.04

View File

@@ -32,7 +32,7 @@ lb config noauto \
--memtest memtest86+ \ --memtest memtest86+ \
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ --iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \ --iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \ --bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
--apt-recommends false \ --apt-recommends false \
--chroot-squashfs-compression-type zstd \ --chroot-squashfs-compression-type zstd \
"${@}" "${@}"

View File

@@ -862,7 +862,6 @@ rm -f \
"${OVERLAY_STAGE_DIR}/etc/bee-release" \ "${OVERLAY_STAGE_DIR}/etc/bee-release" \
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \ "${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \ "${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \ "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
@@ -1136,13 +1135,16 @@ fi
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso" ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
if [ -f "$ISO_RAW" ]; then if [ -f "$ISO_RAW" ]; then
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW" dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
iso_memtest_present "$ISO_RAW" if iso_memtest_present "$ISO_RAW"; then
memtest_status=$? :
if [ "$memtest_status" -eq 1 ]; then else
recover_iso_memtest "${LB_DIR}" "$ISO_RAW" memtest_status=$?
dump_memtest_debug "post-recovery" "${LB_DIR}" "$ISO_RAW" if [ "$memtest_status" -eq 1 ]; then
elif [ "$memtest_status" -eq 2 ]; then recover_iso_memtest "${LB_DIR}" "$ISO_RAW"
memtest_fail "failed to inspect ISO for memtest before recovery" "$ISO_RAW" dump_memtest_debug "post-recovery" "${LB_DIR}" "$ISO_RAW"
elif [ "$memtest_status" -eq 2 ]; then
memtest_fail "failed to inspect ISO for memtest before recovery" "$ISO_RAW"
fi
fi fi
validate_iso_memtest "$ISO_RAW" validate_iso_memtest "$ISO_RAW"
cp "$ISO_RAW" "$ISO_OUT" cp "$ISO_RAW" "$ISO_OUT"

View File

@@ -1,3 +1,6 @@
# AMD GPU firmware
firmware-amd-graphics
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST) # AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
rocm-smi-lib=%%ROCM_SMI_VERSION%% rocm-smi-lib=%%ROCM_SMI_VERSION%%
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%% rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%

View File

@@ -71,9 +71,7 @@ lightdm
firmware-linux-free firmware-linux-free
firmware-linux-nonfree firmware-linux-nonfree
firmware-misc-nonfree firmware-misc-nonfree
firmware-amd-graphics
firmware-realtek firmware-realtek
firmware-intel-sound
firmware-bnx2 firmware-bnx2
firmware-bnx2x firmware-bnx2x
firmware-cavium firmware-cavium

View File

@@ -52,6 +52,14 @@ else
fail "nvidia-smi: NOT FOUND" fail "nvidia-smi: NOT FOUND"
fi fi
# Confirm every GPU stress-test entry point is resolvable, including tools
# installed under /usr/local/bin (which may be absent from the caller's PATH).
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
  if ! p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
    fail "$tool: NOT FOUND"
  else
    ok "$tool found: $p"
  fi
done
echo "" echo ""
echo "-- NVIDIA modules --" echo "-- NVIDIA modules --"
KO_DIR="/usr/local/lib/nvidia" KO_DIR="/usr/local/lib/nvidia"

View File

@@ -190,4 +190,16 @@ CHOSEN_FORMAT=$(choose_format) || {
} }
echo "format=${CHOSEN_FORMAT}" echo "format=${CHOSEN_FORMAT}"
exec ./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${JOHN_DEVICES}" PIDS=""
# Launch one john benchmark per OpenCL device, staggered 3 seconds apart so
# each instance finishes its GPU memory allocation (OpenCL GWS auto-tuning)
# before the next one starts competing for device memory.
launched=0
for dev in $(printf '%s' "${JOHN_DEVICES}" | tr ',' ' '); do
  [ "${launched}" -eq 0 ] || sleep 3
  launched=1
  ./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${dev}" &
  PIDS="${PIDS} $!"
done
# Reap every instance and count failures rather than aborting on the first,
# so a single bad device does not mask results from the others.
FAIL=0
for pid in ${PIDS}; do
  if ! wait "${pid}"; then
    FAIL=$((FAIL+1))
  fi
done
[ "${FAIL}" -eq 0 ] || { echo "john: ${FAIL} device(s) failed" >&2; exit 1; }

View File

@@ -24,7 +24,7 @@ chromium \
--no-first-run \ --no-first-run \
--disable-session-crashed-bubble \ --disable-session-crashed-bubble \
--disable-features=TranslateUI \ --disable-features=TranslateUI \
--start-fullscreen \ --start-maximized \
http://localhost/ & http://localhost/ &
exec openbox exec openbox

View File

@@ -3,6 +3,11 @@
# Type 'a' at any prompt to abort, 'b' to go back. # Type 'a' at any prompt to abort, 'b' to go back.
set -e set -e
# Root privileges are needed for ip/dhclient/resolv.conf; when invoked as a
# regular user, replace this process with a sudo'd copy of ourselves.
[ "$(id -u)" -eq 0 ] || exec sudo "$0" "$@"
abort() { echo "Aborted."; exit 0; } abort() { echo "Aborted."; exit 0; }
ask() { ask() {