Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| dbab43db90 | |||
| bcb7fe5fe9 | |||
| d21d9d191b | |||
| ef45246ea0 | |||
| 348db35119 | |||
| 1dd7f243f5 | |||
| 938e499ac2 | |||
| 964ab39656 | |||
| c2aecc6ce9 | |||
| 439b86ce59 | |||
| eb60100297 | |||
|
|
2baf3be640 |
@@ -13,14 +13,18 @@ import (
|
|||||||
const nvidiaVendorID = 0x10de
|
const nvidiaVendorID = 0x10de
|
||||||
|
|
||||||
type nvidiaGPUInfo struct {
|
type nvidiaGPUInfo struct {
|
||||||
BDF string
|
BDF string
|
||||||
Serial string
|
Serial string
|
||||||
VBIOS string
|
VBIOS string
|
||||||
TemperatureC *float64
|
TemperatureC *float64
|
||||||
PowerW *float64
|
PowerW *float64
|
||||||
ECCUncorrected *int64
|
ECCUncorrected *int64
|
||||||
ECCCorrected *int64
|
ECCCorrected *int64
|
||||||
HWSlowdown *bool
|
HWSlowdown *bool
|
||||||
|
PCIeLinkGenCurrent *int
|
||||||
|
PCIeLinkGenMax *int
|
||||||
|
PCIeLinkWidthCur *int
|
||||||
|
PCIeLinkWidthMax *int
|
||||||
}
|
}
|
||||||
|
|
||||||
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
// enrichPCIeWithNVIDIA enriches NVIDIA PCIe devices with data from nvidia-smi.
|
||||||
@@ -94,7 +98,7 @@ func enrichPCIeWithNVIDIAData(devs []schema.HardwarePCIeDevice, gpuByBDF map[str
|
|||||||
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
func queryNVIDIAGPUs() (map[string]nvidiaGPUInfo, error) {
|
||||||
out, err := exec.Command(
|
out, err := exec.Command(
|
||||||
"nvidia-smi",
|
"nvidia-smi",
|
||||||
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown",
|
"--query-gpu=index,pci.bus_id,serial,vbios_version,temperature.gpu,power.draw,ecc.errors.uncorrected.aggregate.total,ecc.errors.corrected.aggregate.total,clocks_throttle_reasons.hw_slowdown,pcie.link.gen.current,pcie.link.gen.max,pcie.link.width.current,pcie.link.width.max",
|
||||||
"--format=csv,noheader,nounits",
|
"--format=csv,noheader,nounits",
|
||||||
).Output()
|
).Output()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -118,8 +122,8 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
if len(rec) == 0 {
|
if len(rec) == 0 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if len(rec) < 9 {
|
if len(rec) < 13 {
|
||||||
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 9", len(rec))
|
return nil, fmt.Errorf("unexpected nvidia-smi columns: got %d, want 13", len(rec))
|
||||||
}
|
}
|
||||||
|
|
||||||
bdf := normalizePCIeBDF(rec[1])
|
bdf := normalizePCIeBDF(rec[1])
|
||||||
@@ -128,14 +132,18 @@ func parseNVIDIASMIQuery(raw string) (map[string]nvidiaGPUInfo, error) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
info := nvidiaGPUInfo{
|
info := nvidiaGPUInfo{
|
||||||
BDF: bdf,
|
BDF: bdf,
|
||||||
Serial: strings.TrimSpace(rec[2]),
|
Serial: strings.TrimSpace(rec[2]),
|
||||||
VBIOS: strings.TrimSpace(rec[3]),
|
VBIOS: strings.TrimSpace(rec[3]),
|
||||||
TemperatureC: parseMaybeFloat(rec[4]),
|
TemperatureC: parseMaybeFloat(rec[4]),
|
||||||
PowerW: parseMaybeFloat(rec[5]),
|
PowerW: parseMaybeFloat(rec[5]),
|
||||||
ECCUncorrected: parseMaybeInt64(rec[6]),
|
ECCUncorrected: parseMaybeInt64(rec[6]),
|
||||||
ECCCorrected: parseMaybeInt64(rec[7]),
|
ECCCorrected: parseMaybeInt64(rec[7]),
|
||||||
HWSlowdown: parseMaybeBool(rec[8]),
|
HWSlowdown: parseMaybeBool(rec[8]),
|
||||||
|
PCIeLinkGenCurrent: parseMaybeInt(rec[9]),
|
||||||
|
PCIeLinkGenMax: parseMaybeInt(rec[10]),
|
||||||
|
PCIeLinkWidthCur: parseMaybeInt(rec[11]),
|
||||||
|
PCIeLinkWidthMax: parseMaybeInt(rec[12]),
|
||||||
}
|
}
|
||||||
result[bdf] = info
|
result[bdf] = info
|
||||||
}
|
}
|
||||||
@@ -167,6 +175,22 @@ func parseMaybeInt64(v string) *int64 {
|
|||||||
return &n
|
return &n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// parseMaybeInt parses an optional integer field from an nvidia-smi CSV record.
//
// Surrounding whitespace is ignored. It returns nil when the field is empty,
// is one of the recognized "value unavailable" placeholders (matched
// case-insensitively, with or without square brackets), or is not a valid
// base-10 integer. Otherwise it returns a pointer to the parsed value.
func parseMaybeInt(v string) *int {
	v = strings.TrimSpace(v)
	switch strings.ToLower(v) {
	case "", "n/a", "[n/a]", "not supported", "[not supported]":
		return nil
	}
	n, err := strconv.Atoi(v)
	if err != nil {
		// Anything else that is not a plain integer is treated as "no value".
		return nil
	}
	return &n
}
|
||||||
|
|
||||||
|
// pcieLinkGenLabel formats a PCIe generation number as a "Gen<N>" label,
// for example 4 becomes "Gen4". The number is not validated.
func pcieLinkGenLabel(gen int) string {
	return "Gen" + strconv.Itoa(gen)
}
|
||||||
|
|
||||||
func parseMaybeBool(v string) *bool {
|
func parseMaybeBool(v string) *bool {
|
||||||
v = strings.TrimSpace(strings.ToLower(v))
|
v = strings.TrimSpace(strings.ToLower(v))
|
||||||
switch v {
|
switch v {
|
||||||
@@ -231,4 +255,22 @@ func injectNVIDIATelemetry(dev *schema.HardwarePCIeDevice, info nvidiaGPUInfo) {
|
|||||||
if info.HWSlowdown != nil {
|
if info.HWSlowdown != nil {
|
||||||
dev.HWSlowdown = info.HWSlowdown
|
dev.HWSlowdown = info.HWSlowdown
|
||||||
}
|
}
|
||||||
|
// Override PCIe link speed/width with nvidia-smi driver values.
|
||||||
|
// sysfs current_link_speed reflects the instantaneous physical link state and
|
||||||
|
// can show Gen1 when the GPU is idle due to ASPM power management. The driver
|
||||||
|
// knows the negotiated speed regardless of the current power state.
|
||||||
|
if info.PCIeLinkGenCurrent != nil {
|
||||||
|
s := pcieLinkGenLabel(*info.PCIeLinkGenCurrent)
|
||||||
|
dev.LinkSpeed = &s
|
||||||
|
}
|
||||||
|
if info.PCIeLinkGenMax != nil {
|
||||||
|
s := pcieLinkGenLabel(*info.PCIeLinkGenMax)
|
||||||
|
dev.MaxLinkSpeed = &s
|
||||||
|
}
|
||||||
|
if info.PCIeLinkWidthCur != nil {
|
||||||
|
dev.LinkWidth = info.PCIeLinkWidthCur
|
||||||
|
}
|
||||||
|
if info.PCIeLinkWidthMax != nil {
|
||||||
|
dev.MaxLinkWidth = info.PCIeLinkWidthMax
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestParseNVIDIASMIQuery(t *testing.T) {
|
func TestParseNVIDIASMIQuery(t *testing.T) {
|
||||||
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active\n"
|
raw := "0, 00000000:65:00.0, GPU-SERIAL-1, 96.00.1F.00.02, 54, 210.33, 0, 5, Not Active, 4, 4, 16, 16\n"
|
||||||
byBDF, err := parseNVIDIASMIQuery(raw)
|
byBDF, err := parseNVIDIASMIQuery(raw)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("parse failed: %v", err)
|
t.Fatalf("parse failed: %v", err)
|
||||||
@@ -28,6 +28,12 @@ func TestParseNVIDIASMIQuery(t *testing.T) {
|
|||||||
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
if gpu.HWSlowdown == nil || *gpu.HWSlowdown {
|
||||||
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
t.Fatalf("hw slowdown: got %v, want false", gpu.HWSlowdown)
|
||||||
}
|
}
|
||||||
|
if gpu.PCIeLinkGenCurrent == nil || *gpu.PCIeLinkGenCurrent != 4 {
|
||||||
|
t.Fatalf("pcie link gen current: got %v, want 4", gpu.PCIeLinkGenCurrent)
|
||||||
|
}
|
||||||
|
if gpu.PCIeLinkGenMax == nil || *gpu.PCIeLinkGenMax != 4 {
|
||||||
|
t.Fatalf("pcie link gen max: got %v, want 4", gpu.PCIeLinkGenMax)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestNormalizePCIeBDF(t *testing.T) {
|
func TestNormalizePCIeBDF(t *testing.T) {
|
||||||
|
|||||||
@@ -77,11 +77,24 @@ func discoverStorageDevices() []lsblkDevice {
|
|||||||
if dev.Type != "disk" {
|
if dev.Type != "disk" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if isVirtualBMCDisk(dev) {
|
||||||
|
slog.Debug("storage: skipping BMC virtual disk", "name", dev.Name, "model", dev.Model)
|
||||||
|
continue
|
||||||
|
}
|
||||||
disks = append(disks, dev)
|
disks = append(disks, dev)
|
||||||
}
|
}
|
||||||
return disks
|
return disks
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isVirtualBMCDisk returns true for BMC/IPMI virtual USB mass storage devices
|
||||||
|
// that appear as disks but are not real hardware (e.g. iDRAC Virtual HDisk*).
|
||||||
|
// These have zero reported size, a generic fake serial, and a model name that
|
||||||
|
// starts with "Virtual HDisk".
|
||||||
|
func isVirtualBMCDisk(dev lsblkDevice) bool {
|
||||||
|
model := strings.ToLower(strings.TrimSpace(dev.Model))
|
||||||
|
return strings.HasPrefix(model, "virtual hdisk")
|
||||||
|
}
|
||||||
|
|
||||||
func lsblkDevices() []lsblkDevice {
|
func lsblkDevices() []lsblkDevice {
|
||||||
out, err := exec.Command("lsblk", "-J", "-d",
|
out, err := exec.Command("lsblk", "-J", "-d",
|
||||||
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
"-o", "NAME,TYPE,SIZE,SERIAL,MODEL,TRAN,HCTL").Output()
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/exec"
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"syscall"
|
||||||
"sort"
|
"sort"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -531,6 +532,13 @@ func runSATCommandCtx(ctx context.Context, verboseLog, name string, cmd []string
|
|||||||
}
|
}
|
||||||
|
|
||||||
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
c := exec.CommandContext(ctx, resolvedCmd[0], resolvedCmd[1:]...)
|
||||||
|
c.SysProcAttr = &syscall.SysProcAttr{Setpgid: true}
|
||||||
|
c.Cancel = func() error {
|
||||||
|
if c.Process != nil {
|
||||||
|
_ = syscall.Kill(-c.Process.Pid, syscall.SIGKILL)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
if len(env) > 0 {
|
if len(env) > 0 {
|
||||||
c.Env = append(os.Environ(), env...)
|
c.Env = append(os.Environ(), env...)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -346,8 +346,10 @@ func (h *handler) handleAPINetworkStatus(w http.ResponseWriter, r *http.Request)
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
writeJSON(w, map[string]any{
|
writeJSON(w, map[string]any{
|
||||||
"interfaces": ifaces,
|
"interfaces": ifaces,
|
||||||
"default_route": h.opts.App.DefaultRoute(),
|
"default_route": h.opts.App.DefaultRoute(),
|
||||||
|
"pending_change": h.hasPendingNetworkChange(),
|
||||||
|
"rollback_in": h.pendingNetworkRollbackIn(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -744,13 +746,7 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
|
|||||||
h.ringMemLoad.push(sample.MemLoadPct)
|
h.ringMemLoad.push(sample.MemLoadPct)
|
||||||
|
|
||||||
h.ringsMu.Lock()
|
h.ringsMu.Lock()
|
||||||
for i, fan := range sample.Fans {
|
h.pushFanRings(sample.Fans)
|
||||||
for len(h.ringFans) <= i {
|
|
||||||
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
|
||||||
h.fanNames = append(h.fanNames, fan.Name)
|
|
||||||
}
|
|
||||||
h.ringFans[i].push(float64(fan.RPM))
|
|
||||||
}
|
|
||||||
for _, gpu := range sample.GPUs {
|
for _, gpu := range sample.GPUs {
|
||||||
idx := gpu.GPUIndex
|
idx := gpu.GPUIndex
|
||||||
for len(h.gpuRings) <= idx {
|
for len(h.gpuRings) <= idx {
|
||||||
@@ -769,6 +765,51 @@ func (h *handler) feedRings(sample platform.LiveMetricSample) {
|
|||||||
h.ringsMu.Unlock()
|
h.ringsMu.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) pushFanRings(fans []platform.FanReading) {
|
||||||
|
if len(fans) == 0 && len(h.ringFans) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
fanValues := make(map[string]float64, len(fans))
|
||||||
|
for _, fan := range fans {
|
||||||
|
if fan.Name == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
fanValues[fan.Name] = fan.RPM
|
||||||
|
found := false
|
||||||
|
for i, name := range h.fanNames {
|
||||||
|
if name == fan.Name {
|
||||||
|
found = true
|
||||||
|
if i >= len(h.ringFans) {
|
||||||
|
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found {
|
||||||
|
h.fanNames = append(h.fanNames, fan.Name)
|
||||||
|
h.ringFans = append(h.ringFans, newMetricsRing(120))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for i, ring := range h.ringFans {
|
||||||
|
if ring == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
name := ""
|
||||||
|
if i < len(h.fanNames) {
|
||||||
|
name = h.fanNames[i]
|
||||||
|
}
|
||||||
|
if rpm, ok := fanValues[name]; ok {
|
||||||
|
ring.push(rpm)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if last, ok := ring.latest(); ok {
|
||||||
|
ring.push(last)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ring.push(0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) pushNamedMetricRing(dst *[]*namedMetricsRing, name string, value float64) {
|
func (h *handler) pushNamedMetricRing(dst *[]*namedMetricsRing, name string, value float64) {
|
||||||
if name == "" {
|
if name == "" {
|
||||||
return
|
return
|
||||||
@@ -847,7 +888,10 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
|
|||||||
return result, err
|
return result, err
|
||||||
}
|
}
|
||||||
|
|
||||||
pnc := &pendingNetChange{snapshot: snapshot}
|
pnc := &pendingNetChange{
|
||||||
|
snapshot: snapshot,
|
||||||
|
deadline: time.Now().Add(netRollbackTimeout),
|
||||||
|
}
|
||||||
pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
|
pnc.timer = time.AfterFunc(netRollbackTimeout, func() {
|
||||||
_ = h.opts.App.RestoreNetworkSnapshot(snapshot)
|
_ = h.opts.App.RestoreNetworkSnapshot(snapshot)
|
||||||
h.pendingNetMu.Lock()
|
h.pendingNetMu.Lock()
|
||||||
@@ -864,6 +908,25 @@ func (h *handler) applyPendingNetworkChange(apply func() (app.ActionResult, erro
|
|||||||
return result, nil
|
return result, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (h *handler) hasPendingNetworkChange() bool {
|
||||||
|
h.pendingNetMu.Lock()
|
||||||
|
defer h.pendingNetMu.Unlock()
|
||||||
|
return h.pendingNet != nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (h *handler) pendingNetworkRollbackIn() int {
|
||||||
|
h.pendingNetMu.Lock()
|
||||||
|
defer h.pendingNetMu.Unlock()
|
||||||
|
if h.pendingNet == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
remaining := int(time.Until(h.pendingNet.deadline).Seconds())
|
||||||
|
if remaining < 1 {
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
return remaining
|
||||||
|
}
|
||||||
|
|
||||||
func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
|
func (h *handler) handleAPINetworkConfirm(w http.ResponseWriter, _ *http.Request) {
|
||||||
h.pendingNetMu.Lock()
|
h.pendingNetMu.Lock()
|
||||||
pnc := h.pendingNet
|
pnc := h.pendingNet
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ import (
|
|||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"bee/audit/internal/app"
|
"bee/audit/internal/app"
|
||||||
|
"bee/audit/internal/platform"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
func TestXrandrCommandAddsDefaultX11Env(t *testing.T) {
|
||||||
@@ -100,3 +101,29 @@ func TestHandleAPIExportBundleQueuesTask(t *testing.T) {
|
|||||||
t.Fatalf("target=%q want support-bundle", got)
|
t.Fatalf("target=%q want support-bundle", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestPushFanRingsTracksByNameAndCarriesForwardMissingSamples(t *testing.T) {
|
||||||
|
h := &handler{}
|
||||||
|
h.pushFanRings([]platform.FanReading{
|
||||||
|
{Name: "FAN_A", RPM: 4200},
|
||||||
|
{Name: "FAN_B", RPM: 5100},
|
||||||
|
})
|
||||||
|
h.pushFanRings([]platform.FanReading{
|
||||||
|
{Name: "FAN_B", RPM: 5200},
|
||||||
|
})
|
||||||
|
|
||||||
|
if len(h.fanNames) != 2 || h.fanNames[0] != "FAN_A" || h.fanNames[1] != "FAN_B" {
|
||||||
|
t.Fatalf("fanNames=%v", h.fanNames)
|
||||||
|
}
|
||||||
|
aVals, _ := h.ringFans[0].snapshot()
|
||||||
|
bVals, _ := h.ringFans[1].snapshot()
|
||||||
|
if len(aVals) != 2 || len(bVals) != 2 {
|
||||||
|
t.Fatalf("fan ring lengths: A=%d B=%d", len(aVals), len(bVals))
|
||||||
|
}
|
||||||
|
if aVals[1] != 4200 {
|
||||||
|
t.Fatalf("FAN_A should carry forward last value, got %v", aVals)
|
||||||
|
}
|
||||||
|
if bVals[1] != 5200 {
|
||||||
|
t.Fatalf("FAN_B should use latest sampled value, got %v", bVals)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
|||||||
|
|
||||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
|
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM (SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?) ORDER BY ts`, n)
|
||||||
}
|
}
|
||||||
|
|
||||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||||
@@ -151,11 +151,6 @@ func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetri
|
|||||||
if len(sysRows) == 0 {
|
if len(sysRows) == 0 {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
// Reverse to chronological order
|
|
||||||
for i, j := 0, len(sysRows)-1; i < j; i, j = i+1, j-1 {
|
|
||||||
sysRows[i], sysRows[j] = sysRows[j], sysRows[i]
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collect min/max ts for range query
|
// Collect min/max ts for range query
|
||||||
minTS := sysRows[0].ts
|
minTS := sysRows[0].ts
|
||||||
maxTS := sysRows[len(sysRows)-1].ts
|
maxTS := sysRows[len(sysRows)-1].ts
|
||||||
|
|||||||
69
audit/internal/webui/metricsdb_test.go
Normal file
69
audit/internal/webui/metricsdb_test.go
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
package webui
|
||||||
|
|
||||||
|
import (
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"bee/audit/internal/platform"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestMetricsDBLoadSamplesKeepsChronologicalRangeForGPUs(t *testing.T) {
|
||||||
|
db, err := openMetricsDB(filepath.Join(t.TempDir(), "metrics.db"))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("openMetricsDB: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
base := time.Unix(1_700_000_000, 0).UTC()
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
err := db.Write(platform.LiveMetricSample{
|
||||||
|
Timestamp: base.Add(time.Duration(i) * time.Second),
|
||||||
|
CPULoadPct: float64(10 + i),
|
||||||
|
MemLoadPct: float64(20 + i),
|
||||||
|
PowerW: float64(300 + i),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, PowerW: float64(100 + i)},
|
||||||
|
{GPUIndex: 2, PowerW: float64(200 + i)},
|
||||||
|
},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Write(%d): %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
all, err := db.LoadAll()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadAll: %v", err)
|
||||||
|
}
|
||||||
|
if len(all) != 3 {
|
||||||
|
t.Fatalf("LoadAll len=%d want 3", len(all))
|
||||||
|
}
|
||||||
|
for i, sample := range all {
|
||||||
|
if len(sample.GPUs) != 2 {
|
||||||
|
t.Fatalf("LoadAll sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||||
|
}
|
||||||
|
if sample.GPUs[0].GPUIndex != 0 || sample.GPUs[0].PowerW != float64(100+i) {
|
||||||
|
t.Fatalf("LoadAll sample %d GPU0=%+v", i, sample.GPUs[0])
|
||||||
|
}
|
||||||
|
if sample.GPUs[1].GPUIndex != 2 || sample.GPUs[1].PowerW != float64(200+i) {
|
||||||
|
t.Fatalf("LoadAll sample %d GPU1=%+v", i, sample.GPUs[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
recent, err := db.LoadRecent(2)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("LoadRecent: %v", err)
|
||||||
|
}
|
||||||
|
if len(recent) != 2 {
|
||||||
|
t.Fatalf("LoadRecent len=%d want 2", len(recent))
|
||||||
|
}
|
||||||
|
if !recent[0].Timestamp.Before(recent[1].Timestamp) {
|
||||||
|
t.Fatalf("LoadRecent timestamps not ascending: %v >= %v", recent[0].Timestamp, recent[1].Timestamp)
|
||||||
|
}
|
||||||
|
for i, sample := range recent {
|
||||||
|
if len(sample.GPUs) != 2 {
|
||||||
|
t.Fatalf("LoadRecent sample %d GPUs=%v want 2 rows", i, sample.GPUs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -522,13 +522,30 @@ func renderMetrics() string {
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
|
const chartIds = [
|
||||||
|
'chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
|
||||||
|
'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'
|
||||||
|
];
|
||||||
|
|
||||||
|
// Reload one chart image with a cache-busting URL. The new URL is first loaded
// into a detached Image and only swapped into the visible element once it has
// loaded. A per-element "loading" flag skips overlapping refreshes.
function refreshChartImage(el) {
  if (!el) return;
  if (el.dataset.loading === '1') return;

  const baseSrc = el.dataset.baseSrc || el.src.split('?')[0];
  const nextSrc = baseSrc + '?t=' + Date.now();
  el.dataset.baseSrc = baseSrc;
  el.dataset.loading = '1';

  const probe = new Image();
  probe.onload = () => {
    el.src = nextSrc;
    el.dataset.loading = '0';
  };
  probe.onerror = () => {
    el.dataset.loading = '0';
  };
  probe.src = nextSrc;
}
|
||||||
|
|
||||||
function refreshCharts() {
|
function refreshCharts() {
|
||||||
const t = '?t=' + Date.now();
|
chartIds.forEach(id => refreshChartImage(document.getElementById(id)));
|
||||||
['chart-server-load','chart-server-temp-cpu','chart-server-temp-gpu','chart-server-temp-ambient','chart-server-power','chart-server-fans',
|
|
||||||
'chart-gpu-all-load','chart-gpu-all-memload','chart-gpu-all-power','chart-gpu-all-temp'].forEach(id => {
|
|
||||||
const el = document.getElementById(id);
|
|
||||||
if (el) el.src = el.src.split('?')[0] + t;
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
setInterval(refreshCharts, 3000);
|
setInterval(refreshCharts, 3000);
|
||||||
|
|
||||||
@@ -892,6 +909,8 @@ func renderNetworkInline() string {
|
|||||||
</div>
|
</div>
|
||||||
<script>
|
<script>
|
||||||
var _netCountdownTimer = null;
|
var _netCountdownTimer = null;
|
||||||
|
var _netRefreshTimer = null;
|
||||||
|
const NET_ROLLBACK_SECS = 60;
|
||||||
function loadNetwork() {
|
function loadNetwork() {
|
||||||
fetch('/api/network').then(r=>r.json()).then(d => {
|
fetch('/api/network').then(r=>r.json()).then(d => {
|
||||||
const rows = (d.interfaces||[]).map(i =>
|
const rows = (d.interfaces||[]).map(i =>
|
||||||
@@ -902,21 +921,33 @@ function loadNetwork() {
|
|||||||
document.getElementById('iface-table').innerHTML =
|
document.getElementById('iface-table').innerHTML =
|
||||||
'<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
|
'<table><tr><th>Interface</th><th>State (click to toggle)</th><th>Addresses</th></tr>'+rows+'</table>' +
|
||||||
(d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
|
(d.default_route ? '<p style="font-size:12px;color:var(--muted);margin-top:8px">Default route: '+d.default_route+'</p>' : '');
|
||||||
});
|
if (d.pending_change) showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
|
else hideNetPending();
|
||||||
|
}).catch(function() {});
|
||||||
}
|
}
|
||||||
function selectIface(iface) {
|
function selectIface(iface) {
|
||||||
document.getElementById('dhcp-iface').value = iface;
|
document.getElementById('dhcp-iface').value = iface;
|
||||||
document.getElementById('st-iface').value = iface;
|
document.getElementById('st-iface').value = iface;
|
||||||
}
|
}
|
||||||
function toggleIface(iface, currentState) {
|
function toggleIface(iface, currentState) {
|
||||||
|
showNetPending(NET_ROLLBACK_SECS);
|
||||||
fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
|
fetch('/api/network/toggle',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({iface:iface})})
|
||||||
.then(r=>r.json()).then(d => {
|
.then(r=>r.json()).then(d => {
|
||||||
if (d.error) { alert('Error: '+d.error); return; }
|
if (d.error) { hideNetPending(); alert('Error: '+d.error); return; }
|
||||||
loadNetwork();
|
loadNetwork();
|
||||||
showNetPending(d.rollback_in || 60);
|
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
|
}).catch(function() {
|
||||||
|
setTimeout(loadNetwork, 1500);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
// Stop the rollback countdown (if one is running) and hide the pending-change
// banner. Safe to call when the banner element is not present on the page.
function hideNetPending() {
  if (_netCountdownTimer) clearInterval(_netCountdownTimer);
  _netCountdownTimer = null;
  const el = document.getElementById('net-pending');
  if (el) el.style.display = 'none';
}
|
||||||
function showNetPending(secs) {
|
function showNetPending(secs) {
|
||||||
|
if (!secs || secs < 1) { hideNetPending(); return; }
|
||||||
const el = document.getElementById('net-pending');
|
const el = document.getElementById('net-pending');
|
||||||
el.style.display = 'block';
|
el.style.display = 'block';
|
||||||
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
if (_netCountdownTimer) clearInterval(_netCountdownTimer);
|
||||||
@@ -925,30 +956,33 @@ function showNetPending(secs) {
|
|||||||
_netCountdownTimer = setInterval(function() {
|
_netCountdownTimer = setInterval(function() {
|
||||||
remaining--;
|
remaining--;
|
||||||
document.getElementById('net-countdown').textContent = remaining;
|
document.getElementById('net-countdown').textContent = remaining;
|
||||||
if (remaining <= 0) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; el.style.display='none'; loadNetwork(); }
|
if (remaining <= 0) { hideNetPending(); loadNetwork(); }
|
||||||
}, 1000);
|
}, 1000);
|
||||||
}
|
}
|
||||||
function confirmNetChange() {
|
function confirmNetChange() {
|
||||||
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; }
|
hideNetPending();
|
||||||
document.getElementById('net-pending').style.display='none';
|
fetch('/api/network/confirm',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
|
||||||
fetch('/api/network/confirm',{method:'POST'});
|
|
||||||
}
|
}
|
||||||
function rollbackNetChange() {
|
function rollbackNetChange() {
|
||||||
if (_netCountdownTimer) { clearInterval(_netCountdownTimer); _netCountdownTimer=null; }
|
hideNetPending();
|
||||||
document.getElementById('net-pending').style.display='none';
|
fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork()).catch(()=>{});
|
||||||
fetch('/api/network/rollback',{method:'POST'}).then(()=>loadNetwork());
|
|
||||||
}
|
}
|
||||||
function runDHCP() {
|
function runDHCP() {
|
||||||
const iface = document.getElementById('dhcp-iface').value.trim();
|
const iface = document.getElementById('dhcp-iface').value.trim();
|
||||||
|
showNetPending(NET_ROLLBACK_SECS);
|
||||||
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
|
fetch('/api/network/dhcp',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({interface:iface||'all'})})
|
||||||
.then(r=>r.json()).then(d => {
|
.then(r=>r.json()).then(d => {
|
||||||
document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
|
document.getElementById('dhcp-out').textContent = d.output || d.error || 'Done.';
|
||||||
if (!d.error) showNetPending(d.rollback_in || 60);
|
if (d.error) { hideNetPending(); return; }
|
||||||
|
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
loadNetwork();
|
loadNetwork();
|
||||||
|
}).catch(function() {
|
||||||
|
setTimeout(loadNetwork, 1500);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
function setStatic() {
|
function setStatic() {
|
||||||
const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
|
const dns = document.getElementById('st-dns').value.split(',').map(s=>s.trim()).filter(Boolean);
|
||||||
|
showNetPending(NET_ROLLBACK_SECS);
|
||||||
fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
|
fetch('/api/network/static',{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({
|
||||||
interface: document.getElementById('st-iface').value,
|
interface: document.getElementById('st-iface').value,
|
||||||
address: document.getElementById('st-addr').value,
|
address: document.getElementById('st-addr').value,
|
||||||
@@ -957,11 +991,16 @@ function setStatic() {
|
|||||||
dns: dns,
|
dns: dns,
|
||||||
})}).then(r=>r.json()).then(d => {
|
})}).then(r=>r.json()).then(d => {
|
||||||
document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
|
document.getElementById('static-out').textContent = d.output || d.error || 'Done.';
|
||||||
if (!d.error) showNetPending(d.rollback_in || 60);
|
if (d.error) { hideNetPending(); return; }
|
||||||
|
showNetPending(d.rollback_in || NET_ROLLBACK_SECS);
|
||||||
loadNetwork();
|
loadNetwork();
|
||||||
|
}).catch(function() {
|
||||||
|
setTimeout(loadNetwork, 1500);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
loadNetwork();
|
loadNetwork();
|
||||||
|
if (_netRefreshTimer) clearInterval(_netRefreshTimer);
|
||||||
|
_netRefreshTimer = setInterval(loadNetwork, 5000);
|
||||||
</script>`
|
</script>`
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1562,7 +1601,7 @@ function loadTasks() {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const rows = tasks.map(t => {
|
const rows = tasks.map(t => {
|
||||||
const dur = t.started_at ? formatDur(t.started_at, t.done_at) : '';
|
const dur = t.elapsed_sec ? formatDurSec(t.elapsed_sec) : '';
|
||||||
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
|
const statusClass = {running:'badge-ok',pending:'badge-unknown',done:'badge-ok',failed:'badge-err',cancelled:'badge-unknown'}[t.status]||'badge-unknown';
|
||||||
const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
|
const statusLabel = {running:'▶ running',pending:'pending',done:'✓ done',failed:'✗ failed',cancelled:'cancelled'}[t.status]||t.status;
|
||||||
let actions = '<button class="btn btn-sm btn-secondary" onclick="viewLog(\''+t.id+'\',\''+escHtml(t.name)+'\')">Logs</button>';
|
let actions = '<button class="btn btn-sm btn-secondary" onclick="viewLog(\''+t.id+'\',\''+escHtml(t.name)+'\')">Logs</button>';
|
||||||
@@ -1587,14 +1626,11 @@ function loadTasks() {
|
|||||||
|
|
||||||
// Escape &, <, > and " so a value can be embedded in HTML text or a double-quoted attribute.
function escHtml(s) { return (s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
|
// Escape &, <, > and " so a value can be embedded in HTML text or a double-quoted attribute.
function escHtml(s) { return (s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;'); }
|
||||||
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
|
function fmtTime(s) { if (!s) return ''; try { return new Date(s).toLocaleTimeString(); } catch(e){ return s; } }
|
||||||
function formatDur(start, end) {
|
function formatDurSec(sec) {
|
||||||
try {
|
sec = Math.max(0, Math.round(sec||0));
|
||||||
const s = new Date(start), e = end ? new Date(end) : new Date();
|
if (sec < 60) return sec+'s';
|
||||||
const sec = Math.round((e-s)/1000);
|
const m = Math.floor(sec/60), ss = sec%60;
|
||||||
if (sec < 60) return sec+'s';
|
return m+'m '+ss+'s';
|
||||||
const m = Math.floor(sec/60), ss = sec%60;
|
|
||||||
return m+'m '+ss+'s';
|
|
||||||
} catch(e){ return ''; }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function cancelTask(id) {
|
function cancelTask(id) {
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
@@ -84,6 +85,15 @@ func (r *metricsRing) snapshot() ([]float64, []string) {
|
|||||||
return v, labels
|
return v, labels
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (r *metricsRing) latest() (float64, bool) {
|
||||||
|
r.mu.Lock()
|
||||||
|
defer r.mu.Unlock()
|
||||||
|
if len(r.vals) == 0 {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
return r.vals[len(r.vals)-1], true
|
||||||
|
}
|
||||||
|
|
||||||
func timestampsSameLocalDay(times []time.Time) bool {
|
func timestampsSameLocalDay(times []time.Time) bool {
|
||||||
if len(times) == 0 {
|
if len(times) == 0 {
|
||||||
return true
|
return true
|
||||||
@@ -118,9 +128,12 @@ type namedMetricsRing struct {
|
|||||||
Ring *metricsRing
|
Ring *metricsRing
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const metricsChartWindow = 120
|
||||||
|
|
||||||
// pendingNetChange tracks a network state change awaiting confirmation.
|
// pendingNetChange tracks a network state change awaiting confirmation.
|
||||||
type pendingNetChange struct {
|
type pendingNetChange struct {
|
||||||
snapshot platform.NetworkSnapshot
|
snapshot platform.NetworkSnapshot
|
||||||
|
deadline time.Time
|
||||||
timer *time.Timer
|
timer *time.Timer
|
||||||
mu sync.Mutex
|
mu sync.Mutex
|
||||||
}
|
}
|
||||||
@@ -171,7 +184,7 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
|||||||
// Open metrics DB and pre-fill ring buffers from history.
|
// Open metrics DB and pre-fill ring buffers from history.
|
||||||
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
||||||
h.metricsDB = db
|
h.metricsDB = db
|
||||||
if samples, err := db.LoadRecent(120); err == nil {
|
if samples, err := db.LoadRecent(metricsChartWindow); err == nil {
|
||||||
for _, s := range samples {
|
for _, s := range samples {
|
||||||
h.feedRings(s)
|
h.feedRings(s)
|
||||||
}
|
}
|
||||||
@@ -292,11 +305,11 @@ func (h *handler) startMetricsCollector() {
|
|||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
for range ticker.C {
|
||||||
sample := platform.SampleLiveMetrics()
|
sample := platform.SampleLiveMetrics()
|
||||||
h.feedRings(sample)
|
|
||||||
h.setLatestMetric(sample)
|
|
||||||
if h.metricsDB != nil {
|
if h.metricsDB != nil {
|
||||||
_ = h.metricsDB.Write(sample)
|
_ = h.metricsDB.Write(sample)
|
||||||
}
|
}
|
||||||
|
h.feedRings(sample)
|
||||||
|
h.setLatestMetric(sample)
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
@@ -448,222 +461,13 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
|||||||
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
||||||
path = strings.TrimSuffix(path, ".svg")
|
path = strings.TrimSuffix(path, ".svg")
|
||||||
|
|
||||||
if h.metricsDB != nil {
|
if h.metricsDB == nil {
|
||||||
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
|
http.Error(w, "metrics database not available", http.StatusServiceUnavailable)
|
||||||
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
return
|
||||||
if err != nil {
|
|
||||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
w.Header().Set("Content-Type", "image/svg+xml")
|
|
||||||
w.Header().Set("Cache-Control", "no-store")
|
|
||||||
_, _ = w.Write(buf)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path)
|
||||||
var datasets [][]float64
|
if !ok {
|
||||||
var names []string
|
http.Error(w, "metrics history unavailable", http.StatusServiceUnavailable)
|
||||||
var labels []string
|
|
||||||
var title string
|
|
||||||
var yMin, yMax *float64 // nil = auto; for load charts fixed 0-100
|
|
||||||
|
|
||||||
switch {
|
|
||||||
// ── Server sub-charts ─────────────────────────────────────────────────
|
|
||||||
case path == "server-load":
|
|
||||||
title = "CPU / Memory Load"
|
|
||||||
vCPULoad, l := h.ringCPULoad.snapshot()
|
|
||||||
vMemLoad, _ := h.ringMemLoad.snapshot()
|
|
||||||
labels = l
|
|
||||||
datasets = [][]float64{vCPULoad, vMemLoad}
|
|
||||||
names = []string{"CPU Load %", "Mem Load %"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = floatPtr(100)
|
|
||||||
|
|
||||||
case path == "server-temp", path == "server-temp-cpu":
|
|
||||||
title = "CPU Temperature"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
datasets, names, labels = snapshotNamedRings(h.cpuTempRings)
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
case path == "server-temp-gpu":
|
|
||||||
title = "GPU Temperature"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for idx, gr := range h.gpuRings {
|
|
||||||
if gr == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
vTemp, l := gr.Temp.snapshot()
|
|
||||||
datasets = append(datasets, vTemp)
|
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
|
||||||
if len(labels) == 0 {
|
|
||||||
labels = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
case path == "server-temp-ambient":
|
|
||||||
title = "Ambient / Other Sensors"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
datasets, names, labels = snapshotNamedRings(h.ambientTempRings)
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
case path == "server-power":
|
|
||||||
title = "System Power"
|
|
||||||
vPower, l := h.ringPower.snapshot()
|
|
||||||
vPower = normalizePowerSeries(vPower)
|
|
||||||
labels = l
|
|
||||||
datasets = [][]float64{vPower}
|
|
||||||
names = []string{"Power W"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(vPower)
|
|
||||||
|
|
||||||
case path == "server-fans":
|
|
||||||
title = "Fan RPM"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for i, fr := range h.ringFans {
|
|
||||||
fv, _ := fr.snapshot()
|
|
||||||
datasets = append(datasets, fv)
|
|
||||||
name := "Fan"
|
|
||||||
if i < len(h.fanNames) {
|
|
||||||
name = h.fanNames[i]
|
|
||||||
}
|
|
||||||
names = append(names, name+" RPM")
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
// ── Combined GPU charts (all GPUs on one chart) ───────────────────────
|
|
||||||
case path == "gpu-all-load":
|
|
||||||
title = "GPU Compute Load"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for idx, gr := range h.gpuRings {
|
|
||||||
if gr == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
vUtil, l := gr.Util.snapshot()
|
|
||||||
datasets = append(datasets, vUtil)
|
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
|
||||||
if len(labels) == 0 {
|
|
||||||
labels = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = floatPtr(100)
|
|
||||||
|
|
||||||
case path == "gpu-all-memload":
|
|
||||||
title = "GPU Memory Load"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for idx, gr := range h.gpuRings {
|
|
||||||
if gr == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
vMem, l := gr.MemUtil.snapshot()
|
|
||||||
datasets = append(datasets, vMem)
|
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
|
||||||
if len(labels) == 0 {
|
|
||||||
labels = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = floatPtr(100)
|
|
||||||
|
|
||||||
case path == "gpu-all-power":
|
|
||||||
title = "GPU Power"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for idx, gr := range h.gpuRings {
|
|
||||||
if gr == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
vPow, l := gr.Power.snapshot()
|
|
||||||
datasets = append(datasets, vPow)
|
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
|
||||||
if len(labels) == 0 {
|
|
||||||
labels = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
case path == "gpu-all-temp":
|
|
||||||
title = "GPU Temperature"
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
for idx, gr := range h.gpuRings {
|
|
||||||
if gr == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
vTemp, l := gr.Temp.snapshot()
|
|
||||||
datasets = append(datasets, vTemp)
|
|
||||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
|
||||||
if len(labels) == 0 {
|
|
||||||
labels = l
|
|
||||||
}
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(datasets...)
|
|
||||||
|
|
||||||
// ── Per-GPU sub-charts ────────────────────────────────────────────────
|
|
||||||
case strings.HasPrefix(path, "gpu/"):
|
|
||||||
rest := strings.TrimPrefix(path, "gpu/")
|
|
||||||
// rest is either "{idx}-load", "{idx}-temp", "{idx}-power", or legacy "{idx}"
|
|
||||||
sub := ""
|
|
||||||
if i := strings.LastIndex(rest, "-"); i > 0 {
|
|
||||||
sub = rest[i+1:]
|
|
||||||
rest = rest[:i]
|
|
||||||
}
|
|
||||||
idx := 0
|
|
||||||
fmt.Sscanf(rest, "%d", &idx)
|
|
||||||
h.ringsMu.Lock()
|
|
||||||
var gr *gpuRings
|
|
||||||
if idx < len(h.gpuRings) {
|
|
||||||
gr = h.gpuRings[idx]
|
|
||||||
}
|
|
||||||
h.ringsMu.Unlock()
|
|
||||||
if gr == nil {
|
|
||||||
http.NotFound(w, r)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
switch sub {
|
|
||||||
case "load":
|
|
||||||
vUtil, l := gr.Util.snapshot()
|
|
||||||
vMemUtil, _ := gr.MemUtil.snapshot()
|
|
||||||
labels = l
|
|
||||||
title = fmt.Sprintf("GPU %d Load", idx)
|
|
||||||
datasets = [][]float64{vUtil, vMemUtil}
|
|
||||||
names = []string{"Load %", "Mem %"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = floatPtr(100)
|
|
||||||
case "temp":
|
|
||||||
vTemp, l := gr.Temp.snapshot()
|
|
||||||
labels = l
|
|
||||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
|
||||||
datasets = [][]float64{vTemp}
|
|
||||||
names = []string{"Temp °C"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(vTemp)
|
|
||||||
default: // "power" or legacy (no sub)
|
|
||||||
vPower, l := gr.Power.snapshot()
|
|
||||||
labels = l
|
|
||||||
title = fmt.Sprintf("GPU %d Power", idx)
|
|
||||||
datasets = [][]float64{vPower}
|
|
||||||
names = []string{"Power W"}
|
|
||||||
yMin = floatPtr(0)
|
|
||||||
yMax = autoMax120(vPower)
|
|
||||||
}
|
|
||||||
|
|
||||||
default:
|
|
||||||
http.NotFound(w, r)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -840,6 +644,7 @@ func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]f
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
sort.Strings(names)
|
||||||
datasets := make([][]float64, 0, len(names))
|
datasets := make([][]float64, 0, len(names))
|
||||||
for _, name := range names {
|
for _, name := range names {
|
||||||
ds := make([]float64, len(samples))
|
ds := make([]float64, len(samples))
|
||||||
@@ -867,6 +672,7 @@ func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []strin
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
sort.Strings(names)
|
||||||
datasets := make([][]float64, 0, len(names))
|
datasets := make([][]float64, 0, len(names))
|
||||||
for _, name := range names {
|
for _, name := range names {
|
||||||
ds := make([]float64, len(samples))
|
ds := make([]float64, len(samples))
|
||||||
@@ -878,7 +684,7 @@ func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []strin
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
datasets = append(datasets, ds)
|
datasets = append(datasets, normalizeFanSeries(ds))
|
||||||
}
|
}
|
||||||
return datasets, names
|
return datasets, names
|
||||||
}
|
}
|
||||||
@@ -894,6 +700,7 @@ func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetr
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
sort.Ints(indices)
|
||||||
datasets := make([][]float64, 0, len(indices))
|
datasets := make([][]float64, 0, len(indices))
|
||||||
names := make([]string, 0, len(indices))
|
names := make([]string, 0, len(indices))
|
||||||
for _, idx := range indices {
|
for _, idx := range indices {
|
||||||
@@ -953,6 +760,27 @@ func normalizePowerSeries(ds []float64) []float64 {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// normalizeFanSeries returns a copy of ds in which every non-positive sample
// is replaced by the closest positive sample that precedes it. Samples before
// the first positive reading become 0. An empty input yields nil; ds itself is
// never modified.
func normalizeFanSeries(ds []float64) []float64 {
	if len(ds) == 0 {
		return nil
	}
	out := make([]float64, len(ds))
	var held float64 // last positive reading seen so far; stays 0 until one appears
	for i, v := range ds {
		if v > 0 {
			held = v
		}
		out[i] = held
	}
	return out
}
|
||||||
|
|
||||||
// floatPtr returns a pointer to a float64 value.
|
// floatPtr returns a pointer to a float64 value.
|
||||||
func floatPtr(v float64) *float64 { return &v }
|
func floatPtr(v float64) *float64 { return &v }
|
||||||
|
|
||||||
@@ -1044,15 +872,17 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
|||||||
opt.Title = gocharts.TitleOption{Text: title}
|
opt.Title = gocharts.TitleOption{Text: title}
|
||||||
opt.XAxis.Labels = sparse
|
opt.XAxis.Labels = sparse
|
||||||
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
opt.Legend = gocharts.LegendOption{SeriesNames: names}
|
||||||
|
if chartLegendVisible(len(names)) {
|
||||||
|
opt.Legend.Offset = gocharts.OffsetStr{Top: gocharts.PositionBottom}
|
||||||
|
opt.Legend.OverlayChart = gocharts.Ptr(false)
|
||||||
|
} else {
|
||||||
|
opt.Legend.Show = gocharts.Ptr(false)
|
||||||
|
}
|
||||||
opt.Symbol = gocharts.SymbolNone
|
opt.Symbol = gocharts.SymbolNone
|
||||||
// Right padding: reserve space for the MarkLine label (library recommendation).
|
// Right padding: reserve space for the MarkLine label (library recommendation).
|
||||||
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
|
opt.Padding = gocharts.NewBox(20, 20, 80, 20)
|
||||||
if yMin != nil || yMax != nil {
|
if yMin != nil || yMax != nil {
|
||||||
opt.YAxis = []gocharts.YAxisOption{{
|
opt.YAxis = []gocharts.YAxisOption{chartYAxisOption(yMin, yMax)}
|
||||||
Min: yMin,
|
|
||||||
Max: yMax,
|
|
||||||
ValueFormatter: chartLegendNumber,
|
|
||||||
}}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add a single peak mark line on the series that holds the global maximum.
|
// Add a single peak mark line on the series that holds the global maximum.
|
||||||
@@ -1064,7 +894,7 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
|||||||
p := gocharts.NewPainter(gocharts.PainterOptions{
|
p := gocharts.NewPainter(gocharts.PainterOptions{
|
||||||
OutputFormat: gocharts.ChartOutputSVG,
|
OutputFormat: gocharts.ChartOutputSVG,
|
||||||
Width: 1400,
|
Width: 1400,
|
||||||
Height: 240,
|
Height: chartCanvasHeight(len(names)),
|
||||||
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
|
}, gocharts.PainterThemeOption(gocharts.GetTheme("grafana")))
|
||||||
if err := p.LineChart(opt); err != nil {
|
if err := p.LineChart(opt); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -1072,6 +902,26 @@ func renderChartSVG(title string, datasets [][]float64, names []string, labels [
|
|||||||
return p.Bytes()
|
return p.Bytes()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Layout defaults shared by every live metrics chart.
const (
	// chartLegendMaxSeries is the largest series count for which the legend
	// is still drawn.
	chartLegendMaxSeries = 8
	// chartHeightWithLegend is the SVG canvas height when the legend is drawn.
	chartHeightWithLegend = 360
	// chartHeightNoLegend is the SVG canvas height when the legend is hidden.
	chartHeightNoLegend = 288
)

// chartLegendVisible reports whether a chart with seriesCount series should
// render its legend.
func chartLegendVisible(seriesCount int) bool {
	return seriesCount <= chartLegendMaxSeries
}

// chartCanvasHeight returns the SVG canvas height for a chart with seriesCount
// series: the taller canvas when the legend is shown, the shorter one
// otherwise.
func chartCanvasHeight(seriesCount int) int {
	if chartLegendVisible(seriesCount) {
		return chartHeightWithLegend
	}
	return chartHeightNoLegend
}
|
||||||
|
|
||||||
|
func chartYAxisOption(yMin, yMax *float64) gocharts.YAxisOption {
|
||||||
|
return gocharts.YAxisOption{
|
||||||
|
Min: yMin,
|
||||||
|
Max: yMax,
|
||||||
|
LabelCount: 11,
|
||||||
|
ValueFormatter: chartYAxisNumber,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// globalPeakSeries returns the index of the series containing the global maximum
|
// globalPeakSeries returns the index of the series containing the global maximum
|
||||||
// value across all datasets, and that maximum value.
|
// value across all datasets, and that maximum value.
|
||||||
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
|
func globalPeakSeries(datasets [][]float64) (idx int, peak float64) {
|
||||||
@@ -1159,6 +1009,28 @@ func snapshotNamedRings(rings []*namedMetricsRing) ([][]float64, []string, []str
|
|||||||
return datasets, names, labels
|
return datasets, names, labels
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func snapshotFanRings(rings []*metricsRing, fanNames []string) ([][]float64, []string, []string) {
|
||||||
|
var datasets [][]float64
|
||||||
|
var names []string
|
||||||
|
var labels []string
|
||||||
|
for i, ring := range rings {
|
||||||
|
if ring == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
vals, l := ring.snapshot()
|
||||||
|
datasets = append(datasets, normalizeFanSeries(vals))
|
||||||
|
name := "Fan"
|
||||||
|
if i < len(fanNames) {
|
||||||
|
name = fanNames[i]
|
||||||
|
}
|
||||||
|
names = append(names, name+" RPM")
|
||||||
|
if len(labels) == 0 {
|
||||||
|
labels = l
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return datasets, names, labels
|
||||||
|
}
|
||||||
|
|
||||||
func chartLegendNumber(v float64) string {
|
func chartLegendNumber(v float64) string {
|
||||||
neg := v < 0
|
neg := v < 0
|
||||||
if v < 0 {
|
if v < 0 {
|
||||||
@@ -1181,6 +1053,23 @@ func chartLegendNumber(v float64) string {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// chartYAxisNumber formats a Y-axis tick value compactly. Magnitudes that
// round to 1000 or more are shown as whole thousands with a "к" suffix
// (rounded half up); smaller magnitudes are shown as whole numbers. Negative
// values get a leading "-", except when the magnitude rounds to zero.
func chartYAxisNumber(v float64) string {
	neg := v < 0
	if neg {
		v = -v
	}

	var out string
	// The cut-over is 999.5 rather than 1000 so that a value which would be
	// printed as "1000" by %.0f uses the thousands form like 1000 itself.
	if v >= 999.5 {
		out = fmt.Sprintf("%dк", int((v+500)/1000))
	} else {
		out = fmt.Sprintf("%.0f", v)
	}

	// Avoid emitting "-0" for small negative magnitudes.
	if neg && out != "0" {
		return "-" + out
	}
	return out
}
|
||||||
|
|
||||||
func sparseLabels(labels []string, n int) []string {
|
func sparseLabels(labels []string, n int) []string {
|
||||||
out := make([]string, len(labels))
|
out := make([]string, len(labels))
|
||||||
step := len(labels) / n
|
step := len(labels) / n
|
||||||
|
|||||||
@@ -89,6 +89,53 @@ func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestChartDataFromSamplesKeepsStableGPUSeriesOrder(t *testing.T) {
|
||||||
|
samples := []platform.LiveMetricSample{
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 7, PowerW: 170},
|
||||||
|
{GPUIndex: 2, PowerW: 120},
|
||||||
|
{GPUIndex: 0, PowerW: 100},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||||
|
GPUs: []platform.GPUMetricRow{
|
||||||
|
{GPUIndex: 0, PowerW: 101},
|
||||||
|
{GPUIndex: 7, PowerW: 171},
|
||||||
|
{GPUIndex: 2, PowerW: 121},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
datasets, names, _, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||||
|
if !ok {
|
||||||
|
t.Fatal("chartDataFromSamples returned ok=false")
|
||||||
|
}
|
||||||
|
if title != "GPU Power" {
|
||||||
|
t.Fatalf("title=%q", title)
|
||||||
|
}
|
||||||
|
wantNames := []string{"GPU 0", "GPU 2", "GPU 7"}
|
||||||
|
if len(names) != len(wantNames) {
|
||||||
|
t.Fatalf("names len=%d want %d: %v", len(names), len(wantNames), names)
|
||||||
|
}
|
||||||
|
for i := range wantNames {
|
||||||
|
if names[i] != wantNames[i] {
|
||||||
|
t.Fatalf("names[%d]=%q want %q; full=%v", i, names[i], wantNames[i], names)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if got := datasets[0]; len(got) != 2 || got[0] != 100 || got[1] != 101 {
|
||||||
|
t.Fatalf("GPU 0 dataset=%v want [100 101]", got)
|
||||||
|
}
|
||||||
|
if got := datasets[1]; len(got) != 2 || got[0] != 120 || got[1] != 121 {
|
||||||
|
t.Fatalf("GPU 2 dataset=%v want [120 121]", got)
|
||||||
|
}
|
||||||
|
if got := datasets[2]; len(got) != 2 || got[0] != 170 || got[1] != 171 {
|
||||||
|
t.Fatalf("GPU 7 dataset=%v want [170 171]", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
||||||
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
got := normalizePowerSeries([]float64{0, 480, 0, 0, 510, 0})
|
||||||
want := []float64{0, 480, 480, 480, 510, 510}
|
want := []float64{0, 480, 480, 480, 510, 510}
|
||||||
@@ -102,6 +149,117 @@ func TestNormalizePowerSeriesHoldsLastPositive(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestRenderMetricsUsesBufferedChartRefresh(t *testing.T) {
|
||||||
|
body := renderMetrics()
|
||||||
|
if !strings.Contains(body, "const probe = new Image();") {
|
||||||
|
t.Fatalf("metrics page should preload chart images before swap: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "el.dataset.loading === '1'") {
|
||||||
|
t.Fatalf("metrics page should avoid overlapping chart reloads: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartLegendVisible(t *testing.T) {
|
||||||
|
if !chartLegendVisible(8) {
|
||||||
|
t.Fatal("legend should stay visible for charts with up to 8 series")
|
||||||
|
}
|
||||||
|
if chartLegendVisible(9) {
|
||||||
|
t.Fatal("legend should be hidden for charts with more than 8 series")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartYAxisNumber(t *testing.T) {
|
||||||
|
tests := []struct {
|
||||||
|
in float64
|
||||||
|
want string
|
||||||
|
}{
|
||||||
|
{in: 999, want: "999"},
|
||||||
|
{in: 1000, want: "1к"},
|
||||||
|
{in: 1370, want: "1к"},
|
||||||
|
{in: 1500, want: "2к"},
|
||||||
|
{in: 10200, want: "10к"},
|
||||||
|
{in: -1499, want: "-1к"},
|
||||||
|
}
|
||||||
|
for _, tc := range tests {
|
||||||
|
if got := chartYAxisNumber(tc.in); got != tc.want {
|
||||||
|
t.Fatalf("chartYAxisNumber(%v)=%q want %q", tc.in, got, tc.want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartCanvasHeight(t *testing.T) {
|
||||||
|
if got := chartCanvasHeight(4); got != 360 {
|
||||||
|
t.Fatalf("chartCanvasHeight(4)=%d want 360", got)
|
||||||
|
}
|
||||||
|
if got := chartCanvasHeight(12); got != 288 {
|
||||||
|
t.Fatalf("chartCanvasHeight(12)=%d want 288", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNormalizeFanSeriesHoldsLastPositive(t *testing.T) {
|
||||||
|
got := normalizeFanSeries([]float64{4200, 0, 0, 4300, 0})
|
||||||
|
want := []float64{4200, 4200, 4200, 4300, 4300}
|
||||||
|
if len(got) != len(want) {
|
||||||
|
t.Fatalf("len=%d want %d", len(got), len(want))
|
||||||
|
}
|
||||||
|
for i := range want {
|
||||||
|
if got[i] != want[i] {
|
||||||
|
t.Fatalf("got[%d]=%v want %v", i, got[i], want[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestChartYAxisOption(t *testing.T) {
|
||||||
|
min := floatPtr(0)
|
||||||
|
max := floatPtr(100)
|
||||||
|
opt := chartYAxisOption(min, max)
|
||||||
|
if opt.Min != min || opt.Max != max {
|
||||||
|
t.Fatalf("chartYAxisOption min/max mismatch: %#v", opt)
|
||||||
|
}
|
||||||
|
if opt.LabelCount != 11 {
|
||||||
|
t.Fatalf("chartYAxisOption labelCount=%d want 11", opt.LabelCount)
|
||||||
|
}
|
||||||
|
if got := opt.ValueFormatter(1000); got != "1к" {
|
||||||
|
t.Fatalf("chartYAxisOption formatter(1000)=%q want 1к", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSnapshotFanRingsUsesTimelineLabels(t *testing.T) {
|
||||||
|
r1 := newMetricsRing(4)
|
||||||
|
r2 := newMetricsRing(4)
|
||||||
|
r1.push(1000)
|
||||||
|
r1.push(1100)
|
||||||
|
r2.push(1200)
|
||||||
|
r2.push(1300)
|
||||||
|
|
||||||
|
datasets, names, labels := snapshotFanRings([]*metricsRing{r1, r2}, []string{"FAN_A", "FAN_B"})
|
||||||
|
if len(datasets) != 2 {
|
||||||
|
t.Fatalf("datasets=%d want 2", len(datasets))
|
||||||
|
}
|
||||||
|
if len(names) != 2 || names[0] != "FAN_A RPM" || names[1] != "FAN_B RPM" {
|
||||||
|
t.Fatalf("names=%v", names)
|
||||||
|
}
|
||||||
|
if len(labels) != 2 {
|
||||||
|
t.Fatalf("labels=%v want 2 entries", labels)
|
||||||
|
}
|
||||||
|
if labels[0] == "" || labels[1] == "" {
|
||||||
|
t.Fatalf("labels should contain timeline values, got %v", labels)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestRenderNetworkInlineSyncsPendingState(t *testing.T) {
|
||||||
|
body := renderNetworkInline()
|
||||||
|
if !strings.Contains(body, "d.pending_change") {
|
||||||
|
t.Fatalf("network UI should read pending network state from API: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "setInterval(loadNetwork, 5000)") {
|
||||||
|
t.Fatalf("network UI should periodically refresh network state: %s", body)
|
||||||
|
}
|
||||||
|
if !strings.Contains(body, "showNetPending(NET_ROLLBACK_SECS)") {
|
||||||
|
t.Fatalf("network UI should show pending confirmation immediately on apply: %s", body)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRootRendersDashboard(t *testing.T) {
|
func TestRootRendersDashboard(t *testing.T) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
path := filepath.Join(dir, "audit.json")
|
path := filepath.Join(dir, "audit.json")
|
||||||
|
|||||||
@@ -83,16 +83,17 @@ func taskDisplayName(target, profile, loader string) string {
|
|||||||
|
|
||||||
// Task represents one unit of work in the queue.
|
// Task represents one unit of work in the queue.
|
||||||
type Task struct {
|
type Task struct {
|
||||||
ID string `json:"id"`
|
ID string `json:"id"`
|
||||||
Name string `json:"name"`
|
Name string `json:"name"`
|
||||||
Target string `json:"target"`
|
Target string `json:"target"`
|
||||||
Priority int `json:"priority"`
|
Priority int `json:"priority"`
|
||||||
Status string `json:"status"`
|
Status string `json:"status"`
|
||||||
CreatedAt time.Time `json:"created_at"`
|
CreatedAt time.Time `json:"created_at"`
|
||||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||||
DoneAt *time.Time `json:"done_at,omitempty"`
|
DoneAt *time.Time `json:"done_at,omitempty"`
|
||||||
ErrMsg string `json:"error,omitempty"`
|
ElapsedSec int `json:"elapsed_sec,omitempty"`
|
||||||
LogPath string `json:"log_path,omitempty"`
|
ErrMsg string `json:"error,omitempty"`
|
||||||
|
LogPath string `json:"log_path,omitempty"`
|
||||||
|
|
||||||
// runtime fields (not serialised)
|
// runtime fields (not serialised)
|
||||||
job *jobState
|
job *jobState
|
||||||
@@ -101,11 +102,11 @@ type Task struct {
|
|||||||
|
|
||||||
// taskParams holds optional parameters parsed from the run request.
|
// taskParams holds optional parameters parsed from the run request.
|
||||||
type taskParams struct {
|
type taskParams struct {
|
||||||
Duration int `json:"duration,omitempty"`
|
Duration int `json:"duration,omitempty"`
|
||||||
DiagLevel int `json:"diag_level,omitempty"`
|
DiagLevel int `json:"diag_level,omitempty"`
|
||||||
GPUIndices []int `json:"gpu_indices,omitempty"`
|
GPUIndices []int `json:"gpu_indices,omitempty"`
|
||||||
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
ExcludeGPUIndices []int `json:"exclude_gpu_indices,omitempty"`
|
||||||
Loader string `json:"loader,omitempty"`
|
Loader string `json:"loader,omitempty"`
|
||||||
BurnProfile string `json:"burn_profile,omitempty"`
|
BurnProfile string `json:"burn_profile,omitempty"`
|
||||||
DisplayName string `json:"display_name,omitempty"`
|
DisplayName string `json:"display_name,omitempty"`
|
||||||
Device string `json:"device,omitempty"` // for install
|
Device string `json:"device,omitempty"` // for install
|
||||||
@@ -311,6 +312,7 @@ func (q *taskQueue) snapshot() []Task {
|
|||||||
out := make([]Task, len(q.tasks))
|
out := make([]Task, len(q.tasks))
|
||||||
for i, t := range q.tasks {
|
for i, t := range q.tasks {
|
||||||
out[i] = *t
|
out[i] = *t
|
||||||
|
out[i].ElapsedSec = taskElapsedSec(&out[i], time.Now())
|
||||||
}
|
}
|
||||||
sort.SliceStable(out, func(i, j int) bool {
|
sort.SliceStable(out, func(i, j int) bool {
|
||||||
si := statusOrder(out[i].Status)
|
si := statusOrder(out[i].Status)
|
||||||
@@ -769,6 +771,7 @@ func (q *taskQueue) loadLocked() {
|
|||||||
q.assignTaskLogPathLocked(t)
|
q.assignTaskLogPathLocked(t)
|
||||||
if t.Status == TaskPending || t.Status == TaskRunning {
|
if t.Status == TaskPending || t.Status == TaskRunning {
|
||||||
t.Status = TaskPending
|
t.Status = TaskPending
|
||||||
|
t.StartedAt = nil
|
||||||
t.DoneAt = nil
|
t.DoneAt = nil
|
||||||
t.ErrMsg = ""
|
t.ErrMsg = ""
|
||||||
}
|
}
|
||||||
@@ -808,3 +811,21 @@ func (q *taskQueue) persistLocked() {
|
|||||||
}
|
}
|
||||||
_ = os.Rename(tmp, q.statePath)
|
_ = os.Rename(tmp, q.statePath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func taskElapsedSec(t *Task, now time.Time) int {
|
||||||
|
if t == nil || t.StartedAt == nil || t.StartedAt.IsZero() {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
start := *t.StartedAt
|
||||||
|
if !t.CreatedAt.IsZero() && start.Before(t.CreatedAt) {
|
||||||
|
start = t.CreatedAt
|
||||||
|
}
|
||||||
|
end := now
|
||||||
|
if t.DoneAt != nil && !t.DoneAt.IsZero() {
|
||||||
|
end = *t.DoneAt
|
||||||
|
}
|
||||||
|
if end.Before(start) {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
return int(end.Sub(start).Round(time.Second) / time.Second)
|
||||||
|
}
|
||||||
|
|||||||
@@ -55,6 +55,9 @@ func TestTaskQueuePersistsAndRecoversPendingTasks(t *testing.T) {
|
|||||||
if got.Status != TaskPending {
|
if got.Status != TaskPending {
|
||||||
t.Fatalf("status=%q want %q", got.Status, TaskPending)
|
t.Fatalf("status=%q want %q", got.Status, TaskPending)
|
||||||
}
|
}
|
||||||
|
if got.StartedAt != nil {
|
||||||
|
t.Fatalf("started_at=%v want nil for recovered pending task", got.StartedAt)
|
||||||
|
}
|
||||||
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
|
if got.params.Duration != 300 || got.params.BurnProfile != "smoke" {
|
||||||
t.Fatalf("params=%+v", got.params)
|
t.Fatalf("params=%+v", got.params)
|
||||||
}
|
}
|
||||||
@@ -236,6 +239,26 @@ func TestRunTaskBuildsSupportBundleWithoutApp(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestTaskElapsedSecClampsInvalidStartedAt(t *testing.T) {
|
||||||
|
now := time.Date(2026, 4, 1, 19, 10, 0, 0, time.UTC)
|
||||||
|
created := time.Date(2026, 4, 1, 19, 4, 5, 0, time.UTC)
|
||||||
|
started := time.Time{}
|
||||||
|
task := &Task{
|
||||||
|
Status: TaskRunning,
|
||||||
|
CreatedAt: created,
|
||||||
|
StartedAt: &started,
|
||||||
|
}
|
||||||
|
if got := taskElapsedSec(task, now); got != 0 {
|
||||||
|
t.Fatalf("taskElapsedSec(zero start)=%d want 0", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
stale := created.Add(-24 * time.Hour)
|
||||||
|
task.StartedAt = &stale
|
||||||
|
if got := taskElapsedSec(task, now); got != int(now.Sub(created).Seconds()) {
|
||||||
|
t.Fatalf("taskElapsedSec(stale start)=%d want %d", got, int(now.Sub(created).Seconds()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
func TestRunTaskInstallUsesSharedCommandStreaming(t *testing.T) {
|
||||||
q := &taskQueue{
|
q := &taskQueue{
|
||||||
opts: &HandlerOptions{},
|
opts: &HandlerOptions{},
|
||||||
|
|||||||
@@ -9,6 +9,34 @@ All live metrics charts in the web UI are server-side SVG images served by Go
|
|||||||
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
and polled by the browser every 2 seconds via `<img src="...?t=now">`.
|
||||||
There is no client-side canvas or JS chart library.
|
There is no client-side canvas or JS chart library.
|
||||||
|
|
||||||
|
## Rule: live charts must be visually uniform
|
||||||
|
|
||||||
|
Live charts are a single UI family, not a set of one-off widgets. New charts and
|
||||||
|
changes to existing charts must keep the same rendering model and presentation
|
||||||
|
rules unless there is an explicit architectural decision to diverge.
|
||||||
|
|
||||||
|
Default expectations:
|
||||||
|
|
||||||
|
- same server-side SVG pipeline for all live metrics charts
|
||||||
|
- same refresh behaviour and failure handling in the browser
|
||||||
|
- same canvas size class and card layout
|
||||||
|
- same legend placement policy across charts
|
||||||
|
- same axis, title, and summary conventions
|
||||||
|
- no chart-specific visual exceptions added as a quick fix
|
||||||
|
|
||||||
|
Current default for live charts:
|
||||||
|
|
||||||
|
- legend below the plot area when a chart has 8 series or fewer
|
||||||
|
- legend hidden when a chart has more than 8 series
|
||||||
|
- 10 equal Y-axis steps across the chart height
|
||||||
|
- 1400 × 360 px SVG canvas when the legend is shown
|
||||||
|
- 1400 × 288 px SVG canvas when the legend is hidden
|
||||||
|
- full-width card rendering in a single-column stack
|
||||||
|
|
||||||
|
If one chart needs a different layout or legend behaviour, treat that as a
|
||||||
|
design-level decision affecting the whole chart family, not as a local tweak to
|
||||||
|
just one endpoint.
|
||||||
|
|
||||||
### Why go-analyze/charts
|
### Why go-analyze/charts
|
||||||
|
|
||||||
- Pure Go, no CGO — builds cleanly inside the live-build container
|
- Pure Go, no CGO — builds cleanly inside the live-build container
|
||||||
@@ -29,7 +57,8 @@ self-contained SVG renderer used **only** for completed SAT run reports
|
|||||||
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
| `GET /api/metrics/chart/server.svg` | CPU temp, CPU load %, mem load %, power W, fan RPMs |
|
||||||
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
| `GET /api/metrics/chart/gpu/{idx}.svg` | GPU temp °C, load %, mem %, power W |
|
||||||
|
|
||||||
Charts are 1400 × 280 px SVG. The page renders them at `width: 100%` in a
|
Charts are 1400 × 360 px SVG when the legend is shown, and 1400 × 288 px when
|
||||||
|
the legend is hidden. The page renders them at `width: 100%` in a
|
||||||
single-column layout so they always fill the viewport width.
|
single-column layout so they always fill the viewport width.
|
||||||
|
|
||||||
### Ring buffers
|
### Ring buffers
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
|
# Decision: Treat memtest as explicit ISO content, not as trusted live-build magic
|
||||||
|
|
||||||
**Date:** 2026-04-01
|
**Date:** 2026-04-01
|
||||||
**Status:** active
|
**Status:** resolved
|
||||||
|
|
||||||
## Context
|
## Context
|
||||||
|
|
||||||
@@ -58,6 +58,18 @@ Root cause of the false alarm:
|
|||||||
- as a result, we re-entered the same memtest investigation loop even though
|
- as a result, we re-entered the same memtest investigation loop even though
|
||||||
the real ISO was already correct
|
the real ISO was already correct
|
||||||
|
|
||||||
|
Additional correction from the subsequent `v3.21` build logs dated 2026-04-01:
|
||||||
|
|
||||||
|
- once ISO reading was fixed, the post-build debug correctly showed the raw ISO
|
||||||
|
still carried live-build's default memtest layout (`live/memtest.bin`,
|
||||||
|
`live/memtest.efi`, `boot/grub/memtest.cfg`, `isolinux/memtest.cfg`)
|
||||||
|
- that mismatch is expected to trigger project recovery, because `bee` requires
|
||||||
|
`boot/memtest86+x64.bin` / `boot/memtest86+x64.efi` plus matching menu paths
|
||||||
|
- however, `build.sh` exited before recovery because `set -e` treated a direct
|
||||||
|
`iso_memtest_present` return code of `1` as fatal
|
||||||
|
- so the next repeated loop was caused by shell control flow, not by proof that
|
||||||
|
the recovery design itself was wrong
|
||||||
|
|
||||||
## Known Failed Attempts
|
## Known Failed Attempts
|
||||||
|
|
||||||
These approaches were already tried and should not be repeated blindly:
|
These approaches were already tried and should not be repeated blindly:
|
||||||
@@ -102,6 +114,8 @@ Any future memtest fix must explicitly identify:
|
|||||||
- and a post-build proof from a real ISO, not only from intermediate workdir files
|
- and a post-build proof from a real ISO, not only from intermediate workdir files
|
||||||
- whether the ISO inspection step itself succeeded, rather than merely whether
|
- whether the ISO inspection step itself succeeded, rather than merely whether
|
||||||
the validator printed a memtest warning
|
the validator printed a memtest warning
|
||||||
|
- whether a non-zero probe is intentionally handled inside an `if` / `case`
|
||||||
|
context rather than accidentally tripping `set -e`
|
||||||
|
|
||||||
## Decision
|
## Decision
|
||||||
|
|
||||||
@@ -134,6 +148,8 @@ Current implementation direction:
|
|||||||
- install a stable ISO reader in the builder image
|
- install a stable ISO reader in the builder image
|
||||||
- fail with an explicit reader error if ISO listing/extraction fails
|
- fail with an explicit reader error if ISO listing/extraction fails
|
||||||
- do not treat reader failure as evidence that memtest is missing
|
- do not treat reader failure as evidence that memtest is missing
|
||||||
|
- do not call a probe that may return "needs recovery" as a bare command under
|
||||||
|
`set -e`; wrap it in explicit control flow
|
||||||
|
|
||||||
## Consequences
|
## Consequences
|
||||||
|
|
||||||
@@ -144,3 +160,65 @@ Current implementation direction:
|
|||||||
- But validation output is only trustworthy if ISO reading itself succeeded. A
|
- But validation output is only trustworthy if ISO reading itself succeeded. A
|
||||||
"missing memtest" warning without a successful ISO read is not evidence.
|
"missing memtest" warning without a successful ISO read is not evidence.
|
||||||
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
|
- If we change memtest strategy again, we must update this ADR with the exact build evidence that justified the change.
|
||||||
|
|
||||||
|
## Working Solution (confirmed 2026-04-01, commits 76a9100 → 2baf3be)
|
||||||
|
|
||||||
|
This approach was confirmed working in ISO `easy-bee-nvidia-v3.20-5-g76a9100-amd64.iso`
|
||||||
|
and validated again in subsequent builds. The final ISO contains all required memtest artifacts.
|
||||||
|
|
||||||
|
### Components
|
||||||
|
|
||||||
|
**1. Binary hook `config/hooks/normal/9100-memtest.hook.binary`**
|
||||||
|
|
||||||
|
Runs inside the live-build binary phase. Does not patch bootloader files at hook time —
|
||||||
|
those files may not exist yet. Instead:
|
||||||
|
|
||||||
|
- Tries to copy `memtest86+x64.bin` / `memtest86+x64.efi` from `chroot/boot/` first.
|
||||||
|
- Falls back to extracting from the cached `.deb` (via `dpkg-deb -x`) if `chroot/boot/` is empty.
|
||||||
|
- Appends GRUB and isolinux menu entries only if the respective cfg files already exist at hook time.
|
||||||
|
If they do not exist, the hook warns and continues (does not fail).
|
||||||
|
|
||||||
|
Set the `BEE_REQUIRE_MEMTEST=1` environment variable to turn these warnings into hard errors when needed.
|
||||||
|
|
||||||
|
**2. Post-`lb build` recovery step in `build.sh`**
|
||||||
|
|
||||||
|
After `lb build` completes, `build.sh` checks whether the fully materialized `binary/` tree
|
||||||
|
contains all required memtest artifacts. If not:
|
||||||
|
|
||||||
|
- Copies/extracts memtest binaries into `binary/boot/`.
|
||||||
|
- Patches `binary/boot/grub/grub.cfg` and `binary/isolinux/live.cfg` directly.
|
||||||
|
- Reruns the late binary stages (`binary_checksums`, `binary_iso`, `binary_zsync`) to rebuild
|
||||||
|
the ISO with the patched tree.
|
||||||
|
|
||||||
|
This is the deterministic safety net: even if the hook runs at the wrong time, the recovery
|
||||||
|
step handles the final `binary/` tree after live-build has written all bootloader configs.
|
||||||
|
|
||||||
|
**3. ISO validation hardening**
|
||||||
|
|
||||||
|
The memtest probe in `build.sh` is wrapped in explicit `if` / `case` control flow, not called
|
||||||
|
as a bare command under `set -e`. A non-zero probe return (needs recovery) is intentional and
|
||||||
|
handled — it does not abort the build prematurely.
|
||||||
|
|
||||||
|
ISO reading (`xorriso -indev -ls` / extraction) is treated as a separate prerequisite.
|
||||||
|
If the reader fails, the validator reports a reader error explicitly, not a memtest warning.
|
||||||
|
This prevents the false-negative loop that wasted builds v3.14–v3.19 on 2026-04-01.
|
||||||
|
|
||||||
|
### Why this works when earlier attempts did not
|
||||||
|
|
||||||
|
The earlier patterns all shared a single flaw: they assumed a single build-time point
|
||||||
|
(hook or source template) would be the last writer of bootloader configs and memtest payloads.
|
||||||
|
In live-build on Debian Bookworm that assumption is false — live-build continues writing
|
||||||
|
bootloader files after custom hooks run, and `chroot/boot/` does not reliably hold memtest payloads.
|
||||||
|
|
||||||
|
The recovery step sidesteps the ordering problem entirely: it acts on the fully materialized
|
||||||
|
`binary/` tree after `lb build` finishes, then rebuilds the ISO from that patched tree.
|
||||||
|
There is no ordering dependency to get wrong.
|
||||||
|
|
||||||
|
### Do not revert
|
||||||
|
|
||||||
|
Do not remove the recovery step or the hook without a fresh real ISO build proving
|
||||||
|
live-build alone produces all four required artifacts:
|
||||||
|
- `boot/memtest86+x64.bin`
|
||||||
|
- `boot/memtest86+x64.efi`
|
||||||
|
- memtest entry in `boot/grub/grub.cfg`
|
||||||
|
- memtest entry in `isolinux/live.cfg`
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ NCCL_TESTS_VERSION=2.13.10
|
|||||||
NVCC_VERSION=12.8
|
NVCC_VERSION=12.8
|
||||||
CUBLAS_VERSION=13.0.2.14-1
|
CUBLAS_VERSION=13.0.2.14-1
|
||||||
CUDA_USERSPACE_VERSION=13.0.96-1
|
CUDA_USERSPACE_VERSION=13.0.96-1
|
||||||
DCGM_VERSION=4.5.2-1
|
DCGM_VERSION=4.5.3-1
|
||||||
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
JOHN_JUMBO_COMMIT=67fcf9fe5a
|
||||||
ROCM_VERSION=6.3.4
|
ROCM_VERSION=6.3.4
|
||||||
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
ROCM_SMI_VERSION=7.4.0.60304-76~22.04
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ lb config noauto \
|
|||||||
--memtest memtest86+ \
|
--memtest memtest86+ \
|
||||||
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-volume "EASY_BEE_${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
--iso-application "EASY-BEE-${BEE_GPU_VENDOR_UPPER:-NVIDIA}" \
|
||||||
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=7 username=bee user-fullname=Bee modprobe.blacklist=nouveau" \
|
--bootappend-live "boot=live components video=1920x1080 console=tty0 console=ttyS0,115200n8 loglevel=3 username=bee user-fullname=Bee modprobe.blacklist=nouveau,snd_hda_intel,snd_hda_codec_realtek,snd_hda_codec_generic,soundcore" \
|
||||||
--apt-recommends false \
|
--apt-recommends false \
|
||||||
--chroot-squashfs-compression-type zstd \
|
--chroot-squashfs-compression-type zstd \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
|||||||
@@ -862,7 +862,6 @@ rm -f \
|
|||||||
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
||||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-nccl-gpu-stress" \
|
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/john" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
"${OVERLAY_STAGE_DIR}/usr/local/lib/bee/bee-gpu-burn-worker" \
|
||||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest" \
|
||||||
@@ -1136,13 +1135,16 @@ fi
|
|||||||
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
ISO_RAW="${LB_DIR}/live-image-amd64.hybrid.iso"
|
||||||
if [ -f "$ISO_RAW" ]; then
|
if [ -f "$ISO_RAW" ]; then
|
||||||
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
|
dump_memtest_debug "post-build" "${LB_DIR}" "$ISO_RAW"
|
||||||
iso_memtest_present "$ISO_RAW"
|
if iso_memtest_present "$ISO_RAW"; then
|
||||||
memtest_status=$?
|
:
|
||||||
if [ "$memtest_status" -eq 1 ]; then
|
else
|
||||||
recover_iso_memtest "${LB_DIR}" "$ISO_RAW"
|
memtest_status=$?
|
||||||
dump_memtest_debug "post-recovery" "${LB_DIR}" "$ISO_RAW"
|
if [ "$memtest_status" -eq 1 ]; then
|
||||||
elif [ "$memtest_status" -eq 2 ]; then
|
recover_iso_memtest "${LB_DIR}" "$ISO_RAW"
|
||||||
memtest_fail "failed to inspect ISO for memtest before recovery" "$ISO_RAW"
|
dump_memtest_debug "post-recovery" "${LB_DIR}" "$ISO_RAW"
|
||||||
|
elif [ "$memtest_status" -eq 2 ]; then
|
||||||
|
memtest_fail "failed to inspect ISO for memtest before recovery" "$ISO_RAW"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
validate_iso_memtest "$ISO_RAW"
|
validate_iso_memtest "$ISO_RAW"
|
||||||
cp "$ISO_RAW" "$ISO_OUT"
|
cp "$ISO_RAW" "$ISO_OUT"
|
||||||
|
|||||||
@@ -1,3 +1,6 @@
|
|||||||
|
# AMD GPU firmware
|
||||||
|
firmware-amd-graphics
|
||||||
|
|
||||||
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
# AMD ROCm — GPU monitoring, bandwidth test, and compute stress (RVS GST)
|
||||||
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
rocm-smi-lib=%%ROCM_SMI_VERSION%%
|
||||||
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
rocm-bandwidth-test=%%ROCM_BANDWIDTH_TEST_VERSION%%
|
||||||
|
|||||||
@@ -71,9 +71,7 @@ lightdm
|
|||||||
firmware-linux-free
|
firmware-linux-free
|
||||||
firmware-linux-nonfree
|
firmware-linux-nonfree
|
||||||
firmware-misc-nonfree
|
firmware-misc-nonfree
|
||||||
firmware-amd-graphics
|
|
||||||
firmware-realtek
|
firmware-realtek
|
||||||
firmware-intel-sound
|
|
||||||
firmware-bnx2
|
firmware-bnx2
|
||||||
firmware-bnx2x
|
firmware-bnx2x
|
||||||
firmware-cavium
|
firmware-cavium
|
||||||
|
|||||||
@@ -52,6 +52,14 @@ else
|
|||||||
fail "nvidia-smi: NOT FOUND"
|
fail "nvidia-smi: NOT FOUND"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
for tool in bee-gpu-burn bee-john-gpu-stress bee-nccl-gpu-stress all_reduce_perf; do
|
||||||
|
if p=$(PATH="/usr/local/bin:$PATH" command -v "$tool" 2>/dev/null); then
|
||||||
|
ok "$tool found: $p"
|
||||||
|
else
|
||||||
|
fail "$tool: NOT FOUND"
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "-- NVIDIA modules --"
|
echo "-- NVIDIA modules --"
|
||||||
KO_DIR="/usr/local/lib/nvidia"
|
KO_DIR="/usr/local/lib/nvidia"
|
||||||
|
|||||||
@@ -190,4 +190,16 @@ CHOSEN_FORMAT=$(choose_format) || {
|
|||||||
}
|
}
|
||||||
|
|
||||||
echo "format=${CHOSEN_FORMAT}"
|
echo "format=${CHOSEN_FORMAT}"
|
||||||
exec ./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${JOHN_DEVICES}"
|
# Run one john benchmark process per OpenCL device id instead of a single
# process for all devices, then wait for every process and fail if any failed.
PIDS=""
|
||||||
|
_first=1
|
||||||
|
# JOHN_DEVICES is a comma-separated list; split it into one id per iteration.
for opencl_id in $(echo "${JOHN_DEVICES}" | tr ',' ' '); do
|
||||||
|
# Stagger launches: every process after the first starts 3 seconds later.
[ "${_first}" = "1" ] || sleep 3
|
||||||
|
_first=0
|
||||||
|
# NOTE(review): if this script runs under bash, SECONDS is a special variable
# that keeps incrementing after assignment, so later devices would receive a
# larger --test value than earlier ones. Confirm the interpreter, or copy the
# duration into a differently named variable before this loop.
./john --test="${SECONDS}" --format="${CHOSEN_FORMAT}" --devices="${opencl_id}" &
|
||||||
|
# Remember the background PID so it can be waited on below.
PIDS="${PIDS} $!"
|
||||||
|
done
|
||||||
|
FAIL=0
|
||||||
|
for pid in ${PIDS}; do
|
||||||
|
# Count each process that exits non-zero; keep waiting for the rest.
wait "${pid}" || FAIL=$((FAIL+1))
|
||||||
|
done
|
||||||
|
# Exit 1 with a summary on stderr if at least one device process failed.
[ "${FAIL}" -eq 0 ] || { echo "john: ${FAIL} device(s) failed" >&2; exit 1; }
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ chromium \
|
|||||||
--no-first-run \
|
--no-first-run \
|
||||||
--disable-session-crashed-bubble \
|
--disable-session-crashed-bubble \
|
||||||
--disable-features=TranslateUI \
|
--disable-features=TranslateUI \
|
||||||
--start-fullscreen \
|
--start-maximized \
|
||||||
http://localhost/ &
|
http://localhost/ &
|
||||||
|
|
||||||
exec openbox
|
exec openbox
|
||||||
|
|||||||
@@ -3,6 +3,11 @@
|
|||||||
# Type 'a' at any prompt to abort, 'b' to go back.
|
# Type 'a' at any prompt to abort, 'b' to go back.
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
# Requires root for ip/dhclient/resolv.conf — re-exec via sudo if needed.
|
||||||
|
if [ "$(id -u)" -ne 0 ]; then
|
||||||
|
exec sudo "$0" "$@"
|
||||||
|
fi
|
||||||
|
|
||||||
abort() { echo "Aborted."; exit 0; }
|
abort() { echo "Aborted."; exit 0; }
|
||||||
|
|
||||||
ask() {
|
ask() {
|
||||||
|
|||||||
Reference in New Issue
Block a user