feat(metrics): persist history in sqlite and add AMD memory validate tests
This commit is contained in:
@@ -114,6 +114,8 @@ type satRunner interface {
|
||||
DetectGPUVendor() string
|
||||
ListAMDGPUs() ([]platform.AMDGPUInfo, error)
|
||||
RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error)
|
||||
RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error)
|
||||
@@ -577,6 +579,20 @@ func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) {
|
||||
return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if strings.TrimSpace(baseDir) == "" {
|
||||
baseDir = DefaultSATBaseDir
|
||||
}
|
||||
return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc)
|
||||
}
|
||||
|
||||
func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc)
|
||||
}
|
||||
|
||||
@@ -181,6 +181,14 @@ func (f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func(
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
@@ -136,6 +136,54 @@ func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFu
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
|
||||
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
cfgFile := "/tmp/bee-amd-mem.conf"
|
||||
cfg := `actions:
|
||||
- name: mem_integrity
|
||||
device: all
|
||||
module: mem
|
||||
parallel: true
|
||||
duration: 60000
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size: 8640
|
||||
`
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
|
||||
func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
cfgFile := "/tmp/bee-amd-babel.conf"
|
||||
cfg := `actions:
|
||||
- name: babel_mem_bw
|
||||
device: all
|
||||
module: babel
|
||||
parallel: true
|
||||
copy_matrix: true
|
||||
target_stress: 90
|
||||
matrix_size: 134217728
|
||||
`
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||
{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
||||
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
||||
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
@@ -161,7 +209,7 @@ func amdStressRVSConfig(seconds int) string {
|
||||
module: gst
|
||||
parallel: true
|
||||
duration: %d
|
||||
copy_matrix: true
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
|
||||
@@ -39,15 +39,26 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAMDStressConfigEnablesVRAMTraffic(t *testing.T) {
|
||||
func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cfg := amdStressRVSConfig(123)
|
||||
if !strings.Contains(cfg, "copy_matrix: true") {
|
||||
t.Fatalf("config missing VRAM copy path:\n%s", cfg)
|
||||
if !strings.Contains(cfg, "module: gst") {
|
||||
t.Fatalf("config missing gst module:\n%s", cfg)
|
||||
}
|
||||
if !strings.Contains(cfg, "duration: 123000") {
|
||||
t.Fatalf("config missing millisecond duration:\n%s", cfg)
|
||||
if strings.Contains(cfg, "module: mem") {
|
||||
t.Fatalf("config should not include mem module:\n%s", cfg)
|
||||
}
|
||||
if !strings.Contains(cfg, "copy_matrix: false") {
|
||||
t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
|
||||
}
|
||||
if strings.Count(cfg, "duration: 123000") != 1 {
|
||||
t.Fatalf("config should apply duration once:\n%s", cfg)
|
||||
}
|
||||
for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
|
||||
if !strings.Contains(cfg, field) {
|
||||
t.Fatalf("config missing %s:\n%s", field, cfg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -599,10 +599,9 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request)
|
||||
case <-r.Context().Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
sample := platform.SampleLiveMetrics()
|
||||
h.feedRings(sample)
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
sample, ok := h.latestMetric()
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
b, err := json.Marshal(sample)
|
||||
if err != nil {
|
||||
|
||||
@@ -3,7 +3,6 @@ package webui
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"strconv"
|
||||
"time"
|
||||
@@ -13,7 +12,6 @@ import (
|
||||
)
|
||||
|
||||
const metricsDBPath = "/appdata/bee/metrics.db"
|
||||
const metricsKeepDuration = 24 * time.Hour
|
||||
|
||||
// MetricsDB persists live metric samples to SQLite.
|
||||
type MetricsDB struct {
|
||||
@@ -116,11 +114,18 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error {
|
||||
}
|
||||
|
||||
// LoadRecent returns up to n samples in chronological order (oldest first).
|
||||
// It reconstructs LiveMetricSample from the normalized tables.
|
||||
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
rows, err := m.db.Query(
|
||||
`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n,
|
||||
)
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n)
|
||||
}
|
||||
|
||||
// LoadAll returns all persisted samples in chronological order (oldest first).
|
||||
func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) {
|
||||
return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`, nil)
|
||||
}
|
||||
|
||||
// loadSamples reconstructs LiveMetricSample rows from the normalized tables.
|
||||
func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) {
|
||||
rows, err := m.db.Query(query, args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -257,14 +262,6 @@ func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) {
|
||||
return samples, nil
|
||||
}
|
||||
|
||||
// Prune deletes samples older than keepDuration.
|
||||
func (m *MetricsDB) Prune(keepDuration time.Duration) {
|
||||
cutoff := time.Now().Add(-keepDuration).Unix()
|
||||
for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} {
|
||||
_, _ = m.db.Exec(fmt.Sprintf("DELETE FROM %s WHERE ts < ?", table), cutoff)
|
||||
}
|
||||
}
|
||||
|
||||
// ExportCSV writes all sys+gpu data as CSV to w.
|
||||
func (m *MetricsDB) ExportCSV(w io.Writer) error {
|
||||
rows, err := m.db.Query(`
|
||||
|
||||
@@ -494,7 +494,11 @@ func renderValidate() string {
|
||||
renderSATCard("memory", "Memory", "") +
|
||||
renderSATCard("storage", "Storage", "") +
|
||||
renderSATCard("cpu", "CPU", `<div class="form-row"><label>Duration (seconds)</label><input type="number" id="sat-cpu-dur" value="60" min="10"></div>`) +
|
||||
renderSATCard("amd", "AMD GPU", "") +
|
||||
renderSATCard("amd", "AMD GPU", `<div style="display:flex;gap:8px;flex-wrap:wrap;margin-bottom:8px">
|
||||
<button id="sat-btn-amd-mem" class="btn" type="button" onclick="runSAT('amd-mem')">MEM Integrity</button>
|
||||
<button id="sat-btn-amd-bandwidth" class="btn" type="button" onclick="runSAT('amd-bandwidth')">MEM Bandwidth</button>
|
||||
</div>
|
||||
<p style="color:var(--muted);font-size:12px;margin:0">Additional AMD memory diagnostics: RVS MEM for integrity and BABEL + rocm-bandwidth-test for memory/interconnect bandwidth.</p>`) +
|
||||
`</div>
|
||||
<div id="sat-output" style="display:none;margin-top:16px" class="card">
|
||||
<div class="card-head">Test Output <span id="sat-title"></span></div>
|
||||
@@ -505,7 +509,7 @@ let satES = null;
|
||||
function runSAT(target) {
|
||||
if (satES) { satES.close(); satES = null; }
|
||||
const body = {};
|
||||
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
|
||||
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||
@@ -524,7 +528,7 @@ function runSAT(target) {
|
||||
}
|
||||
function runAllSAT() {
|
||||
const cycles = Math.max(1, parseInt(document.getElementById('sat-cycles').value)||1);
|
||||
const targets = ['nvidia','memory','storage','cpu','amd'];
|
||||
const targets = ['nvidia','memory','storage','cpu','amd','amd-mem','amd-bandwidth'];
|
||||
const total = targets.length * cycles;
|
||||
let enqueued = 0;
|
||||
const status = document.getElementById('sat-all-status');
|
||||
@@ -536,7 +540,7 @@ function runAllSAT() {
|
||||
const btn = document.getElementById('sat-btn-' + target);
|
||||
if (btn && btn.disabled) { enqueueNext(cycle, idx+1); return; }
|
||||
const body = {};
|
||||
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU'};
|
||||
const labels = {nvidia:'Validate GPU', memory:'Validate Memory', storage:'Validate Storage', cpu:'Validate CPU', amd:'Validate AMD GPU', 'amd-mem':'AMD GPU MEM Integrity', 'amd-bandwidth':'AMD GPU MEM Bandwidth'};
|
||||
body.display_name = labels[target] || ('Validate ' + target);
|
||||
if (target === 'nvidia') body.diag_level = parseInt(document.getElementById('sat-nvidia-level').value)||1;
|
||||
if (target === 'cpu') body.duration = parseInt(document.getElementById('sat-cpu-dur').value)||60;
|
||||
@@ -554,6 +558,8 @@ function runAllSAT() {
|
||||
fetch('/api/gpu/presence').then(r=>r.json()).then(gp => {
|
||||
if (!gp.nvidia) disableSATCard('nvidia', 'No NVIDIA GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd-mem', 'No AMD GPU detected');
|
||||
if (!gp.amd) disableSATCard('amd-bandwidth', 'No AMD GPU detected');
|
||||
});
|
||||
function disableSATCard(id, reason) {
|
||||
const btn = document.getElementById('sat-btn-' + id);
|
||||
|
||||
@@ -132,6 +132,8 @@ type handler struct {
|
||||
// per-GPU rings (index = GPU index)
|
||||
gpuRings []*gpuRings
|
||||
ringsMu sync.Mutex
|
||||
latestMu sync.RWMutex
|
||||
latest *platform.LiveMetricSample
|
||||
// metrics persistence (nil if DB unavailable)
|
||||
metricsDB *MetricsDB
|
||||
// install job (at most one at a time)
|
||||
@@ -164,13 +166,16 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
// Open metrics DB and pre-fill ring buffers from history.
|
||||
if db, err := openMetricsDB(metricsDBPath); err == nil {
|
||||
h.metricsDB = db
|
||||
db.Prune(metricsKeepDuration)
|
||||
if samples, err := db.LoadRecent(120); err == nil {
|
||||
for _, s := range samples {
|
||||
h.feedRings(s)
|
||||
}
|
||||
if len(samples) > 0 {
|
||||
h.setLatestMetric(samples[len(samples)-1])
|
||||
}
|
||||
}
|
||||
}
|
||||
h.startMetricsCollector()
|
||||
|
||||
globalQueue.startWorker(&opts)
|
||||
mux := http.NewServeMux()
|
||||
@@ -198,6 +203,8 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
mux.HandleFunc("POST /api/sat/storage/run", h.handleAPISATRun("storage"))
|
||||
mux.HandleFunc("POST /api/sat/cpu/run", h.handleAPISATRun("cpu"))
|
||||
mux.HandleFunc("POST /api/sat/amd/run", h.handleAPISATRun("amd"))
|
||||
mux.HandleFunc("POST /api/sat/amd-mem/run", h.handleAPISATRun("amd-mem"))
|
||||
mux.HandleFunc("POST /api/sat/amd-bandwidth/run", h.handleAPISATRun("amd-bandwidth"))
|
||||
mux.HandleFunc("POST /api/sat/amd-stress/run", h.handleAPISATRun("amd-stress"))
|
||||
mux.HandleFunc("POST /api/sat/memory-stress/run", h.handleAPISATRun("memory-stress"))
|
||||
mux.HandleFunc("POST /api/sat/sat-stress/run", h.handleAPISATRun("sat-stress"))
|
||||
@@ -260,6 +267,37 @@ func NewHandler(opts HandlerOptions) http.Handler {
|
||||
return mux
|
||||
}
|
||||
|
||||
func (h *handler) startMetricsCollector() {
|
||||
go func() {
|
||||
ticker := time.NewTicker(1 * time.Second)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
sample := platform.SampleLiveMetrics()
|
||||
h.feedRings(sample)
|
||||
h.setLatestMetric(sample)
|
||||
if h.metricsDB != nil {
|
||||
_ = h.metricsDB.Write(sample)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (h *handler) setLatestMetric(sample platform.LiveMetricSample) {
|
||||
h.latestMu.Lock()
|
||||
defer h.latestMu.Unlock()
|
||||
cp := sample
|
||||
h.latest = &cp
|
||||
}
|
||||
|
||||
func (h *handler) latestMetric() (platform.LiveMetricSample, bool) {
|
||||
h.latestMu.RLock()
|
||||
defer h.latestMu.RUnlock()
|
||||
if h.latest == nil {
|
||||
return platform.LiveMetricSample{}, false
|
||||
}
|
||||
return *h.latest, true
|
||||
}
|
||||
|
||||
// ListenAndServe starts the HTTP server.
|
||||
func ListenAndServe(addr string, opts HandlerOptions) error {
|
||||
return http.ListenAndServe(addr, NewHandler(opts))
|
||||
@@ -387,6 +425,20 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
path := strings.TrimPrefix(r.URL.Path, "/api/metrics/chart/")
|
||||
path = strings.TrimSuffix(path, ".svg")
|
||||
|
||||
if h.metricsDB != nil {
|
||||
if datasets, names, labels, title, yMin, yMax, ok := h.chartDataFromDB(path); ok {
|
||||
buf, err := renderChartSVG(title, datasets, names, labels, yMin, yMax)
|
||||
if err != nil {
|
||||
http.Error(w, err.Error(), http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
w.Header().Set("Content-Type", "image/svg+xml")
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
_, _ = w.Write(buf)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var labels []string
|
||||
@@ -601,6 +653,268 @@ func (h *handler) handleMetricsChartSVG(w http.ResponseWriter, r *http.Request)
|
||||
_, _ = w.Write(buf)
|
||||
}
|
||||
|
||||
func (h *handler) chartDataFromDB(path string) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||
samples, err := h.metricsDB.LoadAll()
|
||||
if err != nil || len(samples) == 0 {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
return chartDataFromSamples(path, samples)
|
||||
}
|
||||
|
||||
func chartDataFromSamples(path string, samples []platform.LiveMetricSample) ([][]float64, []string, []string, string, *float64, *float64, bool) {
|
||||
var datasets [][]float64
|
||||
var names []string
|
||||
var title string
|
||||
var yMin, yMax *float64
|
||||
labels := sampleTimeLabels(samples)
|
||||
|
||||
switch {
|
||||
case path == "server-load":
|
||||
title = "CPU / Memory Load"
|
||||
cpu := make([]float64, len(samples))
|
||||
mem := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
cpu[i] = s.CPULoadPct
|
||||
mem[i] = s.MemLoadPct
|
||||
}
|
||||
datasets = [][]float64{cpu, mem}
|
||||
names = []string{"CPU Load %", "Mem Load %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "server-temp", path == "server-temp-cpu":
|
||||
title = "CPU Temperature"
|
||||
datasets, names = namedTempDatasets(samples, "cpu")
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-temp-gpu":
|
||||
title = "GPU Temperature"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-temp-ambient":
|
||||
title = "Ambient / Other Sensors"
|
||||
datasets, names = namedTempDatasets(samples, "ambient")
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case path == "server-power":
|
||||
title = "System Power"
|
||||
power := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
power[i] = s.PowerW
|
||||
}
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
yMin, yMax = autoBounds120(power)
|
||||
|
||||
case path == "server-fans":
|
||||
title = "Fan RPM"
|
||||
datasets, names = namedFanDatasets(samples)
|
||||
yMin, yMax = autoBounds120(datasets...)
|
||||
|
||||
case path == "gpu-all-load":
|
||||
title = "GPU Compute Load"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "gpu-all-memload":
|
||||
title = "GPU Memory Load"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
|
||||
case path == "gpu-all-power":
|
||||
title = "GPU Power"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
yMin, yMax = autoBounds120(datasets...)
|
||||
|
||||
case path == "gpu-all-temp":
|
||||
title = "GPU Temperature"
|
||||
datasets, names = gpuDatasets(samples, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(datasets...)
|
||||
|
||||
case strings.HasPrefix(path, "gpu/"):
|
||||
rest := strings.TrimPrefix(path, "gpu/")
|
||||
sub := ""
|
||||
if i := strings.LastIndex(rest, "-"); i > 0 {
|
||||
sub = rest[i+1:]
|
||||
rest = rest[:i]
|
||||
}
|
||||
idx := 0
|
||||
fmt.Sscanf(rest, "%d", &idx)
|
||||
switch sub {
|
||||
case "load":
|
||||
title = fmt.Sprintf("GPU %d Load", idx)
|
||||
util := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.UsagePct })
|
||||
mem := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.MemUsagePct })
|
||||
if util == nil && mem == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
datasets = [][]float64{coalesceDataset(util, len(samples)), coalesceDataset(mem, len(samples))}
|
||||
names = []string{"Load %", "Mem %"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = floatPtr(100)
|
||||
case "temp":
|
||||
title = fmt.Sprintf("GPU %d Temperature", idx)
|
||||
temp := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.TempC })
|
||||
if temp == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
datasets = [][]float64{temp}
|
||||
names = []string{"Temp °C"}
|
||||
yMin = floatPtr(0)
|
||||
yMax = autoMax120(temp)
|
||||
default:
|
||||
title = fmt.Sprintf("GPU %d Power", idx)
|
||||
power := gpuDatasetByIndex(samples, idx, func(g platform.GPUMetricRow) float64 { return g.PowerW })
|
||||
if power == nil {
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
datasets = [][]float64{power}
|
||||
names = []string{"Power W"}
|
||||
yMin, yMax = autoBounds120(power)
|
||||
}
|
||||
|
||||
default:
|
||||
return nil, nil, nil, "", nil, nil, false
|
||||
}
|
||||
|
||||
return datasets, names, labels, title, yMin, yMax, len(datasets) > 0
|
||||
}
|
||||
|
||||
func sampleTimeLabels(samples []platform.LiveMetricSample) []string {
|
||||
labels := make([]string, len(samples))
|
||||
if len(samples) == 0 {
|
||||
return labels
|
||||
}
|
||||
sameDay := true
|
||||
first := samples[0].Timestamp.Local()
|
||||
for _, s := range samples {
|
||||
ts := s.Timestamp.Local()
|
||||
if ts.Year() != first.Year() || ts.YearDay() != first.YearDay() {
|
||||
sameDay = false
|
||||
break
|
||||
}
|
||||
}
|
||||
for i, s := range samples {
|
||||
ts := s.Timestamp.Local()
|
||||
if sameDay {
|
||||
labels[i] = ts.Format("15:04")
|
||||
} else {
|
||||
labels[i] = ts.Format("01-02 15:04")
|
||||
}
|
||||
}
|
||||
return labels
|
||||
}
|
||||
|
||||
func namedTempDatasets(samples []platform.LiveMetricSample, group string) ([][]float64, []string) {
|
||||
seen := map[string]bool{}
|
||||
var names []string
|
||||
for _, s := range samples {
|
||||
for _, t := range s.Temps {
|
||||
if t.Group == group && !seen[t.Name] {
|
||||
seen[t.Name] = true
|
||||
names = append(names, t.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets := make([][]float64, 0, len(names))
|
||||
for _, name := range names {
|
||||
ds := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
for _, t := range s.Temps {
|
||||
if t.Group == group && t.Name == name {
|
||||
ds[i] = t.Celsius
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
|
||||
func namedFanDatasets(samples []platform.LiveMetricSample) ([][]float64, []string) {
|
||||
seen := map[string]bool{}
|
||||
var names []string
|
||||
for _, s := range samples {
|
||||
for _, f := range s.Fans {
|
||||
if !seen[f.Name] {
|
||||
seen[f.Name] = true
|
||||
names = append(names, f.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets := make([][]float64, 0, len(names))
|
||||
for _, name := range names {
|
||||
ds := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
for _, f := range s.Fans {
|
||||
if f.Name == name {
|
||||
ds[i] = f.RPM
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
|
||||
func gpuDatasets(samples []platform.LiveMetricSample, pick func(platform.GPUMetricRow) float64) ([][]float64, []string) {
|
||||
seen := map[int]bool{}
|
||||
var indices []int
|
||||
for _, s := range samples {
|
||||
for _, g := range s.GPUs {
|
||||
if !seen[g.GPUIndex] {
|
||||
seen[g.GPUIndex] = true
|
||||
indices = append(indices, g.GPUIndex)
|
||||
}
|
||||
}
|
||||
}
|
||||
datasets := make([][]float64, 0, len(indices))
|
||||
names := make([]string, 0, len(indices))
|
||||
for _, idx := range indices {
|
||||
ds := gpuDatasetByIndex(samples, idx, pick)
|
||||
if ds == nil {
|
||||
continue
|
||||
}
|
||||
datasets = append(datasets, ds)
|
||||
names = append(names, fmt.Sprintf("GPU %d", idx))
|
||||
}
|
||||
return datasets, names
|
||||
}
|
||||
|
||||
func gpuDatasetByIndex(samples []platform.LiveMetricSample, idx int, pick func(platform.GPUMetricRow) float64) []float64 {
|
||||
found := false
|
||||
ds := make([]float64, len(samples))
|
||||
for i, s := range samples {
|
||||
for _, g := range s.GPUs {
|
||||
if g.GPUIndex == idx {
|
||||
ds[i] = pick(g)
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
return nil
|
||||
}
|
||||
return ds
|
||||
}
|
||||
|
||||
func coalesceDataset(ds []float64, n int) []float64 {
|
||||
if ds != nil {
|
||||
return ds
|
||||
}
|
||||
return make([]float64, n)
|
||||
}
|
||||
|
||||
// floatPtr returns a pointer to a float64 value.
|
||||
func floatPtr(v float64) *float64 { return &v }
|
||||
|
||||
@@ -621,6 +935,47 @@ func autoMax120(datasets ...[]float64) *float64 {
|
||||
return &v
|
||||
}
|
||||
|
||||
func autoBounds120(datasets ...[]float64) (*float64, *float64) {
|
||||
min := 0.0
|
||||
max := 0.0
|
||||
first := true
|
||||
for _, ds := range datasets {
|
||||
for _, v := range ds {
|
||||
if first {
|
||||
min, max = v, v
|
||||
first = false
|
||||
continue
|
||||
}
|
||||
if v < min {
|
||||
min = v
|
||||
}
|
||||
if v > max {
|
||||
max = v
|
||||
}
|
||||
}
|
||||
}
|
||||
if first {
|
||||
return nil, nil
|
||||
}
|
||||
if max <= 0 {
|
||||
return floatPtr(0), nil
|
||||
}
|
||||
span := max - min
|
||||
if span <= 0 {
|
||||
span = max * 0.1
|
||||
if span <= 0 {
|
||||
span = 1
|
||||
}
|
||||
}
|
||||
pad := span * 0.2
|
||||
low := min - pad
|
||||
if low < 0 {
|
||||
low = 0
|
||||
}
|
||||
high := max + pad
|
||||
return floatPtr(low), floatPtr(high)
|
||||
}
|
||||
|
||||
// renderChartSVG renders a line chart SVG with a fixed Y-axis range.
|
||||
func renderChartSVG(title string, datasets [][]float64, names []string, labels []string, yMin, yMax *float64) ([]byte, error) {
|
||||
n := len(labels)
|
||||
|
||||
@@ -7,6 +7,9 @@ import (
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"bee/audit/internal/platform"
|
||||
)
|
||||
|
||||
func TestChartLegendNumber(t *testing.T) {
|
||||
@@ -31,6 +34,61 @@ func TestChartLegendNumber(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestChartDataFromSamplesUsesFullHistory(t *testing.T) {
|
||||
samples := []platform.LiveMetricSample{
|
||||
{
|
||||
Timestamp: time.Now().Add(-3 * time.Minute),
|
||||
CPULoadPct: 10,
|
||||
MemLoadPct: 20,
|
||||
PowerW: 300,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 90, MemUsagePct: 5, PowerW: 120, TempC: 50},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-2 * time.Minute),
|
||||
CPULoadPct: 30,
|
||||
MemLoadPct: 40,
|
||||
PowerW: 320,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 95, MemUsagePct: 7, PowerW: 125, TempC: 51},
|
||||
},
|
||||
},
|
||||
{
|
||||
Timestamp: time.Now().Add(-1 * time.Minute),
|
||||
CPULoadPct: 50,
|
||||
MemLoadPct: 60,
|
||||
PowerW: 340,
|
||||
GPUs: []platform.GPUMetricRow{
|
||||
{GPUIndex: 0, UsagePct: 97, MemUsagePct: 9, PowerW: 130, TempC: 52},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
datasets, names, labels, title, _, _, ok := chartDataFromSamples("gpu-all-power", samples)
|
||||
if !ok {
|
||||
t.Fatal("chartDataFromSamples returned ok=false")
|
||||
}
|
||||
if title != "GPU Power" {
|
||||
t.Fatalf("title=%q", title)
|
||||
}
|
||||
if len(names) != 1 || names[0] != "GPU 0" {
|
||||
t.Fatalf("names=%v", names)
|
||||
}
|
||||
if len(labels) != len(samples) {
|
||||
t.Fatalf("labels len=%d want %d", len(labels), len(samples))
|
||||
}
|
||||
if len(datasets) != 1 || len(datasets[0]) != len(samples) {
|
||||
t.Fatalf("datasets shape=%v", datasets)
|
||||
}
|
||||
if got := datasets[0][0]; got != 120 {
|
||||
t.Fatalf("datasets[0][0]=%v want 120", got)
|
||||
}
|
||||
if got := datasets[0][2]; got != 130 {
|
||||
t.Fatalf("datasets[0][2]=%v want 130", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootRendersDashboard(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "audit.json")
|
||||
|
||||
@@ -30,6 +30,8 @@ var taskNames = map[string]string{
|
||||
"storage": "Storage SAT",
|
||||
"cpu": "CPU SAT",
|
||||
"amd": "AMD GPU SAT",
|
||||
"amd-mem": "AMD GPU MEM Integrity",
|
||||
"amd-bandwidth": "AMD GPU MEM Bandwidth",
|
||||
"amd-stress": "AMD GPU Burn-in",
|
||||
"memory-stress": "Memory Burn-in",
|
||||
"sat-stress": "SAT Stress (stressapptest)",
|
||||
@@ -124,6 +126,12 @@ var (
|
||||
runAMDAcceptancePackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDAcceptancePackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runAMDMemIntegrityPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDMemIntegrityPackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runAMDMemBandwidthPackCtx = func(a *app.App, ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDMemBandwidthPackCtx(ctx, baseDir, logFunc)
|
||||
}
|
||||
runAMDStressPackCtx = func(a *app.App, ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
return a.RunAMDStressPackCtx(ctx, baseDir, durationSec, logFunc)
|
||||
}
|
||||
@@ -380,6 +388,10 @@ func (q *taskQueue) runTask(t *Task, j *jobState, ctx context.Context) {
|
||||
archive, err = runCPUAcceptancePackCtx(a, ctx, "", dur, j.append)
|
||||
case "amd":
|
||||
archive, err = runAMDAcceptancePackCtx(a, ctx, "", j.append)
|
||||
case "amd-mem":
|
||||
archive, err = runAMDMemIntegrityPackCtx(a, ctx, "", j.append)
|
||||
case "amd-bandwidth":
|
||||
archive, err = runAMDMemBandwidthPackCtx(a, ctx, "", j.append)
|
||||
case "amd-stress":
|
||||
dur := t.params.Duration
|
||||
if t.params.BurnProfile != "" && dur <= 0 {
|
||||
|
||||
Reference in New Issue
Block a user