diff --git a/audit/internal/app/app.go b/audit/internal/app/app.go index e898189..8d46181 100644 --- a/audit/internal/app/app.go +++ b/audit/internal/app/app.go @@ -114,6 +114,8 @@ type satRunner interface { DetectGPUVendor() string ListAMDGPUs() ([]platform.AMDGPUInfo, error) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) + RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) + RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunMemoryStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) RunSATStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) @@ -577,6 +579,22 @@ func (a *App) RunAMDAcceptancePackResult(baseDir string) (ActionResult, error) { return ActionResult{Title: "AMD GPU SAT", Body: satResultBody(path)}, err } +// RunAMDMemIntegrityPackCtx runs the AMD memory integrity pack through the SAT runner, using DefaultSATBaseDir when baseDir is blank. +func (a *App) RunAMDMemIntegrityPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunAMDMemIntegrityPack(ctx, baseDir, logFunc) +} + +// RunAMDMemBandwidthPackCtx runs the AMD memory bandwidth pack through the SAT runner, using DefaultSATBaseDir when baseDir is blank. +func (a *App) RunAMDMemBandwidthPackCtx(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { + if strings.TrimSpace(baseDir) == "" { + baseDir = DefaultSATBaseDir + } + return a.sat.RunAMDMemBandwidthPack(ctx, baseDir, logFunc) +} + func (a *App) RunMemoryStressPack(baseDir string, durationSec int, logFunc func(string)) (string, error) { return a.RunMemoryStressPackCtx(context.Background(), baseDir, durationSec, logFunc) } diff --git a/audit/internal/app/app_test.go b/audit/internal/app/app_test.go index fcd88c4..93d1b7f 100644 --- a/audit/internal/app/app_test.go +++ b/audit/internal/app/app_test.go @@ -181,6 +181,14 @@ func
(f fakeSAT) RunAMDAcceptancePack(_ context.Context, baseDir string, _ func( return "", nil } +func (f fakeSAT) RunAMDMemIntegrityPack(_ context.Context, _ string, _ func(string)) (string, error) { + return "", nil +} + +func (f fakeSAT) RunAMDMemBandwidthPack(_ context.Context, _ string, _ func(string)) (string, error) { + return "", nil +} + func (f fakeSAT) RunAMDStressPack(_ context.Context, _ string, _ int, _ func(string)) (string, error) { return "", nil } diff --git a/audit/internal/platform/sat.go b/audit/internal/platform/sat.go index b7f3d2d..041b402 100644 --- a/audit/internal/platform/sat.go +++ b/audit/internal/platform/sat.go @@ -136,6 +136,60 @@ func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFu }, logFunc) } +// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test. +func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { + if err := ensureAMDRuntimeReady(); err != nil { + return "", err + } + cfgFile := "/tmp/bee-amd-mem.conf" + cfg := `actions: +- name: mem_integrity + device: all + module: mem + parallel: true + duration: 60000 + copy_matrix: false + target_stress: 90 + matrix_size: 8640 +` + // The rvs job below reads cfgFile, so do not proceed if it could not be written. + if err := os.WriteFile(cfgFile, []byte(cfg), 0644); err != nil { + return "", err + } + return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{ + {name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}}, + {name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}}, + {name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}}, + }, logFunc) +} + +// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
+func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) { + if err := ensureAMDRuntimeReady(); err != nil { + return "", err + } + cfgFile := "/tmp/bee-amd-babel.conf" + cfg := `actions: +- name: babel_mem_bw + device: all + module: babel + parallel: true + copy_matrix: true + target_stress: 90 + matrix_size: 134217728 +` + // The rvs job below reads cfgFile, so do not proceed if it could not be written. + if err := os.WriteFile(cfgFile, []byte(cfg), 0644); err != nil { + return "", err + } + return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{ + {name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}}, + {name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}}, + {name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}}, + {name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}}, + }, logFunc) +} + // RunAMDStressPack runs an AMD GPU burn-in pack. // Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern. func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) { @@ -161,7 +215,7 @@ func amdStressRVSConfig(seconds int) string { module: gst parallel: true duration: %d - copy_matrix: true + copy_matrix: false target_stress: 90 matrix_size_a: 8640 matrix_size_b: 8640 diff --git a/audit/internal/platform/sat_test.go b/audit/internal/platform/sat_test.go index fa3fed3..96b1552 100644 --- a/audit/internal/platform/sat_test.go +++ b/audit/internal/platform/sat_test.go @@ -39,15 +39,26 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) { } } -func TestAMDStressConfigEnablesVRAMTraffic(t *testing.T) { +func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) { t.Parallel() cfg := amdStressRVSConfig(123) - if !strings.Contains(cfg, "copy_matrix: true") { - t.Fatalf("config missing VRAM copy path:\n%s", cfg) + if !strings.Contains(cfg, "module: gst") { + t.Fatalf("config missing gst module:\n%s", cfg) } - if !strings.Contains(cfg,
"duration: 123000") { - t.Fatalf("config missing millisecond duration:\n%s", cfg) + if strings.Contains(cfg, "module: mem") { + t.Fatalf("config should not include mem module:\n%s", cfg) + } + if !strings.Contains(cfg, "copy_matrix: false") { + t.Fatalf("config should use copy_matrix=false:\n%s", cfg) + } + if strings.Count(cfg, "duration: 123000") != 1 { + t.Fatalf("config should apply duration once:\n%s", cfg) + } + for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} { + if !strings.Contains(cfg, field) { + t.Fatalf("config missing %s:\n%s", field, cfg) + } } } diff --git a/audit/internal/webui/api.go b/audit/internal/webui/api.go index 7b295c7..a88aabc 100644 --- a/audit/internal/webui/api.go +++ b/audit/internal/webui/api.go @@ -599,10 +599,9 @@ func (h *handler) handleAPIMetricsStream(w http.ResponseWriter, r *http.Request) case <-r.Context().Done(): return case <-ticker.C: - sample := platform.SampleLiveMetrics() - h.feedRings(sample) - if h.metricsDB != nil { - _ = h.metricsDB.Write(sample) + sample, ok := h.latestMetric() + if !ok { + continue } b, err := json.Marshal(sample) if err != nil { diff --git a/audit/internal/webui/metricsdb.go b/audit/internal/webui/metricsdb.go index 90e4b37..704ffb2 100644 --- a/audit/internal/webui/metricsdb.go +++ b/audit/internal/webui/metricsdb.go @@ -3,7 +3,6 @@ package webui import ( "database/sql" "encoding/csv" - "fmt" "io" "strconv" "time" @@ -13,7 +12,6 @@ import ( ) const metricsDBPath = "/appdata/bee/metrics.db" -const metricsKeepDuration = 24 * time.Hour // MetricsDB persists live metric samples to SQLite. type MetricsDB struct { @@ -116,11 +114,18 @@ func (m *MetricsDB) Write(s platform.LiveMetricSample) error { } // LoadRecent returns up to n samples in chronological order (oldest first). -// It reconstructs LiveMetricSample from the normalized tables. 
func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) { - rows, err := m.db.Query( - `SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n, - ) + return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts DESC LIMIT ?`, n) +} + +// LoadAll returns all persisted samples in chronological order (oldest first). +func (m *MetricsDB) LoadAll() ([]platform.LiveMetricSample, error) { + return m.loadSamples(`SELECT ts,cpu_load_pct,mem_load_pct,power_w FROM sys_metrics ORDER BY ts`) +} + +// loadSamples reconstructs LiveMetricSample rows from the normalized tables. +func (m *MetricsDB) loadSamples(query string, args ...any) ([]platform.LiveMetricSample, error) { + rows, err := m.db.Query(query, args...) if err != nil { return nil, err } @@ -257,14 +262,6 @@ func (m *MetricsDB) LoadRecent(n int) ([]platform.LiveMetricSample, error) { return samples, nil } -// Prune deletes samples older than keepDuration. -func (m *MetricsDB) Prune(keepDuration time.Duration) { - cutoff := time.Now().Add(-keepDuration).Unix() - for _, table := range []string{"sys_metrics", "gpu_metrics", "fan_metrics", "temp_metrics"} { - _, _ = m.db.Exec(fmt.Sprintf("DELETE FROM %s WHERE ts < ?", table), cutoff) - } -} - // ExportCSV writes all sys+gpu data as CSV to w. func (m *MetricsDB) ExportCSV(w io.Writer) error { rows, err := m.db.Query(` diff --git a/audit/internal/webui/pages.go b/audit/internal/webui/pages.go index 05c41f2..00378e8 100644 --- a/audit/internal/webui/pages.go +++ b/audit/internal/webui/pages.go @@ -494,7 +494,11 @@ func renderValidate() string { renderSATCard("memory", "Memory", "") + renderSATCard("storage", "Storage", "") + renderSATCard("cpu", "CPU", `
`) + - renderSATCard("amd", "AMD GPU", "") + + renderSATCard("amd", "AMD GPU", `
+ + +
+

Additional AMD memory diagnostics: RVS MEM for integrity and BABEL + rocm-bandwidth-test for memory/interconnect bandwidth.

`) + `