feat(metrics): persist history in sqlite and add AMD memory validate tests
This commit is contained in:
@@ -136,6 +136,54 @@ func (s *System) RunAMDAcceptancePack(ctx context.Context, baseDir string, logFu
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDMemIntegrityPack runs the official RVS MEM module as a validate-style memory integrity test.
|
||||
func (s *System) RunAMDMemIntegrityPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
cfgFile := "/tmp/bee-amd-mem.conf"
|
||||
cfg := `actions:
|
||||
- name: mem_integrity
|
||||
device: all
|
||||
module: mem
|
||||
parallel: true
|
||||
duration: 60000
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size: 8640
|
||||
`
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-mem", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rvs-mem.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: "03-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDMemBandwidthPack runs AMD's memory/interconnect bandwidth-oriented tools.
|
||||
func (s *System) RunAMDMemBandwidthPack(ctx context.Context, baseDir string, logFunc func(string)) (string, error) {
|
||||
if err := ensureAMDRuntimeReady(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
cfgFile := "/tmp/bee-amd-babel.conf"
|
||||
cfg := `actions:
|
||||
- name: babel_mem_bw
|
||||
device: all
|
||||
module: babel
|
||||
parallel: true
|
||||
copy_matrix: true
|
||||
target_stress: 90
|
||||
matrix_size: 134217728
|
||||
`
|
||||
_ = os.WriteFile(cfgFile, []byte(cfg), 0644)
|
||||
return runAcceptancePackCtx(ctx, baseDir, "gpu-amd-bandwidth", []satJob{
|
||||
{name: "01-rocm-smi.log", cmd: []string{"rocm-smi"}},
|
||||
{name: "02-rocm-bandwidth-test.log", cmd: []string{"rocm-bandwidth-test"}},
|
||||
{name: "03-rvs-babel.log", cmd: []string{"rvs", "-c", cfgFile}},
|
||||
{name: "04-rocm-smi-after.log", cmd: []string{"rocm-smi", "--showtemp", "--showpower", "--showmemuse", "--csv"}},
|
||||
}, logFunc)
|
||||
}
|
||||
|
||||
// RunAMDStressPack runs an AMD GPU burn-in pack.
|
||||
// Missing tools are reported as UNSUPPORTED, consistent with the existing SAT pattern.
|
||||
func (s *System) RunAMDStressPack(ctx context.Context, baseDir string, durationSec int, logFunc func(string)) (string, error) {
|
||||
@@ -161,7 +209,7 @@ func amdStressRVSConfig(seconds int) string {
|
||||
module: gst
|
||||
parallel: true
|
||||
duration: %d
|
||||
copy_matrix: true
|
||||
copy_matrix: false
|
||||
target_stress: 90
|
||||
matrix_size_a: 8640
|
||||
matrix_size_b: 8640
|
||||
|
||||
@@ -39,15 +39,26 @@ func TestRunNvidiaAcceptancePackIncludesGPUStress(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAMDStressConfigEnablesVRAMTraffic(t *testing.T) {
|
||||
func TestAMDStressConfigUsesSingleGSTAction(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
cfg := amdStressRVSConfig(123)
|
||||
if !strings.Contains(cfg, "copy_matrix: true") {
|
||||
t.Fatalf("config missing VRAM copy path:\n%s", cfg)
|
||||
if !strings.Contains(cfg, "module: gst") {
|
||||
t.Fatalf("config missing gst module:\n%s", cfg)
|
||||
}
|
||||
if !strings.Contains(cfg, "duration: 123000") {
|
||||
t.Fatalf("config missing millisecond duration:\n%s", cfg)
|
||||
if strings.Contains(cfg, "module: mem") {
|
||||
t.Fatalf("config should not include mem module:\n%s", cfg)
|
||||
}
|
||||
if !strings.Contains(cfg, "copy_matrix: false") {
|
||||
t.Fatalf("config should use copy_matrix=false:\n%s", cfg)
|
||||
}
|
||||
if strings.Count(cfg, "duration: 123000") != 1 {
|
||||
t.Fatalf("config should apply duration once:\n%s", cfg)
|
||||
}
|
||||
for _, field := range []string{"matrix_size_a: 8640", "matrix_size_b: 8640", "matrix_size_c: 8640"} {
|
||||
if !strings.Contains(cfg, field) {
|
||||
t.Fatalf("config missing %s:\n%s", field, cfg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user