Add health verdicts and acceptance tests
This commit is contained in:
314
iso/builder/bee-gpu-stress.c
Normal file
314
iso/builder/bee-gpu-stress.c
Normal file
@@ -0,0 +1,314 @@
|
||||
#define _POSIX_C_SOURCE 200809L
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
/*
 * Local stand-ins for the CUDA Driver API types used by this program.
 * This file includes no CUDA header; the driver is reached through
 * dlopen()/dlsym(), so only the handful of types passed across that
 * boundary are declared here.
 */

/* Scalar types. */
typedef int CUresult;          /* status code returned by every driver call */
typedef int CUdevice;          /* device ordinal handle */
typedef uint64_t CUdeviceptr;  /* 64-bit device memory address */

/* Opaque handles: only ever passed back to the driver, never dereferenced. */
typedef void *CUcontext;
typedef void *CUmodule;
typedef void *CUfunction;
typedef void *CUstream;

/* The only result code this program compares against. */
#define CU_SUCCESS 0
|
||||
|
||||
/*
 * PTX text of the `burn` kernel, handed to the driver as a NUL-terminated
 * string at module-load time.
 *
 * Kernel parameters:
 *   data   - 64-bit address of an array of 32-bit words
 *   words  - number of 32-bit words in that array
 *   rounds - how many times each thread repeats the multiply-add step
 *
 * Each thread computes index = ctaid.x * ntid.x + tid.x and returns
 * immediately when index >= words. Otherwise it loads data[index], applies
 * x = x * 1664525 + 1013904223 (low 32 bits) `rounds` times, and stores the
 * result back to the same slot.
 */
static const char *ptx_source =
    /* Module header. */
    ".version 6.0\n"
    ".target sm_30\n"
    ".address_size 64\n"
    "\n"
    /* Entry point signature. */
    ".visible .entry burn(\n"
    " .param .u64 data,\n"
    " .param .u32 words,\n"
    " .param .u32 rounds\n"
    ")\n"
    "{\n"
    /* Register declarations. */
    " .reg .pred %p<2>;\n"
    " .reg .b32 %r<8>;\n"
    " .reg .b64 %rd<5>;\n"
    "\n"
    /* Load the parameters and derive the global thread index into %r0. */
    " ld.param.u64 %rd1, [data];\n"
    " ld.param.u32 %r1, [words];\n"
    " ld.param.u32 %r2, [rounds];\n"
    " mov.u32 %r3, %ctaid.x;\n"
    " mov.u32 %r4, %ntid.x;\n"
    " mov.u32 %r5, %tid.x;\n"
    " mad.lo.s32 %r0, %r3, %r4, %r5;\n"
    /* Threads past the end of the buffer do nothing. */
    " setp.ge.u32 %p0, %r0, %r1;\n"
    " @%p0 bra DONE;\n"
    /* Address of this thread's word, then load it into %r6. */
    " mul.wide.u32 %rd2, %r0, 4;\n"
    " add.s64 %rd3, %rd1, %rd2;\n"
    " ld.global.u32 %r6, [%rd3];\n"
    /* Repeat the multiply-add until the round counter reaches zero. */
    "LOOP:\n"
    " setp.eq.u32 %p1, %r2, 0;\n"
    " @%p1 bra STORE;\n"
    " mad.lo.u32 %r6, %r6, 1664525, 1013904223;\n"
    " sub.u32 %r2, %r2, 1;\n"
    " bra LOOP;\n"
    /* Write the final value back to the same slot. */
    "STORE:\n"
    " st.global.u32 [%rd3], %r6;\n"
    "DONE:\n"
    " ret;\n"
    "}\n";
|
||||
|
||||
typedef CUresult (*cuInit_fn)(unsigned int);
|
||||
typedef CUresult (*cuDeviceGetCount_fn)(int *);
|
||||
typedef CUresult (*cuDeviceGet_fn)(CUdevice *, int);
|
||||
typedef CUresult (*cuDeviceGetName_fn)(char *, int, CUdevice);
|
||||
typedef CUresult (*cuCtxCreate_fn)(CUcontext *, unsigned int, CUdevice);
|
||||
typedef CUresult (*cuCtxDestroy_fn)(CUcontext);
|
||||
typedef CUresult (*cuCtxSynchronize_fn)(void);
|
||||
typedef CUresult (*cuMemAlloc_fn)(CUdeviceptr *, size_t);
|
||||
typedef CUresult (*cuMemFree_fn)(CUdeviceptr);
|
||||
typedef CUresult (*cuMemcpyHtoD_fn)(CUdeviceptr, const void *, size_t);
|
||||
typedef CUresult (*cuMemcpyDtoH_fn)(void *, CUdeviceptr, size_t);
|
||||
typedef CUresult (*cuModuleLoadDataEx_fn)(CUmodule *, const void *, unsigned int, void *, void *);
|
||||
typedef CUresult (*cuModuleGetFunction_fn)(CUfunction *, CUmodule, const char *);
|
||||
typedef CUresult (*cuLaunchKernel_fn)(CUfunction,
|
||||
unsigned int,
|
||||
unsigned int,
|
||||
unsigned int,
|
||||
unsigned int,
|
||||
unsigned int,
|
||||
unsigned int,
|
||||
unsigned int,
|
||||
CUstream,
|
||||
void **,
|
||||
void **);
|
||||
typedef CUresult (*cuGetErrorName_fn)(CUresult, const char **);
|
||||
typedef CUresult (*cuGetErrorString_fn)(CUresult, const char **);
|
||||
|
||||
struct cuda_api {
|
||||
void *lib;
|
||||
cuInit_fn cuInit;
|
||||
cuDeviceGetCount_fn cuDeviceGetCount;
|
||||
cuDeviceGet_fn cuDeviceGet;
|
||||
cuDeviceGetName_fn cuDeviceGetName;
|
||||
cuCtxCreate_fn cuCtxCreate;
|
||||
cuCtxDestroy_fn cuCtxDestroy;
|
||||
cuCtxSynchronize_fn cuCtxSynchronize;
|
||||
cuMemAlloc_fn cuMemAlloc;
|
||||
cuMemFree_fn cuMemFree;
|
||||
cuMemcpyHtoD_fn cuMemcpyHtoD;
|
||||
cuMemcpyDtoH_fn cuMemcpyDtoH;
|
||||
cuModuleLoadDataEx_fn cuModuleLoadDataEx;
|
||||
cuModuleGetFunction_fn cuModuleGetFunction;
|
||||
cuLaunchKernel_fn cuLaunchKernel;
|
||||
cuGetErrorName_fn cuGetErrorName;
|
||||
cuGetErrorString_fn cuGetErrorString;
|
||||
};
|
||||
|
||||
/*
 * Resolve `name` in the library handle `lib` and store the address in `*out`.
 *
 * `*out` is always written: it receives NULL when the lookup fails.
 * Returns 1 when the symbol was found, 0 otherwise.
 */
static int load_symbol(void *lib, const char *name, void **out) {
    void *address = dlsym(lib, name);

    *out = address;
    return address != NULL ? 1 : 0;
}
|
||||
|
||||
static int load_cuda(struct cuda_api *api) {
|
||||
memset(api, 0, sizeof(*api));
|
||||
api->lib = dlopen("libcuda.so.1", RTLD_NOW | RTLD_LOCAL);
|
||||
if (!api->lib) {
|
||||
return 0;
|
||||
}
|
||||
return
|
||||
load_symbol(api->lib, "cuInit", (void **)&api->cuInit) &&
|
||||
load_symbol(api->lib, "cuDeviceGetCount", (void **)&api->cuDeviceGetCount) &&
|
||||
load_symbol(api->lib, "cuDeviceGet", (void **)&api->cuDeviceGet) &&
|
||||
load_symbol(api->lib, "cuDeviceGetName", (void **)&api->cuDeviceGetName) &&
|
||||
load_symbol(api->lib, "cuCtxCreate_v2", (void **)&api->cuCtxCreate) &&
|
||||
load_symbol(api->lib, "cuCtxDestroy_v2", (void **)&api->cuCtxDestroy) &&
|
||||
load_symbol(api->lib, "cuCtxSynchronize", (void **)&api->cuCtxSynchronize) &&
|
||||
load_symbol(api->lib, "cuMemAlloc_v2", (void **)&api->cuMemAlloc) &&
|
||||
load_symbol(api->lib, "cuMemFree_v2", (void **)&api->cuMemFree) &&
|
||||
load_symbol(api->lib, "cuMemcpyHtoD_v2", (void **)&api->cuMemcpyHtoD) &&
|
||||
load_symbol(api->lib, "cuMemcpyDtoH_v2", (void **)&api->cuMemcpyDtoH) &&
|
||||
load_symbol(api->lib, "cuModuleLoadDataEx", (void **)&api->cuModuleLoadDataEx) &&
|
||||
load_symbol(api->lib, "cuModuleGetFunction", (void **)&api->cuModuleGetFunction) &&
|
||||
load_symbol(api->lib, "cuLaunchKernel", (void **)&api->cuLaunchKernel);
|
||||
}
|
||||
|
||||
/*
 * Return the driver's symbolic name for result code `rc`.
 *
 * Falls back to the fixed text "CUDA_ERROR" when the cuGetErrorName pointer
 * in `api` is NULL, when the lookup itself fails, or when it yields a NULL
 * string. The returned pointer is never NULL.
 */
static const char *cu_error_name(struct cuda_api *api, CUresult rc) {
    static const char fallback[] = "CUDA_ERROR";
    const char *text = NULL;

    if (!api->cuGetErrorName) {
        return fallback;
    }
    if (api->cuGetErrorName(rc, &text) != CU_SUCCESS) {
        return fallback;
    }
    return text ? text : fallback;
}
|
||||
|
||||
/*
 * Return the driver's human-readable description of result code `rc`.
 *
 * Falls back to the fixed text "unknown" when the cuGetErrorString pointer
 * in `api` is NULL, when the lookup itself fails, or when it yields a NULL
 * string. The returned pointer is never NULL.
 */
static const char *cu_error_string(struct cuda_api *api, CUresult rc) {
    static const char fallback[] = "unknown";
    const char *text = NULL;

    if (!api->cuGetErrorString) {
        return fallback;
    }
    if (api->cuGetErrorString(rc, &text) != CU_SUCCESS) {
        return fallback;
    }
    return text ? text : fallback;
}
|
||||
|
||||
/*
 * Turn a driver result code into a boolean and report failures.
 *
 * `step` is the label printed in front of the error (the callers pass the
 * name of the driver call). Returns 1 when `rc` is CU_SUCCESS; otherwise
 * writes "<step> failed: <name> (<description>)" to stderr and returns 0.
 */
static int check_rc(struct cuda_api *api, const char *step, CUresult rc) {
    if (rc != CU_SUCCESS) {
        const char *name = cu_error_name(api, rc);
        const char *detail = cu_error_string(api, rc);

        fprintf(stderr, "%s failed: %s (%s)\n", step, name, detail);
        return 0;
    }
    return 1;
}
|
||||
|
||||
/*
 * Read CLOCK_MONOTONIC and return it as seconds in a double (whole seconds
 * plus the nanosecond part as a fraction). Only differences between two
 * readings are meaningful. The clock_gettime() result is not checked.
 */
static double now_seconds(void) {
    struct timespec now;

    clock_gettime(CLOCK_MONOTONIC, &now);

    double whole = (double)now.tv_sec;
    double fraction = (double)now.tv_nsec / 1000000000.0;
    return whole + fraction;
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int seconds = 5;
|
||||
int size_mb = 64;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
if ((strcmp(argv[i], "--seconds") == 0 || strcmp(argv[i], "-t") == 0) && i + 1 < argc) {
|
||||
seconds = atoi(argv[++i]);
|
||||
} else if ((strcmp(argv[i], "--size-mb") == 0 || strcmp(argv[i], "-m") == 0) && i + 1 < argc) {
|
||||
size_mb = atoi(argv[++i]);
|
||||
} else {
|
||||
fprintf(stderr, "usage: %s [--seconds N] [--size-mb N]\n", argv[0]);
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
if (seconds <= 0) {
|
||||
seconds = 5;
|
||||
}
|
||||
if (size_mb <= 0) {
|
||||
size_mb = 64;
|
||||
}
|
||||
|
||||
struct cuda_api api;
|
||||
if (!load_cuda(&api)) {
|
||||
fprintf(stderr, "failed to load libcuda.so.1 or required Driver API symbols\n");
|
||||
return 1;
|
||||
}
|
||||
load_symbol(api.lib, "cuGetErrorName", (void **)&api.cuGetErrorName);
|
||||
load_symbol(api.lib, "cuGetErrorString", (void **)&api.cuGetErrorString);
|
||||
|
||||
if (!check_rc(&api, "cuInit", api.cuInit(0))) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int count = 0;
|
||||
if (!check_rc(&api, "cuDeviceGetCount", api.cuDeviceGetCount(&count))) {
|
||||
return 1;
|
||||
}
|
||||
if (count <= 0) {
|
||||
fprintf(stderr, "no CUDA devices found\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
CUdevice dev = 0;
|
||||
if (!check_rc(&api, "cuDeviceGet", api.cuDeviceGet(&dev, 0))) {
|
||||
return 1;
|
||||
}
|
||||
char name[128] = {0};
|
||||
if (!check_rc(&api, "cuDeviceGetName", api.cuDeviceGetName(name, (int)sizeof(name), dev))) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
CUcontext ctx = NULL;
|
||||
if (!check_rc(&api, "cuCtxCreate", api.cuCtxCreate(&ctx, 0, dev))) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
size_t bytes = (size_t)size_mb * 1024 * 1024;
|
||||
uint32_t words = (uint32_t)(bytes / sizeof(uint32_t));
|
||||
if (words < 1024) {
|
||||
words = 1024;
|
||||
bytes = (size_t)words * sizeof(uint32_t);
|
||||
}
|
||||
|
||||
uint32_t *host = (uint32_t *)malloc(bytes);
|
||||
if (!host) {
|
||||
fprintf(stderr, "malloc failed\n");
|
||||
api.cuCtxDestroy(ctx);
|
||||
return 1;
|
||||
}
|
||||
for (uint32_t i = 0; i < words; i++) {
|
||||
host[i] = i ^ 0x12345678u;
|
||||
}
|
||||
|
||||
CUdeviceptr device_mem = 0;
|
||||
if (!check_rc(&api, "cuMemAlloc", api.cuMemAlloc(&device_mem, bytes))) {
|
||||
free(host);
|
||||
api.cuCtxDestroy(ctx);
|
||||
return 1;
|
||||
}
|
||||
if (!check_rc(&api, "cuMemcpyHtoD", api.cuMemcpyHtoD(device_mem, host, bytes))) {
|
||||
api.cuMemFree(device_mem);
|
||||
free(host);
|
||||
api.cuCtxDestroy(ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
CUmodule module = NULL;
|
||||
if (!check_rc(&api, "cuModuleLoadDataEx", api.cuModuleLoadDataEx(&module, ptx_source, 0, NULL, NULL))) {
|
||||
api.cuMemFree(device_mem);
|
||||
free(host);
|
||||
api.cuCtxDestroy(ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
CUfunction kernel = NULL;
|
||||
if (!check_rc(&api, "cuModuleGetFunction", api.cuModuleGetFunction(&kernel, module, "burn"))) {
|
||||
api.cuMemFree(device_mem);
|
||||
free(host);
|
||||
api.cuCtxDestroy(ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
unsigned int threads = 256;
|
||||
unsigned int blocks = (words + threads - 1) / threads;
|
||||
uint32_t rounds = 256;
|
||||
void *params[] = {&device_mem, &words, &rounds};
|
||||
|
||||
double start = now_seconds();
|
||||
double deadline = start + (double)seconds;
|
||||
unsigned long iterations = 0;
|
||||
while (now_seconds() < deadline) {
|
||||
if (!check_rc(&api, "cuLaunchKernel",
|
||||
api.cuLaunchKernel(kernel, blocks, 1, 1, threads, 1, 1, 0, NULL, params, NULL))) {
|
||||
api.cuMemFree(device_mem);
|
||||
free(host);
|
||||
api.cuCtxDestroy(ctx);
|
||||
return 1;
|
||||
}
|
||||
iterations++;
|
||||
}
|
||||
|
||||
if (!check_rc(&api, "cuCtxSynchronize", api.cuCtxSynchronize())) {
|
||||
api.cuMemFree(device_mem);
|
||||
free(host);
|
||||
api.cuCtxDestroy(ctx);
|
||||
return 1;
|
||||
}
|
||||
if (!check_rc(&api, "cuMemcpyDtoH", api.cuMemcpyDtoH(host, device_mem, bytes))) {
|
||||
api.cuMemFree(device_mem);
|
||||
free(host);
|
||||
api.cuCtxDestroy(ctx);
|
||||
return 1;
|
||||
}
|
||||
|
||||
uint64_t checksum = 0;
|
||||
for (uint32_t i = 0; i < words; i += words / 256 ? words / 256 : 1) {
|
||||
checksum += host[i];
|
||||
}
|
||||
|
||||
double elapsed = now_seconds() - start;
|
||||
printf("device=%s\n", name);
|
||||
printf("duration_s=%.2f\n", elapsed);
|
||||
printf("buffer_mb=%d\n", size_mb);
|
||||
printf("iterations=%lu\n", iterations);
|
||||
printf("checksum=%llu\n", (unsigned long long)checksum);
|
||||
printf("status=OK\n");
|
||||
|
||||
api.cuMemFree(device_mem);
|
||||
free(host);
|
||||
api.cuCtxDestroy(ctx);
|
||||
return 0;
|
||||
}
|
||||
@@ -41,6 +41,7 @@ echo ""
|
||||
|
||||
# --- compile bee binary (static, Linux amd64) ---
|
||||
BEE_BIN="${DIST_DIR}/bee-linux-amd64"
|
||||
GPU_STRESS_BIN="${DIST_DIR}/bee-gpu-stress-linux-amd64"
|
||||
NEED_BUILD=1
|
||||
if [ -f "$BEE_BIN" ]; then
|
||||
NEWEST_SRC=$(find "${REPO_ROOT}/audit" -name '*.go' -newer "$BEE_BIN" | head -1)
|
||||
@@ -70,6 +71,22 @@ else
|
||||
echo "=== bee binary up to date, skipping build ==="
|
||||
fi
|
||||
|
||||
# --- compile bee-gpu-stress (dlopen()s libcuda at run time, so only -ldl is linked) ---
# Rebuild unless the binary already exists and the C source is older than it.
GPU_STRESS_NEED_BUILD=1
if [ -f "$GPU_STRESS_BIN" ] && [ "${BUILDER_DIR}/bee-gpu-stress.c" -ot "$GPU_STRESS_BIN" ]; then
    GPU_STRESS_NEED_BUILD=0
fi

if [ "$GPU_STRESS_NEED_BUILD" != "1" ]; then
    echo "=== bee-gpu-stress up to date, skipping build ==="
else
    echo "=== building bee-gpu-stress ==="
    gcc -O2 -s -Wall -Wextra -o "$GPU_STRESS_BIN" "${BUILDER_DIR}/bee-gpu-stress.c" -ldl
    echo "binary: $GPU_STRESS_BIN"
fi
|
||||
|
||||
echo "=== preparing staged overlay ==="
|
||||
rm -rf "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||
mkdir -p "${BUILD_WORK_DIR}" "${OVERLAY_STAGE_DIR}"
|
||||
@@ -80,6 +97,7 @@ rm -f \
|
||||
"${OVERLAY_STAGE_DIR}/etc/bee-release" \
|
||||
"${OVERLAY_STAGE_DIR}/root/.ssh/authorized_keys" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress" \
|
||||
"${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||
|
||||
# --- inject authorized_keys for SSH access ---
|
||||
@@ -119,13 +137,15 @@ fi
|
||||
mkdir -p "${OVERLAY_STAGE_DIR}/usr/local/bin"
|
||||
cp "${DIST_DIR}/bee-linux-amd64" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee"
|
||||
cp "${GPU_STRESS_BIN}" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-gpu-stress"
|
||||
|
||||
# --- inject smoketest into overlay so it runs directly on the live CD ---
|
||||
cp "${BUILDER_DIR}/smoketest.sh" "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/bee-smoketest"
|
||||
|
||||
# --- vendor utilities (optional pre-fetched binaries) ---
|
||||
for tool in storcli64 sas2ircu sas3ircu mstflint; do
|
||||
for tool in storcli64 sas2ircu sas3ircu arcconf ssacli; do
|
||||
if [ -f "${VENDOR_DIR}/${tool}" ]; then
|
||||
cp "${VENDOR_DIR}/${tool}" "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}"
|
||||
chmod +x "${OVERLAY_STAGE_DIR}/usr/local/bin/${tool}" || true
|
||||
|
||||
@@ -11,6 +11,7 @@ lshw
|
||||
iproute2
|
||||
isc-dhcp-client
|
||||
iputils-ping
|
||||
ethtool
|
||||
qemu-guest-agent
|
||||
|
||||
# SSH
|
||||
@@ -27,6 +28,8 @@ mc
|
||||
htop
|
||||
sudo
|
||||
zstd
|
||||
mstflint
|
||||
memtester
|
||||
|
||||
# QR codes (for displaying audit results)
|
||||
qrencode
|
||||
|
||||
Reference in New Issue
Block a user