#include <stdio.h>
constexpr int THREADS_PER_BLOCK = 256;
constexpr int N = 8;
constexpr int VECTOR_SIZE = 4;
constexpr int REDUCE_SIZE = 8;
__global__ void test_kernel(int *x, int *y, int alpha) {
const int g_tid = threadIdx.x + blockDim.x * blockIdx.x;
const int g_bid = blockIdx.x;
const int tid_in_block = g_tid % blockDim.x;
__shared__ int SH[THREADS_PER_BLOCK];
constexpr int VECTOR_NUM = THREADS_PER_BLOCK / VECTOR_SIZE;
const int g_vector_id = g_tid / VECTOR_SIZE;
const int tid_in_vector = g_tid % VECTOR_SIZE;
const int vec_id_in_block = tid_in_block / VECTOR_SIZE;
__shared__ int lds_y[VECTOR_NUM];
int K = 0;
for (int i = 0; i < N; i++) {
const int index = i * THREADS_PER_BLOCK + g_tid;
SH[tid_in_block] = x[index];
__syncthreads();
if (vec_id_in_block < THREADS_PER_BLOCK / REDUCE_SIZE) {
int sum = 0;
for (int j = 0; j < REDUCE_SIZE / VECTOR_SIZE; j++) {
const int lds_index = vec_id_in_block * REDUCE_SIZE + tid_in_vector + j * VECTOR_SIZE;
sum += SH[lds_index];
}
for (int j = VECTOR_SIZE >> 1; j > 0; j >>= 1) {
sum += __shfl_down(sum, j, VECTOR_SIZE);
}
if (tid_in_vector == 0) {
lds_y[vec_id_in_block] = sum;
}
__syncthreads();
if (tid_in_block < THREADS_PER_BLOCK / REDUCE_SIZE) {
const int local_sum = lds_y[tid_in_block];
y[K + tid_in_block] = alpha * local_sum;
}
}
K += THREADS_PER_BLOCK / REDUCE_SIZE;
}
}
int main() {
constexpr int DATA_SIZE = THREADS_PER_BLOCK * N;
int *hx = new int[DATA_SIZE];
int *hy = new int[DATA_SIZE/REDUCE_SIZE];
for (int i = 0; i < DATA_SIZE; i++) {
hx[i] = i;
}
int *x = nullptr;
int *y = nullptr;
cudaMalloc(&x, DATA_SIZE * sizeof(int));
cudaMalloc(&y, DATA_SIZE / REDUCE_SIZE * sizeof(int));
cudaMemcpy(x, hx, DATA_SIZE * sizeof(int), cudaMemcpyHostToDevice);
test_kernel<<<1, THREADS_PER_BLOCK>>>(x, y, 1);
cudaDeviceSynchronize();
cudaMemcpy(hy, y, DATA_SIZE / REDUCE_SIZE * sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < DATA_SIZE / REDUCE_SIZE; i++) {
int R = REDUCE_SIZE;
printf("%d\n", hy[i] == (R * (2 * R * i + R - 1) / 2));
}
}