/* sum-sync.cu
 *
 * Sums the integers START..END (inclusive) on the GPU using a single
 * block of THREADS threads and a shared-memory tree reduction.
 */
#include <stdio.h>
#include <stdlib.h>

#define THREADS 256
#define START 0
#define END 1000

/* Abort with a message on any CUDA runtime error so failures are not
 * silently ignored (kernel/API errors otherwise surface much later). */
#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(err_));                 \
            exit(EXIT_FAILURE);                                          \
        }                                                                \
    } while (0)

/* GPU kernel: each thread sums one contiguous chunk of [START, END],
 * then the block reduces the per-thread partials in shared memory.
 * Expects a launch of exactly one block with THREADS threads;
 * writes the final sum to *result. */
__global__ void sum(int* result) {
    /* one partial sum per thread, shared by all threads in the block */
    __shared__ int partials[THREADS];

    /* split the range into equal chunks; the last thread also takes
     * whatever remainder is left so the whole range is covered.
     * NOTE: the chunk start is offset by START (the old code assumed
     * START == 0 and would have dropped the offset if it changed). */
    int chunk = (END - START) / THREADS;
    int start = START + chunk * threadIdx.x;
    int end = start + chunk - 1;
    if (threadIdx.x == (THREADS - 1)) {
        end = END;
    }

    /* accumulate this thread's chunk in a register, then store once */
    int total = 0;
    for (int i = start; i <= end; i++) {
        total += i;
    }
    partials[threadIdx.x] = total;

    /* every partial must be written before any thread reads a
     * neighbor's slot — __syncthreads() is the block-wide barrier */
    __syncthreads();

    /* Tree reduction that is also correct when the active count is odd:
     * each pass folds the top half (rounded up) onto the bottom half,
     * leaving the middle element of an odd count untouched for the next
     * pass.  This replaces the old odd-one-out special case, in which
     * thread 0 read partials[i-1] while thread i-1 was concurrently
     * updating it in the same pass with no barrier in between (a latent
     * data race; dead code at THREADS == 256 but wrong in general). */
    for (int active = THREADS; active > 1; ) {
        int half = (active + 1) / 2;  /* round up for odd counts */
        if (threadIdx.x < active / 2) {
            partials[threadIdx.x] += partials[threadIdx.x + half];
        }
        /* barrier sits outside the if, so every thread reaches it */
        __syncthreads();
        active = half;
    }

    /* slot 0 now holds the full sum; one thread publishes it */
    if (threadIdx.x == 0) {
        *result = partials[0];
    }
}

/* the main function begins running on the CPU */
int main() {
    int result;       /* host copy of the final sum */
    int* gpu_result;  /* device cell the kernel writes its answer into */

    /* allocate space on the GPU for the resulting sum */
    CUDA_CHECK(cudaMalloc((void**) &gpu_result, sizeof(int)));

    /* one block of THREADS threads computes the whole sum */
    sum<<<1, THREADS>>>(gpu_result);
    CUDA_CHECK(cudaGetLastError());  /* catch bad launch configurations */

    /* blocking device-to-host copy; also waits for the kernel to finish */
    CUDA_CHECK(cudaMemcpy(&result, gpu_result, sizeof(int),
                          cudaMemcpyDeviceToHost));

    /* free the memory we allocated on the GPU */
    CUDA_CHECK(cudaFree(gpu_result));

    /* print the final result */
    printf("Final answer = %d.\n", result);
    return 0;
}