/* sum.cu */

#include <unistd.h>
#include <stdio.h>

/* Number of GPU blocks launched by main (one thread each); this is also the
   number of partial sums allocated and added up on the CPU. */
#define CORES 16 

/* START and END delimit the integer range that is summed. The last block
   iterates up to and including END, so END itself is part of the sum. */
#define START 0
#define END 100

/*
 * GPU kernel: each block adds up one contiguous slice of the integers
 * START..END (END inclusive) and writes the result to partials[blockIdx.x].
 *
 * Launch layout: a 1-D grid of exactly CORES blocks. The work is split per
 * block only (threadIdx is not used), so one thread per block is sufficient.
 *
 * partials: device array with room for at least CORES ints; entry b receives
 *           the sum of block b's slice.
 */
__global__ void sum(int* partials) {
    /* width of the slice handled by every block except the last one */
    const int chunk = (END - START) / CORES;
    const int block = (int) blockIdx.x;

    /* find our start and end points; the slice is offset from START so the
       range is still correct when START is not 0 */
    int start = START + chunk * block;
    int end = start + chunk - 1;

    /* the last core must go all the way to the end, picking up whatever the
       integer division left over */
    if (block == (CORES - 1)) {
        end = END;
    }

    /* accumulate locally and store once, instead of a global-memory
       read-modify-write on every iteration */
    int total = 0;
    for (int i = start; i <= end; i++) {
        total += i;
    }
    partials[block] = total;
}

/*
 * Host entry point: allocates the device array of partial sums, launches the
 * sum kernel on CORES blocks of one thread, copies the partials back, adds
 * them up on the CPU and prints the total.
 *
 * Returns 0 on success, 1 if any CUDA call or the kernel launch fails (the
 * error is reported on stderr and no answer is printed).
 */
int main() {
    /* space to store partials on CPU */
    int cpu_partials[CORES];

    /* allocate space on the GPU for the partial ints */
    int* partials = NULL;
    cudaError_t err = cudaMalloc((void**) &partials, CORES * sizeof(int));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    /* invoke the GPU to run the kernel in parallel
       we specify CORES cores which each run once */
    sum<<<CORES, 1>>>(partials);

    /* a kernel launch returns nothing itself; a bad launch configuration
       only shows up through cudaGetLastError() */
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(partials);
        return 1;
    }

    /* copy the partial sums back from the GPU to the CPU; this blocking copy
       also reports any error raised while the kernel was executing */
    err = cudaMemcpy(cpu_partials, partials, CORES * sizeof(int), cudaMemcpyDeviceToHost);

    /* free the memory we allocated on the GPU (needed on both paths) */
    cudaFree(partials);

    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    /* now find the final answer on the CPU */
    int answer = 0;
    for (int i = 0; i < CORES; i++) {
        answer += cpu_partials[i];
    }

    /* print the final result */
    printf("Final answer = %d.\n", answer);
    return 0;
}