/* sum.cu
 *
 * Adds up every integer in the inclusive range [START, END] on the GPU.
 * The range is cut into CORES contiguous slices; each GPU block sums one
 * slice into its own entry of a partial-sum array, and the CPU then adds
 * the partial sums together and prints the total.
 */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define CORES 16
#define START 0
#define END 100

/* Evaluate a CUDA runtime call; on failure report where it happened and
 * terminate, so later code never works on invalid data. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * GPU kernel: computes one partial sum per block.
 *
 * Expected launch layout: CORES blocks of one thread each (1D grid).
 * Blocks beyond CORES and threads other than thread 0 do nothing, so an
 * oversized launch cannot write out of bounds.
 *
 * partials: device array with room for at least CORES ints; entry b
 *           receives the sum of the integers in block b's slice.
 */
__global__ void sum(int* partials) {
    /* ignore surplus blocks/threads from a larger-than-expected launch */
    if (blockIdx.x >= CORES || threadIdx.x != 0) {
        return;
    }

    /* find our start and end points; the slice is offset by START so the
       split stays correct when the range does not begin at zero */
    const int chunk = (END - START) / CORES;
    const int start = START + chunk * (int) blockIdx.x;
    int end = start + chunk - 1;

    /* the last core must go all the way to the end, picking up whatever
       the integer division above left over */
    if (blockIdx.x == (CORES - 1)) {
        end = END;
    }

    /* accumulate locally and store once, instead of updating global
       memory on every iteration */
    int total = 0;
    for (int i = start; i <= end; i++) {
        total += i;
    }
    partials[blockIdx.x] = total;
}

/*
 * Host entry point: allocates the partial-sum array on the GPU, runs the
 * kernel, copies the partial sums back, reduces them on the CPU and
 * prints the final answer. Returns 0 on success; exits with a failure
 * status if any CUDA call reports an error.
 */
int main() {
    /* space to store partials on CPU */
    int cpu_partials[CORES];

    /* allocate space on the GPU for the partial ints */
    int* partials = NULL;
    CUDA_CHECK(cudaMalloc((void**) &partials, CORES * sizeof(int)));

    /* invoke the GPU to run the kernel in parallel:
       CORES blocks, each running a single thread once */
    sum<<<CORES, 1>>>(partials);
    /* a kernel launch returns nothing, so query the launch status */
    CUDA_CHECK(cudaGetLastError());

    /* copy the partial sums back from the GPU to the CPU */
    CUDA_CHECK(cudaMemcpy(cpu_partials, partials, CORES * sizeof(int),
                          cudaMemcpyDeviceToHost));

    /* free the memory we allocated on the GPU */
    CUDA_CHECK(cudaFree(partials));

    /* now find the final answer on the CPU */
    int answer = 0;
    for (int i = 0; i < CORES; i++) {
        answer += cpu_partials[i];
    }

    /* print the final result */
    printf("Final answer = %d.\n", answer);
    return 0;
}