/* sum-sync.cu
 *
 * Sums the integers START..END (inclusive) on the GPU using a single
 * block of THREADS threads and a shared-memory tree reduction.
 */
#include <stdio.h>
#include <stdlib.h>

#define THREADS 256
#define START 0
#define END 1000

/* Abort with a message on any CUDA runtime error so failures are not
 * silently ignored (kernel/API errors otherwise surface much later). */
#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(err_));                 \
            exit(EXIT_FAILURE);                                          \
        }                                                                \
    } while (0)

/* GPU kernel: each thread sums one contiguous chunk of [START, END],
 * then the block reduces the per-thread partials in shared memory.
 * Expects a launch of exactly one block with THREADS threads;
 * writes the final sum to *result. */
__global__ void sum(int* result) {
    /* one partial sum per thread, shared by all threads in the block */
    __shared__ int partials[THREADS];

    /* split the range into equal chunks; the last thread also takes
     * whatever remainder is left so the whole range is covered.
     * NOTE: the chunk start is offset by START (the old code assumed
     * START == 0 and would have dropped the offset if it changed). */
    int chunk = (END - START) / THREADS;
    int start = START + chunk * threadIdx.x;
    int end = start + chunk - 1;
    if (threadIdx.x == (THREADS - 1)) {
        end = END;
    }

    /* accumulate this thread's chunk in a register, then store once */
    int total = 0;
    for (int i = start; i <= end; i++) {
        total += i;
    }
    partials[threadIdx.x] = total;

    /* every partial must be written before any thread reads a
     * neighbor's slot — __syncthreads() is the block-wide barrier */
    __syncthreads();

    /* Tree reduction that is also correct when the active count is odd:
     * each pass folds the top half (rounded up) onto the bottom half,
     * leaving the middle element of an odd count untouched for the next
     * pass.  This replaces the old odd-one-out special case, in which
     * thread 0 read partials[i-1] while thread i-1 was concurrently
     * updating it in the same pass with no barrier in between (a latent
     * data race; dead code at THREADS == 256 but wrong in general). */
    for (int active = THREADS; active > 1; ) {
        int half = (active + 1) / 2;  /* round up for odd counts */
        if (threadIdx.x < active / 2) {
            partials[threadIdx.x] += partials[threadIdx.x + half];
        }
        /* barrier sits outside the if, so every thread reaches it */
        __syncthreads();
        active = half;
    }

    /* slot 0 now holds the full sum; one thread publishes it */
    if (threadIdx.x == 0) {
        *result = partials[0];
    }
}

/* the main function begins running on the CPU */
int main() {
    int result;       /* host copy of the final sum */
    int* gpu_result;  /* device cell the kernel writes its answer into */

    /* allocate space on the GPU for the resulting sum */
    CUDA_CHECK(cudaMalloc((void**) &gpu_result, sizeof(int)));

    /* one block of THREADS threads computes the whole sum */
    sum<<<1, THREADS>>>(gpu_result);
    CUDA_CHECK(cudaGetLastError());  /* catch bad launch configurations */

    /* blocking device-to-host copy; also waits for the kernel to finish */
    CUDA_CHECK(cudaMemcpy(&result, gpu_result, sizeof(int),
                          cudaMemcpyDeviceToHost));

    /* free the memory we allocated on the GPU */
    CUDA_CHECK(cudaFree(gpu_result));

    /* print the final result */
    printf("Final answer = %d.\n", result);
    return 0;
}