#include #include /* N is 1 billion */ #define N 1000000000 /* we make 1 thousand threads which each calculate 1 million items */ #define THREADS 1000 #define ITEMS (N / THREADS) /* estimate the sum of 1/2 1/4 1/8 ... 1/2^N etc. */ void __global__ est(double* r) { double __shared__ results[THREADS]; int i; double result = 0.0; double denom = pow(2.0, (double) (threadIdx.x * ITEMS + 1)); for (i = 0; i < ITEMS; i++) { result += (1.0 / denom); denom *= 2.0; } /* write ours */ results[threadIdx.x] = result; __syncthreads(); /* now do the sum reduction */ i = THREADS / 2; while (i != 0) { /* if we are not thread 0 */ if (threadIdx.x < i) { /* add the one to our right by i places into this one */ results[threadIdx.x] += results[threadIdx.x + i]; } /* cut i in half */ i /= 2; __syncthreads(); } *r = results[0]; } int main() { double result, *gpu_result; cudaMalloc((void**) &gpu_result, sizeof(double)); est<<<1, THREADS>>>(gpu_result); cudaMemcpy(&result, gpu_result, sizeof(double), cudaMemcpyDeviceToHost); printf("The sum = %lf\n", result); return 0; }