#include <stdio.h>
#include <stdlib.h>

/* N is 1 billion */
#define N 1000000000

/* we make 1 thousand threads which each calculate 1 million items */
#define THREADS 1000
#define ITEMS (N / THREADS)

/* estimate the sum of 1/2 1/4 1/8 ... 1/2^N etc. */
void __global__ est(double* r) {
    double __shared__ results[THREADS];
    int i;

    double result = 0.0;
    double denom = pow(2.0, (double) (threadIdx.x * ITEMS + 1));
    for (i  = 0; i < ITEMS; i++) {
        result += (1.0 / denom);
        denom *= 2.0;
    }

    /* write ours */
    results[threadIdx.x] = result;
    __syncthreads();

    /* now do the sum reduction */
    i = THREADS / 2;
    while (i != 0) {
        /* if we are not thread 0 */
        if (threadIdx.x < i) {
            /* add the one to our right by i places into this one */
            results[threadIdx.x] += results[threadIdx.x + i];
        }

        /* cut i in half */
        i /= 2;
        __syncthreads();
    }

    *r = results[0];
}

int main() {
    double result, *gpu_result;
    cudaMalloc((void**) &gpu_result, sizeof(double));

    est<<<1, THREADS>>>(gpu_result);

    cudaMemcpy(&result, gpu_result, sizeof(double), cudaMemcpyDeviceToHost);

    printf("The sum = %lf\n", result); 
    return 0;
}