#include #include /* 10 million */ #define N 10000000 /* the kernel fills an array up with square roots */ __global__ void square_roots(double* array) { /* find my array index */ unsigned int idx = blockIdx.x; /* compute the square root of this number and store it in the array CUDA provides a sqrt function as part of its math library, so this call uses that - not the one from math.h which is a CPU function */ array[idx] = sqrt((double) idx); } int main() { /* error status */ cudaError_t status; /* store the square roots of 0 to (N-1) on the CPU * stored on the heap since it's too big for the stack for large values of N */ double* roots = (double*) malloc(N * sizeof(double)); /* allocate a GPU array to hold the square roots */ double* gpu_roots; status = cudaMalloc((void**) &gpu_roots, N * sizeof(double)); if (status != cudaSuccess) { printf("Error: %s\n", cudaGetErrorString(status)); return 0; } /* invoke the GPU to calculate the square roots */ square_roots<<>>(gpu_roots); /* check and see if there was an error on the call */ status = cudaPeekAtLastError(); if (status != cudaSuccess) { printf("Error: %s\n", cudaGetErrorString(status)); return 0; } /* wait for the kernel to complete, and check the resulting status code */ status = cudaDeviceSynchronize(); if (status != cudaSuccess) { printf("Error: %s\n", cudaGetErrorString(status)); return 0; } /* copy the data back */ status = cudaMemcpy(roots, gpu_roots, N * sizeof(double), cudaMemcpyDeviceToHost); if (status != cudaSuccess) { printf("Error: %s\n", cudaGetErrorString(status)); return 0; } /* free the memory */ status = cudaFree(gpu_roots); if (status != cudaSuccess) { printf("Error: %s\n", cudaGetErrorString(status)); return 0; } /* print out 100 evenly spaced square roots just to see that it worked */ unsigned int i; for (i = 0; i < N; i += (N / 100)) { printf("sqrt(%d) = %lf\n", i, roots[i]); } /* free the CPU memory */ free(roots); return 0; }