#include <stdio.h>
#include <stdlib.h>
/*
 * Fill the first N entries of a buffer with a single value.
 *
 *   val - value stored into every slot
 *   a   - destination buffer; must be writable from the host for N elements
 *   N   - number of elements to write (nothing is written when N <= 0)
 */
void initwith(float val, float *a, int N)
{
    int slot = 0;
    while (slot < N)
    {
        a[slot] = val;
        ++slot;
    }
}
/*
 * Verify that the first N entries of an array all compare equal to target.
 *
 * On the first entry that differs, prints its index, the value found and the
 * expected value, then terminates the process with exit status 1. If every
 * entry matches (including the N <= 0 case, where nothing is inspected), a
 * success line is printed and the function returns normally.
 *
 *   target - value every element is expected to hold (exact comparison)
 *   array  - buffer to inspect; must be readable from the host
 *   N      - number of elements to inspect
 */
void checkElementsAre(float target, float *array, int N)
{
    int pos = 0;
    while (pos < N)
    {
        const float found = array[pos];
        if (found != target)
        {
            printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", pos, found, target);
            exit(1);
        }
        ++pos;
    }

    printf("SUCCESS! All values added correctly.\n");
}
/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, N).
 *
 * Launch layout: 1D grid of 1D blocks. Any grid/block size is valid: each
 * thread starts at its global index and advances by the total number of
 * launched threads, so all N elements are covered even when the launch has
 * fewer than N threads. Uses no shared memory.
 *
 *   a, b - input vectors, each readable on the device for at least N elements
 *   c    - output vector, writable on the device for at least N elements
 *   N    - number of elements to process (no work is done when N <= 0)
 */
__global__ void addvec_gpu(float *a, float *b, float *c, int N)
{
    // 64-bit index math so blockDim.x * blockIdx.x and the stride increment
    // cannot wrap for large launches.
    const long long stride = (long long)blockDim.x * gridDim.x;
    for (long long i = (long long)blockDim.x * blockIdx.x + threadIdx.x; i < N; i += stride)
    {
        c[i] = a[i] + b[i];
    }
}
/*
 * Abort with a diagnostic when a CUDA runtime call did not succeed.
 *
 *   status - return code of the CUDA runtime call
 *   what   - short description of the call, included in the message
 */
static void checkCudaOrExit(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/*
 * Allocates three managed vectors, fills a with 3 and b with 4 on the host,
 * adds them on the GPU into c, and verifies on the host that every element
 * of c equals 7. Exits with status 1 on any CUDA error or mismatch.
 */
int main()
{
    const int N = 2<<20;  /* 2 * 2^20 = 2,097,152 elements */
    size_t size = N * sizeof(float);

    /* Null-initialized so a failed allocation never leaves a wild pointer. */
    float *a = NULL;
    float *b = NULL;
    float *c = NULL;

    checkCudaOrExit(cudaMallocManaged(&a, size), "cudaMallocManaged(a)");
    checkCudaOrExit(cudaMallocManaged(&b, size), "cudaMallocManaged(b)");
    checkCudaOrExit(cudaMallocManaged(&c, size), "cudaMallocManaged(c)");

    initwith(3.0f, a, N);
    initwith(4.0f, b, N);
    initwith(0.0f, c, N);

    /* Ceil-divide so the grid has at least one thread per element. */
    const int numOfThreads = 64;
    const int numOfBlocks = (N + numOfThreads - 1) / numOfThreads;

    addvec_gpu<<<numOfBlocks, numOfThreads>>>(a, b, c, N);
    /* A launch does not return a status: bad configurations surface here... */
    checkCudaOrExit(cudaGetLastError(), "addvec_gpu launch");
    /* ...and faults during execution surface at the synchronizing call. The
       sync is also required before the host reads c. */
    checkCudaOrExit(cudaDeviceSynchronize(), "addvec_gpu execution");

    checkElementsAre(7.0f, c, N);

    checkCudaOrExit(cudaFree(a), "cudaFree(a)");
    checkCudaOrExit(cudaFree(b), "cudaFree(b)");
    checkCudaOrExit(cudaFree(c), "cudaFree(c)");
    return 0;
}