#include <stdio.h>
#include <stdlib.h>
/*
 * Initialize every element of a to val, using a grid-stride loop so the
 * kernel is correct for any grid size relative to N.
 */
__global__ void initwith(float val, float *a, int N)
{
  int st = threadIdx.x + blockDim.x * blockIdx.x;
  int stride = gridDim.x * blockDim.x;
  for (int i = st; i < N; i += stride)
  {
    a[i] = val;
  }
}
void checkElementsAre(float target, float *array, int N)
{
  for (int i = 0; i < N; i++)
  {
    if (array[i] != target)
    {
      printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
      exit(1);
    }
  }
  printf("SUCCESS! All values added correctly.\n");
}
/*
 * Element-wise vector addition c = a + b, again using a grid-stride loop.
 */
__global__ void addvec_gpu(float *a, float *b, float *c, int N)
{
  int st = threadIdx.x + blockDim.x * blockIdx.x;
  int stride = gridDim.x * blockDim.x;
  for (int i = st; i < N; i += stride)
  {
    c[i] = a[i] + b[i];
  }
}
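
/*
 * Optional sketch, not part of the original exercise: a tiny error-checking
 * helper in the style commonly used around CUDA runtime calls. It is not
 * wired into main() below; wrapping the cudaMallocManaged / prefetch /
 * synchronize calls with it is left as an assumption about how one might
 * surface runtime errors.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
  if (result != cudaSuccess)
  {
    // cudaGetErrorString converts the error code into a readable message.
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
  }
  return result;
}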
int main()
{
  int deviceId;
  int numberOfSMs;

  // Query the active device and its SM count so the grid can be sized below.
  cudaGetDevice(&deviceId);
  cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
  printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs);

  const int N = 2 << 20;
  size_t size = N * sizeof(float);

  float *a;
  float *b;
  float *c;

  // Unified (managed) memory: accessible from both host and device.
  cudaMallocManaged(&a, size);
  cudaMallocManaged(&b, size);
  cudaMallocManaged(&c, size);
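
  /*
   * Optional sketch (not in the original): cudaMemPrefetchAsync is a hint that
   * is most effective on devices reporting concurrent managed access; this
   * query only reports whether the prefetches below can take effect. The
   * attribute is a real CUDA runtime enum; the check itself is an assumption
   * about how one might make the optimization's applicability visible.
   */
  int concurrentManagedAccess = 0;
  cudaDeviceGetAttribute(&concurrentManagedAccess, cudaDevAttrConcurrentManagedAccess, deviceId);
  if (!concurrentManagedAccess)
    printf("Note: device %d does not report concurrent managed access; prefetch hints may have no effect.\n", deviceId);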
  // Optimization: prefetch the arrays to the GPU before the kernels touch them.
  // The pages migrate in large batches up front instead of through many
  // on-demand page faults, which noticeably reduces execution time.
  cudaMemPrefetchAsync(a, size, deviceId);
  cudaMemPrefetchAsync(b, size, deviceId);
  cudaMemPrefetchAsync(c, size, deviceId);
  int numOfThreads = 256;
  // Optimization: size the grid as a multiple of the SM count so work is
  // spread evenly across the whole device.
  int numOfBlocks = 32 * numberOfSMs;

  // Optimization: initialize the arrays on the GPU so the data does not have
  // to migrate from the CPU before the addition kernel runs.
  initwith<<<numOfBlocks, numOfThreads>>>(3, a, N);
  initwith<<<numOfBlocks, numOfThreads>>>(4, b, N);
  initwith<<<numOfBlocks, numOfThreads>>>(0, c, N);
  addvec_gpu<<<numOfBlocks, numOfThreads>>>(a, b, c, N);

  // Prefetch the result back to the host, then wait for the kernels and the
  // prefetch to finish before the CPU verifies the result.
  cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);
  cudaDeviceSynchronize();
  checkElementsAre(7, c, N);
  cudaFree(a);
  cudaFree(b);
  cudaFree(c);
}
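
/*
 * Build/run sketch (the filename vector-add.cu is an assumption; adjust to
 * your own source file):
 *
 *   nvcc -o vector-add vector-add.cu
 *   ./vector-add
 *
 * To observe the effect of the prefetching and launch-configuration
 * optimizations, the program can be profiled with Nsight Systems, e.g.:
 *
 *   nsys profile --stats=true ./vector-add
 */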