#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>
/*
 * Fill the first N entries of a host array with a single value.
 *
 *   val - value written into every slot
 *   a   - destination array; must have room for at least N floats
 *   N   - number of entries to write (nothing is written when N <= 0)
 */
void initwith(float val, float *a, int N)
{
    int idx = 0;
    while (idx < N)
    {
        a[idx] = val;
        ++idx;
    }
}
/*
 * Verify that the first N entries of `array` all compare equal to `target`.
 *
 * On the first mismatch a FAIL line naming the offending index is printed
 * and the process exits with status 1. If no entry differs, a SUCCESS line
 * is printed and the function returns normally.
 */
void checkElementsAre(float target, float *array, int N)
{
    // Locate the first entry that differs from the expected value (if any).
    int firstBad = -1;
    for (int k = 0; k < N && firstBad < 0; ++k)
    {
        if (array[k] != target)
            firstBad = k;
    }

    if (firstBad >= 0)
    {
        printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", firstBad, array[firstBad], target);
        exit(1);
    }

    printf("SUCCESS! All values added correctly.\n");
}
/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, N).
 *
 * Launch layout: 1D grid of 1D blocks (only the .x components are used).
 * Any grid/block size is valid: each thread starts at its global index and
 * then advances by the total number of launched threads, so a launch with
 * fewer threads than N still covers every element. No shared memory is used.
 *
 *   a, b - input vectors, dereferenced in device code (at least N floats each)
 *   c    - output vector, dereferenced in device code (at least N floats)
 *   N    - number of elements; nothing is written when N <= 0
 */
__global__ void addvec_gpu(float *a, float *b, float *c, int N)
{
    // 64-bit arithmetic so neither the starting offset nor the stride
    // increment can overflow when N is close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
    {
        c[i] = a[i] + b[i];
    }
}
/* Abort with a diagnostic if a CUDA runtime call did not return cudaSuccess. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Abort with a diagnostic if a cuBLAS call did not return CUBLAS_STATUS_SUCCESS. */
static void checkCublas(cublasStatus_t status, const char *what)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cuBLAS error in %s: status %d\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}

/*
 * Adds two vectors on the GPU with cuBLAS SAXPY and verifies the result.
 *
 * h_a is filled with 3 and h_b with 4; both are uploaded, d_b is updated
 * in place as d_b = 1.0 * d_a + d_b, and the result is downloaded into h_c
 * and checked to be 7 everywhere. Any failing allocation, CUDA call, or
 * cuBLAS call terminates the program with a message on stderr.
 */
int main()
{
    const int N = 2<<20; // 2^21 elements
    const size_t size = N * sizeof(float);

    // Host buffers.
    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_c = (float*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        exit(EXIT_FAILURE);
    }

    // Device buffers. Only two are needed: SAXPY writes its result into d_b.
    float *d_a = NULL;
    float *d_b = NULL;
    checkCuda(cudaMalloc(&d_a, size), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, size), "cudaMalloc(d_b)");

    initwith(3,h_a,N);
    initwith(4,h_b,N);
    initwith(0,h_c,N);

    // The handle carries the cuBLAS library context used by the compute call below.
    cublasHandle_t handle;
    checkCublas(cublasCreate(&handle), "cublasCreate");

    // Upload both inputs. The increments of 1 mean the elements are
    // contiguous on both the host and the device side (none are skipped).
    checkCublas(cublasSetVector(N, sizeof(float), h_a, 1, d_a, 1), "cublasSetVector(h_a -> d_a)");
    checkCublas(cublasSetVector(N, sizeof(float), h_b, 1, d_b, 1), "cublasSetVector(h_b -> d_b)");

    // SAXPY: d_b = scale * d_a + d_b. With scale == 1 this is a plain vector add.
    const float scale = 1.0f;
    checkCublas(cublasSaxpy(handle, N, &scale, d_a, 1, d_b, 1), "cublasSaxpy");

    // Download the result (now held in d_b) and verify 3 + 4 == 7 everywhere.
    checkCublas(cublasGetVector(N, sizeof(float), d_b, 1, h_c, 1), "cublasGetVector(d_b -> h_c)");
    checkElementsAre(7, h_c, N);

    checkCublas(cublasDestroy(handle), "cublasDestroy");
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}