// day4/cublas_vec_add.cu · cuda-100-days

#include <stdio.h>
#include <stdlib.h>
#include <cublas_v2.h>


// Fill the first N elements of the host array `a` with `val`.
// Does nothing when N is zero or negative.
void initwith(float val, float *a, int N)
{
    // Count down the number of slots left while advancing the write cursor.
    for(int remaining = N; remaining > 0; --remaining)
    {
        *a++ = val;
    }
}



// Verify that the first N elements of the host array `array` all equal `target`.
// On the first mismatch, prints the offending index and values and terminates
// the process with exit code 1; otherwise prints a success message.
void checkElementsAre(float target, float *array, int N)
{
  // Advance until we run out of elements or hit one that is not equal to target.
  int idx = 0;
  while(idx < N && array[idx] == target)
  {
    ++idx;
  }

  // Stopping before the end means array[idx] is the first bad element.
  if(idx < N)
  {
    printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", idx, array[idx], target);
    exit(1);
  }

  printf("SUCCESS! All values added correctly.\n");
}

// Element-wise vector addition kernel: c[i] = a[i] + b[i] for i in [0, N).
// Each thread handles at most one element, selected from the x-dimension of
// the block and thread indices only; threads whose index falls at or beyond N
// do nothing.
__global__ void addvec_gpu(float *a, float *b, float *c, int N)
{
    const int idx = blockIdx.x*blockDim.x + threadIdx.x;

    // Surplus threads past the end of the vectors exit without touching memory.
    if(idx >= N)
        return;

    c[idx] = a[idx] + b[idx];
}
// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if(status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

// Terminates the program with a diagnostic if a cuBLAS call failed.
static void checkCublas(cublasStatus_t status, const char *what)
{
    if(status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cuBLAS error during %s: status code %d\n", what, (int)status);
        exit(1);
    }
}

// Adds two constant vectors (all 3s and all 4s) on the GPU using cuBLAS SAXPY
// and verifies on the host that every element of the result equals 7.
// Returns 0 on success; exits with status 1 on any allocation, CUDA or cuBLAS
// failure, or if verification fails.
int main()
{
    const int N = 2<<20; // 2 * 2^20 = 2,097,152 elements
    size_t size = N * sizeof(float);

    float *h_a;
    float *h_b;
    float *h_c;
    float *d_a;
    float *d_b;

    h_a = (float*)malloc(size);
    h_b = (float*)malloc(size);
    h_c = (float*)malloc(size);
    if(h_a == NULL || h_b == NULL || h_c == NULL)
    {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        exit(1);
    }

    checkCuda(cudaMalloc(&d_a, size), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, size), "cudaMalloc(d_b)");

    initwith(3,h_a,N);
    initwith(4,h_b,N);
    initwith(0,h_c,N);

    cublasHandle_t handle; // opaque handle used to talk to the cuBLAS library
    checkCublas(cublasCreate_v2(&handle), "cublasCreate"); // initializes the handle

    // cublasSetVector copies N elements from host to device, so no explicit
    // cudaMemcpy is needed. The increments of 1 mean that source and
    // destination elements are contiguous (no elements are skipped).
    checkCublas(cublasSetVector(N, sizeof(float), h_a, 1, d_a, 1), "cublasSetVector(h_a -> d_a)");
    checkCublas(cublasSetVector(N, sizeof(float), h_b, 1, d_b, 1), "cublasSetVector(h_b -> d_b)");

    // SAXPY computes y = alpha * x + y in place; with alpha = 1 this leaves
    // d_b = d_a + d_b. Both vectors are again traversed with stride 1.
    const float scale = 1.0f;
    checkCublas(cublasSaxpy(handle, N, &scale, d_a, 1, d_b, 1), "cublasSaxpy");

    // The sum lives in d_b; copy it back into h_c and verify on the host.
    checkCublas(cublasGetVector(N, sizeof(float), d_b, 1, h_c, 1), "cublasGetVector(d_b -> h_c)");
    checkElementsAre(7, h_c, N);

    checkCublas(cublasDestroy(handle), "cublasDestroy");

    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}