cuda-100-days / day5 / mat_mul_cublas.cu
mat_mul_cublas.cu
Raw
#include <stdio.h>
#include <cstdlib>
#include <cassert>
#include <iostream>
#include <cublas_v2.h>
#include <curand.h>
#include <math.h>

//verify result
void verifyRes(float *a, float *b, float *c, int N)
{
    float temp;
    float epsi =0.001;
    for(int i=0;i<N;i++)
        for(int j=0;j<N;j++)
        {   //for every element in the row,col but in column major form
            temp=0;
            for(int k=0;k<N;k++)
            {
                temp+=a[k*N+i]*b[j*N+k];
            }
            assert(fabs(temp-c[j*N+i])<epsi);
        }
}

int main()
{
    // set the problem size; //assume square matrix.
    int N = 1<<10;

    //allocate the number of bytes needed.
    int size = N*N*sizeof(float);

    //allocate memory 
    float *h_a;
    float *h_b;
    float *h_c;
    float *d_a;
    float *d_b;
    float *d_c;

    h_a = (float*)malloc(size);
    h_b = (float*)malloc(size);
    h_c = (float*)malloc(size);

    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    //initialize random values, instead of taking from host.
    curandGenerator_t prng;
    curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
    
    // initialize with random uniform dist. values. //directly for gpu variables.
    curandGenerateUniform(prng, d_a, N*N);
    curandGenerateUniform(prng, d_b, N*N);

    cublasHandle_t handle; // to interface with the blas lib.
    cublasCreate_v2(&handle); //fill the handle with a mem addres.

    float alpha =1.0f; // for generalized matric mul.
    float beta =0.0f;

    cublasSgemm(handle, CUBLAS_OP_N,CUBLAS_OP_N, N,N,N, &alpha, d_a, N, d_b, N, &beta, d_c, N);

    cudaMemcpy(h_a, d_a, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_b, d_b, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    verifyRes(h_a,h_b,h_c,N);

    std::cout<<"Done success!!!"<<std::endl;

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;



}