#include <stdio.h>
#include <cstdlib>
#include <cassert>
#include <iostream>
#include <cublas_v2.h>
#include <curand.h>
#include <math.h>
//verify result
void verifyRes(float *a, float *b, float *c, int N)
{
float temp;
float epsi =0.001;
for(int i=0;i<N;i++)
for(int j=0;j<N;j++)
{ //for every element in the row,col but in column major form
temp=0;
for(int k=0;k<N;k++)
{
temp+=a[k*N+i]*b[j*N+k];
}
assert(fabs(temp-c[j*N+i])<epsi);
}
}
int main()
{
// set the problem size; //assume square matrix.
int N = 1<<10;
//allocate the number of bytes needed.
int size = N*N*sizeof(float);
//allocate memory
float *h_a;
float *h_b;
float *h_c;
float *d_a;
float *d_b;
float *d_c;
h_a = (float*)malloc(size);
h_b = (float*)malloc(size);
h_c = (float*)malloc(size);
cudaMalloc(&d_a, size);
cudaMalloc(&d_b, size);
cudaMalloc(&d_c, size);
//initialize random values, instead of taking from host.
curandGenerator_t prng;
curandCreateGenerator(&prng, CURAND_RNG_PSEUDO_DEFAULT);
// initialize with random uniform dist. values. //directly for gpu variables.
curandGenerateUniform(prng, d_a, N*N);
curandGenerateUniform(prng, d_b, N*N);
cublasHandle_t handle; // to interface with the blas lib.
cublasCreate_v2(&handle); //fill the handle with a mem addres.
float alpha =1.0f; // for generalized matric mul.
float beta =0.0f;
cublasSgemm(handle, CUBLAS_OP_N,CUBLAS_OP_N, N,N,N, &alpha, d_a, N, d_b, N, &beta, d_c, N);
cudaMemcpy(h_a, d_a, size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_b, d_b, size, cudaMemcpyDeviceToHost);
cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
verifyRes(h_a,h_b,h_c,N);
std::cout<<"Done success!!!"<<std::endl;
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}