#include <stdio.h>
#include <cstdlib>
#include <cassert>
#include <iostream>
// Naive dense matrix multiply: C = A * B for square N x N matrices stored
// row-major. One thread computes one output element; launch with a 2D grid
// providing at least N x N threads (extra threads exit via the bounds guard).
// a, b are read-only and must not alias c: const __restrict__ lets the
// compiler route their loads through the read-only data cache.
__global__ void MatrixMulGpu(const int *__restrict__ a,
                             const int *__restrict__ b,
                             int *__restrict__ c, int N)
{
// Global row/column of the output element owned by this thread.
// Rows map to the y dimension, columns to x.
int row = threadIdx.y+blockDim.y*blockIdx.y; // offset+base
int col = threadIdx.x+blockDim.x*blockIdx.x;
if(row<N && col<N) // guard: grid may overshoot N when N % blockDim != 0
{
int val=0;
// Dot product of row `row` of A with column `col` of B.
// Flattened row-major a[row][k] * b[k][col]: advancing one row skips N ints.
for(int k=0;k<N;k++)
val+=a[row*N+k]*b[k*N+col];
c[row*N+col]=val;
}
}
//initialize with random numbers between 0-100.
// Fill an N x N matrix (flattened, row-major) with pseudo-random
// values in the range [0, 100).
void initWith(int *pt, int N)
{
    const int total = N * N;
    for (int idx = 0; idx < total; ++idx)
        pt[idx] = rand() % 100;
}
//verify result
// Validate the device result c against a straightforward CPU matrix
// multiply of a and b (all N x N, row-major). Aborts via assert on the
// first mismatching element.
void verifyRes(int *a, int *b, int *c, int N)
{
    for (int row = 0; row < N; ++row)
    {
        for (int col = 0; col < N; ++col)
        {
            // Recompute one output element on the host.
            int expected = 0;
            for (int k = 0; k < N; ++k)
                expected += a[row * N + k] * b[k * N + col];
            assert(expected == c[row * N + col]);
        }
    }
}
int main()
{
// Fail fast on any CUDA API error: kernels and allocations fail silently
// unless their cudaError_t results are inspected.
auto check = [](cudaError_t err, const char *what) {
    if (err != cudaSuccess)
    {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
};
// Problem size: square N x N matrices, N = 1024.
int N = 1<<10;
// Byte count per matrix. size_t (not int) so the product cannot overflow
// for larger N.
size_t size = (size_t)N*N*sizeof(int);
// Unified (managed) memory is reachable from both host and device,
// so no explicit cudaMemcpy calls are needed.
int *a;
int *b;
int *c;
check(cudaMallocManaged(&a, size), "cudaMallocManaged(a)");
check(cudaMallocManaged(&b, size), "cudaMallocManaged(b)");
check(cudaMallocManaged(&c, size), "cudaMallocManaged(c)");
initWith(a, N);
initWith(b, N);
// One thread per output element, arranged as a 2D grid of 16x16 blocks.
// Ceil-division guarantees full coverage when N is not a multiple of 16.
int threads = 16;
int blocks = (N+threads-1)/threads;
dim3 threadblock(threads,threads);
dim3 numblock(blocks,blocks);
MatrixMulGpu<<<numblock, threadblock>>>(a, b, c, N);
// Launch-configuration errors only surface via cudaGetLastError;
// in-kernel faults surface at the synchronizing call that follows.
check(cudaGetLastError(), "kernel launch");
check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
verifyRes(a,b,c,N);
std::cout<<"Done success!!!"<<std::endl;
check(cudaFree(a), "cudaFree(a)");
check(cudaFree(b), "cudaFree(b)");
check(cudaFree(c), "cudaFree(c)");
return 0;
}