#include <cstdlib>
#include <cassert>
#include <iostream>
#include <cuda_runtime.h>
#define TILE_SIZE 16
// Tiled matrix multiplication. Assumes square N x N matrices in row-major order.
__global__ void MatrixMulGpu_tiled(int *a, int *b, int *c, int N)
{
    // Compute this thread's output row and column (rows run along the y dimension).
    int global_row = threadIdx.y + blockDim.y * blockIdx.y; // offset + base
    int global_col = threadIdx.x + blockDim.x * blockIdx.x;
    int ty = threadIdx.y;
    int tx = threadIdx.x;

    // Shared-memory tiles of A and B, reused by every thread in the block.
    __shared__ int sh_A[TILE_SIZE][TILE_SIZE];
    __shared__ int sh_B[TILE_SIZE][TILE_SIZE];

    int val = 0;
    int num_tiles = (N + TILE_SIZE - 1) / TILE_SIZE;

    // Each phase stages one tile of A and one tile of B from global memory into
    // shared memory, then accumulates the partial dot product for this element.
    for (int phase = 0; phase < num_tiles; phase++)
    {
        // The A tile shares the output row; its column advances tile by tile.
        // Guard the loads so N need not be a multiple of TILE_SIZE; out-of-range
        // entries are padded with zero.
        int a_col = phase * TILE_SIZE + tx;
        int b_row = phase * TILE_SIZE + ty;
        sh_A[ty][tx] = (global_row < N && a_col < N) ? a[global_row * N + a_col] : 0;
        sh_B[ty][tx] = (b_row < N && global_col < N) ? b[b_row * N + global_col] : 0;
        __syncthreads(); // ensure all threads have finished loading the tile

        for (int k = 0; k < TILE_SIZE; k++)
            val += sh_A[ty][k] * sh_B[k][tx];
        __syncthreads(); // ensure the tile is no longer needed before the next load
    }

    if (global_row < N && global_col < N)
        c[global_row * N + global_col] = val;
}
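
// For comparison, a minimal naive (non-tiled) kernel sketch: each thread streams
// its row of A and column of B directly from global memory, with no shared-memory
// reuse. It is not launched below, and the name MatrixMulGpu_naive is illustrative.
__global__ void MatrixMulGpu_naive(int *a, int *b, int *c, int N)
{
    int row = threadIdx.y + blockDim.y * blockIdx.y;
    int col = threadIdx.x + blockDim.x * blockIdx.x;
    if (row < N && col < N)
    {
        int val = 0;
        for (int k = 0; k < N; k++)
            val += a[row * N + k] * b[k * N + col];
        c[row * N + col] = val;
    }
}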
// Initialize an N x N matrix with random values in [0, 99].
void initWith(int *pt, int N)
{
    for (int i = 0; i < N * N; i++)
        pt[i] = rand() % 100;
}
// Verify the GPU result against a straightforward CPU matrix multiplication.
void verifyRes(int *a, int *b, int *c, int N)
{
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
        {
            // Recompute the dot product for output element (i, j).
            int temp = 0;
            for (int k = 0; k < N; k++)
            {
                temp += a[i * N + k] * b[k * N + j];
            }
            assert(temp == c[i * N + j]);
        }
}
int main()
{
    // Problem size (square N x N matrices).
    int N = 1 << 10;
    // Number of bytes per matrix.
    size_t size = N * N * sizeof(int);

    // Allocate unified memory, accessible from both host and device,
    // so no explicit memcopies are needed.
    int *a;
    int *b;
    int *c;
    cudaMallocManaged(&a, size);
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    initWith(a, N);
    initWith(b, N);

    // One thread per output element, arranged in a 2D grid of 2D blocks.
    int threads = TILE_SIZE;
    int blocks = (N + threads - 1) / threads;

    // Set up the 2D launch configuration, analogous to the 1D case.
    dim3 threadblock(threads, threads);
    dim3 numblock(blocks, blocks);

    // Launch the kernel and wait for it to finish.
    MatrixMulGpu_tiled<<<numblock, threadblock>>>(a, b, c, N);
    cudaDeviceSynchronize();
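    // Optional sketch: basic error checking after the launch and the sync.
    // cudaGetLastError / cudaGetErrorString are standard CUDA runtime calls;
    // the print-and-exit policy here is just one possible choice.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }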
    // Check the result on the CPU.
    verifyRes(a, b, c, N);
    std::cout << "Success: GPU result matches CPU reference." << std::endl;

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}
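// Typical build/run commands (the file and binary names below are illustrative):
//   nvcc matmul_tiled.cu -o matmul_tiled
//   ./matmul_tiled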