// cuda-100-days / day3 / mat_mul_tiled.cu
// Tiled (shared-memory) matrix multiplication for square N x N matrices.
#include <stdio.h>
#include <cstdlib>
#include <cassert>
#include <iostream>


#define TILE_SIZE 16


// Tiled matrix multiply: c = a * b for square N x N row-major int matrices.
//
// Expected launch: 2D grid of TILE_SIZE x TILE_SIZE thread blocks, with
// ceil(N / TILE_SIZE) blocks in each grid dimension. Each thread produces
// one element of c, accumulating partial dot products tile-by-tile through
// shared memory.
//
// Fix vs. original: out-of-range loads are zero-filled and the final store
// is guarded, so N need NOT be a multiple of TILE_SIZE. The guards use the
// conditional expression (not a divergent if around __syncthreads), so every
// thread in the block still reaches both barriers.
__global__ void MatrixMulGpu_tiled(int *a, int *b, int *c, int N)
{
    // Row comes from the y dimension, column from the x dimension.
    int global_row = threadIdx.y + blockDim.y * blockIdx.y; // offset + base
    int global_col = threadIdx.x + blockDim.x * blockIdx.x;

    int ty = threadIdx.y;
    int tx = threadIdx.x;

    __shared__ int sh_A[TILE_SIZE][TILE_SIZE];
    __shared__ int sh_B[TILE_SIZE][TILE_SIZE];

    int val = 0;
    int num_tiles = (N + TILE_SIZE - 1) / TILE_SIZE; // ceil-div: cover a ragged edge

    for (int phase = 0; phase < num_tiles; phase++)
    {
        // Stage one tile of each operand into shared memory.
        // sh_A: same global row, column walks across the tiles.
        // sh_B: same global column, row walks down the tiles.
        // Zero-fill outside the matrix so edge tiles contribute nothing.
        int a_col = phase * TILE_SIZE + tx;
        int b_row = phase * TILE_SIZE + ty;
        sh_A[ty][tx] = (global_row < N && a_col < N) ? a[global_row * N + a_col] : 0;
        sh_B[ty][tx] = (b_row < N && global_col < N) ? b[b_row * N + global_col] : 0;

        __syncthreads(); // all loads visible before anyone reads the tiles

        for (int k = 0; k < TILE_SIZE; k++)
            val += sh_A[ty][k] * sh_B[k][tx];

        __syncthreads(); // all reads done before the next phase overwrites the tiles
    }

    // Threads past the matrix edge must not write.
    if (global_row < N && global_col < N)
        c[global_row * N + global_col] = val;
}
//initialize with random numbers between 0-100.
// Fill an N x N matrix with pseudo-random values in [0, 99].
void initWith(int *pt, int N)
{
    const int total = N * N;
    for (int idx = 0; idx < total; ++idx)
        pt[idx] = rand() % 100;
}

//verify result
// Check the GPU product c against a CPU reference computation of a * b
// (square N x N row-major matrices). Aborts via assert on the first
// mismatching element; returns normally when every element agrees.
void verifyRes(int *a, int *b, int *c, int N)
{
    for (int row = 0; row < N; row++)
    {
        for (int col = 0; col < N; col++)
        {
            // Reference dot product of row `row` of a with column `col` of b.
            int expected = 0;
            for (int k = 0; k < N; k++)
                expected += a[row * N + k] * b[k * N + col];
            assert(c[row * N + col] == expected);
        }
    }
}

// Driver: allocate managed matrices, run the tiled GPU multiply, and
// verify the result element-by-element against a CPU reference.
int main()
{
    // Minimal error check: the original ignored every cudaError_t, so a
    // failed allocation or launch would silently produce garbage/asserts.
    auto check = [](cudaError_t err, const char *what) {
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    };

    // Problem size; matrices are assumed square (N x N).
    int N = 1 << 10;

    // Byte count per matrix. size_t (not int) so the product cannot
    // truncate for larger N.
    size_t size = (size_t)N * N * sizeof(int);

    int *a;
    int *b;
    int *c;

    // Unified memory: accessible from host and device, no explicit memcpys.
    check(cudaMallocManaged(&a, size), "cudaMallocManaged a");
    check(cudaMallocManaged(&b, size), "cudaMallocManaged b");
    check(cudaMallocManaged(&c, size), "cudaMallocManaged c");

    initWith(a, N);
    initWith(b, N);

    // One thread per output element, arranged as a 2D grid of 2D blocks.
    // Block edge matches the kernel's TILE_SIZE; ceil-div covers ragged edges.
    int threads = TILE_SIZE;
    int blocks = (N + threads - 1) / threads;

    dim3 threadblock(threads, threads);
    dim3 numblock(blocks, blocks);

    // Launch, then surface both launch-config errors (cudaGetLastError)
    // and async execution errors (cudaDeviceSynchronize).
    MatrixMulGpu_tiled<<<numblock, threadblock>>>(a, b, c, N);
    check(cudaGetLastError(), "kernel launch");
    check(cudaDeviceSynchronize(), "kernel execution");

    verifyRes(a, b, c, N);

    std::cout << "Done success!!!" << std::endl;

    check(cudaFree(a), "cudaFree a");
    check(cudaFree(b), "cudaFree b");
    check(cudaFree(c), "cudaFree c");

    return 0;
}