cuda-100-days / day2 / mat_mul.cu
mat_mul.cu
Raw
#include <stdio.h>
#include <cstdlib>
#include <cassert>
#include <iostream>

__global__ void MatrixMulGpu(int *a, int *b, int *c, int N)
{
    // calci the row and col, rows in y dimnesion.
    int row = threadIdx.y+blockDim.y*blockIdx.y; // offset+base
    int col = threadIdx.x+blockDim.x*blockIdx.x;

    int val=0;
    if(row<N && col<N) // make sure you are in bounds 
    {   
        for(int k=0;k<N;k++)
            val+=a[row*N+k]*b[k*N+col]; //for a - accross the row a, and for b, across the column. For a - keep row constant, and for b- column is constant, 
        // it is actually like a[row][k]*b[k][col] but in row major case = we need to skip N elements to get to next row.
        c[row*N+col]=val;
    }

}
//initialize with random numbers between 0-100.
void initWith(int *pt, int N)
{
    for(int i=0;i<N*N;i++)
        pt[i]=rand()%100;
}

//verify result
void verifyRes(int *a, int *b, int *c, int N)
{
    int temp;
    for(int i=0;i<N;i++)
        for(int j=0;j<N;j++)
        {   //for every element in the row,col
            temp=0;
            for(int k=0;k<N;k++)
            {
                temp+=a[i*N+k]*b[k*N+j];
            }
            assert(temp==c[i*N+j]);
        }
}

int main()
{
    // set the problem size; //assume square matrix.
    int N = 1<<10;

    //allocate the number of bytes needed.
    int size = N*N*sizeof(int);

    //allocate memory 
    int *a;
    int *b;
    int *c;

    cudaMallocManaged(&a, size); // no need to worry about memcopies.
    cudaMallocManaged(&b, size);
    cudaMallocManaged(&c, size);

    initWith(a, N);
    initWith(b, N);

    // one thead per output element, but here we create a 2d grid.
    int threads = 16;
    int blocks = (N+threads-1)/threads;

    //set up the 2d grid, similar to 1d.
    dim3 threadblock(threads,threads); 
    dim3 numblock(blocks,blocks);

    // launch the kernel
    MatrixMulGpu<<<numblock, threadblock>>>(a, b, c, N);

    cudaDeviceSynchronize();

    verifyRes(a,b,c,N);

    std::cout<<"Done success!!!"<<std::endl;

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);

    return 0;



}