// Lancelot/src/gpudb/test/test_par3.cu
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include <pthread.h>

#include "tbb/tbb.h"

#include <cub/cub.cuh>
#include <curand.h>
#include <cuda.h>
#include <thread>
#include <cupti.h>

#include "crystal/crystal.cuh"
#include "papi.h"
#include "papi_test.h"

#define NUM_EVENTS 1


// Each macro evaluates its argument exactly once, reports failures, and
// bails out of the calling function.
#define CHECK_CU_ERROR(call, cufunc)                                    \
    do { CUresult _status = (call); if (_status != CUDA_SUCCESS) {      \
        printf("Error %d for CUDA Driver API function '%s'\n", _status, cufunc); return -1; } } while (0)

#define CHECK_CUDA_ERROR(call)                                          \
    do { cudaError_t _status = (call); if (_status != cudaSuccess) {    \
        printf("%s:%i Error %d for CUDA [%s]\n", __FILE__, __LINE__, _status, cudaGetErrorString(_status)); return -1; } } while (0)

#define CHECK_CUPTI_ERROR(call, cuptifunc)                              \
    do { CUptiResult _status = (call); if (_status != CUPTI_SUCCESS) {  \
        const char *errStr; cuptiGetResultString(_status, &errStr);     \
        printf("%s:%i Error %d [%s] for CUPTI API function '%s'\n", __FILE__, __LINE__, _status, errStr, cuptifunc); return -1; } } while (0)

using namespace cub;
using namespace std;
using namespace tbb;

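// CPU reference path: square values[offset, offset+size) in place using a
// TBB parallel_for over the index range. (Unused by main; kept for comparison.)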
void runCPU(int* values, int size, int offset) {

    parallel_for( blocked_range<int>(offset, offset+size),
                       [&](blocked_range<int> r)
    {
    	// int worker_index = tbb::task_arena::current_thread_index();
    	//printf("worker_index = %d\n", worker_index);
        for (int i=r.begin(); i<r.end(); ++i)
        {
            values[i] = values[i] * values[i];
            //printf("index = %d\n", i);
        }
    });
}

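// One thread per element: squares d_values[offset, offset+size) in place.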
__global__ void kernel(int* d_values, int size, int offset) {
	int tid = blockDim.x * blockIdx.x + threadIdx.x;

	if (tid < size) {
		//printf("%d %d\n", tid + offset, d_values[tid + offset]);
		d_values[tid + offset] = d_values[tid + offset] * d_values[tid + offset];
		//if ((tid + offset) == 160) printf("%d\n", d_values[tid + offset]);
	}
}

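// Launch the squaring kernel on a private stream (128 threads per block).
// The launch is asynchronous and cudaStreamDestroy does not block, so the
// caller must synchronize (here via cuCtxSynchronize) before using results.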
void runGPU(int* d_values, int size, int offset) {

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    //cout << offset << endl;

    kernel<<<(size + 128 - 1)/128, 128, 0, stream>>>(d_values, size, offset); 

    cudaStreamDestroy(stream);
}

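// Synchronous host-to-device copy of `size` ints on a private stream.
// (Unused by main; kept for reference.)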
void transferGPU(int* d_values, int* values, int size) {

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    printf("start transfer\n");
	CubDebugExit(cudaMemcpyAsync(d_values, values, size * sizeof(int), cudaMemcpyHostToDevice, stream));
	CubDebugExit(cudaStreamSynchronize(stream));
	printf("transfer done\n");

    cudaStreamDestroy(stream);
}

int main() {

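	// Host input, host result buffer, and device buffer for 64000 ints.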
	int* values = new int[64000];
	int* h_values = new int[64000];

	int* d_values;

	for (int i = 0; i < 64000; i++) {
		values[i] = i;
	}

    CHECK_CUDA_ERROR( cudaMalloc(&d_values, 64000 * sizeof(int)) );
    CHECK_CUDA_ERROR( cudaMemcpy(d_values, values, 64000 * sizeof(int), cudaMemcpyHostToDevice) );

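	// Time the CUPTI setup, kernel launches, and counter readout with CUDA events.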
	cudaEvent_t start, stop; 
	float time;

	cudaEventCreate(&start);
	cudaEventCreate(&stop); 
	cudaEventRecord(start, 0);

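	// --- CUPTI setup: count a hardware event across the kernel launches ---
	// Collection mode KERNEL accumulates the counter only while kernels run.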
    // CUPTI event to count. Other options: "elapsed_cycles_sm",
    // "inst_issued0", "l2_subp0_read_sector_misses".
    char const *cuptiEventName = "inst_executed";
    printf("Setup CUPTI counters internally for event '%s' (CUPTI_ONLY)\n", cuptiEventName);
    CUpti_EventGroup eg[1];
    CUpti_EventID *myevent = (CUpti_EventID*) calloc(1, sizeof(CUpti_EventID));   // Make space for event ids.

	// Initialize the driver API explicitly before creating the CUPTI session context.
	CUcontext sessionCtx = NULL;
	CHECK_CU_ERROR( cuInit(0), "cuInit" );
	CHECK_CU_ERROR( cuCtxCreate(&sessionCtx, 0, 0), "cuCtxCreate" );

    CHECK_CUPTI_ERROR( cuptiSetEventCollectionMode(sessionCtx, CUPTI_EVENT_COLLECTION_MODE_KERNEL), "cuptiSetEventCollectionMode" );
    CHECK_CUPTI_ERROR( cuptiEventGroupCreate( sessionCtx, &eg[0], 0 ), "cuptiEventGroupCreate" );
    CHECK_CUPTI_ERROR( cuptiEventGetIdFromName ( 0, cuptiEventName, &myevent[0] ), "cuptiEventGetIdFromName");
    printf("GPU %s=%u.\n", cuptiEventName, myevent[0]);
    CHECK_CUPTI_ERROR( cuptiEventGroupAddEvent( eg[0], myevent[0] ), "cuptiEventGroupAddEvent" );
    CHECK_CUPTI_ERROR( cuptiEventGroupEnable( eg[0] ), "cuptiEventGroupEnable");


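	// Launch the kernel twice in parallel from TBB worker threads, each
	// covering one 32000-element half of the array. cuCtxSetCurrent binds
	// the CUPTI session context to each worker thread before its launch.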
	parallel_for(int(0), 2, [=](int j){

		cuCtxSetCurrent(sessionCtx);

		// runCPU(values, 1000, j*1000);   // CPU variant of the same work, disabled
		runGPU(d_values, 32000, j*32000);

	});

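    // Wait for both kernels to finish, then read the accumulated counter
    // value out of the event group into tmp[0].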
    constexpr size_t size = 1024;
    size_t sizeBytes = size * sizeof(uint64_t);
    uint64_t buffer[size];
    uint64_t tmp[size] = {0};
    CHECK_CU_ERROR( cuCtxSynchronize(), "cuCtxSynchronize" );
    CHECK_CUPTI_ERROR( cuptiEventGroupReadEvent ( eg[0], CUPTI_EVENT_READ_FLAG_NONE, myevent[0], &sizeBytes, &tmp[0] ), "cuptiEventGroupReadEvent");
    buffer[0] = tmp[0];
    printf( "CUPTI %s counterValue %u (on one domain, may need to be multiplied by num of domains)\n", cuptiEventName, buffer[0] );
    // CHECK_CU_ERROR( cuCtxPopCurrent(&(ctx[i])), "cuCtxPopCurrent" );

    CHECK_CUPTI_ERROR( cuptiEventGroupDisable( eg[0] ), "cuptiEventGroupDisable" );
    CHECK_CUPTI_ERROR( cuptiEventGroupDestroy( eg[0] ), "cuptiEventGroupDestroy" );
    free(myevent);

    if (sessionCtx != NULL) {
        cuCtxDestroy(sessionCtx);
    }

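	// Stop the timer and report the elapsed wall-clock time.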
   	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&time, start, stop);

	cout << "Time Taken Total: " << time << endl;

	cudaMemcpy(h_values, d_values, 64000 * sizeof(int), cudaMemcpyDeviceToHost);

	// Verify: every element should now be the square of its original value.
	// (The product is formed in long long to avoid signed-overflow UB on the
	// host; for large i the square wraps past INT_MAX, matching the GPU.)
	for (int i = 0; i < 64000; i++) {
		assert(h_values[i] == (int) ((long long) values[i] * values[i]));
	}

	cudaFree(d_values);
	delete[] values;
	delete[] h_values;

	return 0;
}