// Lancelot/src/gpudb/test/test_par.cu
#include <assert.h>
#include <stdio.h>
#include <chrono>
#include <pthread.h>

#include "tbb/tbb.h"
#include "PerfEvent.hpp"
#include "pcm-cache.cpp"

#include <cub/cub.cuh>
#include <curand.h>
#include <cuda.h>
#include <thread>
#include <cupti.h>

#include "crystal/crystal.cuh"
#include "papi.h"
// #include "papi_test.h"

#define NUM_EVENTS 2

using namespace cub;
using namespace std;
using namespace tbb;

// Minimal driver-API error check; only usable inside functions returning int.
#define CHECK_CU_ERROR(err, cufunc)                                     \
    do { if (err != CUDA_SUCCESS) { printf("Error %d for CUDA Driver API function '%s'\n", err, cufunc); return -1; } } while (0)

// Square values[offset, offset+size) on the CPU with a TBB parallel_for.
// h_values is unused here; the parameter is kept so the CPU and GPU helpers
// share a signature.
void runCPU(int* values, int* h_values, int size, int offset) {

    parallel_for( blocked_range<int>(offset, offset+size),
                       [&](const blocked_range<int>& r)
    {
    	// int worker_index = tbb::task_arena::current_thread_index();
    	// printf("worker_index = %d\n", worker_index);
        for (int i=r.begin(); i<r.end(); ++i)
        {
            values[i] = values[i] * values[i];
            //printf("index = %d\n", i);
        }
    });
}


// Sequential baseline: square values[offset, offset+size) on a single core.
void runCPU2(int* values, int* h_values, int size, int offset) {
    for (int i = offset; i < offset+size; i++) {
        values[i] = values[i] * values[i];
    }
}

// One thread per element: square d_values[offset + tid] for tid in [0, size).
__global__ void kernel(int* d_values, int size, int offset) {
	int tid = blockDim.x * blockIdx.x + threadIdx.x;

	if (tid < size) {
		//printf("%d %d\n", tid + offset, d_values[tid + offset]);
		d_values[tid + offset] = d_values[tid + offset] * d_values[tid + offset];
		//if ((tid + offset) == 160) printf("%d\n", d_values[tid + offset]);
	}
}

// Square size elements of d_values starting at offset on the GPU. A private
// stream is created per call so concurrent callers do not serialize on the
// default stream; 128 threads per block, one element per thread.
void runGPU(int* d_values, int size, int offset) {

    cudaStream_t stream;
    CubDebugExit(cudaStreamCreate(&stream));

    //cout << offset << endl;

    kernel<<<(size + 128 - 1)/128, 128, 0, stream>>>(d_values, size, offset);

    CubDebugExit(cudaStreamSynchronize(stream));

    CubDebugExit(cudaStreamDestroy(stream));
}

// Copy the first size ints of values from pinned host memory to the device
// through a private stream, synchronously from the caller's point of view.
void transferGPU(int* d_values, int* values, int size) {

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    printf("start transfer\n");
	CubDebugExit(cudaMemcpyAsync(d_values, values, size * sizeof(int), cudaMemcpyHostToDevice, stream));
	CubDebugExit(cudaStreamSynchronize(stream));
	printf("transfer done\n");

    cudaStreamDestroy(stream);
}
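
// A sketch (not called from main) of how the helpers above compose into a
// heterogeneous run: stage the first half on the GPU and square it there
// while the CPU squares the second half with TBB. The 50/50 split and the
// name runSplit are illustrative assumptions, not part of the original test.
void runSplit(int* values, int* h_values, int* d_values, int size) {
    int half = size / 2;
    transferGPU(d_values, values, half);                  // stage first half on the device
    std::thread gpu([&]{ runGPU(d_values, half, 0); });   // square it on the GPU
    runCPU(values, h_values, size - half, half);          // square the rest on the CPU
    gpu.join();
}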

int main() {

	cudaSetDevice(0);

	// The driver API is used alongside the runtime API below; initialize it
	// explicitly instead of relying on the runtime's implicit cuInit.
	CHECK_CU_ERROR( cuInit(0), "cuInit" );

	CUdevice device;
	CHECK_CU_ERROR( cuDeviceGet(&device, 0), "cuDeviceGet" );

	const int N = 640000000;   // 640M ints = 2.56 GB per buffer

	// int* values = new int[N];
	// int* h_values = new int[N];

	int* values;
	int* h_values;

	// Pinned host memory so the async copies in transferGPU can run
	// asynchronously and at full PCIe bandwidth.
	CubDebugExit(cudaHostAlloc( (void**) &values, N * sizeof(int), cudaHostAllocDefault ));
	CubDebugExit(cudaHostAlloc( (void**) &h_values, N * sizeof(int), cudaHostAllocDefault ));

	int* d_values;

	cout << "1" << endl;

	for (int i = 0; i < N; i++) {
		values[i] = 2;
	}

    CubDebugExit(cudaMalloc(&d_values, N * sizeof(int)));

	long long read = 0;
	long long write = 0;

	int retval = PAPI_library_init( PAPI_VER_CURRENT );
	assert(retval == PAPI_VER_CURRENT);
	InitMonitor();

	char const *EventName[] = { "cuda:::dram__bytes_read.sum:device=0", "cuda:::dram__bytes_write.sum:device=0" }; // CUPTI_11 events.
	int* events = new int[NUM_EVENTS];
	int EventSet = PAPI_NULL;

	for(int i = 0; i < NUM_EVENTS; i++ ){
	    retval = PAPI_event_name_to_code( (char *)EventName[i], &events[i] );
	    assert(retval == PAPI_OK);
	}

	retval = PAPI_create_eventset( &EventSet ); assert(retval == PAPI_OK);
	retval = PAPI_add_events( EventSet, events, NUM_EVENTS ); assert(retval == PAPI_OK);
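
	// Assumption about the PAPI cuda component: these device events are bound
	// to whichever CUDA context is current when PAPI_start runs, which is why
	// a dedicated driver context is created and pushed around PAPI_start below.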

	// Create a dedicated driver context for the PAPI measurement session, then
	// pop it so the runtime's primary context stays current for the cudaMemcpy
	// calls below. The couts show which context is current at each step.
	CUcontext sessionCtx = NULL;
	CUcontext poppedCtx, curCtx;
	CHECK_CU_ERROR( cuCtxCreate(&sessionCtx, 0, device), "cuCtxCreate" );
	cuCtxGetCurrent(&curCtx); cout << curCtx << endl;
	CHECK_CU_ERROR( cuCtxPopCurrent(&poppedCtx), "cuCtxPopCurrent" );
	cuCtxGetCurrent(&curCtx); cout << curCtx << endl;

	long long p_values[NUM_EVENTS] = { 0 };

	CHECK_CU_ERROR( cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent" );
	retval = PAPI_start( EventSet ); assert(retval == PAPI_OK);
	CHECK_CU_ERROR( cuCtxPopCurrent(&poppedCtx), "cuCtxPopCurrent" );

	StartMonitor();
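
	// StartMonitor/EndMonitor come from pcm-cache.cpp; they are assumed to
	// bracket a host-side (PCM-style) memory-traffic measurement that
	// accumulates into the read/write counters passed to EndMonitor.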

	CubDebugExit(cudaMemcpy(d_values, values, (N/2) * sizeof(int), cudaMemcpyHostToDevice));

	CubDebugExit(cudaMemcpy(h_values, d_values, (N/2) * sizeof(int), cudaMemcpyDeviceToHost));
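
	// Each copy moves (N/2) * sizeof(int) = 1.28 GB, so the H2D copy should
	// show up mostly in dram__bytes_write.sum and the D2H copy in
	// dram__bytes_read.sum on the device.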

	// for (int i = 0; i < 10; i++) {
	// 	cout << h_values[i] << endl;
	// }

	// parallel_for(int(0), 64, [=](int j){

	// 	CUcontext poppedCtx;
	// 	cuCtxPushCurrent(sessionCtx);

	// 	cudaStream_t stream;
	// 	cudaStreamCreate(&stream);



	// 	cudaStreamSynchronize(stream);

	// 	cudaStreamDestroy(stream);

	// 	cuCtxPopCurrent(&poppedCtx);

	// });
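
	// A sketch of what the dead block above appears to be building toward:
	// 64 tasks, each pushing the session context and squaring one
	// N/64-element chunk on its own stream via runGPU. Kept commented out,
	// like the original, so the copy-only measurement above is unchanged.
	//
	// parallel_for(int(0), 64, [=](int j){
	// 	CUcontext popped;
	// 	cuCtxPushCurrent(sessionCtx);
	// 	runGPU(d_values, N/64, j * (N/64));
	// 	cuCtxPopCurrent(&popped);
	// });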

	EndMonitor(read, write);
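
	// The original dropped these counters; report them. Treating the values
	// as bytes is an assumption about pcm-cache.cpp.
	printf("host-side traffic: read=%lld write=%lld\n", read, write);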

    CHECK_CU_ERROR( cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent" );
    CHECK_CU_ERROR( cuCtxSynchronize(), "cuCtxSynchronize" );
    CHECK_CU_ERROR( cuCtxPopCurrent(&poppedCtx), "cuCtxPopCurrent" );
	retval = PAPI_stop( EventSet, p_values ); assert(retval == PAPI_OK);
	retval = PAPI_reset(EventSet); assert(retval == PAPI_OK);

	for(int i = 0; i < NUM_EVENTS; i++ ) {
		printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], (unsigned long long) p_values[i], EventName[i] );
	}
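
	// Rough expectation (not asserted): with 1.28 GB copied in each direction,
	// both counters should be on the order of 1.28e9 bytes plus background traffic.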

    if (sessionCtx != NULL) {
        cuCtxDestroy(sessionCtx);
    }

	retval = PAPI_cleanup_eventset(EventSet); assert(retval == PAPI_OK);
	retval = PAPI_destroy_eventset(&EventSet); assert(retval == PAPI_OK);

	PAPI_shutdown();

	delete[] events;
	CubDebugExit(cudaFree(d_values));
	CubDebugExit(cudaFreeHost(values));
	CubDebugExit(cudaFreeHost(h_values));

	return 0;
}