src/gpudb/test/test_par2.cu · Lancelot

#include <assert.h>
#include <stdio.h>
#include <chrono>
#include <pthread.h>

#include "tbb/tbb.h"
#include "PerfEvent.hpp"

#include <cub/cub.cuh>
#include <curand.h>
#include <cuda.h>
#include <thread>

#include "crystal/crystal.cuh"
#include "papi.h"
// #include "papi_test.h"

#define NUM_EVENTS 1

using namespace cub;
using namespace std;
using namespace tbb;

/**
 * Squares a slice of a host array in place using TBB.
 *
 * The index range [offset, offset + size) is handed to parallel_for as a
 * blocked_range; each sub-range it produces is processed by the lambda below.
 *
 * @param values  host array; must be valid for indices offset .. offset+size-1
 * @param size    number of elements to square
 * @param offset  index of the first element to square
 */
void runCPU(int* values, int size, int offset) {
    const int first = offset;
    const int last = offset + size;

    // Body applied to each sub-range: replace every element by its square.
    auto squareChunk = [&](blocked_range<int> chunk) {
        int idx = chunk.begin();
        while (idx < chunk.end()) {
            const int v = values[idx];
            values[idx] = v * v;
            ++idx;
        }
    };

    parallel_for(blocked_range<int>(first, last), squareChunk);
}


/**
 * Fills h_values[0 .. size) with the sequence 0, 1, ..., size-1.
 *
 * @param values    accepted but not read by the current body
 * @param h_values  output array; must hold at least `size` ints
 * @param size      number of elements to write
 * @param offset    accepted but not used by the current body
 */
void runCPU2(int* values, int* h_values, int size, int offset) {
    // Disabled variant: h_values[k] = values[k] * values[k];
    int k = 0;
    while (k < size) {
        h_values[k] = k;
        ++k;
    }
}

/**
 * Squares one element of d_values per thread, in place.
 *
 * Each thread computes a flat index tid = blockDim.x * blockIdx.x + threadIdx.x
 * (only the x dimensions are used) and, when tid < size, replaces
 * d_values[tid + offset] by its square.
 *
 * @param d_values  array dereferenced on the device; must be valid for
 *                  indices offset .. offset+size-1
 * @param size      number of elements to process
 * @param offset    index of the first element to process
 */
__global__ void kernel(int* d_values, int size, int offset) {
	const int tid = blockDim.x * blockIdx.x + threadIdx.x;

	// Threads past the end of the requested range have nothing to do.
	if (tid >= size) return;

	int* slot = d_values + (tid + offset);
	const int v = *slot;
	*slot = v * v;
}

/**
 * Launches `kernel` over d_values[offset .. offset+size) on a newly created
 * stream, using 128 threads per block.
 *
 * The function does not wait for the kernel: there is no synchronize call
 * here, and the stream is destroyed right after the launch is enqueued.
 * Callers that read the results must synchronize themselves.
 *
 * Any CUDA error (stream creation, launch configuration, stream destruction)
 * is reported and terminates the process via CubDebugExit, matching
 * transferGPU.
 *
 * @param d_values  device array passed through to `kernel`
 * @param size      number of elements to process; size <= 0 is a no-op
 * @param offset    index of the first element to process
 */
void runGPU(int* d_values, int size, int offset) {

    // An empty range would request a grid of zero blocks, which is not a
    // valid launch configuration, so there is nothing to launch.
    if (size <= 0) return;

    const int threadsPerBlock = 128;
    // Ceil-division written so that it cannot overflow for large `size`.
    const int numBlocks = (size - 1) / threadsPerBlock + 1;

    cudaStream_t stream;
    CubDebugExit(cudaStreamCreate(&stream));

    kernel<<<numBlocks, threadsPerBlock, 0, stream>>>(d_values, size, offset);
    // A kernel launch returns no status itself; launch errors are read here.
    CubDebugExit(cudaGetLastError());

    CubDebugExit(cudaStreamDestroy(stream));
}

/**
 * Copies `size` ints from host array `values` to device array `d_values`
 * on a newly created stream and waits for the copy to complete.
 *
 * Progress messages are printed to stdout before the copy is issued and
 * after the stream has been synchronized. Every CUDA call is checked with
 * CubDebugExit, so any failure is reported and terminates the process.
 *
 * @param d_values  destination array on the device (at least `size` ints)
 * @param values    source array on the host (at least `size` ints)
 * @param size      number of ints to copy
 */
void transferGPU(int* d_values, int* values, int size) {

    cudaStream_t stream;
    CubDebugExit(cudaStreamCreate(&stream));

    printf("start transfer\n");
    CubDebugExit(cudaMemcpyAsync(d_values, values, size * sizeof(int), cudaMemcpyHostToDevice, stream));
    // Block until the copy has finished so the "done" message is truthful.
    CubDebugExit(cudaStreamSynchronize(stream));
    printf("transfer done\n");

    CubDebugExit(cudaStreamDestroy(stream));
}

/**
 * Benchmark driver.
 *
 * Allocates two host int arrays, initialises `values` with 0..N-1, then,
 * between PerfEvent startCounters()/stopCounters(), sums `values` and writes
 * the (narrowed) sum back into every element. The PerfEvent report is printed
 * to stdout afterwards. The PAPI and cudaEvent experiments are kept below as
 * disabled code.
 *
 * @return 0
 */
int main() {

	// Element count shared by both buffers and every loop below.
	constexpr int kNumValues = 64000000;

	int* values = new int[kNumValues];
	// Only referenced by the disabled runCPU2 experiment below; kept so that
	// code can be re-enabled unchanged.
	int* h_values = new int[kNumValues];

	for (int i = 0; i < kNumValues; i++) {
		values[i] = i;
	}

	PerfEvent e;
	e.startCounters();

	unsigned long long sum = 0;

	for (int i = 0; i < kNumValues; i++) {
		sum += values[i];
	}

	// The sum exceeds the range of a 32-bit int; the cast spells out the
	// narrowing that a plain assignment would perform implicitly.
	const int narrowedSum = static_cast<int>(sum);
	for (int i = 0; i < kNumValues; i++) {
		values[i] = narrowedSum;
	}

	// printf("%llu\n", sum);


	// cudaEvent_t start, stop;
	// float time;

	// cudaEventCreate(&start);
	// cudaEventCreate(&stop);
	// cudaEventRecord(start, 0);

	// cout << retval << endl;

	// PAPI_library_init( PAPI_VER_CURRENT );

	// int retval;
	// char const *EventName[] = { "PAPI_L3_TCM"}; // CUPTI_11 event.
	// int eventCount = NUM_EVENTS;
	// int retval2;
	// int EventSet = PAPI_NULL;
	// long long p_values[NUM_EVENTS];
	// p_values[0] = 0;

	// retval2 = PAPI_create_eventset( &EventSet ); assert(retval2 == PAPI_OK);
	// retval2 = PAPI_add_event( EventSet, PAPI_L3_TCM); assert(retval2 == PAPI_OK);

	// retval2 = PAPI_start( EventSet ); assert(retval2 == PAPI_OK);

	// int tid = PAPI_thread_id();
	// printf("Initial thread id is: %lu\n",tid);

	// parallel_for(int(0), 64, [=](int j){



	// for (int j = 0; j < 1; j++) {
		// PAPI_register_thread();

		// int tid = PAPI_thread_id();
		// printf("Initial thread id is: %lu\n",tid);

		// runCPU2(values, h_values, 64000000, 0);
		// runCPU2(values, h_values, 1000000, j*1000000);

	// }
	// });

	e.stopCounters();

	// retval2 = PAPI_stop( EventSet, p_values ); assert(retval2 == PAPI_OK);
	// for(int i = 0; i < eventCount; i++ )
	// 	printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] );

	// retval2 = PAPI_cleanup_eventset(EventSet); assert(retval2 == PAPI_OK);
	// retval2 = PAPI_destroy_eventset(&EventSet); assert(retval2 == PAPI_OK);

	// PAPI_shutdown();

	e.printReport(cout, 1); // use n as scale factor
	cout << endl;




	// cudaEventRecord(stop, 0);
	// cudaEventSynchronize(stop);
	// cudaEventElapsedTime(&time, start, stop);

	// cout << "Time Taken Total: " << time << endl;

	// Release the host buffers allocated at the top of main.
	delete[] h_values;
	delete[] values;

	return 0;

}