#include <assert.h>
#include <stdio.h>
#include <chrono>
#include <pthread.h>
#include "tbb/tbb.h"
#include "PerfEvent.hpp"
#include "pcm-cache.cpp"
#include <cub/cub.cuh>
#include <curand.h>
#include <cuda.h>
#include <thread>
#include <cupti.h>
#include "crystal/crystal.cuh"
#include "papi.h"
// #include "papi_test.h"
#define NUM_EVENTS 2
using namespace cub;
using namespace std;
using namespace tbb;
#define CHECK_CU_ERROR(err, cufunc) \
if (err != CUDA_SUCCESS) { printf ("Error %d for CUDA Driver API function '%s'\n", err, cufunc); return -1; }
void runCPU(int* values, int* h_values, int size, int offset) {
parallel_for( blocked_range<int>(offset, offset+size),
[&](blocked_range<int> r)
{
// int worker_index = tbb::task_arena::current_thread_index();
//printf("worker_index = %d\n", worker_index);
for (int i=r.begin(); i<r.end(); ++i)
{
values[i] = values[i] * values[i];
//printf("index = %d\n", i);
}
});
}
void runCPU2(int* values, int* h_values, int size, int offset) {
for (int i = offset; i < offset+size; i++) {
values[i] = values[i] * values[i];
}
}
__global__ void kernel(int* d_values, int size, int offset) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < size) {
//printf("%d %d\n", tid + offset, d_values[tid + offset]);
d_values[tid + offset] = d_values[tid + offset] * d_values[tid + offset];
//if ((tid + offset) == 160) printf("%d\n", d_values[tid + offset]);
}
}
void runGPU(int* d_values, int size, int offset) {
cudaStream_t stream;
cudaStreamCreate(&stream);
//cout << offset << endl;
kernel<<<(size + 128 - 1)/128, 128, 0, stream>>>(d_values, size, offset);
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
}
void transferGPU(int* d_values, int* values, int size) {
cudaStream_t stream;
cudaStreamCreate(&stream);
printf("start transfer\n");
CubDebugExit(cudaMemcpyAsync(d_values, values, size * sizeof(int), cudaMemcpyHostToDevice, stream));
CubDebugExit(cudaStreamSynchronize(stream));
printf("transfer done\n");
cudaStreamDestroy(stream);
}
int main() {
cudaSetDevice(0);
CUdevice device;
cuDeviceGet(&device, 0);
// int* values = new int[640000000];
// int* h_values = new int[640000000];
int* values;
int* h_values;
cudaHostAlloc ( (void**) &values, 640000000 * sizeof(int), cudaHostAllocDefault);
cudaHostAlloc ( (void**) &h_values, 640000000 * sizeof(int), cudaHostAllocDefault);
int* d_values;
cout << "1" << endl;
for (int i = 0; i < 640000000; i++) {
values[i] = 2;
}
cudaMalloc(&(d_values), 640000000 * sizeof(int));
long long read = 0;
long long write = 0;
PAPI_library_init( PAPI_VER_CURRENT );
InitMonitor();
int retval;
char const *EventName[] = { "cuda:::dram__bytes_read.sum:device=0", "cuda:::dram__bytes_write.sum:device=0"}; // CUPTI_11 event.
int* events = new int [NUM_EVENTS];
int EventSet = PAPI_NULL;
for(int i = 0; i < NUM_EVENTS; i++ ){
retval = PAPI_event_name_to_code( (char *)EventName[i], &events[i] );
assert(retval == PAPI_OK);
}
retval = PAPI_create_eventset( &EventSet ); assert(retval == PAPI_OK);
retval = PAPI_add_events( EventSet, events, NUM_EVENTS ); assert(retval == PAPI_OK);
CUcontext sessionCtx = NULL;
CUcontext poppedCtx, curCtx;
CHECK_CU_ERROR( cuCtxCreate(&sessionCtx, 0, device), "cuCtxCreate");
cuCtxGetCurrent(&curCtx); cout << curCtx << endl;
CHECK_CU_ERROR( cuCtxPopCurrent(&poppedCtx), "cuCtxPopCurrent" );
cuCtxGetCurrent(&curCtx); cout << curCtx << endl;
long long p_values[NUM_EVENTS];
for(int i = 0; i < NUM_EVENTS; i++ ){
p_values[i] = 0;
}
CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent");
retval = PAPI_start( EventSet ); assert(retval == PAPI_OK);
CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" );
StartMonitor();
cudaMemcpy(d_values, values, 320000000 * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(h_values, d_values, 320000000 * sizeof(int), cudaMemcpyDeviceToHost);
// for (int i = 0; i < 10; i++) {
// cout << h_values[i] << endl;
// }
// parallel_for(int(0), 64, [=](int j){
// CUcontext poppedCtx;
// cuCtxPushCurrent(sessionCtx);
// cudaStream_t stream;
// cudaStreamCreate(&stream);
// cudaStreamSynchronize(stream);
// cudaStreamDestroy(stream);
// cuCtxPopCurrent(&poppedCtx);
// });
EndMonitor(read, write);
CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent");
CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" );
CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" );
retval = PAPI_stop( EventSet, p_values ); assert(retval == PAPI_OK);
retval = PAPI_reset(EventSet); assert(retval == PAPI_OK);
for(int i = 0; i < NUM_EVENTS; i++ ) {
printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] );
}
if (sessionCtx != NULL) {
cuCtxDestroy(sessionCtx);
}
retval = PAPI_cleanup_eventset(EventSet); assert(retval == PAPI_OK);
retval = PAPI_destroy_eventset(&EventSet); assert(retval == PAPI_OK);
PAPI_shutdown();
return 0;
}