// #include "QueryProcessing.h" // #include "QueryOptimizer.h" // #include "CPUGPUProcessing.h" // #include "CacheManager.h" // #include "CPUProcessing.h" // #include "CostModel.h" // #include "pcm-cache.cpp" #include "common.h" #define PROFILE 1 #define CHECK_CU_ERROR(err, cufunc) \ if (err != CUDA_SUCCESS) { printf ("Error %d for CUDA Driver API function '%s'\n", err, cufunc); return -1; } int main() { cudaSetDevice(0); CUdevice device; cuDeviceGet(&device, 0); cout << endl; // #ifdef PROFILE PAPI_library_init( PAPI_VER_CURRENT ); // InitMonitor(); // #endif // #ifdef PROFILE cout << endl; cout << "We are profiling" << endl; int retval; char const *EventName[] = { "cuda:::dram__bytes_read.sum:device=0", "cuda:::dram__bytes_write.sum:device=0"}; // CUPTI_11 event. int* events = new int [NUM_EVENTS]; int EventSet = PAPI_NULL; for(int i = 0; i < NUM_EVENTS; i++ ){ retval = PAPI_event_name_to_code( (char *)EventName[i], &events[i] ); assert(retval == PAPI_OK); } retval = PAPI_create_eventset( &EventSet ); assert(retval == PAPI_OK); retval = PAPI_add_events( EventSet, events, NUM_EVENTS ); assert(retval == PAPI_OK); // #endif CUcontext sessionCtx = NULL; CUcontext poppedCtx, curCtx; CHECK_CU_ERROR( cuCtxCreate(&sessionCtx, 0, device), "cuCtxCreate"); cuCtxGetCurrent(&curCtx); cout << curCtx << endl; CHECK_CU_ERROR( cuCtxPopCurrent(&poppedCtx), "cuCtxPopCurrent" ); cuCtxGetCurrent(&curCtx); cout << curCtx << endl; long long p_values[NUM_EVENTS]; long long read = 0, write = 0; for(int i = 0; i < NUM_EVENTS; i++ ){ p_values[i] = 0; } CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent"); retval = PAPI_start( EventSet ); assert(retval == PAPI_OK); CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" ); parallel_for(int(0), 64, [=](int j){ CUcontext poppedCtx; cuCtxPushCurrent(sessionCtx); // cuCtxSetCurrent(sessionCtx); cudaStream_t stream; cudaStreamCreate(&stream); cudaStreamSynchronize(stream); cudaStreamDestroy(stream); cuCtxPopCurrent(&poppedCtx); }); cout << endl; CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent"); CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" ); CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" ); retval = PAPI_stop( EventSet, p_values ); assert(retval == PAPI_OK); // retval = PAPI_reset(EventSet); assert(retval == PAPI_OK); for(int i = 0; i < NUM_EVENTS; i++ ) { printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] ); } CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent"); retval = PAPI_start( EventSet ); assert(retval == PAPI_OK); CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" ); parallel_for(int(0), 64, [=](int j){ CUcontext poppedCtx; cuCtxPushCurrent(sessionCtx); // cuCtxSetCurrent(sessionCtx); cudaStream_t stream; cudaStreamCreate(&stream); // cudaStreamSynchronize(stream); cudaStreamDestroy(stream); cuCtxPopCurrent(&poppedCtx); }); cout << endl; CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent"); CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" ); CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" ); retval = PAPI_stop( EventSet, p_values ); assert(retval == PAPI_OK); // retval = PAPI_reset(EventSet); assert(retval == PAPI_OK); for(int i = 0; i < NUM_EVENTS; i++ ) { printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] ); } if (sessionCtx != NULL) { cuCtxDestroy(sessionCtx); } // #ifdef PROFILE retval = PAPI_cleanup_eventset(EventSet); assert(retval == PAPI_OK); retval = PAPI_destroy_eventset(&EventSet); assert(retval == PAPI_OK); // #endif // #ifdef PROFILE PAPI_shutdown(); // #endif }