#include <assert.h>
#include <stdio.h>
#include <stdint.h>
#include <iostream>
#include "tbb/tbb.h"
#include <cub/cub.cuh>
#include <cuda.h>
#include <cupti.h>

#define NUM_EVENTS 1

// Each checker evaluates its API call exactly once, stashes the result, and
// bails out of the enclosing function on failure.
#define CHECK_CU_ERROR(call, cufunc) \
  { CUresult _status = (call); \
    if (_status != CUDA_SUCCESS) { \
      printf("Error %d for CUDA Driver API function '%s'\n", _status, cufunc); \
      return -1; \
    } }

#define CHECK_CUDA_ERROR(call) \
  { cudaError_t _status = (call); \
    if (_status != cudaSuccess) { \
      printf("%s:%i Error %d for CUDA [%s]\n", __FILE__, __LINE__, _status, cudaGetErrorString(_status)); \
      return -1; \
    } }

#define CHECK_CUPTI_ERROR(call, cuptifunc) \
  { CUptiResult _status = (call); \
    if (_status != CUPTI_SUCCESS) { \
      const char *errStr; \
      cuptiGetResultString(_status, &errStr); \
      printf("%s:%i Error %d [%s] for CUPTI API function '%s'\n", __FILE__, __LINE__, _status, errStr, cuptifunc); \
      return -1; \
    } }

using namespace cub;
using namespace std;
using namespace tbb;

// Square values[offset, offset+size) on the CPU with a TBB parallel_for.
void runCPU(int* values, int size, int offset) {
  parallel_for(blocked_range<int>(offset, offset + size),
               [&](blocked_range<int> r) {
                 for (int i = r.begin(); i < r.end(); ++i) {
                   values[i] = values[i] * values[i];
                 }
               });
}

// Square d_values[offset, offset+size) on the GPU, one element per thread.
__global__ void kernel(int* d_values, int size, int offset) {
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid < size) {
    d_values[tid + offset] = d_values[tid + offset] * d_values[tid + offset];
  }
}

// Launch the squaring kernel on a private stream (128 threads per block, grid
// size rounded up). Destroying the stream right after an asynchronous launch
// is safe: destruction is deferred until the stream's work drains.
void runGPU(int* d_values, int size, int offset) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  kernel<<<(size + 128 - 1) / 128, 128, 0, stream>>>(d_values, size, offset);
  cudaStreamDestroy(stream);
}

// Copy host values to the device on a private stream and wait for completion.
void transferGPU(int* d_values, int* values, int size) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  printf("start transfer\n");
  CubDebugExit(cudaMemcpyAsync(d_values, values, size * sizeof(int), cudaMemcpyHostToDevice, stream));
  CubDebugExit(cudaStreamSynchronize(stream));
  printf("transfer done\n");
  cudaStreamDestroy(stream);
}

int main() {
  int* values = new int[64000];
  int* h_values = new int[64000];
  int* d_values;
  for (int i = 0; i < 64000; i++) {
    values[i] = i;
  }
  cudaMalloc(&d_values, 64000 * sizeof(int));
  cudaMemcpy(d_values, values, 64000 * sizeof(int), cudaMemcpyHostToDevice);

  cudaEvent_t start, stop;
  float time;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);

  // Other events that were tried: "elapsed_cycles_sm", "inst_issued0",
  // "l2_subp0_read_sector_misses".
  char const *cuptiEventName = "inst_executed";
  printf("Setup CUPTI counters internally for event '%s' (CUPTI_ONLY)\n", cuptiEventName);
  CUpti_EventGroup eg[NUM_EVENTS];
  CUpti_EventID *myevent = (CUpti_EventID*) calloc(NUM_EVENTS, sizeof(CUpti_EventID)); // Make space for event ids.
  // Create a CUDA context for the profiling session, restrict event collection
  // to kernel launches, and enable an event group for the chosen counter.
  CUcontext sessionCtx = NULL;
  CHECK_CU_ERROR(cuCtxCreate(&sessionCtx, 0, 0), "cuCtxCreate");
  CHECK_CUPTI_ERROR(cuptiSetEventCollectionMode(sessionCtx, CUPTI_EVENT_COLLECTION_MODE_KERNEL), "cuptiSetEventCollectionMode");
  CHECK_CUPTI_ERROR(cuptiEventGroupCreate(sessionCtx, &eg[0], 0), "cuptiEventGroupCreate");
  CHECK_CUPTI_ERROR(cuptiEventGetIdFromName(0, cuptiEventName, &myevent[0]), "cuptiEventGetIdFromName");
  printf("GPU %s=%u.\n", cuptiEventName, myevent[0]);
  CHECK_CUPTI_ERROR(cuptiEventGroupAddEvent(eg[0], myevent[0]), "cuptiEventGroupAddEvent");
  CHECK_CUPTI_ERROR(cuptiEventGroupEnable(eg[0]), "cuptiEventGroupEnable");

  // Launch the two kernel halves from two TBB worker threads. Each worker must
  // bind the session context so its launches are attributed to the event group.
  parallel_for(int(0), 2, [=](int j) {
    cuCtxSetCurrent(sessionCtx);
    // runCPU(values, 1000, j*1000);  // CPU variant of the same work.
    runGPU(d_values, 32000, j * 32000);
  });

  // Wait for both kernels, then read the counter value out of the event group.
  const size_t size = 1024;
  size_t sizeBytes = size * sizeof(uint64_t);
  uint64_t buffer[size];
  uint64_t tmp[size] = {0};
  cuCtxSynchronize();
  CHECK_CUPTI_ERROR(cuptiEventGroupReadEvent(eg[0], CUPTI_EVENT_READ_FLAG_NONE, myevent[0], &sizeBytes, &tmp[0]), "cuptiEventGroupReadEvent");
  buffer[0] = tmp[0];
  printf("CUPTI %s counterValue %llu (on one domain, may need to be multiplied by num of domains)\n",
         cuptiEventName, (unsigned long long) buffer[0]);

  cuptiEventGroupDisable(eg[0]);
  cuptiEventGroupDestroy(eg[0]);
  if (sessionCtx != NULL) {
    cuCtxDestroy(sessionCtx);
  }

  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&time, start, stop);
  cout << "Time Taken Total: " << time << endl;

  cudaMemcpy(h_values, d_values, 64000 * sizeof(int), cudaMemcpyDeviceToHost);
  // Sanity check: every element was squared exactly once on the GPU.
  // for (int i = 0; i < 64000; i++) {
  //   assert(h_values[i] == values[i] * values[i]);
  // }

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaFree(d_values);
  free(myevent);
  delete[] values;
  delete[] h_values;
  return 0;
}
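// ---------------------------------------------------------------------------
// Build sketch, assuming a standard CUDA toolkit layout; the file name and the
// TBB/CUPTI paths below are placeholders for the local setup:
//
//   nvcc -O2 -std=c++14 square_cupti.cu -o square_cupti \
//        -I$CUDA_HOME/extras/CUPTI/include -L$CUDA_HOME/extras/CUPTI/lib64 \
//        -lcuda -lcupti -ltbb
//
// cuptiEventGroup* belongs to the legacy CUPTI Event API; newer GPU
// architectures are served by the CUPTI Profiling API instead, so this code
// targets older devices and toolkits.
// ---------------------------------------------------------------------------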