#include <assert.h>
#include <stdio.h>
#include <chrono>
#include <iostream>
#include <pthread.h>
#include <thread>

#include "tbb/tbb.h"
#include "PerfEvent.hpp"

#include <cub/cub.cuh>
#include <curand.h>
#include <cuda.h>

#include "crystal/crystal.cuh"
#include "papi.h"
// #include "papi_test.h"

#define NUM_EVENTS 1

using namespace cub;
using namespace std;
using namespace tbb;

/**
 * Squares values[offset .. offset + size) in place using a TBB parallel_for.
 *
 * @param values  host array; must hold at least offset + size elements
 * @param size    number of elements to process
 * @param offset  index of the first element to process
 */
void runCPU(int* values, int size, int offset) {
  parallel_for(blocked_range<int>(offset, offset + size),
               [&](const blocked_range<int>& r) {
    // int worker_index = tbb::task_arena::current_thread_index();
    // printf("worker_index = %d\n", worker_index);
    for (int i = r.begin(); i < r.end(); ++i) {
      const int v = values[i];
      values[i] = v * v;
      // printf("index = %d\n", i);
    }
  });
}

/**
 * Sequentially writes h_values[i] = i for i in [0, size).
 *
 * `values` and `offset` are not read by the current body; they are kept so
 * that existing call sites (and the commented-out squaring variant) still
 * match the signature.
 *
 * @param values    unused
 * @param h_values  destination host array with at least `size` elements
 * @param size      number of elements to write
 * @param offset    unused
 */
void runCPU2(int* values, int* h_values, int size, int offset) {
  (void)values;
  (void)offset;
  for (int i = 0; i < size; i++) {
    // h_values[i] = values[i] * values[i];
    h_values[i] = i;
  }
}

/**
 * Squares d_values[offset .. offset + size) in place, one element per thread.
 *
 * Only the x dimension of the launch is used, so this expects a 1D launch
 * with at least `size` threads in total; threads whose index is >= size do
 * nothing. Uses no shared memory.
 *
 * @param d_values  device array; must hold at least offset + size elements
 * @param size      number of elements to process
 * @param offset    index of the first element to process
 */
__global__ void kernel(int* d_values, int size, int offset) {
  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid < size) {
    // Form the element index in 64 bits so offset + tid cannot overflow int.
    const long long idx = static_cast<long long>(offset) + tid;
    // printf("%d %d\n", tid + offset, d_values[tid + offset]);
    const int v = d_values[idx];
    d_values[idx] = v * v;
    // if ((tid + offset) == 160) printf("%d\n", d_values[tid + offset]);
  }
}

/**
 * Launches `kernel` over d_values[offset .. offset + size) on a freshly
 * created stream and returns without waiting for the kernel to finish.
 *
 * The stream is destroyed right after the launch; per the CUDA runtime
 * documentation cudaStreamDestroy returns immediately and the stream's
 * resources are released once its pending work has completed. Any CUDA error
 * (stream creation, launch configuration, stream destruction) is reported
 * through CubDebugExit, which terminates the process. A non-positive `size`
 * is a no-op instead of an invalid zero-block launch.
 *
 * @param d_values  device array; must hold at least offset + size elements
 * @param size      number of elements to process
 * @param offset    index of the first element to process
 */
void runGPU(int* d_values, int size, int offset) {
  const int kThreadsPerBlock = 128;
  if (size <= 0) {
    return;  // nothing to do; a grid of 0 blocks is an invalid configuration
  }
  // Ceil-div written so it cannot overflow for size close to INT_MAX.
  const int numBlocks = (size - 1) / kThreadsPerBlock + 1;

  cudaStream_t stream;
  CubDebugExit(cudaStreamCreate(&stream));
  // cout << offset << endl;
  kernel<<<numBlocks, kThreadsPerBlock, 0, stream>>>(d_values, size, offset);
  CubDebugExit(cudaGetLastError());  // launches do not return errors directly
  CubDebugExit(cudaStreamDestroy(stream));
}

/**
 * Copies `size` ints from host array `values` to device array `d_values` and
 * blocks until the copy has completed, printing a message before and after.
 *
 * Any CUDA error is reported through CubDebugExit, which terminates the
 * process.
 *
 * @param d_values  destination device array with room for `size` ints
 * @param values    source host array holding at least `size` ints
 * @param size      number of ints to copy
 */
void transferGPU(int* d_values, int* values, int size) {
  cudaStream_t stream;
  CubDebugExit(cudaStreamCreate(&stream));
  printf("start transfer\n");
  CubDebugExit(cudaMemcpyAsync(d_values, values,
                               static_cast<size_t>(size) * sizeof(int),
                               cudaMemcpyHostToDevice, stream));
  CubDebugExit(cudaStreamSynchronize(stream));
  printf("transfer done\n");
  CubDebugExit(cudaStreamDestroy(stream));
}

/**
 * Benchmark driver: fills a 64M-element host array, then measures (with
 * PerfEvent) one pass that sums the array followed by one pass that
 * overwrites every element with that sum, and prints the counter report.
 *
 * The PAPI / cudaEvent experiments that used to surround the measured region
 * are kept below as comments.
 */
int main() {
  const int kNumValues = 64000000;

  int* values = new int[kNumValues];
  // h_values is only referenced by the commented-out runCPU2 experiments.
  int* h_values = new int[kNumValues];

  for (int i = 0; i < kNumValues; i++) {
    values[i] = i;
  }

  PerfEvent e;
  e.startCounters();

  unsigned long long sum = 0;
  for (int i = 0; i < kNumValues; i++) {
    sum += values[i];
  }

  // The total does not fit in an int; the narrowing conversion is the same
  // one the original implicit assignment performed, just made explicit and
  // hoisted out of the loop.
  const int narrowedSum = static_cast<int>(sum);
  for (int i = 0; i < kNumValues; i++) {
    values[i] = narrowedSum;
  }

  // printf("%llu\n", sum);

  // cudaEvent_t start, stop;
  // float time;
  // cudaEventCreate(&start);
  // cudaEventCreate(&stop);
  // cudaEventRecord(start, 0);

  // cout << retval << endl;

  // PAPI_library_init( PAPI_VER_CURRENT );
  // int retval;
  // char const *EventName[] = { "PAPI_L3_TCM"}; // CUPTI_11 event.
  // int eventCount = NUM_EVENTS;
  // int retval2;
  // int EventSet = PAPI_NULL;
  // long long p_values[NUM_EVENTS];
  // p_values[0] = 0;

  // retval2 = PAPI_create_eventset( &EventSet ); assert(retval2 == PAPI_OK);
  // retval2 = PAPI_add_event( EventSet, PAPI_L3_TCM); assert(retval2 == PAPI_OK);
  // retval2 = PAPI_start( EventSet ); assert(retval2 == PAPI_OK);

  // int tid = PAPI_thread_id();
  // printf("Initial thread id is: %lu\n",tid);

  // parallel_for(int(0), 64, [=](int j){
  // for (int j = 0; j < 1; j++) {
  //   PAPI_register_thread();
  //   int tid = PAPI_thread_id();
  //   printf("Initial thread id is: %lu\n",tid);
  //   runCPU2(values, h_values, 64000000, 0);
  //   runCPU2(values, h_values, 1000000, j*1000000);
  // }
  // });

  e.stopCounters();

  // retval2 = PAPI_stop( EventSet, p_values ); assert(retval2 == PAPI_OK);
  // for(int i = 0; i < eventCount; i++ )
  //   printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] );

  // retval2 = PAPI_cleanup_eventset(EventSet); assert(retval2 == PAPI_OK);
  // retval2 = PAPI_destroy_eventset(&EventSet); assert(retval2 == PAPI_OK);
  // PAPI_shutdown();

  e.printReport(cout, 1);  // second argument is the scale factor (1 here)

  cout << endl;

  // cudaEventRecord(stop, 0);
  // cudaEventSynchronize(stop);
  // cudaEventElapsedTime(&time, start, stop);

  // cout << "Time Taken Total: " << time << endl;

  // Release the host buffers that were previously leaked.
  delete[] h_values;
  delete[] values;

  return 0;
}