#include <assert.h>
#include <stdio.h>
#include <chrono>
#include <pthread.h>
#include "tbb/tbb.h"
#include "PerfEvent.hpp"
#include <cub/cub.cuh>
#include <curand.h>
#include <cuda.h>
#include <thread>
#include "crystal/crystal.cuh"
#include "papi.h"
// #include "papi_test.h"
// Number of PAPI events to sample. Currently referenced only by the
// commented-out PAPI instrumentation inside main().
#define NUM_EVENTS 1
using namespace cub;
using namespace std;
using namespace tbb;
// Squares every element of values[offset .. offset + size) in place on the
// CPU. The index range is handed to TBB's parallel_for, which invokes the
// lambda once per sub-range it carves out.
void runCPU(int* values, int size, int offset) {
  const int first = offset;
  const int last = offset + size;

  parallel_for(blocked_range<int>(first, last),
               [&](blocked_range<int> chunk) {
                 // Walk only the indices assigned to this invocation.
                 for (int idx = chunk.begin(); idx < chunk.end(); ++idx) {
                   const int v = values[idx];
                   values[idx] = v * v;
                 }
               });
}
// Writes the sequence 0, 1, 2, ..., size - 1 into h_values[0 .. size).
// `values` and `offset` are part of the signature but are not read by the
// current body (the squaring variant that used `values` is disabled).
void runCPU2(int* values, int* h_values, int size, int offset) {
  int next = 0;
  while (next < size) {
    h_values[next] = next;
    ++next;
  }
}
// Squares d_values[offset + tid] in place, where tid is the flat 1D global
// thread index. Each thread handles at most one element; threads whose index
// is >= size exit without touching memory, so the grid may be over-provisioned.
__global__ void kernel(int* d_values, int size, int offset) {
  const int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= size) {
    return;
  }

  int* slot = d_values + (tid + offset);
  const int v = *slot;
  *slot = v * v;
}
// Launches `kernel` on a freshly created stream to square
// d_values[offset .. offset + size) in place on the device.
//
// d_values: pointer handed to the kernel (it is dereferenced on the device).
// size:     number of elements to process; size <= 0 is a no-op.
// offset:   index of the first element to process.
//
// The launch is asynchronous: this function does NOT wait for the kernel to
// finish, so callers must synchronize before consuming the results. Any CUDA
// error terminates the process via CubDebugExit, matching transferGPU.
void runGPU(int* d_values, int size, int offset) {
  // A grid with zero blocks is an invalid launch configuration; there is
  // nothing to do for an empty range anyway.
  if (size <= 0) {
    return;
  }

  const int kThreadsPerBlock = 128;
  // Ceil-division written so that it cannot overflow for large `size`.
  const int numBlocks = (size - 1) / kThreadsPerBlock + 1;

  cudaStream_t stream;
  CubDebugExit(cudaStreamCreate(&stream));

  kernel<<<numBlocks, kThreadsPerBlock, 0, stream>>>(d_values, size, offset);
  // Kernel launches return nothing; launch errors surface via cudaGetLastError.
  CubDebugExit(cudaGetLastError());

  // cudaStreamDestroy returns immediately even if the kernel is still
  // running; the stream's resources are released once the queued work ends.
  CubDebugExit(cudaStreamDestroy(stream));
}
// Copies `size` ints from host buffer `values` to device buffer `d_values`
// using a private stream, and blocks until the copy has completed.
//
// d_values: destination of the host-to-device copy.
// values:   source of the host-to-device copy.
// size:     number of int elements to copy.
//
// Progress messages are printed to stdout before and after the copy. Any CUDA
// error (including stream creation/destruction) terminates the process via
// CubDebugExit.
void transferGPU(int* d_values, int* values, int size) {
  cudaStream_t stream;
  // Checked so that a failed creation never feeds an uninitialized stream
  // handle into the copy below.
  CubDebugExit(cudaStreamCreate(&stream));

  printf("start transfer\n");
  const size_t numBytes = static_cast<size_t>(size) * sizeof(int);
  CubDebugExit(cudaMemcpyAsync(d_values, values, numBytes, cudaMemcpyHostToDevice, stream));
  // Wait here so the data is resident on the device when we return.
  CubDebugExit(cudaStreamSynchronize(stream));
  printf("transfer done\n");

  CubDebugExit(cudaStreamDestroy(stream));
}
// Benchmark driver. Fills a host array with 0..kNumValues-1, then, between
// PerfEvent startCounters()/stopCounters(), sums the array and overwrites
// every element with that sum. The PerfEvent report is printed to stdout.
// The PAPI and CUDA-event instrumentation is kept below as disabled code.
int main() {
  // Element count shared by both buffers and every loop below.
  const int kNumValues = 64000000;

  int* values = new int[kNumValues];
  // Only referenced by the disabled runCPU2 experiment further down.
  int* h_values = new int[kNumValues];
  for (int i = 0; i < kNumValues; i++) {
    values[i] = i;
  }

  PerfEvent e;
  e.startCounters();

  // Measured region, pass 1: sequential read of the whole array.
  unsigned long long sum = 0;
  for (int i = 0; i < kNumValues; i++) {
    sum += values[i];
  }
  // Measured region, pass 2: sequential write of the whole array. The total
  // of 0..kNumValues-1 does not fit in an int, so the stored value is the
  // narrowed sum; the cast makes that narrowing explicit.
  const int narrowedSum = static_cast<int>(sum);
  for (int i = 0; i < kNumValues; i++) {
    values[i] = narrowedSum;
  }

  // printf("%llu\n", sum);
  // cudaEvent_t start, stop;
  // float time;
  // cudaEventCreate(&start);
  // cudaEventCreate(&stop);
  // cudaEventRecord(start, 0);
  // cout << retval << endl;
  // PAPI_library_init( PAPI_VER_CURRENT );
  // int retval;
  // char const *EventName[] = { "PAPI_L3_TCM"}; // CUPTI_11 event.
  // int eventCount = NUM_EVENTS;
  // int retval2;
  // int EventSet = PAPI_NULL;
  // long long p_values[NUM_EVENTS];
  // p_values[0] = 0;
  // retval2 = PAPI_create_eventset( &EventSet ); assert(retval2 == PAPI_OK);
  // retval2 = PAPI_add_event( EventSet, PAPI_L3_TCM); assert(retval2 == PAPI_OK);
  // retval2 = PAPI_start( EventSet ); assert(retval2 == PAPI_OK);
  // int tid = PAPI_thread_id();
  // printf("Initial thread id is: %lu\n",tid);
  // parallel_for(int(0), 64, [=](int j){
  // for (int j = 0; j < 1; j++) {
  // PAPI_register_thread();
  // int tid = PAPI_thread_id();
  // printf("Initial thread id is: %lu\n",tid);
  // runCPU2(values, h_values, 64000000, 0);
  // runCPU2(values, h_values, 1000000, j*1000000);
  // }
  // });

  e.stopCounters();

  // retval2 = PAPI_stop( EventSet, p_values ); assert(retval2 == PAPI_OK);
  // for(int i = 0; i < eventCount; i++ )
  // printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] );
  // retval2 = PAPI_cleanup_eventset(EventSet); assert(retval2 == PAPI_OK);
  // retval2 = PAPI_destroy_eventset(&EventSet); assert(retval2 == PAPI_OK);
  // PAPI_shutdown();

  e.printReport(cout, 1);  // second argument is the scale factor (1 here)
  cout << endl;

  // cudaEventRecord(stop, 0);
  // cudaEventSynchronize(stop);
  // cudaEventElapsedTime(&time, start, stop);
  // cout << "Time Taken Total: " << time << endl;

  // Release the host buffers (previously leaked until process exit).
  delete[] h_values;
  delete[] values;
  return 0;
}