#include <assert.h>
#include <stdio.h>
#include <chrono>
#include <pthread.h>
#include "tbb/tbb.h"
#include "PerfEvent.hpp"
#include "pcm-cache.cpp"
#include <cub/cub.cuh>
#include <curand.h>
#include <cuda.h>
#include <thread>
#include <iostream>

#define NUM_EVENTS 2

using namespace cub;
using namespace std;
using namespace tbb;

// Threads per block used by runGPU when launching `kernel`.
static const int kThreadsPerBlock = 128;

/**
 * Squares `v` with two's-complement wrap-around.
 *
 * A plain `v * v` on `int` is undefined behaviour once |v| exceeds 46340.
 * Multiplying as unsigned is well defined and yields the same bit pattern;
 * the conversion back to int is modular on the compilers nvcc supports.
 */
static __host__ __device__ inline int squareWrapping(int v) {
    const unsigned int u = static_cast<unsigned int>(v);
    return static_cast<int>(u * u);
}

/**
 * Squares values[offset .. offset+size) in place using tbb::parallel_for.
 *
 * @param values  host array, must hold at least offset+size elements
 * @param size    number of elements to process
 * @param offset  index of the first element to process
 */
void runCPU(int* values, int size, int offset) {
    tbb::parallel_for(
        tbb::blocked_range<int>(offset, offset + size),
        [&](const tbb::blocked_range<int>& r) {
            // int worker_index = tbb::task_arena::current_thread_index();
            //printf("worker_index = %d\n", worker_index);
            for (int i = r.begin(); i < r.end(); ++i) {
                values[i] = squareWrapping(values[i]);
                //printf("index = %d\n", i);
            }
        });
}

/**
 * Sequential counterpart of runCPU: squares values[offset .. offset+size)
 * in place on the calling thread.
 *
 * @param values    host array, must hold at least offset+size elements
 * @param h_values  not read or written by this function; kept so the
 *                  signature stays unchanged for existing callers
 * @param size      number of elements to process
 * @param offset    index of the first element to process
 */
void runCPU2(int* values, int* h_values, int size, int offset) {
    (void)h_values;  // intentionally unused
    for (int i = offset; i < offset + size; i++) {
        values[i] = squareWrapping(values[i]);
    }
}

/**
 * Squares d_values[offset .. offset+size) in place, one element per thread.
 *
 * Expects a 1D launch with at least `size` threads in total; threads whose
 * global index is >= size do nothing. Uses no shared memory.
 */
__global__ void kernel(int* d_values, int size, int offset) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < size) {
        const int idx = tid + offset;
        //printf("%d %d\n", tid + offset, d_values[tid + offset]);
        d_values[idx] = squareWrapping(d_values[idx]);
        //if ((tid + offset) == 160) printf("%d\n", d_values[tid + offset]);
    }
}

/**
 * Launches `kernel` over d_values[offset .. offset+size) on a freshly created
 * stream and blocks until it has finished.
 *
 * Any CUDA error (stream handling, launch configuration, execution) is
 * reported through CubDebugExit, which terminates the process.
 *
 * @param d_values  device array, must hold at least offset+size elements
 * @param size      number of elements to process; nothing is launched when
 *                  size <= 0 because a grid of zero blocks is an invalid
 *                  launch configuration
 * @param offset    index of the first element to process
 */
void runGPU(int* d_values, int size, int offset) {
    if (size <= 0) {
        return;
    }

    cudaStream_t stream;
    CubDebugExit(cudaStreamCreate(&stream));

    //cout << offset << endl;
    // Ceil-division written so it cannot overflow for size close to INT_MAX.
    const int numBlocks = (size - 1) / kThreadsPerBlock + 1;
    kernel<<<numBlocks, kThreadsPerBlock, 0, stream>>>(d_values, size, offset);
    CubDebugExit(cudaGetLastError());             // launch-configuration errors
    CubDebugExit(cudaStreamSynchronize(stream));  // errors raised while running

    CubDebugExit(cudaStreamDestroy(stream));
}

/**
 * Copies `size` ints from host array `values` to device array `d_values` on a
 * freshly created stream and waits for the copy to complete. Progress is
 * printed to stdout before and after the copy.
 *
 * Any CUDA error is reported through CubDebugExit, which terminates the
 * process.
 */
void transferGPU(int* d_values, int* values, int size) {
    cudaStream_t stream;
    CubDebugExit(cudaStreamCreate(&stream));

    printf("start transfer\n");
    CubDebugExit(cudaMemcpyAsync(d_values, values, size * sizeof(int),
                                 cudaMemcpyHostToDevice, stream));
    CubDebugExit(cudaStreamSynchronize(stream));
    printf("transfer done\n");

    CubDebugExit(cudaStreamDestroy(stream));
}

/**
 * Benchmark driver: fills a large host array, copies it to the device once
 * untimed, then repeats the copy between StartMonitor()/EndMonitor() and a
 * pair of CUDA events, and prints the elapsed event time in milliseconds.
 */
int main() {
    const size_t kNumValues = 640000000;
    const size_t kNumBytes = kNumValues * sizeof(int);

    int* values = new int[kNumValues];
    // Only referenced by the disabled runCPU2 experiment below.
    int* h_values = new int[kNumValues];
    int* d_values = nullptr;

    for (size_t i = 0; i < kNumValues; i++) {
        values[i] = static_cast<int>(i);
    }

    CubDebugExit(cudaMalloc(&d_values, kNumBytes));
    // First, untimed copy; it runs before the start event is recorded.
    CubDebugExit(cudaMemcpy(d_values, values, kNumBytes, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float elapsedMs = 0.0f;
    CubDebugExit(cudaEventCreate(&start));
    CubDebugExit(cudaEventCreate(&stop));

    // Note: the event-timed region also contains the monitor calls below.
    CubDebugExit(cudaEventRecord(start, 0));

    // cout << retval << endl;

    // PerfEvent e;
    // e.startCounters();

    long long read = 0;
    long long write = 0;

    // NOTE(review): InitMonitor/StartMonitor/EndMonitor are not defined in
    // this file; presumably they come from pcm-cache.cpp -- confirm.
    InitMonitor();
    StartMonitor();

    CubDebugExit(cudaMemcpy(d_values, values, kNumBytes, cudaMemcpyHostToDevice));

    // sleep(1);

    // for (int j = 0; j < 64; j++) {
    // parallel_for(int(0), 64, [=](int j){
    //     // runCPU2(values, h_values, 1000000, j*1000000);
    //     runCPU(values, 1000000, j*1000000);
    // });
    // }

    // unsigned long long sum = 0;
    // for (int i = 0; i < 64000000; i++) {
    //     sum += values[i];
    // }

    // for (int i = 0; i < 64000000; i++) {
    //     values[i] = sum;
    // }

    // sleep(1);

    EndMonitor(read, write);

    // e.stopCounters();
    // e.printReport(cout, 1); // use n as scale factor
    //
    // cout << endl;

    CubDebugExit(cudaEventRecord(stop, 0));
    CubDebugExit(cudaEventSynchronize(stop));
    CubDebugExit(cudaEventElapsedTime(&elapsedMs, start, stop));

    cout << "Time Taken Total: " << elapsedMs << endl;

    // Release everything that was acquired above.
    CubDebugExit(cudaEventDestroy(start));
    CubDebugExit(cudaEventDestroy(stop));
    CubDebugExit(cudaFree(d_values));
    delete[] h_values;
    delete[] values;

    return 0;
}