#include "common.h" #define PROFILE 1 #define CHECK_CU_ERROR(err, cufunc) \ if (err != CUDA_SUCCESS) { printf ("Error %d for CUDA Driver API function '%s'\n", err, cufunc); return -1; } // tbb::task_scheduler_init init(1); int main() { cudaSetDevice(0); CUdevice device; cuDeviceGet(&device, 0); // bool verbose = 0; // srand(123); // unsigned int size = 52428800; //200 MB // double alpha = 2.5; // bool custom = true; // bool skipping = true; //2, 4, 8, 12, 16, 24, 32, 40 //4, 10, 20, 30, 40 //TODO: make it support cache size > 8 GB (there are lots of integer overflow resulting in negative offset to the gpuCache) should have used unsigned int everywhere // CPUGPUProcessing* cgp = new CPUGPUProcessing(size * 12, 0, 52428800 * 15, 52428800 * 20, verbose, custom, skipping); // QueryProcessing* qp; cout << endl; // CacheManager* cm = cgp->cm; // bool exit = 0; // string input, query, many, policy; // int many_query; // ReplacementPolicy repl_policy; // double time = 0; // long long cpu_traffic = 0, gpu_traffic = 0, pcie_traffic = 0, malloc_time_total = 0; // double time1 = 0, time2 = 0; // long long cpu_traffic1 = 0, cpu_traffic2 = 0; // long long gpu_traffic1 = 0, gpu_traffic2 = 0; // long long pcie_traffic1 = 0, pcie_traffic2 = 0; // double malloc_time_total1 = 0, malloc_time_total2 = 0; // Distribution dist = None; // int processed_segment = 0; // int skipped_segment = 0; // double mean = 0; // qp = new QueryProcessing(cgp, verbose, dist); // if (dist == Zipf) { // qp->qo->setDistributionZipfian(alpha); // } else if (dist == Norm) { // qp->qo->setDistributionNormal(mean, 0.5); // } // #ifdef PROFILE PAPI_library_init( PAPI_VER_CURRENT ); // InitMonitor(); // #endif // #ifdef PROFILE cout << endl; cout << "We are profiling" << endl; int retval; char const *EventName[] = { "cuda:::dram__bytes_read.sum:device=0", "cuda:::dram__bytes_write.sum:device=0"}; // CUPTI_11 event. int* events = new int [NUM_EVENTS]; int EventSet = PAPI_NULL; for(int i = 0; i < NUM_EVENTS; i++ ){ retval = PAPI_event_name_to_code( (char *)EventName[i], &events[i] ); assert(retval == PAPI_OK); } retval = PAPI_create_eventset( &EventSet ); assert(retval == PAPI_OK); retval = PAPI_add_events( EventSet, events, NUM_EVENTS ); assert(retval == PAPI_OK); // #endif CUcontext sessionCtx = NULL; CUcontext poppedCtx, curCtx; CHECK_CU_ERROR( cuCtxCreate(&sessionCtx, 0, device), "cuCtxCreate"); cuCtxGetCurrent(&curCtx); cout << curCtx << endl; CHECK_CU_ERROR( cuCtxPopCurrent(&poppedCtx), "cuCtxPopCurrent" ); cuCtxGetCurrent(&curCtx); cout << curCtx << endl; // while (!exit) { // cout << "Select Options:" << endl; // cout << "1. Run Specific Query" << endl; // cout << "2. Run Random Queries" << endl; // cout << "3. Run Experiment" << endl; // cout << "4. Replacement" << endl; // cout << "5. Dump Trace" << endl; // cout << "6. Exit" << endl; // cout << "cache. Cache Specific Column" << endl; // cout << "clear. Delete Columns from GPU" << endl; // cout << "custom. Toggle custom malloc" << endl; // cout << "skipping. Toggle segment skipping" << endl; // cout << "pipelining. Toggle operator pipelining" << endl; // cout << "Your Input: "; // cin >> input; // if (input.compare("1") == 0) { // time = 0; cpu_traffic = 0; gpu_traffic = 0; pcie_traffic = 0; malloc_time_total = 0; // cgp->resetTime(); // cout << "Input Query: "; // cin >> query; // qp->setQuery(stoi(query)); // #ifdef PROFILE long long p_values[NUM_EVENTS]; long long read = 0, write = 0; for(int i = 0; i < NUM_EVENTS; i++ ){ p_values[i] = 0; } // CHECK_CU_ERROR(cuCtxGetCurrent(&curCtx), "cuCtxGetCurrent"); cout << curCtx << endl; // CHECK_CU_ERROR(cuCtxSetCurrent(sessionCtx), "cuCtxSetCurrent"); CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent"); retval = PAPI_start( EventSet ); assert(retval == PAPI_OK); CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" ); // CHECK_CU_ERROR(cuCtxSetCurrent(curCtx), "cuCtxSetCurrent"); // StartMonitor(); // #endif parallel_for(int(0), 64, [=](int j){ CUcontext poppedCtx; cuCtxPushCurrent(sessionCtx); cudaStream_t stream; cudaStreamCreate(&stream); cudaStreamSynchronize(stream); cudaStreamDestroy(stream); cuCtxPopCurrent(&poppedCtx); }); // time1 = qp->processQuery(sessionCtx); // cpu_traffic1 = cgp->cpu_traffic_total; // gpu_traffic1 = cgp->gpu_traffic_total; // pcie_traffic1 = cgp->pcie_traffic_total; // malloc_time_total1 = cgp->malloc_time_total; // #ifdef PROFILE // EndMonitor(read, write); cout << endl; CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent"); CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" ); CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" ); retval = PAPI_stop( EventSet, p_values ); assert(retval == PAPI_OK); retval = PAPI_reset(EventSet); assert(retval == PAPI_OK); // gpu_traffic = 0; for(int i = 0; i < NUM_EVENTS; i++ ) { printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] ); // gpu_traffic += p_values[i]; } // cpu_traffic = read + write; // cout << "Read: " << read << " Write: " << write << endl; // cout << "GPU traffic: " << gpu_traffic << " CPU traffic: " << cpu_traffic << endl; // #endif // cgp->resetTime(); // cout << endl; // cout << endl; // if (time1 <= time2) { // time += time1; cpu_traffic += cpu_traffic1; gpu_traffic += gpu_traffic1; pcie_traffic += pcie_traffic1; malloc_time_total += malloc_time_total1; // } else { // time += time2; cpu_traffic += cpu_traffic2; gpu_traffic += gpu_traffic2; pcie_traffic += pcie_traffic2; malloc_time_total += malloc_time_total2; // } // } else if (input.compare("2") == 0) { // time = 0; cpu_traffic = 0; gpu_traffic = 0; pcie_traffic = 0; malloc_time_total = 0; // cout << "How many queries: "; // cin >> many; // many_query = stoi(many); // cgp->resetTime(); // cgp->qo->processed_segment = 0; // cgp->qo->skipped_segment = 0; // cout << "Executing Random Query" << endl; // for (int i = 0; i < many_query; i++) { // qp->generate_rand_query(); // time1 = qp->processQuery(sessionCtx); // cpu_traffic1 = cgp->cpu_traffic_total; // gpu_traffic1 = cgp->gpu_traffic_total; // pcie_traffic1 = cgp->pcie_traffic_total; // malloc_time_total1 = cgp->malloc_time_total; // cgp->resetTime(); // // time2 = qp->processQuery2(); // // cpu_traffic2 = cgp->cpu_traffic_total; // // gpu_traffic2 = cgp->gpu_traffic_total; // // pcie_traffic2 = cgp->pcie_traffic_total; // // malloc_time_total2 = cgp->malloc_time_total; // // cgp->resetTime(); // // if (time1 <= time2) { // time += time1; cpu_traffic += cpu_traffic1; gpu_traffic += gpu_traffic1; pcie_traffic += pcie_traffic1; malloc_time_total += malloc_time_total1; // // } else { // // time += time2; cpu_traffic += cpu_traffic2; gpu_traffic += gpu_traffic2; pcie_traffic += pcie_traffic2; malloc_time_total += malloc_time_total2; // // } // } // processed_segment = cgp->qo->processed_segment; // skipped_segment = cgp->qo->skipped_segment; // srand(123); // } else if (input.compare("3") == 0) { // time = 0; cpu_traffic = 0; gpu_traffic = 0; pcie_traffic = 0; malloc_time_total = 0; // cout << "How many queries: "; // cin >> many; // many_query = stoi(many); // cout << "Replacement Policy: "; // cin >> policy; // if (policy == "LRU") { // repl_policy = LRU; // } else if (policy == "LFU") { // repl_policy = LFU; // } else if (policy == "LRUSegmented") { // repl_policy = LRUSegmented; // } else if (policy == "LFUSegmented") { // repl_policy = LFUSegmented; // } else if (policy == "LRU2") { // repl_policy = LRU2; // } else if (policy == "LRU2Segmented") { // repl_policy = LRU2Segmented; // } else if (policy == "SemanticAware") { // repl_policy = Segmented; // } // cuCtxGetCurrent(&curCtx); cout << curCtx << endl; cout <<sessionCtx << endl; // cgp->resetTime(); // cgp->qo->processed_segment = 0; // cgp->qo->skipped_segment = 0; // cout << "Warmup" << endl; // for (int i = 0; i < 100; i++) { // qp->generate_rand_query(); // time1 = qp->processQuery(sessionCtx); // cgp->resetTime(); // } // cgp->cm->runReplacement(repl_policy); // cout << "Run Experiment" << endl; // #ifdef PROFILE // cout << endl; // cout << "We are profiling" << endl; // long long p_values[NUM_EVENTS]; // long long read = 0, write = 0; // for(int i = 0; i < NUM_EVENTS; i++ ){ // p_values[i] = 0; // } // #endif // for (int iter = 0; iter < 24; iter++) { // #ifdef PROFILE // CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent"); // retval = PAPI_start( EventSet ); assert(retval == PAPI_OK); // CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" ); // // StartMonitor(); // #endif // for (int i = 0; i < many_query; i++) { // qp->generate_rand_query(); // time1 = qp->processQuery(sessionCtx); // cpu_traffic1 = cgp->cpu_traffic_total; // gpu_traffic1 = cgp->gpu_traffic_total; // pcie_traffic1 = cgp->pcie_traffic_total; // malloc_time_total1 = cgp->malloc_time_total; // cgp->resetTime(); // // time2 = qp->processQuery2(); // // cpu_traffic2 = cgp->cpu_traffic_total; // // gpu_traffic2 = cgp->gpu_traffic_total; // // pcie_traffic2 = cgp->pcie_traffic_total; // // malloc_time_total2 = cgp->malloc_time_total; // // cgp->resetTime(); // // if (time1 <= time2) { // time += time1; cpu_traffic += cpu_traffic1; gpu_traffic += gpu_traffic1; pcie_traffic += pcie_traffic1; malloc_time_total += malloc_time_total1; // // } else { // // time += time2; cpu_traffic += cpu_traffic2; gpu_traffic += gpu_traffic2; pcie_traffic += pcie_traffic2; malloc_time_total += malloc_time_total2; // // } // } // #ifdef PROFILE // // EndMonitor(read, write); // cout << endl; // CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent"); // CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" ); // CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" ); // retval = PAPI_stop( EventSet, p_values ); assert(retval == PAPI_OK); // for(int i = 0; i < NUM_EVENTS; i++ ) { // printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] ); // } // retval = PAPI_reset(EventSet); assert(retval == PAPI_OK); // #endif // cgp->cm->runReplacement(repl_policy); // qp->percentageData(); // if (dist == Norm && iter % 3 == 0 && iter > 0) { // mean = ((int) mean + 1) % 7; // qp->qo->setDistributionNormal(mean, 1); // } // } // #ifdef PROFILE // gpu_traffic = 0; // for(int i = 0; i < NUM_EVENTS; i++ ) { // printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] ); // gpu_traffic += p_values[i]; // } // cpu_traffic = read + write; // cout << "Read: " << read << " Write: " << write << endl; // cout << "GPU traffic: " << gpu_traffic << " CPU traffic: " << cpu_traffic << endl; // #endif // processed_segment = cgp->qo->processed_segment; // skipped_segment = cgp->qo->skipped_segment; // srand(123); // string runs = "logs/runs"; // FILE *fptr = fopen(runs.c_str(), "a"); // if (fptr == NULL) // { // printf("Could not open file\n"); // assert(0); // } // fprintf(fptr, "{\"cache_size\":%u,\"cumulated_time\":%.2f}\n", size, time); // fclose(fptr); // } else if (input.compare("4") == 0) { // cout << "Replacement Policy: "; // cin >> policy; // if (policy == "LRU") { // repl_policy = LRU; // } else if (policy == "LFU") { // repl_policy = LFU; // } else if (policy == "LRUSegmented") { // repl_policy = LRUSegmented; // } else if (policy == "LFUSegmented") { // repl_policy = LFUSegmented; // } else if (policy == "LRU2") { // repl_policy = LRU2; // } else if (policy == "LRU2Segmented") { // repl_policy = LRU2Segmented; // } else if (policy == "SemanticAware") { // repl_policy = Segmented; // } // cgp->cm->runReplacement(repl_policy); // qp->percentageData(); // srand(123); // } else if (input.compare("5") == 0) { // string filename; // cout << "File name: "; // cin >> filename; // qp->dumpTrace("logs/"+filename); // cout << "Dumped Trace" << endl; // } else if (input.compare("cache") == 0) { // string column_name; // int ret; // do { // cout << " Column to cache: "; // cin >> column_name; // ret = cgp->cm->cacheSpecificColumn(column_name); // } while (ret != 0); // } else if (input.compare("clear") == 0) { // cgp->cm->deleteAll(); // } else if (input.compare("skipping") == 0) { // skipping = !skipping; // cgp->skipping = skipping; // cgp->qo->skipping = skipping; // qp->skipping = skipping; // if (skipping) cout << "Segment skipping is enabled" << endl; // else cout << "Segment skipping is disabled" << endl; // } else if (input.compare("custom") == 0) { // custom = !custom; // cgp->custom = custom; // cgp->qo->custom = custom; // qp->custom = custom; // if (custom) cout << "Custom malloc is enabled" << endl; // else cout << "Custom malloc is disabled" << endl; // } else { // exit = true; // } // cout << endl; // cout << "Cumulated Time: " << time << endl; // cout << "CPU traffic: " << cpu_traffic << endl; // cout << "GPU traffic: " << gpu_traffic << endl; // cout << "PCIe traffic: " << pcie_traffic << endl; // cout << "Malloc time: " << malloc_time_total << endl; // cout << "Fraction Skipped Segment: " << skipped_segment * 1.0 /(processed_segment + skipped_segment) << endl; // cout << endl; // // cout << endl; // } if (sessionCtx != NULL) { cuCtxDestroy(sessionCtx); } // #ifdef PROFILE retval = PAPI_cleanup_eventset(EventSet); assert(retval == PAPI_OK); retval = PAPI_destroy_eventset(&EventSet); assert(retval == PAPI_OK); // #endif // #ifdef PROFILE PAPI_shutdown(); // #endif }