src/gpudb/test/test_par6.cu · Lancelot


#include "common.h"

#define PROFILE 1

#define CHECK_CU_ERROR(err, cufunc)                                     \
    if (err != CUDA_SUCCESS) { printf ("Error %d for CUDA Driver API function '%s'\n", err, cufunc); return -1; }

// tbb::task_scheduler_init init(1);

int main() {

	cudaSetDevice(0);
	CUdevice device;
	cuDeviceGet(&device, 0);

	// bool verbose = 0;

	// srand(123);

	// unsigned int size = 52428800; //200 MB
	// double alpha = 2.5;
	// bool custom = true;
	// bool skipping = true;

	//2, 4, 8, 12, 16, 24, 32, 40
	//4, 10, 20, 30, 40
	
	//TODO: make it support cache size > 8 GB (there are lots of integer overflow resulting in negative offset to the gpuCache) should have used unsigned int everywhere
	// CPUGPUProcessing* cgp = new CPUGPUProcessing(size * 12, 0, 52428800 * 15, 52428800 * 20, verbose, custom, skipping);
	// QueryProcessing* qp;

	cout << endl;

	// CacheManager* cm = cgp->cm;

	// bool exit = 0;
	// string input, query, many, policy;
	// int many_query;
	// ReplacementPolicy repl_policy;
	// double time = 0;
	// long long cpu_traffic = 0, gpu_traffic = 0, pcie_traffic = 0, malloc_time_total = 0;
	// double time1 = 0, time2 = 0;
	// long long cpu_traffic1 = 0, cpu_traffic2 = 0;
	// long long gpu_traffic1 = 0, gpu_traffic2 = 0;
	// long long pcie_traffic1 = 0, pcie_traffic2 = 0;
	// double malloc_time_total1 = 0, malloc_time_total2 = 0;
	// Distribution dist = None;
	// int processed_segment = 0;
	// int skipped_segment = 0;
	// double mean = 0;

	// qp = new QueryProcessing(cgp, verbose, dist);

	// if (dist == Zipf) {
	// 	qp->qo->setDistributionZipfian(alpha);
	// } else if (dist == Norm) {
	// 	qp->qo->setDistributionNormal(mean, 0.5);
	// }

// #ifdef PROFILE
	PAPI_library_init( PAPI_VER_CURRENT );
	// InitMonitor();
// #endif

// #ifdef PROFILE
	cout << endl;
	cout << "We are profiling" << endl;

	int retval;
	char const *EventName[] = { "cuda:::dram__bytes_read.sum:device=0", "cuda:::dram__bytes_write.sum:device=0"}; // CUPTI_11 event.
	int* events = new int [NUM_EVENTS];
	int EventSet = PAPI_NULL;

	for(int i = 0; i < NUM_EVENTS; i++ ){
	    retval = PAPI_event_name_to_code( (char *)EventName[i], &events[i] );
	    assert(retval == PAPI_OK);
	}

	retval = PAPI_create_eventset( &EventSet ); assert(retval == PAPI_OK);
	retval = PAPI_add_events( EventSet, events, NUM_EVENTS ); assert(retval == PAPI_OK);
// #endif

	CUcontext sessionCtx = NULL;
	CUcontext poppedCtx, curCtx;
	CHECK_CU_ERROR( cuCtxCreate(&sessionCtx, 0, device), "cuCtxCreate");
	cuCtxGetCurrent(&curCtx); cout << curCtx << endl;
	CHECK_CU_ERROR( cuCtxPopCurrent(&poppedCtx), "cuCtxPopCurrent" );
	cuCtxGetCurrent(&curCtx); cout << curCtx << endl;

	// while (!exit) {
	// 	cout << "Select Options:" << endl;
	// 	cout << "1. Run Specific Query" << endl;
	// 	cout << "2. Run Random Queries" << endl;
	// 	cout << "3. Run Experiment" << endl;
	// 	cout << "4. Replacement" << endl;
	// 	cout << "5. Dump Trace" << endl;
	// 	cout << "6. Exit" << endl;
	// 	cout << "cache. Cache Specific Column" << endl;
	// 	cout << "clear. Delete Columns from GPU" << endl;
	// 	cout << "custom. Toggle custom malloc" << endl;
	// 	cout << "skipping. Toggle segment skipping" << endl;
	// 	cout << "pipelining. Toggle operator pipelining" << endl;
	// 	cout << "Your Input: ";
	// 	cin >> input;

	// 	if (input.compare("1") == 0) {
	// 		time = 0; cpu_traffic = 0; gpu_traffic = 0; pcie_traffic = 0; malloc_time_total = 0;
	// 		cgp->resetTime();
			// cout << "Input Query: ";
			// cin >> query;
			// qp->setQuery(stoi(query));

// #ifdef PROFILE
			long long p_values[NUM_EVENTS];
			long long read = 0, write = 0;
			
			for(int i = 0; i < NUM_EVENTS; i++ ){
				p_values[i] = 0;
			}

			// CHECK_CU_ERROR(cuCtxGetCurrent(&curCtx), "cuCtxGetCurrent"); cout << curCtx << endl;
			// CHECK_CU_ERROR(cuCtxSetCurrent(sessionCtx), "cuCtxSetCurrent");
			CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent");
			retval = PAPI_start( EventSet ); assert(retval == PAPI_OK);
			CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" );
			// CHECK_CU_ERROR(cuCtxSetCurrent(curCtx), "cuCtxSetCurrent");
			// StartMonitor();
// #endif

			parallel_for(int(0), 64, [=](int j){

				CUcontext poppedCtx;
				cuCtxPushCurrent(sessionCtx);

				cudaStream_t stream;
				cudaStreamCreate(&stream);



				cudaStreamSynchronize(stream);

				cudaStreamDestroy(stream);

				cuCtxPopCurrent(&poppedCtx);

			});

			// time1 = qp->processQuery(sessionCtx);
			// cpu_traffic1 = cgp->cpu_traffic_total;
			// gpu_traffic1 = cgp->gpu_traffic_total;
			// pcie_traffic1 = cgp->pcie_traffic_total;
			// malloc_time_total1 = cgp->malloc_time_total;

// #ifdef PROFILE

			// EndMonitor(read, write);
			cout << endl;

	        CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent");
	        CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" );
	        CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" );
			retval = PAPI_stop( EventSet, p_values ); assert(retval == PAPI_OK);
			retval = PAPI_reset(EventSet); assert(retval == PAPI_OK);
			// gpu_traffic = 0;
			for(int i = 0; i < NUM_EVENTS; i++ ) {
				printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] );
				// gpu_traffic += p_values[i];
			}
 		// 	cpu_traffic = read + write;
			// cout << "Read: " << read << " Write: " << write  << endl;
			// cout << "GPU traffic: " << gpu_traffic << " CPU traffic: " << cpu_traffic << endl;
// #endif

			// cgp->resetTime();
			// cout << endl;
			// cout << endl;

			// if (time1 <= time2) {
				// time += time1; cpu_traffic += cpu_traffic1; gpu_traffic += gpu_traffic1; pcie_traffic += pcie_traffic1; malloc_time_total += malloc_time_total1;
			// } else {
			// 	time += time2; cpu_traffic += cpu_traffic2; gpu_traffic += gpu_traffic2; pcie_traffic += pcie_traffic2; malloc_time_total += malloc_time_total2;
			// }

// 		} else if (input.compare("2") == 0) {
// 			time = 0; cpu_traffic = 0; gpu_traffic = 0; pcie_traffic = 0; malloc_time_total = 0;
// 			cout << "How many queries: ";
// 			cin >> many;
// 			many_query = stoi(many);
// 			cgp->resetTime();
// 			cgp->qo->processed_segment = 0;
// 			cgp->qo->skipped_segment = 0;
// 			cout << "Executing Random Query" << endl;
// 			for (int i = 0; i < many_query; i++) {
// 				qp->generate_rand_query();

// 				time1 = qp->processQuery(sessionCtx);
// 				cpu_traffic1 = cgp->cpu_traffic_total;
// 				gpu_traffic1 = cgp->gpu_traffic_total;
// 				pcie_traffic1 = cgp->pcie_traffic_total;
// 				malloc_time_total1 = cgp->malloc_time_total;
// 				cgp->resetTime();

// 				// time2 = qp->processQuery2();
// 				// cpu_traffic2 = cgp->cpu_traffic_total;
// 				// gpu_traffic2 = cgp->gpu_traffic_total;
// 				// pcie_traffic2 = cgp->pcie_traffic_total;
// 				// malloc_time_total2 = cgp->malloc_time_total;
// 				// cgp->resetTime();

// 				// if (time1 <= time2) {
// 					time += time1; cpu_traffic += cpu_traffic1; gpu_traffic += gpu_traffic1; pcie_traffic += pcie_traffic1; malloc_time_total += malloc_time_total1;
// 				// } else {
// 				// 	time += time2; cpu_traffic += cpu_traffic2; gpu_traffic += gpu_traffic2; pcie_traffic += pcie_traffic2; malloc_time_total += malloc_time_total2;
// 				// }
				
// 			}
// 			processed_segment = cgp->qo->processed_segment;
// 			skipped_segment = cgp->qo->skipped_segment;
// 			srand(123);
// 		} else if (input.compare("3") == 0) {
// 			time = 0; cpu_traffic = 0; gpu_traffic = 0; pcie_traffic = 0; malloc_time_total = 0;
// 			cout << "How many queries: ";
// 			cin >> many;
// 			many_query = stoi(many);

// 			cout << "Replacement Policy: ";
// 			cin >> policy;

// 			if (policy == "LRU") {
// 				repl_policy = LRU;
// 			} else if (policy == "LFU") {
// 				repl_policy = LFU;
// 			} else if (policy == "LRUSegmented") {
// 				repl_policy = LRUSegmented;
// 			} else if (policy == "LFUSegmented") {
// 				repl_policy = LFUSegmented;
// 			} else if (policy == "LRU2") {
// 				repl_policy = LRU2;
// 			} else if (policy == "LRU2Segmented") {
// 				repl_policy = LRU2Segmented;
// 			} else if (policy == "SemanticAware") {
// 				repl_policy = Segmented;
// 			}

// 			cuCtxGetCurrent(&curCtx); cout << curCtx << endl; cout <<sessionCtx << endl;

// 			cgp->resetTime();
// 			cgp->qo->processed_segment = 0;
// 			cgp->qo->skipped_segment = 0;

// 			cout << "Warmup" << endl;
// 			for (int i = 0; i < 100; i++) {
// 				qp->generate_rand_query();
// 				time1 = qp->processQuery(sessionCtx);
// 				cgp->resetTime();
// 			}
// 			cgp->cm->runReplacement(repl_policy);

// 			cout << "Run Experiment" << endl;

// #ifdef PROFILE
// 			cout << endl;
// 			cout << "We are profiling" << endl;

// 			long long p_values[NUM_EVENTS];
// 			long long read = 0, write = 0;
			
// 			for(int i = 0; i < NUM_EVENTS; i++ ){
// 				p_values[i] = 0;
// 			}
// #endif


// 			for (int iter = 0; iter < 24; iter++) {

// #ifdef PROFILE
// 				CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent");
// 				retval = PAPI_start( EventSet ); assert(retval == PAPI_OK);
// 				CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" );

// 				// StartMonitor();
// #endif

// 				for (int i = 0; i < many_query; i++) {
// 					qp->generate_rand_query();
// 					time1 = qp->processQuery(sessionCtx);
// 					cpu_traffic1 = cgp->cpu_traffic_total;
// 					gpu_traffic1 = cgp->gpu_traffic_total;
// 					pcie_traffic1 = cgp->pcie_traffic_total;
// 					malloc_time_total1 = cgp->malloc_time_total;
// 					cgp->resetTime();

// 					// time2 = qp->processQuery2();
// 					// cpu_traffic2 = cgp->cpu_traffic_total;
// 					// gpu_traffic2 = cgp->gpu_traffic_total;
// 					// pcie_traffic2 = cgp->pcie_traffic_total;
// 					// malloc_time_total2 = cgp->malloc_time_total;
// 					// cgp->resetTime();

// 					// if (time1 <= time2) {
// 						time += time1; cpu_traffic += cpu_traffic1; gpu_traffic += gpu_traffic1; pcie_traffic += pcie_traffic1; malloc_time_total += malloc_time_total1;
// 					// } else {
// 					// 	time += time2; cpu_traffic += cpu_traffic2; gpu_traffic += gpu_traffic2; pcie_traffic += pcie_traffic2; malloc_time_total += malloc_time_total2;
// 					// }
// 				}

// #ifdef PROFILE

// 				// EndMonitor(read, write);
// 				cout << endl;

// 				CHECK_CU_ERROR(cuCtxPushCurrent(sessionCtx), "cuCtxPushCurrent");
// 		        CHECK_CU_ERROR( cuCtxSynchronize( ), "cuCtxSynchronize" );
// 		        CHECK_CU_ERROR( cuCtxPopCurrent(&sessionCtx), "cuCtxPopCurrent" );
// 				retval = PAPI_stop( EventSet, p_values ); assert(retval == PAPI_OK);
// 				for(int i = 0; i < NUM_EVENTS; i++ ) {
// 					printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] );
// 				}
// 				retval = PAPI_reset(EventSet); assert(retval == PAPI_OK);
				
// #endif				

// 				cgp->cm->runReplacement(repl_policy);
// 				qp->percentageData();

// 				if (dist == Norm && iter % 3 == 0 && iter > 0) {
// 					mean = ((int) mean + 1) % 7;
// 					qp->qo->setDistributionNormal(mean, 1);
// 				}

// 			}




// #ifdef PROFILE
// 			gpu_traffic = 0;
// 			for(int i = 0; i < NUM_EVENTS; i++ ) {
// 				printf( "stop: %12lld \t=0X%016llX \t\t --> %s \n", p_values[i], p_values[i], EventName[i] );
// 				gpu_traffic += p_values[i];
// 			}
//  			cpu_traffic = read + write;
// 			cout << "Read: " << read << " Write: " << write  << endl;
// 			cout << "GPU traffic: " << gpu_traffic << " CPU traffic: " << cpu_traffic << endl;
// #endif


// 			processed_segment = cgp->qo->processed_segment;
// 			skipped_segment = cgp->qo->skipped_segment;
// 			srand(123);

// 			string runs = "logs/runs";
// 		    FILE *fptr = fopen(runs.c_str(), "a");
// 		    if (fptr == NULL)
// 		    {
// 		        printf("Could not open file\n");
// 		        assert(0);
// 		    }
// 		   	fprintf(fptr, "{\"cache_size\":%u,\"cumulated_time\":%.2f}\n", size, time);
// 		   	fclose(fptr);
// 		} else if (input.compare("4") == 0) {
// 			cout << "Replacement Policy: ";
// 			cin >> policy;

// 			if (policy == "LRU") {
// 				repl_policy = LRU;
// 			} else if (policy == "LFU") {
// 				repl_policy = LFU;
// 			} else if (policy == "LRUSegmented") {
// 				repl_policy = LRUSegmented;
// 			} else if (policy == "LFUSegmented") {
// 				repl_policy = LFUSegmented;
// 			} else if (policy == "LRU2") {
// 				repl_policy = LRU2;
// 			} else if (policy == "LRU2Segmented") {
// 				repl_policy = LRU2Segmented;
// 			} else if (policy == "SemanticAware") {
// 				repl_policy = Segmented;
// 			}

// 			cgp->cm->runReplacement(repl_policy);
// 			qp->percentageData();
// 			srand(123);
// 		} else if (input.compare("5") == 0) {
// 			string filename;
// 			cout << "File name: ";
// 			cin >> filename;
// 			qp->dumpTrace("logs/"+filename);
// 			cout << "Dumped Trace" << endl;
// 		} else if (input.compare("cache") == 0) {
// 			string column_name;
// 			int ret;
// 			do {
// 				cout << "	Column to cache: ";
// 				cin >> column_name;
// 				ret = cgp->cm->cacheSpecificColumn(column_name);
// 			} while (ret != 0);
// 		} else if (input.compare("clear") == 0) {
// 			cgp->cm->deleteAll();
// 		} else if (input.compare("skipping") == 0) {
// 			skipping = !skipping;
// 			cgp->skipping = skipping;
// 			cgp->qo->skipping = skipping;
// 			qp->skipping = skipping;
// 			if (skipping) cout << "Segment skipping is enabled" << endl;
// 			else cout << "Segment skipping is disabled" << endl;
// 		} else if (input.compare("custom") == 0) {
// 			custom = !custom;
// 			cgp->custom = custom;
// 			cgp->qo->custom = custom;
// 			qp->custom = custom;
// 			if (custom) cout << "Custom malloc is enabled" << endl;
// 			else cout << "Custom malloc is disabled" << endl;		
// 		} else {
// 			exit = true;
// 		}

// 		cout << endl;
// 		cout << "Cumulated Time: " << time << endl;
// 		cout << "CPU traffic: " << cpu_traffic << endl;
// 		cout << "GPU traffic: " << gpu_traffic << endl;
// 		cout << "PCIe traffic: " << pcie_traffic << endl;
// 		cout << "Malloc time: " << malloc_time_total << endl;
// 		cout << "Fraction Skipped Segment: " << skipped_segment * 1.0 /(processed_segment + skipped_segment) << endl;
// 		cout << endl;

// 		// cout << endl;


	// }

    if (sessionCtx != NULL) {
        cuCtxDestroy(sessionCtx);
    }


// #ifdef PROFILE
		retval = PAPI_cleanup_eventset(EventSet); assert(retval == PAPI_OK);
		retval = PAPI_destroy_eventset(&EventSet); assert(retval == PAPI_OK);
// #endif

// #ifdef PROFILE
		PAPI_shutdown();
// #endif

}