// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <iostream>
#include <stdio.h>
#include <chrono>
#include <curand.h>
#include <cuda.h>

#include <cub/util_allocator.cuh>
#include "cub/test/test_util.h"

#include "crystal/crystal.cuh"

#include "gpu_utils.h"
#include "ssb_utils.h"

using namespace std;

/**
 * Globals, constants and typedefs
 */
bool g_verbose = false;  // Whether to display input/output to console
cub::CachingDeviceAllocator g_allocator(true);  // Caching allocator for device memory

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void QueryKernel(int* lo_orderdate, int* lo_discount, int* lo_quantity,
    int* lo_extendedprice, int lo_num_entries, unsigned long long* revenue, int* d_total) {
  // Load a segment of consecutive items that are blocked across threads
  int items[ITEMS_PER_THREAD];
  int selection_flags[ITEMS_PER_THREAD];
  int items2[ITEMS_PER_THREAD];

  long long sum = 0;

  int tile_offset = blockIdx.x * TILE_SIZE;
  int num_tiles = (lo_num_entries + TILE_SIZE - 1) / TILE_SIZE;
  int num_tile_items = TILE_SIZE;
  int t_count = 0;

  // The last tile may be only partially filled
  if (blockIdx.x == num_tiles - 1) {
    num_tile_items = lo_num_entries - tile_offset;
  }

  InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags);

  // Predicate: lo_discount between 1 and 3
  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_discount + tile_offset, items, num_tile_items);
  BlockPredAndGTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 1, selection_flags, num_tile_items);
  BlockPredAndLTE<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 3, selection_flags, num_tile_items);

  // Predicate: lo_quantity < 25
  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_quantity + tile_offset, items, num_tile_items);
  BlockPredAndLT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 25, selection_flags, num_tile_items);

  // Count rows passing the discount and quantity predicates
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items)
      if (selection_flags[ITEM]) {
        t_count++;
      }
  }

  // Predicate: lo_orderdate within 1993 (dates stored as yyyymmdd integers)
  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_orderdate + tile_offset, items, num_tile_items);
  BlockPredAndGT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 19930000, selection_flags, num_tile_items);
  BlockPredAndLT<int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, 19940000, selection_flags, num_tile_items);

  // Aggregate: sum(lo_extendedprice * lo_discount) over the selected rows
  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_discount + tile_offset, items, num_tile_items);
  BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(lo_extendedprice + tile_offset, items2, num_tile_items);

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items)
      if (selection_flags[ITEM]) {
        sum += items[ITEM] * items2[ITEM];
      }
  }

  __syncthreads();

  // Block-wide reductions of the per-thread revenue sums and selectivity counts
  static __shared__ long long buffer[32];
  unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer);
  __syncthreads();

  static __shared__ int buffer2[32];
  int tot_t_count = BlockSum<int, BLOCK_THREADS, ITEMS_PER_THREAD>(t_count, (int*)buffer2);
  __syncthreads();

  if (threadIdx.x == 0) {
    atomicAdd(revenue, aggregate);
    atomicAdd(d_total, tot_t_count);
  }
}

float runQuery(int* lo_orderdate, int* lo_discount, int* lo_quantity, int* lo_extendedprice,
    int lo_num_entries, cub::CachingDeviceAllocator& g_allocator) {
  SETUP_TIMING();

  float time_query;
  chrono::high_resolution_clock::time_point st, finish;
  st = chrono::high_resolution_clock::now();

  cudaEventRecord(start, 0);

  unsigned long long* d_sum = NULL;
  CubDebugExit(g_allocator.DeviceAllocate((void**)&d_sum, sizeof(long long)));
  cudaMemset(d_sum, 0, sizeof(long long));

  int* d_total;
  CubDebugExit(cudaMalloc((void**)&d_total, sizeof(int)));
  CubDebugExit(cudaMemset(d_total, 0, sizeof(int)));

  // Run
  int tile_items = 128 * 4;
  int num_blocks = (lo_num_entries + tile_items - 1) / tile_items;
  QueryKernel<128, 4><<<num_blocks, 128>>>(lo_orderdate, lo_discount, lo_quantity,
      lo_extendedprice, lo_num_entries, d_sum, d_total);

  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  cudaEventElapsedTime(&time_query, start, stop);

  unsigned long long revenue;
  int h_total = 0;
  CubDebugExit(cudaMemcpy(&revenue, d_sum, sizeof(long long), cudaMemcpyDeviceToHost));
  CubDebugExit(cudaMemcpy(&h_total, d_total, sizeof(int), cudaMemcpyDeviceToHost));
  printf("%d\n", h_total);  // Number of rows passing the discount and quantity predicates

  finish = chrono::high_resolution_clock::now();
  std::chrono::duration<double> diff = finish - st;
  cout << "Revenue: " << revenue << endl;
  cout << "Time Taken Total: " << diff.count() * 1000 << endl;

  CLEANUP(d_sum);
  CubDebugExit(cudaFree(d_total));

  return time_query;
}

/**
 * Main
 */
int main(int argc, char** argv) {
  int num_trials = 3;

  // Initialize command line
  CommandLineArgs args(argc, argv);
  args.GetCmdLineArgument("t", num_trials);

  // Print usage
  if (args.CheckCmdLineFlag("help")) {
    printf("%s "
        "[--t=<num trials>] "
        "[--v] "
        "\n", argv[0]);
    exit(0);
  }

  // Initialize device
  CubDebugExit(args.DeviceInit());

  int *h_lo_orderdate = loadColumn<int>("lo_orderdate", LO_LEN);
  int *h_lo_discount = loadColumn<int>("lo_discount", LO_LEN);
  int *h_lo_quantity = loadColumn<int>("lo_quantity", LO_LEN);
  int *h_lo_extendedprice = loadColumn<int>("lo_extendedprice", LO_LEN);

  // The date columns are loaded but not used by this query: the kernel filters
  // lo_orderdate directly instead of joining with the date table.
  int *h_d_datekey = loadColumn<int>("d_datekey", D_LEN);
  int *h_d_year = loadColumn<int>("d_year", D_LEN);

  cout << "** LOADED DATA **" << endl;
  cout << "LO_LEN " << LO_LEN << endl;

  int *d_lo_orderdate = loadToGPU<int>(h_lo_orderdate, LO_LEN, g_allocator);
  int *d_lo_discount = loadToGPU<int>(h_lo_discount, LO_LEN, g_allocator);
  int *d_lo_quantity = loadToGPU<int>(h_lo_quantity, LO_LEN, g_allocator);
  int *d_lo_extendedprice = loadToGPU<int>(h_lo_extendedprice, LO_LEN, g_allocator);
  int *d_d_datekey = loadToGPU<int>(h_d_datekey, D_LEN, g_allocator);
  int *d_d_year = loadToGPU<int>(h_d_year, D_LEN, g_allocator);

  cout << "** LOADED DATA TO GPU **" << endl;

  for (int t = 0; t < num_trials; t++) {
    float time_query;
    time_query = runQuery(d_lo_orderdate, d_lo_discount, d_lo_quantity, d_lo_extendedprice,
        LO_LEN, g_allocator);
    cout << "{"
        << "\"query\":11"
        << ",\"time_query\":" << time_query
        << "}" << endl;
  }

  return 0;
}
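/*
 * For reference: judging by the predicates and the "query":11 label printed above,
 * this file appears to implement Star Schema Benchmark query 1.1. A rough SQL
 * equivalent is sketched below (an assumption, not part of the original source).
 * Note that the kernel avoids the join with the date table by filtering
 * lo_orderdate against the integer range (19930000, 19940000) directly.
 *
 *   select sum(lo_extendedprice * lo_discount) as revenue
 *   from lineorder, date
 *   where lo_orderdate = d_datekey
 *     and d_year = 1993
 *     and lo_discount between 1 and 3
 *     and lo_quantity < 25;
 */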