// Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include <iostream> #include <stdio.h> #include <curand.h> #include <cuda.h> #include <cub/util_allocator.cuh> #include "cub/test/test_util.h" #include "crystal/crystal.cuh" #include "utils/generator.h" #include "utils/gpu_utils.h" using namespace std; #define DEBUG 1 template<int BLOCK_THREADS, int ITEMS_PER_THREAD> __global__ void build_kernel(int *dim_key, int *dim_val, int num_tuples, int *hash_table, int num_slots) { int items[ITEMS_PER_THREAD]; int items2[ITEMS_PER_THREAD]; int selection_flags[ITEMS_PER_THREAD]; int tile_offset = blockIdx.x * TILE_SIZE; int num_tiles = (num_tuples + TILE_SIZE - 1) / TILE_SIZE; int num_tile_items = TILE_SIZE; if (blockIdx.x == num_tiles - 1) { num_tile_items = num_tuples - tile_offset; } InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags); BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_key + tile_offset, items, num_tile_items); BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(dim_val + tile_offset, items2, num_tile_items); BlockBuildSelectivePHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(items, items2, selection_flags, hash_table, num_slots, num_tile_items); } template<int BLOCK_THREADS, int ITEMS_PER_THREAD> __global__ void probe_kernel(int *fact_fkey, int *fact_val, int num_tuples, int *hash_table, int num_slots, unsigned long long *res) { // Load a tile striped across threads int selection_flags[ITEMS_PER_THREAD]; int keys[ITEMS_PER_THREAD]; int vals[ITEMS_PER_THREAD]; int join_vals[ITEMS_PER_THREAD]; unsigned long long sum = 0; int tile_offset = blockIdx.x * TILE_SIZE; int num_tiles = (num_tuples+ TILE_SIZE - 1) / TILE_SIZE; int num_tile_items = TILE_SIZE; if (blockIdx.x == num_tiles - 1) { num_tile_items = num_tuples - tile_offset; } InitFlags<BLOCK_THREADS, ITEMS_PER_THREAD>(selection_flags); BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(fact_fkey + tile_offset, keys, num_tile_items); BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD>(fact_val + tile_offset, vals, num_tile_items); BlockProbeAndPHT_2<int, int, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, join_vals, selection_flags, hash_table, num_slots, num_tile_items); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { if ((threadIdx.x + (BLOCK_THREADS * ITEM) < num_tile_items)) if (selection_flags[ITEM]) sum += vals[ITEM] * join_vals[ITEM]; } __syncthreads(); static __shared__ long long buffer[32]; unsigned long long aggregate = BlockSum<long long, BLOCK_THREADS, ITEMS_PER_THREAD>(sum, (long long*)buffer); __syncthreads(); if (threadIdx.x == 0) { atomicAdd(res, aggregate); } } struct TimeKeeper { float time_build; float time_probe; float time_extra; float time_total; }; TimeKeeper hashJoin(int* d_dim_key, int* d_dim_val, int* d_fact_fkey, int* d_fact_val, int num_dim, int num_fact, cub::CachingDeviceAllocator& g_allocator) { SETUP_TIMING(); int* hash_table = NULL; unsigned long long* res; int num_slots = num_dim; float time_build, time_probe, time_memset, time_memset2; ALLOCATE(hash_table, sizeof(int) * 2 * num_dim); ALLOCATE(res, sizeof(long long)); TIME_FUNC(cudaMemset(hash_table, 0, num_slots * sizeof(int) * 2), time_memset); TIME_FUNC(cudaMemset(res, 0, sizeof(long long)), time_memset2); int tile_items = 128*4; TIME_FUNC((build_kernel<128, 4><<<(num_dim + tile_items - 1)/tile_items, 128>>>(d_dim_key, d_dim_val, num_dim, hash_table, num_slots)), time_build); TIME_FUNC((probe_kernel<128, 4><<<(num_fact + tile_items - 1)/tile_items, 128>>>(d_fact_fkey, d_fact_val, num_fact, hash_table, num_slots, res)), time_probe); #if DEBUG cout << "{" << "\"time_memset\":" << time_memset << ",\"time_build\"" << time_build << ",\"time_probe\":" << time_probe << "}" << endl; #endif CLEANUP(hash_table); CLEANUP(res); TimeKeeper t = {time_build, time_probe, time_memset, time_build + time_probe + time_memset}; return t; } //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; // Whether to display input/output to console cub::CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory #define CLEANUP(vec) if(vec)CubDebugExit(g_allocator.DeviceFree(vec)) //--------------------------------------------------------------------- // Main //--------------------------------------------------------------------- int main(int argc, char** argv) { int num_fact = 256 * 1<<20; int num_dim = 16 * 1<<20; int num_trials = 3; // Initialize command line CommandLineArgs args(argc, argv); args.GetCmdLineArgument("n", num_fact); args.GetCmdLineArgument("d", num_dim); args.GetCmdLineArgument("t", num_trials); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--n=<num fact>] " "[--d=<num dim>] " "[--t=<num trials>] " "[--device=<device-id>] " "[--v] " "\n", argv[0]); exit(0); } int log2 = 0; int num_dim_dup = num_dim >> 1; while (num_dim_dup) { num_dim_dup >>= 1; log2 += 1; } // Initialize device CubDebugExit(args.DeviceInit()); // Allocate problem device arrays int *d_dim_key = NULL; int *d_dim_val = NULL; int *d_fact_fkey = NULL; int *d_fact_val = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dim_key, sizeof(int) * num_dim)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dim_val, sizeof(int) * num_dim)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_fact_fkey, sizeof(int) * num_fact)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_fact_val, sizeof(int) * num_fact)); int *h_dim_key = NULL; int *h_dim_val = NULL; int *h_fact_fkey = NULL; int *h_fact_val = NULL; create_relation_pk(h_dim_key, h_dim_val, num_dim); create_relation_fk(h_fact_fkey, h_fact_val, num_fact, num_dim); CubDebugExit(cudaMemcpy(d_dim_key, h_dim_key, sizeof(int) * num_dim, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_dim_val, h_dim_val, sizeof(int) * num_dim, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_fact_fkey, h_fact_fkey, sizeof(int) * num_fact, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemcpy(d_fact_val, h_fact_val, sizeof(int) * num_fact, cudaMemcpyHostToDevice)); for (int j = 0; j < num_trials; j++) { TimeKeeper t = hashJoin(d_dim_key, d_dim_val, d_fact_fkey, d_fact_val, num_dim, num_fact, g_allocator); cout<< "{" << "\"num_dim\":" << num_dim << ",\"num_fact\":" << num_fact << ",\"radix\":" << 0 << ",\"time_partition_build\":" << 0 << ",\"time_partition_probe\":" << 0 << ",\"time_partition_total\":" << 0 << ",\"time_build\":" << t.time_build << ",\"time_probe\":" << t.time_probe << ",\"time_extra\":" << t.time_extra << ",\"time_join_total\":" << t.time_total << "}" << endl; } CLEANUP(d_dim_key); CLEANUP(d_dim_val); CLEANUP(d_fact_fkey); CLEANUP(d_fact_val); return 0; }