#ifndef _BLOCK_LIBRARY_H_
#define _BLOCK_LIBRARY_H_
#pragma once

#define cudaAssert( X ) if ( !(X) ) { printf( "Thread %d:%d failed assert at %s:%d!\n", blockIdx.x, threadIdx.x, __FILE__, __LINE__ ); return; }

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockCountPartition(
    int (&item_key)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
    int* smem_index, int num_tile_items, int range_size, int is_replicated, int gpu) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        if (is_replicated) {
          if (partition == gpu) atomicAdd(&(smem_index[partition]), 1);
        } else atomicAdd(&(smem_index[partition]), 1);
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetShuffle(
    int (&item_off)[ITEMS_PER_THREAD], int* data, int (&items)[ITEMS_PER_THREAD],
    int num_tile_items) {
  cudaAssert(data != NULL);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      items[ITEM] = data[item_off[ITEM]];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetShuffle2(
    int (&item_off)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
    int* data, int (&items)[ITEMS_PER_THREAD], int num_tile_items) {
  cudaAssert(data != NULL);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        items[ITEM] = data[item_off[ITEM]];
      }
    }
  }
}
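// The primitives below depend on a few symbols that this header does not define and
// expects from the including translation unit: SEGMENT_SIZE (rows per cached column
// segment), NUM_GPU (number of GPUs), and HASH(key, len, min) (the hash used by the
// build/probe primitives). The guarded block is only an illustrative sketch of their
// expected shape; BLOCK_LIBRARY_EXAMPLE_DEFS is a hypothetical macro and these are not
// the project's actual values.
#ifdef BLOCK_LIBRARY_EXAMPLE_DEFS
#define SEGMENT_SIZE (1 << 20)              // example only: rows per segment
#define NUM_GPU 8                           // example only: GPUs in the system
#define HASH(X, Y, Z) (((X) - (Z)) % (Y))   // example only: maps a key into [0, Y)
#endif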
// we should modify this so that if the segment is broadcast or replicated, we read from the local copy instead
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetGlobal(
    int (&item_off)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
    int (&items)[ITEMS_PER_THREAD], int** gpuCache, int*** all_col_idx,
    int** seg_row_to_gpu, int* seg_is_replicated,
    int column_id, int table_id, int cur_gpu, int num_tile_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int segment_idx = item_off[ITEM]/SEGMENT_SIZE;
        cudaAssert(all_col_idx[cur_gpu] != NULL && all_col_idx[cur_gpu][column_id] != NULL);
        if (all_col_idx[cur_gpu][column_id][segment_idx] != -1) {
          // segment is cached on the current GPU: read the local copy
          int64_t val_segment = all_col_idx[cur_gpu][column_id][segment_idx];
          cudaAssert(gpuCache[cur_gpu] != NULL);
          items[ITEM] = gpuCache[cur_gpu][val_segment * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
        } else {
          // otherwise read it from the GPU that owns the segment
          cudaAssert(seg_row_to_gpu[table_id] != NULL && seg_row_to_gpu[table_id][segment_idx] != -1);
          int gpu = seg_row_to_gpu[table_id][segment_idx];
          cudaAssert(gpu >= 0 && gpu < NUM_GPU);
          cudaAssert(all_col_idx[gpu] != NULL && all_col_idx[gpu][column_id] != NULL);
          cudaAssert(all_col_idx[gpu][column_id][segment_idx] != -1);
          cudaAssert(gpuCache[gpu] != NULL);
          int64_t val_segment = all_col_idx[gpu][column_id][segment_idx];
          items[ITEM] = gpuCache[gpu][val_segment * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
        }
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetGlobal2(
    int (&item_off)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
    int (&items)[ITEMS_PER_THREAD], int** gpuCache, int*** all_col_idx, int** broadcast_idx,
    int** seg_row_to_gpu, int* seg_is_replicated,
    int column_id, int table_id, int cur_gpu, int num_tile_items) {
  cudaAssert(broadcast_idx[column_id] != NULL);
  cudaAssert(seg_row_to_gpu[table_id] != NULL);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int seg = item_off[ITEM]/SEGMENT_SIZE;
        if (broadcast_idx[column_id][seg] != -1) {
          // segment was broadcast to this GPU: read the local broadcast copy
          int64_t segment_idx = broadcast_idx[column_id][seg];
          cudaAssert(gpuCache[cur_gpu] != NULL);
          items[ITEM] = gpuCache[cur_gpu][segment_idx * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
        } else {
          cudaAssert(all_col_idx[cur_gpu] != NULL && all_col_idx[cur_gpu][column_id] != NULL);
          if (all_col_idx[cur_gpu][column_id][seg] != -1) {
            // segment is cached on the current GPU: read the local copy
            int64_t val_segment = all_col_idx[cur_gpu][column_id][seg];
            cudaAssert(gpuCache[cur_gpu] != NULL);
            items[ITEM] = gpuCache[cur_gpu][val_segment * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
          } else {
            // otherwise read it from the GPU that owns the segment
            cudaAssert(seg_row_to_gpu[table_id] != NULL && seg_row_to_gpu[table_id][seg] != -1);
            int gpu = seg_row_to_gpu[table_id][seg];
            cudaAssert(gpu >= 0 && gpu < NUM_GPU);
            cudaAssert(all_col_idx[gpu] != NULL && all_col_idx[gpu][column_id] != NULL);
            cudaAssert(all_col_idx[gpu][column_id][seg] != -1);
            cudaAssert(gpuCache[gpu] != NULL);
            int64_t val_segment = all_col_idx[gpu][column_id][seg];
            items[ITEM] = gpuCache[gpu][val_segment * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
          }
        }
      }
    }
  }
}
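// For reference, the addressing scheme used by the two readers above: a global row offset
// is split into a segment id (offset / SEGMENT_SIZE) and a position inside that segment
// (offset % SEGMENT_SIZE); a per-GPU directory then maps the segment id to the slot where
// the segment is cached in that GPU's flat cache array. The helper below is only an
// illustrative sketch of that arithmetic (example_read_cached_value is a hypothetical name,
// not part of the library) and assumes the segment is cached on the local GPU.
#ifdef BLOCK_LIBRARY_USAGE_EXAMPLES
__device__ __forceinline__ int example_read_cached_value(
    int* gpuCache, int* col_idx, int row_offset) {
  int seg = row_offset / SEGMENT_SIZE;       // which segment holds the row
  int64_t slot = col_idx[seg];               // cache slot of that segment (-1 if not cached)
  return gpuCache[slot * SEGMENT_SIZE + (row_offset % SEGMENT_SIZE)];
}
#endif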
template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockLoadAndCountIndexPartition2(
    int* key, int (&item_key)[ITEMS_PER_THREAD], int (&index)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* smem_index,
    int num_tile_items, int range_size, int is_replicated, int gpu) {
  cudaAssert(key != NULL);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        item_key[ITEM] = key[(ITEM * BLOCK_THREADS) + threadIdx.x];
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        if (is_replicated) {
          if (partition == gpu) index[ITEM] = atomicAdd(&(smem_index[partition]), 1);
          else index[ITEM] = 0;
        } else index[ITEM] = atomicAdd(&(smem_index[partition]), 1);
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockCountIndexPartition(
    int (&item_key)[ITEMS_PER_THREAD], int (&index)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* smem_index,
    int num_tile_items, int range_size, int is_replicated, int gpu) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        if (is_replicated) {
          if (partition == gpu) index[ITEM] = atomicAdd(&(smem_index[partition]), 1);
          else index[ITEM] = 0;
        } else index[ITEM] = atomicAdd(&(smem_index[partition]), 1);
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockWritePartition(
    int (&items)[ITEMS_PER_THREAD], int (&index)[ITEMS_PER_THREAD], int** output,
    int num_tile_items, int range_size, int is_replicated, int gpu) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      int partition = items[ITEM]/range_size; //if range size is wrong, there could be an extra partition
      cudaAssert(partition <= NUM_RANGE);
      if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
      cudaAssert(output[partition] != NULL);
      if (is_replicated) {
        if (partition == gpu) output[partition][index[ITEM]] = items[ITEM];
      } else output[partition][index[ITEM]] = items[ITEM];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockWritePartition2(
    int (&items)[ITEMS_PER_THREAD], int (&index)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int** output,
    int num_tile_items, int range_size, int is_replicated, int gpu) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int partition = items[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        cudaAssert(output[partition] != NULL);
        if (is_replicated) {
          if (partition == gpu) output[partition][index[ITEM]] = items[ITEM];
        } else output[partition][index[ITEM]] = items[ITEM];
      }
    }
  }
}
template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockWriteValPartition2(
    int (&item_val)[ITEMS_PER_THREAD], int (&item_key)[ITEMS_PER_THREAD],
    int (&index)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD], int** output,
    int num_tile_items, int range_size, int is_replicated, int gpu) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        cudaAssert(output[partition] != NULL);
        if (is_replicated) {
          if (partition == gpu) output[partition][index[ITEM]] = item_val[ITEM];
        } else output[partition][index[ITEM]] = item_val[ITEM];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockLoadAndWriteValPartition(
    int* value, int (&item_key)[ITEMS_PER_THREAD], int (&item_val)[ITEMS_PER_THREAD],
    int (&index)[ITEMS_PER_THREAD], int** output,
    int num_tile_items, int range_size, int is_replicated, int gpu) {
  cudaAssert(value != NULL);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      item_val[ITEM] = value[(ITEM * BLOCK_THREADS) + threadIdx.x];
      int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
      cudaAssert(partition <= NUM_RANGE);
      if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
      cudaAssert(output[partition] != NULL);
      if (is_replicated) {
        if (partition == gpu) output[partition][index[ITEM]] = item_val[ITEM];
      } else output[partition][index[ITEM]] = item_val[ITEM];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockLoadAndWriteValPartition2(
    int* value, int (&item_key)[ITEMS_PER_THREAD], int (&item_val)[ITEMS_PER_THREAD],
    int (&index)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD], int** output,
    int num_tile_items, int range_size, int is_replicated, int gpu) {
  cudaAssert(value != NULL);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        item_val[ITEM] = value[(ITEM * BLOCK_THREADS) + threadIdx.x];
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        cudaAssert(output[partition] != NULL);
        if (is_replicated) {
          if (partition == gpu) output[partition][index[ITEM]] = item_val[ITEM];
        } else output[partition][index[ITEM]] = item_val[ITEM];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockWriteOffSelf(
    int (&item_key)[ITEMS_PER_THREAD], int (&index)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int** offset,
    int start_offset, //offset to the current segment
    int num_tile_items, int range_size, int is_replicated, int gpu) {
  int tile_size = ITEMS_PER_THREAD * BLOCK_THREADS;
  int tile_idx = blockIdx.x % (SEGMENT_SIZE/tile_size); //tile index in current segment
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        cudaAssert(offset[partition] != NULL);
        if (is_replicated) {
          if (partition == gpu) offset[partition][index[ITEM]] = start_offset + tile_idx * tile_size + threadIdx.x + ITEM * BLOCK_THREADS;
        } else offset[partition][index[ITEM]] = start_offset + tile_idx * tile_size + threadIdx.x + ITEM * BLOCK_THREADS;
      }
    }
  }
}
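// One plausible way to compose the partitioning primitives above into a single pass:
// count per-partition items in shared memory to get a local index per item, reserve space
// in the global partitions, then scatter keys and values. This is only a sketch under the
// hypothetical BLOCK_LIBRARY_USAGE_EXAMPLES guard; example_partition_pass, global_counts,
// out_keys, and out_vals are illustrative names, the grid is assumed to cover num_items,
// and the real kernels in this project may organise the offset bookkeeping differently.
#ifdef BLOCK_LIBRARY_USAGE_EXAMPLES
template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__global__ void example_partition_pass(
    int* keys, int* values, int** out_keys, int** out_vals, int* global_counts,
    int num_items, int range_size) {
  __shared__ int smem_index[NUM_RANGE];  // per-partition count for this tile
  __shared__ int smem_base[NUM_RANGE];   // global base offset reserved for this tile
  int tile_offset = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD;
  int num_tile_items = min(BLOCK_THREADS * ITEMS_PER_THREAD, num_items - tile_offset);
  int item_key[ITEMS_PER_THREAD], item_val[ITEMS_PER_THREAD];
  int index[ITEMS_PER_THREAD], selection_flags[ITEMS_PER_THREAD];
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) selection_flags[ITEM] = 1;
  for (int i = threadIdx.x; i < NUM_RANGE; i += BLOCK_THREADS) smem_index[i] = 0;
  __syncthreads();
  // Phase 1: load keys, count items per partition, remember each item's local index.
  BlockLoadAndCountIndexPartition2<BLOCK_THREADS, ITEMS_PER_THREAD, NUM_RANGE>(
      keys + tile_offset, item_key, index, selection_flags, smem_index,
      num_tile_items, range_size, 0 /*is_replicated*/, 0 /*gpu*/);
  __syncthreads();
  // Phase 2: reserve contiguous space per partition and rebase the local indices.
  for (int i = threadIdx.x; i < NUM_RANGE; i += BLOCK_THREADS)
    smem_base[i] = atomicAdd(&global_counts[i], smem_index[i]);
  __syncthreads();
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items && selection_flags[ITEM]) {
      int partition = item_key[ITEM]/range_size;
      if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
      index[ITEM] += smem_base[partition];
    }
  }
  // Phase 3: scatter keys and values to their partitions.
  BlockWritePartition2<BLOCK_THREADS, ITEMS_PER_THREAD, NUM_RANGE>(
      item_key, index, selection_flags, out_keys, num_tile_items, range_size, 0, 0);
  BlockLoadAndWriteValPartition2<BLOCK_THREADS, ITEMS_PER_THREAD, NUM_RANGE>(
      values + tile_offset, item_key, item_val, index, selection_flags, out_vals,
      num_tile_items, range_size, 0, 0);
}
#endif // BLOCK_LIBRARY_USAGE_EXAMPLES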
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPUDirect(
    int tid, int (&items)[ITEMS_PER_THREAD], int (&offset)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* ht, int ht_len, int keys_min) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    // Out-of-bounds items are selection_flags
    if (selection_flags[ITEM]) {
      int hash = HASH(items[ITEM], ht_len, keys_min);
      int slot = ht[(hash << 1) + 1];
      if (slot != 0) {
        offset[ITEM] = slot - 1;
      } else {
        selection_flags[ITEM] = 0;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPUDirect(
    int tid, int (&items)[ITEMS_PER_THREAD], int (&offset)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* ht, int ht_len, int keys_min,
    int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    // Out-of-bounds items are selection_flags
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int hash = HASH(items[ITEM], ht_len, keys_min);
        int slot = ht[(hash << 1) + 1];
        if (slot != 0) {
          offset[ITEM] = slot - 1;
        } else {
          selection_flags[ITEM] = 0;
        }
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPU(
    int tid, int (&items)[ITEMS_PER_THREAD], int (&offset)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* ht, int ht_len, int keys_min,
    int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockProbeGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, offset, selection_flags, ht, ht_len, keys_min);
  } else {
    BlockProbeGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, offset, selection_flags, ht, ht_len, keys_min, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPUDirect(
    int tid, int (&items)[ITEMS_PER_THREAD], int (&res)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* ht, int ht_len, int keys_min) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    // Out-of-bounds items are selection_flags
    if (selection_flags[ITEM]) {
      int hash = HASH(items[ITEM], ht_len, keys_min);
      uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
      if (slot != 0) {
        //res[ITEM] = (slot << 32) >> 32;
        res[ITEM] = slot;
      } else {
        selection_flags[ITEM] = 0;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPUDirect(
    int tid, int (&items)[ITEMS_PER_THREAD], int (&res)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* ht, int ht_len, int keys_min,
    int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    // Out-of-bounds items are selection_flags
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int hash = HASH(items[ITEM], ht_len, keys_min);
        uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
        if (slot != 0) {
          //res[ITEM] = (slot << 32) >> 32;
          res[ITEM] = slot;
        } else {
          selection_flags[ITEM] = 0;
        }
      }
    }
  }
}
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPU(
    int tid, int (&items)[ITEMS_PER_THREAD], int (&res)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* ht, int ht_len, int keys_min,
    int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockProbeGroupByGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, res, selection_flags, ht, ht_len, keys_min);
  } else {
    BlockProbeGroupByGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, res, selection_flags, ht, ht_len, keys_min, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPU2Direct(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&offset)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* gpuCache, int* key_idx,
    int* ht, int ht_len, int keys_min) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    // Out-of-bounds items are selection_flags
    if (selection_flags[ITEM]) {
      int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
      int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      int hash = HASH(key, ht_len, keys_min);
      int slot = ht[(hash << 1) + 1];
      if (slot != 0) {
        offset[ITEM] = slot - 1;
      } else {
        selection_flags[ITEM] = 0;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPU2Direct(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&offset)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* gpuCache, int* key_idx,
    int* ht, int ht_len, int keys_min, int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    // Out-of-bounds items are selection_flags
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
        int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
        int hash = HASH(key, ht_len, keys_min);
        int slot = ht[(hash << 1) + 1];
        if (slot != 0) {
          offset[ITEM] = slot - 1;
        } else {
          selection_flags[ITEM] = 0;
        }
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPU2(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&offset)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* gpuCache, int* key_idx,
    int* ht, int ht_len, int keys_min, int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockProbeGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, offset, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min);
  } else {
    BlockProbeGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, offset, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPU2Direct(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&res)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* gpuCache, int* key_idx,
    int* ht, int ht_len, int keys_min) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    // Out-of-bounds items are selection_flags
    if (selection_flags[ITEM]) {
      int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
      int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      int hash = HASH(key, ht_len, keys_min);
      uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
      if (slot != 0) {
        res[ITEM] = slot;
      } else {
        selection_flags[ITEM] = 0;
      }
    }
  }
}
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPU2Direct(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&res)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* gpuCache, int* key_idx,
    int* ht, int ht_len, int keys_min, int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    // Out-of-bounds items are selection_flags
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
        int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
        int hash = HASH(key, ht_len, keys_min);
        uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
        if (slot != 0) {
          res[ITEM] = slot;
        } else {
          selection_flags[ITEM] = 0;
        }
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPU2(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&res)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* gpuCache, int* key_idx,
    int* ht, int ht_len, int keys_min, int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockProbeGroupByGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, res, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min);
  } else {
    BlockProbeGroupByGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, res, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockPassThroughOffsetDirect(
    int tid, int (&items)[ITEMS_PER_THREAD], int (&offset)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD]) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (selection_flags[ITEM]) {
      offset[ITEM] = items[ITEM];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockPassThroughOffsetDirect(
    int tid, int (&items)[ITEMS_PER_THREAD], int (&offset)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        offset[ITEM] = items[ITEM];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockPassThroughOffset(
    int tid, int (&items)[ITEMS_PER_THREAD], int (&offset)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockPassThroughOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, offset, selection_flags);
  } else {
    BlockPassThroughOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, offset, selection_flags, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetFilteredValueDirect(
    int tid, int (&items)[ITEMS_PER_THREAD], int value,
    int (&selection_flags)[ITEMS_PER_THREAD]) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (selection_flags[ITEM]) {
      items[ITEM] = value;
    }
  }
}
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetFilteredValueDirect(
    int tid, int (&items)[ITEMS_PER_THREAD], int value,
    int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) { // CUB uses a blocked arrangement
      if (selection_flags[ITEM]) {
        items[ITEM] = value;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetFilteredValue(
    int tid, int (&items)[ITEMS_PER_THREAD], int value,
    int (&selection_flags)[ITEMS_PER_THREAD], int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockSetFilteredValueDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, value, selection_flags);
  } else {
    BlockSetFilteredValueDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, value, selection_flags, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetValueDirect(
    int tid, int (&items)[ITEMS_PER_THREAD], int value) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    items[ITEM] = value;
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetValueDirect(
    int tid, int (&items)[ITEMS_PER_THREAD], int value, int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) { // CUB uses a blocked arrangement
      items[ITEM] = value;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetValue(
    int tid, int (&items)[ITEMS_PER_THREAD], int value, int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockSetValueDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, value);
  } else {
    BlockSetValueDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, value, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadFilteredOffsetDirect(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&items)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* gpuCache, int* col_idx) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (selection_flags[ITEM]) {
      int64_t val_segment = col_idx[items_off[ITEM] / SEGMENT_SIZE];
      items[ITEM] = gpuCache[val_segment * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadFilteredOffsetDirect(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&items)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* gpuCache, int* col_idx, int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t val_segment = col_idx[items_off[ITEM] / SEGMENT_SIZE];
        items[ITEM] = gpuCache[val_segment * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadFilteredOffset(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&items)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* gpuCache, int* col_idx, int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockReadFilteredOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, items, selection_flags, gpuCache, col_idx);
  } else {
    BlockReadFilteredOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, items, selection_flags, gpuCache, col_idx, num_items);
  }
}
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetDirect(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&items)[ITEMS_PER_THREAD],
    int* gpuCache, int* col_idx) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    int64_t val_segment = col_idx[items_off[ITEM] / SEGMENT_SIZE];
    items[ITEM] = gpuCache[val_segment * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetDirect(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&items)[ITEMS_PER_THREAD],
    int* gpuCache, int* col_idx, int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      int64_t val_segment = col_idx[items_off[ITEM] / SEGMENT_SIZE];
      items[ITEM] = gpuCache[val_segment * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetGPU(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&items)[ITEMS_PER_THREAD],
    int* gpuCache, int* col_idx, int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockReadOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, items, gpuCache, col_idx);
  } else {
    BlockReadOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, items, gpuCache, col_idx, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetGPU2(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&items)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* gpuCache, int* col_idx, int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t val_segment = col_idx[items_off[ITEM] / SEGMENT_SIZE];
        items[ITEM] = gpuCache[val_segment * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetGPU3(
    int tid, int (&items_off)[ITEMS_PER_THREAD], int (&items)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD], int* gpuCache, int* col_idx,
    int* broadcast_idx, int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t segment_idx = col_idx[items_off[ITEM] / SEGMENT_SIZE];
        if (segment_idx == -1) {
          segment_idx = broadcast_idx[items_off[ITEM] / SEGMENT_SIZE];
        }
        cudaAssert(segment_idx != -1);
        items[ITEM] = gpuCache[segment_idx * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPUDirect(
    int tid, int blockid, int start_offset,
    int (&keys)[ITEMS_PER_THREAD], //equal to items
    int (&vals)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
    int* ht, int ht_len, int keys_min) {
  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (selection_flags[ITEM]) {
      int hash = HASH(keys[ITEM], ht_len, keys_min);
      atomicCAS(&ht[hash << 1], 0, vals[ITEM]);
      ht[(hash << 1) + 1] = start_offset + (blockid * tile_size) + (tid + ITEM * BLOCK_THREADS) + 1;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPUDirect(
    int tid, int blockid, int start_offset,
    int (&keys)[ITEMS_PER_THREAD], //equal to items
    int (&vals)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
    int* ht, int ht_len, int keys_min, // equal to val_min
    int num_items) {
  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) { // use striped arrangement since we are using Crystal BlockLoad instead of CUB
      if (selection_flags[ITEM]) {
        int hash = HASH(keys[ITEM], ht_len, keys_min);
        atomicCAS(&ht[hash << 1], 0, vals[ITEM]);
        ht[(hash << 1) + 1] = start_offset + (blockid * tile_size) + (tid + ITEM * BLOCK_THREADS) + 1;
      }
    }
  }
}
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPU(
    int tid, int blockid, int start_offset,
    int (&keys)[ITEMS_PER_THREAD], //equal to items
    int (&vals)[ITEMS_PER_THREAD], int (&selection_flags)[ITEMS_PER_THREAD],
    int* ht, int ht_len, int keys_min, // equal to val_min
    int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockBuildValueGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, blockid, start_offset, keys, vals, selection_flags, ht, ht_len, keys_min);
  } else {
    BlockBuildValueGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, blockid, start_offset, keys, vals, selection_flags, ht, ht_len, keys_min, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPUDirect(
    int tid, int blockid, int start_offset,
    int (&keys)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* ht, int ht_len, int keys_min) {
  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (selection_flags[ITEM]) {
      int hash = HASH(keys[ITEM], ht_len, keys_min);
      ht[(hash << 1) + 1] = start_offset + (blockid * tile_size) + (tid + ITEM * BLOCK_THREADS) + 1;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPUDirect(
    int tid, int blockid, int start_offset,
    int (&keys)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* ht, int ht_len, int keys_min, // equal to val_min
    int num_items) {
  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) { // use striped arrangement since we are using Crystal BlockLoad instead of CUB
      if (selection_flags[ITEM]) {
        int hash = HASH(keys[ITEM], ht_len, keys_min);
        ht[(hash << 1) + 1] = start_offset + (blockid * tile_size) + (tid + ITEM * BLOCK_THREADS) + 1;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPU(
    int tid, int blockid, int start_offset,
    int (&keys)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* ht, int ht_len, int keys_min, // equal to val_min
    int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockBuildOffsetGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, blockid, start_offset, keys, selection_flags, ht, ht_len, keys_min);
  } else {
    BlockBuildOffsetGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, blockid, start_offset, keys, selection_flags, ht, ht_len, keys_min, num_items);
  }
}
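// The hash table used by the build/probe primitives is an array of 2 * ht_len ints:
// slot (hash << 1) optionally holds a payload value (written by the BuildValue variants
// with atomicCAS) and slot (hash << 1) + 1 holds the matching row offset plus one, with 0
// meaning "empty". The pair of kernels below is only an illustrative sketch of a
// build-then-probe join under the hypothetical BLOCK_LIBRARY_USAGE_EXAMPLES guard;
// example_build_ht, example_probe_ht, and out_offsets are made-up names, the ht buffer is
// assumed to be zero-initialised, and the grid is assumed to cover num_items.
#ifdef BLOCK_LIBRARY_USAGE_EXAMPLES
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void example_build_ht(int* dim_keys, int* ht, int ht_len, int keys_min, int num_items) {
  int keys[ITEMS_PER_THREAD];
  int selection_flags[ITEMS_PER_THREAD];
  int tile_offset = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD;
  int num_tile_items = min(BLOCK_THREADS * ITEMS_PER_THREAD, num_items - tile_offset);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    selection_flags[ITEM] = 1;
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items)
      keys[ITEM] = dim_keys[tile_offset + threadIdx.x + ITEM * BLOCK_THREADS];
  }
  // record, for every build-side key, its row offset (+1) in the table
  BlockBuildOffsetGPU<BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, blockIdx.x, 0 /*start_offset*/,
      keys, selection_flags, ht, ht_len, keys_min, num_tile_items);
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void example_probe_ht(int* fact_keys, int* ht, int ht_len, int keys_min,
                                 int* out_offsets, int num_items) {
  int keys[ITEMS_PER_THREAD];
  int offsets[ITEMS_PER_THREAD];
  int selection_flags[ITEMS_PER_THREAD];
  int tile_offset = blockIdx.x * BLOCK_THREADS * ITEMS_PER_THREAD;
  int num_tile_items = min(BLOCK_THREADS * ITEMS_PER_THREAD, num_items - tile_offset);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    selection_flags[ITEM] = 1;
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items)
      keys[ITEM] = fact_keys[tile_offset + threadIdx.x + ITEM * BLOCK_THREADS];
  }
  // recover the 0-based build-side row offset for each probe key, or clear its flag
  BlockProbeGPU<BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, offsets, selection_flags,
      ht, ht_len, keys_min, num_tile_items);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items && selection_flags[ITEM])
      out_offsets[tile_offset + threadIdx.x + ITEM * BLOCK_THREADS] = offsets[ITEM];
  }
}
#endif // BLOCK_LIBRARY_USAGE_EXAMPLES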
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPU2Direct(
    int tid, int (&items_off)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache, int* key_idx, int* val_idx,
    int* ht, int ht_len, int keys_min) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (selection_flags[ITEM]) {
      int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
      int64_t dimval_seg = val_idx[items_off[ITEM] / SEGMENT_SIZE];
      int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      int val = gpuCache[dimval_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      // Out-of-bounds items are selection_flags
      int hash = HASH(key, ht_len, keys_min);
      atomicCAS(&ht[hash << 1], 0, val);
      ht[(hash << 1) + 1] = items_off[ITEM] + 1;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPU2Direct(
    int tid, int (&items_off)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache, int* key_idx, int* val_idx,
    int* ht, int ht_len, int keys_min, // equal to val_min
    int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
        int64_t dimval_seg = val_idx[items_off[ITEM] / SEGMENT_SIZE];
        int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
        int val = gpuCache[dimval_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
        // Out-of-bounds items are selection_flags
        int hash = HASH(key, ht_len, keys_min);
        atomicCAS(&ht[hash << 1], 0, val);
        ht[(hash << 1) + 1] = items_off[ITEM] + 1;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPU2(
    int tid, int (&items_off)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache, int* key_idx, int* val_idx,
    int* ht, int ht_len, int keys_min, // equal to val_min
    int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockBuildValueGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, val_idx, ht, ht_len, keys_min);
  } else {
    BlockBuildValueGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, val_idx, ht, ht_len, keys_min, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPU2Direct(
    int tid, int (&items_off)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache, int* key_idx,
    int* ht, int ht_len, int keys_min) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (selection_flags[ITEM]) {
      int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
      int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      int hash = HASH(key, ht_len, keys_min);
      ht[(hash << 1) + 1] = items_off[ITEM] + 1;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPU2Direct(
    int tid, int (&items_off)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache, int* key_idx,
    int* ht, int ht_len, int keys_min, // equal to val_min
    int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
        int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
        int hash = HASH(key, ht_len, keys_min);
        ht[(hash << 1) + 1] = items_off[ITEM] + 1;
      }
    }
  }
}
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPU2(
    int tid, int (&items_off)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache, int* key_idx,
    int* ht, int ht_len, int keys_min, // equal to val_min
    int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockBuildOffsetGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min);
  } else {
    BlockBuildOffsetGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPUDirect(
    int tid, int (&keys)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int &min, int &max, int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        if (keys[ITEM] < min) min = keys[ITEM];
        if (keys[ITEM] > max) max = keys[ITEM];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPUDirect(
    int tid, int (&keys)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int &min, int &max) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (selection_flags[ITEM]) {
      if (keys[ITEM] < min) min = keys[ITEM];
      if (keys[ITEM] > max) max = keys[ITEM];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPU(
    int tid, int (&keys)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int &min, int &max, int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockMinMaxGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, keys, selection_flags, min, max);
  } else {
    BlockMinMaxGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, keys, selection_flags, min, max, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPU2Direct(
    int tid, int (&items_off)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache, int* key_idx,
    int &min, int &max, int num_items) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
        int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
        if (key < min) min = key;
        if (key > max) max = key;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPU2Direct(
    int tid, int (&items_off)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache, int* key_idx,
    int &min, int &max) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) {
    if (selection_flags[ITEM]) {
      int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
      int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      if (key < min) min = key;
      if (key > max) max = key;
    }
  }
}
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPU2(
    int tid, int (&items_off)[ITEMS_PER_THREAD], //equal to items
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache, int* key_idx,
    int &min, int &max, int num_items) {
  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockMinMaxGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, min, max);
  } else {
    BlockMinMaxGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, min, max, num_items);
  }
}

#endif