// Lancelot / src / gpudb / BlockLibrary.cuh
#ifndef _BLOCK_LIBRARY_H_
#define _BLOCK_LIBRARY_H_

#pragma once

#define cudaAssert( X ) do { if ( !(X) ) { printf( "Block %d, thread %d failed assert at %s:%d!\n", blockIdx.x, threadIdx.x, __FILE__, __LINE__ ); return; } } while (0)
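
// NOTE: this header relies on HASH(key, ht_len, keys_min), SEGMENT_SIZE, and
// NUM_GPU being defined by the including translation unit elsewhere in Lancelot.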


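// Count, per key range, how many selected items of this tile fall into each
// partition, accumulating into the shared-memory counters smem_index via
// atomicAdd. Keys map to partitions as key / range_size, clamped to the last
// partition; for replicated tables only the local GPU's partition is counted.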
template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockCountPartition(
    int  (&item_key)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* smem_index,
    int num_tile_items,
    int range_size,
    int is_replicated,
    int gpu
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        if (is_replicated) {
          if (partition == gpu) atomicAdd(&(smem_index[partition]), 1);
        } else atomicAdd(&(smem_index[partition]), 1);
      }
    }
  }
}

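// Gather items[i] = data[item_off[i]] for this tile; the "2" variant only
// gathers items whose selection flag is set.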
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetShuffle(
    int (&item_off)[ITEMS_PER_THREAD],
    int* data,
    int  (&items)[ITEMS_PER_THREAD],
    int num_tile_items
    ) {

  cudaAssert(data != NULL);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      items[ITEM] = data[item_off[ITEM]];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetShuffle2(
    int (&item_off)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* data,
    int  (&items)[ITEMS_PER_THREAD],
    int num_tile_items
    ) {

  cudaAssert(data != NULL);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        items[ITEM] = data[item_off[ITEM]];
      }
    }
  }
}


// TODO: when the segment is broadcast or replicated, read it from the local GPU instead.
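// Resolve each selected row offset to the GPU that caches its segment: try the
// local GPU's column index (all_col_idx[cur_gpu]) first, otherwise look up the
// owning GPU through seg_row_to_gpu and read the value from that GPU's cache
// (gpuCache[gpu]).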
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetGlobal(
    int (&item_off)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD],
    int  (&items)[ITEMS_PER_THREAD],
    int** gpuCache,
    int*** all_col_idx,
    int** seg_row_to_gpu,
    int* seg_is_replicated,
    int column_id,
    int table_id,
    int cur_gpu,
    int num_tile_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        // int segment_idx = item_off[ITEM]/SEGMENT_SIZE;
        // // if (threadIdx.x == 0) printf("%d\n", segment_idx);
        // // int gpu = seg_row_to_gpu[table_id][segment_idx]; //0 here is the table_id (for now it's always the table id)
        // // if (all_col_idx[gpu][column_id] == NULL) printf("%d %d %d\n", table_id, segment_idx, gpu);
        // int gpu;
        // if (seg_is_replicated[segment_idx]) gpu = cur_gpu;
        // else gpu = seg_row_to_gpu[table_id][segment_idx]; //0 here is the table_id (for now it's always the table id)
        // // cudaAssert(gpu == cur_gpu);
        // cudaAssert(gpu >= 0 && gpu < NUM_GPU);
        // cudaAssert(all_col_idx[gpu] != NULL && all_col_idx[gpu][column_id] != NULL);        
        // cudaAssert(all_col_idx[gpu][column_id][segment_idx] != -1);
        // cudaAssert(gpuCache[gpu] != NULL);
        // int64_t val_segment = all_col_idx[gpu][column_id][segment_idx];
        // items[ITEM] = gpuCache[gpu][val_segment * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
        
        int segment_idx = item_off[ITEM]/SEGMENT_SIZE;
        cudaAssert(all_col_idx[cur_gpu] != NULL && all_col_idx[cur_gpu][column_id] != NULL);
        if (all_col_idx[cur_gpu][column_id][segment_idx] != -1) {
          int64_t val_segment = all_col_idx[cur_gpu][column_id][segment_idx];
          cudaAssert(gpuCache[cur_gpu] != NULL);
          items[ITEM] = gpuCache[cur_gpu][val_segment * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
        } else {
          cudaAssert(seg_row_to_gpu[table_id] != NULL && seg_row_to_gpu[table_id][segment_idx] != -1);
          int gpu = seg_row_to_gpu[table_id][segment_idx];
          cudaAssert(gpu >= 0 && gpu < NUM_GPU);
          cudaAssert(all_col_idx[gpu] != NULL && all_col_idx[gpu][column_id] != NULL);        
          cudaAssert(all_col_idx[gpu][column_id][segment_idx] != -1);
          cudaAssert(gpuCache[gpu] != NULL);
          int64_t val_segment = all_col_idx[gpu][column_id][segment_idx];
          items[ITEM] = gpuCache[gpu][val_segment * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
        }
        // printf("%d %d\n", items[ITEM], item_off[ITEM]);
      }
    }
  }
}

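// Same as BlockReadOffsetGlobal, but first checks broadcast_idx for a locally
// broadcast copy of the segment before falling back to the per-GPU lookup.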
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetGlobal2(
    int (&item_off)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD],
    int  (&items)[ITEMS_PER_THREAD],
    int** gpuCache,
    int*** all_col_idx,
    int** broadcast_idx,
    int** seg_row_to_gpu,
    int* seg_is_replicated,
    int column_id,
    int table_id,
    int cur_gpu,
    int num_tile_items
    ) {

  cudaAssert(broadcast_idx[column_id] != NULL);
  cudaAssert(seg_row_to_gpu[table_id] != NULL);

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int seg = item_off[ITEM]/SEGMENT_SIZE;
        // if (blockIdx.x == 0 && column_id == 19) printf("broad %d %d %d\n", item_off[ITEM], seg, broadcast_idx[column_id][seg]);
        if (broadcast_idx[column_id][seg] != -1) {
          int64_t segment_idx = broadcast_idx[column_id][seg];
          cudaAssert(gpuCache[cur_gpu] != NULL);
          items[ITEM] = gpuCache[cur_gpu][segment_idx * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
          // if (blockIdx.x == 0 && column_id == 11) printf("broad %d %d %d %d\n", broadcast_idx[column_id][seg], items[ITEM], seg, (item_off[ITEM]));
        } else {
          // int gpu;
          // if (seg_is_replicated[seg]) gpu = cur_gpu;
          // else gpu = seg_row_to_gpu[table_id][seg]; //0 here is the table_id (for now it's always the table id)
          // //seg_row_to_gpu just always pick the first GPU which has the segment
          // // cudaAssert(gpu == cur_gpu);
          // // if (gpu != cur_gpu) printf("%d %d %d %d\n", gpu, cur_gpu, column_id, seg);
          // cudaAssert(gpu >= 0 && gpu < NUM_GPU);
          // cudaAssert(all_col_idx[gpu] != NULL && all_col_idx[gpu][column_id] != NULL);        
          // cudaAssert(all_col_idx[gpu][column_id][seg] != -1);
          // cudaAssert(gpuCache[gpu] != NULL);
          // int64_t val_segment = all_col_idx[gpu][column_id][seg];
          // items[ITEM] = gpuCache[gpu][val_segment * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
          // // if (blockIdx.x == 0) printf("not broad %d %d %d %d\n", items[0], items[1], items[2], items[3]);

          cudaAssert(all_col_idx[cur_gpu] != NULL && all_col_idx[cur_gpu][column_id] != NULL);
          if (all_col_idx[cur_gpu][column_id][seg] != -1) {
            int64_t val_segment = all_col_idx[cur_gpu][column_id][seg];
            cudaAssert(gpuCache[cur_gpu] != NULL);
            items[ITEM] = gpuCache[cur_gpu][val_segment * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
          } else {
            cudaAssert(seg_row_to_gpu[table_id] != NULL && seg_row_to_gpu[table_id][seg] != -1);
            int gpu = seg_row_to_gpu[table_id][seg];
            cudaAssert(gpu >= 0 && gpu < NUM_GPU);
            cudaAssert(all_col_idx[gpu] != NULL && all_col_idx[gpu][column_id] != NULL);        
            cudaAssert(all_col_idx[gpu][column_id][seg] != -1);
            cudaAssert(gpuCache[gpu] != NULL);
            int64_t val_segment = all_col_idx[gpu][column_id][seg];
            items[ITEM] = gpuCache[gpu][val_segment * SEGMENT_SIZE + (item_off[ITEM] % SEGMENT_SIZE)];
          }
        }
      }
    }
  }
}

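// The partition helpers below claim output slots by atomically incrementing the
// per-partition counters in smem_index (the *CountIndexPartition* variants) and
// then scatter keys, values, or offsets to the per-partition output buffers at
// those indices (the *WritePartition* / *WriteValPartition* variants). For
// replicated tables only the local GPU's partition is materialized.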
template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockLoadAndCountIndexPartition2(
    int* key,
    int  (&item_key)[ITEMS_PER_THREAD],
    int  (&index)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* smem_index,
    int num_tile_items,
    int range_size,
    int is_replicated,
    int gpu
    ) {

  cudaAssert(key != NULL);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        item_key[ITEM] = key[(ITEM * BLOCK_THREADS) + threadIdx.x];
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        if (is_replicated) {
          if (partition == gpu) index[ITEM] = atomicAdd(&(smem_index[partition]), 1);
          else index[ITEM] = 0;
        } else index[ITEM] = atomicAdd(&(smem_index[partition]), 1);
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockCountIndexPartition(
    int  (&item_key)[ITEMS_PER_THREAD],
    int  (&index)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD],
    int* smem_index,
    int num_tile_items,
    int range_size,
    int is_replicated,
    int gpu
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        if (is_replicated) {
          if (partition == gpu) index[ITEM] = atomicAdd(&(smem_index[partition]), 1);
          else index[ITEM] = 0;
        } else index[ITEM] = atomicAdd(&(smem_index[partition]), 1);
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockWritePartition(
    int  (&items)[ITEMS_PER_THREAD],
    int  (&index)[ITEMS_PER_THREAD],
    int** output,
    int num_tile_items,
    int range_size,
    int is_replicated,
    int gpu
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      int partition = items[ITEM]/range_size; //if range size is wrong, there could be an extra partition
      cudaAssert(partition <= NUM_RANGE);
      if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
      // if (blockIdx.x == 1 && threadIdx.x == 4) printf("key = %d range size = %d partition = %d\n", items[ITEM], range_size, partition);
      cudaAssert(output[partition] != NULL);
      if (is_replicated) {
        if (partition == gpu) output[partition][index[ITEM]] = items[ITEM];
      } else output[partition][index[ITEM]] = items[ITEM];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockWritePartition2(
    int  (&items)[ITEMS_PER_THREAD],
    int  (&index)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int** output,
    int num_tile_items,
    int range_size,
    int is_replicated,
    int gpu
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int partition = items[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        cudaAssert(output[partition] != NULL);
        if (is_replicated) {
          if (partition == gpu) output[partition][index[ITEM]] = items[ITEM];
        } else output[partition][index[ITEM]] = items[ITEM];
      }
    }
  }
}


template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockWriteValPartition2(
    int  (&item_val)[ITEMS_PER_THREAD],
    int  (&item_key)[ITEMS_PER_THREAD],
    int  (&index)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD],
    int** output,
    int num_tile_items,
    int range_size,
    int is_replicated,
    int gpu
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        cudaAssert(output[partition] != NULL);
        if (is_replicated) {
          if (partition == gpu) output[partition][index[ITEM]] = item_val[ITEM];
        } else output[partition][index[ITEM]] = item_val[ITEM];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockLoadAndWriteValPartition(
    int* value,
    int  (&item_key)[ITEMS_PER_THREAD],
    int  (&item_val)[ITEMS_PER_THREAD],
    int  (&index)[ITEMS_PER_THREAD],
    int** output,
    int num_tile_items,
    int range_size,
    int is_replicated,
    int gpu
    ) {

  cudaAssert(value != NULL);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      item_val[ITEM] = value[(ITEM * BLOCK_THREADS) + threadIdx.x];
      int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
      cudaAssert(partition <= NUM_RANGE);
      if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
      cudaAssert(output[partition] != NULL);
      if (is_replicated) {
        if (partition == gpu) output[partition][index[ITEM]] = item_val[ITEM];
      } else output[partition][index[ITEM]] = item_val[ITEM];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockLoadAndWriteValPartition2(
    int* value,
    int  (&item_key)[ITEMS_PER_THREAD],
    int  (&item_val)[ITEMS_PER_THREAD],
    int  (&index)[ITEMS_PER_THREAD],
    int (&selection_flags)[ITEMS_PER_THREAD],
    int** output,
    int num_tile_items,
    int range_size,
    int is_replicated,
    int gpu
    ) {

  cudaAssert(value != NULL);
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        item_val[ITEM] = value[(ITEM * BLOCK_THREADS) + threadIdx.x];
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        cudaAssert(output[partition] != NULL);
        if (is_replicated) {
          if (partition == gpu) output[partition][index[ITEM]] = item_val[ITEM];
        } else output[partition][index[ITEM]] = item_val[ITEM];
      }
    }
  }
}

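// Write, for each selected key, its global row offset (start_offset plus the
// position of this tile inside the current segment plus the lane's striped
// index) into the per-partition offset buffer.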
template<int BLOCK_THREADS, int ITEMS_PER_THREAD, int NUM_RANGE>
__device__ __forceinline__ void BlockWriteOffSelf(
    int  (&item_key)[ITEMS_PER_THREAD],
    int  (&index)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int** offset,
    int start_offset, //offset to the current segment
    int num_tile_items,
    int range_size,
    int is_replicated,
    int gpu
    ) {

  int tile_size = ITEMS_PER_THREAD * BLOCK_THREADS;
  int tile_idx = blockIdx.x % (SEGMENT_SIZE/tile_size); //tile index in current segment
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items) {
      if (selection_flags[ITEM]) {
        int partition = item_key[ITEM]/range_size; //if range size is wrong, there could be an extra partition
        cudaAssert(partition <= NUM_RANGE);
        if (partition >= NUM_RANGE) partition = NUM_RANGE-1;
        cudaAssert(offset[partition] != NULL);
        if (is_replicated) {
          if (partition == gpu) offset[partition][index[ITEM]] = start_offset + tile_idx * tile_size + threadIdx.x + ITEM * BLOCK_THREADS;
        } else offset[partition][index[ITEM]] = start_offset + tile_idx * tile_size + threadIdx.x + ITEM * BLOCK_THREADS;
        // if (blockIdx.x == 0) printf("%d\n", index[ITEM]);
        // if (index[ITEM] < 0 || index[ITEM] > 692060) printf("%d %d %d %d\n", blockIdx.x, threadIdx.x, partition, index[ITEM]);
      }
    }
  }
}

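// Hash table layout used by the probe/build helpers below: each slot occupies
// two ints, ht[2*hash] holds the build-side value for group-by joins (claimed
// once via atomicCAS) and ht[2*hash + 1] holds the build row offset + 1, so 0
// marks an empty slot. Each family comes in three flavors: a Direct version for
// full tiles, a bounds-checked Direct overload for the last partial tile, and a
// dispatcher that picks between them based on num_items.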
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPUDirect(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int  (&offset)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    // Only items that passed earlier selection predicates are probed
    if (selection_flags[ITEM]) { 
      int hash = HASH(items[ITEM], ht_len, keys_min); 
      int slot = ht[(hash << 1) + 1];
      if (slot != 0) {
        offset[ITEM] = slot - 1;
      } else {
        // printf("keys %d %d %d\n", items[ITEM], blockIdx.x, threadIdx.x);
        selection_flags[ITEM] = 0;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPUDirect(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int  (&offset)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min,
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    // Out-of-bounds items are skipped; selection flags gate the rest
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) { 
        int hash = HASH(items[ITEM], ht_len, keys_min); 
        int slot = ht[(hash << 1) + 1];
        if (slot != 0) {
          offset[ITEM] = slot - 1;
        } else {
          // cudaAssert(0);
          // printf("keys %d %d %d\n", items[ITEM], blockIdx.x, threadIdx.x);
          selection_flags[ITEM] = 0;
        }
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPU(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int  (&offset)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min,
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockProbeGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, offset, selection_flags, ht, ht_len, keys_min);
  } else {
    BlockProbeGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, offset, selection_flags, ht, ht_len, keys_min, num_items);
  }

}
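
// Illustrative usage sketch (not part of the original library): a minimal probe
// kernel showing how the selection flags, the striped tid + ITEM * BLOCK_THREADS
// layout, and BlockProbeGPU compose. The kernel name and the buffers keys,
// out_off, and ht are hypothetical; it assumes a hash table already built with
// the Build* helpers below and the HASH macro defined by the including code.
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void probe_example_kernel(int* keys, int* out_off, int* ht,
                                     int ht_len, int keys_min, int num_items) {
  int items[ITEMS_PER_THREAD];
  int offsets[ITEMS_PER_THREAD];
  int selection_flags[ITEMS_PER_THREAD];

  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;
  int tile_offset = blockIdx.x * tile_size;
  int num_tile_items = num_items - tile_offset;
  if (num_tile_items > tile_size) num_tile_items = tile_size;

  // start with every lane selected (out-of-bounds lanes are never read back)
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) selection_flags[ITEM] = 1;

  // striped load, matching the layout the Block* helpers expect
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items)
      items[ITEM] = keys[tile_offset + threadIdx.x + (ITEM * BLOCK_THREADS)];
  }

  // probe: matches receive the build row offset, misses clear the flag
  BlockProbeGPU<BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, offsets,
      selection_flags, ht, ht_len, keys_min, num_tile_items);

  // scatter the surviving offsets back to global memory (-1 marks a miss)
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (threadIdx.x + (ITEM * BLOCK_THREADS) < num_tile_items)
      out_off[tile_offset + threadIdx.x + (ITEM * BLOCK_THREADS)] =
          selection_flags[ITEM] ? offsets[ITEM] : -1;
  }
}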

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPUDirect(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int  (&res)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    // Only items that passed earlier selection predicates are probed
    if (selection_flags[ITEM]) {
      int hash = HASH(items[ITEM], ht_len, keys_min);
      uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
      if (slot != 0) {
        res[ITEM] = (int) slot; // low 32 bits hold the value stored at ht[hash << 1]
      } else {
        selection_flags[ITEM] = 0;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPUDirect(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int  (&res)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min,
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    // Out-of-bounds items are skipped; selection flags gate the rest
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int hash = HASH(items[ITEM], ht_len, keys_min);
        uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
        if (slot != 0) {
          res[ITEM] = (int) slot; // low 32 bits hold the value stored at ht[hash << 1]
        } else {
          selection_flags[ITEM] = 0;
        }
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPU(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int  (&res)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min,
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockProbeGroupByGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, res, selection_flags, ht, ht_len, keys_min);
  } else {
    BlockProbeGroupByGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, res, selection_flags, ht, ht_len, keys_min, num_items);
  }

}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPU2Direct(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&offset)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* ht,
    int ht_len,
    int keys_min
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    // Only items that passed earlier selection predicates are probed
    if (selection_flags[ITEM]) {  
      int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
      int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      int hash = HASH(key, ht_len, keys_min);
      int slot = ht[(hash << 1) + 1];
      if (slot != 0) {
        offset[ITEM] = slot - 1;
      } else {
        selection_flags[ITEM] = 0;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPU2Direct(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&offset)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* ht,
    int ht_len,
    int keys_min,
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    // Out-of-bounds items are skipped; selection flags gate the rest
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {  
        int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
        int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
        int hash = HASH(key, ht_len, keys_min);
        int slot = ht[(hash << 1) + 1];
        if (slot != 0) {
          offset[ITEM] = slot - 1;
        } else {
          selection_flags[ITEM] = 0;
        }
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGPU2(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&offset)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* ht,
    int ht_len,
    int keys_min,
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockProbeGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, offset, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min);
  } else {
    BlockProbeGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, offset, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min, num_items);
  }

}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPU2Direct(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&res)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* ht,
    int ht_len,
    int keys_min
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    // Only items that passed earlier selection predicates are probed
    if (selection_flags[ITEM]) {
      int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
      int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      int hash = HASH(key, ht_len, keys_min);
      uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
      if (slot != 0) {
        res[ITEM] = slot;
      } else {
        selection_flags[ITEM] = 0;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPU2Direct(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&res)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* ht,
    int ht_len,
    int keys_min,
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    // Out-of-bounds items are skipped; selection flags gate the rest
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
        int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
        int hash = HASH(key, ht_len, keys_min);
        uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
        if (slot != 0) {
          res[ITEM] = slot;
        } else {
          selection_flags[ITEM] = 0;
        }
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeGroupByGPU2(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&res)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* ht,
    int ht_len,
    int keys_min,
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockProbeGroupByGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, res, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min);
  } else {
    BlockProbeGroupByGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, res, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min, num_items);
  }
}

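// Simple per-register utilities: copy items into offset for selected lanes,
// or set items to a constant for selected (or all) in-bounds lanes.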
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockPassThroughOffsetDirect(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int  (&offset)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD]
    ) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (selection_flags[ITEM]) {
      offset[ITEM] = items[ITEM];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockPassThroughOffsetDirect(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int  (&offset)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int num_items
    ) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        offset[ITEM] = items[ITEM];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockPassThroughOffset(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int  (&offset)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockPassThroughOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, offset, selection_flags);
  } else {
    BlockPassThroughOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, offset, selection_flags, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetFilteredValueDirect(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int value,
    int  (&selection_flags)[ITEMS_PER_THREAD]
    ) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (selection_flags[ITEM]) {
      items[ITEM] = value;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetFilteredValueDirect(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int value,
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int num_items
    ) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) { // striped arrangement (CUB would use a blocked arrangement)
      if (selection_flags[ITEM]) {
        items[ITEM] = value;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetFilteredValue(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int value,
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockSetFilteredValueDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, value, selection_flags);
  } else {
    BlockSetFilteredValueDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, value, selection_flags, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetValueDirect(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int value
    ) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    items[ITEM] = value;
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetValueDirect(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int value,
    int num_items
    ) {
  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) { // striped arrangement (CUB would use a blocked arrangement)
      items[ITEM] = value;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockSetValue(
    int tid,
    int  (&items)[ITEMS_PER_THREAD],
    int value,
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockSetValueDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, value);
  } else {
    BlockSetValueDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items, value, num_items);
  }
}

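// The helpers below dereference a global row offset against the GPU cache:
// col_idx maps the row's segment (row / SEGMENT_SIZE) to that segment's slot in
// gpuCache, so the value lives at
//   gpuCache[col_idx[row / SEGMENT_SIZE] * SEGMENT_SIZE + row % SEGMENT_SIZE].
// BlockReadOffsetGPU3 additionally falls back to broadcast_idx when the column
// index has no entry (-1) for that segment.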
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadFilteredOffsetDirect(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&items)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* col_idx
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (selection_flags[ITEM]) {
      int64_t val_segment = col_idx[items_off[ITEM] / SEGMENT_SIZE];
      items[ITEM] = gpuCache[val_segment * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadFilteredOffsetDirect(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&items)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* col_idx,
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t val_segment = col_idx[items_off[ITEM] / SEGMENT_SIZE];
        items[ITEM] = gpuCache[val_segment * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadFilteredOffset(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&items)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* col_idx,
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockReadFilteredOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, items, selection_flags, gpuCache, col_idx);
  } else {
    BlockReadFilteredOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, items, selection_flags, gpuCache, col_idx, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetDirect(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&items)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* col_idx
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    int64_t val_segment = col_idx[items_off[ITEM] / SEGMENT_SIZE];
    items[ITEM] = gpuCache[val_segment * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetDirect(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&items)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* col_idx,
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      int64_t val_segment = col_idx[items_off[ITEM] / SEGMENT_SIZE];
      items[ITEM] = gpuCache[val_segment * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetGPU(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&items)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* col_idx,
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockReadOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, items, gpuCache, col_idx);
  } else {
    BlockReadOffsetDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, items, gpuCache, col_idx, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetGPU2(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&items)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* col_idx,
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t val_segment = col_idx[items_off[ITEM] / SEGMENT_SIZE];
        items[ITEM] = gpuCache[val_segment * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockReadOffsetGPU3(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD],
    int  (&items)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* col_idx,
    int* broadcast_idx,
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t segment_idx = col_idx[items_off[ITEM] / SEGMENT_SIZE];
        if (segment_idx == -1) {
          segment_idx = broadcast_idx[items_off[ITEM] / SEGMENT_SIZE];
        }
        cudaAssert(segment_idx != -1);
        items[ITEM] = gpuCache[segment_idx * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      }
    }
  }
}

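// Build helpers: BlockBuildValueGPU* store the group-by value into ht[2*hash]
// (claimed once via atomicCAS) and the build row offset + 1 into ht[2*hash + 1];
// BlockBuildOffsetGPU* only store the offset. The "2" variants read key/value
// through gpuCache and the segment index arrays instead of taking them as
// registers.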
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPUDirect(
    int tid,
    int blockid,
    int start_offset,
    int  (&keys)[ITEMS_PER_THREAD], //equal to items
    int  (&vals)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min
    ) {

  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (selection_flags[ITEM]) {
      int hash = HASH(keys[ITEM], ht_len, keys_min);
      atomicCAS(&ht[hash << 1], 0, vals[ITEM]);
      ht[(hash << 1) + 1] = start_offset + (blockid * tile_size) + (tid + ITEM * BLOCK_THREADS) + 1;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPUDirect(
    int tid,
    int blockid,
    int start_offset,
    int  (&keys)[ITEMS_PER_THREAD], //equal to items
    int  (&vals)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min, // equal to val_min
    int num_items
    ) {

  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {  // striped arrangement, since Crystal BlockLoad is used instead of CUB
      if (selection_flags[ITEM]) {
        int hash = HASH(keys[ITEM], ht_len, keys_min);
        atomicCAS(&ht[hash << 1], 0, vals[ITEM]);
        ht[(hash << 1) + 1] = start_offset + (blockid * tile_size) + (tid + ITEM * BLOCK_THREADS) + 1;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPU(
    int tid,
    int blockid,
    int start_offset,
    int  (&keys)[ITEMS_PER_THREAD], //equal to items
    int  (&vals)[ITEMS_PER_THREAD],
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min, // equal to val_min
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockBuildValueGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, blockid, start_offset, keys, vals, selection_flags, ht, ht_len, keys_min);
  } else {
    BlockBuildValueGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, blockid, start_offset, keys, vals, selection_flags, ht, ht_len, keys_min, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPUDirect(
    int tid,
    int blockid,
    int start_offset,
    int  (&keys)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min
    ) {

  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (selection_flags[ITEM]) {
      int hash = HASH(keys[ITEM], ht_len, keys_min);
      // if (hash < 0 || hash > 41943040) printf("%d %d %d %d\n", keys[ITEM], hash, ht_len, keys_min);
      ht[(hash << 1) + 1] = start_offset + (blockid * tile_size) + (tid + ITEM * BLOCK_THREADS) + 1;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPUDirect(
    int tid,
    int blockid,
    int start_offset,
    int  (&keys)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min, // equal to val_min
    int num_items
    ) {

  int tile_size = BLOCK_THREADS * ITEMS_PER_THREAD;

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {  // striped arrangement, since Crystal BlockLoad is used instead of CUB
      if (selection_flags[ITEM]) {
        int hash = HASH(keys[ITEM], ht_len, keys_min);
        ht[(hash << 1) + 1] = start_offset + (blockid * tile_size) + (tid + ITEM * BLOCK_THREADS) + 1;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPU(
    int tid,
    int blockid,
    int start_offset,
    int  (&keys)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* ht,
    int ht_len,
    int keys_min, // equal to val_min
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockBuildOffsetGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, blockid, start_offset, keys, selection_flags, ht, ht_len, keys_min);
  } else {
    BlockBuildOffsetGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, blockid, start_offset, keys, selection_flags, ht, ht_len, keys_min, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPU2Direct(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* val_idx,
    int* ht,
    int ht_len,
    int keys_min
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (selection_flags[ITEM]) {
      int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
      int64_t dimval_seg = val_idx[items_off[ITEM] / SEGMENT_SIZE];
      int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      int val = gpuCache[dimval_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];

      // the value slot is claimed once; the offset slot records the row id + 1
      int hash = HASH(key, ht_len, keys_min);
      atomicCAS(&ht[hash << 1], 0, val);
      ht[(hash << 1) + 1] = items_off[ITEM] + 1;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPU2Direct(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* val_idx,
    int* ht,
    int ht_len,
    int keys_min, // equal to val_min
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
        int64_t dimval_seg = val_idx[items_off[ITEM] / SEGMENT_SIZE];
        int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
        int val = gpuCache[dimval_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];

        // the value slot is claimed once; the offset slot records the row id + 1
        int hash = HASH(key, ht_len, keys_min);
        atomicCAS(&ht[hash << 1], 0, val);
        ht[(hash << 1) + 1] = items_off[ITEM] + 1;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildValueGPU2(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* val_idx,
    int* ht,
    int ht_len,
    int keys_min, // equal to val_min
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockBuildValueGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, val_idx, ht, ht_len, keys_min);
  } else {
    BlockBuildValueGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, val_idx, ht, ht_len, keys_min, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPU2Direct(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* ht,
    int ht_len,
    int keys_min
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (selection_flags[ITEM]) {
      int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
      int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      int hash = HASH(key, ht_len, keys_min);
      ht[(hash << 1) + 1] = items_off[ITEM] + 1;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPU2Direct(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* ht,
    int ht_len,
    int keys_min, // equal to val_min
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
        int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
        int hash = HASH(key, ht_len, keys_min);
        ht[(hash << 1) + 1] = items_off[ITEM] + 1;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildOffsetGPU2(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int* ht,
    int ht_len,
    int keys_min, // equal to val_min
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockBuildOffsetGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min);
  } else {
    BlockBuildOffsetGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, ht, ht_len, keys_min, num_items);
  }
}

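// Per-thread running min/max over the selected keys of a tile; the block- or
// grid-wide reduction of the per-thread results is left to the caller.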
template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPUDirect(
    int tid,
    int  (&keys)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int &min,
    int &max,
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        if (keys[ITEM] < min) min = keys[ITEM];
        if (keys[ITEM] > max) max = keys[ITEM];
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPUDirect(
    int tid,
    int  (&keys)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int &min,
    int &max
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (selection_flags[ITEM]) {
      if (keys[ITEM] < min) min = keys[ITEM];
      if (keys[ITEM] > max) max = keys[ITEM];
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPU(
    int tid,
    int  (&keys)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int &min,
    int &max,
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockMinMaxGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, keys, selection_flags, min, max);
  } else {
    BlockMinMaxGPUDirect<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, keys, selection_flags, min, max, num_items);
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPU2Direct(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int &min,
    int &max,
    int num_items
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (tid + (ITEM * BLOCK_THREADS) < num_items) {
      if (selection_flags[ITEM]) {
        int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
        int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
        if (key < min) min = key;
        if (key > max) max = key;
      }
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPU2Direct(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int &min,
    int &max
    ) {

  #pragma unroll
  for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
  {
    if (selection_flags[ITEM]) {
      int64_t dimkey_seg = key_idx[items_off[ITEM] / SEGMENT_SIZE];
      int key = gpuCache[dimkey_seg * SEGMENT_SIZE + (items_off[ITEM] % SEGMENT_SIZE)];
      if (key < min) min = key;
      if (key > max) max = key;
    }
  }
}

template<int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockMinMaxGPU2(
    int tid,
    int  (&items_off)[ITEMS_PER_THREAD], //equal to items
    int  (&selection_flags)[ITEMS_PER_THREAD],
    int* gpuCache,
    int* key_idx,
    int &min,
    int &max,
    int num_items
    ) {

  if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
    BlockMinMaxGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, min, max);
  } else {
    BlockMinMaxGPU2Direct<BLOCK_THREADS, ITEMS_PER_THREAD>(tid, items_off, selection_flags, gpuCache, key_idx, min, max, num_items);
  }
}

#endif