#pragma once
#define HASH(X,Y,Z) ((X-Z) % Y)
template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeDirectAndPHT_1(
int tid,
K (&items)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min
) {
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
if (selection_flags[ITEM]) {
int hash = HASH(items[ITEM], ht_len, keys_min);
K slot = ht[hash];
if (slot != 0) {
selection_flags[ITEM] = 1;
} else {
selection_flags[ITEM] = 0;
}
}
}
}
template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeDirectAndPHT_1(
int tid,
K (&items)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min,
int num_items
) {
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
if (tid + (ITEM * BLOCK_THREADS) < num_items) {
if (selection_flags[ITEM]) {
int hash = HASH(items[ITEM], ht_len, keys_min);
K slot = ht[hash];
if (slot != 0) {
selection_flags[ITEM] = 1;
} else {
selection_flags[ITEM] = 0;
}
}
}
}
}
template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeAndPHT_1(
K (&items)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min,
int num_items
) {
if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, selection_flags, ht, ht_len, keys_min);
} else {
BlockProbeDirectAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, items, selection_flags, ht, ht_len, keys_min, num_items);
}
}
template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeAndPHT_1(
K (&items)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
int num_items
) {
BlockProbeAndPHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(items, selection_flags, ht, ht_len, 0, num_items);
}
template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
int tid,
K (&keys)[ITEMS_PER_THREAD],
V (&res)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min
) {
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
if (selection_flags[ITEM]) {
int hash = HASH(keys[ITEM], ht_len, keys_min);
uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
if (slot != 0) {
res[ITEM] = (slot >> 32);
} else {
selection_flags[ITEM] = 0;
}
}
}
}
template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeDirectAndPHT_2(
int tid,
K (&items)[ITEMS_PER_THREAD],
V (&res)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min,
int num_items
) {
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
if (tid + (ITEM * BLOCK_THREADS) < num_items) {
if (selection_flags[ITEM]) {
int hash = HASH(items[ITEM], ht_len, keys_min);
uint64_t slot = *reinterpret_cast<uint64_t*>(&ht[hash << 1]);
if (slot != 0) {
res[ITEM] = (slot >> 32);
} else {
selection_flags[ITEM] = 0;
}
}
}
}
}
template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeAndPHT_2(
K (&keys)[ITEMS_PER_THREAD],
V (&res)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min,
int num_items
) {
if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min);
} else {
BlockProbeDirectAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
}
}
template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockProbeAndPHT_2(
K (&keys)[ITEMS_PER_THREAD],
V (&res)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
int num_items
) {
BlockProbeAndPHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, res, selection_flags, ht, ht_len, 0, num_items);
}
template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(
int tid,
K (&keys)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min
) {
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
if (selection_flags[ITEM]) {
int hash = HASH(keys[ITEM], ht_len, keys_min);
K old = atomicCAS(&ht[hash], 0, keys[ITEM]);
}
}
}
template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildDirectSelectivePHT_1(
int tid,
K (&items)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min,
int num_items
) {
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
if (tid + (ITEM * BLOCK_THREADS) < num_items) {
if (selection_flags[ITEM]) {
int hash = HASH(items[ITEM], ht_len, keys_min);
K old = atomicCAS(&ht[hash], 0, items[ITEM]);
}
}
}
}
template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildSelectivePHT_1(
K (&keys)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min,
int num_items
) {
if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, selection_flags, ht, ht_len, keys_min);
} else {
BlockBuildDirectSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(threadIdx.x, keys, selection_flags, ht, ht_len, keys_min, num_items);
}
}
template<typename K, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildSelectivePHT_1(
K (&keys)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
int num_items
) {
BlockBuildSelectivePHT_1<K, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, selection_flags, ht, ht_len, 0, num_items);
}
template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(
int tid,
K (&keys)[ITEMS_PER_THREAD],
V (&res)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min
) {
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
if (selection_flags[ITEM]) {
int hash = HASH(keys[ITEM], ht_len, keys_min);
K old = atomicCAS(&ht[hash << 1], 0, keys[ITEM]);
ht[(hash << 1) + 1] = res[ITEM];
}
}
}
template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildDirectSelectivePHT_2(
int tid,
K (&keys)[ITEMS_PER_THREAD],
V (&res)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min,
int num_items
) {
#pragma unroll
for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) {
if (tid + (ITEM * BLOCK_THREADS) < num_items) {
if (selection_flags[ITEM]) {
int hash = HASH(keys[ITEM], ht_len, keys_min);
K old = atomicCAS(&ht[hash << 1], 0, keys[ITEM]);
ht[(hash << 1) + 1] = res[ITEM];
}
}
}
}
template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildSelectivePHT_2(
K (&keys)[ITEMS_PER_THREAD],
V (&res)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
K keys_min,
int num_items
) {
if ((BLOCK_THREADS * ITEMS_PER_THREAD) == num_items) {
BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min);
} else {
BlockBuildDirectSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(
threadIdx.x, keys, res, selection_flags, ht, ht_len, keys_min, num_items);
}
}
template<typename K, typename V, int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ void BlockBuildSelectivePHT_2(
K (&keys)[ITEMS_PER_THREAD],
V (&res)[ITEMS_PER_THREAD],
int (&selection_flags)[ITEMS_PER_THREAD],
K* ht,
int ht_len,
int num_items
) {
BlockBuildSelectivePHT_2<K, V, BLOCK_THREADS, ITEMS_PER_THREAD>(keys, res, selection_flags, ht, ht_len, 0, num_items);
}