Lancelot / src / cpu / join.cpp
join.cpp
Raw
#include <math.h>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <cassert>
#include "tbb/tbb.h"
#include "tbb/parallel_for.h"

#include "utils/generator.h"
#include "utils/cpu_utils.h"

using namespace tbb;
using namespace std;

void *mamalloc(size_t size)
{
  void *ptr = NULL;
  return posix_memalign(&ptr, 64, size) ? NULL : ptr;
}

int *align(const int* p)
{
  while (63 & (size_t) p) p++;
  return (int*) p;
}

int log_2(size_t n)
{
  size_t b = 1;
  int p = 0;
  while (b < n) {
    b += b;
    p++;
  }
  assert(b == n);
  return p;
}

typedef struct {
  uint32_t key;
  uint32_t val;
} bucket_t;

// TODO: Make this multi-threaded
float build(const uint32_t *keys, const uint32_t *vals, size_t size, bucket_t *table,
           int num_slots)
{
  chrono::high_resolution_clock::time_point start, finish;
  const int batch_size = 1<<22;

  start = chrono::high_resolution_clock::now();
  parallel_for(blocked_range<size_t>(0, size, batch_size), [&](auto range) {
    size_t i;
    for (i = range.begin(); i != range.end(); ++i) {
      uint32_t key = keys[i];
      uint32_t val = vals[i];
      size_t h = (key & (num_slots - 1));
      //while (table[h].key != invalid_key)
        //h = (h + 1) & (buckets - 1);
      table[h].key = key;
      table[h].val = val;
    }
  });

  finish = chrono::high_resolution_clock::now();

  std::chrono::duration<double> diff = finish - start;
  return diff.count() * 1000;
}

inline void store(uint32_t *p, uint32_t v)
{
#ifndef __MIC
  _mm_stream_si32((int*) p, v);
#else
  *p = v;
#endif
}

float probe(uint32_t *keys, uint32_t *vals, size_t size,
              const bucket_t *table, int num_slots)
{
  chrono::high_resolution_clock::time_point start, finish;
  const int batch_size = 1<<29;
  const uint32_t invalid_key = 0;
  tbb::atomic<long long> full_checksum;
  full_checksum = 0;

  start = chrono::high_resolution_clock::now();

  parallel_for(blocked_range<size_t>(0, size, batch_size), [&](auto range) {
    size_t i;
    long long checksum = 0;
    const uint64_t *table_64 = (const uint64_t*) table;
    for (i = range.begin(); i != range.end(); ++i) {
      uint32_t key = keys[i];
      uint32_t val = vals[i];
      size_t h = (uint32_t) (key & (num_slots - 1));
      //size_t h = HASH(key, num_slots);
      uint64_t tab = table_64[h];
      if (key == (uint32_t) tab) {
        checksum += val + (tab >>  32);
        //cout << "K V1 V2 " << key << " " << val << " " << (tab >> 32) << endl;
      } else while (invalid_key != (uint32_t) tab) {
        h = (h + 1) & (num_slots - 1);
        tab = table_64[h];
        if (key == (uint32_t) tab) {
          checksum += val + (tab >> 32);
          break;
        }
      }
    }
    full_checksum.fetch_and_add(checksum);
  });

  finish = chrono::high_resolution_clock::now();

  long long res_checksum = full_checksum.fetch_and_add(0);
  cout << "Checksum Scalar " << res_checksum << endl;

  std::chrono::duration<double> diff = finish - start;
  return diff.count() * 1000;
}

// Size of group for prefetching
#define GROUP_PREFETCH_SIZE 16

float probe_prefetch(uint32_t *keys, uint32_t *vals, size_t size,
              const bucket_t *table, int num_slots)
{
  chrono::high_resolution_clock::time_point start, finish;
  const int batch_size = 1<<29;
  const uint32_t invalid_key = 0;
  tbb::atomic<long long> full_checksum;
  full_checksum = 0;

  start = chrono::high_resolution_clock::now();

  parallel_for(blocked_range<size_t>(0, size, batch_size), [&](auto range) {
    size_t i, j;
    long long checksum = 0;
    const uint64_t *table_64 = (const uint64_t*) table;
    for (i = range.begin(); i != range.end(); i += GROUP_PREFETCH_SIZE) {
      uint32_t keys_arr[GROUP_PREFETCH_SIZE];
      uint32_t vals_arr[GROUP_PREFETCH_SIZE];
      uint32_t hash_arr[GROUP_PREFETCH_SIZE];
      for (j = 0; j < GROUP_PREFETCH_SIZE; j++) {
        keys_arr[j] = keys[i + j];
        vals_arr[j] = vals[i + j];
        hash_arr[j] = (uint32_t) (keys_arr[j] & (num_slots - 1));
        _mm_prefetch(&table_64[hash_arr[j]], 0);
      }

      for (j = 0; j < GROUP_PREFETCH_SIZE; j++) {
        uint64_t tab = table_64[hash_arr[j]];
        if (keys_arr[j] == (uint32_t) tab) {
          checksum += vals_arr[j] + (tab >>  32);
          //cout << "K V1 V2 " << key << " " << val << " " << (tab >> 32) << endl;
        } else while (invalid_key != (uint32_t) tab) {
          hash_arr[j] = (hash_arr[j] + 1) & (num_slots - 1);
          tab = table_64[hash_arr[j]];
          if (keys_arr[j] == (uint32_t) tab) {
            checksum += vals_arr[j] + (tab >> 32);
            break;
          }
        }
      }
    }
    full_checksum.fetch_and_add(checksum);
  });

  finish = chrono::high_resolution_clock::now();

  long long res_checksum = full_checksum.fetch_and_add(0);
  cout << "Checksum Prefetch " << res_checksum << endl;

  std::chrono::duration<double> diff = finish - start;
  return diff.count() * 1000;
}

inline __m256i _mm256_packlo_epi32(__m256i x, __m256i y)
{
  __m256 a = _mm256_castsi256_ps(x);
  __m256 b = _mm256_castsi256_ps(y);
  __m256 c = _mm256_shuffle_ps(a, b, _MM_SHUFFLE(2,0,2,0));
  __m256i z = _mm256_castps_si256(c);
  return _mm256_permute4x64_epi64(z, _MM_SHUFFLE(3,1,2,0));
}

inline __m256i _mm256_packhi_epi32(__m256i x, __m256i y)
{
  __m256 a = _mm256_castsi256_ps(x);
  __m256 b = _mm256_castsi256_ps(y);
  __m256 c = _mm256_shuffle_ps(a, b, _MM_SHUFFLE(3,1,3,1));
  __m256i z = _mm256_castps_si256(c);
  return _mm256_permute4x64_epi64(z, _MM_SHUFFLE(3,1,2,0));
}

const uint64_t perm[256] = {0x0706050403020100ull,
     0x0007060504030201ull, 0x0107060504030200ull, 0x0001070605040302ull,
     0x0207060504030100ull, 0x0002070605040301ull, 0x0102070605040300ull,
     0x0001020706050403ull, 0x0307060504020100ull, 0x0003070605040201ull,
     0x0103070605040200ull, 0x0001030706050402ull, 0x0203070605040100ull,
     0x0002030706050401ull, 0x0102030706050400ull, 0x0001020307060504ull,
     0x0407060503020100ull, 0x0004070605030201ull, 0x0104070605030200ull,
     0x0001040706050302ull, 0x0204070605030100ull, 0x0002040706050301ull,
     0x0102040706050300ull, 0x0001020407060503ull, 0x0304070605020100ull,
     0x0003040706050201ull, 0x0103040706050200ull, 0x0001030407060502ull,
     0x0203040706050100ull, 0x0002030407060501ull, 0x0102030407060500ull,
     0x0001020304070605ull, 0x0507060403020100ull, 0x0005070604030201ull,
     0x0105070604030200ull, 0x0001050706040302ull, 0x0205070604030100ull,
     0x0002050706040301ull, 0x0102050706040300ull, 0x0001020507060403ull,
     0x0305070604020100ull, 0x0003050706040201ull, 0x0103050706040200ull,
     0x0001030507060402ull, 0x0203050706040100ull, 0x0002030507060401ull,
     0x0102030507060400ull, 0x0001020305070604ull, 0x0405070603020100ull,
     0x0004050706030201ull, 0x0104050706030200ull, 0x0001040507060302ull,
     0x0204050706030100ull, 0x0002040507060301ull, 0x0102040507060300ull,
     0x0001020405070603ull, 0x0304050706020100ull, 0x0003040507060201ull,
     0x0103040507060200ull, 0x0001030405070602ull, 0x0203040507060100ull,
     0x0002030405070601ull, 0x0102030405070600ull, 0x0001020304050706ull,
     0x0607050403020100ull, 0x0006070504030201ull, 0x0106070504030200ull,
     0x0001060705040302ull, 0x0206070504030100ull, 0x0002060705040301ull,
     0x0102060705040300ull, 0x0001020607050403ull, 0x0306070504020100ull,
     0x0003060705040201ull, 0x0103060705040200ull, 0x0001030607050402ull,
     0x0203060705040100ull, 0x0002030607050401ull, 0x0102030607050400ull,
     0x0001020306070504ull, 0x0406070503020100ull, 0x0004060705030201ull,
     0x0104060705030200ull, 0x0001040607050302ull, 0x0204060705030100ull,
     0x0002040607050301ull, 0x0102040607050300ull, 0x0001020406070503ull,
     0x0304060705020100ull, 0x0003040607050201ull, 0x0103040607050200ull,
     0x0001030406070502ull, 0x0203040607050100ull, 0x0002030406070501ull,
     0x0102030406070500ull, 0x0001020304060705ull, 0x0506070403020100ull,
     0x0005060704030201ull, 0x0105060704030200ull, 0x0001050607040302ull,
     0x0205060704030100ull, 0x0002050607040301ull, 0x0102050607040300ull,
     0x0001020506070403ull, 0x0305060704020100ull, 0x0003050607040201ull,
     0x0103050607040200ull, 0x0001030506070402ull, 0x0203050607040100ull,
     0x0002030506070401ull, 0x0102030506070400ull, 0x0001020305060704ull,
     0x0405060703020100ull, 0x0004050607030201ull, 0x0104050607030200ull,
     0x0001040506070302ull, 0x0204050607030100ull, 0x0002040506070301ull,
     0x0102040506070300ull, 0x0001020405060703ull, 0x0304050607020100ull,
     0x0003040506070201ull, 0x0103040506070200ull, 0x0001030405060702ull,
     0x0203040506070100ull, 0x0002030405060701ull, 0x0102030405060700ull,
     0x0001020304050607ull, 0x0706050403020100ull, 0x0007060504030201ull,
     0x0107060504030200ull, 0x0001070605040302ull, 0x0207060504030100ull,
     0x0002070605040301ull, 0x0102070605040300ull, 0x0001020706050403ull,
     0x0307060504020100ull, 0x0003070605040201ull, 0x0103070605040200ull,
     0x0001030706050402ull, 0x0203070605040100ull, 0x0002030706050401ull,
     0x0102030706050400ull, 0x0001020307060504ull, 0x0407060503020100ull,
     0x0004070605030201ull, 0x0104070605030200ull, 0x0001040706050302ull,
     0x0204070605030100ull, 0x0002040706050301ull, 0x0102040706050300ull,
     0x0001020407060503ull, 0x0304070605020100ull, 0x0003040706050201ull,
     0x0103040706050200ull, 0x0001030407060502ull, 0x0203040706050100ull,
     0x0002030407060501ull, 0x0102030407060500ull, 0x0001020304070605ull,
     0x0507060403020100ull, 0x0005070604030201ull, 0x0105070604030200ull,
     0x0001050706040302ull, 0x0205070604030100ull, 0x0002050706040301ull,
     0x0102050706040300ull, 0x0001020507060403ull, 0x0305070604020100ull,
     0x0003050706040201ull, 0x0103050706040200ull, 0x0001030507060402ull,
     0x0203050706040100ull, 0x0002030507060401ull, 0x0102030507060400ull,
     0x0001020305070604ull, 0x0405070603020100ull, 0x0004050706030201ull,
     0x0104050706030200ull, 0x0001040507060302ull, 0x0204050706030100ull,
     0x0002040507060301ull, 0x0102040507060300ull, 0x0001020405070603ull,
     0x0304050706020100ull, 0x0003040507060201ull, 0x0103040507060200ull,
     0x0001030405070602ull, 0x0203040507060100ull, 0x0002030405070601ull,
     0x0102030405070600ull, 0x0001020304050706ull, 0x0607050403020100ull,
     0x0006070504030201ull, 0x0106070504030200ull, 0x0001060705040302ull,
     0x0206070504030100ull, 0x0002060705040301ull, 0x0102060705040300ull,
     0x0001020607050403ull, 0x0306070504020100ull, 0x0003060705040201ull,
     0x0103060705040200ull, 0x0001030607050402ull, 0x0203060705040100ull,
     0x0002030607050401ull, 0x0102030607050400ull, 0x0001020306070504ull,
     0x0406070503020100ull, 0x0004060705030201ull, 0x0104060705030200ull,
     0x0001040607050302ull, 0x0204060705030100ull, 0x0002040607050301ull,
     0x0102040607050300ull, 0x0001020406070503ull, 0x0304060705020100ull,
     0x0003040607050201ull, 0x0103040607050200ull, 0x0001030406070502ull,
     0x0203040607050100ull, 0x0002030406070501ull, 0x0102030406070500ull,
     0x0001020304060705ull, 0x0506070403020100ull, 0x0005060704030201ull,
     0x0105060704030200ull, 0x0001050607040302ull, 0x0205060704030100ull,
     0x0002050607040301ull, 0x0102050607040300ull, 0x0001020506070403ull,
     0x0305060704020100ull, 0x0003050607040201ull, 0x0103050607040200ull,
     0x0001030506070402ull, 0x0203050607040100ull, 0x0002030506070401ull,
     0x0102030506070400ull, 0x0001020305060704ull, 0x0405060703020100ull,
     0x0004050607030201ull, 0x0104050607030200ull, 0x0001040506070302ull,
     0x0204050607030100ull, 0x0002040506070301ull, 0x0102040506070300ull,
     0x0001020405060703ull, 0x0304050607020100ull, 0x0003040506070201ull,
     0x0103040506070200ull, 0x0001030405060702ull, 0x0203040506070100ull,
     0x0002030405060701ull, 0x0102030405060700ull, 0x0001020304050607ull};

// Print Register
void preg(__m256i reg) {
  uint32_t check_vals[8];
  _mm256_store_si256((__m256i*)check_vals, reg);
  for (int i=0; i<8; i++) cout << check_vals[i] << " ";
  cout << endl;
}

float probe_ver(uint32_t *keys_arr, uint32_t *vals_arr, size_t size,
                 const bucket_t *table, int num_slots)
{
  const uint32_t invalid_key = 0;
  chrono::high_resolution_clock::time_point start, finish;
  const int batch_size = 1<<21;
  tbb::atomic<unsigned long long> full_checksum;
  full_checksum = 0;

  start = chrono::high_resolution_clock::now();

  parallel_for(blocked_range<size_t>(0, size, batch_size), [&](auto range) {
    uint32_t *keys = keys_arr + range.begin();
    uint32_t *vals = vals_arr + range.begin();
    size_t i = 0, o = 0, b = 0, partition_size = range.end() - range.begin();
    assert(sizeof(table[0]) == 8);
    const size_t buckets = num_slots;
    //const __m128i shift = _mm_cvtsi32_si128(32 - log_2(buckets));
    //const __m256i factor = _mm256_set1_epi32(table_factor);
    const __m256i empty = _mm256_set1_epi32(invalid_key);
    //preg(empty);
    const __m256i buckets_minus_1 = _mm256_set1_epi32(buckets - 1);
    const __m256i mask_1 = _mm256_set1_epi32(1);
    __m256i checksum = _mm256_set1_epi32(0);
    //preg(checksum);
#if defined __INTEL_COMPILER &&  __INTEL_COMPILER < 1600
    const long *table_64 = (const long*) table;
#else
    const long long *table_64 = (const long long*) table;
#endif
    const size_t buf_size = 128;
    //int buf_space[buf_size + 8 + 15];
    //int *buf = align((void*) buf_space);
    __m256i key, val, off, inv = _mm256_set1_epi32(-1);
    while (i + 8 <= partition_size) {

      //cout << i << " " << size << endl;
      // load new items (mask out reloads)
      __m256i new_key = _mm256_maskload_epi32((const int*) &keys[i], inv);
      key = _mm256_andnot_si256(inv, key); // Does not(inv) & key
      key = _mm256_or_si256(key, new_key);

      __m256i new_val = _mm256_maskload_epi32((const int*) &vals[i], inv);
      val = _mm256_andnot_si256(inv, val); // Does not(inv) & key
      val = _mm256_or_si256(val, new_val);

      //cout << "Key: "; preg(key);
      //cout << "Val: "; preg(val);

      // hash
      //__m256i h = _mm256_mullo_epi32(key, factor);
      off = _mm256_add_epi32(off, mask_1);
      off = _mm256_andnot_si256(inv, off);
      //h = _mm256_srl_epi32(h, shift);
      __m256i h = _mm256_add_epi32(key, off);
      h = _mm256_and_si256(h, buckets_minus_1);
      // gather
      __m256i tab_lo = _mm256_i32gather_epi64(table_64, _mm256_castsi256_si128(h), 8);
      h = _mm256_permute4x64_epi64(h, _MM_SHUFFLE(1,0,3,2));
      __m256i tab_hi = _mm256_i32gather_epi64(table_64, _mm256_castsi256_si128(h), 8);
      __m256i tab_key = _mm256_packlo_epi32(tab_lo, tab_hi);
      //cout << "Tab Key "; preg(tab_key);
      __m256i tab_val = _mm256_packhi_epi32(tab_lo, tab_hi);
      //cout << "Tab Val "; preg(tab_val);
      // update count & sum
      inv = _mm256_cmpeq_epi32(tab_key, empty);
      __m256i out = _mm256_cmpeq_epi32(tab_key, key);
      inv = _mm256_or_si256(inv, out);
      // load permutation masks
      size_t j = _mm256_movemask_ps(_mm256_castsi256_ps(inv));
      size_t k = _mm256_movemask_ps(_mm256_castsi256_ps(out));
      __m128i perm_inv_comp = _mm_loadl_epi64((__m128i*) &perm[j]);
      __m128i perm_out_comp = _mm_loadl_epi64((__m128i*) &perm[k ^ 255]);
      __m256i perm_inv = _mm256_cvtepi8_epi32(perm_inv_comp);
      __m256i perm_out = _mm256_cvtepi8_epi32(perm_out_comp);
      // permutation for invalid
      inv = _mm256_permutevar8x32_epi32(inv, perm_inv);
      key = _mm256_permutevar8x32_epi32(key, perm_inv);
      off = _mm256_permutevar8x32_epi32(off, perm_inv);

      i += _mm_popcnt_u64(j);
      //cout << "Num Matched " << i << endl;

      // permutation for output
      tab_val = _mm256_and_si256(tab_val, out);
      __m256i c2 = _mm256_and_si256(val, out);
      tab_val = _mm256_add_epi32(tab_val, c2);
      //cout << "Check Add "; preg(tab_val);
      //cout << "Check Before "; preg(checksum);
      checksum = _mm256_add_epi32(checksum, tab_val);
      //cout << "Check After "; preg(checksum);

      val = _mm256_permutevar8x32_epi32(val, perm_inv);

      //tab_val = _mm256_permutevar8x32_epi32(tab_val, perm_out);
      //out = _mm256_permutevar8x32_epi32(out, perm_out);
      //_mm256_maskstore_epi32(&buf[b], out, tab_val);

/*      b += _mm_popcnt_u64(k);*/

      //// flush buffer
      //if (b > buf_size) {
        //size_t b_i = 0;
        //do {
          //__m256i x = _mm256_load_si256((__m256i*) &buf[b_i]);
          //_mm256_stream_si256((__m256i*) &vals[o], x);
          //b_i += 8;
          //o += 8;
        //} while (b_i != buf_size);
        //__m256i x = _mm256_load_si256((__m256i*) &buf[b_i]);
        //_mm256_store_si256((__m256i*) &buf[0], x);
        //b -= buf_size;
      /*}*/

    }

    // clean buffer
    //size_t b_i = 0;
    //while (b_i != b) _mm_stream_si32(&((int*) vals)[o++], buf[b_i++]);
    // extract last keys
    /*
    uint32_t l_keys[8];
    _mm256_storeu_si256((__m256i*) l_keys, key);
    size_t j = _mm256_movemask_ps(_mm256_castsi256_ps(inv));
    j = 8 - _mm_popcnt_u64(j);
    i += j;
    while (i != size) l_keys[j++] = keys[i++];
    // process last keys
    //const uint8_t s = 32 - log_2(buckets);
    for (i = 0 ; i != j ; ++i) {
      uint32_t k = l_keys[i];
      size_t h = ((uint32_t) (k & (num_slots - 1)));
      while (invalid_key != table[h].key) {
        if (k == table[h].key) {
          _mm_stream_si32(&((int*) vals)[o++], table[h].val);
          break;
        }
        h = (h + 1) & (buckets - 1);
      }
    }
    */
    uint32_t check_vals[8];
    _mm256_store_si256((__m256i*)check_vals, checksum);
    unsigned long long local_sum = 0;
    //cout << "Final Vals ";
    for (size_t i=0; i<8; i++) { local_sum += check_vals[i]; }
    //cout << endl;
    //cout << local_sum << endl;
    full_checksum.fetch_and_add(local_sum);
  });

  finish = chrono::high_resolution_clock::now();

  unsigned long long res_checksum = full_checksum.fetch_and_add(0);
  cout << "Checksum SIMD " << res_checksum << endl;

  std::chrono::duration<double> diff = finish - start;
  return diff.count() * 1000;
}

struct TimeKeeper {
  float time_build;
  float time_probe;
  float time_extra;
  float time_total;
};

float parallel_zero(int *arr, int size) {
  chrono::high_resolution_clock::time_point start, finish;
  start = chrono::high_resolution_clock::now();
  const __m256i zero = _mm256_set1_epi32(0);
  int batch_size = 1<<21;

  parallel_for(blocked_range<size_t>(0, size, batch_size), [&](auto range) {
    for (size_t i = range.begin(); i < range.end(); i += 8) {
      _mm256_stream_si256((__m256i*) &arr[i], zero);
    }
  });
  finish = chrono::high_resolution_clock::now();

  std::chrono::duration<double> diff = finish - start;
  return diff.count() * 1000;
}

TimeKeeper joinScalar(uint32_t* dim_key, uint32_t* dim_val, uint32_t* fact_fkey, uint32_t* fact_val, int num_dim, int num_fact) {
  bucket_t* hash_table = NULL;
  int num_slots = num_dim;
  float time_build, time_probe, time_memset, time_memset2;

  hash_table = (bucket_t*) _mm_malloc(sizeof(bucket_t) * num_slots, 256);

  time_memset = parallel_zero((int*)hash_table, num_slots * 2);

  time_build = build(dim_key, dim_val, num_dim, hash_table, num_slots);

  time_probe = probe(fact_fkey, fact_val, num_fact, hash_table, num_slots);

  TimeKeeper t = {time_build, time_probe, time_memset, time_build + time_probe + time_memset};
  return t;
}

TimeKeeper joinPrefetch(uint32_t* dim_key, uint32_t* dim_val, uint32_t* fact_fkey, uint32_t* fact_val, int num_dim, int num_fact) {
  bucket_t* hash_table = NULL;
  int num_slots = num_dim;
  float time_build, time_probe, time_memset, time_memset2;

  hash_table = (bucket_t*) _mm_malloc(sizeof(bucket_t) * num_slots, 256);

  time_memset = parallel_zero((int*)hash_table, num_slots * 2);

  time_build = build(dim_key, dim_val, num_dim, hash_table, num_slots);

  time_probe = probe_prefetch(fact_fkey, fact_val, num_fact, hash_table, num_slots);

  TimeKeeper t = {time_build, time_probe, time_memset, time_build + time_probe + time_memset};
  return t;
}


TimeKeeper joinSIMD(uint32_t* dim_key, uint32_t* dim_val, uint32_t* fact_fkey, uint32_t* fact_val, int num_dim, int num_fact) {
  bucket_t* hash_table = NULL;
  int num_slots = num_dim;
  float time_build, time_probe, time_memset, time_memset2;

  hash_table = (bucket_t*) _mm_malloc(sizeof(bucket_t) * num_slots, 256);

  time_memset = parallel_zero((int*)hash_table, num_slots * 2);

  time_build = build(dim_key, dim_val, num_dim, hash_table, num_slots);

  time_probe = probe_ver(fact_fkey, fact_val, num_fact, hash_table, num_slots);

  TimeKeeper t = {time_build, time_probe, time_memset, time_build + time_probe + time_memset};

  return t;
}

int main(int argc, char** argv)
{
  int num_fact       = 256 * 1<<20;
  int num_dim      = 64 * 1<<20;
  int num_trials     = 3;

  // Initialize command line
  CommandLineArgs args(argc, argv);
  args.GetCmdLineArgument("n", num_fact);
  args.GetCmdLineArgument("d", num_dim);
  args.GetCmdLineArgument("t", num_trials);

  // Print usage
  if (args.CheckCmdLineFlag("help"))
  {
    printf("%s "
      "[--n=<num fact>] "
      "[--d=<num dim>] "
      "[--t=<num trials>] "
      "[--device=<device-id>] "
      "[--v] "
      "\n", argv[0]);
    exit(0);
  }

  int *h_dim_key = NULL;
  int *h_dim_val = NULL;
  int *h_fact_fkey = NULL;
  int *h_fact_val = NULL;

  create_relation_pk(h_dim_key, h_dim_val, num_dim);
  create_relation_fk(h_fact_fkey, h_fact_val, num_fact, num_dim);

  long long sum = 0;
  for (int i=0; i<num_fact; i++) sum += h_fact_val[i];
  //cout << "FSUM " << sum << endl;

  for (int i=0; i<num_dim; i++) sum += h_dim_val[i];
  //cout << "DSUM " << sum << endl;

  cout << "Done Generating" << endl;

  uint32_t *dim_key = (uint32_t*) h_dim_key;
  uint32_t *dim_val = (uint32_t*) h_dim_val;
  uint32_t *fact_fkey = (uint32_t*) h_fact_fkey;
  uint32_t *fact_val = (uint32_t*) h_fact_val;

/*  cout << "Original" << endl;*/
  //for (int i=0; i<8; i++) cout << fact_fkey[i] << " ";
  //cout << endl;
  //for (int i=0; i<8; i++) cout << fact_val[i] << " ";
  /*cout << endl;*/

  for (int t = 0; t < num_trials; t++) {
/*    TimeKeeper time_simd = joinSIMD(dim_key, dim_val, fact_fkey, fact_val, num_dim, num_fact);*/
    /*cout << "Done SIMD" << endl;*/

    TimeKeeper time_scalar = joinScalar(dim_key, dim_val, fact_fkey, fact_val, num_dim, num_fact);
    cout << "Done Scalar" << endl;

    TimeKeeper time_prefetch = joinPrefetch(dim_key, dim_val, fact_fkey, fact_val, num_dim, num_fact);
    cout << "Done Prefetch" << endl;

    cout<< "{"
      << "\"num_dim\":" << num_dim
      << ",\"num_fact\":" << num_fact
      << ",\"time_build_scalar\":" << time_scalar.time_build
      << ",\"time_probe_scalar\":" << time_scalar.time_probe
      << ",\"time_build_prefetch\":" << time_prefetch.time_build
      << ",\"time_probe_prefetch\":" << time_prefetch.time_probe
  /*    << ",\"time_build_simd\":" << time_simd.time_build*/
      /*<< ",\"time_probe_simd\":" << time_simd.time_probe*/
      << "}" << endl;
  }

  return 0;
}