#include "ssb_utils.h"
#include <assert.h>
#include <cuda.h>
#include <curand.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
// Maps key X to a slot index: (X - Z) modulo Y, where Z is the minimum key
// value and Y is the number of slots. Every argument is parenthesised so that
// expression arguments (e.g. `a + 1`) are evaluated as a unit. The result is
// negative when X < Z, because the C++ remainder keeps the dividend's sign.
#define HASH_WM(X,Y,Z) (((X) - (Z)) % (Y))
using namespace std;
// Counts how many times each key occurs in fact_key.
//
// hash_table is laid out as (key, count) pairs: element 2*slot holds the key
// and element 2*slot + 1 holds the occurrence counter. The table is expected
// to be zero-initialised (or pre-populated with keys) before launch, because
// 0 is used as the "empty slot" marker.
//
// Launch: 1D grid with at least num_tuples threads in total; each thread
// handles one element of fact_key.
//
//   fact_key   - device array of num_tuples keys
//   num_tuples - number of elements in fact_key
//   hash_table - device array of 2 * num_slots ints, updated in place
//   num_slots  - number of slots in the table
//   val_min    - smallest key value the table is sized for
__global__
void count_date(int *fact_key, int num_tuples, int *hash_table, int num_slots, int val_min) {
  int offset = blockIdx.x * blockDim.x + threadIdx.x;
  if (offset >= num_tuples) return;

  int key = fact_key[offset];
  int hash = HASH_WM(key, num_slots, val_min);
  // A key below val_min yields a negative remainder; indexing with it would
  // write before the start of the table, so such keys are skipped.
  if (hash < 0) return;

  // Claim the key slot atomically. Many threads can reach the same slot at the
  // same time, so a plain "read, then write" would be a data race.
  atomicCAS(&hash_table[hash << 1], 0, key);
  atomicAdd(&hash_table[(hash << 1) + 1], 1);
}
// Inserts each key of dim_key into the key half of its hash_table slot.
//
// hash_table is laid out as pairs of ints per slot; this kernel only writes
// element 2*slot (the key). A slot is written only while it still holds 0, so
// the table must be zero-initialised before launch and the first key to reach
// a slot wins.
//
// Launch: 1D grid with at least num_tuples threads in total; each thread
// handles one element of dim_key.
//
//   dim_key    - device array of num_tuples keys
//   num_tuples - number of elements in dim_key
//   hash_table - device array of 2 * num_slots ints, updated in place
//   num_slots  - number of slots in the table
//   val_min    - smallest key value the table is sized for
__global__
void build_date(int *dim_key, int num_tuples, int *hash_table, int num_slots, int val_min) {
  int offset = blockIdx.x * blockDim.x + threadIdx.x;
  if (offset >= num_tuples) return;

  int key = dim_key[offset];
  int hash = HASH_WM(key, num_slots, val_min);
  // A key below val_min yields a negative remainder; indexing with it would
  // write before the start of the table, so such keys are skipped.
  if (hash < 0) return;

  // Store the key only if the slot is still empty (holds 0).
  atomicCAS(&hash_table[hash << 1], 0, key);
}
// Overwrites contiguous ranges of fact_key with the value stored for each
// table entry.
//
// hash_table is read as (value, start index) pairs: element 2*e is the value
// and element 2*e + 1 is the first index of entry e's range in fact_key. The
// range of entry e ends where the range of entry e + 1 starts; the last entry
// ends at final_N. Entries whose value is 0 leave fact_key untouched.
//
// Launch: 1D grid with at least num_tuples threads in total; each thread
// handles one table entry.
//
//   fact_key   - device array that is rewritten in place
//   hash_table - device array of 2 * num_tuples ints
//   num_tuples - number of (value, start) entries in hash_table
//   final_N    - exclusive end index used for the last entry
__global__
void modify_date(int *fact_key, int *hash_table, int num_tuples, int final_N) {
  const int entry = blockIdx.x * blockDim.x + threadIdx.x;
  if (entry >= num_tuples) return;

  const bool is_last = (entry == num_tuples - 1);
  const int base = entry << 1;

  const int range_begin = hash_table[base + 1];
  // The next entry's start is only read when a next entry exists.
  const int range_end = is_last ? final_N : hash_table[base + 3];
  const int fill = hash_table[base];

  // Diagnostic line emitted by the thread that owns the last entry.
  if (is_last) printf("%d %d %d %d\n", entry, fill, range_begin, range_end);

  // A value of 0 marks an entry that must not modify fact_key.
  if (fill == 0) return;
  for (int row = range_begin; row < range_end; ++row) {
    fact_key[row] = fill;
  }
}
// Terminates the program with a readable message when a CUDA runtime call
// (or a preceding kernel launch) reports an error.
static void checkCuda(cudaError_t err, const char *what) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    exit(EXIT_FAILURE);
  }
}

// Rewrites the "lo_orderdate" column so that equal keys occupy contiguous
// ranges, ordered by hash-table slot:
//   1. build_date inserts every d_datekey value into a (key, count) table.
//   2. count_date counts the occurrences of each lo_orderdate value.
//   3. The host compacts the occupied slots into (key, start index) pairs,
//      where the start index is the running sum of the preceding counts.
//   4. modify_date fills each range of lo_orderdate with its key.
//   5. The result is written back with storeColumn under the same column name.
// loadColumnPinned, storeColumn, LO_LEN and D_LEN are not defined in this
// file (presumably they come from ssb_utils.h).
int main () {
  int* h_lo_orderdate = loadColumnPinned<int>("lo_orderdate", LO_LEN);
  int* h_d_datekey = loadColumnPinned<int>("d_datekey", D_LEN);

  int *lo_orderdate, *d_datekey;
  checkCuda(cudaMalloc((void**) &lo_orderdate, LO_LEN * sizeof(int)), "cudaMalloc(lo_orderdate)");
  checkCuda(cudaMalloc((void**) &d_datekey, D_LEN * sizeof(int)), "cudaMalloc(d_datekey)");
  checkCuda(cudaMemcpy(lo_orderdate, h_lo_orderdate, LO_LEN * sizeof(int), cudaMemcpyHostToDevice),
            "copy lo_orderdate to device");
  checkCuda(cudaMemcpy(d_datekey, h_d_datekey, D_LEN * sizeof(int), cudaMemcpyHostToDevice),
            "copy d_datekey to device");

  // One slot per integer in [19920101, 19981230]; each slot is a (key, count) pair.
  int d_val_len = 19981230 - 19920101 + 1;
  int d_val_min = 19920101;

  int* ht_d;
  checkCuda(cudaMalloc((void**) &ht_d, 2 * d_val_len * sizeof(int)), "cudaMalloc(ht_d)");
  checkCuda(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)), "cudaMemset(ht_d)");

  build_date<<<(D_LEN + 127)/128, 128>>>(d_datekey, D_LEN, ht_d, d_val_len, d_val_min);
  checkCuda(cudaGetLastError(), "build_date launch");
  count_date<<<(LO_LEN+ 127)/128, 128>>>(lo_orderdate, LO_LEN, ht_d, d_val_len, d_val_min);
  checkCuda(cudaGetLastError(), "count_date launch");
  checkCuda(cudaDeviceSynchronize(), "build_date/count_date execution");
  cout << "Build and count date done" << endl;

  int* h_ht_d;
  checkCuda(cudaHostAlloc((void**) &h_ht_d, 2 * d_val_len * sizeof(int), cudaHostAllocDefault),
            "cudaHostAlloc(h_ht_d)");
  checkCuda(cudaMemcpy(h_ht_d, ht_d, 2 * d_val_len * sizeof(int), cudaMemcpyDeviceToHost),
            "copy hash table to host");

  // Size the compacted table from the data instead of a hard-coded length:
  // too small a buffer would be overrun by the fill loop below, and too large
  // a buffer would hand uninitialised (start, end) ranges to modify_date.
  int num_entries = 0;
  for (int i = 0; i < 2 * d_val_len; i += 2) {
    if (h_ht_d[i] != 0) num_entries++;
  }

  int* modify_ht_d = NULL;
  if (num_entries > 0) {
    checkCuda(cudaHostAlloc((void**) &modify_ht_d, 2 * num_entries * sizeof(int), cudaHostAllocDefault),
              "cudaHostAlloc(modify_ht_d)");
  }

  // Compact occupied slots into (key, start index) pairs; temp is the running
  // sum of the counts seen so far, i.e. where the current key's range begins.
  int temp = 0;
  int k = 0;
  for (int i = 0; i < 2 * d_val_len; i+=2) {
    if (h_ht_d[i] != 0) {
      modify_ht_d[k] = h_ht_d[i];
      modify_ht_d[k+1] = temp;
      temp += h_ht_d[i+1];
      cout << k << " " << modify_ht_d[k] << " " << modify_ht_d[k+1] << " " << LO_LEN << endl;
      cout << h_ht_d[i] << " " << h_ht_d[i+1] << endl;
      k+=2;
    }
  }
  cout << "CPU portion done" << endl;

  // A grid of zero blocks is not a valid launch, so the rewrite step is
  // skipped entirely when no slot is occupied.
  if (num_entries > 0) {
    int* d_modify_ht_d;
    checkCuda(cudaMalloc((void**) &d_modify_ht_d, 2 * num_entries * sizeof(int)),
              "cudaMalloc(d_modify_ht_d)");
    checkCuda(cudaMemcpy(d_modify_ht_d, modify_ht_d, 2 * num_entries * sizeof(int), cudaMemcpyHostToDevice),
              "copy compacted table to device");
    modify_date<<<(num_entries + 127)/128, 128>>>(lo_orderdate, d_modify_ht_d, num_entries, LO_LEN);
    checkCuda(cudaGetLastError(), "modify_date launch");
    // The blocking copy below also waits for modify_date to finish.
    checkCuda(cudaMemcpy(h_lo_orderdate, lo_orderdate, LO_LEN * sizeof(int), cudaMemcpyDeviceToHost),
              "copy lo_orderdate to host");
    checkCuda(cudaFree(d_modify_ht_d), "cudaFree(d_modify_ht_d)");
    checkCuda(cudaFreeHost(modify_ht_d), "cudaFreeHost(modify_ht_d)");
  }
  cout << "Modify date" << endl;

  storeColumn<int>("lo_orderdate", LO_LEN, h_lo_orderdate);

  // Release the buffers allocated in this function. The buffers returned by
  // loadColumnPinned are left alone because their allocator is not visible here.
  checkCuda(cudaFreeHost(h_ht_d), "cudaFreeHost(h_ht_d)");
  checkCuda(cudaFree(ht_d), "cudaFree(ht_d)");
  checkCuda(cudaFree(d_datekey), "cudaFree(d_datekey)");
  checkCuda(cudaFree(lo_orderdate), "cudaFree(lo_orderdate)");
  return 0;
}