// Lancelot / src / gpudb / sort_column_fast.cu
#include "ssb_utils.h"
#include <iostream>
#include <stdio.h>
#include <curand.h>
#include <cuda.h>
#include <assert.h>

// Maps key X to a slot index: (X - Z) modulo Y, where Z is the minimum key
// value and Y is the number of slots. Every argument is parenthesised so that
// expression arguments (e.g. `a + b`) keep their intended precedence.
// Note: like C++ `%`, the result is negative when X < Z.
#define HASH_WM(X,Y,Z) (((X) - (Z)) % (Y))

using namespace std;

/*
 * Counts how many fact rows carry each key.
 *
 * hash_table is laid out as (key, count) int pairs: element 2*h holds the key
 * stored for slot h and element 2*h + 1 holds the number of fact rows that
 * hashed to slot h. One thread handles one fact row; threads whose global
 * index is >= num_tuples do nothing.
 *
 *   fact_key    input keys, num_tuples elements
 *   hash_table  2 * num_slots ints, updated in place
 *   num_slots   number of slots (modulus of the hash)
 *   val_min     value subtracted from the key before taking the modulus
 */
__global__
void count_date(int *fact_key, int num_tuples, int *hash_table, int num_slots, int val_min) {
  int offset = blockIdx.x * blockDim.x + threadIdx.x;
  if (offset < num_tuples) {
    int key = fact_key[offset];
    int hash = HASH_WM(key, num_slots, val_min);

    // A key below val_min yields a negative remainder, which would index
    // before the start of hash_table. Skip such rows instead of corrupting
    // memory.
    if (hash < 0) return;

    int slot = hash << 1;

    // Fill in the key only if the slot is still empty. Many threads can reach
    // this point for the same slot at once, so the claim itself is done with
    // atomicCAS (as build_date does); the plain read in front merely avoids an
    // extra atomic once the slot is already populated.
    if (hash_table[slot] == 0) atomicCAS(&hash_table[slot], 0, key);

    atomicAdd(&hash_table[slot + 1], 1);
  }
}

/*
 * Seeds the key field of the (key, count) table from the dimension keys.
 *
 * Each thread takes one dimension key, computes its slot with HASH_WM and
 * stores the key into element 2*slot of hash_table, but only if that element
 * still holds 0 (atomic compare-and-swap). Threads past num_tuples exit
 * immediately.
 */
__global__
void build_date(int *dim_key, int num_tuples, int *hash_table, int num_slots, int val_min) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= num_tuples) return;

  const int date = dim_key[tid];
  const int slot = HASH_WM(date, num_slots, val_min);

  // Even elements hold keys; 0 marks an empty slot.
  atomicCAS(hash_table + (slot << 1), 0, date);
}

/*
 * Overwrites ranges of fact_key with the value recorded for each table entry.
 *
 * hash_table holds num_tuples (value, start) int pairs. Entry e covers
 * fact_key[start_e .. end_e), where end_e is the start of entry e + 1, or
 * final_N for the last entry. Entries whose value is 0 leave fact_key
 * untouched. One thread processes one entry; the thread handling the last
 * entry also prints its (index, value, start, end) for debugging.
 */
__global__
void modify_date(int *fact_key, int *hash_table, int num_tuples, int final_N) {
  const int entry = blockIdx.x * blockDim.x + threadIdx.x;
  if (entry >= num_tuples) return;

  const bool is_last = (entry == num_tuples - 1);
  const int pair = entry << 1;

  const int begin = hash_table[pair + 1];
  // The next entry's start doubles as this entry's end; the last entry has no
  // successor and runs up to final_N.
  const int end = is_last ? final_N : hash_table[pair + 3];
  const int date = hash_table[pair];

  if (is_last) printf("%d %d %d %d\n", entry, date, begin, end);

  // A zero value means "nothing to write" for this range.
  if (date == 0) return;

  for (int pos = begin; pos < end; ++pos) {
    fact_key[pos] = date;
  }
}


// Prints a diagnostic to stderr when a CUDA runtime call fails; returns true on failure.
static bool cudaCallFailed(cudaError_t status, const char *call) {
	if (status == cudaSuccess) return false;
	cerr << "CUDA error: " << call << " failed: " << cudaGetErrorString(status) << endl;
	return true;
}

// Leaves main() with exit code 1 when the wrapped CUDA runtime call fails.
#define CUDA_TRY(call) do { if (cudaCallFailed((call), #call)) return 1; } while (0)

/*
 * Rewrites the lo_orderdate column so that equal dates are stored contiguously,
 * in ascending slot (date) order:
 *
 *   1. build_date seeds a (key, count) table with every d_datekey value and
 *      count_date counts the lo_orderdate rows per date.
 *   2. The host compacts the non-empty slots into (date, start offset) pairs,
 *      where the start offset is the running sum of the preceding counts.
 *   3. modify_date fills lo_orderdate[start .. next start) with each date.
 *   4. The column is copied back and written out with storeColumn.
 *
 * Returns 0 on success and 1 if any CUDA runtime call or kernel launch fails.
 */
int main () {

	int* h_lo_orderdate = loadColumnPinned<int>("lo_orderdate", LO_LEN);
	int* h_d_datekey = loadColumnPinned<int>("d_datekey", D_LEN);

	int *lo_orderdate, *d_datekey;
	CUDA_TRY(cudaMalloc((void**) &lo_orderdate, LO_LEN * sizeof(int)));
	CUDA_TRY(cudaMalloc((void**) &d_datekey, D_LEN * sizeof(int)));

	CUDA_TRY(cudaMemcpy(lo_orderdate, h_lo_orderdate, LO_LEN * sizeof(int), cudaMemcpyHostToDevice));
	CUDA_TRY(cudaMemcpy(d_datekey, h_d_datekey, D_LEN * sizeof(int), cudaMemcpyHostToDevice));

	// One slot per integer in [19920101, 19981230]; the slot index is key - d_val_min.
	int d_val_len = 19981230 - 19920101 + 1;
	int d_val_min = 19920101;

	// (key, count) pairs, zero-initialised: key 0 marks an empty slot.
	int* ht_d;
	CUDA_TRY(cudaMalloc((void**) &ht_d, 2 * d_val_len * sizeof(int)));
	CUDA_TRY(cudaMemset(ht_d, 0, 2 * d_val_len * sizeof(int)));

	build_date<<<(D_LEN + 127)/128, 128>>>(d_datekey, D_LEN, ht_d, d_val_len, d_val_min);
	CUDA_TRY(cudaGetLastError());
	count_date<<<(LO_LEN+ 127)/128, 128>>>(lo_orderdate, LO_LEN, ht_d, d_val_len, d_val_min);
	CUDA_TRY(cudaGetLastError());

	CUDA_TRY(cudaDeviceSynchronize());

	cout << "Build and count date done" << endl;

	int* h_ht_d;
	CUDA_TRY(cudaHostAlloc((void**) &h_ht_d, 2 * d_val_len * sizeof(int), cudaHostAllocDefault));
	CUDA_TRY(cudaMemcpy(h_ht_d, ht_d, 2 * d_val_len * sizeof(int), cudaMemcpyDeviceToHost));

	// Compacted (date, start offset) table. It is sized for the worst case
	// (every slot occupied) so the loop below can never overrun it; only the
	// entries actually produced are uploaded and processed afterwards.
	int* modify_ht_d;
	CUDA_TRY(cudaHostAlloc((void**) &modify_ht_d, 2 * d_val_len * sizeof(int), cudaHostAllocDefault));

	int temp = 0;	// running sum of counts = start offset of the current date
	int k = 0;	// next free int in modify_ht_d (two ints per entry)
	for (int i = 0; i < 2 * d_val_len; i+=2) {
		if (h_ht_d[i] != 0) {
			modify_ht_d[k] = h_ht_d[i];
			modify_ht_d[k+1] = temp;
			temp += h_ht_d[i+1];
			cout << k << " " << modify_ht_d[k] << " " << modify_ht_d[k+1] << " " << LO_LEN << endl;
			cout << h_ht_d[i] << " " << h_ht_d[i+1] << endl;
			k+=2;
		}
	}

	// Number of entries really present. Using this instead of a fixed,
	// dataset-dependent constant guarantees modify_date never reads
	// uninitialised entries and that its last entry is the true last date.
	int num_dates = k / 2;

	cout << "CPU portion done" << endl;

	if (num_dates > 0) {
		int* d_modify_ht_d;
		CUDA_TRY(cudaMalloc((void**) &d_modify_ht_d, 2 * num_dates * sizeof(int)));
		CUDA_TRY(cudaMemcpy(d_modify_ht_d, modify_ht_d, 2 * num_dates * sizeof(int), cudaMemcpyHostToDevice));

		modify_date<<<(num_dates + 127)/128, 128>>>(lo_orderdate, d_modify_ht_d, num_dates, LO_LEN);
		CUDA_TRY(cudaGetLastError());

		// Blocking copy: also waits for modify_date and surfaces any fault it raised.
		CUDA_TRY(cudaMemcpy(h_lo_orderdate, lo_orderdate, LO_LEN * sizeof(int), cudaMemcpyDeviceToHost));

		CUDA_TRY(cudaFree(d_modify_ht_d));
	}

	cout << "Modify date" << endl;

	storeColumn<int>("lo_orderdate", LO_LEN, h_lo_orderdate);

	// Release everything allocated in this function. The buffers returned by
	// loadColumnPinned are left alone: how they were allocated is not visible
	// here, and the process is about to exit anyway.
	CUDA_TRY(cudaFreeHost(modify_ht_d));
	CUDA_TRY(cudaFreeHost(h_ht_d));
	CUDA_TRY(cudaFree(ht_d));
	CUDA_TRY(cudaFree(d_datekey));
	CUDA_TRY(cudaFree(lo_orderdate));

	return 0;
}