// Lancelot - src/gpudb/CacheManager.cu
#include "CacheManager.h"
#include <bitset>

Segment::Segment(ColumnInfo* _column, int* _seg_ptr, int _priority)
: column(_column), seg_ptr(_seg_ptr), priority(_priority), seg_size(SEGMENT_SIZE) {
	stats = new Statistics();
	col_ptr = column->col_ptr;
	segment_id = (seg_ptr - col_ptr)/seg_size;
	weight = 0;
	repl_weight = 0;
}

Segment::Segment(ColumnInfo* _column, int* _seg_ptr)
: column(_column), seg_ptr(_seg_ptr), priority(0), seg_size(SEGMENT_SIZE) {
	stats = new Statistics();
	col_ptr = column->col_ptr;
	segment_id = (seg_ptr - col_ptr)/seg_size;
	weight = 0;
	repl_weight = 0;
}

ColumnInfo::ColumnInfo(TableInfo* _table, string _column_name, int _column_id, int* _col_ptr)
: table(_table), column_name(_column_name), column_id(_column_id), col_ptr(_col_ptr) {
	stats = new Statistics();
	tot_seg_in_GPU = 0;
	weight = 0;
	seg_ptr = col_ptr;
	table_id = table->table_id;
	table_name = table->table_name;
	LEN = table->LEN;
	total_segment = (LEN+SEGMENT_SIZE-1)/SEGMENT_SIZE;
}

TableInfo::TableInfo(string _table_name, int _LEN, int _table_id)
: table_name(_table_name), LEN(_LEN), table_id(_table_id) {
	total_segment = (LEN+SEGMENT_SIZE-1)/SEGMENT_SIZE;
}

// WILL NOT WORK FOR MULTI GPU
Segment*
ColumnInfo::getSegment(int index) {
	Segment* seg = new Segment(this, col_ptr+SEGMENT_SIZE*index);
	return seg;
}
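
// Example of the addressing above (illustrative values: SEGMENT_SIZE = 1024,
// LEN = 2500): total_segment = (2500 + 1023) / 1024 = 3, and getSegment(2)
// returns a Segment with seg_ptr = col_ptr + 2048, whose constructor then
// recomputes segment_id = (2048 - 0) / 1024 = 2.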

CacheManager::CacheManager(size_t _cache_size, size_t _broadcast_size, size_t _processing_size, size_t _pinned_memsize) {
	tot_cache_size = _cache_size;
	each_cache_size = _cache_size/NUM_GPU;
	each_broadcast_size = _broadcast_size/NUM_GPU;
	cache_total_seg = tot_cache_size/SEGMENT_SIZE;
	cache_each_tot_seg = each_cache_size/SEGMENT_SIZE;
	broadcast_each_tot_seg = each_broadcast_size/SEGMENT_SIZE;
	tot_processing_size = _processing_size;
	each_processing_size = _processing_size/NUM_GPU;
	pinned_memsize = _pinned_memsize;
	TOT_COLUMN = NUM_COLUMN;
	TOT_TABLE = NUM_TABLE;

	seg_idx_min = 0;

	gpuCache = new int*[NUM_GPU];
	gpuProcessing = new uint64_t*[NUM_GPU];
	gpuPointer = new unsigned int[NUM_GPU];
	broadcastPointer = new unsigned int[NUM_GPU];
	gpuBroadcast = new int*[NUM_GPU];

	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		cout << gpu << " " << each_cache_size << endl;
		cudaSetDevice(gpu);
		CubDebugExit(cudaMalloc((void**) &gpuCache[gpu], (each_cache_size + each_broadcast_size) * sizeof(int)));
		// CubDebugExit(cudaMemset(gpuCache[gpu], 0, each_cache_size * sizeof(int)));
		CubDebugExit(cudaMalloc((void**) &gpuProcessing[gpu], each_processing_size * sizeof(uint64_t)));
		gpuPointer[gpu] = 0;
		broadcastPointer[gpu] = 0;
		gpuBroadcast[gpu] = gpuCache[gpu] + each_cache_size;
	}
	cudaSetDevice(0);

	cpuProcessing = (uint64_t*) malloc(tot_processing_size * sizeof(uint64_t));
	CubDebugExit(cudaHostAlloc((void**) &pinnedMemory, pinned_memsize * sizeof(uint64_t), cudaHostAllocDefault));
	cpuPointer = 0;
	pinnedPointer = 0;

	allColumn.resize(TOT_COLUMN);
	allTable.resize(TOT_TABLE);
	index_to_segment.resize(TOT_COLUMN);
	segment_row_to_gpu.resize(TOT_TABLE);

	cache_mapper.resize(NUM_GPU);
	empty_gpu_segment.resize(NUM_GPU);

	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		for(int i = 0; i < cache_each_tot_seg; i++) {
			empty_gpu_segment[gpu].push(i);
		}
	}

	loadColumnToCPU();

	seg_row_to_single_gpu = (int**) malloc (TOT_TABLE * sizeof(int*));
	seg_is_replicated = (int**) malloc (TOT_TABLE * sizeof(int*));
	for (int table = 0; table < TOT_TABLE; table++) {
		int total_segment = allTable[table]->total_segment;
		segment_row_to_gpu[table].resize(total_segment);
		seg_row_to_single_gpu[table] = (int*) malloc (total_segment * sizeof(int));
		seg_is_replicated[table] = (int*) malloc (total_segment * sizeof(int));
		memset(seg_row_to_single_gpu[table], -1, total_segment * sizeof(int));
		memset(seg_is_replicated[table], 0, total_segment * sizeof(int));
	}

	gpu_segment_row_bitmap = (unsigned int***) malloc (NUM_GPU * sizeof(unsigned int**));
	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		gpu_segment_row_bitmap[gpu] = (unsigned int**) malloc (TOT_TABLE * sizeof(unsigned int*));
		for (int table = 0; table < TOT_TABLE; table++) {
			gpu_segment_row_bitmap[gpu][table] = (unsigned int*) malloc (allTable[table]->total_segment * sizeof(unsigned int));
			memset(gpu_segment_row_bitmap[gpu][table], 0, allTable[table]->total_segment * sizeof(unsigned int));
		}
	}

	segment_list = (int***) malloc (NUM_GPU * sizeof(int**));
	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
			segment_list[gpu] = (int**) malloc (TOT_COLUMN * sizeof(int*));
			for (int i = 0; i < TOT_COLUMN; i++) {
				int n = allColumn[i]->total_segment;
				CubDebugExit(cudaHostAlloc((void**) &(segment_list[gpu][i]), n * sizeof(int), cudaHostAllocDefault));
				memset(segment_list[gpu][i], -1, n * sizeof(int));
			}
	}

	segment_min = (int**) malloc (TOT_COLUMN * sizeof(int*));
	segment_max = (int**) malloc (TOT_COLUMN * sizeof(int*));
	segment_bitmap = (unsigned char**) malloc (TOT_COLUMN * sizeof(unsigned char*));

	for (int i = 0; i < TOT_COLUMN; i++) {
		int n = allColumn[i]->total_segment;
		segment_bitmap[i] = (unsigned char*) malloc(n * sizeof(unsigned char));
		segment_min[i] = (int*) malloc(n * sizeof(int));
		segment_max[i] = (int*) malloc(n * sizeof(int));
		memset(segment_bitmap[i], 0, n * sizeof(unsigned char));
	}

	readSegmentMinMax();

	for (int i = 0; i < TOT_COLUMN; i++) {
		index_to_segment[i].resize(allColumn[i]->total_segment);
		for (int j = 0; j < allColumn[i]->total_segment; j++) {
			index_to_segment[i][j] = allColumn[i]->getSegment(j);
		}
	}
	
}
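
// Sketch of the per-GPU device memory laid out by the constructor: each GPU
// holds one allocation, with the broadcast region aliased at its tail.
//
//   gpuCache[gpu] -> [ cache_each_tot_seg * SEGMENT_SIZE ints | broadcast_each_tot_seg * SEGMENT_SIZE ints ]
//                                                             ^
//                                         gpuBroadcast[gpu] --+
//
// With hypothetical sizes _cache_size = 4194304 ints and NUM_GPU = 4, each GPU
// gets each_cache_size = 1048576 ints; at SEGMENT_SIZE = 1024 that is
// cache_each_tot_seg = 1024 cache slots per GPU.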

// void
// CacheManager::resetCache(size_t _cache_size, size_t _processing_size, size_t _pinned_memsize) {

// 	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
// 		cudaSetDevice(gpu);
// 		CubDebugExit(cudaFree(gpuCache[gpu]));
// 		CubDebugExit(cudaFree(gpuProcessing[gpu]));
// 	}
// 	cudaSetDevice(0);
// 	delete[] gpuCache;
// 	delete[] gpuProcessing;

// 	delete[] cpuProcessing;
// 	CubDebugExit(cudaFreeHost(pinnedMemory));

// 	for (int table = 0; table < TOT_TABLE; table++) {
// 		int total_segment = allTable[table]->total_segment;
// 		for (int seg_id = 0; seg_id < total_segment; seg_id++) {
// 			segment_row_to_gpu[table][seg_id].clear();
// 		}
// 		free(seg_row_to_single_gpu[table]);
// 	}
// 	free(seg_row_to_single_gpu);

// 	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
// 		cache_mapper[gpu].clear();
// 	}

// 	for (int i = 0; i < TOT_COLUMN; i++) {
// 		free(segment_bitmap[i]);
// 	}
// 	free(segment_bitmap);

// 	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
// 		for (int i = 0; i < TOT_COLUMN; i++) {
// 			CubDebugExit(cudaFreeHost(segment_list[gpu][i]));
// 		}
// 		free(segment_list[gpu]);
// 	}
// 	free(segment_list);
	
// 	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
// 		for (int i = 0; i < TOT_TABLE; i++) {
// 			free(gpu_segment_row_bitmap[gpu][i]);
// 		}
// 		free(gpu_segment_row_bitmap[gpu]);
// 	}
// 	free(gpu_segment_row_bitmap);

// 	tot_cache_size = _cache_size;
// 	each_cache_size = _cache_size/NUM_GPU;
// 	cache_total_seg = tot_cache_size/SEGMENT_SIZE;
// 	cache_each_tot_seg = each_cache_size/SEGMENT_SIZE;
// 	tot_processing_size = _processing_size;
// 	each_processing_size = _processing_size/NUM_GPU;
// 	pinned_memsize = _pinned_memsize;

// 	cout << each_cache_size  << endl;

// 	gpuCache = new int*[NUM_GPU];
// 	gpuProcessing = new uint64_t*[NUM_GPU];
// 	gpuPointer = new unsigned int[NUM_GPU];

// 	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
// 		cudaSetDevice(gpu);
// 		CubDebugExit(cudaMalloc((void**) &gpuCache[gpu], each_cache_size * sizeof(int)));
// 		CubDebugExit(cudaMemset(gpuCache[gpu], 0, each_cache_size * sizeof(int)));
// 		CubDebugExit(cudaMalloc((void**) &gpuProcessing[gpu], each_processing_size * sizeof(uint64_t)));
// 		gpuPointer[gpu] = 0;
// 	}
// 	cudaSetDevice(0);

// 	cpuProcessing = (uint64_t*) malloc(tot_processing_size * sizeof(uint64_t));
// 	CubDebugExit(cudaHostAlloc((void**) &pinnedMemory, pinned_memsize * sizeof(uint64_t), cudaHostAllocDefault));
// 	cpuPointer = 0;
// 	pinnedPointer = 0;

// 	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
// 		while (!empty_gpu_segment[gpu].empty()) {
// 			empty_gpu_segment[gpu].pop();
// 		}		
// 		for(int i = 0; i < cache_each_tot_seg; i++) {
// 			empty_gpu_segment[gpu].push(i);
// 		}
// 	}

// 	seg_row_to_single_gpu = (int**) malloc (TOT_TABLE * sizeof(int*));
// 	for (int table = 0; table < TOT_TABLE; table++) {
// 		int total_segment = allTable[table]->total_segment;
// 		seg_row_to_single_gpu[table] = (int*) malloc (total_segment * sizeof(int));
// 		memset(seg_row_to_single_gpu[table], -1, total_segment * sizeof(int));
// 	}

// 	gpu_segment_row_bitmap = (unsigned int***) malloc (NUM_GPU * sizeof(unsigned int**));
// 	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
// 		gpu_segment_row_bitmap[gpu] = (unsigned int**) malloc (TOT_TABLE * sizeof(unsigned int*));
// 		for (int table = 0; table < TOT_TABLE; table++) {
// 			gpu_segment_row_bitmap[gpu][table] = (unsigned int*) malloc (allTable[table]->total_segment * sizeof(unsigned int));
// 			memset(gpu_segment_row_bitmap[gpu][table], 0, allTable[table]->total_segment * sizeof(unsigned int));
// 		}
// 	}

// 	segment_list = (int***) malloc (NUM_GPU * sizeof(int**));
// 	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
// 			segment_list[gpu] = (int**) malloc (TOT_COLUMN * sizeof(int*));
// 			for (int i = 0; i < TOT_COLUMN; i++) {
// 				int n = allColumn[i]->total_segment;
// 				CubDebugExit(cudaHostAlloc((void**) &(segment_list[gpu][i]), n * sizeof(int), cudaHostAllocDefault));
// 				memset(segment_list[gpu][i], -1, n * sizeof(int));
// 			}
// 	}

// 	segment_bitmap = (char**) malloc (TOT_COLUMN * sizeof(char*));
// 	for (int i = 0; i < TOT_COLUMN; i++) {
// 		int n = allColumn[i]->total_segment;
// 		segment_bitmap[i] = (char*) malloc(n * sizeof(char));
// 		memset(segment_bitmap[i], 0, n * sizeof(char));
// 	}
// }

void 
CacheManager::readSegmentMinMax() {

	for (int i = 0; i < TOT_COLUMN; i++) {
		string line;
		ifstream myfile (DATA_DIR + allColumn[i]->column_name + "minmax");
		if (myfile.is_open()) {
			int segment_idx = 0;
			string del = " ";
			while ( getline (myfile,line) )
			{
				size_t start = 0;
				size_t end = line.find(del);
				//each line must contain "min max"; a missing delimiter means the file is malformed
				assert(end != string::npos);
				string minstring = line.substr(start, end - start);
				segment_min[i][segment_idx] = stoi(minstring);
				start = end + del.size();
				string maxstring = line.substr(start); //everything after the delimiter
				segment_max[i][segment_idx] = stoi(maxstring);
				segment_idx++;
			}
			assert(segment_idx == allColumn[i]->total_segment);
			myfile.close();
		} else {
			cout << "Unable to open file" << endl; 
			assert(0);
		}

	}
}
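
// Expected format of the minmax files read above: one line per segment, with
// the segment minimum and maximum separated by a single space. Hypothetical
// example for lo_orderdate:
//   19920101 19920419
//   19920420 19920806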

template <typename T>
T*
CacheManager::customMalloc(int size) {
	int alloc = ((size * sizeof(T)) + sizeof(uint64_t) - 1)/ sizeof(uint64_t);
	int start = __atomic_fetch_add(&cpuPointer, alloc, __ATOMIC_RELAXED);
	assert((start + alloc) <= tot_processing_size);
	return reinterpret_cast<T*>(cpuProcessing + start);
};

template <typename T>
T*
CacheManager::customCudaMalloc(int size, int gpu) {
	int alloc = ((size * sizeof(T)) + sizeof(uint64_t) - 1)/ sizeof(uint64_t);
	int start = __atomic_fetch_add(&gpuPointer[gpu], alloc, __ATOMIC_RELAXED);
	// cout << gpu << " " << gpuPointer[gpu] << " " << each_processing_size << endl;
	assert((start + alloc) <= each_processing_size);
	return reinterpret_cast<T*>(gpuProcessing[gpu] + start);
};

template <typename T>
T*
CacheManager::customCudaHostAlloc(int size) {
	int alloc = ((size * sizeof(T)) + sizeof(uint64_t) - 1)/ sizeof(uint64_t);
	int start = __atomic_fetch_add(&pinnedPointer, alloc, __ATOMIC_RELAXED);
	assert((start + alloc) <= pinned_memsize); //bound by the pinned buffer, not the processing buffer
	return reinterpret_cast<T*>(pinnedMemory + start);
};
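
// Usage sketch for the three bump allocators above (illustrative only; cm is
// a hypothetical CacheManager instance). Allocations are monotonic and are
// reclaimed all at once by resetPointer(), typically between queries:
//
//   int* h_buf = cm->customMalloc<int>(4096);        //host scratch
//   int* d_buf = cm->customCudaMalloc<int>(4096, 0); //device scratch on GPU 0
//   int* p_buf = cm->customCudaHostAlloc<int>(4096); //pinned host staging
//   ... execute one query ...
//   cm->resetPointer();                              //reclaims everything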

//transfer the segment index to every GPU, but only if that GPU does not have the index yet
//the stream synchronize serializes the memcpys across GPUs; this has to be fixed in the future
void
CacheManager::indexTransfer(int** col_idx, ColumnInfo* column, cudaStream_t stream, int gpu, bool custom) {
    if (col_idx[column->column_id] == NULL) {
      int* desired;
      // if (custom) desired = (int*) customCudaMalloc<int>(column->total_segment, gpu); 
      // else CubDebugExit(cudaMalloc((void**) &desired, column->total_segment * sizeof(int)));

      desired = (int*) customCudaMalloc<int>(column->total_segment, gpu); //the custom flag is currently unused: the cudaMalloc path above is commented out
      int* expected = NULL;
      CubDebugExit(cudaSetDevice(gpu));
      CubDebugExit(cudaMemcpyAsync(desired, segment_list[gpu][column->column_id], column->total_segment * sizeof(int), cudaMemcpyHostToDevice, stream));
      CubDebugExit(cudaStreamSynchronize(stream));
      __atomic_compare_exchange_n(&(col_idx[column->column_id]), &expected, desired, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
      CubDebugExit(cudaSetDevice(0));
    }
};
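
// Sketch of how a kernel consumes the transferred index (illustrative; the
// device function below is hypothetical, not part of this file). After
// indexTransfer, col_idx[column_id] is a device copy of
// segment_list[gpu][column_id], mapping a global segment id to the local slot
// of that segment in gpuCache[gpu]:
//
//   __device__ int loadValue(int* gpu_cache, int* col_seg_idx, int seg_id, int offset) {
//     int local_slot = col_seg_idx[seg_id]; //-1 means not cached on this GPU
//     return gpu_cache[local_slot * SEGMENT_SIZE + offset];
//   }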

void
CacheManager::resetPointer() {
	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		gpuPointer[gpu] = 0;
		broadcastPointer[gpu] = 0;
	}
	cpuPointer = 0;
	pinnedPointer = 0;
};

// modified for multi gpu
void 
CacheManager::cacheSegmentInGPU(Segment* seg, int gpu) {

	int64_t local_idx = empty_gpu_segment[gpu].front(); //local segment slot index within this GPU's cache
	empty_gpu_segment[gpu].pop();

	assert(cache_mapper[gpu].find(seg) == cache_mapper[gpu].end());
	cache_mapper[gpu][seg] = local_idx;

	unsigned char old_segment_bitmap = segment_bitmap[seg->column->column_id][seg->segment_id];

	//segment_bitmap is MSB-first: bit (7 - gpu) records residency on that GPU
	unsigned char check = (segment_bitmap[seg->column->column_id][seg->segment_id] << gpu);
	check = check >> 7;
	// cout << gpu << " " << std::bitset<8>(segment_bitmap[seg->column->column_id][seg->segment_id]) << " " << std::bitset<8>(check) << endl;
	assert(check == 0x00); //the segment must not already be cached on this GPU
	unsigned char code = 0x80 >> gpu;
	segment_bitmap[seg->column->column_id][seg->segment_id] = segment_bitmap[seg->column->column_id][seg->segment_id] | code;
	// cout << std::bitset<8>(segment_bitmap[seg->column->column_id][seg->segment_id]) << endl;

	//segment row management
	if (gpu_segment_row_bitmap[gpu][seg->column->table_id][seg->segment_id] == 0x00000000) { //if this segment row has not been cached in this gpu
		vector<int> vec = segment_row_to_gpu[seg->column->table_id][seg->segment_id];
		vector<int>::iterator it = find(vec.begin(), vec.end(), gpu);
		assert(it == vec.end());
		segment_row_to_gpu[seg->column->table_id][seg->segment_id].push_back(gpu);
		seg_row_to_single_gpu[seg->column->table_id][seg->segment_id] = gpu;
		if (segment_row_to_gpu[seg->column->table_id][seg->segment_id].size() == NUM_GPU) {
			// cout << "im here " << seg->column->table_id << endl;
			seg_is_replicated[seg->column->table_id][seg->segment_id] = 1;
			seg_row_to_single_gpu[seg->column->table_id][seg->segment_id] = seg->segment_id % NUM_GPU;
		}
	}

	//gpu_segment_row_bitmap is likewise MSB-first: bit (31 - column_table_id) records which columns of this segment row are cached on this GPU
	unsigned int check_int = (gpu_segment_row_bitmap[gpu][seg->column->table_id][seg->segment_id] << seg->column->column_table_id);
	check_int = check_int >> 31;
	assert(check_int == 0);
	unsigned int code_int = 0x80000000 >> seg->column->column_table_id;
	gpu_segment_row_bitmap[gpu][seg->column->table_id][seg->segment_id] = gpu_segment_row_bitmap[gpu][seg->column->table_id][seg->segment_id] | code_int;

	assert(segment_list[gpu][seg->column->column_id][seg->segment_id] == -1);
	assert(local_idx >= 0);
	segment_list[gpu][seg->column->column_id][seg->segment_id] = local_idx;
	CubDebugExit(cudaSetDevice(gpu));
	CubDebugExit(cudaMemcpy(&gpuCache[gpu][local_idx * SEGMENT_SIZE], seg->seg_ptr, SEGMENT_SIZE * sizeof(int), cudaMemcpyHostToDevice));
	CubDebugExit(cudaSetDevice(0));
	//THIS CHECKING BELOW NO LONGER APPLIES FOR MULTI GPU
	// allColumn[seg->column->column_id]->tot_seg_in_GPU++;
	// assert(allColumn[seg->column->column_id]->tot_seg_in_GPU <= allColumn[seg->column->column_id]->total_segment);

	if (old_segment_bitmap == 0x00) {
		assert(segment_bitmap[seg->column->column_id][seg->segment_id] != 0x00);
		allColumn[seg->column->column_id]->tot_seg_in_GPU++;
		assert(allColumn[seg->column->column_id]->tot_seg_in_GPU <= allColumn[seg->column->column_id]->total_segment);
	}
}
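
// Worked example of the two bitmaps above (assuming NUM_GPU = 4): caching a
// segment on GPU 1 ORs 0x80 >> 1 = 0x40 into its segment_bitmap byte, so a
// byte of 0xC0 means "resident on GPUs 0 and 1". Likewise, caching the column
// with column_table_id = 2 of some segment row ORs 0x80000000 >> 2 into that
// GPU's gpu_segment_row_bitmap word; a word of zero therefore means no column
// of that row is cached on that GPU.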

void
CacheManager::cacheSegmentMultipleGPU(Segment* seg, vector<int> &gpu_list) {
	for (int i = 0; i < gpu_list.size(); i++) {
		int gpu = gpu_list[i];
		cacheSegmentInGPU(seg, gpu);
	}
}

void 
CacheManager::deleteSegmentInGPU(Segment* seg, int gpu) {
	assert(cache_mapper[gpu].find(seg) != cache_mapper[gpu].end());
	int local_idx = cache_mapper[gpu][seg];
	int ret = cache_mapper[gpu].erase(seg);
	assert(ret == 1);

	unsigned char old_segment_bitmap = segment_bitmap[seg->column->column_id][seg->segment_id];

	// cout << "1 " << std::bitset<8>(old_segment_bitmap) << endl;

	unsigned char check = (segment_bitmap[seg->column->column_id][seg->segment_id] << gpu);
	check = check >> 7;
	assert(check == 0x01);
	unsigned char code = ~(0x80 >> gpu); //all 1s except the gpu bit position, which is 0
	segment_bitmap[seg->column->column_id][seg->segment_id] = segment_bitmap[seg->column->column_id][seg->segment_id] & code;

	// cout << "2 " << std::bitset<32>(gpu_segment_row_bitmap[gpu][seg->column->table_id][seg->segment_id]) << endl;

	//segment row management
	unsigned int check_int = (gpu_segment_row_bitmap[gpu][seg->column->table_id][seg->segment_id] << seg->column->column_table_id);
	check_int = check_int >> 31;
	assert(check_int == 1);
	unsigned int code_int = ~(0x80000000 >> seg->column->column_table_id);
	gpu_segment_row_bitmap[gpu][seg->column->table_id][seg->segment_id] = gpu_segment_row_bitmap[gpu][seg->column->table_id][seg->segment_id] & code_int;

	// cout << "3 " << std::bitset<32>(gpu_segment_row_bitmap[gpu][seg->column->table_id][seg->segment_id]) << endl;

	if (gpu_segment_row_bitmap[gpu][seg->column->table_id][seg->segment_id] == 0x00000000) { //the deleted segment was the last segment of this segment row present on this gpu
		// cout << "delete segment row " << segment_row_to_gpu[seg->column->table_id][seg->segment_id].size() << endl;
		int loc = 0;
		for (int i = 0; i < segment_row_to_gpu[seg->column->table_id][seg->segment_id].size(); i++) {
			if (segment_row_to_gpu[seg->column->table_id][seg->segment_id][i] == gpu) loc = i;
		}
		assert(segment_row_to_gpu[seg->column->table_id][seg->segment_id][loc] == gpu);
		segment_row_to_gpu[seg->column->table_id][seg->segment_id].erase(segment_row_to_gpu[seg->column->table_id][seg->segment_id].begin() + loc);
		// cout << "delete segment row " << segment_row_to_gpu[seg->column->table_id][seg->segment_id].size() << endl;
		if (segment_row_to_gpu[seg->column->table_id][seg->segment_id].size() < NUM_GPU) {
			seg_is_replicated[seg->column->table_id][seg->segment_id] = 0;
			if (segment_row_to_gpu[seg->column->table_id][seg->segment_id].size() == 0) {
				seg_row_to_single_gpu[seg->column->table_id][seg->segment_id] = -1;
			} else {
				int tot_gpu = segment_row_to_gpu[seg->column->table_id][seg->segment_id].size();
				int idx = seg->segment_id % tot_gpu;
				seg_row_to_single_gpu[seg->column->table_id][seg->segment_id] = segment_row_to_gpu[seg->column->table_id][seg->segment_id][idx];
			}
		}
	}

	assert(segment_list[gpu][seg->column->column_id][seg->segment_id] != -1);
	segment_list[gpu][seg->column->column_id][seg->segment_id] = -1;
	empty_gpu_segment[gpu].push(local_idx);

	//THIS CHECKING BELOW NO LONGER APPLIES FOR MULTI GPU
	if (segment_bitmap[seg->column->column_id][seg->segment_id] == 0x00) {
		assert(old_segment_bitmap != 0x00);
		seg->column->tot_seg_in_GPU--;
		assert(seg->column->tot_seg_in_GPU >= 0);
	}
}

void
CacheManager::updateColumnFrequency(ColumnInfo* column) {
	column->stats->col_freq+=(1.0 / column->total_segment);
	// cout << column->column_name << " " << column->stats->col_freq << " " << (1 / column->total_segment) << endl;
}

void
CacheManager::updateColumnWeightDirect(ColumnInfo* column, double speedup) {
	if (column->table_id == 0) {
		column->stats->speedup += speedup/column->total_segment;
		column->weight += speedup/column->total_segment;		
	} else {
		column->stats->speedup += speedup*3/column->total_segment;
		column->weight += speedup*3/column->total_segment;			
	}

}

void
CacheManager::updateSegmentWeightDirect(ColumnInfo* column, Segment* segment, double speedup) {
	// cout << segment->segment_id << endl;
	if (speedup > 0) {
		// cout << column->column_name << endl;
		if (column->table_id == 0) {
			segment->stats->speedup += speedup/column->total_segment;
			segment->weight += speedup/column->total_segment;
		} else {
			segment->stats->speedup += speedup*3/column->total_segment;
			segment->weight += speedup*3/column->total_segment;
		}
	}
	// cout << column->column_name << " " << segment->weight << endl;
}

void
CacheManager::updateSegmentWeightCostDirect(ColumnInfo* column, Segment* segment, double speedup) {
	// cout << segment->segment_id << endl;
	if (speedup > 0) {
		// cout << column->column_name << endl;
		if (column->table_id == 0) {
			// if (column->column_name.compare("lo_quantity") == 0 || column->column_name.compare("lo_discount") == 0) {
				// cout << "hello" << endl;
				// speedup = speedup * 0.2;
			// }
			segment->stats->speedup += (speedup/column->total_segment);
			segment->weight += (speedup/column->total_segment);
		} else {
			segment->stats->speedup += (speedup/column->total_segment);
			segment->weight += (speedup/column->total_segment);
		}
	}
	// cout << column->column_name << " " << segment->weight << endl;
}

void
CacheManager::updateSegmentFreqDirect(ColumnInfo* column, Segment* segment) {
	segment->stats->col_freq += (1.0 / column->total_segment);
}

void
CacheManager::updateSegmentTimeDirect(ColumnInfo* column, Segment* segment, double timestamp) {
	segment->stats->backward_t = timestamp - (segment->stats->timestamp * column->total_segment);
	// cout << timestamp << " " << (segment->stats->timestamp * column->total_segment) << endl;
	segment->stats->timestamp = (timestamp/ column->total_segment);
}

void
CacheManager::updateColumnTimestamp(ColumnInfo* column, double timestamp) {
	// cout << column->column_name << " " << timestamp << endl;
	column->stats->backward_t = timestamp - (column->stats->timestamp * column->total_segment);
	// cout << column->column_name << " " << timestamp << " " << (column->stats->timestamp * column->total_segment) << endl;
	column->stats->timestamp = (timestamp/ column->total_segment);
	// cout << column->column_name << " " << column->stats->timestamp << endl;
}
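
// Worked example of the weighting above (hypothetical numbers): a dimension
// column (table_id != 0) with total_segment = 20 receiving speedup = 100
// gains 100 * 3 / 20 = 15 weight per update, while a fact-table column
// (table_id == 0) of the same size gains 100 / 20 = 5; the *3 multiplier in
// updateColumnWeightDirect and updateSegmentWeightDirect biases placement
// toward dimension-table columns.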


void
CacheManager::deleteAll() {
	for (int i = 0; i < TOT_COLUMN; i++) {
		ColumnInfo* column = allColumn[i];
		for (int j = 0; j < column->total_segment; j++) {
			if (segment_bitmap[column->column_id][j] != 0x00) {
				Segment* seg = index_to_segment[column->column_id][j];
				for (int gpu = 0; gpu < NUM_GPU; gpu++) {
					if (cache_mapper[gpu].find(seg) != cache_mapper[gpu].end()) {
						deleteSegmentInGPU(seg, gpu);
					}
				}
			}
		}
	}
}

void
CacheManager::dumpStats(string filename, Distribution dist) {
	int data_size = 0;
	int cached_data = 0;
	int replicated_data = 0;
	int replicated_seg_per_column[NUM_COLUMN] = {0};

    FILE *fptr = fopen(filename.c_str(), "w");
    if (fptr == NULL)
    {
        printf("Could not open file\n");
        assert(0);
    }
   
    for (int col = 0; col < NUM_COLUMN; col++) {
      data_size += allColumn[col]->total_segment;
      cached_data += allColumn[col]->tot_seg_in_GPU;
	  for (int seg_id = 0; seg_id < allColumn[col]->total_segment; seg_id++) {
		bool replicated = true;
		Segment* seg = index_to_segment[col][seg_id];
		for (int gpu = 0; gpu < NUM_GPU; gpu++) {
			if (cache_mapper[gpu].find(seg) == cache_mapper[gpu].end()) {
				replicated = false;
			}
		}
		if (replicated) {
			replicated_data++;
			replicated_seg_per_column[col]++;
		}
	  }

    }

    fprintf(fptr, "===========================\n");
    fprintf(fptr, "=======  CACHE INFO  ======\n");
    fprintf(fptr, "===========================\n");

    fprintf(fptr, "\n");
    fprintf(fptr, "Segment size: %d\n", SEGMENT_SIZE);
    fprintf(fptr, "Data size: %d segments\n", data_size);
    fprintf(fptr, "Cached data: %d segments\n", cached_data);
	fprintf(fptr, "Replicated data: %d segments\n", replicated_data);
    fprintf(fptr, "\n");

    for (int i = 1; i < NUM_COLUMN; i++) {
        fprintf(fptr,"%s: %d/%d = %.2f segments cached, %d/%d = %.2f segments replicated\n", allColumn[i]->column_name.c_str(), allColumn[i]->tot_seg_in_GPU, allColumn[i]->total_segment, 
			allColumn[i]->tot_seg_in_GPU * 1.0/allColumn[i]->total_segment, replicated_seg_per_column[i], allColumn[i]->total_segment, replicated_seg_per_column[i] * 1.0/allColumn[i]->total_segment);
    }

    fprintf(fptr, "\n");
	fprintf(fptr, "\n");
    fprintf(fptr, "============================\n");
    fprintf(fptr, "=======  COLUMN INFO  ======\n");
    fprintf(fptr, "============================\n");
    fprintf(fptr, "\n");
	fprintf(fptr, "\n");

	int threshold = 19950101;
	seg_idx_min = 0;
	for (int col = 1; col < NUM_COLUMN; col++) {
		fprintf(fptr, "Inspecting column %s\n", allColumn[col]->column_name.c_str());
		if (allColumn[col]->tot_seg_in_GPU == 0) {
			fprintf(fptr, "This column is not cached");
		} else {
			for (int seg_id = 0; seg_id < allColumn[col]->total_segment; seg_id++) {
				Segment* seg = index_to_segment[col][seg_id];
				if (segment_bitmap[col][seg_id] != 0x00) {
					fprintf(fptr, "Segment %d is cached in GPUs ", seg_id);
					for (int gpu = 0; gpu < NUM_GPU; gpu++) {
						if (cache_mapper[gpu].find(seg) != cache_mapper[gpu].end()) {
							fprintf(fptr, "%d ", gpu);
						}
					}
					fprintf(fptr, "\n");
				}
			}
		}
		fprintf(fptr, "\n");
		fprintf(fptr, "\n");
	}

	//WARNING: THIS IS A HACK TO ENSURE JOINCPUCHECK = FALSE WHEN COLUMN IS PARTIALLY CACHED IN ZIPF DISTRIBUTION
	if (dist == Zipf) {
		for (int seg_id = 0; seg_id < lo_orderdate->total_segment; seg_id++) {
			if (segment_min[lo_orderdate->column_id][seg_id] <= threshold && segment_max[lo_orderdate->column_id][seg_id] >= threshold) {
				cout << segment_min[lo_orderdate->column_id][seg_id] << " " << segment_max[lo_orderdate->column_id][seg_id] << endl;
				seg_idx_min = seg_id;
			}
		}
	}
	fclose(fptr);
}

void
CacheManager::assignWeight(ColumnInfo* column, int start_seg, int end_seg, double weight, double repl_weight) {
	for (int seg_id = 0; seg_id < column->total_segment; seg_id++) {
		Segment* seg = index_to_segment[column->column_id][seg_id];
		if (seg_id >= start_seg && seg_id < end_seg) {
			seg->weight = weight;
			seg->repl_weight = repl_weight;
		} else {
			seg->weight = 0;
			seg->repl_weight = 0;
		}
	}
}
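
// Usage sketch (illustrative): give the upper half of lo_orderdate's segments
// a caching weight of 550 with no replication weight, zeroing out the rest:
//
//   int half = lo_orderdate->total_segment / 2;
//   assignWeight(lo_orderdate, half, lo_orderdate->total_segment, 550, 0);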

bool
CacheManager::couldReplicateCheck(Segment* seg, double weight, int cache_each_tot_seg, int* temp_buffer_size) {
	
	if (weight == 0) return false;

	bool could_replicate = true;
	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		if (temp_buffer_size[gpu] + 1 >= cache_each_tot_seg) {
			could_replicate = false; break;
		}
	}
	return could_replicate;
}

bool
CacheManager::couldReplicateCachedSegmentRow(Segment* seg, double weight, int cache_each_tot_seg, int* temp_buffer_size, map<Segment*, int> map_segment) {
	int count[NUM_GPU] = {0};

	if (weight == 0) return false;

	bool could_replicate = true;
	if (map_segment[seg] != -1) {
		assert(map_segment[seg] != 99);
		int cur_gpu = map_segment[seg];
		for (int gpu = 0; gpu < NUM_GPU; gpu++) {
			if (gpu != cur_gpu) {
				if (temp_buffer_size[gpu] + count[gpu] + 1 >= cache_each_tot_seg) {
					could_replicate = false; break;
				} else count[gpu]++;
			}
		}
	}

	if (!could_replicate) return could_replicate;

	for (int i = 0; i < columns_in_table[seg->column->table_id].size(); i++) {
		int column_id = columns_in_table[seg->column->table_id][i];
		if (column_id != seg->column->column_id) {
			Segment* next_seg = index_to_segment[column_id][seg->segment_id];
			if (map_segment[next_seg] != -1 && map_segment[next_seg] != 99) {
				int cur_gpu = map_segment[next_seg];
				for (int gpu = 0; gpu < NUM_GPU; gpu++) {
					if (gpu != cur_gpu) {					
						if (temp_buffer_size[gpu] + count[gpu] + 1 >= cache_each_tot_seg) {
							could_replicate = false; break;
						} else count[gpu]++;
					}
				}
			}
		}
	}
	return could_replicate;
}

unsigned long long
CacheManager::PartitionDataPlacement() {
	multimap<double, Segment*> access_weight_map;
	unsigned long long traffic = 0;

	for (int i = TOT_COLUMN-1; i >= 0; i--) {
		for (int j = 0; j < allColumn[i]->total_segment; j++) {
			Segment* segment = index_to_segment[i][j];
			if (segment->weight > 0) access_weight_map.insert({segment->weight, segment});
		}
	}

	int* temp_buffer_size = new int[NUM_GPU]();
	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		temp_buffer_size[gpu] = 0;
	}
	vector<set<Segment*>> segments_to_place;
	multimap<double, Segment*>::reverse_iterator cit;

	segments_to_place.resize(NUM_GPU);
	cit = access_weight_map.rbegin();

	while (cit != access_weight_map.rend()) {		
			//partition-only placement: cache each weighted segment on exactly
			//one GPU, chosen round-robin by segment id (no replication here)
			Segment* seg = cit->second;
			//THIS ONLY WORKS IF THE TOTAL DIM SEGMENT IS DIVISIBLE BY NUM GPU
			int gpu = seg->segment_id % NUM_GPU;
			if(temp_buffer_size[gpu] + 1 < cache_each_tot_seg && cit->first > 0) {
				temp_buffer_size[gpu]+=1;
				segments_to_place[gpu].insert(seg);
				// cout << "Placing ";
				// cout << seg->column->column_name << " segment " << seg->segment_id;
				// cout << " in " << gpu << endl;
			}
			++cit;
	}

	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
    	assert(temp_buffer_size[gpu] <= cache_each_tot_seg);
		cout << "Cached segment in GPU " << gpu << " : " << temp_buffer_size[gpu] << " Cache total: " << cache_each_tot_seg << endl;
	}

    for (int i = 0; i < TOT_COLUMN; i++) {
    	for (int j = 0; j < allColumn[i]->total_segment; j++) {
    		Segment* segment = index_to_segment[i][j];
			for (int gpu = 0; gpu < NUM_GPU; gpu++) {
				if (segments_to_place[gpu].find(segment) == segments_to_place[gpu].end()) {
					unsigned char check = (segment_bitmap[i][j] << gpu);
					check = check >> 7;
					if (check == 0x01) {
						assert(cache_mapper[gpu].find(segment) != cache_mapper[gpu].end());
						// cout << "Deleting segment ";
						// cout << segment->column->column_name << " segment " << segment->segment_id << endl;
						deleteSegmentInGPU(segment, gpu);
					}
				}
			}
		}
    }

	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		set<Segment*>::const_iterator cit2;
		for(cit2 = segments_to_place[gpu].cbegin();cit2 != segments_to_place[gpu].cend(); ++cit2){
			unsigned char check = (segment_bitmap[(*cit2)->column->column_id][(*cit2)->segment_id] << gpu);
			check = check >> 7;
			if (check == 0x00) {
					assert(cache_mapper[gpu].find(*cit2) == cache_mapper[gpu].end());
					// cout << "Caching segment ";
					// cout << (*cit2)->column->column_name << " " << (*cit2)->segment_id << endl;
					cacheSegmentInGPU(*cit2, gpu);
					traffic += SEGMENT_SIZE * sizeof(int);
			}
		}
	}
    cout << "Successfully cached" << endl;

	delete[] temp_buffer_size;
	segments_to_place.clear();
    return traffic;
}
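
// The placement rule above, illustrated for NUM_GPU = 2: segments 0, 2, 4, ...
// of every weighted column land on GPU 0 and segments 1, 3, 5, ... on GPU 1,
// so all cached columns of a given segment row are co-located on one GPU.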

unsigned long long
CacheManager::ReplicationDataPlacement() {
	multimap<double, Segment*> access_weight_map;
	unsigned long long traffic = 0;

	for (int i = TOT_COLUMN-1; i >= 0; i--) {
		for (int j = 0; j < allColumn[i]->total_segment; j++) {
			Segment* segment = index_to_segment[i][j];
			if (segment->weight > 0) access_weight_map.insert({segment->weight, segment});
		}
	}

	int* temp_buffer_size = new int[NUM_GPU]();
	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		temp_buffer_size[gpu] = 0;
	}
	vector<set<Segment*>> segments_to_place;
	multimap<double, Segment*>::reverse_iterator cit;

	segments_to_place.resize(NUM_GPU);
	cit = access_weight_map.rbegin();

	while (cit != access_weight_map.rend()) {		
			//replication-only placement: copy each weighted segment to every
			//GPU that still has capacity
			Segment* seg = cit->second;
			for (int gpu = 0; gpu < NUM_GPU; gpu++) {
				if(temp_buffer_size[gpu] + 1 < cache_each_tot_seg && cit->first > 0) {
					temp_buffer_size[gpu]+=1;
					segments_to_place[gpu].insert(seg);
					// cout << "Placing ";
					// cout << seg->column->column_name << " segment " << seg->segment_id;
					// cout << " in " << gpu << endl;
				}
			}
			++cit;
	}

	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
    	assert(temp_buffer_size[gpu] <= cache_each_tot_seg);
		cout << "Cached segment in GPU " << gpu << " : " << temp_buffer_size[gpu] << " Cache total: " << cache_each_tot_seg << endl;
	}

    for (int i = 0; i < TOT_COLUMN; i++) {
    	for (int j = 0; j < allColumn[i]->total_segment; j++) {
    		Segment* segment = index_to_segment[i][j];
			for (int gpu = 0; gpu < NUM_GPU; gpu++) {
				if (segments_to_place[gpu].find(segment) == segments_to_place[gpu].end()) {
					unsigned char check = (segment_bitmap[i][j] << gpu);
					check = check >> 7;
					if (check == 0x01) {
						assert(cache_mapper[gpu].find(segment) != cache_mapper[gpu].end());
						// cout << "Deleting segment ";
						// cout << segment->column->column_name << " segment " << segment->segment_id << endl;
						deleteSegmentInGPU(segment, gpu);
					}
				}
			}
		}
    }

	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		set<Segment*>::const_iterator cit2;
		for(cit2 = segments_to_place[gpu].cbegin();cit2 != segments_to_place[gpu].cend(); ++cit2){
			unsigned char check = (segment_bitmap[(*cit2)->column->column_id][(*cit2)->segment_id] << gpu);
			check = check >> 7;
			if (check == 0x00) {
					assert(cache_mapper[gpu].find(*cit2) == cache_mapper[gpu].end());
					// cout << "Caching segment ";
					// cout << (*cit2)->column->column_name << " " << (*cit2)->segment_id << endl;
					cacheSegmentInGPU(*cit2, gpu);
					traffic += SEGMENT_SIZE * sizeof(int);
			}
		}
	}
    cout << "Successfully cached" << endl;

	delete[] temp_buffer_size;
	segments_to_place.clear();
    return traffic;
}

unsigned long long
CacheManager::ShuffleAwareDataPlacement() {
	multimap<double, Segment*> access_weight_map;
	multimap<double, Segment*> replication_weight_map;
	vector<vector<bool>> segrow_is_replicated;
	unsigned long long traffic = 0;

	for (int i = TOT_COLUMN-1; i >= 0; i--) {
		for (int j = 0; j < allColumn[i]->total_segment; j++) {
			Segment* segment = index_to_segment[i][j];
			if (segment->weight > 0) access_weight_map.insert({segment->weight, segment});
			if (segment->repl_weight > 0) replication_weight_map.insert({segment->repl_weight, segment});
			// cout << allColumn[i]->column_name << " " << j << " " << segment->weight << endl;
		}
	}

	int* temp_buffer_size = new int[NUM_GPU]();
	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		temp_buffer_size[gpu] = 0;
	}
	vector<set<Segment*>> segments_to_place;
	//map_segment values: -1 = not cached, 99 = replicated, otherwise the owning
	//gpu id; beware that std::map's operator[] default-inserts 0 for unseen
	//keys, and 0 is also a valid gpu id
	map<Segment*, int> map_segment;
	multimap<double, Segment*>::reverse_iterator cit;
	multimap<double, Segment*>::reverse_iterator cit2;

	segrow_is_replicated.resize(NUM_TABLE);
	for (int table = 0; table < NUM_TABLE; table++) {
		segrow_is_replicated[table].resize(allTable[table]->total_segment);
	}
	segments_to_place.resize(NUM_GPU);
	cit2 = replication_weight_map.rbegin();
	cit = access_weight_map.rbegin();

	while (cit != access_weight_map.rend() || cit2 != replication_weight_map.rend()) {		
		//cache segment to gpu
		if ((cit != access_weight_map.rend() && cit2 == replication_weight_map.rend()) || (cit != access_weight_map.rend() && cit2 != replication_weight_map.rend() && cit->first >= cit2->first)) {
			//check if segment is marked as replicated
			//if its marked, then replicate those to all gpu right away
			//if its not, then only cache in one gpu
			Segment* seg = cit->second;
			assert(map_segment[seg] != -1);
			//if this segrow is already replicated then you have to replicate everything
			if (segrow_is_replicated[seg->column->table_id][seg->segment_id]) {
				
				//just checking that all cached segment are replicated
				for (int i = 0; i < columns_in_table[seg->column->table_id].size(); i++) {
					int column_id = columns_in_table[seg->column->table_id][i];
					Segment* next_seg = index_to_segment[column_id][seg->segment_id];
					assert(map_segment[next_seg] == -1 || map_segment[next_seg] == 99);
				}

				//check if we have the capacity
				bool could_replicate = couldReplicateCheck(seg, cit->first, cache_each_tot_seg, temp_buffer_size);
				if (could_replicate) {
					for (int gpu = 0; gpu < NUM_GPU; gpu++) {
						if(temp_buffer_size[gpu] + 1 < cache_each_tot_seg && cit->first > 0) {
							temp_buffer_size[gpu]+=1;
							segments_to_place[gpu].insert(seg);
							// cout << "Placing ";
							// cout << seg->column->column_name << " segment " << seg->segment_id;
							// cout << " in " << gpu << endl;
						}
					}
					map_segment[seg] = 99;
				}
			} else {
				// int gpu = seg->segment_id % NUM_GPU; //!!!!
				//THIS ONLY WORKS IF THE TOTAL DIM SEGMENT IS DIVISIBLE BY NUM GPU
				int gpu;
				//fact-table segments go round-robin; dimension segments are range-partitioned
				if (seg->column->table_id == 0) gpu = seg->segment_id % NUM_GPU;
				else gpu = seg->segment_id * NUM_GPU / seg->column->total_segment;
				if(temp_buffer_size[gpu] + 1 < cache_each_tot_seg && cit->first > 0) {
					temp_buffer_size[gpu]+=1;
					segments_to_place[gpu].insert(seg);
					// cout << "Placing ";
					// cout << seg->column->column_name << " segment " << seg->segment_id;
					// cout << " in " << gpu << endl;
					map_segment[seg] = gpu;
				}
			}
			++cit;
		} else if ((cit == access_weight_map.rend() && cit2 != replication_weight_map.rend()) || (cit != access_weight_map.rend() && cit2 != replication_weight_map.rend() && cit->first < cit2->first)){

			//cache A, cache B, replicate A, replicate B
			//cache A, replicate A, cache B, replicate B
			//cache A, replicate A, replicate B, cache B
			//replicate A, cache A, replicate B, cache B
			//replicate A, replicate B, cache A, cache B
			//replicate A, cache A, cache B, replicate B

			Segment* seg = cit2->second;

			//if segrow is replicated, then the segment should have been replicated when its cached
			if (segrow_is_replicated[seg->column->table_id][seg->segment_id] == 1) {
				for (int i = 0; i < columns_in_table[seg->column->table_id].size(); i++) {
					int column_id = columns_in_table[seg->column->table_id][i];
					Segment* next_seg = index_to_segment[column_id][seg->segment_id];
					assert(map_segment[next_seg] == -1 || map_segment[next_seg] == 99);
				}
			} else {
				//this segment is already marked replicated; it must never
				//reappear in the replication pass
				if (map_segment[seg] == 99) {
					assert(0);
				//this segment row has not been replicated but seg is already cached
				} else if (map_segment[seg] != -1) {
					assert(segrow_is_replicated[seg->column->table_id][seg->segment_id] == 0);
					bool could_replicate = couldReplicateCachedSegmentRow(seg, cit2->first, cache_each_tot_seg, temp_buffer_size, map_segment);
					if (could_replicate) {
						int cur_gpu = map_segment[seg];
						for (int gpu = 0; gpu < NUM_GPU; gpu++) {
							if (gpu != cur_gpu) {
								if(temp_buffer_size[gpu] + 1 < cache_each_tot_seg && cit2->first > 0) {
									temp_buffer_size[gpu]+=1;
									segments_to_place[gpu].insert(seg);
									// cout << "Placing ";
									// cout << seg->column->column_name << " segment " << seg->segment_id;
									// cout << " in " << gpu << endl;
								}
							}
						}
						map_segment[seg] = 99; // a sign that it is replicated
						//replicate all other segments in the same segment row
						for (int i = 0; i < columns_in_table[seg->column->table_id].size(); i++) {
							int column_id = columns_in_table[seg->column->table_id][i];
							if (column_id != seg->column->column_id) {
								Segment* next_seg = index_to_segment[column_id][seg->segment_id];
								//if the segment is already cached then you want to replicate it
								//if the segment is already replicated then do nothing
								if (map_segment[next_seg] != -1 && map_segment[next_seg] != 99) {
									int cur_gpu = map_segment[next_seg];
									for (int gpu = 0; gpu < NUM_GPU; gpu++) {
										if (gpu != cur_gpu) {
											if(temp_buffer_size[gpu] + 1 < cache_each_tot_seg && cit2->first > 0) {
												temp_buffer_size[gpu]+=1;
												segments_to_place[gpu].insert(next_seg);
												// cout << "Placing ";
												// cout << next_seg->column->column_name << " segment " << next_seg->segment_id;
												// cout << " in " << gpu << endl;
											}
										}
									}
									map_segment[next_seg] = 99; // a sign that this segment is replicated
								}
							}
						}
						segrow_is_replicated[seg->column->table_id][seg->segment_id] = 1;
					}
				//this segment has not been cached
				} else if (map_segment[seg] == -1) {
					bool could_replicate = couldReplicateCachedSegmentRow(seg, cit2->first, cache_each_tot_seg, temp_buffer_size, map_segment);
					if (could_replicate) {
						//replicate all other segments in the same segment row
						for (int i = 0; i < columns_in_table[seg->column->table_id].size(); i++) {
							int column_id = columns_in_table[seg->column->table_id][i];
							if (column_id != seg->column->column_id) {
								Segment* next_seg = index_to_segment[column_id][seg->segment_id];
								//if the segment is already cached then you want to replicate it
								//if the segment is already replicated then do nothing
								if (map_segment[next_seg] != -1 && map_segment[next_seg] != 99) {
									cout << "Warning" << endl;
									int cur_gpu = map_segment[next_seg];
									for (int gpu = 0; gpu < NUM_GPU; gpu++) {
										if (gpu != cur_gpu) {
											if(temp_buffer_size[gpu] + 1 < cache_each_tot_seg && cit2->first > 0) {
												temp_buffer_size[gpu]+=1;
												segments_to_place[gpu].insert(next_seg);
												// cout << "Placing ";
												// cout << next_seg->column->column_name << " segment " << next_seg->segment_id;
												// cout << " in " << gpu << endl;
											}
										}
									}
									map_segment[next_seg] = 99; // a sign that this segment is replicated
								}
							}
						}
						segrow_is_replicated[seg->column->table_id][seg->segment_id] = 1;
					}
				}
			}
			++cit2;
		} else {
			assert(0);
		}
	}

	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
    	assert(temp_buffer_size[gpu] <= cache_each_tot_seg);
		cout << "Cached segment in GPU " << gpu << " : " << temp_buffer_size[gpu] << " Cache total: " << cache_each_tot_seg << endl;
	}

    for (int i = 0; i < TOT_COLUMN; i++) {
    	for (int j = 0; j < allColumn[i]->total_segment; j++) {
    		Segment* segment = index_to_segment[i][j];
			for (int gpu = 0; gpu < NUM_GPU; gpu++) {
				if (segments_to_place[gpu].find(segment) == segments_to_place[gpu].end()) {
					unsigned char check = (segment_bitmap[i][j] << gpu);
					check = check >> 7;
					if (check == 0x01) {
						assert(cache_mapper[gpu].find(segment) != cache_mapper[gpu].end());
						// cout << "Deleting segment ";
						// cout << segment->column->column_name << " segment " << segment->segment_id << endl;
						deleteSegmentInGPU(segment, gpu);
					}
				}
			}
		}
    }

	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		set<Segment*>::const_iterator cit2;
		for(cit2 = segments_to_place[gpu].cbegin();cit2 != segments_to_place[gpu].cend(); ++cit2){
			unsigned char check = (segment_bitmap[(*cit2)->column->column_id][(*cit2)->segment_id] << gpu);
			check = check >> 7;
			if (check == 0x00) {
					assert(cache_mapper[gpu].find(*cit2) == cache_mapper[gpu].end());
					// cout << "Caching segment ";
					// cout << (*cit2)->column->column_name << " " << (*cit2)->segment_id << endl;
					cacheSegmentInGPU(*cit2, gpu);
					traffic += SEGMENT_SIZE * sizeof(int);
			}
		}
	}
    cout << "Successfully cached" << endl;

	delete[] temp_buffer_size;
	map_segment.clear();
	segments_to_place.clear();
    return traffic;
}
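
// The placement rules above, illustrated for NUM_GPU = 4 (sizes hypothetical):
// fact-table segments (table_id == 0) go round-robin as segment_id % 4, while
// a dimension column with total_segment = 8 is range-partitioned as
// segment_id * 4 / 8, i.e. segments 0-1 -> GPU 0, 2-3 -> GPU 1, 4-5 -> GPU 2,
// 6-7 -> GPU 3; segments whose repl_weight wins the interleaved scan are
// instead copied to every GPU with spare capacity.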

void
CacheManager::ShuffleAware(Distribution dist, bool opt) {
	
	int start = 0;
	if (dist == Zipf) start = (LO_LEN/SEGMENT_SIZE)*2.8/7;
	// if (dist == Zipf) start = (LO_LEN/SEGMENT_SIZE)/2;

	if (SF == 322 || SF == 402) {
		assert(opt == 0);
		assignWeight(lo_suppkey, start, lo_suppkey->total_segment, 850, 0);
		assignWeight(lo_custkey, start, lo_custkey->total_segment, 750, 0);
		assignWeight(lo_partkey, start, lo_partkey->total_segment, 650, 0);
		assignWeight(lo_orderdate, start, lo_orderdate->total_segment, 550, 0);
		assignWeight(lo_revenue, start, lo_revenue->total_segment, 540, 0);
		assignWeight(lo_supplycost, start, lo_supplycost->total_segment, 520, 0);
		assignWeight(lo_quantity, start, lo_quantity->total_segment, 500, 0);
		assignWeight(lo_discount, start, lo_discount->total_segment, 100, 0);
		assignWeight(lo_extendedprice, start, lo_extendedprice->total_segment, 50, 0);

		assignWeight(s_suppkey, 0, s_suppkey->total_segment, 1000, 400);
		assignWeight(s_region, 0, s_region->total_segment, 900, 400);
		assignWeight(s_nation, 0, s_nation->total_segment, 900, 400);
		assignWeight(s_city, 0, s_city->total_segment, 900, 400);

		if (dist != Zipf) {
			assignWeight(c_custkey, 0, c_custkey->total_segment, 1000, 0);
			assignWeight(c_region, 0, c_region->total_segment, 800, 0);
			assignWeight(c_nation, 0, c_nation->total_segment, 800, 0);
			assignWeight(c_city, 0, c_city->total_segment, 800, 0);
		} else {
			assignWeight(c_custkey, 0, c_custkey->total_segment, 1000, 25);
			assignWeight(c_region, 0, c_region->total_segment, 800, 25);
			assignWeight(c_nation, 0, c_nation->total_segment, 800, 25);
			assignWeight(c_city, 0, c_city->total_segment, 800, 25);
		}

		assignWeight(p_partkey, 0, p_partkey->total_segment, 1000, 0);
		assignWeight(p_mfgr, 0, p_mfgr->total_segment, 700, 0);
		assignWeight(p_category, 0, p_category->total_segment, 700, 0);
		assignWeight(p_brand1, 0, p_brand1->total_segment, 700, 0);

		assignWeight(d_datekey, 0, d_datekey->total_segment, 1000, 1000);
		assignWeight(d_year, 0, d_year->total_segment, 1000, 1000);
		assignWeight(d_yearmonthnum, 0, d_yearmonthnum->total_segment, 1000, 1000);
	} else if (SF % 10 == 0) {
		assert(opt == 0);
		assert(dist == None);
		assignWeight(lo_suppkey, start, lo_suppkey->total_segment, 850, 0);
		assignWeight(lo_custkey, start, lo_custkey->total_segment, 750, 0);
		assignWeight(lo_partkey, start, lo_partkey->total_segment, 650, 0);
		assignWeight(lo_orderdate, start, lo_orderdate->total_segment, 550, 0);
		assignWeight(lo_revenue, start, lo_revenue->total_segment, 540, 0);
		assignWeight(lo_supplycost, start, lo_supplycost->total_segment, 520, 0);
		assignWeight(lo_quantity, start, lo_quantity->total_segment, 500, 0);
		assignWeight(lo_discount, start, lo_discount->total_segment, 100, 0);
		assignWeight(lo_extendedprice, start, lo_extendedprice->total_segment, 50, 0);

		assignWeight(s_suppkey, 0, s_suppkey->total_segment, 1000, 900);
		assignWeight(s_region, 0, s_region->total_segment, 900, 900);
		assignWeight(s_nation, 0, s_nation->total_segment, 900, 900);
		assignWeight(s_city, 0, s_city->total_segment, 900, 900);

		assignWeight(c_custkey, 0, c_custkey->total_segment, 1000, 1000);
		assignWeight(c_region, 0, c_region->total_segment, 900, 900);
		assignWeight(c_nation, 0, c_nation->total_segment, 900, 900);
		assignWeight(c_city, 0, c_city->total_segment, 900, 900);

		assignWeight(p_partkey, 0, p_partkey->total_segment, 1000, 1000);
		assignWeight(p_mfgr, 0, p_mfgr->total_segment, 900, 900);
		assignWeight(p_category, 0, p_category->total_segment, 900, 900);
		assignWeight(p_brand1, 0, p_brand1->total_segment, 900, 900);

		assignWeight(d_datekey, 0, d_datekey->total_segment, 1000, 1000);
		assignWeight(d_year, 0, d_year->total_segment, 1000, 1000);
		assignWeight(d_yearmonthnum, 0, d_yearmonthnum->total_segment, 1000, 1000);
	} else {
		assignWeight(lo_suppkey, start, lo_suppkey->total_segment, 850, 0);
		assignWeight(lo_custkey, start, lo_custkey->total_segment, 750, 0);
		assignWeight(lo_partkey, start, lo_partkey->total_segment, 650, 0);
		assignWeight(lo_orderdate, start, lo_orderdate->total_segment, 550, 0);
		assignWeight(lo_revenue, start, lo_revenue->total_segment, 540, 0);
		assignWeight(lo_supplycost, start, lo_supplycost->total_segment, 520, 0);
		assignWeight(lo_quantity, start, lo_quantity->total_segment, 500, 0);
		assignWeight(lo_discount, start, lo_discount->total_segment, 100, 0);
		assignWeight(lo_extendedprice, start, lo_extendedprice->total_segment, 50, 0);

		if (opt) {
			assignWeight(s_suppkey, 0, s_suppkey->total_segment, 1000, 0);
			assignWeight(s_region, 0, s_region->total_segment, 900, 0);
			assignWeight(s_nation, 0, s_nation->total_segment, 900, 0);
			assignWeight(s_city, 0, s_city->total_segment, 900, 0);

			assignWeight(c_custkey, 0, c_custkey->total_segment, 1000, 0);
			assignWeight(c_region, 0, c_region->total_segment, 800, 0);
			assignWeight(c_nation, 0, c_nation->total_segment, 800, 0);
			assignWeight(c_city, 0, c_city->total_segment, 800, 0);

			assignWeight(p_partkey, 0, p_partkey->total_segment, 1000, 0);
			assignWeight(p_mfgr, 0, p_mfgr->total_segment, 700, 0);
			assignWeight(p_category, 0, p_category->total_segment, 700, 0);
			assignWeight(p_brand1, 0, p_brand1->total_segment, 700, 0);
		} else {
			assignWeight(s_suppkey, 0, s_suppkey->total_segment, 1000, 400);
			assignWeight(s_region, 0, s_region->total_segment, 900, 400);
			assignWeight(s_nation, 0, s_nation->total_segment, 900, 400);
			assignWeight(s_city, 0, s_city->total_segment, 900, 400);

			if (SF == 162 && dist == Zipf && NUM_GPU == 2) {
				assignWeight(c_custkey, 0, c_custkey->total_segment, 1000, 0);
				assignWeight(c_region, 0, c_region->total_segment, 800, 0);
				assignWeight(c_nation, 0, c_nation->total_segment, 800, 0);
				assignWeight(c_city, 0, c_city->total_segment, 800, 0);
			} else {
				assignWeight(c_custkey, 0, c_custkey->total_segment, 1000, 300);
				assignWeight(c_region, 0, c_region->total_segment, 800, 300);
				assignWeight(c_nation, 0, c_nation->total_segment, 800, 300);
				assignWeight(c_city, 0, c_city->total_segment, 800, 300);
			}

			if ((SF == 162 && dist != Zipf && NUM_GPU == 4) || (SF == 162 && dist == Zipf && NUM_GPU == 2)) {
				assignWeight(p_partkey, 0, p_partkey->total_segment, 1000, 0);
				assignWeight(p_mfgr, 0, p_mfgr->total_segment, 700, 0);
				assignWeight(p_category, 0, p_category->total_segment, 700, 0);
				assignWeight(p_brand1, 0, p_brand1->total_segment, 700, 0);
			} else {
				assignWeight(p_partkey, 0, p_partkey->total_segment, 1000, 200);
				assignWeight(p_mfgr, 0, p_mfgr->total_segment, 700, 200);
				assignWeight(p_category, 0, p_category->total_segment, 700, 200);
				assignWeight(p_brand1, 0, p_brand1->total_segment, 700, 200);
			}
		}

		assignWeight(d_datekey, 0, d_datekey->total_segment, 1000, 1000);
		assignWeight(d_year, 0, d_year->total_segment, 1000, 1000);
		assignWeight(d_yearmonthnum, 0, d_yearmonthnum->total_segment, 1000, 1000);
	}

	ShuffleAwareDataPlacement();

	string path;
	if (dist == None) {
		path = string("logs/stats/shuffleaware") + to_string(NUM_GPU) + "GPUsSF" + to_string(SF);
		if (!opt) dumpStats(path, dist);
	} else if (dist == Zipf) {
		path = string("logs/stats/shuffleaware") + to_string(NUM_GPU) + "GPUsSF" + to_string(SF) + "Zipf";
		if (!opt) dumpStats(path, dist);
	}

}

void
CacheManager::PartitioningOnly(Distribution dist) {

	if (SF % 10 == 0) assert(0);

	int start = 0;
	// if (dist == Zipf) start = (LO_LEN/SEGMENT_SIZE)*2.8/7;

	assignWeight(lo_suppkey, start, lo_suppkey->total_segment, 850, 0);
	assignWeight(lo_custkey, start, lo_custkey->total_segment, 750, 0);
	assignWeight(lo_partkey, start, lo_partkey->total_segment, 650, 0);
	assignWeight(lo_orderdate, start, lo_orderdate->total_segment, 550, 0);
	assignWeight(lo_revenue, start, lo_revenue->total_segment, 540, 0);
	assignWeight(lo_supplycost, start, lo_supplycost->total_segment, 520, 0);
	assignWeight(lo_quantity, start, lo_quantity->total_segment, 500, 0);
	assignWeight(lo_discount, start, lo_discount->total_segment, 100, 0);
	assignWeight(lo_extendedprice, start, lo_extendedprice->total_segment, 50, 0);

	assignWeight(s_suppkey, 0, s_suppkey->total_segment, 1000, 0);
	assignWeight(s_region, 0, s_region->total_segment, 900, 0);
	assignWeight(s_nation, 0, s_nation->total_segment, 900, 0);
	assignWeight(s_city, 0, s_city->total_segment, 900, 0);

	assignWeight(c_custkey, 0, c_custkey->total_segment, 1000, 0);
	assignWeight(c_region, 0, c_region->total_segment, 800, 0);
	assignWeight(c_nation, 0, c_nation->total_segment, 800, 0);
	assignWeight(c_city, 0, c_city->total_segment, 800, 0);

	assignWeight(p_partkey, 0, p_partkey->total_segment, 1000, 0);
	assignWeight(p_mfgr, 0, p_mfgr->total_segment, 700, 0);
	assignWeight(p_category, 0, p_category->total_segment, 700, 0);
	assignWeight(p_brand1, 0, p_brand1->total_segment, 700, 0);

	assignWeight(d_datekey, 0, d_datekey->total_segment, 1000, 0);
	assignWeight(d_year, 0, d_year->total_segment, 1000, 0);
	assignWeight(d_yearmonthnum, 0, d_yearmonthnum->total_segment, 1000, 0);

	PartitionDataPlacement();

	string path;
	if (dist == None) {
		path = string("logs/stats/partitiononly") + to_string(NUM_GPU) + "GPUsSF" + to_string(SF);
		dumpStats(path, dist);
	} else if (dist == Zipf) {
		path = string("logs/stats/partitiononly") + to_string(NUM_GPU) + "GPUsSF" + to_string(SF) + "Zipf";
		dumpStats(path, dist);
	}

}

void
CacheManager::ReplicationOnly(Distribution dist) {

	if (SF % 10 == 0) assert(0);

	int start = 0;
	if (dist == Zipf) start = (LO_LEN/SEGMENT_SIZE)*2.8/7;

	assignWeight(lo_suppkey, start, lo_suppkey->total_segment, 850, 0);
	assignWeight(lo_custkey, start, lo_custkey->total_segment, 750, 0);
	assignWeight(lo_partkey, start, lo_partkey->total_segment, 650, 0);
	assignWeight(lo_orderdate, start, lo_orderdate->total_segment, 550, 0);
	assignWeight(lo_revenue, start, lo_revenue->total_segment, 540, 0);
	assignWeight(lo_supplycost, start, lo_supplycost->total_segment, 520, 0);
	assignWeight(lo_quantity, start, lo_quantity->total_segment, 500, 0);
	assignWeight(lo_discount, start, lo_discount->total_segment, 100, 0);
	assignWeight(lo_extendedprice, start, lo_extendedprice->total_segment, 50, 0);

	assignWeight(s_suppkey, 0, s_suppkey->total_segment, 1000, 0);
	assignWeight(s_region, 0, s_region->total_segment, 900, 0);
	assignWeight(s_nation, 0, s_nation->total_segment, 900, 0);
	assignWeight(s_city, 0, s_city->total_segment, 900, 0);

	assignWeight(c_custkey, 0, c_custkey->total_segment, 1000, 0);
	assignWeight(c_region, 0, c_region->total_segment, 800, 0);
	assignWeight(c_nation, 0, c_nation->total_segment, 800, 0);
	assignWeight(c_city, 0, c_city->total_segment, 800, 0);

	assignWeight(p_partkey, 0, p_partkey->total_segment, 1000, 0);
	assignWeight(p_mfgr, 0, p_mfgr->total_segment, 700, 0);
	assignWeight(p_category, 0, p_category->total_segment, 700, 0);
	assignWeight(p_brand1, 0, p_brand1->total_segment, 700, 0);

	assignWeight(d_datekey, 0, d_datekey->total_segment, 1000, 0);
	assignWeight(d_year, 0, d_year->total_segment, 1000, 0);
	assignWeight(d_yearmonthnum, 0, d_yearmonthnum->total_segment, 1000, 0);

	ReplicationDataPlacement();

	string path;
	if (dist == None) {
		path = string("logs/stats/replicationonly") + to_string(NUM_GPU) + "GPUsSF" + to_string(SF);
		dumpStats(path, dist);
	} else if (dist == Zipf) {
		path = string("logs/stats/replicationonly") + to_string(NUM_GPU) + "GPUsSF" + to_string(SF) + "Zipf";
		cout << path << endl;
		dumpStats(path, dist);
	}

}

void
CacheManager::loadColumnToCPU() {

	lo = new TableInfo("lo", LO_LEN, 0);
	s = new TableInfo("s", S_LEN, 1);
	c = new TableInfo("c", C_LEN, 2);
	p = new TableInfo("p", P_LEN, 3);
	d = new TableInfo("d", D_LEN, 4);

	allTable[0] = lo;
	allTable[1] = s;
	allTable[2] = c;
	allTable[3] = p;
	allTable[4] = d;

	h_lo_orderkey = loadColumnPinned<int>("lo_orderkey", LO_LEN);
	h_lo_suppkey = loadColumnPinned<int>("lo_suppkey", LO_LEN);
	h_lo_custkey = loadColumnPinned<int>("lo_custkey", LO_LEN);
	h_lo_partkey = loadColumnPinned<int>("lo_partkey", LO_LEN);
	h_lo_orderdate = loadColumnPinned<int>("lo_orderdate", LO_LEN);
	h_lo_revenue = loadColumnPinned<int>("lo_revenue", LO_LEN);
	h_lo_discount = loadColumnPinned<int>("lo_discount", LO_LEN);
	h_lo_quantity = loadColumnPinned<int>("lo_quantity", LO_LEN);
	h_lo_extendedprice = loadColumnPinned<int>("lo_extendedprice", LO_LEN);
	h_lo_supplycost = loadColumnPinned<int>("lo_supplycost", LO_LEN);

	// h_lo_orderkey = loadColumnPinnedSort<int>("lo_orderkey", LO_LEN);
	// h_lo_suppkey = loadColumnPinnedSort<int>("lo_suppkey", LO_LEN);
	// h_lo_custkey = loadColumnPinnedSort<int>("lo_custkey", LO_LEN);
	// h_lo_partkey = loadColumnPinnedSort<int>("lo_partkey", LO_LEN);
	// h_lo_orderdate = loadColumnPinnedSort<int>("lo_orderdate", LO_LEN);
	// h_lo_revenue = loadColumnPinnedSort<int>("lo_revenue", LO_LEN);
	// h_lo_discount = loadColumnPinnedSort<int>("lo_discount", LO_LEN);
	// h_lo_quantity = loadColumnPinnedSort<int>("lo_quantity", LO_LEN);
	// h_lo_extendedprice = loadColumnPinnedSort<int>("lo_extendedprice", LO_LEN);
	// h_lo_supplycost = loadColumnPinnedSort<int>("lo_supplycost", LO_LEN);

	h_c_custkey = loadColumnPinned<int>("c_custkey", C_LEN);
	h_c_nation = loadColumnPinned<int>("c_nation", C_LEN);
	h_c_region = loadColumnPinned<int>("c_region", C_LEN);
	h_c_city = loadColumnPinned<int>("c_city", C_LEN);

	h_s_suppkey = loadColumnPinned<int>("s_suppkey", S_LEN);
	h_s_nation = loadColumnPinned<int>("s_nation", S_LEN);
	h_s_region = loadColumnPinned<int>("s_region", S_LEN);
	h_s_city = loadColumnPinned<int>("s_city", S_LEN);

	h_p_partkey = loadColumnPinned<int>("p_partkey", P_LEN);
	h_p_brand1 = loadColumnPinned<int>("p_brand1", P_LEN);
	h_p_category = loadColumnPinned<int>("p_category", P_LEN);
	h_p_mfgr = loadColumnPinned<int>("p_mfgr", P_LEN);

	h_d_datekey = loadColumnPinned<int>("d_datekey", D_LEN);
	h_d_year = loadColumnPinned<int>("d_year", D_LEN);
	h_d_yearmonthnum = loadColumnPinned<int>("d_yearmonthnum", D_LEN);

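	// Wrap each pinned array in a ColumnInfo carrying a globally unique
	// column id; these ids double as indices into allColumn below.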
	lo_orderkey = new ColumnInfo(lo, "lo_orderkey", 0, h_lo_orderkey);
	lo_suppkey = new ColumnInfo(lo, "lo_suppkey", 1, h_lo_suppkey);
	lo_custkey = new ColumnInfo(lo, "lo_custkey", 2, h_lo_custkey);
	lo_partkey = new ColumnInfo(lo, "lo_partkey", 3, h_lo_partkey);
	lo_orderdate = new ColumnInfo(lo, "lo_orderdate", 4, h_lo_orderdate);
	lo_revenue = new ColumnInfo(lo, "lo_revenue", 5, h_lo_revenue);
	lo_discount = new ColumnInfo(lo, "lo_discount", 6, h_lo_discount);
	lo_quantity = new ColumnInfo(lo, "lo_quantity", 7, h_lo_quantity);
	lo_extendedprice = new ColumnInfo(lo, "lo_extendedprice", 8, h_lo_extendedprice);
	lo_supplycost = new ColumnInfo(lo, "lo_supplycost", 9, h_lo_supplycost);

	c_custkey = new ColumnInfo(c, "c_custkey", 10, h_c_custkey);
	c_nation = new ColumnInfo(c, "c_nation", 11, h_c_nation);
	c_region = new ColumnInfo(c, "c_region", 12, h_c_region);
	c_city = new ColumnInfo(c, "c_city", 13, h_c_city);

	s_suppkey = new ColumnInfo(s, "s_suppkey", 14, h_s_suppkey);	
	s_nation = new ColumnInfo(s, "s_nation", 15, h_s_nation);
	s_region = new ColumnInfo(s, "s_region", 16, h_s_region);
	s_city = new ColumnInfo(s, "s_city", 17, h_s_city);

	p_partkey = new ColumnInfo(p, "p_partkey", 18, h_p_partkey);
	p_brand1 = new ColumnInfo(p, "p_brand1", 19, h_p_brand1);
	p_category = new ColumnInfo(p, "p_category", 20, h_p_category);
	p_mfgr = new ColumnInfo(p, "p_mfgr", 21, h_p_mfgr);

	d_datekey = new ColumnInfo(d, "d_datekey", 22, h_d_datekey);
	d_year = new ColumnInfo(d, "d_year", 23, h_d_year);
	d_yearmonthnum = new ColumnInfo(d, "d_yearmonthnum", 24, h_d_yearmonthnum);

	allColumn[0] = lo_orderkey;
	allColumn[1] = lo_suppkey;
	allColumn[2] = lo_custkey;
	allColumn[3] = lo_partkey;
	allColumn[4] = lo_orderdate;
	allColumn[5] = lo_revenue;
	allColumn[6] = lo_discount;
	allColumn[7] = lo_quantity;
	allColumn[8] = lo_extendedprice;
	allColumn[9] = lo_supplycost;

	allColumn[10] = c_custkey;
	allColumn[11] = c_nation;
	allColumn[12] = c_region;
	allColumn[13] = c_city;

	allColumn[14] = s_suppkey;
	allColumn[15] = s_nation;
	allColumn[16] = s_region;
	allColumn[17] = s_city;

	allColumn[18] = p_partkey;
	allColumn[19] = p_brand1;
	allColumn[20] = p_category;
	allColumn[21] = p_mfgr;

	allColumn[22] = d_datekey;
	allColumn[23] = d_year;
	allColumn[24] = d_yearmonthnum;

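	// Register each column with its owning table and record its position
	// within that table. Note the ordering quirk: allColumn lists customer
	// columns (10-13) before supplier (14-17), while the table ids are
	// s = 1 and c = 2.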
	for (int i = 0; i <= 9; i++) {
		allTable[0]->columns.push_back(allColumn[i]);
		allColumn[i]->column_table_id = allTable[0]->columns.size()-1;
	}

	for (int i = 10; i <= 13; i++) {
		allTable[2]->columns.push_back(allColumn[i]);
		allColumn[i]->column_table_id = allTable[2]->columns.size()-1;
	}

	for (int i = 14; i <= 17; i++) {
		allTable[1]->columns.push_back(allColumn[i]);
		allColumn[i]->column_table_id = allTable[1]->columns.size()-1;
	}

	for (int i = 18; i <= 21; i++) {
		allTable[3]->columns.push_back(allColumn[i]);
		allColumn[i]->column_table_id = allTable[3]->columns.size()-1;
	}

	for (int i = 22; i <= 24; i++) {
		allTable[4]->columns.push_back(allColumn[i]);
		allColumn[i]->column_table_id = allTable[4]->columns.size()-1;
	}

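	// Build the per-table list of global column ids.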
	columns_in_table.resize(TOT_TABLE);
	for (int i = 0; i < TOT_COLUMN; i++) {
		columns_in_table[allColumn[i]->table_id].push_back(allColumn[i]->column_id);
	}

}

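// Tear down in roughly reverse order of construction: per-GPU device
// buffers, host processing and pinned buffers, the pinned column arrays,
// the column metadata, and finally the segment bookkeeping structures.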
CacheManager::~CacheManager() {
	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		cudaSetDevice(gpu);
		CubDebugExit(cudaFree(gpuCache[gpu]));
		CubDebugExit(cudaFree(gpuProcessing[gpu]));	
	}
	cudaSetDevice(0);

	delete[] gpuCache;
	delete[] gpuProcessing;
	// These bookkeeping arrays are allocated with new[] in the constructor;
	// gpuBroadcast entries alias gpuCache, so only the pointer array itself
	// is released here.
	delete[] gpuPointer;
	delete[] broadcastPointer;
	delete[] gpuBroadcast;

	free(cpuProcessing); // allocated with malloc, not new[]
	CubDebugExit(cudaFreeHost(pinnedMemory));

	CubDebugExit(cudaFreeHost(h_lo_orderkey));
	CubDebugExit(cudaFreeHost(h_lo_suppkey));
	CubDebugExit(cudaFreeHost(h_lo_custkey));
	CubDebugExit(cudaFreeHost(h_lo_partkey));
	CubDebugExit(cudaFreeHost(h_lo_orderdate));
	CubDebugExit(cudaFreeHost(h_lo_revenue));
	CubDebugExit(cudaFreeHost(h_lo_discount)); 
	CubDebugExit(cudaFreeHost(h_lo_quantity));
	CubDebugExit(cudaFreeHost(h_lo_extendedprice));
	CubDebugExit(cudaFreeHost(h_lo_supplycost));

	CubDebugExit(cudaFreeHost(h_c_custkey));
	CubDebugExit(cudaFreeHost(h_c_nation));
	CubDebugExit(cudaFreeHost(h_c_region));
	CubDebugExit(cudaFreeHost(h_c_city));

	CubDebugExit(cudaFreeHost(h_s_suppkey));
	CubDebugExit(cudaFreeHost(h_s_nation));
	CubDebugExit(cudaFreeHost(h_s_region));
	CubDebugExit(cudaFreeHost(h_s_city));

	CubDebugExit(cudaFreeHost(h_p_partkey));
	CubDebugExit(cudaFreeHost(h_p_brand1));
	CubDebugExit(cudaFreeHost(h_p_category));
	CubDebugExit(cudaFreeHost(h_p_mfgr));

	CubDebugExit(cudaFreeHost(h_d_datekey));
	CubDebugExit(cudaFreeHost(h_d_year));
	CubDebugExit(cudaFreeHost(h_d_yearmonthnum));

	delete lo_orderkey;
	delete lo_orderdate;
	delete lo_custkey;
	delete lo_suppkey;
	delete lo_partkey;
	delete lo_revenue;
	delete lo_discount;
	delete lo_quantity;
	delete lo_extendedprice;
	delete lo_supplycost;

	delete c_custkey;
	delete c_nation;
	delete c_region;
	delete c_city;

	delete s_suppkey;	
	delete s_nation;
	delete s_region;
	delete s_city;

	delete p_partkey;
	delete p_brand1;
	delete p_category;
	delete p_mfgr;

	delete d_datekey;
	delete d_year;
	delete d_yearmonthnum;

	delete lo;
	delete s;
	delete c;
	delete p;
	delete d;

	for (int i = 0; i < TOT_COLUMN; i++) {
		free(segment_bitmap[i]);
	}
	free(segment_bitmap);

	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		for (int i = 0; i < TOT_COLUMN; i++) {
			CubDebugExit(cudaFreeHost(segment_list[gpu][i]));
		}
		free(segment_list[gpu]);
	}
	free(segment_list);
	
	for (int gpu = 0; gpu < NUM_GPU; gpu++) {
		for (int i = 0; i < TOT_TABLE; i++) {
			free(gpu_segment_row_bitmap[gpu][i]);
		}
		free(gpu_segment_row_bitmap[gpu]);
	}
	free(gpu_segment_row_bitmap);
}


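// Explicit instantiations for the allocator templates defined in this
// translation unit; without them, callers in other translation units would
// hit unresolved symbols at link time. An illustrative call site (assuming
// a CacheManager* cm):
//   int* buf = cm->customCudaMalloc<int>(SEGMENT_SIZE, /*gpu=*/ 0);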
template int*
CacheManager::customMalloc<int>(int size);

template int*
CacheManager::customCudaMalloc<int>(int size, int gpu);

template int**
CacheManager::customCudaMalloc<int*>(int size, int gpu);

template int***
CacheManager::customCudaMalloc<int**>(int size, int gpu);

template int****
CacheManager::customCudaMalloc<int***>(int size, int gpu);

template int*
CacheManager::customCudaHostAlloc<int>(int size);

template short*
CacheManager::customMalloc<short>(int size);

template short*
CacheManager::customCudaMalloc<short>(int size, int gpu);

template short*
CacheManager::customCudaHostAlloc<short>(int size);