#ifndef _KERNEL_LAUNCH_H_
#define _KERNEL_LAUNCH_H_

#pragma once

#include "QueryOptimizer.h"
#include "GPUProcessing.h"
#include "CPUProcessing.h"
#include "common.h"

#define MULTIPLIER 1.1 // multiplier for pessimistic memory-allocation assumptions

// Kernel variants that a KernelLaunch can dispatch.
enum KernelType {
    JustFilter, JustProbe, ProbeAggr, ProbeGroupby, ProbePartition, JustBuild,
    ShuffleProbe, ShuffleProbePartition, ShuffleProbeAggr, ShuffleProbeGroupby,
    Nope
};

// Bundles the per-operator GPU argument structs shared by kernel launches.
class KernelParams {
public:
    int** d_total;
    int** h_total;

    struct filterArgsGPU* fargs;
    struct probeArgsGPU* pargs;
    struct groupbyArgsGPU* gargs;
    struct buildArgsGPU* bargs;
    struct shuffleArgsGPU* sargs;
    struct shuffleOutGPU* sout;
    struct offsetGPU* in_off;
    struct offsetGPU* out_off;
    struct shuffleHelper* shelper;

    KernelParams(struct filterArgsGPU* _fargs, struct probeArgsGPU* _pargs,
            struct buildArgsGPU* _bargs, struct groupbyArgsGPU* _gargs,
            struct shuffleArgsGPU* _sargs, struct shuffleHelper* _shelper,
            int** _d_total, int** _h_total);
};

class KernelLaunch {
public:
    CacheManager* cm;
    KernelParams* kparams;
    QueryParams* qparams;

    int** d_total;
    int** h_total;

    int sg;
    KernelType kernel_type;
    int gpu;
    cudaStream_t stream;
    int table_id;
    short* d_segment_group_each_gpu;
    int INPUT_LEN;
    int output_estimate;
    int** off_col_out;
    int* key_off_col;        // only used for building the hash table
    float output_selectivity;
    bool aggrGPUcheck;
    int latemat;
    ColumnInfo* key_column;  // only used for building the hash table
    int* count;              // only used to count the partitioning result

    struct filterArgsGPU* fargs;
    struct probeArgsGPU* pargs;
    struct buildArgsGPU* bargs;
    struct groupbyArgsGPU* gargs;
    struct offsetGPU* in_off;
    struct offsetGPU* out_off;
    struct shuffleArgsGPU* sargs;
    struct shuffleOutGPU* sout;
    struct shuffleHelper* shelper;

    // Launch phase.
    void launchKernel(bool has_shuffled, bool broadcast = false,
            int* off_to_seg_id = NULL, int broadcast_len = 0);
    void launchPartitioning(int latemat = 0, int pipeline = 0);
    void launchKernelPipelined(int latemat, int first_join_in_pipeline);

    // Preparation phase.
    void prepareKernelFact(int*** &off_col, int*** used_col_idx,
            short*** segment_group_each_gpu_count, short*** segment_group_each_gpu,
            int** last_segment_gpu, bool* joinGPUcheck, int* fkey_col_id,
            int* group_col_id, int will_shuffle, bool has_shuffled);
    void prepareKernelDim(int*** &off_col, ColumnInfo* build_column,
            short*** segment_group_each_gpu_count, short*** segment_group_each_gpu,
            int** last_segment_gpu, int table, bool has_shuffled);
    void preparePartitioningWithoutCount(int*** &off_col, int*** used_col_idx,
            short*** segment_group_each_gpu_count, short*** segment_group_each_gpu,
            int** last_segment_gpu, bool* joinGPUcheck, bool first_shuffle,
            bool pipeline);
    void prepareKernelPipelined(int*** &off_col, int*** used_col_idx,
            short*** segment_group_each_gpu_count, short*** segment_group_each_gpu,
            int** last_segment_gpu, bool* joinGPUcheck, int* fkey_col_id,
            int* group_col_id, bool has_shuffled, int latemat = 0);
    void countPartitioning(int*** &off_col,
            short*** segment_group_each_gpu_count, short*** segment_group_each_gpu,
            int** last_segment_gpu, bool first_shuffle);
    void synchronizePartitioning();
    void preparePartitioningAfterCount(int*** &off_col, int*** used_col_idx,
            bool* joinGPUcheck, bool first_shuffle);

    // Cleanup phase.
    void clearKernel(int*** &off_col, int will_shuffle, bool has_shuffled);
    void clearPartitioning();
    void clearPipelined(int*** &off_col);

    KernelLaunch(CacheManager* _cm, KernelParams* _kparams, QueryParams* _qparams,
            int _sg, int _gpu, KernelType _kernel_type, int _table_id,
            float _output_selectivity, int latemat, bool aggrGPUcheck,
            cudaStream_t _stream);
};

#endif
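
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the API): the snippet below
// assumes a CacheManager (cm), QueryParams (qparams), cudaStream_t (stream),
// the per-operator argument structs, and the offset/segment-group arrays have
// already been set up elsewhere; those names and the example parameter values
// (table_id 0, selectivity 0.5f) are placeholders, not prescribed values.
//
//   // Bundle the per-operator GPU argument structs once per query.
//   KernelParams* kparams = new KernelParams(fargs, pargs, bargs, gargs,
//                                            sargs, shelper, d_total, h_total);
//
//   // One KernelLaunch per (segment group, GPU) pair; here a plain probe
//   // kernel with no late materialization and no GPU aggregation.
//   KernelLaunch* kl = new KernelLaunch(cm, kparams, qparams,
//                                       /*sg*/ 0, /*gpu*/ 0, JustProbe,
//                                       /*table_id*/ 0,
//                                       /*output_selectivity*/ 0.5f,
//                                       /*latemat*/ 0,
//                                       /*aggrGPUcheck*/ false, stream);
//
//   // Typical lifecycle: prepare, launch, then clear.
//   kl->prepareKernelFact(off_col, used_col_idx,
//                         segment_group_each_gpu_count, segment_group_each_gpu,
//                         last_segment_gpu, joinGPUcheck, fkey_col_id,
//                         group_col_id, /*will_shuffle*/ 0,
//                         /*has_shuffled*/ false);
//   kl->launchKernel(/*has_shuffled*/ false);
//   kl->clearKernel(off_col, /*will_shuffle*/ 0, /*has_shuffled*/ false);
// ---------------------------------------------------------------------------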