#ifndef _KERNEL_LAUNCH_H_
#define _KERNEL_LAUNCH_H_

#pragma once

#include "QueryOptimizer.h"
#include "GPUProcessing.h"
#include "CPUProcessing.h"
#include "common.h"

#define MULTIPLIER 1.1 // multiplier for pessimistic memory-allocation assumptions

// Kernel variants that a KernelLaunch can dispatch.
enum KernelType {
    JustFilter, JustProbe, ProbeAggr, ProbeGroupby, ProbePartition, JustBuild,
    ShuffleProbe, ShuffleProbePartition, ShuffleProbeAggr, ShuffleProbeGroupby,
    Nope
};

// Bundles the per-operator GPU argument structs shared by kernel launches.
class KernelParams {
public:
    int** d_total;
    int** h_total;

    struct filterArgsGPU* fargs;
    struct probeArgsGPU* pargs;
    struct groupbyArgsGPU* gargs;
    struct buildArgsGPU* bargs;
    struct shuffleArgsGPU* sargs;
    struct shuffleOutGPU* sout;
    struct offsetGPU* in_off;
    struct offsetGPU* out_off;
    struct shuffleHelper* shelper;

    KernelParams(struct filterArgsGPU* _fargs, struct probeArgsGPU* _pargs,
            struct buildArgsGPU* _bargs, struct groupbyArgsGPU* _gargs,
            struct shuffleArgsGPU* _sargs, struct shuffleHelper* _shelper,
            int** _d_total, int** _h_total);
};

class KernelLaunch {
public:
    CacheManager* cm;
    KernelParams* kparams;
    QueryParams* qparams;

    int** d_total;
    int** h_total;

    int sg;
    KernelType kernel_type;
    int gpu;
    cudaStream_t stream;
    int table_id;
    short* d_segment_group_each_gpu;
    int INPUT_LEN;
    int output_estimate;
    int** off_col_out;
    int* key_off_col;        // only used for building the hash table
    float output_selectivity;
    bool aggrGPUcheck;
    int latemat;
    ColumnInfo* key_column;  // only used for building the hash table
    int* count;              // only used to count the partitioning result

    struct filterArgsGPU* fargs;
    struct probeArgsGPU* pargs;
    struct buildArgsGPU* bargs;
    struct groupbyArgsGPU* gargs;
    struct offsetGPU* in_off;
    struct offsetGPU* out_off;
    struct shuffleArgsGPU* sargs;
    struct shuffleOutGPU* sout;
    struct shuffleHelper* shelper;

    // Launch phase.
    void launchKernel(bool has_shuffled, bool broadcast = false,
            int* off_to_seg_id = NULL, int broadcast_len = 0);
    void launchPartitioning(int latemat = 0, int pipeline = 0);
    void launchKernelPipelined(int latemat, int first_join_in_pipeline);

    // Preparation phase.
    void prepareKernelFact(int*** &off_col, int*** used_col_idx,
            short*** segment_group_each_gpu_count, short*** segment_group_each_gpu,
            int** last_segment_gpu, bool* joinGPUcheck, int* fkey_col_id,
            int* group_col_id, int will_shuffle, bool has_shuffled);
    void prepareKernelDim(int*** &off_col, ColumnInfo* build_column,
            short*** segment_group_each_gpu_count, short*** segment_group_each_gpu,
            int** last_segment_gpu, int table, bool has_shuffled);
    void preparePartitioningWithoutCount(int*** &off_col, int*** used_col_idx,
            short*** segment_group_each_gpu_count, short*** segment_group_each_gpu,
            int** last_segment_gpu, bool* joinGPUcheck, bool first_shuffle,
            bool pipeline);
    void prepareKernelPipelined(int*** &off_col, int*** used_col_idx,
            short*** segment_group_each_gpu_count, short*** segment_group_each_gpu,
            int** last_segment_gpu, bool* joinGPUcheck, int* fkey_col_id,
            int* group_col_id, bool has_shuffled, int latemat = 0);
    void countPartitioning(int*** &off_col,
            short*** segment_group_each_gpu_count, short*** segment_group_each_gpu,
            int** last_segment_gpu, bool first_shuffle);
    void synchronizePartitioning();
    void preparePartitioningAfterCount(int*** &off_col, int*** used_col_idx,
            bool* joinGPUcheck, bool first_shuffle);

    // Cleanup phase.
    void clearKernel(int*** &off_col, int will_shuffle, bool has_shuffled);
    void clearPartitioning();
    void clearPipelined(int*** &off_col);

    KernelLaunch(CacheManager* _cm, KernelParams* _kparams, QueryParams* _qparams,
            int _sg, int _gpu, KernelType _kernel_type, int _table_id,
            float _output_selectivity, int latemat, bool aggrGPUcheck,
            cudaStream_t _stream);
};

#endif
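
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the API): the snippet below
// assumes a CacheManager (cm), QueryParams (qparams), cudaStream_t (stream),
// the per-operator argument structs, and the offset/segment-group arrays have
// already been set up elsewhere; those names and the example parameter values
// (table_id 0, selectivity 0.5f) are placeholders, not prescribed values.
//
//   // Bundle the per-operator GPU argument structs once per query.
//   KernelParams* kparams = new KernelParams(fargs, pargs, bargs, gargs,
//                                            sargs, shelper, d_total, h_total);
//
//   // One KernelLaunch per (segment group, GPU) pair; here a plain probe
//   // kernel with no late materialization and no GPU aggregation.
//   KernelLaunch* kl = new KernelLaunch(cm, kparams, qparams,
//                                       /*sg*/ 0, /*gpu*/ 0, JustProbe,
//                                       /*table_id*/ 0,
//                                       /*output_selectivity*/ 0.5f,
//                                       /*latemat*/ 0,
//                                       /*aggrGPUcheck*/ false, stream);
//
//   // Typical lifecycle: prepare, launch, then clear.
//   kl->prepareKernelFact(off_col, used_col_idx,
//                         segment_group_each_gpu_count, segment_group_each_gpu,
//                         last_segment_gpu, joinGPUcheck, fkey_col_id,
//                         group_col_id, /*will_shuffle*/ 0,
//                         /*has_shuffled*/ false);
//   kl->launchKernel(/*has_shuffled*/ false);
//   kl->clearKernel(off_col, /*will_shuffle*/ 0, /*has_shuffled*/ false);
// ---------------------------------------------------------------------------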