#include "serialcalc.hpp" #include #include namespace py = pybind11; #define CL_TARGET_OPENCL_VERSION 210 #define CL_HPP_TARGET_OPENCL_VERSION 210 #include #include #include #include #include struct Kernel { //OpenCL objects cl::Context context; cl::Program program; cl::Kernel imageproc; cl::CommandQueue queue; //Input/Output buffers cl::Buffer input_buf; cl::Buffer output_buf; //Model buffers cl::Buffer whitebalance_buf; cl::Buffer PCA_buf; cl::Buffer PCAmean_buf; cl::Buffer SVM_buf; cl::Buffer SVM_intercept_buf; cl::Buffer SG_buf; // Model buffer for serial (CPU based processing) const float* PCA; const float* PCAmean; const float* SVM; const float* SVM_intercept; const float* WB; const float* savgol_coef; bool unified_memory = false; int Nx, Ny, Nc, NPCA, Nk, Wl_min, Wl_max, savgol_toggle, whitebalance_toggle, Nwindow; size_t WGS; Kernel( cl_device_type type = CL_DEVICE_TYPE_GPU, const py::array_t PCA = py::array_t(), const py::array_t PCAmean = py::array_t(), const py::array_t SVM = py::array_t(), const py::array_t SVM_intercept = py::array_t(), const py::array_t whitebalance = py::array_t(), const py::array_t savgol_coef = py::array_t(), int Wl_min = 10, int Wl_max = 150, int savgol_toggle = 1) { //Default device priority is GPU > CPU > Accelerator cl::Device device; std::vector devices = utils::get_devices(type); // get the GPU if (devices.empty()) devices = utils::get_devices(CL_DEVICE_TYPE_CPU); // if no GPU, get the CPU if (!devices.empty()) device = devices.at(0); else throw std::runtime_error("No OpenCL devices found"); std::cout << "Choosing " << device.getInfo() << " as the OpenCL device." << std::endl; context = cl::Context(device); unified_memory = utils::is_unified_memory(device); queue = cl::CommandQueue(context, device); this->load_model(PCA, PCAmean, whitebalance, SVM, SVM_intercept, savgol_coef, Wl_min, Wl_max, savgol_toggle); } void load_model(const py::array_t PCA, const py::array_t PCAmean, const py::array_t SVM, const py::array_t SVM_intercept, const py::array_t whitebalance, const py::array_t savgol_coef, int Wl_min = 10, int Wl_max = 150, int savgol_toggle = 1) { if(PCA.size() == 0 || PCAmean.size() == 0) return; //If no model is provided, return //Expect PCA to be Row Major that is (rows: Ncomponents , columns: Nwavelengths) //Expect PCAmean to be a vector of size (Wl_max - Wl_min) //Expect SVM to be Column Major that is (rows: Nk , columns: Ncomponents) //Expect SVM_intercept to be a vector of size Nk //Ncomponents = Dimension that data is reduced to by PCA //Nk = Number of classes in the model that data should be classified into //NPCA = Ncomponents Nwindow = savgol_coef.size(); this->NPCA = PCA.size() > 0 ? PCA.shape(0) : 1; this->Nk = SVM.size() > 0 ? SVM.shape(0) : 1; this->Wl_min = Wl_min; this->Wl_max = Wl_max; this->savgol_toggle = savgol_toggle; Nx = Ny = Nc = 0; if( Wl_max - Wl_min > PCA.shape(1)) throw std::runtime_error("Wavelength range: " + std::to_string(Wl_max - Wl_min) + " does not match the model: " + std::to_string(PCA.shape(1))); if((SVM.size() > 0) && (SVM.shape(1) != PCA.shape(0))) throw std::runtime_error("SVM dimensions: " + std::to_string(SVM.shape(1)) + " do not match the model: " + std::to_string(PCA.shape(0))); if((SVM.size() > 0) && (SVM_intercept.size() != SVM.shape(0))) throw std::runtime_error("SVM_intercept dimensions: " + std::to_string(SVM_intercept.size()) + " do not match the SVM: " + std::to_string(SVM.shape(0))); if(SVM.size() > 0 && SVM_intercept.size() == 0) throw std::runtime_error("SVM intercept not provided"); if(SVM.size() > 0 && Nk != SVM.shape(0)) throw std::runtime_error("Number of classes in SVM: " + std::to_string(SVM.shape(0)) + " does not match the assigned number of classes: " + std::to_string(Nk)); if(PCA.size() > 0 && PCA.shape(0) != NPCA) throw std::runtime_error("Number of PCA components: " + std::to_string(PCA.size()) + " does not match the assigned number of components: " + std::to_string(NPCA)); whitebalance_toggle = whitebalance.size() > 0 ? 1 : 0; //If whitebalance is not provided, do not use whitebalance // store the pointers to the model data this->PCA = PCA.size() > 0 ? PCA.data() : nullptr; this->PCAmean = PCAmean.size() > 0 ? PCAmean.data() : nullptr; this->SVM = SVM.size() > 0 ? SVM.data() : nullptr; this->SVM_intercept = SVM_intercept.size() > 0 ? SVM_intercept.data() : nullptr; this->WB = whitebalance.size() > 0 ? whitebalance.data() : nullptr; this->savgol_coef = savgol_coef.size() > 0 ? savgol_coef.data() : nullptr; //Deduce the dimensions of the model whitebalance_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, whitebalance.size(),const_cast(whitebalance.data())); PCA_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, PCA.size(), const_cast(PCA.data())); PCAmean_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, PCAmean.size(), const_cast(PCAmean.data())); SVM_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, SVM.size(), const_cast(SVM.data())); SVM_intercept_buf= utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, SVM_intercept.size(),const_cast(SVM_intercept.data())); SG_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, savgol_coef.size(), const_cast(savgol_coef.data())); auto device = queue.getInfo(); auto is_amd = device.getInfo().find("Advanced Micro Devices") != std::string::npos; auto is_nvidia = device.getInfo().find("NVIDIA Corporation") != std::string::npos; auto pref_wg_multiple = is_amd ? device.getInfo() : is_nvidia ? device.getInfo() : 1; auto max_wg_size = queue.getInfo().getInfo(); auto NC_MAX = Wl_max - Wl_min; WGS = utils::next_multiple(NC_MAX, pref_wg_multiple); if (WGS > max_wg_size) WGS = max_wg_size; auto ITEMS_PER_THREAD = (NC_MAX + WGS - 1) / WGS; program = utils::make_program("src/source.cl", context, "-cl-std=CL2.0 " "-cl-mad-enable " "-cl-no-signed-zeros " "-cl-fast-relaxed-math " "-cl-unsafe-math-optimizations " "-cl-finite-math-only " "-cl-denorms-are-zero " "-cl-single-precision-constant " "-D WGS=" + std::to_string(WGS) + " " "-D ITEMS_PER_THREAD=" + std::to_string(ITEMS_PER_THREAD) + " " "-D NC_EFF=" + std::to_string(NC_MAX) + " " "-D N_WINDOW=" + std::to_string(Nwindow) + " " "-D N_PCA=" + std::to_string(NPCA) + " " "-D N_K=" + std::to_string(Nk)); imageproc = cl::Kernel(program, "process_image"); } void run(const py::array_t in_image, py::array_t out_classified_image, bool print_time = false, bool verbose = false){ auto Tcpy = std::chrono::duration(0); auto Tkernel = std::chrono::duration(0); //Deduce the dimensions of the input image int dim = in_image.ndim(); int Nlines = 0; if (dim == 3) { Nlines = in_image.shape(0); //Number of lines Nx = in_image.shape(1); //Number of pixels per line Nc = in_image.shape(2); //Number of channels } else { Nlines = 1; Nx = in_image.shape(0); //Number of pixels per line Nc = in_image.shape(1); //Number of channels } auto start = std::chrono::high_resolution_clock::now(); if (Nlines > this->Ny && !unified_memory) { this->Ny = Nlines; input_buf = utils::make_buffer(context, CL_MEM_READ_ONLY , in_image.size()); output_buf = utils::make_buffer(context, CL_MEM_WRITE_ONLY, out_classified_image.size()); } else if (unified_memory) { this->Ny = Nlines; input_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, in_image.size(), const_cast(in_image.data())); output_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,out_classified_image.size(), const_cast(out_classified_image.data())); } auto end = std::chrono::high_resolution_clock::now(); Tcpy += std::chrono::duration(end - start); // THIS FAILS WHEN IN IMAGE IS ONLY A SINGLE LINE... maybe be fix at some point or just let caller be responsible for inputs being correct // if (Nx != out_classified_image.shape(1) || // Ny != out_classified_image.shape(0)){ // throw std::runtime_error("Input image dimensions do not match the model, Input Image Shape:" + std::to_string(Nx) + "x" + std::to_string(Ny) + " vs Output Image Shape: " + std::to_string(out_classified_image.shape(1)) + "x" + std::to_string(out_classified_image.shape(0))); // } // if (Wl_max - Wl_min > in_image.shape(2) || // Wl_max > Nc){ // throw std::runtime_error("Output image dimensions do not match the model, Output Image Shape:" + std::to_string(out_classified_image.shape(2)) + " vs Model Wavelength Range: " + std::to_string(Wl_max - Wl_min)); // } // Set kernel arguments and run the kernel utils::setArgs( imageproc, input_buf, PCA_buf, PCAmean_buf, SVM_buf, SVM_intercept_buf, whitebalance_buf, SG_buf, output_buf, Nx, Ny, Nc, NPCA, Nk, Wl_min, Wl_max, savgol_toggle, whitebalance_toggle, Nwindow ); start = std::chrono::high_resolution_clock::now(); if (!unified_memory) utils::enqueueWrites(queue, std::make_pair(input_buf, const_cast(in_image.data()))); end = std::chrono::high_resolution_clock::now(); Tcpy += std::chrono::duration(end - start); start = std::chrono::high_resolution_clock::now(); CL_CALL(queue.enqueueNDRangeKernel, imageproc, cl::NullRange, cl::NDRange(WGS*Nx*Ny), cl::NDRange(WGS)); queue.finish(); end = std::chrono::high_resolution_clock::now(); Tkernel += std::chrono::duration(end - start); start = std::chrono::high_resolution_clock::now(); if (!unified_memory) utils::enqueueReads(queue, std::make_pair(output_buf, const_cast(out_classified_image.data()))); end = std::chrono::high_resolution_clock::now(); Tcpy += std::chrono::duration(end - start); if (print_time) { // std::cout << "(GPU) Copy Time: " << std::chrono::duration_cast(Tcpy).count() << " mus" << std::endl; std::cout << "(GPU) Computation time: " << std::chrono::duration_cast(Tkernel).count() << " mus" << std::endl; } } // Run the full pipeline with separate kernels std::array run_pipeline_separate( const py::array_t in_image, py::array_t out_classified_image, bool print_time = false, bool print_stage_times = false) { // Determine dimensions for intermediate arrays int dim = in_image.ndim(); int Nlines = 0; if (dim == 3) { Nlines = in_image.shape(0); Nx = in_image.shape(1); Nc = in_image.shape(2); } else { Nlines = 1; Nx = in_image.shape(0); Nc = in_image.shape(1); } int total_pixels = Nx * Nlines; int Nc_eff = Wl_max - Wl_min; // Create separate properly sized buffers for each stage auto start_allocation = std::chrono::steady_clock::now(); if (Nlines > this->Ny && !unified_memory) { input_buf = utils::make_buffer(context, CL_MEM_READ_ONLY , in_image.size()); output_buf = utils::make_buffer(context, CL_MEM_WRITE_ONLY, out_classified_image.size()); } else if (unified_memory) { input_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, in_image.size(), const_cast(in_image.data())); output_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,out_classified_image.size(), const_cast(out_classified_image.data())); } if (intermediate_buf.getInfo() < total_pixels * Nc_eff * sizeof(float) || intermediate_buf == cl::Buffer() ) { this->Ny = Nlines; intermediate_buf = utils::make_buffer(context, CL_MEM_READ_WRITE, total_pixels * Nc_eff); pca_output_buf = utils::make_buffer(context, CL_MEM_READ_WRITE, total_pixels * NPCA); } if (print_time) { std::cout << "Buffer Allocation Time: " << std::chrono::duration_cast(std::chrono::steady_clock::now() - start_allocation).count() << " μs" << std::endl; } auto start_total = std::chrono::steady_clock::now(); if (!unified_memory) { utils::enqueueWrites(queue, std::make_pair(input_buf, const_cast(in_image.data()))); queue.finish(); } auto end_copy = std::chrono::steady_clock::now(); auto copy_time = std::chrono::duration_cast(end_copy - start_total); std::array stage_times_microsecs = {0, 0, 0, 0}; auto start_kernel = std::chrono::steady_clock::now(); utils::enqueueKernel( queue, white_balance_kernel, cl::NDRange(WGS * total_pixels), cl::NDRange(WGS), input_buf, whitebalance_buf, intermediate_buf, Nx, Nc, Wl_min, Wl_max, whitebalance_toggle ); queue.finish(); auto T1 = std::chrono::steady_clock::now(); stage_times_microsecs[0] = std::chrono::duration_cast(T1 - start_kernel).count(); if (savgol_toggle){ utils::enqueueKernel( queue, savgol_filter_kernel, cl::NDRange(WGS* total_pixels), cl::NDRange(WGS), intermediate_buf, SG_buf );} queue.finish(); auto T2 = std::chrono::steady_clock::now(); stage_times_microsecs[1] = std::chrono::duration_cast(T2 - T1).count(); /* if (absorption_toggle || normalization_toggle) { utils::enqueueKernel( queue, absorbance_norm_kernel, cl::NDRange(WGS* total_pixels), cl::NDRange(WGS), intermediate_buf, absorption_toggle, normalization_toggle );} */ //queue.finish(); //auto T3 = std::chrono::steady_clock::now(); //stage_times_microsecs[2] = std::chrono::duration_cast(T3 - T2).count(); utils::enqueueKernel( queue, pca_kernel, cl::NDRange(WGS* total_pixels), cl::NDRange(WGS), intermediate_buf, PCA_buf, PCAmean_buf, pca_output_buf); queue.finish(); auto T3 = std::chrono::steady_clock::now(); stage_times_microsecs[2] = std::chrono::duration_cast(T3 - T2).count(); if (kmeans_svm_switch) { utils::enqueueKernel( queue, kmeans_kernel, cl::NDRange(NPCA * total_pixels), cl::NDRange(NPCA), pca_output_buf, Kmeans_buf, output_buf); } else { utils::enqueueKernel( queue, svm_kernel, cl::NDRange(NPCA * total_pixels), cl::NDRange(NPCA), pca_output_buf, SVM_buf, SVM_intercept_buf, output_buf); } queue.finish(); auto T4 = std::chrono::steady_clock::now(); stage_times_microsecs[3] = std::chrono::duration_cast(T4 - T3).count(); auto end_kernel = std::chrono::steady_clock::now(); auto kernel_time = std::chrono::duration_cast(end_kernel - start_kernel); if (!unified_memory) { utils::enqueueReads(queue, std::make_pair(output_buf, const_cast(out_classified_image.data()))); queue.finish(); } auto end_read = std::chrono::steady_clock::now(); auto read_time = std::chrono::duration_cast(end_read - end_kernel); if (print_time) { std::cout << "(GPU) Copy Time: " << copy_time.count() << " μs" << std::endl; std::cout << "(GPU) Kernel Time: " << kernel_time.count() << " μs" << std::endl; std::cout << "(GPU) Read Time: " << read_time.count() << " μs" << std::endl; std::cout << "White Balance Time: " << stage_times_microsecs[0] / 1000.0f << " µs" << std::endl; std::cout << "Savitzky-Golay Time: " << stage_times_microsecs[1] / 1000.0f << " µs" << std::endl; std::cout << "PCA Time: " << stage_times_microsecs[2] / 1000.0f << " µs" << std::endl; std::cout << "Classification Time: " << stage_times_microsecs[3] / 1000.0f << " µs" << std::endl; } auto total_time = std::chrono::duration_cast(end_read - start_total); if (print_time) { std::cout << "Total Pipeline (Separate Kernels): " << total_time.count() << " μs" << std::endl; } return stage_times_microsecs; } void runCPU(const py::array_t in_image, py::array_t out_classified_image, bool print_time = false, bool verbose = false){ // if print time and verbose are both true, print verbose time bool print_verbose_time = false; if (print_time && verbose) print_verbose_time = true; //Deduce the dimensions of the input image int dim = in_image.ndim(); int Nlines = 0; if (dim == 3) { Nlines = in_image.shape(0); //Number of lines Nx = in_image.shape(1); //Number of pixels per line Nc = in_image.shape(2); //Number of channels } else { Nlines = 1; Nx = in_image.shape(0); //Number of pixels per line Nc = in_image.shape(1); //Number of channels } unsigned char* labels = const_cast(out_classified_image.data()); auto start = std::chrono::high_resolution_clock::now(); run_serial(in_image.data(), PCA, PCAmean, SVM, SVM_intercept, WB, savgol_coef, labels, Nx, Nlines, Nc, NPCA, Nk, Wl_min, Wl_max, savgol_toggle, whitebalance_toggle, Nwindow, print_verbose_time); auto end = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end - start).count(); if (print_time) std::cout << "(CPU) Computation time: " << duration << " mus" << std::endl; }; void runCPU_minimal(const py::array_t in_image, py::array_t out_classified_image, bool print_time = false, bool verbose = false){ // if print time and verbose are both true, print verbose time bool print_verbose_time = false; if (print_time && verbose) print_verbose_time = true; //Deduce the dimensions of the input image int dim = in_image.ndim(); int Nlines = 0; if (dim == 3) { Nlines = in_image.shape(0); //Number of lines Nx = in_image.shape(1); //Number of pixels per line Nc = in_image.shape(2); //Number of channels } else { Nlines = 1; Nx = in_image.shape(0); //Number of pixels per line Nc = in_image.shape(1); //Number of channels } unsigned char* labels = const_cast(out_classified_image.data()); auto start = std::chrono::high_resolution_clock::now(); run_serial_minimal(in_image.data(), PCA, PCAmean, SVM, SVM_intercept, labels, Nx, Nlines, Nc, NPCA, Nk, Wl_min, Wl_max, print_verbose_time); auto end = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end - start).count(); if (print_time) std::cout << "(CPU minimal) Computation time: " << duration << " mus" << std::endl; }; }; PYBIND11_MODULE(plastic_sorting, m) { py::class_(m, "Kernel") .def(py::init<>()) .def("run", &Kernel::run, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("verbose") = false) .def("runCPU", &Kernel::runCPU, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("verbose") = false) .def("runCPU_minimal", &Kernel::runCPU_minimal, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("verbose") = false) .def("run_pipeline_separate", &Kernel::run_pipeline_separate, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("print_stage_times") = false) .def("load_model", &Kernel::load_model, py::arg("PCA") = py::array_t(), py::arg("PCAmean") = py::array_t(), py::arg("SVM") = py::array_t(), py::arg("SVM_intercept") = py::array_t(), py::arg("whitebalance") = py::array_t(), py::arg("savgol_coef") = py::array_t(), py::arg("Wl_min") = 10, py::arg("Wl_max") = 150, py::arg("savgol_toggle") = 1); }