#include "serialcalc.hpp"
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
namespace py = pybind11;
#define CL_TARGET_OPENCL_VERSION 210
#define CL_HPP_TARGET_OPENCL_VERSION 210
#include <cinttypes>
#include <utils.hpp>
#include <CL/cl.h>
#include <chrono>
#include <iostream>
struct Kernel {
//OpenCL objects
cl::Context context;
cl::Program program;
cl::Kernel imageproc;
cl::CommandQueue queue;
//Input/Output buffers
cl::Buffer input_buf;
cl::Buffer output_buf;
//Model buffers
cl::Buffer whitebalance_buf;
cl::Buffer PCA_buf;
cl::Buffer PCAmean_buf;
cl::Buffer SVM_buf;
cl::Buffer SVM_intercept_buf;
cl::Buffer SG_buf;
// Model buffer for serial (CPU based processing)
const float* PCA;
const float* PCAmean;
const float* SVM;
const float* SVM_intercept;
const float* WB;
const float* savgol_coef;
bool unified_memory = false;
int Nx, Ny, Nc, NPCA, Nk, Wl_min, Wl_max, savgol_toggle, whitebalance_toggle, Nwindow;
size_t WGS;
Kernel( cl_device_type type = CL_DEVICE_TYPE_GPU,
const py::array_t<float, py::array::c_style | py::array::forcecast> PCA = py::array_t<float>(),
const py::array_t<float, py::array::c_style | py::array::forcecast> PCAmean = py::array_t<float>(),
const py::array_t<float, py::array::f_style | py::array::forcecast> SVM = py::array_t<float>(),
const py::array_t<float, py::array::c_style | py::array::forcecast> SVM_intercept = py::array_t<float>(),
const py::array_t<float, py::array::c_style | py::array::forcecast> whitebalance = py::array_t<float>(),
const py::array_t<float, py::array::f_style | py::array::forcecast> savgol_coef = py::array_t<float>(),
int Wl_min = 10,
int Wl_max = 150,
int savgol_toggle = 1) {
//Default device priority is GPU > CPU > Accelerator
cl::Device device;
std::vector<cl::Device> devices = utils::get_devices(type); // get the GPU
if (devices.empty()) devices = utils::get_devices(CL_DEVICE_TYPE_CPU); // if no GPU, get the CPU
if (!devices.empty()) device = devices.at(0);
else throw std::runtime_error("No OpenCL devices found");
std::cout << "Choosing " << device.getInfo<CL_DEVICE_NAME>() << " as the OpenCL device." << std::endl;
context = cl::Context(device);
unified_memory = utils::is_unified_memory(device);
queue = cl::CommandQueue(context, device);
this->load_model(PCA, PCAmean, whitebalance, SVM, SVM_intercept, savgol_coef, Wl_min, Wl_max, savgol_toggle);
}
void load_model(const py::array_t<float, py::array::c_style | py::array::forcecast> PCA,
const py::array_t<float, py::array::c_style | py::array::forcecast> PCAmean,
const py::array_t<float, py::array::f_style | py::array::forcecast> SVM,
const py::array_t<float, py::array::c_style | py::array::forcecast> SVM_intercept,
const py::array_t<float, py::array::c_style | py::array::forcecast> whitebalance,
const py::array_t<float, py::array::c_style | py::array::forcecast> savgol_coef,
int Wl_min = 10,
int Wl_max = 150,
int savgol_toggle = 1)
{
if(PCA.size() == 0 || PCAmean.size() == 0) return; //If no model is provided, return
//Expect PCA to be Row Major that is (rows: Ncomponents , columns: Nwavelengths)
//Expect PCAmean to be a vector of size (Wl_max - Wl_min)
//Expect SVM to be Column Major that is (rows: Nk , columns: Ncomponents)
//Expect SVM_intercept to be a vector of size Nk
//Ncomponents = Dimension that data is reduced to by PCA
//Nk = Number of classes in the model that data should be classified into
//NPCA = Ncomponents
Nwindow = savgol_coef.size();
this->NPCA = PCA.size() > 0 ? PCA.shape(0) : 1;
this->Nk = SVM.size() > 0 ? SVM.shape(0) : 1;
this->Wl_min = Wl_min;
this->Wl_max = Wl_max;
this->savgol_toggle = savgol_toggle;
Nx = Ny = Nc = 0;
if( Wl_max - Wl_min > PCA.shape(1)) throw std::runtime_error("Wavelength range: " + std::to_string(Wl_max - Wl_min) + " does not match the model: " + std::to_string(PCA.shape(1)));
if((SVM.size() > 0) && (SVM.shape(1) != PCA.shape(0))) throw std::runtime_error("SVM dimensions: " + std::to_string(SVM.shape(1)) + " do not match the model: " + std::to_string(PCA.shape(0)));
if((SVM.size() > 0) && (SVM_intercept.size() != SVM.shape(0))) throw std::runtime_error("SVM_intercept dimensions: " + std::to_string(SVM_intercept.size()) + " do not match the SVM: " + std::to_string(SVM.shape(0)));
if(SVM.size() > 0 && SVM_intercept.size() == 0) throw std::runtime_error("SVM intercept not provided");
if(SVM.size() > 0 && Nk != SVM.shape(0)) throw std::runtime_error("Number of classes in SVM: " + std::to_string(SVM.shape(0)) + " does not match the assigned number of classes: " + std::to_string(Nk));
if(PCA.size() > 0 && PCA.shape(0) != NPCA) throw std::runtime_error("Number of PCA components: " + std::to_string(PCA.size()) + " does not match the assigned number of components: " + std::to_string(NPCA));
whitebalance_toggle = whitebalance.size() > 0 ? 1 : 0; //If whitebalance is not provided, do not use whitebalance
// store the pointers to the model data
this->PCA = PCA.size() > 0 ? PCA.data() : nullptr;
this->PCAmean = PCAmean.size() > 0 ? PCAmean.data() : nullptr;
this->SVM = SVM.size() > 0 ? SVM.data() : nullptr;
this->SVM_intercept = SVM_intercept.size() > 0 ? SVM_intercept.data() : nullptr;
this->WB = whitebalance.size() > 0 ? whitebalance.data() : nullptr;
this->savgol_coef = savgol_coef.size() > 0 ? savgol_coef.data() : nullptr;
//Deduce the dimensions of the model
whitebalance_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, whitebalance.size(),const_cast<float*>(whitebalance.data()));
PCA_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, PCA.size(), const_cast<float*>(PCA.data()));
PCAmean_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, PCAmean.size(), const_cast<float*>(PCAmean.data()));
SVM_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, SVM.size(), const_cast<float*>(SVM.data()));
SVM_intercept_buf= utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, SVM_intercept.size(),const_cast<float*>(SVM_intercept.data()));
SG_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, savgol_coef.size(), const_cast<float*>(savgol_coef.data()));
auto device = queue.getInfo<CL_QUEUE_DEVICE>();
auto is_amd = device.getInfo<CL_DEVICE_VENDOR>().find("Advanced Micro Devices") != std::string::npos;
auto is_nvidia = device.getInfo<CL_DEVICE_VENDOR>().find("NVIDIA Corporation") != std::string::npos;
auto pref_wg_multiple = is_amd ? device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>() : is_nvidia ? device.getInfo<CL_DEVICE_WARP_SIZE_NV>() : 1;
auto max_wg_size = queue.getInfo<CL_QUEUE_DEVICE>().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
auto NC_MAX = Wl_max - Wl_min;
WGS = utils::next_multiple(NC_MAX, pref_wg_multiple);
if (WGS > max_wg_size) WGS = max_wg_size;
auto ITEMS_PER_THREAD = (NC_MAX + WGS - 1) / WGS;
program = utils::make_program("src/source.cl", context,
"-cl-std=CL2.0 "
"-cl-mad-enable "
"-cl-no-signed-zeros "
"-cl-fast-relaxed-math "
"-cl-unsafe-math-optimizations "
"-cl-finite-math-only "
"-cl-denorms-are-zero "
"-cl-single-precision-constant "
"-D WGS=" + std::to_string(WGS) + " "
"-D ITEMS_PER_THREAD=" + std::to_string(ITEMS_PER_THREAD) + " "
"-D NC_EFF=" + std::to_string(NC_MAX) + " "
"-D N_WINDOW=" + std::to_string(Nwindow) + " "
"-D N_PCA=" + std::to_string(NPCA) + " "
"-D N_K=" + std::to_string(Nk));
imageproc = cl::Kernel(program, "process_image");
}
void run(const py::array_t<uint8_t, py::array::c_style | py::array::forcecast> in_image,
py::array_t<uint8_t, py::array::c_style | py::array::forcecast> out_classified_image,
bool print_time = false,
bool verbose = false){
auto Tcpy = std::chrono::duration<double>(0);
auto Tkernel = std::chrono::duration<double>(0);
//Deduce the dimensions of the input image
int dim = in_image.ndim();
int Nlines = 0;
if (dim == 3) {
Nlines = in_image.shape(0); //Number of lines
Nx = in_image.shape(1); //Number of pixels per line
Nc = in_image.shape(2); //Number of channels
}
else {
Nlines = 1;
Nx = in_image.shape(0); //Number of pixels per line
Nc = in_image.shape(1); //Number of channels
}
auto start = std::chrono::high_resolution_clock::now();
if (Nlines > this->Ny && !unified_memory) {
this->Ny = Nlines;
input_buf = utils::make_buffer<uint8_t>(context, CL_MEM_READ_ONLY , in_image.size());
output_buf = utils::make_buffer<uint8_t>(context, CL_MEM_WRITE_ONLY, out_classified_image.size());
} else if (unified_memory) {
this->Ny = Nlines;
input_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, in_image.size(), const_cast<uint8_t*>(in_image.data()));
output_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,out_classified_image.size(), const_cast<uint8_t*>(out_classified_image.data()));
}
auto end = std::chrono::high_resolution_clock::now();
Tcpy += std::chrono::duration<double>(end - start);
// THIS FAILS WHEN IN IMAGE IS ONLY A SINGLE LINE... maybe be fix at some point or just let caller be responsible for inputs being correct
// if (Nx != out_classified_image.shape(1) ||
// Ny != out_classified_image.shape(0)){
// throw std::runtime_error("Input image dimensions do not match the model, Input Image Shape:" + std::to_string(Nx) + "x" + std::to_string(Ny) + " vs Output Image Shape: " + std::to_string(out_classified_image.shape(1)) + "x" + std::to_string(out_classified_image.shape(0)));
// }
// if (Wl_max - Wl_min > in_image.shape(2) ||
// Wl_max > Nc){
// throw std::runtime_error("Output image dimensions do not match the model, Output Image Shape:" + std::to_string(out_classified_image.shape(2)) + " vs Model Wavelength Range: " + std::to_string(Wl_max - Wl_min));
// }
// Set kernel arguments and run the kernel
utils::setArgs( imageproc,
input_buf,
PCA_buf,
PCAmean_buf,
SVM_buf,
SVM_intercept_buf,
whitebalance_buf,
SG_buf,
output_buf,
Nx,
Ny,
Nc,
NPCA,
Nk,
Wl_min,
Wl_max,
savgol_toggle,
whitebalance_toggle,
Nwindow
);
start = std::chrono::high_resolution_clock::now();
if (!unified_memory) utils::enqueueWrites(queue, std::make_pair(input_buf, const_cast<uint8_t*>(in_image.data())));
end = std::chrono::high_resolution_clock::now();
Tcpy += std::chrono::duration<double>(end - start);
start = std::chrono::high_resolution_clock::now();
CL_CALL(queue.enqueueNDRangeKernel, imageproc, cl::NullRange, cl::NDRange(WGS*Nx*Ny), cl::NDRange(WGS));
queue.finish();
end = std::chrono::high_resolution_clock::now();
Tkernel += std::chrono::duration<double>(end - start);
start = std::chrono::high_resolution_clock::now();
if (!unified_memory) utils::enqueueReads(queue, std::make_pair(output_buf, const_cast<uint8_t*>(out_classified_image.data())));
end = std::chrono::high_resolution_clock::now();
Tcpy += std::chrono::duration<double>(end - start);
if (print_time) {
// std::cout << "(GPU) Copy Time: " << std::chrono::duration_cast<std::chrono::microseconds>(Tcpy).count() << " mus" << std::endl;
std::cout << "(GPU) Computation time: " << std::chrono::duration_cast<std::chrono::microseconds>(Tkernel).count() << " mus" << std::endl;
}
}
// Run the full pipeline with separate kernels
std::array<float, 4> run_pipeline_separate(
const py::array_t<uint8_t, py::array::c_style> in_image,
py::array_t<uint8_t, py::array::c_style> out_classified_image,
bool print_time = false,
bool print_stage_times = false) {
// Determine dimensions for intermediate arrays
int dim = in_image.ndim();
int Nlines = 0;
if (dim == 3) {
Nlines = in_image.shape(0);
Nx = in_image.shape(1);
Nc = in_image.shape(2);
} else {
Nlines = 1;
Nx = in_image.shape(0);
Nc = in_image.shape(1);
}
int total_pixels = Nx * Nlines;
int Nc_eff = Wl_max - Wl_min;
// Create separate properly sized buffers for each stage
auto start_allocation = std::chrono::steady_clock::now();
if (Nlines > this->Ny && !unified_memory) {
input_buf = utils::make_buffer<uint8_t>(context, CL_MEM_READ_ONLY , in_image.size());
output_buf = utils::make_buffer<uint8_t>(context, CL_MEM_WRITE_ONLY, out_classified_image.size());
} else if (unified_memory) {
input_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, in_image.size(), const_cast<uint8_t*>(in_image.data()));
output_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,out_classified_image.size(), const_cast<uint8_t*>(out_classified_image.data()));
}
if (intermediate_buf.getInfo<CL_MEM_SIZE>() < total_pixels * Nc_eff * sizeof(float)
|| intermediate_buf == cl::Buffer() ) {
this->Ny = Nlines;
intermediate_buf = utils::make_buffer<float>(context, CL_MEM_READ_WRITE, total_pixels * Nc_eff);
pca_output_buf = utils::make_buffer<float>(context, CL_MEM_READ_WRITE, total_pixels * NPCA);
}
if (print_time) {
std::cout << "Buffer Allocation Time: " <<
std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start_allocation).count() <<
" μs" << std::endl;
}
auto start_total = std::chrono::steady_clock::now();
if (!unified_memory) {
utils::enqueueWrites(queue, std::make_pair(input_buf, const_cast<uint8_t*>(in_image.data())));
queue.finish();
}
auto end_copy = std::chrono::steady_clock::now();
auto copy_time = std::chrono::duration_cast<std::chrono::microseconds>(end_copy - start_total);
std::array<float, 4> stage_times_microsecs = {0, 0, 0, 0};
auto start_kernel = std::chrono::steady_clock::now();
utils::enqueueKernel(
queue,
white_balance_kernel,
cl::NDRange(WGS * total_pixels),
cl::NDRange(WGS),
input_buf,
whitebalance_buf,
intermediate_buf,
Nx,
Nc,
Wl_min,
Wl_max,
whitebalance_toggle
);
queue.finish();
auto T1 = std::chrono::steady_clock::now();
stage_times_microsecs[0] = std::chrono::duration_cast<std::chrono::nanoseconds>(T1 - start_kernel).count();
if (savgol_toggle){
utils::enqueueKernel(
queue,
savgol_filter_kernel,
cl::NDRange(WGS* total_pixels),
cl::NDRange(WGS),
intermediate_buf,
SG_buf
);}
queue.finish();
auto T2 = std::chrono::steady_clock::now();
stage_times_microsecs[1] = std::chrono::duration_cast<std::chrono::nanoseconds>(T2 - T1).count();
/*
if (absorption_toggle || normalization_toggle) {
utils::enqueueKernel(
queue,
absorbance_norm_kernel,
cl::NDRange(WGS* total_pixels),
cl::NDRange(WGS),
intermediate_buf,
absorption_toggle,
normalization_toggle
);} */
//queue.finish();
//auto T3 = std::chrono::steady_clock::now();
//stage_times_microsecs[2] = std::chrono::duration_cast<std::chrono::nanoseconds>(T3 - T2).count();
utils::enqueueKernel(
queue,
pca_kernel,
cl::NDRange(WGS* total_pixels),
cl::NDRange(WGS),
intermediate_buf,
PCA_buf,
PCAmean_buf,
pca_output_buf);
queue.finish();
auto T3 = std::chrono::steady_clock::now();
stage_times_microsecs[2] = std::chrono::duration_cast<std::chrono::nanoseconds>(T3 - T2).count();
if (kmeans_svm_switch) {
utils::enqueueKernel(
queue,
kmeans_kernel,
cl::NDRange(NPCA * total_pixels),
cl::NDRange(NPCA),
pca_output_buf,
Kmeans_buf,
output_buf);
} else {
utils::enqueueKernel(
queue,
svm_kernel,
cl::NDRange(NPCA * total_pixels),
cl::NDRange(NPCA),
pca_output_buf,
SVM_buf,
SVM_intercept_buf,
output_buf);
}
queue.finish();
auto T4 = std::chrono::steady_clock::now();
stage_times_microsecs[3] = std::chrono::duration_cast<std::chrono::nanoseconds>(T4 - T3).count();
auto end_kernel = std::chrono::steady_clock::now();
auto kernel_time = std::chrono::duration_cast<std::chrono::microseconds>(end_kernel - start_kernel);
if (!unified_memory) {
utils::enqueueReads(queue, std::make_pair(output_buf, const_cast<uint8_t*>(out_classified_image.data())));
queue.finish();
}
auto end_read = std::chrono::steady_clock::now();
auto read_time = std::chrono::duration_cast<std::chrono::microseconds>(end_read - end_kernel);
if (print_time) {
std::cout << "(GPU) Copy Time: " << copy_time.count() << " μs" << std::endl;
std::cout << "(GPU) Kernel Time: " << kernel_time.count() << " μs" << std::endl;
std::cout << "(GPU) Read Time: " << read_time.count() << " μs" << std::endl;
std::cout << "White Balance Time: " << stage_times_microsecs[0] / 1000.0f << " µs" << std::endl;
std::cout << "Savitzky-Golay Time: " << stage_times_microsecs[1] / 1000.0f << " µs" << std::endl;
std::cout << "PCA Time: " << stage_times_microsecs[2] / 1000.0f << " µs" << std::endl;
std::cout << "Classification Time: " << stage_times_microsecs[3] / 1000.0f << " µs" << std::endl;
}
auto total_time = std::chrono::duration_cast<std::chrono::microseconds>(end_read - start_total);
if (print_time) {
std::cout << "Total Pipeline (Separate Kernels): " << total_time.count() << " μs" << std::endl;
}
return stage_times_microsecs;
}
void runCPU(const py::array_t<uint8_t, py::array::c_style | py::array::forcecast> in_image,
py::array_t<uint8_t, py::array::c_style | py::array::forcecast> out_classified_image,
bool print_time = false,
bool verbose = false){
// if print time and verbose are both true, print verbose time
bool print_verbose_time = false;
if (print_time && verbose) print_verbose_time = true;
//Deduce the dimensions of the input image
int dim = in_image.ndim();
int Nlines = 0;
if (dim == 3) {
Nlines = in_image.shape(0); //Number of lines
Nx = in_image.shape(1); //Number of pixels per line
Nc = in_image.shape(2); //Number of channels
}
else {
Nlines = 1;
Nx = in_image.shape(0); //Number of pixels per line
Nc = in_image.shape(1); //Number of channels
}
unsigned char* labels = const_cast<unsigned char *>(out_classified_image.data());
auto start = std::chrono::high_resolution_clock::now();
run_serial(in_image.data(),
PCA,
PCAmean,
SVM,
SVM_intercept,
WB,
savgol_coef,
labels,
Nx,
Nlines,
Nc,
NPCA,
Nk,
Wl_min,
Wl_max,
savgol_toggle,
whitebalance_toggle,
Nwindow,
print_verbose_time);
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
if (print_time) std::cout << "(CPU) Computation time: " << duration << " mus" << std::endl;
};
void runCPU_minimal(const py::array_t<uint8_t, py::array::c_style | py::array::forcecast> in_image,
py::array_t<uint8_t, py::array::c_style | py::array::forcecast> out_classified_image,
bool print_time = false,
bool verbose = false){
// if print time and verbose are both true, print verbose time
bool print_verbose_time = false;
if (print_time && verbose) print_verbose_time = true;
//Deduce the dimensions of the input image
int dim = in_image.ndim();
int Nlines = 0;
if (dim == 3) {
Nlines = in_image.shape(0); //Number of lines
Nx = in_image.shape(1); //Number of pixels per line
Nc = in_image.shape(2); //Number of channels
}
else {
Nlines = 1;
Nx = in_image.shape(0); //Number of pixels per line
Nc = in_image.shape(1); //Number of channels
}
unsigned char* labels = const_cast<unsigned char *>(out_classified_image.data());
auto start = std::chrono::high_resolution_clock::now();
run_serial_minimal(in_image.data(),
PCA,
PCAmean,
SVM,
SVM_intercept,
labels,
Nx,
Nlines,
Nc,
NPCA,
Nk,
Wl_min,
Wl_max,
print_verbose_time);
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
if (print_time) std::cout << "(CPU minimal) Computation time: " << duration << " mus" << std::endl;
};
};
PYBIND11_MODULE(plastic_sorting, m) {
py::class_<Kernel>(m, "Kernel")
.def(py::init<>())
.def("run", &Kernel::run, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("verbose") = false)
.def("runCPU", &Kernel::runCPU, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("verbose") = false)
.def("runCPU_minimal", &Kernel::runCPU_minimal, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("verbose") = false)
.def("run_pipeline_separate", &Kernel::run_pipeline_separate, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("print_stage_times") = false)
.def("load_model", &Kernel::load_model, py::arg("PCA") = py::array_t<float>(),
py::arg("PCAmean") = py::array_t<float>(),
py::arg("SVM") = py::array_t<float>(),
py::arg("SVM_intercept") = py::array_t<float>(),
py::arg("whitebalance") = py::array_t<float>(),
py::arg("savgol_coef") = py::array_t<float>(),
py::arg("Wl_min") = 10,
py::arg("Wl_max") = 150,
py::arg("savgol_toggle") = 1);
}