pybind/pybind-wrapper.cc · HSI-PROCESSING

#include "serialcalc.hpp"
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
namespace py = pybind11;


#define CL_TARGET_OPENCL_VERSION 210
#define CL_HPP_TARGET_OPENCL_VERSION 210
#include <cinttypes>
#include <utils.hpp>
#include <CL/cl.h>
#include <chrono>
#include <iostream>

struct Kernel {
    //OpenCL objects
    cl::Context context;
    cl::Program program;
    cl::Kernel imageproc;
    cl::CommandQueue queue;

    //Input/Output buffers
    cl::Buffer input_buf;
    cl::Buffer output_buf;
    
    //Model buffers
    cl::Buffer whitebalance_buf;
    cl::Buffer PCA_buf;
    cl::Buffer PCAmean_buf;
    cl::Buffer SVM_buf;
    cl::Buffer SVM_intercept_buf;
    cl::Buffer SG_buf;

    // Model buffer for serial (CPU based processing)
    const float* PCA;
    const float* PCAmean;
    const float* SVM;
    const float* SVM_intercept;
    const float* WB;
    const float* savgol_coef;


    bool unified_memory = false;

    int Nx, Ny, Nc, NPCA, Nk, Wl_min, Wl_max, savgol_toggle, whitebalance_toggle, Nwindow;

    size_t WGS;

    Kernel(         cl_device_type type = CL_DEVICE_TYPE_GPU,
                    const py::array_t<float, py::array::c_style | py::array::forcecast> PCA = py::array_t<float>(),
                    const py::array_t<float, py::array::c_style | py::array::forcecast> PCAmean = py::array_t<float>(),
                    const py::array_t<float, py::array::f_style | py::array::forcecast> SVM = py::array_t<float>(),
                    const py::array_t<float, py::array::c_style | py::array::forcecast> SVM_intercept = py::array_t<float>(),
                    const py::array_t<float, py::array::c_style | py::array::forcecast> whitebalance = py::array_t<float>(),
                    const py::array_t<float, py::array::f_style | py::array::forcecast> savgol_coef = py::array_t<float>(),

                    int Wl_min = 10,
                    int Wl_max = 150,
                    int savgol_toggle = 1) {
        
        //Default device priority is GPU > CPU > Accelerator
        cl::Device device;
        std::vector<cl::Device> devices = utils::get_devices(type); // get the GPU
        if (devices.empty()) devices = utils::get_devices(CL_DEVICE_TYPE_CPU); // if no GPU, get the CPU

        if (!devices.empty()) device = devices.at(0);
        else throw std::runtime_error("No OpenCL devices found");
        std::cout << "Choosing " << device.getInfo<CL_DEVICE_NAME>() << " as the OpenCL device." << std::endl;
        context = cl::Context(device);
        unified_memory = utils::is_unified_memory(device);
        queue = cl::CommandQueue(context, device);
        
        this->load_model(PCA, PCAmean, whitebalance, SVM, SVM_intercept, savgol_coef, Wl_min, Wl_max, savgol_toggle);
    }

    void load_model(const py::array_t<float, py::array::c_style | py::array::forcecast> PCA,
                    const py::array_t<float, py::array::c_style | py::array::forcecast> PCAmean,
                    const py::array_t<float, py::array::f_style | py::array::forcecast> SVM,
                    const py::array_t<float, py::array::c_style | py::array::forcecast> SVM_intercept,
                    const py::array_t<float, py::array::c_style | py::array::forcecast> whitebalance,
                    const py::array_t<float, py::array::c_style | py::array::forcecast> savgol_coef,
                    int Wl_min = 10,
                    int Wl_max = 150,
                    int savgol_toggle = 1)
    {   
        if(PCA.size() == 0 || PCAmean.size() == 0)  return; //If no model is provided, return
        //Expect PCA to be Row Major that is (rows: Ncomponents , columns: Nwavelengths)
        //Expect PCAmean to be a vector of size (Wl_max - Wl_min)
        //Expect SVM to be Column Major that is (rows: Nk , columns: Ncomponents)
        //Expect SVM_intercept to be a vector of size Nk
        
        //Ncomponents = Dimension that data is reduced to by PCA
        //Nk = Number of classes in the model that data should be classified into
        //NPCA = Ncomponents 

        Nwindow = savgol_coef.size();
        
        this->NPCA = PCA.size() > 0 ? PCA.shape(0) : 1;
        this->Nk   = SVM.size() > 0 ? SVM.shape(0) : 1;
        this->Wl_min = Wl_min;
        this->Wl_max = Wl_max;
        this->savgol_toggle = savgol_toggle;

        Nx = Ny = Nc = 0;

        
        if( Wl_max - Wl_min > PCA.shape(1)) throw std::runtime_error("Wavelength range: " + std::to_string(Wl_max - Wl_min) + " does not match the model: " + std::to_string(PCA.shape(1)));
        if((SVM.size() > 0) && (SVM.shape(1) != PCA.shape(0))) throw std::runtime_error("SVM dimensions: " + std::to_string(SVM.shape(1)) + " do not match the model: " + std::to_string(PCA.shape(0)));
        if((SVM.size() > 0) && (SVM_intercept.size() != SVM.shape(0))) throw std::runtime_error("SVM_intercept dimensions: " + std::to_string(SVM_intercept.size()) + " do not match the SVM: " + std::to_string(SVM.shape(0)));
        if(SVM.size() > 0 && SVM_intercept.size() == 0) throw std::runtime_error("SVM intercept not provided");
        if(SVM.size() > 0 && Nk != SVM.shape(0)) throw std::runtime_error("Number of classes in SVM: " + std::to_string(SVM.shape(0)) + " does not match the assigned number of classes: " + std::to_string(Nk));
        if(PCA.size() > 0 && PCA.shape(0) != NPCA) throw std::runtime_error("Number of PCA components: " + std::to_string(PCA.size()) + " does not match the assigned number of components: " + std::to_string(NPCA));            


        whitebalance_toggle = whitebalance.size() > 0 ? 1 : 0; //If whitebalance is not provided, do not use whitebalance
        
        // store the pointers to the model data
        this->PCA = PCA.size() > 0 ? PCA.data() : nullptr;
        this->PCAmean = PCAmean.size() > 0 ? PCAmean.data() : nullptr;
        this->SVM = SVM.size() > 0 ? SVM.data() : nullptr;
        this->SVM_intercept = SVM_intercept.size() > 0 ? SVM_intercept.data() : nullptr;
        this->WB = whitebalance.size() > 0 ? whitebalance.data() : nullptr;
        this->savgol_coef = savgol_coef.size() > 0 ? savgol_coef.data() : nullptr;


        //Deduce the dimensions of the model
        whitebalance_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, whitebalance.size(),const_cast<float*>(whitebalance.data()));
        PCA_buf          = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, PCA.size(),         const_cast<float*>(PCA.data()));
        PCAmean_buf      = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, PCAmean.size(),     const_cast<float*>(PCAmean.data()));
        SVM_buf          = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, SVM.size(),         const_cast<float*>(SVM.data()));
        SVM_intercept_buf= utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, SVM_intercept.size(),const_cast<float*>(SVM_intercept.data()));
        SG_buf          = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, savgol_coef.size(), const_cast<float*>(savgol_coef.data()));

        auto device = queue.getInfo<CL_QUEUE_DEVICE>();
        auto is_amd = device.getInfo<CL_DEVICE_VENDOR>().find("Advanced Micro Devices") != std::string::npos;
        auto is_nvidia = device.getInfo<CL_DEVICE_VENDOR>().find("NVIDIA Corporation") != std::string::npos;
        auto pref_wg_multiple   = is_amd ? device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>() : is_nvidia ? device.getInfo<CL_DEVICE_WARP_SIZE_NV>() : 1;
        auto max_wg_size        = queue.getInfo<CL_QUEUE_DEVICE>().getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
        auto NC_MAX = Wl_max - Wl_min;

        WGS = utils::next_multiple(NC_MAX, pref_wg_multiple);
        if (WGS > max_wg_size) WGS = max_wg_size;
        auto ITEMS_PER_THREAD = (NC_MAX + WGS - 1) / WGS;



        program = utils::make_program("src/source.cl", context, 
                                                        "-cl-std=CL2.0 "
                                                        "-cl-mad-enable "
                                                        "-cl-no-signed-zeros "
                                                        "-cl-fast-relaxed-math "
                                                        "-cl-unsafe-math-optimizations "
                                                        "-cl-finite-math-only "
                                                        "-cl-denorms-are-zero "
                                                        "-cl-single-precision-constant "
                                                        "-D WGS=" + std::to_string(WGS) + " "
                                                        "-D ITEMS_PER_THREAD=" + std::to_string(ITEMS_PER_THREAD) + " "
                                                        "-D NC_EFF=" + std::to_string(NC_MAX) + " "
                                                        "-D N_WINDOW=" + std::to_string(Nwindow) + " "
                                                        "-D N_PCA=" + std::to_string(NPCA) + " "
                                                        "-D N_K=" + std::to_string(Nk));
        imageproc = cl::Kernel(program, "process_image");
    }

    void run(const py::array_t<uint8_t, py::array::c_style | py::array::forcecast> in_image,
                    py::array_t<uint8_t, py::array::c_style | py::array::forcecast> out_classified_image,
                    bool print_time = false,
                    bool verbose = false){
        
        auto Tcpy = std::chrono::duration<double>(0);
        auto Tkernel = std::chrono::duration<double>(0);
                        
        //Deduce the dimensions of the input image
        int dim = in_image.ndim();
        int Nlines = 0;
        if (dim == 3) {
            Nlines = in_image.shape(0); //Number of lines
            Nx = in_image.shape(1); //Number of pixels per line
            Nc = in_image.shape(2); //Number of channels
        }
        else {
            Nlines = 1;
            Nx = in_image.shape(0); //Number of pixels per line
            Nc = in_image.shape(1); //Number of channels
        }

        auto start = std::chrono::high_resolution_clock::now();
        if (Nlines > this->Ny && !unified_memory) {
            this->Ny = Nlines;
            input_buf  = utils::make_buffer<uint8_t>(context, CL_MEM_READ_ONLY , in_image.size());
            output_buf = utils::make_buffer<uint8_t>(context, CL_MEM_WRITE_ONLY, out_classified_image.size());
        } else if (unified_memory) {
            this->Ny = Nlines;
            input_buf =  utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, in_image.size(), const_cast<uint8_t*>(in_image.data()));
            output_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,out_classified_image.size(), const_cast<uint8_t*>(out_classified_image.data()));
        }
        auto end = std::chrono::high_resolution_clock::now();
        Tcpy += std::chrono::duration<double>(end - start);

        // THIS FAILS WHEN IN IMAGE IS ONLY A SINGLE LINE... maybe be fix at some point or just let caller be responsible for inputs being correct
        // if (Nx != out_classified_image.shape(1) || 
        //     Ny != out_classified_image.shape(0)){
        //     throw std::runtime_error("Input image dimensions do not match the model, Input Image Shape:" + std::to_string(Nx) + "x" + std::to_string(Ny) + " vs Output Image Shape: " + std::to_string(out_classified_image.shape(1)) + "x" + std::to_string(out_classified_image.shape(0)));
        // }
        // if (Wl_max - Wl_min > in_image.shape(2) ||
        //     Wl_max > Nc){
        //     throw std::runtime_error("Output image dimensions do not match the model, Output Image Shape:" + std::to_string(out_classified_image.shape(2)) + " vs Model Wavelength Range: " + std::to_string(Wl_max - Wl_min));
        // }

        // Set kernel arguments and run the kernel
        utils::setArgs( imageproc,
                        input_buf, 
                        PCA_buf, 
                        PCAmean_buf,
                        SVM_buf,
                        SVM_intercept_buf, 
                        whitebalance_buf,
                        SG_buf,
                        output_buf,
                        Nx, 
                        Ny, 
                        Nc, 
                        NPCA, 
                        Nk, 
                        Wl_min, 
                        Wl_max, 
                        savgol_toggle, 
                        whitebalance_toggle,
                        Nwindow
                        );
        start = std::chrono::high_resolution_clock::now();
        if (!unified_memory) utils::enqueueWrites(queue, std::make_pair(input_buf, const_cast<uint8_t*>(in_image.data())));
        end = std::chrono::high_resolution_clock::now();
        Tcpy += std::chrono::duration<double>(end - start);
        start = std::chrono::high_resolution_clock::now();
        CL_CALL(queue.enqueueNDRangeKernel, imageproc, cl::NullRange, cl::NDRange(WGS*Nx*Ny), cl::NDRange(WGS));
        queue.finish();
        end = std::chrono::high_resolution_clock::now();
        Tkernel += std::chrono::duration<double>(end - start);
        start = std::chrono::high_resolution_clock::now();
        if (!unified_memory) utils::enqueueReads(queue, std::make_pair(output_buf, const_cast<uint8_t*>(out_classified_image.data())));
        end = std::chrono::high_resolution_clock::now();
        Tcpy += std::chrono::duration<double>(end - start);

        if (print_time) {
            // std::cout << "(GPU) Copy Time: " << std::chrono::duration_cast<std::chrono::microseconds>(Tcpy).count() << " mus" << std::endl;
            std::cout << "(GPU) Computation time: " << std::chrono::duration_cast<std::chrono::microseconds>(Tkernel).count() << " mus" << std::endl;
        }
    }

        // Run the full pipeline with separate kernels
    std::array<float, 4> run_pipeline_separate(
        const py::array_t<uint8_t, py::array::c_style> in_image,
        py::array_t<uint8_t, py::array::c_style> out_classified_image,
        bool print_time = false,
        bool print_stage_times = false) {
        
        // Determine dimensions for intermediate arrays
        int dim = in_image.ndim();
        int Nlines = 0;
        if (dim == 3) {
            Nlines = in_image.shape(0);
            Nx = in_image.shape(1);
            Nc = in_image.shape(2);
        } else {
            Nlines = 1;
            Nx = in_image.shape(0);
            Nc = in_image.shape(1);
        }
        int total_pixels = Nx * Nlines;
        int Nc_eff = Wl_max - Wl_min;
        
        // Create separate properly sized buffers for each stage
        auto start_allocation = std::chrono::steady_clock::now();

        if (Nlines > this->Ny && !unified_memory) {
            input_buf  = utils::make_buffer<uint8_t>(context, CL_MEM_READ_ONLY , in_image.size());
            output_buf = utils::make_buffer<uint8_t>(context, CL_MEM_WRITE_ONLY, out_classified_image.size());
        } else if (unified_memory) {
            input_buf =  utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, in_image.size(), const_cast<uint8_t*>(in_image.data()));
            output_buf = utils::make_buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,out_classified_image.size(), const_cast<uint8_t*>(out_classified_image.data()));
        }

        if (intermediate_buf.getInfo<CL_MEM_SIZE>() < total_pixels * Nc_eff * sizeof(float)
            || intermediate_buf == cl::Buffer() ) {
            this->Ny = Nlines;
            intermediate_buf = utils::make_buffer<float>(context, CL_MEM_READ_WRITE, total_pixels * Nc_eff);
            pca_output_buf = utils::make_buffer<float>(context, CL_MEM_READ_WRITE, total_pixels * NPCA);
        }

        if (print_time) {
            std::cout << "Buffer Allocation Time: " << 
                std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start_allocation).count() << 
                " μs" << std::endl;
        }

        auto start_total = std::chrono::steady_clock::now();
        if (!unified_memory) {
            utils::enqueueWrites(queue, std::make_pair(input_buf, const_cast<uint8_t*>(in_image.data())));
            queue.finish();
        }
        auto end_copy = std::chrono::steady_clock::now();
        auto copy_time = std::chrono::duration_cast<std::chrono::microseconds>(end_copy - start_total);

        std::array<float, 4> stage_times_microsecs = {0, 0, 0, 0};

        auto start_kernel = std::chrono::steady_clock::now();
        
        utils::enqueueKernel(
            queue, 
            white_balance_kernel, 
            cl::NDRange(WGS * total_pixels),
            cl::NDRange(WGS),
            input_buf,
            whitebalance_buf,
            intermediate_buf,
            Nx,
            Nc,
            Wl_min,
            Wl_max,
            whitebalance_toggle
        );

        queue.finish();
        auto T1 = std::chrono::steady_clock::now();
        stage_times_microsecs[0] = std::chrono::duration_cast<std::chrono::nanoseconds>(T1 - start_kernel).count();


        if (savgol_toggle){
            utils::enqueueKernel(
            queue, 
            savgol_filter_kernel, 
            cl::NDRange(WGS* total_pixels),
            cl::NDRange(WGS),
            intermediate_buf,
            SG_buf
        );}

        queue.finish();
        auto T2 = std::chrono::steady_clock::now();
        stage_times_microsecs[1] = std::chrono::duration_cast<std::chrono::nanoseconds>(T2 - T1).count();
/* 
        if (absorption_toggle || normalization_toggle) {
            utils::enqueueKernel(
            queue, 
            absorbance_norm_kernel, 
            cl::NDRange(WGS* total_pixels),
            cl::NDRange(WGS),
            intermediate_buf,
            absorption_toggle,
            normalization_toggle
        );} */

        //queue.finish();
        //auto T3 = std::chrono::steady_clock::now();
        //stage_times_microsecs[2] = std::chrono::duration_cast<std::chrono::nanoseconds>(T3 - T2).count();
        
        utils::enqueueKernel(
            queue, 
            pca_kernel, 
            cl::NDRange(WGS* total_pixels),
            cl::NDRange(WGS),
            intermediate_buf,
            PCA_buf,
            PCAmean_buf,
            pca_output_buf);
            
        queue.finish();
        auto T3 = std::chrono::steady_clock::now();
        stage_times_microsecs[2] = std::chrono::duration_cast<std::chrono::nanoseconds>(T3 - T2).count();


        if (kmeans_svm_switch) {
            utils::enqueueKernel(
                queue, 
                kmeans_kernel,
                cl::NDRange(NPCA * total_pixels),
                cl::NDRange(NPCA),
                pca_output_buf,
                Kmeans_buf,
                output_buf);
        } else {
            utils::enqueueKernel(
                queue, 
                svm_kernel,
                cl::NDRange(NPCA * total_pixels),
                cl::NDRange(NPCA),
                pca_output_buf,
                SVM_buf,
                SVM_intercept_buf,
                output_buf);
        }

        queue.finish();
        auto T4 = std::chrono::steady_clock::now();
        stage_times_microsecs[3] = std::chrono::duration_cast<std::chrono::nanoseconds>(T4 - T3).count();
        


        auto end_kernel = std::chrono::steady_clock::now();
        auto kernel_time = std::chrono::duration_cast<std::chrono::microseconds>(end_kernel - start_kernel);
        if (!unified_memory) {
            utils::enqueueReads(queue, std::make_pair(output_buf, const_cast<uint8_t*>(out_classified_image.data())));
            queue.finish();
        }
        auto end_read = std::chrono::steady_clock::now();
        auto read_time = std::chrono::duration_cast<std::chrono::microseconds>(end_read - end_kernel);
        if (print_time) {
            std::cout << "(GPU) Copy Time: " << copy_time.count() << " μs" << std::endl;
            std::cout << "(GPU) Kernel Time: " << kernel_time.count() << " μs" << std::endl;
            std::cout << "(GPU) Read Time: " << read_time.count() << " μs" << std::endl;
            std::cout << "White Balance Time: " << stage_times_microsecs[0] / 1000.0f << " µs" << std::endl;
            std::cout << "Savitzky-Golay Time: " << stage_times_microsecs[1] / 1000.0f << " µs" << std::endl;
            std::cout << "PCA Time: " << stage_times_microsecs[2] / 1000.0f << " µs" << std::endl;
            std::cout << "Classification Time: " << stage_times_microsecs[3] / 1000.0f << " µs" << std::endl;
        }


        auto total_time = std::chrono::duration_cast<std::chrono::microseconds>(end_read - start_total);
        
        if (print_time) {
            std::cout << "Total Pipeline (Separate Kernels): " << total_time.count() << " μs" << std::endl;
        }
        return stage_times_microsecs;
    }

    void runCPU(const py::array_t<uint8_t, py::array::c_style | py::array::forcecast> in_image,
                    py::array_t<uint8_t, py::array::c_style | py::array::forcecast> out_classified_image,
                    bool print_time = false,
                    bool verbose = false){
        
        // if print time and verbose are both true, print verbose time
        bool print_verbose_time = false;
        if (print_time && verbose) print_verbose_time = true;

        //Deduce the dimensions of the input image
        int dim = in_image.ndim();
        int Nlines = 0;
        if (dim == 3) {
            Nlines = in_image.shape(0); //Number of lines
            Nx = in_image.shape(1); //Number of pixels per line
            Nc = in_image.shape(2); //Number of channels
        }
        else {
            Nlines = 1;
            Nx = in_image.shape(0); //Number of pixels per line
            Nc = in_image.shape(1); //Number of channels
        }

        unsigned char* labels = const_cast<unsigned char *>(out_classified_image.data());
        auto start = std::chrono::high_resolution_clock::now();
        run_serial(in_image.data(),
               PCA,
               PCAmean,
               SVM,
               SVM_intercept,
               WB,
               savgol_coef,
               labels,
               Nx,
               Nlines,
               Nc,
               NPCA,
               Nk,
               Wl_min,
               Wl_max,
               savgol_toggle,
               whitebalance_toggle,
               Nwindow,
               print_verbose_time);
        auto end = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
        if (print_time) std::cout << "(CPU) Computation time: " << duration << " mus" << std::endl;
    };

    void runCPU_minimal(const py::array_t<uint8_t, py::array::c_style | py::array::forcecast> in_image,
                    py::array_t<uint8_t, py::array::c_style | py::array::forcecast> out_classified_image,
                    bool print_time = false,
                    bool verbose = false){

        // if print time and verbose are both true, print verbose time
        bool print_verbose_time = false;
        if (print_time && verbose) print_verbose_time = true;
        
        //Deduce the dimensions of the input image
        int dim = in_image.ndim();
        int Nlines = 0;
        if (dim == 3) {
            Nlines = in_image.shape(0); //Number of lines
            Nx = in_image.shape(1); //Number of pixels per line
            Nc = in_image.shape(2); //Number of channels
        }
        else {
            Nlines = 1;
            Nx = in_image.shape(0); //Number of pixels per line
            Nc = in_image.shape(1); //Number of channels
        }

        unsigned char* labels = const_cast<unsigned char *>(out_classified_image.data());
        auto start = std::chrono::high_resolution_clock::now();
        run_serial_minimal(in_image.data(),
               PCA,
               PCAmean,
               SVM,
               SVM_intercept,
               labels,
               Nx,
               Nlines,
               Nc,
               NPCA,
               Nk,
               Wl_min,
               Wl_max,
               print_verbose_time);
        auto end = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();

        if (print_time) std::cout << "(CPU minimal) Computation time: " << duration << " mus" << std::endl;
    };
};

PYBIND11_MODULE(plastic_sorting, m) {
    py::class_<Kernel>(m, "Kernel")
        .def(py::init<>())
        .def("run", &Kernel::run, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("verbose") = false)
        .def("runCPU", &Kernel::runCPU, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("verbose") = false)
        .def("runCPU_minimal", &Kernel::runCPU_minimal, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("verbose") = false)
        .def("run_pipeline_separate", &Kernel::run_pipeline_separate, py::arg("in_image"), py::arg("out_classified_image"), py::arg("print_time") = false, py::arg("print_stage_times") = false)
        .def("load_model", &Kernel::load_model, py::arg("PCA") = py::array_t<float>(),
                                                py::arg("PCAmean") = py::array_t<float>(),
                                                py::arg("SVM") = py::array_t<float>(),
                                                py::arg("SVM_intercept") = py::array_t<float>(),
                                                py::arg("whitebalance") = py::array_t<float>(),
                                                py::arg("savgol_coef") = py::array_t<float>(),
                                                py::arg("Wl_min") = 10, 
                                                py::arg("Wl_max") = 150, 
                                                py::arg("savgol_toggle") = 1);
}