float reduce_add(float data, int n){
int tidx = get_local_id(0);
float local_value = tidx < n ? data : 0.0f;
return work_group_reduce_add(local_value);
}
// Helper Functions
void load_and_calibrate(
global const uchar* image, global const float* WB,
__local float* image_shared, float* channels, int tidx, int bid,
int Nx, int Nc, int Nc_eff, int Wl_min, int image_offset, int white_balance
) {
int offset = bid * Nc;
for (int i = tidx; i < Nc_eff; i += WGS) {
if(white_balance == 1) {
image_shared[i + image_offset] = (float)image[offset + Wl_min + i] / WB[(bid%Nx)*Nc + Wl_min + i] * 255.0f;
} else {
image_shared[i + image_offset] = (float)image[offset + Wl_min + i];
}
channels[i / WGS] = image_shared[i + image_offset];
}
work_group_barrier(CLK_LOCAL_MEM_FENCE);
if (tidx < image_offset) {
image_shared[tidx] = image_shared[Wl_min + image_offset];
image_shared[Nc_eff + image_offset + tidx] = image_shared[Wl_min + Nc_eff + image_offset - 1];
}
work_group_barrier(CLK_LOCAL_MEM_FENCE);
}
void apply_savgol(
__local float* image_shared, __local float* savgol_shared, __global const float* savgol_coef, float* channels,
int tidx, int Nc_eff, int image_offset, int savgol_window
) {
for (int i = tidx; i < savgol_window; i += WGS) {
savgol_shared[i] = savgol_coef[i];
}
work_group_barrier(CLK_LOCAL_MEM_FENCE);
for (int i = tidx; i < Nc_eff; i += WGS) {
float sum = 0.0f;
for (int j = -savgol_window / 2; j <= savgol_window / 2; j++) {
sum += image_shared[i + j + image_offset] * savgol_shared[j + savgol_window / 2];
}
channels[i / WGS] = sum;
}
work_group_barrier(CLK_LOCAL_MEM_FENCE);
}
void perform_pca(
float* channels, global const float* PCA, global const float* PCA_mean,
__local float* PCA_shared, int tidx, int Nc_eff, int Npca
) {
for (int i = tidx; i < Nc_eff; i += WGS) {
channels[i / WGS] -= PCA_mean[i];
}
for (int i = 0; i < Npca; i++) {
float local_sum = 0.0f;
for (int j = tidx; j < Nc_eff; j += WGS) {
local_sum += PCA[i * Nc_eff + j] * channels[j / WGS];
}
float pca_dim_val = reduce_add(local_sum, WGS);
if (tidx == 0) PCA_shared[i] = pca_dim_val;
}
}
void svm_assign_labels(
__local float* PCA_shared, global const float* SVM, global const float* SVM_intercept, global uchar* labels,
int tidx, int bid, int Nk, int Npca
) {
float svm_val = -FLT_MAX;
if (tidx < Nk) {
svm_val = 0.0f;
for (int j = 0; j < Npca; j ++) {
svm_val += PCA_shared[j] * SVM[j * Nk + tidx];
}
svm_val += SVM_intercept[tidx];
}
float max_svm = work_group_reduce_max(svm_val);
if (svm_val == max_svm) {
labels[bid] = tidx;
}
}
kernel void process_image( global const uchar* image,
global const float* PCA,
global const float* PCA_mean,
global const float* SVM,
global const float* SVM_intercept,
global const float* WB, //White balance vector
global const float* savgol_coef, //Savitzky-Golay filter coefficients
global uchar* labels, //Output kmeans labels
const int Nx, //Dimension of the image in x
const int Ny, //Dimension of the image in y
const int Nc, //Number of channels in the image
const int Npca, //Number of PCA components
const int Nk, //Number of kmeans clusters
const int Wl_min, //Minimum wavelength
const int Wl_max, //Maximum wavelength
const int savgol, //Toggle Savitzky-Golay filter (0 = off, 1 = on)
const int white_balance, //Toggle white balance (0 = off, 1 = on)
const int savgol_window //Savitzky-Golay window size
){
// Shared memory declarations
__local float savgol_shared[30];
__local float image_shared[1024]; //Larger than number of channels to allow for padding
__local float PCA_shared[200]; //Intermediate results from computing PCA dot products
// Thread and work-group information
int tidx = get_local_id(0);
int bid = get_group_id(0);
// Effective channel range and offset
const int Nc_eff = Wl_max - Wl_min;
const int image_offset = savgol_window / 2;
float channels[ITEMS_PER_THREAD]; for (int i = 0; i < ITEMS_PER_THREAD; i++) channels[i] = 0.0f;
// Load image into shared memory and apply white balance
load_and_calibrate(image, WB, image_shared, channels, tidx, bid, Nx, Nc, Nc_eff, Wl_min, image_offset, white_balance);
// Apply Savitzky-Golay filter if enabled
if (savgol == 1) {
apply_savgol(image_shared, savgol_shared, savgol_coef, channels, tidx, Nc_eff, image_offset, savgol_window);
}
// Apply PCA transformation
perform_pca(channels, PCA, PCA_mean, PCA_shared, tidx, Nc_eff, Npca);
// Perform SVM classification
svm_assign_labels(PCA_shared, SVM, SVM_intercept, labels, tidx, bid, Nk, Npca);
}
kernel void white_balance_kernel(
global const uchar* image,
global const float* WB,
global float* output_image,
const int Nx,
const int Nc,
const int Wl_min,
const int Wl_max,
const int white_balance
) {
int tidx = get_local_id(0);
int bid = get_group_id(0);
int offset = bid * Nc;
for (int i = tidx; i < NC_EFF; i += WGS) {
if (white_balance == 1) {
output_image[bid * NC_EFF + i] = (float)image[offset + Wl_min + i] / WB[(bid%Nx)*Nc + Wl_min + i] * 255.0f;
} else {
output_image[bid * NC_EFF + i] = (float)image[offset + Wl_min + i];
}
}
}
kernel void savgol_filter_kernel(
global float* inline_image,
global const float* savgol_coef
) {
__local float padded_image[NC_EFF + N_WINDOW];
__local float savgol_shared[N_WINDOW];
int tidx = get_local_id(0);
int bid = get_group_id(0);
int image_offset = N_WINDOW / 2;
// Load data into shared memory with padding
for (int i = tidx; i < NC_EFF; i += WGS) {
padded_image[i + image_offset] = inline_image[bid * NC_EFF + i];
}
work_group_barrier(CLK_LOCAL_MEM_FENCE);
// Pad the edges
if (tidx < image_offset) {
padded_image[tidx] = padded_image[image_offset];
padded_image[NC_EFF + image_offset + tidx] = padded_image[NC_EFF + image_offset - 1];
}
work_group_barrier(CLK_LOCAL_MEM_FENCE);
// Load Savgol coefficients
for (int i = tidx; i < N_WINDOW; i += WGS) {
savgol_shared[i] = savgol_coef[i];
}
work_group_barrier(CLK_LOCAL_MEM_FENCE);
// Apply filter
for (int i = tidx; i < NC_EFF; i += WGS) {
float sum = 0.0f;
for (int j = -N_WINDOW / 2; j <= N_WINDOW / 2; j++) {
sum += padded_image[i + j + image_offset] * savgol_shared[j + N_WINDOW / 2];
}
inline_image[bid * NC_EFF + i] = sum;
}
}
kernel void pca_kernel(
global const float* image_data,
global const float* PCA,
global const float* PCA_mean,
global float* pca_output
) {
__local float pca_shared[N_PCA];
int tidx = get_local_id(0);
int bid = get_group_id(0);
// Subtract mean
for (int i = 0; i < N_PCA; i++) {
float local_sum = 0.0f;
for (int j = tidx; j < NC_EFF; j += WGS) {
float centered_val = image_data[bid * NC_EFF + j] - PCA_mean[j];
local_sum += PCA[i * NC_EFF + j] * centered_val;
}
float pca_dim_val = reduce_add(local_sum, WGS);
if (tidx == 0) pca_shared[i] = pca_dim_val;
}
work_group_barrier(CLK_LOCAL_MEM_FENCE);
// Write PCA result to global memory
if (tidx < N_PCA) {
pca_output[bid * N_PCA + tidx] = pca_shared[tidx];
}
}
kernel void svm_kernel(
global const float* pca_data,
global const float* SVM,
global const float* SVM_intercept,
global uchar* labels
) {
__local float pca_shared[N_PCA];
int tidx = get_local_id(0);
int bid = get_group_id(0);
// Load PCA data into shared memory
if (tidx < N_PCA) {
pca_shared[tidx] = pca_data[bid * N_PCA + tidx];
}
work_group_barrier(CLK_LOCAL_MEM_FENCE);
float svm_val = -FLT_MAX;
if (tidx < N_K) {
svm_val = 0.0f;
for (int j = 0; j < N_PCA; j++) {
svm_val += pca_shared[j] * SVM[j * N_K + tidx];
}
svm_val += SVM_intercept[tidx];
}
float max_svm = work_group_reduce_max(svm_val);
if (svm_val == max_svm) {
labels[bid] = tidx;
}
}