import os
import sys
import subprocess
import timeit
import time
import numpy as np
import pandas as pd
from scipy.signal import savgol_coeffs, savgol_filter
import os.path
from os.path import relpath
import argparse

def read_csv(fname):
    """Load the "N", "T" and "TSD" columns of a CSV file as NumPy arrays.

    Returns a 3-tuple of arrays in that column order. Any failure while
    reading the file or selecting a column is reported on stdout and
    signalled by returning ``(None, None, None)`` instead of raising.
    """
    try:
        table = pd.read_csv(fname)
        columns = tuple(table[name].to_numpy() for name in ("N", "T", "TSD"))
    except Exception as e:
        print(f"Could not read {fname}, skipping due to {e}")
        return None, None, None
    return columns

# Module-level configuration: everything below runs at import time and the
# resulting names (impl, PCA, whiteMatrix, Nwindow, ...) are read as globals
# by the functions in this file.
batch_size = 128 ## Memory limit for the Camera 

# Disabled build step (presumably rebuilds the compiled module -- confirm
# against the project's Makefile before re-enabling).
# subprocess.run(["make", "clean"])
# subprocess.run(["make", "modules"])
build_dir = os.path.relpath("build")
sys.path.append(build_dir)
# Ensure the build directory is in the system path before importing kernels

#Import the plastic_sorting module
import build.plastic_sorting as wrapper
import numpy as np  # NOTE: redundant, numpy is already imported at the top of the file

# Kernel instance used by run_benchmark() for the OpenCL and C++ CPU runs.
impl = wrapper.Kernel()

#Configuration
# Wl_min / Wl_max are forwarded unchanged to load_model(); presumably the
# spectral range to use -- TODO confirm units against the kernel wrapper.
Wl_min = 0
Wl_max = 600
Npoly = np.int32(3)     #Polynomial order (Savitzky-Golay)
Nderiv = np.int32(0)    #Derivative order (Savitzky-Golay)
Nwindow = np.int32(13)   #Window size (Savitzky-Golay)

#LineRange = [10, 400]  #Middle 100 lines of the datacube, demonstrating that we can process any number of lines
# NOTE: this value is a placeholder; Ny is reassigned from data_cube.shape[0]
# once the datacube has been loaded further down.
Ny = 500

# Directories are concatenated with file names directly, so they must keep
# their trailing slash.
model_dir = "models/calibratedSVM/"
data_dir = "data/"

# Model arrays, all converted to float32. PCA and whiteMatrix are additionally
# forced to C-contiguous layout before being handed to the kernel.
PCA =           np.ascontiguousarray(np.load(model_dir + "pca_model_components.npy").astype(np.float32))
PCAmean =       (np.load(model_dir + "pca_model_mean.npy").astype(np.float32))
SVM_coef =      (np.load(model_dir + "svm_model_coef.npy").astype(np.float32))
SVM_intercept = (np.load(model_dir + "svm_model_intercept.npy").astype(np.float32))
whiteMatrix = np.ascontiguousarray(np.load(model_dir + "whiteMatrix.npy").astype(np.float32))

# Savitzky-Golay filter coefficients for the window/order/derivative above;
# passed to the kernel through load_model().
savgol_coef = savgol_coeffs(Nwindow, Npoly, Nderiv)

def runNumpy(input, output):
    """Reference NumPy/SciPy implementation of the classification pipeline.

    ``input`` is walked one first-axis slice ("line") at a time. Each line is
    divided by the module-level ``whiteMatrix`` and scaled by 255, smoothed
    with a Savitzky-Golay filter along axis 1, centred with ``PCAmean``,
    projected onto ``PCA`` and scored with the linear SVM
    (``SVM_coef`` / ``SVM_intercept``). The index of the highest score in each
    row is written, as uint8, into the matching row of ``output``.

    The function returns nothing; ``output`` is modified in place.
    """
    for row, line in enumerate(input):
        spectra = np.divide(line, whiteMatrix) * 255
        spectra = savgol_filter(spectra, Nwindow, Npoly, Nderiv, axis=1)
        centred = spectra - PCAmean
        projected = np.dot(centred, PCA.T)
        scores = np.dot(projected, SVM_coef.T) + SVM_intercept
        output[row, :] = np.argmax(scores, axis=1).astype(np.uint8)

# Input datacube: the first `batch_size` lines of image_3, converted to uint8.
data_cube =     np.load(data_dir +  "image_3.npy")[:batch_size, :, :].astype(np.uint8)
# data_cube =     np.roll(np.load(data_dir +  "image_5.npy").astype(np.uint8)[600:1700, :, :], -7, axis=2)

# Datacube dimensions, taken from the array's three axes in order.
Ny = data_cube.shape[0]
Nx = data_cube.shape[1]
Nc = data_cube.shape[2]
output =    np.full((Ny, Nx), 0, dtype=np.uint8)

# Upload the model loaded above into the module-level kernel instance.
impl.load_model(PCA, PCAmean, 
                SVM = SVM_coef,
                SVM_intercept = SVM_intercept,
                whitebalance = whiteMatrix,
                savgol_coef = savgol_coef,
                Wl_min = Wl_min, 
                Wl_max = Wl_max, 
                normalization_toggle = int(0),
                absorption_toggle = int(0),
                tuning_dims = [PCA.shape[0], Nx*16, Nc])

# Set up benchmarking parameters
# Create a sequence of powers of 2 (1, 2, 4, 8, ...) up to the batch size
# (capped at 512 lines).
max_power = int(np.log2(min(batch_size, 512)))
line_counts = np.array([2**i for i in range(0, max_power + 1)])  # Starts from 2^0 = 1 line
repetitions = 20  # Default number of repetitions; replaced by --repetitions when run as a script
# Shared results dictionary; the __main__ block adds the per-implementation
# entries that save_results_to_csv() reads.
results = {
    'lines': line_counts,
    'cpu_times': [],
    'cpu_lines_per_sec': [],
    'std_dev': []  # Store standard deviations
}

# Run benchmarks
print("Running benchmarks with varying data sizes...")

def run_benchmark(impl_name, repetitions, intermittent_sleep=0, warmup_iterations=1):
    """Time one implementation of the pipeline for every entry of ``line_counts``.

    For each line count the first ``line_count`` lines of the module-level
    ``data_cube`` are processed ``warmup_iterations`` times untimed and then
    ``repetitions`` times under ``timeit.default_timer``. The implementation
    is resolved once, before any timing starts, so that only the call itself
    is measured.

    Args:
        impl_name: 'numpy', 'opencl', 'opencl_fused' or 'cpu'.
        repetitions: number of timed calls per line count.
        intermittent_sleep: seconds to sleep after every timed call.
        warmup_iterations: untimed calls made before timing each line count.

    Returns:
        A list with one array of wall-clock times (seconds) per line count.
        For 'opencl' a 5-tuple is returned instead: that list followed by four
        lists of the same layout holding the white-balance, Savitzky-Golay,
        PCA and classification times returned by ``run_pipeline_separate``
        (kept in the unit the kernel reports; they are divided by 1e9 below
        to obtain seconds).

    Raises:
        ValueError: if ``impl_name`` is not one of the supported names.
            Previously an unknown name silently timed an empty loop body.
    """
    kernel_methods = {
        'opencl': 'run_pipeline_separate',
        'opencl_fused': 'run',
        'cpu': 'runCPU',
    }
    if impl_name == 'numpy':
        run_once = runNumpy
    elif impl_name in kernel_methods:
        run_once = getattr(impl, kernel_methods[impl_name])
    else:
        supported = ['numpy'] + list(kernel_methods)
        raise ValueError(f"Unknown implementation {impl_name!r}; expected one of {supported}")

    # Only the non-fused OpenCL pipeline reports per-stage timings.
    per_stage = impl_name == 'opencl'

    Nx = data_cube.shape[1]
    data_output = np.zeros((batch_size, Nx), dtype=np.uint8)
    timing_results = []
    whitebalance_results = []
    savgol_results = []
    pca_results = []
    classification_results = []

    for i, line_count in enumerate(line_counts):
        print(f"Running {impl_name} benchmark with {line_count} lines ({i+1}/{len(line_counts)})...")
        iteration_times = np.zeros(repetitions)
        whitebalance_times = np.zeros(repetitions)
        savgol_times = np.zeros(repetitions)
        pca_times = np.zeros(repetitions)
        classification_times = np.zeros(repetitions)

        # Slice once so that creating the view is not part of the measurement.
        batch = data_cube[:line_count]

        # Warmup runs
        for _ in range(warmup_iterations):
            run_once(batch, data_output)

        # Actual benchmark runs
        for j in range(repetitions):
            start_time = timeit.default_timer()
            stage_times = run_once(batch, data_output)
            iteration_times[j] = timeit.default_timer() - start_time
            if per_stage:
                (whitebalance_times[j], savgol_times[j],
                 pca_times[j], classification_times[j]) = stage_times
            time.sleep(intermittent_sleep)  # Sleep to simulate intermittent processing

        # Calculate and print throughput for this batch size
        mean_time = np.mean(iteration_times)
        std_dev = np.std(iteration_times)
        throughput = line_count / mean_time
        print(f"  Results for {line_count} lines: {throughput:.2f} lines/sec (±{std_dev/mean_time*100:.2f}%)")

        # If OpenCL non-fused also print individual stage throughputs
        if per_stage and repetitions > 0:
            # Calculate mean time for each stage (convert from nanoseconds to seconds)
            wb_mean = np.mean(whitebalance_times) / 1e9
            sg_mean = np.mean(savgol_times) / 1e9
            pca_mean = np.mean(pca_times) / 1e9
            cl_mean = np.mean(classification_times) / 1e9

            # Calculate throughput for each stage
            wb_tput = line_count / wb_mean
            sg_tput = line_count / sg_mean
            pca_tput = line_count / pca_mean
            cl_tput = line_count / cl_mean

            # Print in condensed format
            print(f"  Stage throughputs (lines/sec): WhiteBalance={wb_tput:.2f}, SavGol={sg_tput:.2f}, PCA={pca_tput:.2f}, Classify={cl_tput:.2f}")

            # Print percentage of total time
            total_stage_time = wb_mean + sg_mean + pca_mean + cl_mean
            wb_pct = (wb_mean / total_stage_time) * 100
            sg_pct = (sg_mean / total_stage_time) * 100
            pca_pct = (pca_mean / total_stage_time) * 100
            cl_pct = (cl_mean / total_stage_time) * 100
            print(f"  Stage time distribution: WhiteBalance={wb_pct:.1f}%, SavGol={sg_pct:.1f}%, PCA={pca_pct:.1f}%, Classify={cl_pct:.1f}%")

        timing_results.append(iteration_times)
        if per_stage:
            whitebalance_results.append(whitebalance_times)
            savgol_results.append(savgol_times)
            pca_results.append(pca_times)
            classification_results.append(classification_times)
        print(f"Finished {impl_name} benchmark for {line_count} lines.")

    if per_stage:
        return timing_results, whitebalance_results, savgol_results, pca_results, classification_results
    return timing_results
        
    
def filter_outliers(data, n_sigmas=2):
    """Return the per-series mean and standard deviation after outlier removal.

    Each element of ``data`` is one series of repeated measurements. Samples
    whose distance from the series mean is not strictly below
    ``n_sigmas`` standard deviations are discarded before the statistics are
    computed.

    If the filter would discard every sample, the unfiltered series is used
    instead. This happens whenever the series has zero spread (identical
    values, or a single repetition), where the strict comparison against
    ``0`` rejects everything and previously produced NaN results.

    Args:
        data: sequence of 1-D array-likes, one per series.
        n_sigmas: rejection threshold in units of the series' standard
            deviation.

    Returns:
        ``(means, st_devs)``: two float arrays with one entry per series.
    """
    n_series = len(data)
    means = np.zeros(n_series)
    st_devs = np.zeros(n_series)
    for i, reps in enumerate(data):
        reps = np.asarray(reps)
        distance = np.abs(reps - np.mean(reps))
        kept = reps[distance < n_sigmas * np.std(reps)]
        if kept.size == 0:
            # Nothing survived the cut: fall back to the raw samples rather
            # than taking statistics of an empty array.
            kept = reps
        means[i] = np.mean(kept)
        st_devs[i] = np.std(kept)
    return means, st_devs


def process_results(timing_results):
    """Turn per-line-count timing samples into throughput figures.

    ``timing_results`` holds one series of repeated timings per entry of the
    module-level ``line_counts``. Outliers are removed with
    ``filter_outliers`` and the filtered mean time is converted to a
    throughput (``line_counts / mean``); the throughput uncertainty is the
    throughput scaled by the relative spread of the timings.

    Returns:
        ``(throughput, throughput_uncertainty, mean_times, time_std_devs)``,
        each with one entry per line count. Throughput is in lines per unit
        of the input timings.
    """
    mean_times, time_sigmas = filter_outliers(timing_results)
    throughput = line_counts / mean_times
    relative_sigma = time_sigmas / mean_times
    return throughput, throughput * relative_sigma, mean_times, time_sigmas

def save_results_to_csv(results, output_dir="benchmark_results"):
    """Write the standard benchmark results to CSV files in ``output_dir``.

    Files written (all without an index column):
      * ``implementation_comparison.csv``: throughput and time, each with its
        standard deviation, for the numpy, opencl, opencl_fused and cpu runs.
      * ``stage_throughput_opencl.csv``, ``stage_stddev_opencl.csv``,
        ``stage_percentage_opencl.csv``, ``stage_times_opencl.csv`` and
        ``stage_times_stddev_opencl.csv``: one ``<stage>_opencl`` column per
        entry of ``results['stage_names']``.

    Side effect: when ``results`` has a throughput entry but no time entry for
    an implementation, the time and its standard deviation are derived from
    the throughput and stored back into ``results``.
    """
    os.makedirs(output_dir, exist_ok=True)
    backends = ['numpy', 'opencl', 'opencl_fused', 'cpu']

    # Derive execution times from throughput where they are not yet present.
    for name in backends:
        time_key = f'{name}_time'
        rate_key = f'{name}_lines_per_sec'
        if time_key in results or rate_key not in results:
            continue
        results[time_key] = results['lines'] / results[rate_key]
        results[f'{name}_time_stddev'] = results[time_key] * (results[f'{name}_std_dev'] / results[rate_key])

    # Comparison table: all throughput columns first, then all time columns.
    comparison_columns = {'lines': results['lines']}
    for name in backends:
        comparison_columns[f'{name}_throughput'] = results[f'{name}_lines_per_sec']
        comparison_columns[f'{name}_throughput_stddev'] = results[f'{name}_std_dev']
    for name in backends:
        comparison_columns[f'{name}_time'] = results[f'{name}_time']
        comparison_columns[f'{name}_time_stddev'] = results[f'{name}_time_stddev']
    pd.DataFrame(comparison_columns).to_csv(f"{output_dir}/implementation_comparison.csv", index=False)

    # Stage-wise tables: output file stem -> key of the per-stage dict in results.
    stage_sources = {
        'stage_throughput_opencl': 'stage_throughputs_opencl',
        'stage_stddev_opencl': 'stage_std_devs_opencl',
        'stage_percentage_opencl': 'stage_percentages_opencl',
        'stage_times_opencl': 'stage_times_opencl',
        'stage_times_stddev_opencl': 'stage_times_stddev_opencl',
    }
    stage_frames = {stem: pd.DataFrame({'lines': results['lines']}) for stem in stage_sources}

    # Fill every table before writing any of them, so a missing key leaves no
    # partial set of stage files behind.
    for stage_name in results['stage_names']:
        for stem, source_key in stage_sources.items():
            stage_frames[stem][stage_name + '_opencl'] = results[source_key][stage_name]

    for stem, frame in stage_frames.items():
        frame.to_csv(f"{output_dir}/{stem}.csv", index=False)

    print(f"Results saved to '{output_dir}' directory")

def load_pca_model(impl, n_components):
    """Load the model files for ``n_components`` and upload them to ``impl``.

    The PCA components, PCA mean and SVM coefficients/intercept are read from
    ``models/pca_models/`` using file names suffixed with ``n_components``.
    The white matrix comes from the ``calibratedSVM`` model directory and the
    Savitzky-Golay coefficients are recomputed from the module-level filter
    settings. Everything is passed to ``impl.load_model`` together with the
    module-level ``Wl_min``, ``Wl_max``, ``Nx`` and ``Nc``.

    Returns:
        ``(PCA, PCAmean, SVM_coef, SVM_intercept, whiteMatrix)`` as float32
        arrays.
    """
    model_dir = "models/pca_models/"

    def load_float32(path):
        # All model arrays are handed to the kernel as float32.
        return np.load(path).astype(np.float32)

    components = np.ascontiguousarray(load_float32(f"{model_dir}pca_components_{n_components}.npy"))
    mean = load_float32(f"{model_dir}pca_mean_{n_components}.npy")
    coef = load_float32(f"{model_dir}svm_model_coef_{n_components}.npy")
    intercept = load_float32(f"{model_dir}svm_model_intercept_{n_components}.npy")

    # Filter coefficients and white matrix are shared by all component counts.
    filter_coef = savgol_coeffs(Nwindow, Npoly, Nderiv)
    white = np.ascontiguousarray(load_float32(model_dir + "../calibratedSVM/whiteMatrix.npy"))

    impl.load_model(
        components, mean,
        SVM=coef,
        SVM_intercept=intercept,
        whitebalance=white,
        savgol_coef=filter_coef,
        Wl_min=Wl_min,
        Wl_max=Wl_max,
        normalization_toggle=int(0),
        absorption_toggle=int(0),
        tuning_dims=[components.shape[0], Nx*16, Nc],
    )

    return components, mean, coef, intercept, white

def _run_numpy_with_model(cube, out, pca, pca_mean, svm_coef, svm_intercept, white):
    """NumPy reference pipeline (same steps as runNumpy) using an explicit model."""
    for row, line in enumerate(cube):
        spectra = np.divide(line, white) * 255
        spectra = savgol_filter(spectra, Nwindow, Npoly, Nderiv, axis=1)
        projected = np.dot(spectra - pca_mean, pca.T)
        out[row, :] = np.argmax(np.dot(projected, svm_coef.T) + svm_intercept, axis=1).astype(np.uint8)


def _time_calls(run, cube, out, repetitions, warmup_iterations):
    """Warm up, then time ``run(cube, out)``; return (seconds array, list of return values)."""
    for _ in range(warmup_iterations):
        run(cube, out)
    elapsed = np.zeros(repetitions)
    returned = []
    for j in range(repetitions):
        start_time = timeit.default_timer()
        value = run(cube, out)
        elapsed[j] = timeit.default_timer() - start_time
        returned.append(value)
    return elapsed, returned


def _report_throughput(label, times, n_lines):
    """Print the throughput line for one implementation; return (mean, std) of ``times``."""
    mean_time = np.mean(times)
    std_dev = np.std(times)
    throughput = n_lines / mean_time
    print(f"    {label}: {throughput:.2f} lines/sec (±{std_dev/mean_time*100:.2f}%)")
    return mean_time, std_dev


def benchmark_pca_models(batch_size=32, repetitions=10, warmup_iterations=3):
    """Benchmark performance with different PCA component counts.

    For every component count from 3 to 25 a fresh kernel instance is created,
    the matching model is loaded with ``load_pca_model`` and the fused OpenCL,
    separate-stage OpenCL, NumPy and C++ CPU implementations are timed on the
    first ``batch_size`` lines of ``image_3.npy``.

    The NumPy run uses the model loaded for the current component count.
    Calling ``runNumpy`` here would time the module-level model instead, so
    its timings would not depend on the component count at all.

    Throughput is computed from the number of lines actually present in the
    sliced datacube, which equals ``batch_size`` unless the image is shorter.

    Args:
        batch_size: number of datacube lines processed per call.
        repetitions: timed calls per implementation and component count.
        warmup_iterations: untimed calls made before each timed series.

    Returns:
        Dict with ``'components'`` plus, for each of ``opencl_fused``,
        ``opencl``, ``numpy``, ``cpu`` and ``opencl_pca`` (the PCA stage of
        the separate-stage pipeline): ``<name>_times`` and ``<name>_std_dev``
        (lists, seconds) and ``<name>_lines_per_sec`` and
        ``<name>_throughput_std_dev`` (arrays).
    """
    # Available PCA component counts (3-25)
    component_counts = list(range(3, 26))
    series_names = ['opencl_fused', 'opencl', 'numpy', 'cpu', 'opencl_pca']

    # Initialize results dictionary
    pca_results = {'components': component_counts}
    for name in series_names:
        pca_results[f'{name}_times'] = []
        pca_results[f'{name}_std_dev'] = []

    # Prepare data
    data_cube = np.load(data_dir + "image_3.npy").astype(np.uint8)[:batch_size, :, :]
    n_lines, Nx, _ = data_cube.shape
    data_output = np.ascontiguousarray(np.zeros((batch_size, Nx), dtype=np.uint8))

    # Run benchmark for each PCA component count
    for n_components in component_counts:
        print(f"\nBenchmarking with {n_components} PCA components:")
        # Create a fresh instance of the kernel to avoid leaking memory
        implementation = wrapper.Kernel()
        model = load_pca_model(implementation, n_components)

        def run_numpy(cube, out, model=model):
            # Bind the model of this component count to the NumPy reference.
            _run_numpy_with_model(cube, out, *model)

        stats = {}

        # 1. OpenCL Fused
        print("  Running OpenCL Fused implementation...")
        fused_times, _ = _time_calls(implementation.run, data_cube, data_output,
                                     repetitions, warmup_iterations)
        stats['opencl_fused'] = _report_throughput("OpenCL Fused", fused_times, n_lines)

        # 2. OpenCL (separate stages); also keep the PCA stage time of each call
        print("  Running OpenCL implementation...")
        opencl_times, stage_returns = _time_calls(implementation.run_pipeline_separate,
                                                  data_cube, data_output,
                                                  repetitions, warmup_iterations)
        # Convert from nanoseconds to seconds
        opencl_pca_times = np.array([pca_ns for _, _, pca_ns, _ in stage_returns], dtype=float) / 1e9
        stats['opencl'] = _report_throughput("OpenCL", opencl_times, n_lines)
        stats['opencl_pca'] = _report_throughput("OpenCL PCA only", opencl_pca_times, n_lines)

        # 3. NumPy
        print("  Running NumPy implementation...")
        numpy_times, _ = _time_calls(run_numpy, data_cube, data_output,
                                     repetitions, warmup_iterations)
        stats['numpy'] = _report_throughput("NumPy", numpy_times, n_lines)

        # 4. CPU
        print("  Running C++ CPU implementation...")
        cpu_times, _ = _time_calls(implementation.runCPU, data_cube, data_output,
                                   repetitions, warmup_iterations)
        stats['cpu'] = _report_throughput("CPU", cpu_times, n_lines)

        # Store results
        for name in series_names:
            mean_time, std_dev = stats[name]
            pca_results[f'{name}_times'].append(mean_time)
            pca_results[f'{name}_std_dev'].append(std_dev)

    # Convert times to throughput (lines per second)
    for name in series_names:
        mean_times = np.array(pca_results[f'{name}_times'])
        time_sigmas = np.array(pca_results[f'{name}_std_dev'])
        throughput = n_lines / mean_times
        pca_results[f'{name}_lines_per_sec'] = throughput
        # Propagate error for throughput calculation
        pca_results[f'{name}_throughput_std_dev'] = throughput * (time_sigmas / mean_times)

    return pca_results

def save_pca_results_to_csv(results, output_dir="benchmark_results"):
    """Write the PCA-model benchmark results to CSV files in ``output_dir``.

    Three files are written, each keyed by the ``components`` column and
    without an index column:
      * ``pca_model_times.csv``: mean time and standard deviation for each
        implementation and for the OpenCL PCA stage.
      * ``pca_model_throughput.csv``: throughput and its standard deviation
        for the same series.
      * ``pca_only_performance.csv``: time and throughput of the OpenCL PCA
        stage only.
    """
    os.makedirs(output_dir, exist_ok=True)
    series_names = ['opencl_fused', 'opencl', 'numpy', 'cpu', 'opencl_pca']

    # Times table
    time_columns = {'components': results['components']}
    for name in series_names:
        time_columns[f'{name}_time'] = results[f'{name}_times']
        time_columns[f'{name}_stddev'] = results[f'{name}_std_dev']
    pd.DataFrame(time_columns).to_csv(f"{output_dir}/pca_model_times.csv", index=False)

    # Throughput table
    throughput_columns = {'components': results['components']}
    for name in series_names:
        throughput_columns[f'{name}_throughput'] = results[f'{name}_lines_per_sec']
        throughput_columns[f'{name}_stddev'] = results[f'{name}_throughput_std_dev']
    pd.DataFrame(throughput_columns).to_csv(f"{output_dir}/pca_model_throughput.csv", index=False)

    # PCA stage on its own, time and throughput side by side
    pca_only_columns = {
        'components': results['components'],
        'opencl_pca_time': results['opencl_pca_times'],
        'opencl_pca_time_stddev': results['opencl_pca_std_dev'],
        'opencl_pca_throughput': results['opencl_pca_lines_per_sec'],
        'opencl_pca_throughput_stddev': results['opencl_pca_throughput_std_dev'],
    }
    pd.DataFrame(pca_only_columns).to_csv(f"{output_dir}/pca_only_performance.csv", index=False)

    print(f"PCA model benchmark results saved to '{output_dir}' directory")


if __name__ == "__main__":
    # Command-line interface
    cli = argparse.ArgumentParser(description="Benchmark plastic sorting implementations")
    cli.add_argument("--benchmark-type", choices=["standard", "pca", "all"], default="standard",
                     help="Type of benchmark to run: standard (varying line counts), pca (varying PCA components), or all")
    cli.add_argument("--pca-batch-size", type=int, default=16,
                     help="Batch size to use for PCA benchmarks (default: 16)")
    cli.add_argument("--repetitions", type=int, default=20,
                     help="Number of repetitions for each benchmark to gather statistics (default: 20)")
    args = cli.parse_args()

    # The command-line value replaces the module-level default.
    repetitions = args.repetitions
    run_standard = args.benchmark_type in ("standard", "all")
    run_pca = args.benchmark_type in ("pca", "all")

    # process_results() returns its four values in this order; they are stored
    # in `results` under '<implementation>_<suffix>'.
    summary_suffixes = ('lines_per_sec', 'std_dev', 'time', 'time_stddev')

    if run_standard:
        print(f"Running standard benchmarks (varying line counts) with {repetitions} repetitions...")
        print("Starting benchmarks...")

        print("\n1. Running OpenCL Fused implementation...")
        fused_timings = run_benchmark('opencl_fused', repetitions)
        for suffix, values in zip(summary_suffixes, process_results(fused_timings)):
            results[f'opencl_fused_{suffix}'] = values

        print("\n2. Running OpenCL implementation...")
        (opencl_timings, whitebalance_ns, savgol_ns,
         pca_stage_ns, classification_ns) = run_benchmark('opencl', repetitions)
        for suffix, values in zip(summary_suffixes, process_results(opencl_timings)):
            results[f'opencl_{suffix}'] = values

        # Stage-wise results. Stage timings are in nanoseconds, so throughput
        # comes out per nanosecond and is scaled by 1e9 to lines per second,
        # while the times themselves are divided by 1e9 to give seconds.
        stage_timings_ns = {
            'whitebalance': whitebalance_ns,
            'savgol': savgol_ns,
            'pca': pca_stage_ns,
            'classification': classification_ns,
        }
        stage_summaries = {stage: process_results(samples) for stage, samples in stage_timings_ns.items()}

        results['stage_throughputs_opencl'] = {}
        results['stage_std_devs_opencl'] = {}
        results['stage_times_opencl'] = {}
        results['stage_times_stddev_opencl'] = {}
        results['stage_names'] = ['whitebalance', 'savgol', 'pca', 'classification']

        for stage_name in results['stage_names']:
            rate_per_ns, rate_sigma_per_ns, mean_ns, sigma_ns = stage_summaries[stage_name]
            results['stage_throughputs_opencl'][stage_name] = rate_per_ns * 1e9
            results['stage_std_devs_opencl'][stage_name] = rate_sigma_per_ns * 1e9
            results['stage_times_opencl'][stage_name] = mean_ns / 1e9
            results['stage_times_stddev_opencl'][stage_name] = sigma_ns / 1e9

        # Share of each stage in the summed per-line time (inverse of throughput)
        seconds_per_line = {
            stage: 1.0 / rate for stage, rate in results['stage_throughputs_opencl'].items()
        }
        summed_seconds_per_line = sum(seconds_per_line[stage] for stage in results['stage_names'])
        results['stage_percentages_opencl'] = {
            stage: (seconds_per_line[stage] / summed_seconds_per_line) * 100
            for stage in results['stage_names']
        }

        print("\n3. Running NumPy implementation...")
        numpy_timings = run_benchmark('numpy', repetitions)
        for suffix, values in zip(summary_suffixes, process_results(numpy_timings)):
            results[f'numpy_{suffix}'] = values

        print("\n4. Running C++ CPU implementation...")
        cpu_timings = run_benchmark('cpu', repetitions)
        for suffix, values in zip(summary_suffixes, process_results(cpu_timings)):
            results[f'cpu_{suffix}'] = values

        # Save all results to CSV
        print("\nSaving benchmark results to CSV files...")
        save_results_to_csv(results)

    if run_pca:
        print(f"\n\nRunning PCA model benchmarks (varying component counts) with batch size {args.pca_batch_size} and {repetitions} repetitions...")
        pca_model_results = benchmark_pca_models(batch_size=args.pca_batch_size, repetitions=repetitions)
        save_pca_results_to_csv(pca_model_results)

    print("All benchmarks completed successfully!")