import os import sys import subprocess import timeit import time import numpy as np import pandas as pd from scipy.signal import savgol_coeffs, savgol_filter import os.path from os.path import relpath import argparse def read_csv(fname): try: df = pd.read_csv(fname) return df["N"].to_numpy(), df["T"].to_numpy(), df["TSD"].to_numpy() except Exception as e: print(f"Could not read {fname}, skipping due to {e}") return None, None, None # Extract color values and set as the default color cycle batch_size = 128 ## Memory limit for the Camera # subprocess.run(["make", "clean"]) # subprocess.run(["make", "modules"]) build_dir = os.path.relpath("build") sys.path.append(build_dir) # Ensure the build directory is in the system path before importing kernels #Import the plastic_sorting module import build.plastic_sorting as wrapper import numpy as np impl = wrapper.Kernel() #Configuration Wl_min = 0 Wl_max = 600 Npoly = np.int32(3) #Polynomial order (Savitzky-Golay) Nderiv = np.int32(0) #Derivative order (Savitzky-Golay) Nwindow = np.int32(13) #Window size (Savitzky-Golay) #LineRange = [10, 400] #Middle 100 lines of the datacube, demonstrating that we can process any number of lines Ny = 500 model_dir = "models/calibratedSVM/" data_dir = "data/" PCA = np.ascontiguousarray(np.load(model_dir + "pca_model_components.npy").astype(np.float32)) PCAmean = (np.load(model_dir + "pca_model_mean.npy").astype(np.float32)) SVM_coef = (np.load(model_dir + "svm_model_coef.npy").astype(np.float32)) SVM_intercept = (np.load(model_dir + "svm_model_intercept.npy").astype(np.float32)) whiteMatrix = np.ascontiguousarray(np.load(model_dir + "whiteMatrix.npy").astype(np.float32)) savgol_coef = savgol_coeffs(Nwindow, Npoly, Nderiv) def runNumpy(input, output): for i in range(input.shape[0]): whiteCalibrated = np.divide(input[i, :, :], whiteMatrix) * 255 whiteCalibrated = savgol_filter(whiteCalibrated, Nwindow, Npoly, Nderiv, axis=1) whiteCalibrated = whiteCalibrated - PCAmean PCAprojection = np.dot(whiteCalibrated, PCA.T) output[i, :] = np.argmax(np.dot(PCAprojection, SVM_coef.T) + SVM_intercept, axis=1).astype(np.uint8) data_cube = np.load(data_dir + "image_3.npy")[:batch_size, :, :].astype(np.uint8) # data_cube = np.roll(np.load(data_dir + "image_5.npy").astype(np.uint8)[600:1700, :, :], -7, axis=2) Ny = data_cube.shape[0] Nx = data_cube.shape[1] Nc = data_cube.shape[2] output = np.full((Ny, Nx), 0, dtype=np.uint8) impl.load_model(PCA, PCAmean, SVM = SVM_coef, SVM_intercept = SVM_intercept, whitebalance = whiteMatrix, savgol_coef = savgol_coef, Wl_min = Wl_min, Wl_max = Wl_max, normalization_toggle = int(0), absorption_toggle = int(0), tuning_dims = [PCA.shape[0], Nx*16, Nc]) # Set up benchmarking parameters # Create a sequence of powers of 2 (1, 2, 4, 8, ...) up to the batch size max_power = int(np.log2(min(batch_size, 512))) line_counts = np.array([2**i for i in range(0, max_power + 1)]) # Start from 2^1 = 2 lines repetitions = 20 # Number of repetitions for better statistics results = { 'lines': line_counts, 'cpu_times': [], 'cpu_lines_per_sec': [], 'std_dev': [] # Store standard deviations } # Run benchmarks print("Running benchmarks with varying data sizes...") def run_benchmark(impl_name, repetitions, intermittent_sleep=0, warmup_iterations=1): Ny = data_cube.shape[0] Nx = data_cube.shape[1] data_output = np.ascontiguousarray(np.zeros((batch_size, Nx), dtype=np.uint8)) timing_results = [] whitebalance_results = [] savgol_results = [] pca_results = [] classification_results = [] for i, line_count in enumerate(line_counts): print(f"Running {impl_name} benchmark with {line_count} lines ({i+1}/{len(line_counts)})...") iteration_times = np.zeros(repetitions) whitebalance_times = np.zeros(repetitions) savgol_times = np.zeros(repetitions) pca_times = np.zeros(repetitions) classification_times = np.zeros(repetitions) # Warmup runs for j in range(warmup_iterations): if impl_name == 'numpy': runNumpy(data_cube[:line_count], data_output) elif impl_name == 'opencl': impl.run_pipeline_separate(data_cube[:line_count], data_output) elif impl_name == 'opencl_fused': impl.run(data_cube[:line_count], data_output) elif impl_name == 'cpu': impl.runCPU(data_cube[:line_count], data_output) # Actual benchmark runs for j in range(repetitions): start_time = timeit.default_timer() if impl_name == 'numpy': runNumpy(data_cube[:line_count], data_output) elif impl_name == 'opencl': whitebalance_times[j], savgol_times[j], pca_times[j], classification_times[j] = impl.run_pipeline_separate(data_cube[:line_count], data_output) elif impl_name == 'opencl_fused': impl.run(data_cube[:line_count], data_output) elif impl_name == 'cpu': impl.runCPU(data_cube[:line_count], data_output) elapsed_time = timeit.default_timer() - start_time iteration_times[j] = elapsed_time time.sleep(intermittent_sleep) # Sleep to simulate intermittent processing # Calculate and print throughput for this batch size mean_time = np.mean(iteration_times) std_dev = np.std(iteration_times) throughput = line_count / mean_time print(f" Results for {line_count} lines: {throughput:.2f} lines/sec (±{std_dev/mean_time*100:.2f}%)") # If OpenCL non-fused also print individual stage throughputs if impl_name in ['opencl'] and len(whitebalance_times) > 0: # Calculate mean time for each stage (convert from nanoseconds to seconds) wb_mean = np.mean(whitebalance_times) / 1e9 sg_mean = np.mean(savgol_times) / 1e9 pca_mean = np.mean(pca_times) / 1e9 cl_mean = np.mean(classification_times) / 1e9 # Calculate throughput for each stage wb_tput = line_count / wb_mean sg_tput = line_count / sg_mean pca_tput = line_count / pca_mean cl_tput = line_count / cl_mean # Print in condensed format print(f" Stage throughputs (lines/sec): WhiteBalance={wb_tput:.2f}, SavGol={sg_tput:.2f}, PCA={pca_tput:.2f}, Classify={cl_tput:.2f}") # Print percentage of total time total_stage_time = wb_mean + sg_mean + pca_mean + cl_mean wb_pct = (wb_mean / total_stage_time) * 100 sg_pct = (sg_mean / total_stage_time) * 100 pca_pct = (pca_mean / total_stage_time) * 100 cl_pct = (cl_mean / total_stage_time) * 100 print(f" Stage time distribution: WhiteBalance={wb_pct:.1f}%, SavGol={sg_pct:.1f}%, PCA={pca_pct:.1f}%, Classify={cl_pct:.1f}%") timing_results.append(iteration_times) if impl_name == 'opencl': whitebalance_results.append(whitebalance_times) savgol_results.append(savgol_times) pca_results.append(pca_times) classification_results.append(classification_times) print(f"Finished {impl_name} benchmark for {line_count} lines.") if impl_name == 'opencl': return timing_results, whitebalance_results, savgol_results, pca_results, classification_results else: return timing_results def filter_outliers(data, n_sigmas=2): filtered_data = data.copy() st_devs = np.zeros(len(data)) means = np.zeros(len(data)) for i, reps in enumerate(data): mean = np.mean(reps) std_dev = np.std(reps) filtered_data[i] = reps[np.abs(reps - mean) < n_sigmas * std_dev] st_devs[i] = np.std(filtered_data[i]) means[i] = np.mean(filtered_data[i]) return means, st_devs def process_results(timing_results): """Process the timing results to compute lines per second and standard deviation.""" means, st_devs = filter_outliers(timing_results) lines_per_sec = line_counts / means propagated_uncertainty = lines_per_sec * (st_devs / means) return lines_per_sec, propagated_uncertainty, means, st_devs # Also return raw timing data def save_results_to_csv(results, output_dir="benchmark_results"): """Save benchmark results to CSV files.""" # Create output directory if it doesn't exist os.makedirs(output_dir, exist_ok=True) # Calculate execution times for each implementation based on throughput (if not already available) for impl in ['numpy', 'opencl', 'opencl_fused', 'cpu']: if f'{impl}_time' not in results and f'{impl}_lines_per_sec' in results: results[f'{impl}_time'] = results['lines'] / results[f'{impl}_lines_per_sec'] results[f'{impl}_time_stddev'] = results[f'{impl}_time'] * (results[f'{impl}_std_dev'] / results[f'{impl}_lines_per_sec']) # Save implementation comparison data - with both throughput and timing data comparison_df = pd.DataFrame({ 'lines': results['lines'], # Throughput data 'numpy_throughput': results['numpy_lines_per_sec'], 'numpy_throughput_stddev': results['numpy_std_dev'], 'opencl_throughput': results['opencl_lines_per_sec'], 'opencl_throughput_stddev': results['opencl_std_dev'], 'opencl_fused_throughput': results['opencl_fused_lines_per_sec'], 'opencl_fused_throughput_stddev': results['opencl_fused_std_dev'], 'cpu_throughput': results['cpu_lines_per_sec'], 'cpu_throughput_stddev': results['cpu_std_dev'], # Time data (seconds) 'numpy_time': results['numpy_time'], 'numpy_time_stddev': results['numpy_time_stddev'], 'opencl_time': results['opencl_time'], 'opencl_time_stddev': results['opencl_time_stddev'], 'opencl_fused_time': results['opencl_fused_time'], 'opencl_fused_time_stddev': results['opencl_fused_time_stddev'], 'cpu_time': results['cpu_time'], 'cpu_time_stddev': results['cpu_time_stddev'] }) comparison_df.to_csv(f"{output_dir}/implementation_comparison.csv", index=False) # Save stage-wise data stage_throughput_df_opencl = pd.DataFrame({'lines': results['lines']}) stage_stddev_df_opencl = pd.DataFrame({'lines': results['lines']}) stage_percentage_df_opencl = pd.DataFrame({'lines': results['lines']}) stage_times_df_opencl = pd.DataFrame({'lines': results['lines']}) # New dataframe for stage times stage_times_stddev_df_opencl = pd.DataFrame({'lines': results['lines']}) # New dataframe for stage time stddev for stage_name in results['stage_names']: # OpenCL stage_throughput_df_opencl[stage_name + '_opencl'] = results['stage_throughputs_opencl'][stage_name] stage_stddev_df_opencl[stage_name + '_opencl'] = results['stage_std_devs_opencl'][stage_name] stage_percentage_df_opencl[stage_name + '_opencl'] = results['stage_percentages_opencl'][stage_name] stage_times_df_opencl[stage_name + '_opencl'] = results['stage_times_opencl'][stage_name] stage_times_stddev_df_opencl[stage_name + '_opencl'] = results['stage_times_stddev_opencl'][stage_name] # Save all the dataframes stage_throughput_df_opencl.to_csv(f"{output_dir}/stage_throughput_opencl.csv", index=False) stage_stddev_df_opencl.to_csv(f"{output_dir}/stage_stddev_opencl.csv", index=False) stage_percentage_df_opencl.to_csv(f"{output_dir}/stage_percentage_opencl.csv", index=False) stage_times_df_opencl.to_csv(f"{output_dir}/stage_times_opencl.csv", index=False) stage_times_stddev_df_opencl.to_csv(f"{output_dir}/stage_times_stddev_opencl.csv", index=False) print(f"Results saved to '{output_dir}' directory") def load_pca_model(impl, n_components): """Load a PCA model with specified number of components.""" model_dir = "models/pca_models/" PCA = np.ascontiguousarray(np.load(f"{model_dir}pca_components_{n_components}.npy").astype(np.float32)) PCAmean = np.load(f"{model_dir}pca_mean_{n_components}.npy").astype(np.float32) SVM_coef = np.load(f"{model_dir}svm_model_coef_{n_components}.npy").astype(np.float32) SVM_intercept = np.load(f"{model_dir}svm_model_intercept_{n_components}.npy").astype(np.float32) # Use the original white matrix and savgol coefficients savgol_coef = savgol_coeffs(Nwindow, Npoly, Nderiv) whiteMatrix = np.ascontiguousarray(np.load(model_dir + "../calibratedSVM/whiteMatrix.npy").astype(np.float32)) impl.load_model( PCA, PCAmean, SVM=SVM_coef, SVM_intercept=SVM_intercept, whitebalance=whiteMatrix, savgol_coef=savgol_coef, Wl_min=Wl_min, Wl_max=Wl_max, normalization_toggle=int(0), absorption_toggle=int(0), tuning_dims = [PCA.shape[0], Nx*16, Nc] ) return PCA, PCAmean, SVM_coef, SVM_intercept, whiteMatrix def benchmark_pca_models(batch_size=32, repetitions=10, warmup_iterations=3): """Benchmark performance with different PCA component counts.""" # Available PCA component counts (3-25) component_counts = list(range(3, 26)) # Initialize results dictionary pca_results = { 'components': component_counts, 'opencl_fused_times': [], 'opencl_fused_std_dev': [], 'opencl_times': [], 'opencl_std_dev': [], 'numpy_times': [], 'numpy_std_dev': [], 'cpu_times': [], 'cpu_std_dev': [], # Add PCA-specific timing arrays 'opencl_pca_times': [], 'opencl_pca_std_dev': [], } # Prepare data data_cube_full = np.load(data_dir + "image_3.npy").astype(np.uint8) data_cube = data_cube_full[:batch_size, :, :] Ny, Nx, Nc = data_cube.shape data_output = np.ascontiguousarray(np.zeros((batch_size, Nx), dtype=np.uint8)) # Run benchmark for each PCA component count for n_components in component_counts: print(f"\nBenchmarking with {n_components} PCA components:") # Load the specific model implementation = wrapper.Kernel() # Create a fresh instance of the kernel to avoid leaking memory PCA, PCAmean, SVM_coef, SVM_intercept, whiteMatrix = load_pca_model(implementation, n_components) # Benchmark each implementation # 1. OpenCL Fused print(" Running OpenCL Fused implementation...") # Warmup runs for _ in range(warmup_iterations): implementation.run(data_cube, data_output) opencl_fused_times = np.zeros(repetitions) for j in range(repetitions): start_time = timeit.default_timer() implementation.run(data_cube, data_output) opencl_fused_times[j] = timeit.default_timer() - start_time # Calculate and print throughput for OpenCL Fused mean_time = np.mean(opencl_fused_times) std_dev = np.std(opencl_fused_times) throughput = batch_size / mean_time print(f" OpenCL Fused: {throughput:.2f} lines/sec (±{std_dev/mean_time*100:.2f}%)") # 2. OpenCL print(" Running OpenCL implementation...") # Warmup runs for _ in range(warmup_iterations): implementation.run_pipeline_separate(data_cube, data_output) opencl_times = np.zeros(repetitions) opencl_pca_times = np.zeros(repetitions) # Track PCA-specific times for j in range(repetitions): start_time = timeit.default_timer() wb_time, sg_time, pca_time, cl_time = implementation.run_pipeline_separate(data_cube, data_output) opencl_times[j] = timeit.default_timer() - start_time opencl_pca_times[j] = pca_time / 1e9 # Convert from nanoseconds to seconds # Calculate and print throughput for OpenCL mean_time = np.mean(opencl_times) std_dev = np.std(opencl_times) throughput = batch_size / mean_time print(f" OpenCL: {throughput:.2f} lines/sec (±{std_dev/mean_time*100:.2f}%)") # Calculate and print PCA-specific throughput mean_pca_time = np.mean(opencl_pca_times) std_pca_dev = np.std(opencl_pca_times) pca_throughput = batch_size / mean_pca_time print(f" OpenCL PCA only: {pca_throughput:.2f} lines/sec (±{std_pca_dev/mean_pca_time*100:.2f}%)") # 3. NumPy print(" Running NumPy implementation...") # Warmup runs for _ in range(warmup_iterations): runNumpy(data_cube, data_output) numpy_times = np.zeros(repetitions) for j in range(repetitions): start_time = timeit.default_timer() runNumpy(data_cube, data_output) numpy_times[j] = timeit.default_timer() - start_time # Calculate and print throughput for NumPy mean_time = np.mean(numpy_times) std_dev = np.std(numpy_times) throughput = batch_size / mean_time print(f" NumPy: {throughput:.2f} lines/sec (±{std_dev/mean_time*100:.2f}%)") # 5. CPU print(" Running C++ CPU implementation...") # Warmup runs for _ in range(warmup_iterations): implementation.runCPU(data_cube, data_output) cpu_times = np.zeros(repetitions) for j in range(repetitions): start_time = timeit.default_timer() implementation.runCPU(data_cube, data_output) cpu_times[j] = timeit.default_timer() - start_time # Calculate and print throughput for CPU mean_time = np.mean(cpu_times) std_dev = np.std(cpu_times) throughput = batch_size / mean_time print(f" CPU: {throughput:.2f} lines/sec (±{std_dev/mean_time*100:.2f}%)") # Process results mean_opencl_fused, std_opencl_fused = np.mean(opencl_fused_times), np.std(opencl_fused_times) mean_opencl, std_opencl = np.mean(opencl_times), np.std(opencl_times) mean_numpy, std_numpy = np.mean(numpy_times), np.std(numpy_times) mean_cpu, std_cpu = np.mean(cpu_times), np.std(cpu_times) # Process PCA-specific timing results mean_opencl_pca, std_opencl_pca = np.mean(opencl_pca_times), np.std(opencl_pca_times) # Store results pca_results['opencl_fused_times'].append(mean_opencl_fused) pca_results['opencl_fused_std_dev'].append(std_opencl_fused) pca_results['opencl_times'].append(mean_opencl) pca_results['opencl_std_dev'].append(std_opencl) pca_results['numpy_times'].append(mean_numpy) pca_results['numpy_std_dev'].append(std_numpy) pca_results['cpu_times'].append(mean_cpu) pca_results['cpu_std_dev'].append(std_cpu) # Store PCA-specific timing results pca_results['opencl_pca_times'].append(mean_opencl_pca) pca_results['opencl_pca_std_dev'].append(std_opencl_pca) # Convert times to throughput (lines per second) for impl_name in ['opencl_fused', 'opencl', 'numpy', 'cpu']: times_key = f'{impl_name}_times' stddev_key = f'{impl_name}_std_dev' throughput_key = f'{impl_name}_lines_per_sec' throughput_stddev_key = f'{impl_name}_throughput_std_dev' # Add new keys to results dictionary pca_results[throughput_key] = batch_size / np.array(pca_results[times_key]) # Propagate error for throughput calculation pca_results[throughput_stddev_key] = pca_results[throughput_key] * ( np.array(pca_results[stddev_key]) / np.array(pca_results[times_key]) ) # Calculate PCA-specific throughput for impl_name in ['opencl']: times_key = f'{impl_name}_pca_times' stddev_key = f'{impl_name}_pca_std_dev' throughput_key = f'{impl_name}_pca_lines_per_sec' throughput_stddev_key = f'{impl_name}_pca_throughput_std_dev' # Add new keys to results dictionary pca_results[throughput_key] = batch_size / np.array(pca_results[times_key]) # Propagate error for throughput calculation pca_results[throughput_stddev_key] = pca_results[throughput_key] * ( np.array(pca_results[stddev_key]) / np.array(pca_results[times_key]) ) return pca_results def save_pca_results_to_csv(results, output_dir="benchmark_results"): """Save PCA model benchmark results to CSV files.""" os.makedirs(output_dir, exist_ok=True) # Save times data times_df = pd.DataFrame({ 'components': results['components'], 'opencl_fused_time': results['opencl_fused_times'], 'opencl_fused_stddev': results['opencl_fused_std_dev'], 'opencl_time': results['opencl_times'], 'opencl_stddev': results['opencl_std_dev'], 'numpy_time': results['numpy_times'], 'numpy_stddev': results['numpy_std_dev'], 'cpu_time': results['cpu_times'], 'cpu_stddev': results['cpu_std_dev'], # Add PCA-specific timing data 'opencl_pca_time': results['opencl_pca_times'], 'opencl_pca_stddev': results['opencl_pca_std_dev'], }) times_df.to_csv(f"{output_dir}/pca_model_times.csv", index=False) # Save throughput data throughput_df = pd.DataFrame({ 'components': results['components'], 'opencl_fused_throughput': results['opencl_fused_lines_per_sec'], 'opencl_fused_stddev': results['opencl_fused_throughput_std_dev'], 'opencl_throughput': results['opencl_lines_per_sec'], 'opencl_stddev': results['opencl_throughput_std_dev'], 'numpy_throughput': results['numpy_lines_per_sec'], 'numpy_stddev': results['numpy_throughput_std_dev'], 'cpu_throughput': results['cpu_lines_per_sec'], 'cpu_stddev': results['cpu_throughput_std_dev'], # Add PCA-specific throughput data 'opencl_pca_throughput': results['opencl_pca_lines_per_sec'], 'opencl_pca_stddev': results['opencl_pca_throughput_std_dev'], }) throughput_df.to_csv(f"{output_dir}/pca_model_throughput.csv", index=False) # Save PCA-only time and throughput data for clearer analysis pca_only_df = pd.DataFrame({ 'components': results['components'], 'opencl_pca_time': results['opencl_pca_times'], 'opencl_pca_time_stddev': results['opencl_pca_std_dev'], 'opencl_pca_throughput': results['opencl_pca_lines_per_sec'], 'opencl_pca_throughput_stddev': results['opencl_pca_throughput_std_dev'], }) pca_only_df.to_csv(f"{output_dir}/pca_only_performance.csv", index=False) print(f"PCA model benchmark results saved to '{output_dir}' directory") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Benchmark plastic sorting implementations") parser.add_argument("--benchmark-type", choices=["standard", "pca", "all"], default="standard", help="Type of benchmark to run: standard (varying line counts), pca (varying PCA components), or all") parser.add_argument("--pca-batch-size", type=int, default=16, help="Batch size to use for PCA benchmarks (default: 16)") parser.add_argument("--repetitions", type=int, default=20, help="Number of repetitions for each benchmark to gather statistics (default: 20)") args = parser.parse_args() # Use the specified number of repetitions repetitions = args.repetitions if args.benchmark_type in ["standard", "all"]: print(f"Running standard benchmarks (varying line counts) with {repetitions} repetitions...") # Run the standard benchmarks print("Starting benchmarks...") print("\n1. Running OpenCL Fused implementation...") opencl_fused_results = run_benchmark('opencl_fused', repetitions) results['opencl_fused_lines_per_sec'], results['opencl_fused_std_dev'], results['opencl_fused_time'], results['opencl_fused_time_stddev'] = process_results(opencl_fused_results) print("\n2. Running OpenCL implementation...") opencl_results, whitebalance_results, savgol_results, pca_results, classification_results = run_benchmark('opencl', repetitions) results['opencl_lines_per_sec'], results['opencl_std_dev'], results['opencl_time'], results['opencl_time_stddev'] = process_results(opencl_results) # Process stage-wise results - capture both throughput and raw timing results['stage_throughputs_opencl'] = {} results['stage_std_devs_opencl'] = {} results['stage_times_opencl'] = {} # Add timing information results['stage_times_stddev_opencl'] = {} # Add timing standard deviation stage_results = { 'whitebalance': process_results(whitebalance_results), 'savgol': process_results(savgol_results), 'pca': process_results(pca_results), 'classification': process_results(classification_results) } results['stage_names'] = ['whitebalance', 'savgol', 'pca', 'classification'] for stage_name in results['stage_names']: results['stage_throughputs_opencl'][stage_name] = stage_results[stage_name][0] results['stage_std_devs_opencl'][stage_name] = stage_results[stage_name][1] results['stage_times_opencl'][stage_name] = stage_results[stage_name][2] / 1e9 # Convert from nanoseconds to seconds results['stage_times_stddev_opencl'][stage_name] = stage_results[stage_name][3] / 1e9 # Convert from nanoseconds to seconds # Convert stage throughput to lines per second (from lines per nanoseconds) results['stage_throughputs_opencl'] = {k: v * 1e9 for k, v in results['stage_throughputs_opencl'].items()} # Convert stage standard deviation to lines per second (from lines per nanoseconds) results['stage_std_devs_opencl'] = {k: v * 1e9 for k, v in results['stage_std_devs_opencl'].items()} # Calculate time per line for each stage (inverse of throughput) stage_time_per_line_opencl = {k: 1.0/v for k, v in results['stage_throughputs_opencl'].items()} opencl_total_time = sum(stage_time_per_line_opencl[k] for k in results['stage_names']) results['stage_percentages_opencl'] = { k: (stage_time_per_line_opencl[k] / opencl_total_time) * 100 for k in results['stage_names'] } print("\n3. Running NumPy implementation...") numpy_data = run_benchmark('numpy', repetitions) results['numpy_lines_per_sec'], results['numpy_std_dev'], results['numpy_time'], results['numpy_time_stddev'] = process_results(numpy_data) print("\n4. Running C++ CPU implementation...") cpu_data = run_benchmark('cpu', repetitions) results['cpu_lines_per_sec'], results['cpu_std_dev'], results['cpu_time'], results['cpu_time_stddev'] = process_results(cpu_data) # Save all results to CSV print("\nSaving benchmark results to CSV files...") save_results_to_csv(results) if args.benchmark_type in ["pca", "all"]: print(f"\n\nRunning PCA model benchmarks (varying component counts) with batch size {args.pca_batch_size} and {repetitions} repetitions...") pca_results = benchmark_pca_models(batch_size=args.pca_batch_size, repetitions=repetitions) save_pca_results_to_csv(pca_results) print("All benchmarks completed successfully!")