import os
import sys
import subprocess
import timeit
import time
import numpy as np
import pandas as pd
from scipy.signal import savgol_coeffs, savgol_filter
import os.path
from os.path import relpath
import argparse
def read_csv(fname):
    """Load the ``N``, ``T`` and ``TSD`` columns of a CSV file as NumPy arrays.

    Args:
        fname: Path (or anything ``pandas.read_csv`` accepts) of the CSV file.

    Returns:
        A 3-tuple ``(N, T, TSD)`` of NumPy arrays, one per column. If the file
        cannot be read or one of the columns is missing, a message is printed
        and ``(None, None, None)`` is returned instead of raising.
    """
    try:
        frame = pd.read_csv(fname)
        columns = tuple(frame[name].to_numpy() for name in ("N", "T", "TSD"))
    except Exception as e:
        # Best-effort reader: report the problem and let the caller skip this file.
        print(f"Could not read {fname}, skipping due to {e}")
        return None, None, None
    return columns
# --- Global configuration, kernel module import and model loading ---
# Everything below runs at import time and defines module-level names that the
# benchmark functions further down read directly.
batch_size = 128 ## Memory limit for the Camera; also the number of lines taken from the data cube and the row count of the benchmark output buffers
# Optional rebuild of the compiled kernels (disabled):
# subprocess.run(["make", "clean"])
# subprocess.run(["make", "modules"])
# Put the build directory on sys.path BEFORE importing the compiled kernels.
build_dir = os.path.relpath("build")
sys.path.append(build_dir)
# Import the compiled plastic_sorting module from the build directory.
import build.plastic_sorting as wrapper
# NOTE: numpy is already imported at the top of the file; this re-import is redundant.
import numpy as np
# Shared kernel instance used by run_benchmark().
impl = wrapper.Kernel()
# Configuration
# Wl_min / Wl_max are forwarded unchanged to Kernel.load_model
# (presumably the spectral band/wavelength index range - TODO confirm against the kernel).
Wl_min = 0
Wl_max = 600
Npoly = np.int32(3) # Polynomial order (Savitzky-Golay)
Nderiv = np.int32(0) # Derivative order (Savitzky-Golay)
Nwindow = np.int32(13) # Window size (Savitzky-Golay)
#LineRange = [10, 400] #Middle 100 lines of the datacube, demonstrating that we can process any number of lines
# Placeholder only: Ny is reassigned from the data cube's shape once the cube is loaded.
Ny = 500
model_dir = "models/calibratedSVM/"
data_dir = "data/"
# Model arrays are loaded from .npy files and cast to float32; the PCA components
# and the white reference are additionally made C-contiguous.
PCA = np.ascontiguousarray(np.load(model_dir + "pca_model_components.npy").astype(np.float32))
PCAmean = (np.load(model_dir + "pca_model_mean.npy").astype(np.float32))
SVM_coef = (np.load(model_dir + "svm_model_coef.npy").astype(np.float32))
SVM_intercept = (np.load(model_dir + "svm_model_intercept.npy").astype(np.float32))
whiteMatrix = np.ascontiguousarray(np.load(model_dir + "whiteMatrix.npy").astype(np.float32))
# Savitzky-Golay filter coefficients for the settings above; passed to Kernel.load_model.
savgol_coef = savgol_coeffs(Nwindow, Npoly, Nderiv)
def runNumpy(input, output, *, white_matrix=None, pca_components=None,
             pca_mean=None, svm_coef=None, svm_intercept=None,
             window=None, polyorder=None, deriv=None):
    """Classify every line of ``input`` with the NumPy/SciPy reference pipeline.

    Each line ``input[i]`` is processed on its own, in this order: white
    calibration (divide by the white reference, scale by 255), Savitzky-Golay
    filtering along axis 1, subtraction of the PCA mean, projection onto the
    PCA components, and an arg-max over the linear SVM scores. The index of
    the highest score is written to ``output[i, :]`` as ``uint8``. The
    per-line loop is kept on purpose so this stays comparable as a baseline.

    Args:
        input: 3-D array indexed as ``input[line, :, :]``. (The name shadows
            the builtin but is part of the existing call interface.)
        output: 2-D array written in place; rows at index
            ``input.shape[0]`` and beyond are left untouched.
        white_matrix: White reference used for calibration.
        pca_components: PCA component matrix; its transpose is applied.
        pca_mean: Vector subtracted before the PCA projection.
        svm_coef: SVM coefficient matrix; its transpose is applied.
        svm_intercept: Vector added to the SVM scores.
        window: Savitzky-Golay window length.
        polyorder: Savitzky-Golay polynomial order.
        deriv: Savitzky-Golay derivative order.

        Every keyword-only argument defaults to ``None``, which selects the
        corresponding module-level value (``whiteMatrix``, ``PCA``,
        ``PCAmean``, ``SVM_coef``, ``SVM_intercept``, ``Nwindow``, ``Npoly``,
        ``Nderiv``), so existing two-argument calls behave as before.

    Returns:
        None. Results are stored in ``output``.
    """
    # Resolve overrides lazily so a fully specified call never touches the globals.
    white = whiteMatrix if white_matrix is None else white_matrix
    components = PCA if pca_components is None else pca_components
    mean = PCAmean if pca_mean is None else pca_mean
    coef = SVM_coef if svm_coef is None else svm_coef
    intercept = SVM_intercept if svm_intercept is None else svm_intercept
    sg_window = Nwindow if window is None else window
    sg_order = Npoly if polyorder is None else polyorder
    sg_deriv = Nderiv if deriv is None else deriv

    for i in range(input.shape[0]):
        calibrated = np.divide(input[i, :, :], white) * 255
        smoothed = savgol_filter(calibrated, sg_window, sg_order, sg_deriv, axis=1)
        centered = smoothed - mean
        projection = np.dot(centered, components.T)
        scores = np.dot(projection, coef.T) + intercept
        output[i, :] = np.argmax(scores, axis=1).astype(np.uint8)
# Load the benchmark data cube: keep at most batch_size leading lines and cast to uint8.
data_cube = np.load(data_dir + "image_3.npy")[:batch_size, :, :].astype(np.uint8)
# data_cube = np.roll(np.load(data_dir + "image_5.npy").astype(np.uint8)[600:1700, :, :], -7, axis=2)
# Cube dimensions, taken from the array's three axes in order.
Ny = data_cube.shape[0]
Nx = data_cube.shape[1]
Nc = data_cube.shape[2]
# Zero-initialised result buffer. NOTE(review): in the code visible here the
# benchmark functions allocate their own output buffers and never read this one.
output = np.full((Ny, Nx), 0, dtype=np.uint8)
# Upload the calibratedSVM model and filter settings to the shared kernel instance.
# NOTE(review): the meaning of the tuning_dims entries (in particular Nx*16) is
# defined by the kernel wrapper and cannot be confirmed from this file.
impl.load_model(PCA, PCAmean,
                SVM = SVM_coef,
                SVM_intercept = SVM_intercept,
                whitebalance = whiteMatrix,
                savgol_coef = savgol_coef,
                Wl_min = Wl_min,
                Wl_max = Wl_max,
                normalization_toggle = int(0),
                absorption_toggle = int(0),
                tuning_dims = [PCA.shape[0], Nx*16, Nc])
# Set up benchmarking parameters
# Create a sequence of powers of 2 (1, 2, 4, 8, ...) up to the batch size (capped at 512)
max_power = int(np.log2(min(batch_size, 512)))
line_counts = np.array([2**i for i in range(0, max_power + 1)]) # Starts at 2^0 = 1 line
repetitions = 20 # Default number of repetitions; replaced by --repetitions when run as a script
# Shared results dictionary filled in by the __main__ driver.
# 'cpu_lines_per_sec' is overwritten there; 'cpu_times' and 'std_dev' are not
# written anywhere else in the code visible here.
results = {
    'lines': line_counts,
    'cpu_times': [],
    'cpu_lines_per_sec': [],
    'std_dev': [] # Store standard deviations
}
# Announce the benchmarks (printed at import time; the runs themselves start in the __main__ block)
print("Running benchmarks with varying data sizes...")
def run_benchmark(impl_name, repetitions, intermittent_sleep=0, warmup_iterations=1):
    """Time one implementation over every entry of the global ``line_counts``.

    For each line count the first ``line_count`` lines of the global
    ``data_cube`` are processed ``warmup_iterations`` times untimed and then
    ``repetitions`` times timed with ``timeit.default_timer``. Throughput is
    printed per line count. For ``'opencl'`` the four per-stage durations
    returned by ``impl.run_pipeline_separate`` are recorded as well (they are
    divided by 1e9 for printing, i.e. treated as nanoseconds).

    Args:
        impl_name: One of ``'numpy'``, ``'opencl'``, ``'opencl_fused'`` or
            ``'cpu'``.
        repetitions: Number of timed runs per line count (at least 1).
        intermittent_sleep: Seconds to sleep after every timed run; the sleep
            itself is not part of the measured time.
        warmup_iterations: Untimed runs executed before timing starts.

    Returns:
        For ``'opencl'``: a 5-tuple ``(timing, whitebalance, savgol, pca,
        classification)`` where each element is a list holding one array of
        length ``repetitions`` per line count. ``timing`` is wall time in
        seconds; the stage arrays hold the raw values reported by the kernel.
        For every other implementation: just the ``timing`` list.

    Raises:
        ValueError: If ``impl_name`` is not a known implementation or
            ``repetitions`` is smaller than 1.
    """
    known_implementations = ('numpy', 'opencl', 'opencl_fused', 'cpu')
    if impl_name not in known_implementations:
        # Previously an unknown name fell through every branch and "timed" nothing.
        raise ValueError(
            f"Unknown implementation {impl_name!r}; expected one of {known_implementations}"
        )
    if repetitions < 1:
        # Zero repetitions would only produce NaN statistics.
        raise ValueError(f"repetitions must be at least 1, got {repetitions}")

    # Resolve the callable once so no dispatch logic runs inside the timed region.
    runners = {
        'numpy': runNumpy,
        'opencl': impl.run_pipeline_separate,
        'opencl_fused': impl.run,
        'cpu': impl.runCPU,
    }
    run = runners[impl_name]
    reports_stage_times = impl_name == 'opencl'

    data_output = np.ascontiguousarray(np.zeros((batch_size, data_cube.shape[1]), dtype=np.uint8))
    timing_results = []
    whitebalance_results = []
    savgol_results = []
    pca_results = []
    classification_results = []

    for i, line_count in enumerate(line_counts):
        print(f"Running {impl_name} benchmark with {line_count} lines ({i+1}/{len(line_counts)})...")
        iteration_times = np.zeros(repetitions)
        whitebalance_times = np.zeros(repetitions)
        savgol_times = np.zeros(repetitions)
        pca_times = np.zeros(repetitions)
        classification_times = np.zeros(repetitions)
        # Slice once per line count; creating the view is not part of the workload.
        batch = data_cube[:line_count]

        # Warmup runs
        for _ in range(warmup_iterations):
            run(batch, data_output)

        # Actual benchmark runs
        for j in range(repetitions):
            start_time = timeit.default_timer()
            returned = run(batch, data_output)
            iteration_times[j] = timeit.default_timer() - start_time
            if reports_stage_times:
                whitebalance_times[j], savgol_times[j], pca_times[j], classification_times[j] = returned
            time.sleep(intermittent_sleep)  # Sleep to simulate intermittent processing

        # Calculate and print throughput for this batch size
        mean_time = np.mean(iteration_times)
        std_dev = np.std(iteration_times)
        throughput = line_count / mean_time
        print(f" Results for {line_count} lines: {throughput:.2f} lines/sec (±{std_dev/mean_time*100:.2f}%)")

        if reports_stage_times:
            # Mean time per stage (convert from nanoseconds to seconds)
            wb_mean = np.mean(whitebalance_times) / 1e9
            sg_mean = np.mean(savgol_times) / 1e9
            pca_mean = np.mean(pca_times) / 1e9
            cl_mean = np.mean(classification_times) / 1e9
            # Throughput per stage
            wb_tput = line_count / wb_mean
            sg_tput = line_count / sg_mean
            pca_tput = line_count / pca_mean
            cl_tput = line_count / cl_mean
            print(f" Stage throughputs (lines/sec): WhiteBalance={wb_tput:.2f}, SavGol={sg_tput:.2f}, PCA={pca_tput:.2f}, Classify={cl_tput:.2f}")
            # Share of the summed stage time spent in each stage
            total_stage_time = wb_mean + sg_mean + pca_mean + cl_mean
            wb_pct = (wb_mean / total_stage_time) * 100
            sg_pct = (sg_mean / total_stage_time) * 100
            pca_pct = (pca_mean / total_stage_time) * 100
            cl_pct = (cl_mean / total_stage_time) * 100
            print(f" Stage time distribution: WhiteBalance={wb_pct:.1f}%, SavGol={sg_pct:.1f}%, PCA={pca_pct:.1f}%, Classify={cl_pct:.1f}%")

        timing_results.append(iteration_times)
        if reports_stage_times:
            whitebalance_results.append(whitebalance_times)
            savgol_results.append(savgol_times)
            pca_results.append(pca_times)
            classification_results.append(classification_times)
        print(f"Finished {impl_name} benchmark for {line_count} lines.")

    if reports_stage_times:
        return timing_results, whitebalance_results, savgol_results, pca_results, classification_results
    return timing_results
def filter_outliers(data, n_sigmas=2):
    """Return per-series mean and standard deviation after outlier rejection.

    For every series in ``data`` the samples farther than ``n_sigmas``
    standard deviations from the series mean are discarded, and the mean and
    (population) standard deviation of the remaining samples are reported.

    The comparison is inclusive, so a series with zero spread (a single
    repetition, or identical timings) keeps all of its samples instead of
    losing every one and producing NaN. If the filter would still remove
    everything (possible for ``n_sigmas`` below 1), the unfiltered samples
    are used.

    Args:
        data: Sequence of 1-D sample collections (list of arrays or a 2-D
            array). It is not modified.
        n_sigmas: Rejection threshold in units of the series' standard
            deviation.

    Returns:
        ``(means, st_devs)``: two float arrays with one entry per series.
    """
    n_series = len(data)
    means = np.zeros(n_series)
    st_devs = np.zeros(n_series)
    for i, reps in enumerate(data):
        samples = np.asarray(reps, dtype=float)
        center = np.mean(samples)
        spread = np.std(samples)
        kept = samples[np.abs(samples - center) <= n_sigmas * spread]
        if kept.size == 0:
            # Never report statistics of an empty selection.
            kept = samples
        means[i] = np.mean(kept)
        st_devs[i] = np.std(kept)
    return means, st_devs
def process_results(timing_results):
    """Turn raw per-line-count timings into throughput with uncertainty.

    Args:
        timing_results: One collection of repeated timings per entry of the
            global ``line_counts``.

    Returns:
        ``(lines_per_time, uncertainty, mean_times, time_std_devs)``. The
        means and standard deviations come from ``filter_outliers``;
        throughput is ``line_counts / mean_times`` and its uncertainty is the
        throughput scaled by the relative spread of the timings. All values
        are in the time unit of the input.
    """
    mean_times, time_std_devs = filter_outliers(timing_results)
    throughput = line_counts / mean_times
    relative_spread = time_std_devs / mean_times
    throughput_uncertainty = throughput * relative_spread
    return throughput, throughput_uncertainty, mean_times, time_std_devs  # raw timing statistics included
def save_results_to_csv(results, output_dir="benchmark_results"):
    """Write the standard benchmark results to CSV files in ``output_dir``.

    Files written:
        * ``implementation_comparison.csv``: throughput and time (with
          standard deviations) for numpy, opencl, opencl_fused and cpu.
        * ``stage_throughput_opencl.csv``, ``stage_stddev_opencl.csv``,
          ``stage_percentage_opencl.csv``, ``stage_times_opencl.csv`` and
          ``stage_times_stddev_opencl.csv``: one ``<stage>_opencl`` column
          per entry of ``results['stage_names']``.

    Args:
        results: Results dictionary. For every implementation it needs
            ``<name>_lines_per_sec`` and ``<name>_std_dev``. If
            ``<name>_time`` is absent, it and ``<name>_time_stddev`` are
            derived from the throughput and stored back into ``results``
            (the dictionary is updated in place, as before). It also needs
            ``lines``, ``stage_names`` and the five ``stage_*_opencl``
            mappings.
        output_dir: Target directory; created if it does not exist.

    Raises:
        KeyError: If a required entry is missing from ``results``.
    """
    os.makedirs(output_dir, exist_ok=True)
    implementations = ('numpy', 'opencl', 'opencl_fused', 'cpu')

    # Derive execution times from throughput where they are not already available.
    for name in implementations:
        time_key = f'{name}_time'
        throughput_key = f'{name}_lines_per_sec'
        if time_key not in results and throughput_key in results:
            results[time_key] = results['lines'] / results[throughput_key]
            results[f'{name}_time_stddev'] = results[time_key] * (
                results[f'{name}_std_dev'] / results[throughput_key]
            )

    # Comparison table: all throughput columns first, then all time columns (seconds).
    comparison_columns = {'lines': results['lines']}
    for name in implementations:
        comparison_columns[f'{name}_throughput'] = results[f'{name}_lines_per_sec']
        comparison_columns[f'{name}_throughput_stddev'] = results[f'{name}_std_dev']
    for name in implementations:
        comparison_columns[f'{name}_time'] = results[f'{name}_time']
        comparison_columns[f'{name}_time_stddev'] = results[f'{name}_time_stddev']
    pd.DataFrame(comparison_columns).to_csv(
        os.path.join(output_dir, "implementation_comparison.csv"), index=False
    )

    # Stage-wise tables: build every table first, then write them all.
    stage_tables = (
        ("stage_throughput_opencl.csv", 'stage_throughputs_opencl'),
        ("stage_stddev_opencl.csv", 'stage_std_devs_opencl'),
        ("stage_percentage_opencl.csv", 'stage_percentages_opencl'),
        ("stage_times_opencl.csv", 'stage_times_opencl'),
        ("stage_times_stddev_opencl.csv", 'stage_times_stddev_opencl'),
    )
    frames = []
    for filename, source_key in stage_tables:
        frame = pd.DataFrame({'lines': results['lines']})
        for stage_name in results['stage_names']:
            frame[stage_name + '_opencl'] = results[source_key][stage_name]
        frames.append((filename, frame))
    for filename, frame in frames:
        frame.to_csv(os.path.join(output_dir, filename), index=False)

    print(f"Results saved to '{output_dir}' directory")
def load_pca_model(impl, n_components):
    """Load the model trained with ``n_components`` PCA components into ``impl``.

    The PCA components, PCA mean and SVM parameters are read from
    ``models/pca_models/``; the white reference is the one stored with the
    calibratedSVM model, and the Savitzky-Golay coefficients are recomputed
    from the module-level filter settings.

    Args:
        impl: Kernel object whose ``load_model`` method receives the model.
        n_components: Component count selecting the ``*_{n_components}.npy``
            files.

    Returns:
        ``(PCA, PCAmean, SVM_coef, SVM_intercept, whiteMatrix)`` as float32
        arrays, exactly as passed to ``impl.load_model``.
    """
    model_dir = "models/pca_models/"

    def load_float32(relative_name):
        # Read one .npy file below model_dir and cast it to float32.
        return np.load(f"{model_dir}{relative_name}").astype(np.float32)

    components = np.ascontiguousarray(load_float32(f"pca_components_{n_components}.npy"))
    mean = load_float32(f"pca_mean_{n_components}.npy")
    coef = load_float32(f"svm_model_coef_{n_components}.npy")
    intercept = load_float32(f"svm_model_intercept_{n_components}.npy")
    # The white reference and filter coefficients are those of the original model.
    filter_coefficients = savgol_coeffs(Nwindow, Npoly, Nderiv)
    white = np.ascontiguousarray(load_float32("../calibratedSVM/whiteMatrix.npy"))

    impl.load_model(
        components, mean,
        SVM=coef,
        SVM_intercept=intercept,
        whitebalance=white,
        savgol_coef=filter_coefficients,
        Wl_min=Wl_min,
        Wl_max=Wl_max,
        normalization_toggle=int(0),
        absorption_toggle=int(0),
        tuning_dims=[components.shape[0], Nx*16, Nc],
    )
    return components, mean, coef, intercept, white
def _time_calls(func, args, repetitions, warmup_iterations):
    """Call ``func(*args)`` untimed for warmup, then time ``repetitions`` calls.

    Returns ``(wall_times, returned)``: an array of wall-clock seconds per
    timed call and the list of values those calls returned.
    """
    for _ in range(warmup_iterations):
        func(*args)
    wall_times = np.zeros(repetitions)
    returned = []
    for j in range(repetitions):
        start_time = timeit.default_timer()
        value = func(*args)
        wall_times[j] = timeit.default_timer() - start_time
        returned.append(value)
    return wall_times, returned


def _run_numpy_with_model(cube, out, white, components, mean, coef, intercept):
    """NumPy/SciPy reference pipeline (same steps as ``runNumpy``) for an explicit model."""
    for i in range(cube.shape[0]):
        calibrated = np.divide(cube[i, :, :], white) * 255
        calibrated = savgol_filter(calibrated, Nwindow, Npoly, Nderiv, axis=1)
        calibrated = calibrated - mean
        projection = np.dot(calibrated, components.T)
        out[i, :] = np.argmax(np.dot(projection, coef.T) + intercept, axis=1).astype(np.uint8)


def _print_throughput(label, n_lines, times):
    """Print the mean throughput and relative spread of one timing series."""
    mean_time = np.mean(times)
    std_dev = np.std(times)
    throughput = n_lines / mean_time
    print(f" {label}: {throughput:.2f} lines/sec (±{std_dev/mean_time*100:.2f}%)")


def benchmark_pca_models(batch_size=32, repetitions=10, warmup_iterations=3):
    """Benchmark all implementations for PCA models with 3 to 25 components.

    For every component count a fresh kernel is created, the matching model
    is loaded with ``load_pca_model``, and the fused OpenCL, staged OpenCL,
    NumPy and C++ CPU implementations are timed in that order on the first
    ``batch_size`` lines of ``image_3.npy``.

    The NumPy measurement uses the model loaded for the current component
    count. It previously went through ``runNumpy``, which reads the
    module-level model, so its timings never depended on the component count.
    Throughput is computed from the number of lines actually processed, which
    only differs from ``batch_size`` if the cube has fewer lines.

    Args:
        batch_size: Maximum number of lines to process per call.
        repetitions: Timed runs per implementation and component count.
        warmup_iterations: Untimed runs before each timed series.

    Returns:
        Dictionary with ``components`` and, for each of ``opencl_fused``,
        ``opencl``, ``numpy``, ``cpu`` and ``opencl_pca`` (the PCA stage of
        the staged OpenCL pipeline): ``<name>_times`` and ``<name>_std_dev``
        (lists of mean / standard deviation in seconds) plus
        ``<name>_lines_per_sec`` and ``<name>_throughput_std_dev`` (arrays).
    """
    # Available PCA component counts (3-25)
    component_counts = list(range(3, 26))
    series_names = ('opencl_fused', 'opencl', 'numpy', 'cpu', 'opencl_pca')

    pca_results = {'components': component_counts}
    for name in series_names:
        pca_results[f'{name}_times'] = []
        pca_results[f'{name}_std_dev'] = []

    # Prepare data: slice first so only the needed lines are converted.
    data_cube = np.load(data_dir + "image_3.npy")[:batch_size, :, :].astype(np.uint8)
    n_lines, Nx, _ = data_cube.shape
    data_output = np.ascontiguousarray(np.zeros((batch_size, Nx), dtype=np.uint8))
    kernel_args = (data_cube, data_output)

    for n_components in component_counts:
        print(f"\nBenchmarking with {n_components} PCA components:")
        # Create a fresh instance of the kernel to avoid leaking memory
        implementation = wrapper.Kernel()
        components, mean, coef, intercept, white = load_pca_model(implementation, n_components)
        timings = {}

        # 1. OpenCL Fused
        print(" Running OpenCL Fused implementation...")
        timings['opencl_fused'], _ = _time_calls(
            implementation.run, kernel_args, repetitions, warmup_iterations)
        _print_throughput("OpenCL Fused", n_lines, timings['opencl_fused'])

        # 2. OpenCL (staged); also keep the PCA stage time reported by the kernel
        print(" Running OpenCL implementation...")
        timings['opencl'], stage_reports = _time_calls(
            implementation.run_pipeline_separate, kernel_args, repetitions, warmup_iterations)
        # Third reported value is the PCA stage; convert from nanoseconds to seconds
        timings['opencl_pca'] = np.array([report[2] for report in stage_reports], dtype=float) / 1e9
        _print_throughput("OpenCL", n_lines, timings['opencl'])
        _print_throughput("OpenCL PCA only", n_lines, timings['opencl_pca'])

        # 3. NumPy, run with the model loaded for this component count
        print(" Running NumPy implementation...")
        numpy_args = (data_cube, data_output, white, components, mean, coef, intercept)
        timings['numpy'], _ = _time_calls(
            _run_numpy_with_model, numpy_args, repetitions, warmup_iterations)
        _print_throughput("NumPy", n_lines, timings['numpy'])

        # 4. CPU
        print(" Running C++ CPU implementation...")
        timings['cpu'], _ = _time_calls(
            implementation.runCPU, kernel_args, repetitions, warmup_iterations)
        _print_throughput("CPU", n_lines, timings['cpu'])

        # Store mean and standard deviation for every series
        for name in series_names:
            pca_results[f'{name}_times'].append(np.mean(timings[name]))
            pca_results[f'{name}_std_dev'].append(np.std(timings[name]))

    # Convert times to throughput (lines per second) and propagate the error
    for name in series_names:
        mean_times = np.array(pca_results[f'{name}_times'])
        std_devs = np.array(pca_results[f'{name}_std_dev'])
        throughput = n_lines / mean_times
        pca_results[f'{name}_lines_per_sec'] = throughput
        pca_results[f'{name}_throughput_std_dev'] = throughput * (std_devs / mean_times)

    return pca_results
def save_pca_results_to_csv(results, output_dir="benchmark_results"):
    """Write the PCA-model benchmark results to CSV files in ``output_dir``.

    Files written:
        * ``pca_model_times.csv``: ``<name>_time`` / ``<name>_stddev``
          columns taken from ``<name>_times`` / ``<name>_std_dev``.
        * ``pca_model_throughput.csv``: ``<name>_throughput`` /
          ``<name>_stddev`` columns taken from ``<name>_lines_per_sec`` /
          ``<name>_throughput_std_dev``.
        * ``pca_only_performance.csv``: time and throughput of the OpenCL PCA
          stage only.

    ``<name>`` runs over ``opencl_fused``, ``opencl``, ``numpy``, ``cpu`` and
    ``opencl_pca``; every file starts with a ``components`` column.

    Args:
        results: Dictionary as returned by ``benchmark_pca_models``.
        output_dir: Target directory; created if it does not exist.

    Raises:
        KeyError: If a required entry is missing from ``results``.
    """
    os.makedirs(output_dir, exist_ok=True)
    series_names = ('opencl_fused', 'opencl', 'numpy', 'cpu', 'opencl_pca')

    # Save times data
    time_columns = {'components': results['components']}
    for name in series_names:
        time_columns[f'{name}_time'] = results[f'{name}_times']
        time_columns[f'{name}_stddev'] = results[f'{name}_std_dev']
    pd.DataFrame(time_columns).to_csv(
        os.path.join(output_dir, "pca_model_times.csv"), index=False
    )

    # Save throughput data
    throughput_columns = {'components': results['components']}
    for name in series_names:
        throughput_columns[f'{name}_throughput'] = results[f'{name}_lines_per_sec']
        throughput_columns[f'{name}_stddev'] = results[f'{name}_throughput_std_dev']
    pd.DataFrame(throughput_columns).to_csv(
        os.path.join(output_dir, "pca_model_throughput.csv"), index=False
    )

    # Save PCA-only time and throughput data for clearer analysis
    pca_only_columns = {
        'components': results['components'],
        'opencl_pca_time': results['opencl_pca_times'],
        'opencl_pca_time_stddev': results['opencl_pca_std_dev'],
        'opencl_pca_throughput': results['opencl_pca_lines_per_sec'],
        'opencl_pca_throughput_stddev': results['opencl_pca_throughput_std_dev'],
    }
    pd.DataFrame(pca_only_columns).to_csv(
        os.path.join(output_dir, "pca_only_performance.csv"), index=False
    )

    print(f"PCA model benchmark results saved to '{output_dir}' directory")
if __name__ == "__main__":
    # Command-line interface: choose which benchmark family to run and how often.
    cli = argparse.ArgumentParser(description="Benchmark plastic sorting implementations")
    cli.add_argument(
        "--benchmark-type",
        choices=["standard", "pca", "all"],
        default="standard",
        help="Type of benchmark to run: standard (varying line counts), pca (varying PCA components), or all",
    )
    cli.add_argument(
        "--pca-batch-size",
        type=int,
        default=16,
        help="Batch size to use for PCA benchmarks (default: 16)",
    )
    cli.add_argument(
        "--repetitions",
        type=int,
        default=20,
        help="Number of repetitions for each benchmark to gather statistics (default: 20)",
    )
    args = cli.parse_args()

    # The command-line value replaces the module-level default.
    repetitions = args.repetitions
    # process_results() returns its four values in exactly this order.
    summary_fields = ('lines_per_sec', 'std_dev', 'time', 'time_stddev')

    if args.benchmark_type in ("standard", "all"):
        print(f"Running standard benchmarks (varying line counts) with {repetitions} repetitions...")
        print("Starting benchmarks...")

        print("\n1. Running OpenCL Fused implementation...")
        opencl_fused_results = run_benchmark('opencl_fused', repetitions)
        for field, values in zip(summary_fields, process_results(opencl_fused_results)):
            results[f'opencl_fused_{field}'] = values

        print("\n2. Running OpenCL implementation...")
        opencl_results, *per_stage_raw = run_benchmark('opencl', repetitions)
        for field, values in zip(summary_fields, process_results(opencl_results)):
            results[f'opencl_{field}'] = values

        # Stage-wise results, in the order run_benchmark returns them.
        results['stage_names'] = ['whitebalance', 'savgol', 'pca', 'classification']
        stage_results = {
            stage_name: process_results(raw)
            for stage_name, raw in zip(results['stage_names'], per_stage_raw)
        }
        # Stage timings are reported in nanoseconds: throughput and its
        # uncertainty are scaled up to "per second", times are scaled down to seconds.
        results['stage_throughputs_opencl'] = {
            k: stage_results[k][0] * 1e9 for k in results['stage_names']
        }
        results['stage_std_devs_opencl'] = {
            k: stage_results[k][1] * 1e9 for k in results['stage_names']
        }
        results['stage_times_opencl'] = {
            k: stage_results[k][2] / 1e9 for k in results['stage_names']
        }
        results['stage_times_stddev_opencl'] = {
            k: stage_results[k][3] / 1e9 for k in results['stage_names']
        }

        # Share of each stage in the summed time per line (inverse of throughput).
        stage_time_per_line_opencl = {
            k: 1.0/v for k, v in results['stage_throughputs_opencl'].items()
        }
        opencl_total_time = sum(stage_time_per_line_opencl[k] for k in results['stage_names'])
        results['stage_percentages_opencl'] = {
            k: (stage_time_per_line_opencl[k] / opencl_total_time) * 100
            for k in results['stage_names']
        }

        print("\n3. Running NumPy implementation...")
        numpy_data = run_benchmark('numpy', repetitions)
        for field, values in zip(summary_fields, process_results(numpy_data)):
            results[f'numpy_{field}'] = values

        print("\n4. Running C++ CPU implementation...")
        cpu_data = run_benchmark('cpu', repetitions)
        for field, values in zip(summary_fields, process_results(cpu_data)):
            results[f'cpu_{field}'] = values

        # Save all results to CSV
        print("\nSaving benchmark results to CSV files...")
        save_results_to_csv(results)

    if args.benchmark_type in ("pca", "all"):
        print(f"\n\nRunning PCA model benchmarks (varying component counts) with batch size {args.pca_batch_size} and {repetitions} repetitions...")
        pca_results = benchmark_pca_models(batch_size=args.pca_batch_size, repetitions=repetitions)
        save_pca_results_to_csv(pca_results)

    print("All benchmarks completed successfully!")