SCARLET / notebooks / 0_data_preprocessing / preprocessing_human_data.py
preprocessing_human_data.py
Raw
###################
### Description ###
###################

# This script is for proprocessing the GenScot data (already in methylation beta values in AnnData format, with 
# CpGs as obs and samples as vars). Specifically, it calculates various CpG level statistcics (e.g. spearman r) 
# and adds them to the Anndata object.

# Inputs: Anndata object of GenScot methylation beta values (wave 3), with CpGs as obs and samples as vars.
# Outputs: Anndata object with CpG-level statistics added to obs.

##############
### Author ###
##############
 
# Sam Crofts (sam.crofts@ed.ac.uk)

##################################
##################################

#Imports 
import sys
sys.path.append("../..")   # fix to import modules from root
from src.general_imports import *
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.signal import find_peaks 
from scipy.stats import spearmanr

#Import Wave 3 GenScot data
all_data = ad.read_h5ad('/exports/igmm/eddie/tchandra-lab/EJYang/methylclock/GS_mval/wave3_meta.h5ad', 'r')
amdata = all_data.to_memory()

median_age = np.median(amdata.var.age.values)

#initialize columns
amdata.obs['r2'] = 0
amdata.obs['beta'] = 0
amdata.obs['bp_pval'] = 0
amdata.obs['spearman_stat'] = 0
amdata.obs['spearman_pval'] = 0
amdata.obs['beta_pval'] = 0
amdata.obs['variance_overall'] = 0
amdata.obs['variance_first_half'] = 0
amdata.obs['variance_second_half'] = 0
amdata.obs['mean_meth'] = 0
amdata.obs['var_change'] = 0
amdata.obs['var_change_perc'] = 0
amdata.obs['white_pval'] = 0
amdata.obs['n_peaks'] = 0
amdata.obs['slope_below_median'] = 0
amdata.obs['slope_above_median'] = 0

#loop through sites
for site in tqdm(amdata.obs.index):

    #fit linear model
    data = {'age': amdata.var.age.values, 'meth': amdata[site].X.T[:,0]}
    df = pd.DataFrame(data)

    model = smf.ols('meth ~ age', data=df).fit()
    residuals = model.resid
    beta_overall = model.params["age"]
    r2_overall = model.rsquared_adj
    beta_pval = model.pvalues["age"]
    intercept_overall = model.params["Intercept"]

    mean_meth = np.mean(amdata[site].X.T)

    # Perform Breusch-Pagan test for heteroscedasticity
    _, p_value_bp, _, _ = sm.stats.diagnostic.het_breuschpagan(residuals, model.model.exog)

    below_median_values = amdata[site].X.T[amdata.var.age.values < median_age]
    above_median_values = amdata[site].X.T[amdata.var.age.values >= median_age]
    
    # Calculate variances for each half
    variance_first_half = np.var(below_median_values)
    variance_second_half = np.var(above_median_values)
    variance_overall = np.var(amdata[site].X.T)

    # Spearman correlation
    spearman_pval = spearmanr(amdata.var.age.values, amdata[site].X.T)[1]
    spearman_stat = spearmanr(amdata.var.age.values, amdata[site].X.T)[0]

    # Perform White test for heteroscedasticity
    _, p_value_white, _, _ = sm.stats.diagnostic.het_white(residuals, model.model.exog)

    #add to the anndata object
    amdata.obs.loc[site, 'r2'] = r2_overall
    amdata.obs.loc[site, 'beta'] = beta_overall
    amdata.obs.loc[site, 'bp_pval'] = p_value_bp
    amdata.obs.loc[site, 'spearman_stat'] = spearman_stat
    amdata.obs.loc[site, 'spearman_pval'] = spearman_pval
    amdata.obs.loc[site, 'beta_pval'] = beta_pval
    amdata.obs.loc[site, 'variance_overall'] = variance_overall
    amdata.obs.loc[site, 'variance_first_half'] = variance_first_half
    amdata.obs.loc[site, 'variance_second_half'] = variance_second_half
    amdata.obs.loc[site, 'mean_meth'] = mean_meth
    amdata.obs.loc[site, 'var_change'] = variance_second_half - variance_first_half
    amdata.obs.loc[site, 'var_change_perc'] = ((variance_second_half - variance_first_half)/variance_overall)*100
    amdata.obs.loc[site, 'white_pval'] = p_value_white
    amdata.obs.loc[site, 'intercept'] = intercept_overall
    
    #Now let's get rid of sites that are multimodal 
    site_data = amdata[site].X.flatten()
    dens = sm.nonparametric.KDEUnivariate(site_data)
    #Make bandwidth a function of the SD of the data
    sd = np.sqrt(np.var(site_data))
    ninety_five_pc_of_data_range = 4*sd
    bw = 0.05*ninety_five_pc_of_data_range

    #making sure we get no errors
    if bw == 0:
        bw = 0.1
        
    dens.fit(kernel='gau', bw=bw)
    x = np.linspace(-0.1,1.1,100) #restrict range to (0,1)
    y = dens.evaluate(x)
    y = y/max(y)

    amdata.obs.loc[site, 'n_peaks'] = find_peaks(y, prominence=0.001)[0].shape[0]  

    #for each site, calculate slope above and below median age
    #data below median age
    below_median_mask = amdata.var.age.values < median_age
    data_below = {'age': amdata.var.age.values[below_median_mask], 'meth': amdata[site].X.T[below_median_mask].flatten()}
    df_below = pd.DataFrame(data_below)

    model_below = smf.ols('meth ~ age', data=df_below).fit()
    slope_below = model_below.params["age"]

    #data above median age
    above_median_mask = amdata.var.age.values >= median_age
    data_above = {'age': amdata.var.age.values[above_median_mask], 'meth': amdata[site].X.T[above_median_mask].flatten()}
    df_above = pd.DataFrame(data_above)

    model_above = smf.ols('meth ~ age', data=df_above).fit()
    slope_above = model_above.params["age"]

    #add to anndata
    amdata.obs.loc[site, 'slope_below_median'] = slope_below
    amdata.obs.loc[site, 'slope_above_median'] = slope_above 

#save
with open('../data/genscot_full_with_site_details.pk', 'wb') as f:
    pickle.dump(amdata, f)