################### ### Description ### ################### # This script is for preprocessing the mammal data from Mammalian Methylation Consortium (publicly available on the # Gene Expression Omnibus under accession number GSE223748, previously converted to AnnData format with # CpGs as obs and samples as vars). Specifically, it calculates various CpG level statistcics (e.g. spearman r) # - both before and after trimming the pre-sexual maturity ages - and adds them to the Anndata object. # Also adds some missing lifespan data. # Inputs: List of Anndata objects of mammal methylation beta values, with CpGs as obs and samples as vars # (created by the Mammalian Methylation Consortium (Haghani et al. 2023) and publicly available on the Gene Expression # Omnibus under accession number GSE223748) ############## ### Author ### ############## # Sam Crofts (sam.crofts@ed.ac.uk) ################################## ################################## #Imports import sys sys.path.append("../..") # fix to import modules from root from src.general_imports import * from scipy.interpolate import make_interp_spline from statsmodels.nonparametric.smoothers_lowess import lowess from sklearn.neighbors import KernelDensity from scipy.signal import find_peaks #set seed for reproducibility random.seed(10) # bring in mammal data file = open('../data/pan_mammal_Blood.pk', 'rb') data_list = pickle.load(file) file.close() #for each mammal for amdata in data_list: print(amdata.uns['common_name']) median_age = np.median(amdata.var.age.values) #for each site for site in tqdm(amdata.obs.index): #fit linear model data = {'age': amdata.var.age.values, 'meth': amdata[site].X.T[:,0]} df = pd.DataFrame(data) model = smf.ols('meth ~ age', data=df).fit() residuals = model.resid beta_overall = model.params["age"] r2_overall = model.rsquared_adj beta_pval = model.pvalues["age"] mean_meth = np.mean(amdata[site].X.T) # Perform Breusch-Pagan test for heteroscedasticity _, p_value, _, _ = sm.stats.diagnostic.het_breuschpagan(residuals, model.model.exog) below_median_values = amdata[site].X.T[amdata.var.age.values < median_age] above_median_values = amdata[site].X.T[amdata.var.age.values >= median_age] # Calculate variances for each half variance_first_half = np.var(below_median_values) variance_second_half = np.var(above_median_values) variance_overall = np.var(amdata[site].X.T) # Spearman correlation spearman = sp.stats.spearmanr(amdata.var.age.values, amdata[site].X.T)[1] #add to the anndata object amdata.obs.loc[site, 'r2'] = r2_overall amdata.obs.loc[site, 'beta'] = beta_overall amdata.obs.loc[site, 'bp_pval'] = p_value amdata.obs.loc[site, 'spearman'] = spearman amdata.obs.loc[site, 'beta_pval'] = beta_pval amdata.obs.loc[site, 'variance_overall'] = variance_overall amdata.obs.loc[site, 'variance_first_half'] = variance_first_half amdata.obs.loc[site, 'variance_second_half'] = variance_second_half amdata.obs.loc[site, 'mean_meth'] = mean_meth #also need to calculate the number of peaks site_data = amdata[site].X.flatten() dens = sm.nonparametric.KDEUnivariate(site_data) dens.fit(kernel='gau', bw=0.05) x = np.linspace(-0.1,1.1,100) #restrict range to (0,1) y = dens.evaluate(x) y = y/max(y) amdata.obs.loc[site, 'n_peaks'] = find_peaks(y, prominence=0.2)[0].shape[0] # Save with open('../data/pan_mammal_blood_with_site_details.pk', 'wb') as f: pickle.dump(data_list, f) #re-open again for recalculating values after sexual maturity trimming file = open('../data/pan_mammal_blood_with_site_details.pk', 'rb') data_list = pickle.load(file) file.close() # for each mammal for data in data_list: # append sexual maturity data where missing if data.uns['organism']=='Cervus canadensis': data.uns['common_name'] = 'Wapity elk' data.uns['lifespan'] = 25 data.uns['sex_maturity'] = 2 if data.uns['organism']=='Cervus elaphus': data.uns['common_name'] = 'Red deer' data.uns['lifespan'] = 31.5 data.uns['sex_maturity'] = 2.17 print(data.uns['common_name']) for site in tqdm(data.obs.index): #only keep ages above sexual maturity trimmed_data = data[:,data.var.age>=data.uns['sex_maturity']] #get median age median_age = np.median(trimmed_data.var.age.values) df = {'age': trimmed_data.var.age.values, 'meth': trimmed_data[site].X.T[:,0]} #make into a dataframe df = pd.DataFrame(df) #fit linear model model = smf.ols('meth ~ age', data=df).fit() #get slope beta = model.params["age"] #get variance in first half and second half, and change below_median_values = trimmed_data[site].X.T[trimmed_data.var.age.values < median_age] above_median_values = trimmed_data[site].X.T[trimmed_data.var.age.values >= median_age] variance_first_half = np.var(below_median_values) variance_second_half = np.var(above_median_values) variance_change = variance_second_half - variance_first_half #Now let's get rid of sites that are multimodal site_data = trimmed_data[site].X.flatten() dens = sm.nonparametric.KDEUnivariate(site_data) #Make bandwidth a function of the SD of the data sd = np.sqrt(np.var(site_data)) ninety_five_pc_of_data_range = 4*sd bw = 0.05*ninety_five_pc_of_data_range dens.fit(kernel='gau', bw=bw) x = np.linspace(-0.1,1.1,100) #restrict range to (0,1) y = dens.evaluate(x) y = y/max(y) #add to the anndata object data.obs.loc[site, 'n_peaks_recal'] = find_peaks(y, prominence=0.001)[0].shape[0] data.obs.loc[site, 'variance_first_half_recalc'] = variance_first_half data.obs.loc[site, 'variance_second_half_recalc'] = variance_second_half data.var.loc[site, 'var_change_recal'] = variance_change data.obs.loc[site, 'beta_recal'] = beta data.obs.loc[site, 'mean_meth_recal'] = np.mean(trimmed_data[site].X.T) with open('../data/pan_mammal_blood_with_site_details.pk', 'wb') as f: pickle.dump(data_list, f) #re-open to filter CpGs that don't map well file = open('../../data/pan_mammal_blood_with_site_details.pk', 'rb') data_list = pickle.load(file) file.close() #print common names for data in data_list: print(data.uns['common_name']) #bring in data/Manifest_HorvathMammalMethylChip40.csv manifest = pd.read_csv('../../data/Manifest_HorvathMammalMethylChip40.csv', index_col=1) #Just keep data_list if common_name is in either "Dog", "Cat", "Rat", "Bottlenose dolphin", "Cow", "Mouse" data_list = [x for x in data_list if x.uns['common_name'] in ["Dog", "Cat", "Rat", "Bottlenose dolphin", "Cattle", "Mouse", "Horse", "Sheep", "Rhesus macaque", "Vervet", "Pig", "Yellow-bellied marmot", "Asian elephant", "Marmoset"]] #if column "Mouse.GRCm38.100_CGstart" is NA, set remove those CpGs from "Mouse" in data_list for data in data_list: if data.uns['common_name'] == 'Mouse': na_cpgs = manifest[manifest['Mouse.GRCm38.100_CGstart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Mouse'] data_list.append(data) if data.uns['common_name'] == 'Rat': na_cpgs = manifest[manifest['Rat.Rnor.6.0.101_CGstart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Rat'] data_list.append(data) if data.uns['common_name'] == 'Dog': na_cpgs = manifest[manifest['Dog.CanFam3.1_CGstart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Dog'] data_list.append(data) if data.uns['common_name'] == 'Cat': na_cpgs = manifest[manifest['Cat.FC.9.0.100_CGstart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Cat'] data_list.append(data) if data.uns['common_name'] == 'Bottlenose dolphin': na_cpgs = manifest[manifest['Dolphine.turTru1.100_CGstart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Bottlenose dolphin'] data_list.append(data) if data.uns['common_name'] == 'Cattle': na_cpgs = manifest[manifest['Cattle.ARS-UCD1.2_CGstart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Cattle'] data_list.append(data) if data.uns['common_name'] == 'Horse': na_cpgs = manifest[manifest['Horse.EquCab3.0.100_CGstart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Horse'] data_list.append(data) if data.uns['common_name'] == 'Sheep': #open data/Ovis_aries_v4.HorvathMammalMethylChip40.v1.csv file = pd.read_csv('../../data/Ovis_aries_v4.HorvathMammalMethylChip40.v1.csv', index_col=0) #make first column index file.index = file.iloc[:,0] #na cpgs are those where column probeStart is na na_cpgs = file[file['probeStart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Sheep'] data_list.append(data) if data.uns['common_name'] == 'Rhesus macaque': #open Macaca_mulatta.mmul_10.100.HorvathMammalMethylChip40.v1 file = pd.read_csv('../../data/Macaca_mulatta.mmul_10.100.HorvathMammalMethylChip40.v1.csv', index_col=0) #make first column index file.index = file.iloc[:,0] #na cpgs are those where column probeStart is na na_cpgs = file[file['probeStart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Rhesus macaque'] data_list.append(data) if data.uns['common_name'] == 'Vervet': #open Macaca_mulatta.mmul_10.100.HorvathMammalMethylChip40.v1 file = pd.read_csv('../../data/Marmota_flaviventris.gcf_003676075.2_gsc_ybm_2.0.HorvathMammalMethylChip40.v1.csv', index_col=0) #make first column index file.index = file.iloc[:,0] #na cpgs are those where column probeStart is na na_cpgs = file[file['probeStart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Vervet'] data_list.append(data) if data.uns['common_name'] == 'Pig': na_cpgs = manifest[manifest['Pig.Sscrofa11.1.100_CGstart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Pig'] data_list.append(data) if data.uns['common_name'] == 'Yellow-bellied marmot': file = pd.read_csv('../../data/Marmota_flaviventris.gcf_003676075.2_gsc_ybm_2.0.HorvathMammalMethylChip40.v1.csv', index_col=0) #make first column index file.index = file.iloc[:,0] #na cpgs are those where column probeStart is na na_cpgs = file[file['probeStart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Yellow-bellied marmot'] data_list.append(data) if data.uns['common_name'] == 'Asian elephant': file = pd.read_csv('../../data/Elephas_maximus.kandula3.cat.augustus.HorvathMammalMethylChip40.v1.csv', index_col=0) #make first column index file.index = file.iloc[:,0] #na cpgs are those where column probeStart is na na_cpgs = file[file['probeStart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Asian elephant'] data_list.append(data) if data.uns['common_name'] == 'Marmoset': file = pd.read_csv('../../data/Callithrix_jacchus.asm275486v1.100.HorvathMammalMethylChip40.v1.csv', index_col=0) #make first column index file.index = file.iloc[:,0] #na cpgs are those where column probeStart is na na_cpgs = file[file['probeStart'].isna()].index data = data[~data.obs.index.isin(na_cpgs)] #remove old data from list and replace with new data data_list = [x for x in data_list if x.uns['common_name'] != 'Marmoset'] data_list.append(data) with open('../../data/pan_mammal_blood_with_site_details_filtered.pk', 'wb') as f: pickle.dump(data_list, f)