###################
### Description ###
###################
# This script is for preprocessing the mammal data from Mammalian Methylation Consortium (publicly available on the
# Gene Expression Omnibus under accession number GSE223748, previously converted to AnnData format with
# CpGs as obs and samples as vars). Specifically, it calculates various CpG level statistcics (e.g. spearman r)
# - both before and after trimming the pre-sexual maturity ages - and adds them to the Anndata object.
# Also adds some missing lifespan data.
# Inputs: List of Anndata objects of mammal methylation beta values, with CpGs as obs and samples as vars
# (created by the Mammalian Methylation Consortium (Haghani et al. 2023) and publicly available on the Gene Expression
# Omnibus under accession number GSE223748)
##############
### Author ###
##############
# Sam Crofts (sam.crofts@ed.ac.uk)
##################################
##################################
#Imports
import sys
sys.path.append("../..") # fix to import modules from root
from src.general_imports import *
from scipy.interpolate import make_interp_spline
from statsmodels.nonparametric.smoothers_lowess import lowess
from sklearn.neighbors import KernelDensity
from scipy.signal import find_peaks
#set seed for reproducibility
random.seed(10)
# bring in mammal data
file = open('../data/pan_mammal_Blood.pk', 'rb')
data_list = pickle.load(file)
file.close()
#for each mammal
for amdata in data_list:
print(amdata.uns['common_name'])
median_age = np.median(amdata.var.age.values)
#for each site
for site in tqdm(amdata.obs.index):
#fit linear model
data = {'age': amdata.var.age.values, 'meth': amdata[site].X.T[:,0]}
df = pd.DataFrame(data)
model = smf.ols('meth ~ age', data=df).fit()
residuals = model.resid
beta_overall = model.params["age"]
r2_overall = model.rsquared_adj
beta_pval = model.pvalues["age"]
mean_meth = np.mean(amdata[site].X.T)
# Perform Breusch-Pagan test for heteroscedasticity
_, p_value, _, _ = sm.stats.diagnostic.het_breuschpagan(residuals, model.model.exog)
below_median_values = amdata[site].X.T[amdata.var.age.values < median_age]
above_median_values = amdata[site].X.T[amdata.var.age.values >= median_age]
# Calculate variances for each half
variance_first_half = np.var(below_median_values)
variance_second_half = np.var(above_median_values)
variance_overall = np.var(amdata[site].X.T)
# Spearman correlation
spearman = sp.stats.spearmanr(amdata.var.age.values, amdata[site].X.T)[1]
#add to the anndata object
amdata.obs.loc[site, 'r2'] = r2_overall
amdata.obs.loc[site, 'beta'] = beta_overall
amdata.obs.loc[site, 'bp_pval'] = p_value
amdata.obs.loc[site, 'spearman'] = spearman
amdata.obs.loc[site, 'beta_pval'] = beta_pval
amdata.obs.loc[site, 'variance_overall'] = variance_overall
amdata.obs.loc[site, 'variance_first_half'] = variance_first_half
amdata.obs.loc[site, 'variance_second_half'] = variance_second_half
amdata.obs.loc[site, 'mean_meth'] = mean_meth
#also need to calculate the number of peaks
site_data = amdata[site].X.flatten()
dens = sm.nonparametric.KDEUnivariate(site_data)
dens.fit(kernel='gau', bw=0.05)
x = np.linspace(-0.1,1.1,100) #restrict range to (0,1)
y = dens.evaluate(x)
y = y/max(y)
amdata.obs.loc[site, 'n_peaks'] = find_peaks(y, prominence=0.2)[0].shape[0]
# Save
with open('../data/pan_mammal_blood_with_site_details.pk', 'wb') as f:
pickle.dump(data_list, f)
#re-open again for recalculating values after sexual maturity trimming
file = open('../data/pan_mammal_blood_with_site_details.pk', 'rb')
data_list = pickle.load(file)
file.close()
# for each mammal
for data in data_list:
# append sexual maturity data where missing
if data.uns['organism']=='Cervus canadensis':
data.uns['common_name'] = 'Wapity elk'
data.uns['lifespan'] = 25
data.uns['sex_maturity'] = 2
if data.uns['organism']=='Cervus elaphus':
data.uns['common_name'] = 'Red deer'
data.uns['lifespan'] = 31.5
data.uns['sex_maturity'] = 2.17
print(data.uns['common_name'])
for site in tqdm(data.obs.index):
#only keep ages above sexual maturity
trimmed_data = data[:,data.var.age>=data.uns['sex_maturity']]
#get median age
median_age = np.median(trimmed_data.var.age.values)
df = {'age': trimmed_data.var.age.values, 'meth': trimmed_data[site].X.T[:,0]}
#make into a dataframe
df = pd.DataFrame(df)
#fit linear model
model = smf.ols('meth ~ age', data=df).fit()
#get slope
beta = model.params["age"]
#get variance in first half and second half, and change
below_median_values = trimmed_data[site].X.T[trimmed_data.var.age.values < median_age]
above_median_values = trimmed_data[site].X.T[trimmed_data.var.age.values >= median_age]
variance_first_half = np.var(below_median_values)
variance_second_half = np.var(above_median_values)
variance_change = variance_second_half - variance_first_half
#Now let's get rid of sites that are multimodal
site_data = trimmed_data[site].X.flatten()
dens = sm.nonparametric.KDEUnivariate(site_data)
#Make bandwidth a function of the SD of the data
sd = np.sqrt(np.var(site_data))
ninety_five_pc_of_data_range = 4*sd
bw = 0.05*ninety_five_pc_of_data_range
dens.fit(kernel='gau', bw=bw)
x = np.linspace(-0.1,1.1,100) #restrict range to (0,1)
y = dens.evaluate(x)
y = y/max(y)
#add to the anndata object
data.obs.loc[site, 'n_peaks_recal'] = find_peaks(y, prominence=0.001)[0].shape[0]
data.obs.loc[site, 'variance_first_half_recalc'] = variance_first_half
data.obs.loc[site, 'variance_second_half_recalc'] = variance_second_half
data.var.loc[site, 'var_change_recal'] = variance_change
data.obs.loc[site, 'beta_recal'] = beta
data.obs.loc[site, 'mean_meth_recal'] = np.mean(trimmed_data[site].X.T)
with open('../data/pan_mammal_blood_with_site_details.pk', 'wb') as f:
pickle.dump(data_list, f)
#re-open to filter CpGs that don't map well
file = open('../../data/pan_mammal_blood_with_site_details.pk', 'rb')
data_list = pickle.load(file)
file.close()
#print common names
for data in data_list:
print(data.uns['common_name'])
#bring in data/Manifest_HorvathMammalMethylChip40.csv
manifest = pd.read_csv('../../data/Manifest_HorvathMammalMethylChip40.csv', index_col=1)
#Just keep data_list if common_name is in either "Dog", "Cat", "Rat", "Bottlenose dolphin", "Cow", "Mouse"
data_list = [x for x in data_list if x.uns['common_name'] in ["Dog", "Cat", "Rat", "Bottlenose dolphin", "Cattle", "Mouse", "Horse", "Sheep", "Rhesus macaque", "Vervet",
"Pig", "Yellow-bellied marmot", "Asian elephant", "Marmoset"]]
#if column "Mouse.GRCm38.100_CGstart" is NA, set remove those CpGs from "Mouse" in data_list
for data in data_list:
if data.uns['common_name'] == 'Mouse':
na_cpgs = manifest[manifest['Mouse.GRCm38.100_CGstart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Mouse']
data_list.append(data)
if data.uns['common_name'] == 'Rat':
na_cpgs = manifest[manifest['Rat.Rnor.6.0.101_CGstart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Rat']
data_list.append(data)
if data.uns['common_name'] == 'Dog':
na_cpgs = manifest[manifest['Dog.CanFam3.1_CGstart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Dog']
data_list.append(data)
if data.uns['common_name'] == 'Cat':
na_cpgs = manifest[manifest['Cat.FC.9.0.100_CGstart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Cat']
data_list.append(data)
if data.uns['common_name'] == 'Bottlenose dolphin':
na_cpgs = manifest[manifest['Dolphine.turTru1.100_CGstart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Bottlenose dolphin']
data_list.append(data)
if data.uns['common_name'] == 'Cattle':
na_cpgs = manifest[manifest['Cattle.ARS-UCD1.2_CGstart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Cattle']
data_list.append(data)
if data.uns['common_name'] == 'Horse':
na_cpgs = manifest[manifest['Horse.EquCab3.0.100_CGstart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Horse']
data_list.append(data)
if data.uns['common_name'] == 'Sheep':
#open data/Ovis_aries_v4.HorvathMammalMethylChip40.v1.csv
file = pd.read_csv('../../data/Ovis_aries_v4.HorvathMammalMethylChip40.v1.csv', index_col=0)
#make first column index
file.index = file.iloc[:,0]
#na cpgs are those where column probeStart is na
na_cpgs = file[file['probeStart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Sheep']
data_list.append(data)
if data.uns['common_name'] == 'Rhesus macaque':
#open Macaca_mulatta.mmul_10.100.HorvathMammalMethylChip40.v1
file = pd.read_csv('../../data/Macaca_mulatta.mmul_10.100.HorvathMammalMethylChip40.v1.csv', index_col=0)
#make first column index
file.index = file.iloc[:,0]
#na cpgs are those where column probeStart is na
na_cpgs = file[file['probeStart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Rhesus macaque']
data_list.append(data)
if data.uns['common_name'] == 'Vervet':
#open Macaca_mulatta.mmul_10.100.HorvathMammalMethylChip40.v1
file = pd.read_csv('../../data/Marmota_flaviventris.gcf_003676075.2_gsc_ybm_2.0.HorvathMammalMethylChip40.v1.csv', index_col=0)
#make first column index
file.index = file.iloc[:,0]
#na cpgs are those where column probeStart is na
na_cpgs = file[file['probeStart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Vervet']
data_list.append(data)
if data.uns['common_name'] == 'Pig':
na_cpgs = manifest[manifest['Pig.Sscrofa11.1.100_CGstart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Pig']
data_list.append(data)
if data.uns['common_name'] == 'Yellow-bellied marmot':
file = pd.read_csv('../../data/Marmota_flaviventris.gcf_003676075.2_gsc_ybm_2.0.HorvathMammalMethylChip40.v1.csv', index_col=0)
#make first column index
file.index = file.iloc[:,0]
#na cpgs are those where column probeStart is na
na_cpgs = file[file['probeStart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Yellow-bellied marmot']
data_list.append(data)
if data.uns['common_name'] == 'Asian elephant':
file = pd.read_csv('../../data/Elephas_maximus.kandula3.cat.augustus.HorvathMammalMethylChip40.v1.csv', index_col=0)
#make first column index
file.index = file.iloc[:,0]
#na cpgs are those where column probeStart is na
na_cpgs = file[file['probeStart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Asian elephant']
data_list.append(data)
if data.uns['common_name'] == 'Marmoset':
file = pd.read_csv('../../data/Callithrix_jacchus.asm275486v1.100.HorvathMammalMethylChip40.v1.csv', index_col=0)
#make first column index
file.index = file.iloc[:,0]
#na cpgs are those where column probeStart is na
na_cpgs = file[file['probeStart'].isna()].index
data = data[~data.obs.index.isin(na_cpgs)]
#remove old data from list and replace with new data
data_list = [x for x in data_list if x.uns['common_name'] != 'Marmoset']
data_list.append(data)
with open('../../data/pan_mammal_blood_with_site_details_filtered.pk', 'wb') as f:
pickle.dump(data_list, f)