################### ### Description ### ################### # This script is for general imports that are used in multiple scripts. # Includes general helper functions, model mean and variance definitions, # and model creation functions with PyMC. ############## ### Author ### ############## # Sam Crofts (sam.crofts@ed.ac.uk) ######################################################################################################################## import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from tqdm import tqdm from multiprocessing import Pool from multiprocessing import cpu_count import anndata as ad import json import pickle import math import random import arviz as az from scipy.stats import bootstrap import pymc as pm from sklearn.feature_selection import r_regression from scipy.stats import bootstrap from itertools import product from functools import partial from scipy.stats import norm from scipy.stats import beta from scipy.stats import ks_2samp from scipy.stats import spearmanr import contextlib import os ########################## #General helper functions# ########################## # Function to uniformly sample participants from the data with regard to age # (as much as possible given the underlying distribution of ages in the data) def sample_to_uniform_age(amdata, n_part): remaining_idx = list(amdata.var.index) used_idx = [] for i in range(n_part): sampled_age = np.random.uniform(amdata.var.age.min(), amdata.var.age.max()) near_idx = np.abs(amdata[:, remaining_idx].var.age - sampled_age).argmin() used_idx.append(remaining_idx[near_idx]) remaining_idx.pop(near_idx) return amdata[:,used_idx] # Function to individually match samples from one dataset to another based on age def match_ages(amdata_to_sample_from, amdata_to_match_on): n_part = amdata_to_match_on.shape[1] remaining_idx = list(amdata_to_sample_from.var.index) used_idx = [] for i in range(n_part): sampled_age = amdata_to_match_on.var.age.iloc[i] #just keep those that appear in remaining_idx near_idx = np.abs(amdata_to_sample_from[:, remaining_idx].var.age - sampled_age).argmin() used_idx.append(remaining_idx[near_idx]) remaining_idx.pop(near_idx) return amdata_to_sample_from[:,used_idx] # Function to individually match samples from one dataset to another based on both age and sex def match_ages_and_sex(amdata_to_sample_from, amdata_to_match_on): n_part = amdata_to_match_on.shape[1] remaining_idx = list(amdata_to_sample_from.var.index) used_idx = [] for i in range(n_part): sampled_age = amdata_to_match_on.var.age.iloc[i] sex = amdata_to_match_on.var.sex.iloc[i] amdata_to_sample_from_sex_matched = amdata_to_sample_from[:, amdata_to_sample_from.var.sex==sex] remaining_idx_temp = list(amdata_to_sample_from_sex_matched.var.index) #just keep those that apepar in remaining_idx remaining_idx_temp = [x for x in remaining_idx_temp if x in remaining_idx] near_idx = np.abs(amdata_to_sample_from_sex_matched[:, remaining_idx_temp].var.age - sampled_age).argmin() used_idx.append(remaining_idx_temp[near_idx]) remaining_idx.pop(near_idx) return amdata_to_sample_from[:,used_idx] # Function to ensure that the mean and variance are valid for a Beta distribution def ensure_valid_beta_mu_var(mu, var): """ Ensure that the mean (mu) and variance (var) are valid for a Beta distribution. The variance must be less than mu * (1 - mu) and greater than 0. Parameters: ----------- mu : array-like Mean values to be checked. var : array-like Variance values to be checked. Returns: -------- mu_clipped : array-like Clipped mean values within valid range [0.001, 0.999]. var_clipped : array-like Clipped variance values within valid range [1e-6, max_variance]. """ # Clip mean to be within (0, 1) mu_clipped = pm.math.clip(mu, 0.001, 0.999) # Calculate maximum allowed variance for Beta distribution max_variance = mu_clipped * (1 - mu_clipped) * 0.99 # 99% of theoretical max # Clip variance to be within (0, max_variance) var_clipped = pm.math.clip(var, 1e-10, max_variance) return mu_clipped, var_clipped ##################################### #Model mean and variance definitions# ##################################### ###Model of methylation changes via stem cell divisons. See mathematical details in supplementary### ##Parameters## # N = number of stem cells # M = number of methylated stem cells # t = time (years) # s = division rate per stem cell per year # n = eta, theoretical final methylation level at t=infinity, equal to (u)/(m+u) # m = P_(M->U), probability of a methylated site becoming unmethylated per division # u = P_(U->M), probability of an unmethylated site becoming methylated per division # w = omega, equal to m+u # Z = total number of methylated cells relative to N (i.e. Z(t) = M(t)/N(t)) # p = theoretical initial methylation level at t=0 # c = initial variance of methylated stem cells, M, at t=0 def mean_Z(t, s, n, w, p): return n + np.exp(-2*s*t*w)*(p-n) def cov_NM(t, s, N, n, w, p): return 2*s*N*t*(n+np.exp(-2*s*t*w)*(1-w)*(p-n)) def var_N(t,s,N): return 2*s*N*t def var_M(t, s, N, n, w, p, c): A0 = N*n*(1 +np.power(w, 2) -n*(1 + np.power(w, 2) - 4*s*w*t) ) A2 = 2*N*(n-p)*(-1 + w - np.power(w, 2) + 2*n*(1 - w*(1+2*s*t) + np.power(w,2)*(1+2*s*t))) A4 = (np.power(n,2)*N*(-3 + 4*w - 3*np.power(w,2)) + 2*(c*w + N*p*(-1 + w - np.power(w, 2))) + n*N*(1+4*p - 2*w*(1+2*p)+np.power(w,2)*(1+4*p)) ) vM = (A0 + A2*np.exp(-2*s*t*w) + A4*np.exp(-4*s*t*w))/(2*w) return vM def var_Z_order_1(t, s, N, n, w, p, c): mZ = mean_Z(t, s, n, w, p) covNM = cov_NM(t, s, N, n, w, p) vN = var_N(t, s, N) vM = var_M(t, s, N, n, w, p, c) return (np.power(mZ, 2)*vN + vM - 2*mZ*covNM)/(np.power(N,2)) # Function to get prior parameters from data and prep data for modelling def get_prior_params_and_prep(amdata_load): # Ensure age is 0 at the youngest individual amdata_load.var.age = amdata_load.var.age - amdata_load.var.age.min() # Get the youngest individuals to make the prior for the starting mean and sd early_idx = amdata_load[:, amdata_load.var.age