Note: the number and quality of accessible scRNA Seq datasets are rapidly increasing, with more and more platforms (e.g. sfaira, cellxgene) providing streamlined access to single cell omics data.

The following notebook shows how we used several scRNA Seq datasets in order to screen for off-tumor toxicities of CAR T target epitopes.

Since the analysis in this notebook depends heavily on the version of specific packages, current versions may require adapting the code provided below.

The notebook reproduces the preprocessing of multiple scRNA Seq datasets from various healthy tissues and the main figures from the corresponding paper.

First we import required packages:

In [1]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import anndata
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
import os
import sfaira

from os import listdir
from os.path import isfile, join

import warnings
from rpy2.rinterface import RRuntimeWarning
from rpy2.robjects import pandas2ri

%load_ext rpy2.ipython

warnings.filterwarnings("ignore", category=RRuntimeWarning)
pandas2ri.activate()
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
sc.settings.verbosity = 3

#Define a nice colour map for gene expression
# 128 shades of red, preceded by 20 light grey shades for the low end
colors2 = plt.cm.Reds(np.linspace(0, 1, 128))
colors3 = plt.cm.Greys_r(np.linspace(0.7,0.8,20))
# stack grey (low values) on top of red (high values) into one colour table
colorsComb = np.vstack([colors3, colors2])
mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)

# Global scanpy figure defaults. Note that the default colour map set here is
# 'Reds'; `mymap` has to be passed explicitly to a plotting call to be used.
sc.set_figure_params(vector_friendly=True, color_map='Reds',
                     dpi=200,transparent=True, fontsize=14)

palette=['#3519ED','#EDFBA8', '#4D2E6A', '#0ECDB4', '#0C23B5', '#92BF0D', '#51F2E4', '#1CB09D', '#BFC1D4', '#A98BF3', '#98D1C6', '#9A5E72', '#B89327', '#6704A5', '#0F64B2', '#A69CF3', '#D97C2E',
        '#321795', '#7E2649', '#65A813', '#734E0F', '#D14FBC', '#2CFA50', '#83610A', '#841032', '#04D8E3', '#8605BA', '#1EF60C', '#602587', '#F9AD1B', '#2BD738', '#C8E239', '#46D1BC',
        '#465A01', '#70C51E', '#924DFB', '#5D28E4', '#712E8D', '#30F465', '#5287E1', '#D16B7F', '#B8EF72', '#03B15D', '#4290F3', '#5E9063', '#B0E39F', '#3C6205', '#ACE603', '#2DE57A',
         '#052B18', '#69BE75', '#8BE509', '#C37041', '#E218BA', '#5AC097', '#A597C4','#2A13B5', '#823FA7', '#C8F349', '#7C482A', '#A104E5', '#8E3C27', '#1DB457', '#36FDC7', '#60A934', '#F43B78',
        '#74EF61', '#810527', '#6D293F', '#F1A709', '#274F6C', '#104EB5', '#6D02CE', '#B36AE5', '#13F9BA', '#C7281F', '#31572B', '#07D94B', '#B45E18', '#9F73D1', '#3CA059', '#4D6B1C', 
        '#3BA785', '#E3084C', '#A61BF4', '#F3D50A', '#640A32', '#FB0D73', '#AD0126', '#279136', '#480EB1', '#634158', '#FD80E5', '#CF26AE', '#046DB9', '#15F6B8', '#1A047C', '#D14826', 
        '#45D160', '#C6039B', '#D9C23F', '#70829B', '#940ECF', '#9FAB37', '#BA84C7', '#68F275', '#C79EA1', '#6E89B7', '#712D6C', '#F51D8C', '#D564E0', '#24EA18', '#459C71', '#23F50B', 
        '#3E57C8', '#6D78F9', '#07EC9F', '#3078C5', '#53EC0A','#D283E5', '#16039B', '#61E0A8', '#10A659', '#52374A', '#B31EC4', '#254D10', '#D5B0F4', '#A79E35', '#2D0F45', '#562D18', 
        '#ABE562', '#9A0842', '#92165C', '#FCD98A', '#B384DA', '#3CB108', '#2FE04C', '#386CA2', '#423719', '#E540C2', '#C58DE9', '#F6B8E4','#87592B', '#94D53F', '#4AB5E3', '#B0E96C',
        '#AD94E6', '#E237B8', '#21F0B9', '#FB9C73']


  from pandas.core.index import Index as PandasIndex


In [2]:
writepath = '/path/to/directory/'

In [4]:
sc.logging.print_versions()

-----
anndata     0.8.0
scanpy      1.9.1
-----
PIL                                 7.1.1
absl                                NA
asciitree                           NA
astor                               0.8.1
b77c949143755c3209ad0c9f1475a050    NA
backcall                            0.1.0
biothings_client                    0.2.6
botocore                            1.15.39
certifi                             2020.04.05.1
cffi                                1.14.0
chardet                             3.0.4
charset_normalizer                  2.1.0
cloudpickle                         1.3.0
cycler                              0.10.0
cython_runtime                      NA
dask                                2.14.0
dateutil                            2.8.1
decorator                           4.4.2
entrypoints                         0.3
fasteners                           0.17.3
fsspec                              0.7.2
gast                                NA
google                          

# 00-Datasets with multiple organs

Here we use sfaira to import available datasets with annotations.
Note that the following steps may change depending on the current sfaira version and the path to your repository.

## 00-1-MultipleOrgans-Pisco-2022

In [8]:
# ID of the dataset collection to download
target_collections = ["e5f58829-1a66-40b5-a624-9046778e74f5"]
# local directory that receives the downloaded files
cache_path = os.path.join(".", "data")
dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)
# restrict the database handle to the selected collection
dsg.subset(key="collection_id", values=target_collections)
# bare expression: not displayed, because it is not the last line of the cell
dsg.datasets
dsg.download()

Downloading: ncbitaxon_v2021-06-10.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/ncbitaxon
Downloading: efo.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/efo
Downloading: hsapdv_master.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/hsapdv
Downloading: mmusdv.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/mmusdv
Downloading: uberon_v2021-07-27.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/uberon
Ontology <class 'sfaira.versions.metadata.base.OntologyUberonLifecyclestage'> is not a DAG, treat child-parent reasoning with care.
Downloading: mondo_v2021-08-11.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/mondo
Ontology <class 'sfaira.versions.metadata.base.OntologyMondo'> is not a DAG, treat child-parent reasoning with care.
Ontology <class 'sfaira.versions.metadata.base.OntologyUberon'> is not a DAG, treat child-parent reasoning with care.
Downloading: pato_v2021-08-06.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/pato


In [9]:
path = '/path/to/repo/e5f58829-1a66-40b5-a624-9046778e74f5/'
files = [f for f in listdir(path) if isfile(join(path, f))]

In [10]:
files

['53d208b0-2cfd-4366-9866-c3c6114081bc.h5ad',
 'a68b64d8-aee3-4947-81b7-36b8fe5a44d2.h5ad',
 'c5d88abe-f23a-45fa-a534-788985e93dad.h5ad',
 '97a17473-e2b1-4f31-a544-44a60773e2dd.h5ad',
 '5a11f879-d1ef-458a-910c-9b0bdfca5ebf.h5ad']

In [11]:
files = ['53d208b0-2cfd-4366-9866-c3c6114081bc.h5ad']

In [12]:
# Read every selected .h5ad file, record the file it came from in obs['id'],
# and merge the objects (outer join) into `adata_pisco`.
for i, fname in enumerate(files):
    print(fname)
    path_2 = path + fname
    u = sc.read_h5ad(path_2)
    u.obs['id'] = fname
    print(u)
    # the first file seeds the result, later files are appended to it
    adata_pisco = u if i == 0 else adata_pisco.concatenate(u, join='outer')

53d208b0-2cfd-4366-9866-c3c6114081bc.h5ad
AnnData object with n_obs × n_vars = 483152 × 58559
    obs: 'tissue_in_publication', 'assay_ontology_term_id', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'id'
    var: 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference'
    uns: 'X_normalization', '_scvi', '_training_mode', 'compartment_colors', 'default_embedding', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'den

In [18]:
adata_pisco.var.index = adata_pisco.var['feature_name']

In [None]:
adata_pisco.obs['InternDatasetNumber'] ='00-1-MultipleOrgans-Pisco-2022'

In [23]:
adata_pisco.write(writepath + '00-1-MultipleOrgans-Pisco-2022-raw.h5ad')

## 00-2-MultipleOrgans-Han-2020

In [12]:
ID = 'homosapiens_None_2020_microwellseq_han_001_10.1038/s41586-020-2157-4'

In [13]:
# Set this path to your local sfaira data repository
basedir = '.'
datadir = os.path.join(basedir, 'raw')
metadir = os.path.join(basedir, 'meta')
cachedir = os.path.join(basedir, 'cache')

In [None]:
# Handle on the sfaira data universe, using the directories defined above
ds = sfaira.data.Universe(data_path=datadir, meta_path=metadir, cache_path=cachedir)
# subset to the selected dataset
ds.subset(key="id", values=[ID])  # keeps only the dataset whose id equals ID
# download and load the specific dataset
ds.download()
ds.load(verbose=1)
# get the unmodified adata object of the dataset
adata = ds.datasets[ID].adata

In [None]:
adata_han = adata.copy()

In [None]:
adata_han.obs['InternDatasetNumber'] ='00-2-MultipleOrgans-Han-2020'

In [None]:
adata_han.write(writepath + '00-2-MultipleOrgans-Han-2020-raw.h5ad')

## 00-3-MultipleOrgans-ImmuneCells-Teichmann-2022

In [7]:
# ID of the dataset collection to download
target_collections = ["62ef75e4-cbea-454e-a0ce-998ec40223d3"]
# local directory that receives the downloaded files
cache_path = os.path.join(".", "data")
dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)
# restrict the database handle to the selected collection
dsg.subset(key="collection_id", values=target_collections)
# bare expression: not displayed, because it is not the last line of the cell
dsg.datasets
dsg.download()

Ontology <class 'sfaira.versions.metadata.base.OntologyUberonLifecyclestage'> is not a DAG, treat child-parent reasoning with care.
Ontology <class 'sfaira.versions.metadata.base.OntologyMondo'> is not a DAG, treat child-parent reasoning with care.
Ontology <class 'sfaira.versions.metadata.base.OntologyUberon'> is not a DAG, treat child-parent reasoning with care.


In [8]:
path = '/path/to/repo/62ef75e4-cbea-454e-a0ce-998ec40223d3/'
files = [f for f in listdir(path) if isfile(join(path, f))]

In [9]:
files

['ae29ebd0-1973-40a4-a6af-d15a5f77a80f.h5ad',
 'fe52003e-1460-4a65-a213-2bb1a508332f.h5ad',
 '71be997d-ff75-41b9-8a9f-1288c865f921.h5ad',
 '1b9d8702-5af8-4142-85ed-020eb06ec4f6.h5ad']

In [10]:
files = ['1b9d8702-5af8-4142-85ed-020eb06ec4f6.h5ad']

In [11]:
# Read each selected file and report its number of cells. Nothing is merged
# here: after the loop `u` holds the object read from the last file in `files`.
for i, fname in enumerate(files):
    print(fname)
    path_2 = path + fname
    u = sc.read_h5ad(path_2)
    u.obs['id'] = fname
    print(len(u.obs))

1b9d8702-5af8-4142-85ed-020eb06ec4f6.h5ad
329762


In [12]:
# Rebind instead of copying: duplicating the whole object only to delete the
# source right afterwards doubles peak memory without changing the result.
adata_teichmann = u
del u

In [34]:
adata_teichmann.obs['InternDatasetNumber'] = '00-3-MultipleOrgans_ImmuneCells-Teichmann-2022'

In [21]:
adata_teichmann.var.index = adata_teichmann.var['gene_symbols']

In [23]:
# Continue under the generic name `adata`. Rebinding avoids holding two full
# copies of the dataset in memory at the same time.
adata = adata_teichmann
del adata_teichmann

In [27]:
#calculate QC covariates
# For a sparse X, .sum(1) returns an (n_cells, 1) matrix; flatten it so that a
# plain 1-D vector is stored in .obs regardless of the matrix type.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
# number of genes with a non-zero value in each cell
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [31]:
# FILTER PARAMETERS
#Filter out cells
# NOTE(review): the recorded output of this cell reports no cells removed by
# max_counts=4300 (only the max_genes filter logged removals) - confirm that
# adata.X holds raw counts at this point.
sc.pp.filter_cells(adata, max_counts = 4300)
sc.pp.filter_cells(adata, max_genes = 8000)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not

filtered out 61 cells that have more than 8000 genes expressed
filtered out 10098 genes that are detected in less than 20 cells


In [33]:
# get mt genes (gene symbols starting with 'MT-')
mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names])
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense: the following cells slice, transpose and divide the matrix
# in place. Skipped when X is already dense.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# manually define mt score: fraction of each cell's total counts that comes
# from mitochondrial genes
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 20% mito fraction
# .copy() materialises the subset; otherwise `adata` stays a view and the later
# assignment to adata.obs has to copy it implicitly (with a warning).
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [34]:
adata_pp=adata.copy()

In [35]:
#Perform a clustering for scran normalization in clusters
# All steps run on the throwaway copy `adata_pp`; `adata` itself is untouched.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# cluster labels are written to adata_pp.obs['groups'] and read in the next cell
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:34): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:01:49)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:01:06)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 26 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:01:47)


In [36]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [None]:
%%R -i data_mat -i input_groups -o size_factors
# library() stops with an error if scran is not installed; require() would only
# return FALSE and let the next line fail with a less helpful message.
library(scran)
# per-cell size factors, pooled within the pre-computed clusters
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [38]:
#Delete adata_pp
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
adata.strings_to_categoricals()

In [40]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [41]:
adata.raw = adata

In [42]:
#Normalize data
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [43]:
# extract highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:45)


In [44]:
# Calculate the visualizations
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:46)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:02:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:05:52)


In [49]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [51]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['CD16-negative, CD56-bright natural killer cell, human',
       'CD16-positive, CD56-dim natural killer cell, human',
       'CD4-positive helper T cell', 'CD8-positive, alpha-beta memory T cell',
       'CD8-positive, alpha-beta memory T cell, CD45RO-positive',
       'T follicular helper cell', 'alpha-beta T cell', 'alveolar macrophage',
       'animal cell', 'classical monocyte', 'conventional dendritic cell',
       'dendritic cell, human',
       'effector memory CD4-positive, alpha-beta T cell',
       'effector memory CD8-positive, alpha-beta T cell, terminally differentiated',
       'erythroid lineage cell', 'gamma-delta T cell',
       'germinal center B cell', 'group 3 innate lymphoid cell', 'lymphocyte',
       'macrophage', 'mast cell', 'megakaryocyte', 'memory B cell',
       'mucosal invariant T cell', 'naive B cell',
       'naive thymus-derived CD4-positive, alpha-beta T cell',
       'naive thymus-derived CD8-positive, alpha-beta T cell',
       'non-classical monocyte', 'plasma cell', 'plasmablast',
       'plasmacytoid dendritic cell', 'precursor B cell', 'pro-B cell',
       'progenitor cell', 'regulatory T cell'])

In [52]:
# Collapse the fine-grained labels into one representative label per lineage.
# Key: the label that is kept. Value: the labels folded into it.
# The groups are disjoint, so the order of the entries does not matter.
# NOTE(review): 'erythroid lineage cell' is folded into the T cell group below,
# exactly as in the original code - confirm this is intended and not a
# copy-paste slip (it is not a T cell label).
celltype_merges = {
    'CD16-negative, CD56-bright natural killer cell, human': [
        'CD16-positive, CD56-dim natural killer cell, human',
    ],
    'CD4-positive helper T cell': [
        'CD8-positive, alpha-beta memory T cell',
        'CD8-positive, alpha-beta memory T cell, CD45RO-positive',
        'T follicular helper cell',
        'alpha-beta T cell',
        'effector memory CD4-positive, alpha-beta T cell',
        'effector memory CD8-positive, alpha-beta T cell, terminally differentiated',
        'erythroid lineage cell',
        'gamma-delta T cell',
        'mucosal invariant T cell',
        'naive thymus-derived CD4-positive, alpha-beta T cell',
        'naive thymus-derived CD8-positive, alpha-beta T cell',
        'regulatory T cell',
    ],
    'conventional dendritic cell': [
        'dendritic cell, human',
        'plasmacytoid dendritic cell',
    ],
    'germinal center B cell': [
        'memory B cell',
        'naive B cell',
        'precursor B cell',
        'pro-B cell',
    ],
    'alveolar macrophage': ['macrophage'],
    'classical monocyte': ['non-classical monocyte'],
    'plasma cell': ['plasmablast'],
    'animal cell': ['progenitor cell'],
}

for kept_label, merged_labels in celltype_merges.items():
    ix = np.isin(ref_cluster, merged_labels)
    # the assigned value must be a plain string that is an existing category
    # (a stray trailing comma here would turn it into a 1-tuple)
    ref_cluster[ix] = kept_label

In [53]:
# Store the merged labels, restricted to the categories that remain after merging.
adata.obs['celltype'] = pd.Categorical(
    ref_cluster,
    categories=[
        'CD16-negative, CD56-bright natural killer cell, human',
        'CD4-positive helper T cell',
        'alveolar macrophage',
        'animal cell',
        'classical monocyte',
        'conventional dendritic cell',
        'germinal center B cell',
        'group 3 innate lymphoid cell',
        'lymphocyte',
        'mast cell',
        'megakaryocyte',
        'plasma cell',
    ],
)

In [58]:
# Positional rename: entry k replaces the k-th category of obs['celltype'].
celltype_display_names = [
    'NK cells',
    'T cells',
    'Macrophages',
    'Remove',
    'Monocytes',
    'Dendritic cells',
    'B cells',
    'Innate lymphoid cells',
    'Lymphocytes',
    'Mast cells',
    'Megakaryocytes',
    'Plasma cells',
]
adata.rename_categories('celltype', celltype_display_names)

In [65]:
ix=np.isin(adata.obs['organism'], ['Homo sapiens'])
adata=adata[ix].copy()

In [71]:
adata.obs['celltype'].cat.categories

Index(['NK cells', 'T cells', 'Macrophages', 'Remove', 'Monocytes',
       'Dendritic cells', 'B cells', 'Innate lymphoid cells', 'Lymphocytes',
       'Mast cells', 'Megakaryocytes', 'Plasma cells'],
      dtype='object')

In [72]:
# Keep only cells whose label is in the list below; cells labelled 'Remove'
# and cells without a label are not in the list and are therefore dropped.
ix=np.isin(adata.obs['celltype'],['NK cells', 'T cells', 'Macrophages', 'Monocytes',
       'Dendritic cells', 'B cells', 'Innate lymphoid cells', 'Lymphocytes',
       'Mast cells', 'Megakaryocytes', 'Plasma cells'])
adata=adata[ix].copy()

In [73]:
adata.obs['tissue'].cat.categories

Index(['blood', 'bone marrow', 'caecum', 'duodenum', 'ileum',
       'jejunal epithelium', 'lamina propria', 'liver', 'lung',
       'mesenteric lymph node', 'omentum', 'sigmoid colon',
       'skeletal muscle tissue', 'spleen', 'thoracic lymph node', 'thymus',
       'transverse colon'],
      dtype='object')

In [75]:
# Harmonised name for every original tissue label. An explicit mapping is used
# instead of a positional list so the result does not depend on category order.
# NOTE(review): 'Ceacum' looks like a misspelling of 'Caecum'; the label is
# kept unchanged because later steps may refer to it.
sub_tissue_names = {
    'blood': 'Blood',
    'bone marrow': 'BoneMarrow',
    'caecum': 'Gut_Colon_Ceacum',
    'duodenum': 'Gut_SmallIntestine_Duodenum',
    'ileum': 'Gut_SmallIntestine_Ileum',
    'jejunal epithelium': 'Gut_SmallIntestine_Jejunum',
    'lamina propria': 'Gut_SmallIntestine',
    'liver': 'Liver',
    'lung': 'Lung',
    'mesenteric lymph node': 'LymphNode_Mesenteric',
    'omentum': 'Omentum',
    'sigmoid colon': 'Gut_Colon_Sigmoid',
    'skeletal muscle tissue': 'Muscle',
    'spleen': 'Spleen',
    'thoracic lymph node': 'LymphNode_Thoracic',
    'thymus': 'Thymus',
    'transverse colon': 'Gut_Colon_Transverse',
}
# obs['tissue'] is categorical; renaming its categories keeps their order
adata.obs['sub_tissue'] = adata.obs['tissue'].cat.rename_categories(sub_tissue_names)

In [76]:
# Coarse organ label: start from the original tissue annotation ...
adata.obs['tissue_major'] = adata.obs['tissue']
ref_cluster = pd.Categorical(
    adata.obs['tissue_major'],
    categories=[
        'blood', 'bone marrow', 'caecum', 'duodenum', 'ileum',
        'jejunal epithelium', 'lamina propria', 'liver', 'lung',
        'mesenteric lymph node', 'omentum', 'sigmoid colon',
        'skeletal muscle tissue', 'spleen', 'thoracic lymph node', 'thymus',
        'transverse colon',
    ],
)

# ... fold sub-regions into one representative tissue per organ
# (key: tissue that is kept, value: tissues merged into it) ...
tissue_merges = {
    'duodenum': ['ileum', 'jejunal epithelium', 'lamina propria'],
    'mesenteric lymph node': ['thoracic lymph node'],
    'caecum': ['sigmoid colon', 'transverse colon'],
}
for kept_tissue, merged_tissues in tissue_merges.items():
    ix = np.isin(ref_cluster, merged_tissues)
    ref_cluster[ix] = kept_tissue

# ... keep only the remaining categories ...
adata.obs['tissue_major'] = pd.Categorical(
    ref_cluster,
    categories=[
        'blood', 'bone marrow', 'caecum', 'duodenum',
        'liver', 'lung',
        'mesenteric lymph node', 'omentum',
        'skeletal muscle tissue', 'spleen', 'thymus',
    ],
)

# ... and give them their harmonised names (positional, same order as above)
adata.rename_categories(
    'tissue_major',
    [
        'Blood', 'BoneMarrow', 'Gut_Colon', 'Gut_SmallIntestine',
        'Liver', 'Lung',
        'LymphNode', 'Omentum',
        'Muscle', 'Spleen', 'Thymus',
    ],
)

In [83]:
# Capitalise the sex labels. The rename is positional, so obs['sex'] is
# expected to have exactly the categories ['female', 'male'] in this order.
adata.rename_categories('sex', ['Female', 'Male'])

In [84]:
# Replace the single category of obs['ethnicity'] (expected: 'unknown') with
# the 'NaN' placeholder string used for missing annotations in this notebook.
adata.rename_categories('ethnicity', ['NaN'])

In [85]:
# Replace the development-stage labels with an age label. The rename is
# positional; the expected categories are, in this order:
#   'eighth decade human stage', 'seventh decade human stage',
#   'sixth decade human stage'
# NOTE(review): the n-th decade of life ends at age 10*n, so '80'/'70'/'60'
# are the upper bounds of the respective decades - confirm this is the
# intended convention for the 'age' column.
adata.rename_categories('development_stage', ['80', '70', '60'])

In [86]:
# Expose the donor ID under the lower-case column name used in this notebook.
# The donor labels themselves are kept unchanged, so no category rename is
# needed.
adata.obs['donor'] = adata.obs['Donor']

In [89]:
# Harmonised annotation columns, in the order in which they are written.
# 'celltype', 'sex' and 'ethnicity' already hold their final values from the
# cells above and are therefore not reassigned here.
# 'NaN' (a string) is this notebook's placeholder for unavailable information.
standard_obs = {
    'Organ': adata.obs['tissue_major'],
    'Organ_Specific': adata.obs['sub_tissue'],
    'Dataset': 'Teichmann_MultipleOrgans_ImmuneCells',
    'InternDatasetNumber': '00-3-MultipleOrgans_ImmuneCells-Teichmann-2022',
    'Dataset_status': 'Healthy_Dataset',
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',
    'Patient': adata.obs['donor'],
    'Patient_Number': 'NaN',
    'age': adata.obs['development_stage'],
    'health_status': 'NaN',
    'original_celltype_1': adata.obs['cell_type'],
    'original_celltype_2': adata.obs['Majority_voting_CellTypist_high'],
    'original_celltype_3': 'NaN',
}
for column, value in standard_obs.items():
    adata.obs[column] = value

In [91]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [92]:
adata.obs_names_make_unique()

In [94]:
adata_analysis = adata.copy()

In [120]:
# Per-organ dataset label: the organ name prefixed with 'Teichmann_'.
# Deriving the new names from the existing categories keeps their order and
# cannot pair a label with the wrong organ.
adata_analysis.obs['Dataset'] = adata_analysis.obs['tissue_major'].cat.rename_categories(
    lambda organ: 'Teichmann_' + organ
)

In [None]:
adata_analysis.write(writepath + '00-3-MultipleOrgans_ImmuneCells-Teichmann-2022-processed.h5ad')

# 01-Brain

## 01-1-Brain-Habib-2017

In [None]:
# here we use sfaira to import available datasets with annotations
# note that the following steps may change depending on the current sfaira version and the path to your repository

datadir = '/path/to/repo/'

In [8]:
ds = sfaira.data.human.DatasetGroupBrain(path=datadir)  # This links all data sets available

In [9]:
ds.ids 

['human_brain_2017_DroNcSeq_habib_001_10.1038/nmeth.4407',
 'human_brain_2020_microwell_han_001_10.1038/s41586-020-2157-4',
 'human_brain_2020_microwell_han_002_10.1038/s41586-020-2157-4',
 'human_brain_2020_microwell_han_003_10.1038/s41586-020-2157-4',
 'human_brain_2020_microwell_han_004_10.1038/s41586-020-2157-4',
 'human_brain_2020_microwell_han_005_10.1038/s41586-020-2157-4',
 'human_brain_2020_microwell_han_006_10.1038/s41586-020-2157-4']

In [10]:
#pick first one (Habib2017)
idx = ds.ids[0]

In [12]:
ds.datasets[idx].load()



In [13]:
print(ds.datasets[idx].adata)

AnnData object with n_obs × n_vars = 13067 × 25587
    obs: 'CellType', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'cell_ontology_class', 'healthy', 'state_exact', 'cell_ontology_id'
    var: 'names', 'gene_ids', 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'ensembl'
    uns: 'leiden', 'neighbors', 'pca', 'lab', 'year', 'doi', 'protocol', 'organ', 'subtissue', 'animal', 'id', 'wget_download', 'has_celltypes', 'counts', 'mapped_features'
    obsm: 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'


In [14]:
adata=ds.datasets[idx].adata

In [6]:
adata.obs['InternDatasetNumber'] ='01-1-Brain-Habib-2017'

In [10]:
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [14]:
# FILTER PARAMETERS
#Filter out cells
sc.pp.filter_cells(adata, max_counts = 14000)
sc.pp.filter_cells(adata, max_genes = 4000)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20)

filtered out 7876 genes that are detected in less than 20 cells


In [16]:
# get mt genes (gene symbols starting with 'MT-')
mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names])
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense: the following cells slice, transpose and divide the matrix
# in place. Skipped when X is already dense.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# manually define mt score: fraction of each cell's total counts that comes
# from mitochondrial genes
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 25% mito fraction
# .copy() materialises the subset; otherwise `adata` stays a view and the later
# assignment to adata.obs has to copy it implicitly (with a warning).
adata = adata[adata.obs['mt_frac'] < 0.25].copy()

In [None]:
adata_pp=adata.copy()

In [19]:
#Perform a clustering for scran normalization in clusters
# All steps run on the throwaway copy `adata_pp`; `adata` itself is untouched.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# cluster labels are written to adata_pp.obs['groups'] and read in the next cell
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:08)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 9 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:01)


In [20]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [22]:
%%R -i data_mat -i input_groups -o size_factors
# library() stops with an error if scran is not installed; require() would only
# return FALSE and let the next line fail with a less helpful message.
library(scran)
# per-cell size factors, pooled within the pre-computed clusters
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

R[write to console]: Loading required package: scran

R[write to console]: Loading required package: SingleCellExperiment

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    d

In [23]:
#Delete adata_pp
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
adata.strings_to_categoricals()

In [25]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [26]:
adata.raw = adata

In [27]:
#Normalize data
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [28]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [29]:
# extract highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [30]:
# Calculate the visualizations
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:11)


In [32]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['cell_ontology_class'].copy()

In [33]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Astrocytes 1', 'Astrocytes 2', 'Endothelial cells',
       'GABAergic interneurons 1', 'GABAergic interneurons 2',
       'Glutamatergic neurons from the PFC 1',
       'Glutamatergic neurons from the PFC 2',
       'Granule neurons from the hip dentate gyrus region', 'Microglia',
       'Neuronal stem cells', 'Oligodendrocyte precursors', 'Oligodendrocytes',
       'Pyramidal neurons from the hip CA region 1',
       'Pyramidal neurons from the hip CA region 2', 'Unknown'])

In [34]:
# Collapse sub-clusters into one representative label per cell type.
# Key: the label that is kept. Value: the labels folded into it.
brain_celltype_merges = {
    'Astrocytes 1': ['Astrocytes 2'],
    'Glutamatergic neurons from the PFC 1': [
        'Glutamatergic neurons from the PFC 2',
        'Granule neurons from the hip dentate gyrus region',
        'Pyramidal neurons from the hip CA region 1',
        'Pyramidal neurons from the hip CA region 2',
        'GABAergic interneurons 1',
        'GABAergic interneurons 2',
    ],
    'Oligodendrocyte precursors': ['Oligodendrocytes'],
}
for kept_label, merged_labels in brain_celltype_merges.items():
    ix = np.isin(ref_cluster, merged_labels)
    ref_cluster[ix] = kept_label

In [35]:
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Astrocytes 1', 'Endothelial cells',
       'Glutamatergic neurons from the PFC 1','Microglia',
       'Neuronal stem cells', 'Oligodendrocyte precursors',
       'Unknown'])

In [36]:
adata.rename_categories('celltype', ['Astrocytes', 'Endothelial cells',
       'Neurons', 'Microglial cells',
       'Neuronal stem cells', 'Oligodendrocytes',
       'Unknown'])

In [39]:
# Harmonised annotation columns, in the order in which they are written.
# 'celltype' already holds its final value from the cells above and is
# therefore not reassigned here.
# 'NaN' (a string) is this notebook's placeholder for unavailable information.
standard_obs = {
    'Organ': 'Brain',
    'Organ_Specific': 'Brain_Hippocampus_PrefrontalCortex',
    'Dataset': 'Habib_Brain',
    'InternDatasetNumber': '01-1-Brain-Habib-2017',
    'Dataset_status': 'Healthy_Dataset',
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',
    'Patient': 'Habib_Brain-Donor1',
    'Patient_Number': 'NaN',
    'age': 'NaN',
    'sex': 'NaN',
    'ethnicity': 'NaN',
    'health_status': 'NaN',
    'original_celltype_1': adata.obs['cell_ontology_class'],
    'original_celltype_2': adata.obs['CellType'],
    'original_celltype_3': 'NaN',
}
for column, value in standard_obs.items():
    adata.obs[column] = value

In [40]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [41]:
adata.obs_names_make_unique()

In [42]:
adata.write(writepath + '01-1-Brain-Habib-2017-processed.h5ad')

## 01-2-Brain_Cerebellum-Han-2020

In [355]:
ix=np.isin(adata_han.obs['sub_tissue'],['AdultCerebellum']) 
adata=adata_han[ix].copy()

In [356]:
adata.obs['InternDatasetNumber'] ='01-2-Brain_Cerebellum-Han-2020'

In [360]:
# Calculate per-cell QC covariates:
#   n_counts   - sum of the expression matrix over genes
#   log_counts - natural log of n_counts
#   n_genes    - number of genes with a value > 0
adata.obs['n_counts'] = adata.X.sum(1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1)

In [363]:
# FILTER PARAMETERS
# Filter out cells with more than 4000 counts or more than 2500 detected genes
sc.pp.filter_cells(adata, max_counts = 4000)
sc.pp.filter_cells(adata, max_genes = 2500)
# Filter out low-count genes: keep only genes detected in at least 10 cells
sc.pp.filter_genes(adata, min_cells=10)

filtered out 19 cells that have more than 4000 counts
filtered out 11878 genes that are detected in less than 10 cells


In [364]:
# Flag mitochondrial genes by their 'MT-' prefix
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (the scran step further down receives
# the matrix in dense form). The guard makes the cell safe to re-run on data
# that is already dense.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# Mitochondrial fraction per cell = summed counts of MT genes / total counts
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep only cells with a mitochondrial fraction below 20%.
# .copy() materialises the subset, so the in-place edits made later
# (size factors, counts layer, normalisation) do not operate on a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [365]:
# Scratch copy for the preliminary clustering below; `adata` itself is left unnormalised.
adata_pp=adata.copy()

In [366]:
# Perform a coarse clustering whose labels are passed to scran as `clusters`:
# per-cell normalisation to 1e6 counts, log1p, 15-component PCA, neighbour graph, Louvain (resolution 0.5).
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 11 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [367]:
# Prepare the inputs for scran normalization:
# the cluster labels of the preliminary clustering and the transposed expression matrix of `adata`.
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [368]:
%%R -i data_mat -i input_groups -o size_factors
# Estimate size factors with scran, using the preliminary clusters;
# `size_factors` is handed back to Python through the -o flag.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [369]:
# Delete adata_pp (only needed for the preliminary clustering) and store the scran size factors per cell
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [370]:
# Convert string annotations to categoricals.
adata.strings_to_categoricals()

In [371]:
# Keep a copy of the not-yet-normalised matrix (adata.X) in a "counts" layer for downstream analysis
adata.layers["counts"] = adata.X.copy()

In [372]:
# Freeze the current state of adata in .raw (this happens before the normalisation in the next cell).
adata.raw = adata

In [373]:
# Normalize data: divide each cell (row) by its size factor in place, then apply log1p
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [374]:
# Extract highly variable genes (top 4000, 'cell_ranger' flavour); subset=False keeps all genes in adata
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [375]:
# Calculate the visualizations: 50-component PCA restricted to the highly variable genes,
# neighbour graph and UMAP embedding
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:14)


In [376]:
#sc.pl.umap(adata, color='celltype_specific')

In [377]:
# Make consistent annotations across datasets: start from the 'celltype_specific' labels
# and list their categories (shown as the cell output)
adata.obs['celltype']=adata.obs['celltype_specific'].copy()
adata.obs['celltype'].cat.categories

Index(['Astrocyte', 'Astrocyte(Bergmann glia)', 'B cell', 'Endothelial cell',
       'Epithelial cell', 'Excitatory neuron', 'Inhibitory neuron',
       'Interneuron', 'Macrophage', 'Neutrophil_DEFA3 high',
       'Neutrophil_FCGR3B high', 'Neutrophil_LYZ high', 'Oligodendrocyte',
       'Oligodendrocyte progenitor cell', 'Pericyte', 'Smooth muscle cell',
       'T cell', 'Unknown'],
      dtype='object')

In [378]:
# Working copy of the labels as a standalone categorical with all original categories;
# the merges in the next cell are applied to this object.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Astrocyte', 'Astrocyte(Bergmann glia)', 'B cell', 'Endothelial cell',
       'Epithelial cell', 'Excitatory neuron', 'Inhibitory neuron',
       'Interneuron', 'Macrophage', 'Neutrophil_DEFA3 high',
       'Neutrophil_FCGR3B high', 'Neutrophil_LYZ high', 'Oligodendrocyte',
       'Oligodendrocyte progenitor cell', 'Pericyte', 'Smooth muscle cell',
       'T cell', 'Unknown'])

In [379]:
# Collapse fine-grained labels onto one representative label per group.
# Keys are the labels that are kept, values are the labels folded into them;
# the merges run in the order listed.
label_merges = {
    'Astrocyte': ['Astrocyte(Bergmann glia)'],
    'Excitatory neuron': ['Inhibitory neuron', 'Interneuron'],
    'Neutrophil_DEFA3 high': ['Neutrophil_FCGR3B high', 'Neutrophil_LYZ high'],
    'Oligodendrocyte': ['Oligodendrocyte progenitor cell'],
}
for kept_label, folded_labels in label_merges.items():
    ix = np.isin(ref_cluster, folded_labels)
    ref_cluster[ix] = kept_label

In [380]:
# Store the merged labels as a categorical restricted to the labels that remain after merging.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Astrocyte', 'B cell', 'Endothelial cell',
       'Epithelial cell', 'Excitatory neuron', 
        'Macrophage', 'Neutrophil_DEFA3 high',
        'Oligodendrocyte',
        'Pericyte', 'Smooth muscle cell',
       'T cell', 'Unknown'])

In [381]:
# Rename the categories to the harmonised cell type names.
# Names are matched by position, so the order must mirror the category list of the previous cell.
adata.rename_categories('celltype', ['Astrocytes', 'B cells', 'Endothelial cells',
       'Cerebellar epithelial cells', 'Neurons', 
        'Macrophages', 'Neutrophils',
        'Oligodendrocytes',
        'Pericytes', 'Smooth muscle cells',
       'T cells', 'Unknown'])

In [385]:
# Rename the single 'sub_tissue' category to the harmonised organ label.
# NOTE(review): the first expression is not displayed and the `ref_cluster` assignment is
# overwritten in the next cell without being read; only rename_categories has an effect.
adata.obs['sub_tissue'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sub_tissue'],
                           categories=['AdultCerebellum'])
adata.rename_categories('sub_tissue', ['Brain_Cerebellum'])

In [386]:
# Capitalise the single 'sex' category ('female' -> 'Female').
# NOTE(review): the `ref_cluster` assignment is overwritten in the next cell without being read.
adata.obs['sex'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['female'])
adata.rename_categories('sex', ['Female'])

In [387]:
# Drop the unit suffix from the single 'age' category ('55Y' -> '55').
# NOTE(review): the `ref_cluster` assignment is overwritten in the next cell without being read.
adata.obs['age'].cat.categories
ref_cluster=pd.Categorical(adata.obs['age'],
                           categories=['55Y'])
adata.rename_categories('age',['55'])

In [388]:
# Prefix the single donor category with the study name ('Donor29' -> 'Han-Donor29').
# NOTE(review): the `ref_cluster` assignment is not read by the remaining cells of this section.
adata.obs['donor'].cat.categories
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['Donor29'])
adata.rename_categories('donor', ['Han-Donor29'])

In [389]:
# Harmonised per-cell metadata for the Han cerebellum dataset.
# The dict is written to adata.obs in insertion order, i.e. the columns are
# created in the same order as before. The string 'NaN' marks unavailable fields.
harmonised_obs = {
    # dataset-level annotation
    'Organ': 'Brain',
    'Organ_Specific': adata.obs['sub_tissue'],
    'Dataset': 'Han_Brain_Cerebellum',
    'InternDatasetNumber': '01-2-Brain_Cerebellum-Han-2020',
    'Dataset_status': 'Healthy_Dataset',
    # cell type annotation (the unified label is re-assigned unchanged)
    'celltype': adata.obs['celltype'],
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',
    # donor annotation (age and sex are re-assigned unchanged)
    'Patient': adata.obs['donor'],
    'Patient_Number': 'NaN',
    'age': adata.obs['age'],
    'sex': adata.obs['sex'],
    'ethnicity': 'NaN',
    'health_status': 'NaN',
    # labels as provided with the loaded data
    'original_celltype_1': adata.obs['celltype_specific'],
    'original_celltype_2': adata.obs['celltype_global'],
    'original_celltype_3': 'NaN',
}
for column, value in harmonised_obs.items():
    adata.obs[column] = value

In [391]:
# Store the expression matrix in CSR sparse format before the object is written to disk.
adata.X = sp.sparse.csr_matrix(adata.X)

In [392]:
# Save the processed dataset as an .h5ad file.
# NOTE(review): the 01-1, 01-3 and 04-x sections call adata.obs_names_make_unique() before writing;
# this section does not - confirm that this is intended.
adata.write(writepath + '01-2-Brain_Cerebellum-Han-2020-processed.h5ad')

## 01-3-Brain_TemporalLobe-Han-2020

In [965]:
# Select the cells of `adata_han` annotated as adult temporal lobe and work on an independent copy.
ix=np.isin(adata_han.obs['sub_tissue'],['AdultTemporalLobe']) 
adata=adata_han[ix].copy()

In [966]:
# Tag every cell with the internal dataset identifier.
adata.obs['InternDatasetNumber'] ='01-3-Brain_TemporalLobe-Han-2020'

In [970]:
# Calculate per-cell QC covariates:
#   n_counts   - sum of the expression matrix over genes
#   log_counts - natural log of n_counts
#   n_genes    - number of genes with a value > 0
adata.obs['n_counts'] = adata.X.sum(1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1)

In [973]:
# FILTER PARAMETERS
# Filter out cells with more than 2100 counts or more than 1500 detected genes
sc.pp.filter_cells(adata, max_counts = 2100)
sc.pp.filter_cells(adata, max_genes = 1500)
# Filter out low-count genes: keep only genes detected in at least 10 cells
sc.pp.filter_genes(adata, min_cells=10)

filtered out 24 cells that have more than 2100 counts
filtered out 12218 genes that are detected in less than 10 cells


In [974]:
# Flag mitochondrial genes by their 'MT-' prefix
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (the scran step further down receives
# the matrix in dense form). The guard makes the cell safe to re-run on data
# that is already dense.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# Mitochondrial fraction per cell = summed counts of MT genes / total counts
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep only cells with a mitochondrial fraction below 20%.
# .copy() materialises the subset, so the in-place edits made later
# (size factors, counts layer, normalisation) do not operate on a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [975]:
# Scratch copy for the preliminary clustering below; `adata` itself is left unnormalised.
adata_pp=adata.copy()

In [976]:
# Perform a coarse clustering whose labels are passed to scran as `clusters`:
# per-cell normalisation to 1e6 counts, log1p, 15-component PCA, neighbour graph, Louvain (resolution 0.5).
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:03)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 8 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [977]:
# Prepare the inputs for scran normalization:
# the cluster labels of the preliminary clustering and the transposed expression matrix of `adata`.
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [978]:
%%R -i data_mat -i input_groups -o size_factors
# Estimate size factors with scran, using the preliminary clusters;
# `size_factors` is handed back to Python through the -o flag.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [979]:
# Delete adata_pp (only needed for the preliminary clustering) and store the scran size factors per cell
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [980]:
# Convert string annotations to categoricals.
adata.strings_to_categoricals()

In [981]:
# Keep a copy of the not-yet-normalised matrix (adata.X) in a "counts" layer for downstream analysis
adata.layers["counts"] = adata.X.copy()

In [982]:
# Freeze the current state of adata in .raw (this happens before the normalisation in the next cell).
adata.raw = adata

In [983]:
# Normalize data: divide each cell (row) by its size factor in place, then apply log1p
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [984]:
# Extract highly variable genes (top 4000, 'cell_ranger' flavour); subset=False keeps all genes in adata
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [985]:
# Calculate the visualizations: 50-component PCA restricted to the highly variable genes,
# neighbour graph and UMAP embedding
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:18)


In [987]:
# Make consistent annotations across datasets: start from the 'celltype_specific' labels
# and list their categories (shown as the cell output)
adata.obs['celltype']=adata.obs['celltype_specific'].copy()
adata.obs['celltype'].cat.categories

Index(['Astrocyte', 'Endothelial cell', 'Glial cell', 'Inhibitory neuron',
       'Macrophage', 'Microglia_ALOX5AP high', 'Microglia_C3 high',
       'Neutrophil', 'Oligodendrocyte progenitor cell',
       'Oligodendrocyte_MT gene high', 'Oligodendrocyte_TF high',
       'Proliferating cell', 'Smooth muscle cell', 'T cell', 'Unknown'],
      dtype='object')

In [988]:
# Working copy of the labels as a standalone categorical with all original categories;
# the merges in the next cell are applied to this object.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Astrocyte', 'Endothelial cell', 'Glial cell', 'Inhibitory neuron',
       'Macrophage', 'Microglia_ALOX5AP high', 'Microglia_C3 high',
       'Neutrophil', 'Oligodendrocyte progenitor cell',
       'Oligodendrocyte_MT gene high', 'Oligodendrocyte_TF high',
       'Proliferating cell', 'Smooth muscle cell', 'T cell', 'Unknown'])

In [989]:
# Collapse fine-grained labels onto one representative label per group.
# Keys are the labels that are kept, values are the labels folded into them;
# the merges run in the order listed.
label_merges = {
    'Microglia_ALOX5AP high': ['Microglia_C3 high'],
    'Oligodendrocyte progenitor cell': ['Oligodendrocyte_MT gene high', 'Oligodendrocyte_TF high'],
    'Unknown': ['Proliferating cell'],
}
for kept_label, folded_labels in label_merges.items():
    ix = np.isin(ref_cluster, folded_labels)
    ref_cluster[ix] = kept_label

In [990]:
# Store the merged labels as a categorical restricted to the labels that remain after merging.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Astrocyte', 'Endothelial cell', 'Glial cell', 'Inhibitory neuron',
       'Macrophage', 'Microglia_ALOX5AP high', 
       'Neutrophil', 'Oligodendrocyte progenitor cell',
       'Smooth muscle cell', 'T cell', 'Unknown'])

In [991]:
# Rename the categories to the harmonised cell type names.
# Names are matched by position, so the order must mirror the category list of the previous cell.
adata.rename_categories('celltype', ['Astrocytes', 'Endothelial cells', 'Glial cells', 'Neurons',
       'Macrophages', 'Microglial cells', 
       'Neutrophils', 'Oligodendrocytes',
       'Smooth muscle cells', 'T cells', 'Unknown'])

In [995]:
# Rename the single 'sub_tissue' category to the harmonised organ label.
# NOTE(review): the first expression is not displayed and the `ref_cluster` assignment is
# overwritten in the next cell without being read; only rename_categories has an effect.
adata.obs['sub_tissue'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sub_tissue'],
                           categories=['AdultTemporalLobe'])
adata.rename_categories('sub_tissue', ['Brain_TemporalLobe'])

In [996]:
# Capitalise the single 'sex' category ('female' -> 'Female').
# NOTE(review): the `ref_cluster` assignment is overwritten in the next cell without being read.
adata.obs['sex'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['female'])
adata.rename_categories('sex', ['Female'])

In [997]:
# Drop the unit suffix from the single 'age' category ('61Y' -> '61').
# NOTE(review): the `ref_cluster` assignment is overwritten in the next cell without being read.
adata.obs['age'].cat.categories
ref_cluster=pd.Categorical(adata.obs['age'],
                           categories=['61Y'])
adata.rename_categories('age',['61'])

In [998]:
# NOTE(review): this rename maps 'Donor52' onto itself, so it has no effect. The cerebellum
# section prefixes its donor with the study name ('Han-Donor29'); confirm whether 'Han-Donor52'
# was intended here before changing the label, since downstream code may rely on it.
adata.obs['donor'].cat.categories
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['Donor52'])
adata.rename_categories('donor', ['Donor52'])

In [999]:
# Harmonised per-cell metadata for the Han temporal lobe dataset.
# The dict is written to adata.obs in insertion order, i.e. the columns are
# created in the same order as before. The string 'NaN' marks unavailable fields.
harmonised_obs = {
    # dataset-level annotation
    'Organ': 'Brain',
    'Organ_Specific': adata.obs['sub_tissue'],
    'Dataset': 'Han_Brain_TemporalLobe',
    'InternDatasetNumber': '01-3-Brain_TemporalLobe-Han-2020',
    'Dataset_status': 'Healthy_Dataset',
    # cell type annotation (the unified label is re-assigned unchanged)
    'celltype': adata.obs['celltype'],
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',
    # donor annotation (age and sex are re-assigned unchanged)
    'Patient': adata.obs['donor'],
    'Patient_Number': 'NaN',
    'age': adata.obs['age'],
    'sex': adata.obs['sex'],
    'ethnicity': 'NaN',
    'health_status': 'NaN',
    # labels as provided with the loaded data
    'original_celltype_1': adata.obs['celltype_specific'],
    'original_celltype_2': adata.obs['celltype_global'],
    'original_celltype_3': 'NaN',
}
for column, value in harmonised_obs.items():
    adata.obs[column] = value

In [1001]:
# Store the expression matrix in CSR sparse format before the object is written to disk.
adata.X = sp.sparse.csr_matrix(adata.X)

In [1002]:
# Make the observation (cell) names unique.
adata.obs_names_make_unique()

In [1003]:
# Save the processed dataset as an .h5ad file.
adata.write(writepath + '01-3-Brain_TemporalLobe-Han-2020-processed.h5ad')

# 04-Gut

## 04-1-Oesophagus-Madissoon-2019

In [652]:
# Here we use sfaira to import available datasets together with their annotations.
# Note that the following steps may change depending on the current sfaira version
# and on the path to your local data repository.

# Placeholder path - replace with the location of your local sfaira data repository
datadir = '/path/to/repo/'

ds = sfaira.data.human.DatasetGroupEsophagus(path=datadir)  # This links all data sets available

In [653]:
# List the ids of the datasets in the group.
ds.ids 

In [654]:
# Pick the first dataset id of the group
idx = ds.ids[0]

In [656]:
# Load the selected dataset.
ds.datasets[idx].load()

In [657]:
# Take the AnnData object of the loaded dataset.
adata=ds.datasets[idx].adata

In [658]:
# Use the values of the `names` column of adata.var as the gene index.
adata.var.index=np.array(adata.var.names)

In [660]:
# Tag every cell with the internal dataset identifier.
adata.obs['InternDatasetNumber'] ='04-1-Oesophagus-Madissoon-2019'

In [663]:
# Inspect shape and storage format of the expression matrix.
adata.X

<87947x24245 sparse matrix of type '<class 'numpy.float32'>'
	with 155926526 stored elements in Compressed Sparse Column format>

In [664]:
# Compute scanpy's QC metrics and store them on the object (inplace=True).
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [667]:
# FILTER PARAMETERS
# The cell-level filters are commented out, i.e. not applied to this dataset:
#sc.pp.filter_cells(adata, max_counts = 14000)
#sc.pp.filter_cells(adata, max_genes = 4000)
# Filter out low-count genes: keep only genes detected in at least 30 cells
sc.pp.filter_genes(adata, min_cells=30)

filtered out 5797 genes that are detected in less than 30 cells


In [668]:
# Inspect the expression matrix again after gene filtering.
adata.X

<87947x18448 sparse matrix of type '<class 'numpy.float32'>'
	with 155870612 stored elements in Compressed Sparse Column format>

In [669]:
# Flag mitochondrial genes by their 'MT-' prefix
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (the scran step further down receives
# the matrix in dense form). The guard makes the cell safe to re-run on data
# that is already dense.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# Mitochondrial fraction per cell = summed counts of MT genes / total counts.
# NOTE(review): this reads adata.obs['n_counts'], which is not created by any visible
# cell of this section (the filter_cells calls are commented out) - presumably it is
# provided with the loaded dataset; confirm.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep only cells with a mitochondrial fraction below 20%.
# .copy() materialises the subset, so the in-place edits made later
# (size factors, counts layer, normalisation) do not operate on a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [671]:
# Scratch copy for the preliminary clustering below; `adata` itself is left unnormalised.
adata_pp=adata.copy()

In [672]:
# Perform a coarse clustering whose labels are passed to scran as `clusters`:
# per-cell normalisation to 1e6 counts, log1p, 15-component PCA, neighbour graph, Louvain (resolution 0.5).
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:12): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:21)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:11)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 15 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:24)


In [673]:
# Prepare the inputs for scran normalization:
# the cluster labels of the preliminary clustering and the transposed expression matrix of `adata`.
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [674]:
%%R -i data_mat -i input_groups -o size_factors
# Estimate size factors with scran, using the preliminary clusters;
# `size_factors` is handed back to Python through the -o flag.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [675]:
# Delete adata_pp (only needed for the preliminary clustering) and store the scran size factors per cell
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [676]:
# Convert string annotations to categoricals.
adata.strings_to_categoricals()

In [677]:
# Keep a copy of the not-yet-normalised matrix (adata.X) in a "counts" layer for downstream analysis
adata.layers["counts"] = adata.X.copy()

In [678]:
# Freeze the current state of adata in .raw (this happens before the normalisation in the next cell).
adata.raw = adata

In [679]:
# Normalize data: divide each cell (row) by its size factor in place, then apply log1p
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [680]:
# Convert the normalised matrix back to CSR sparse format.
adata.X = sp.sparse.csr_matrix(adata.X)

In [681]:
# Extract highly variable genes (top 4000, 'cell_ranger' flavour); subset=False keeps all genes in adata
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [682]:
# Calculate the visualizations: 50-component PCA restricted to the highly variable genes,
# neighbour graph and UMAP embedding
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:17)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:16)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:09)


In [684]:
# Make consistent annotations across datasets: start from the 'cell_ontology_class' labels
adata.obs['celltype']=adata.obs['cell_ontology_class'].copy()

In [685]:
# Working copy of the labels as a standalone categorical with all original categories;
# the merges in the next cell are applied to this object.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['B_CD27neg', 'B_CD27pos', 'Basal cell', 'Blood_vessel',
       'Dendritic cell', 'Epi_dividing', 'Epi_suprabasal', 'Epi_upper',
       'Glands_duct', 'Glands_mucous', 'Lymph_vessel', 'Mast cell',
       'Mono_macro', 'NK_T_CD8_Cytotoxic', 'Stratified epithelial cell',
       'Stromal cell', 'T_CD4', 'T_CD8'])

In [686]:
# Collapse fine-grained labels onto one representative label per group.
# Keys are the labels that are kept, values are the labels folded into them;
# the merges run in the order listed.
label_merges = {
    'B_CD27neg': ['B_CD27pos'],
    'Epi_dividing': ['Epi_suprabasal', 'Epi_upper'],
    'Glands_duct': ['Glands_mucous'],
    'T_CD8': ['T_CD4'],
}
for kept_label, folded_labels in label_merges.items():
    ix = np.isin(ref_cluster, folded_labels)
    ref_cluster[ix] = kept_label

In [687]:
# Store the merged labels as a categorical restricted to the labels that remain after merging.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['B_CD27neg', 'Basal cell', 'Blood_vessel',
       'Dendritic cell', 'Epi_dividing',
       'Glands_duct', 'Lymph_vessel', 'Mast cell',
       'Mono_macro', 'NK_T_CD8_Cytotoxic','Stratified epithelial cell',
       'Stromal cell', 'T_CD8'])

In [688]:
# Rename the categories to the harmonised cell type names.
# Names are matched by position, so the order must mirror the category list of the previous cell.
adata.rename_categories('celltype', ['B cells', 'Basal cells', 'Endothelial cells', 'Dendritic cells',
        'Oesophageal epithelial cells', 'Intestinal secretory cells', 'Lymphatic endothelial cells', 'Mast cells', 'Macrophages','NK cells', 'Mucosal squamous cells',
        'Mesenchymal stromal cells', 'T cells'])

In [691]:
# List the patient identifiers present in the dataset.
adata.obs['patient'].cat.categories

Index(['296C', '325C', '328C', '356C', '362C', '367C'], dtype='object')

In [692]:
# Copy the patient identifiers into 'donor' and rename them (by position) to dataset-prefixed donor labels.
# NOTE(review): the `ref_cluster` assignment is not read by the remaining cells of this section.
adata.obs['donor'] = adata.obs['patient']
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['296C', '325C', '328C', '356C', '362C', '367C'])
adata.rename_categories('donor', ['Madissoon_Oesophagus-Donor1', 'Madissoon_Oesophagus-Donor2', 'Madissoon_Oesophagus-Donor3', 'Madissoon_Oesophagus-Donor4', 'Madissoon_Oesophagus-Donor5', 'Madissoon_Oesophagus-Donor6'])

In [693]:
# Harmonised per-cell metadata for the Madissoon oesophagus dataset.
# The dict is written to adata.obs in insertion order, i.e. the columns are
# created in the same order as before. The string 'NaN' marks unavailable fields.
harmonised_obs = {
    # dataset-level annotation
    'Organ': 'Oesophagus',
    'Organ_Specific': 'Oesophagus',
    'Dataset': 'Madissoon_Oesophagus',
    'InternDatasetNumber': '04-1-Oesophagus-Madissoon-2019',
    'Dataset_status': 'Healthy_Dataset',
    # cell type annotation (the unified label is re-assigned unchanged)
    'celltype': adata.obs['celltype'],
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',
    # donor annotation
    'Patient': adata.obs['donor'],
    'Patient_Number': adata.obs['sample'],
    'age': 'NaN',
    'sex': 'NaN',
    'ethnicity': 'NaN',
    'health_status': 'NaN',
    # labels as provided with the loaded data
    'original_celltype_1': adata.obs['cell_ontology_class'],
    'original_celltype_2': adata.obs['Celltypes'],
    'original_celltype_3': 'NaN',
}
for column, value in harmonised_obs.items():
    adata.obs[column] = value

In [695]:
# Ensure the expression matrix is stored in CSR sparse format before writing.
adata.X = sp.sparse.csr_matrix(adata.X)

In [696]:
# Make the observation (cell) names unique.
adata.obs_names_make_unique()

In [697]:
# Save the processed dataset as an .h5ad file.
adata.write(writepath+'04-1-Oesophagus-Madissoon-2019-processed.h5ad')

## 04-2-Gut_Colon-James-2020

For the colon dataset of James et al., the data were obtained from the original study (https://doi.org/10.1038/s41590-020-0602-z).

In [109]:
# Read the James et al. dataset from the .h5ad file stored under `writepath`.
adata=sc.read(writepath + 'james2020_raw.h5ad')

In [110]:
# Show a summary of the loaded AnnData object.
adata

AnnData object with n_obs × n_vars = 41650 × 18927
    obs: 'donor', 'region', 'n_genes', 'percent_mito', 'n_counts', 'cell_type', 'BCR_ChainCombination', 'BCR_SEQUENCE_ID', 'IGH_V_CALL_GENOTYPED', 'IGH_D_CALL', 'IGH_J_CALL', 'BCR_ISOTYPE', 'BCR_CLONE', 'BCR_PANDONOR_CLONE', 'IGL_C_Gene', 'IGL_VDJ_Gene', 'IGH_READS', 'IGH_UMIS', 'IGH_MU_FREQ', 'IGK_READS', 'IGK_UMIS', 'IGL_READS', 'IGL_UMIS', 'TCR_v_gene', 'TCR_d_gene', 'TCR_j_gene', 'TCR_c_gene', 'TCR_cdr3', 'TCR_cdr3_nt', 'TCR_reads', 'TCR_umis', 'TCR_Clone'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_ids', 'feature_types'
    uns: 'leiden', 'neighbors', 'pca', 'rank_genes_groups'
    obsm: 'X_umap'
    obsp: 'connectivities', 'distances'

In [111]:
# Tag every cell with the internal dataset identifier.
adata.obs['InternDatasetNumber'] ='04-2-Gut_Colon-James-2020'

In [113]:
# Compute scanpy's QC metrics and store them on the object (inplace=True).
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [116]:
# FILTER PARAMETERS
# Filter out cells with more than 60000 counts or more than 5000 detected genes
sc.pp.filter_cells(adata, max_counts = 60000)
sc.pp.filter_cells(adata, max_genes = 5000)
# Filter out low-count genes: keep only genes detected in at least 20 cells
sc.pp.filter_genes(adata, min_cells=20)

filtered out 10 cells that have more than 5000 genes expressed
filtered out 3755 genes that are detected in less than 20 cells


In [117]:
# Inspect shape and storage format of the expression matrix after filtering.
adata.X

<41640x15172 sparse matrix of type '<class 'numpy.float32'>'
	with 57674816 stored elements in Compressed Sparse Row format>

In [118]:
# Flag mitochondrial genes by their 'MT-' prefix
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (the scran step further down receives
# the matrix in dense form). The guard makes the cell safe to re-run on data
# that is already dense.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# Mitochondrial fraction per cell = summed counts of MT genes / total counts
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep only cells with a mitochondrial fraction below 20%.
# .copy() materialises the subset, so the in-place edits made later
# (size factors, counts layer, normalisation) do not operate on a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [119]:
# Scratch copy for the preliminary clustering below; `adata` itself is left unnormalised.
adata_pp=adata.copy()

In [120]:
# Perform a coarse clustering whose labels are passed to scran as `clusters`:
# per-cell normalisation to 1e6 counts, log1p, 15-component PCA, neighbour graph, Louvain (resolution 0.5).
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:02): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 15 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:03)


In [121]:
# Prepare the inputs for scran normalization:
# the cluster labels of the preliminary clustering and the transposed expression matrix of `adata`.
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [122]:
%%R -i data_mat -i input_groups -o size_factors
# Estimate size factors with scran, using the preliminary clusters;
# `size_factors` is handed back to Python through the -o flag.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [123]:
# Delete adata_pp (only needed for the preliminary clustering) and store the scran size factors per cell
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [124]:
# Convert string annotations to categoricals.
adata.strings_to_categoricals()

In [125]:
# Keep a copy of the not-yet-normalised matrix (adata.X) in a "counts" layer for downstream analysis
adata.layers["counts"] = adata.X.copy()

In [126]:
# Freeze the current state of adata in .raw (this happens before the normalisation in the next cell).
adata.raw = adata

In [127]:
# Normalize data: divide each cell (row) by its size factor in place, then apply log1p
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [128]:
# Convert the normalised matrix back to CSR sparse format.
adata.X = sp.sparse.csr_matrix(adata.X)

In [129]:
# Extract highly variable genes (top 4000, 'cell_ranger' flavour); subset=False keeps all genes in adata
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [130]:
# Calculate the visualizations: 50-component PCA restricted to the highly variable genes,
# neighbour graph and UMAP embedding
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:14)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:08)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:35)


In [131]:
#sc.pl.umap(adata, color='cell_type', palette=palette)

In [132]:
# Make consistent annotations across datasets: start from the 'cell_type' labels of the original study
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [133]:
# Working copy of the labels as a standalone categorical with all original categories;
# the merges in the next cell are applied to this object.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Activated CD4 T', 'B cell IgA Plasma', 'B cell IgG Plasma',
       'B cell cycling', 'Follicular B cell', 'B cell memory', 'CD8 T', 'ILC',
       'Lymphoid DC', 'Monocyte', 'Mast', 'Macrophage', 'LYVE1 Macrophage',
       'NK', 'Tcm', 'Tfh', 'Th1', 'Th17', 'Treg', 'cDC1', 'cDC2',
       'cycling DCs', 'pDC', 'gd T', 'cycling gd T'])

In [134]:
# Collapse fine-grained labels onto one representative label per group.
# Keys are the labels that are kept, values are the labels folded into them;
# the merges run in the order listed.
label_merges = {
    'Activated CD4 T': ['CD8 T', 'Tcm', 'Tfh', 'Th1', 'Th17', 'Treg', 'gd T', 'cycling gd T'],
    'B cell cycling': ['Follicular B cell', 'B cell memory'],
    'B cell IgA Plasma': ['B cell IgG Plasma'],
    'Lymphoid DC': ['cDC1', 'cDC2', 'cycling DCs', 'pDC'],
    'Macrophage': ['LYVE1 Macrophage'],
}
for kept_label, folded_labels in label_merges.items():
    ix = np.isin(ref_cluster, folded_labels)
    ref_cluster[ix] = kept_label

In [135]:
# Store the merged labels as a categorical restricted to the labels that remain after merging.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Activated CD4 T', 'B cell IgA Plasma', 'B cell cycling', 'ILC',
       'Lymphoid DC', 'Monocyte', 'Mast', 'Macrophage','NK'])

In [136]:
# Rename the categories to the harmonised cell type names.
# Names are matched by position, so the order must mirror the category list of the previous cell.
adata.rename_categories('celltype', ['T cells', 'Plasma cells', 'B cells', 'Innate lymphoid cells', 'Dendritic cells',
       'Monocytes', 'Mast cells', 'Macrophages', 'NK cells'])

  res = method(*args, **kwargs)


In [140]:
# Rename the donor identifiers (by position) to dataset-prefixed donor labels.
# NOTE(review): the first expression is not displayed and the `ref_cluster` assignment is
# not read by the remaining cells of this section.
adata.obs['donor'].cat.categories
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['290b', '298c', '302c', '390c', '417c'])
adata.rename_categories('donor', ['James_Gut_Colon-Donor1', 'James_Gut_Colon-Donor2', 'James_Gut_Colon-Donor3', 'James_Gut_Colon-Donor4', 'James_Gut_Colon-Donor5'])

  res = method(*args, **kwargs)


In [141]:
# Harmonised per-cell metadata for the James colon dataset.
# The dict is written to adata.obs in insertion order, i.e. the columns are
# created in the same order as before. The string 'NaN' marks unavailable fields.
harmonised_obs = {
    # dataset-level annotation
    'Organ': 'Gut_Colon',
    'Organ_Specific': 'Gut_Colon',
    'Dataset': 'James_Gut_Colon',
    'InternDatasetNumber': '04-2-Gut_Colon-James-2020',
    'Dataset_status': 'Healthy_Dataset',
    # cell type annotation (the unified label is re-assigned unchanged)
    'celltype': adata.obs['celltype'],
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',
    # donor annotation
    'Patient': adata.obs['donor'],
    'Patient_Number': 'NaN',
    'age': 'NaN',
    'sex': 'NaN',
    'ethnicity': 'NaN',
    'health_status': 'NaN',
    # labels as provided with the loaded data
    'original_celltype_1': adata.obs['cell_type'],
    'original_celltype_2': 'NaN',
    'original_celltype_3': 'NaN',
}
for column, value in harmonised_obs.items():
    adata.obs[column] = value

In [143]:
# Ensure the expression matrix is stored in CSR sparse format before writing.
adata.X = sp.sparse.csr_matrix(adata.X)

In [144]:
# Make the observation (cell) names unique.
adata.obs_names_make_unique()

In [146]:
# Save the processed dataset as an .h5ad file.
adata.write(writepath + '04-2-Gut_Colon-James-2020-processed.h5ad')

## 04-3-Gut_Colon-Simmons-2021

In [10]:
# Download all datasets of the target collection through sfaira's database interface.
target_collections = ["60358420-6055-411d-ba4f-e8ac80682a2e"]
# Local cache folder for the downloaded data (./data)
cache_path = os.path.join(".", "data")
dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)
# Restrict the dataset group to the target collection
dsg.subset(key="collection_id", values=target_collections)
# (not the last expression of the cell, so nothing is displayed here)
dsg.datasets
dsg.download()

Downloading: ncbitaxon_v2021-06-10.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/ncbitaxon
Downloading: efo.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/efo
Downloading: hsapdv_master.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/hsapdv
Downloading: mmusdv.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/mmusdv
Downloading: uberon_v2021-07-27.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/uberon
Ontology <class 'sfaira.versions.metadata.base.OntologyUberonLifecyclestage'> is not a DAG, treat child-parent reasoning with care.
Downloading: mondo_v2021-08-11.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/mondo
Ontology <class 'sfaira.versions.metadata.base.OntologyMondo'> is not a DAG, treat child-parent reasoning with care.
Ontology <class 'sfaira.versions.metadata.base.OntologyUberon'> is not a DAG, treat child-parent reasoning with care.
Downloading: pato_v2021-08-06.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/pato


In [11]:
# Folder that holds the .h5ad files of the collection (placeholder path - adapt to your setup)
path = '/path/to/repo/60358420-6055-411d-ba4f-e8ac80682a2e/'
# Names of all regular files in that folder
files = [f for f in listdir(path) if isfile(join(path, f))]

In [13]:
#1: '774de9c6-9752-4e39-89a9-2a88c869d52a.h5ad'
# Read the file and use the 'feature_name' column of .var as the gene index.
path_2 = path + '774de9c6-9752-4e39-89a9-2a88c869d52a.h5ad'
u1 = sc.read_h5ad(path_2)
u1.var.index = u1.var['feature_name']

In [14]:
#2: '08e94873-c2a6-4f7d-ab72-aeaff3e3f929.h5ad'
# Read the file and use the 'feature_name' column of .var as the gene index.
path_2 = path + '08e94873-c2a6-4f7d-ab72-aeaff3e3f929.h5ad'
u2 = sc.read_h5ad(path_2)
u2.var.index = u2.var['feature_name']

In [15]:
#3: '2d66790a-6621-4a49-8f0d-4002db5cc98d.h5ad'
path_2 = path +  '2d66790a-6621-4a49-8f0d-4002db5cc98d.h5ad'
u3 = sc.read_h5ad(path_2)
u3.var.index = u3.var['feature_name']

In [16]:
#4: '4d2e0563-cf4a-48bd-aa7f-efc26025b53a.h5ad'
path_2 = path +  '4d2e0563-cf4a-48bd-aa7f-efc26025b53a.h5ad'
u4 = sc.read_h5ad(path_2)
u4.var.index = u4.var['feature_name']

In [17]:
#5: '58679288-9ecc-4647-9781-12a3a8f8c6fd.h5ad'
path_2 = path +  '58679288-9ecc-4647-9781-12a3a8f8c6fd.h5ad'
u5 = sc.read_h5ad(path_2)
u5.var.index = u5.var['feature_name']

In [18]:
#6: 'fd89be61-2869-4342-a86e-e1fce3a8f269.h5ad'
path_2 = path +  'fd89be61-2869-4342-a86e-e1fce3a8f269.h5ad'
u6 = sc.read_h5ad(path_2)
u6.var.index = u6.var['feature_name']

In [19]:
#7: c42c8ad3-9761-49e5-b9bf-ee8ebd50416f.h5ad'
path_2 = path +  'c42c8ad3-9761-49e5-b9bf-ee8ebd50416f.h5ad'
u7 = sc.read_h5ad(path_2)
u7.var.index = u7.var['feature_name']

In [20]:
#8: 'aa0b5adb-957d-4f15-ab83-2c5cc2843f77.h5ad'
# The path previously pointed at 'fd89be61-...' (already loaded as u6), so u8
# was a duplicate of u6 and this file was never read.
path_2 = path +  'aa0b5adb-957d-4f15-ab83-2c5cc2843f77.h5ad'
u8 = sc.read_h5ad(path_2)
# Index the genes by the 'feature_name' column of .var
u8.var.index = u8.var['feature_name']

In [21]:
#9: 'abd889c6-f60a-4fbd-924e-ee1e9dcf909b.h5ad'
path_2 = path +  'abd889c6-f60a-4fbd-924e-ee1e9dcf909b.h5ad'
u9 = sc.read_h5ad(path_2)
u9.var.index = u9.var['feature_name']

In [22]:
#10: '4269074c-f2c1-4d88-b2c3-0946f59d5449.h5ad'
path_2 = path +  '4269074c-f2c1-4d88-b2c3-0946f59d5449.h5ad'
u10 = sc.read_h5ad(path_2)
u10.var.index = u10.var['feature_name']

In [23]:
#11: 'b9b4cf27-9c22-410d-8bd8-5d43e379485b.h5ad'
path_2 = path +  'b9b4cf27-9c22-410d-8bd8-5d43e379485b.h5ad'
u11 = sc.read_h5ad(path_2)
u11.var.index = u11.var['feature_name']

In [24]:
#11: 'e006d4e3-35fa-44b4-9981-09a66c4322e5.h5ad'
path_2 = path +  'e006d4e3-35fa-44b4-9981-09a66c4322e5.h5ad'
u11 = sc.read_h5ad(path_2)
u11.var.index = u11.var['feature_name']

In [25]:
#12: '4506d9e3-4543-4464-aeae-b0b04eee1cea.h5ad'
path_2 = path +  '4506d9e3-4543-4464-aeae-b0b04eee1cea.h5ad'
u12 = sc.read_h5ad(path_2)
u12.var.index = u12.var['feature_name']

In [26]:
#13: 'bbd16004-09e8-4b6c-b465-73ff83a52837.h5ad'
path_2 = path +  'bbd16004-09e8-4b6c-b465-73ff83a52837.h5ad'
u13 = sc.read_h5ad(path_2)
u13.var.index = u13.var['feature_name']

In [27]:
#14: '9dfd2243-74d6-4924-86bd-c206ca9287b1.h5ad',
path_2 = path +  '9dfd2243-74d6-4924-86bd-c206ca9287b1.h5ad'
u14 = sc.read_h5ad(path_2)
u14.var.index = u14.var['feature_name']

In [28]:
#15: '04b0eb97-d816-44bb-93a5-8b2968791aa0.h5ad'
path_2 = path +  '04b0eb97-d816-44bb-93a5-8b2968791aa0.h5ad'
u15 = sc.read_h5ad(path_2)
u15.var.index = u15.var['feature_name']

In [29]:
#16:  '9d5df009-eb76-43a3-b6cd-22017cc53700.h5ad'
path_2 = path +   '9d5df009-eb76-43a3-b6cd-22017cc53700.h5ad'
u16 = sc.read_h5ad(path_2)
u16.var.index = u16.var['feature_name']

In [31]:
adata = u1.concatenate(u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15, u16, join='outer', index_unique="_")

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [32]:
# Read every file of the collection and merge them one by one into `adata`.
# Each object is tagged with its source file name in obs['id'] and gets a copy
# of the feature names in var['Gene'].
# The pairwise concatenation is kept on purpose: a later cell addresses the
# var column by the suffixes that the repeated concatenate() calls produce.
for i, fname in enumerate(files):
    print(fname)
    path_2 = path + fname
    u = sc.read_h5ad(path_2)
    u.obs['id'] = fname
    # (the former 'NaN' placeholder was overwritten immediately, so it is dropped)
    u.var['Gene'] = u.var['feature_name']
    if i == 0:
        adata = u
    else:
        # inner join: keep only genes present in every object merged so far
        adata = adata.concatenate(u, join='inner', index_unique="_")

774de9c6-9752-4e39-89a9-2a88c869d52a.h5ad
08e94873-c2a6-4f7d-ab72-aeaff3e3f929.h5ad
2d66790a-6621-4a49-8f0d-4002db5cc98d.h5ad
4d2e0563-cf4a-48bd-aa7f-efc26025b53a.h5ad
58679288-9ecc-4647-9781-12a3a8f8c6fd.h5ad
fd89be61-2869-4342-a86e-e1fce3a8f269.h5ad
c42c8ad3-9761-49e5-b9bf-ee8ebd50416f.h5ad
aa0b5adb-957d-4f15-ab83-2c5cc2843f77.h5ad
abd889c6-f60a-4fbd-924e-ee1e9dcf909b.h5ad
4269074c-f2c1-4d88-b2c3-0946f59d5449.h5ad
b9b4cf27-9c22-410d-8bd8-5d43e379485b.h5ad
e006d4e3-35fa-44b4-9981-09a66c4322e5.h5ad
4506d9e3-4543-4464-aeae-b0b04eee1cea.h5ad
bbd16004-09e8-4b6c-b465-73ff83a52837.h5ad
9dfd2243-74d6-4924-86bd-c206ca9287b1.h5ad
04b0eb97-d816-44bb-93a5-8b2968791aa0.h5ad
9d5df009-eb76-43a3-b6cd-22017cc53700.h5ad


In [36]:
adata.obs['InternDatasetNumber'] ='04-3-Gut_Colon_SmallIntestine-Simmons-2021'

In [40]:
# Use the feature names as the gene index. The column name carries a chain of
# '-0' suffixes; presumably one is appended per concatenate() call in the
# loading loop (TODO confirm). The hard-coded name only matches while the
# number of concatenated files stays the same.
adata.var.index = adata.var['feature_name-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0']

In [44]:
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [45]:
#calculate QC covariates
# Row sums of a sparse matrix come back as an (n_cells, 1) np.matrix; flatten
# them to 1-D arrays so the obs columns hold plain numeric vectors. This also
# works unchanged when adata.X is dense.
adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
# number of genes with a non-zero entry per cell
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()

In [48]:
adata.X

<149329x13796 sparse matrix of type '<class 'numpy.float32'>'
	with 325064862 stored elements in Compressed Sparse Row format>

In [49]:
# FILTER PARAMETERS
#Filter out cells
# Upper bounds only: drop cells above 6000 counts, then cells above 9000 genes
sc.pp.filter_cells(adata, max_counts = 6000)
sc.pp.filter_cells(adata, max_genes = 9000)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20)

filtered out 38 cells that have more than 6000 counts


In [50]:
adata.X

<149291x13796 sparse matrix of type '<class 'numpy.float32'>'
	with 324831268 stored elements in Compressed Sparse Row format>

In [51]:
# get mt genes (identified by the 'MT-' prefix of the gene name)
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense (the in-place normalization further down works on a dense
# matrix); skip the conversion if X is already dense
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.toarray()

# mt score: fraction of each cell's counts that comes from mt genes
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with 20% mito fraction or more
# .copy() turns the filtered view into a real object, so later writes to
# .obs / .X do not act on a view of the unfiltered data
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [53]:
adata_pp=adata.copy()

In [54]:
#Perform a clustering for scran normalization in clusters
# Everything here runs on the temporary copy adata_pp, not on adata.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Cluster labels are written to adata_pp.obs['groups']
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:15): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:33)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:31)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 19 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:29)


In [55]:
#Preprocess variables for scran normalization
# Cluster labels come from the temporary copy adata_pp; the matrix is taken
# from adata itself and transposed before it is handed to the R cell.
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [57]:
%%R -i data_mat -i input_groups -o size_factors
# data_mat and input_groups must already exist in the Python session;
# size_factors is handed back to Python when the cell finishes.
require(scran)
# Size factors from scran, computed with the cluster labels as `clusters`
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

R[write to console]: Loading required package: scran

R[write to console]: Loading required package: SingleCellExperiment

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    d

In [58]:
#Delete adata_pp
# The temporary clustering copy is no longer needed once the size factors exist.
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [59]:
adata.strings_to_categoricals()

In [60]:
# Keep a copy of the current adata.X (taken before the size-factor
# normalization that follows) in the 'counts' layer for downstream analysis
adata.layers["counts"] = adata.X.copy()

In [61]:
adata.raw = adata

In [62]:
#Normalize data
# Divide every row of X by the matching entry of obs['size_factors']
# ([:, None] makes the factors a column vector), then apply log1p.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [63]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [64]:
# extract highly variable genes
# n_top_genes=4000 overrides the other cutoffs (see the log message below).
# subset=False: presumably the genes are only flagged in adata.var and none
# are removed -- TODO confirm against the installed scanpy version.
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:01)


In [69]:
# Working copy of the cell type annotation as a Categorical with an explicit
# category list; labels missing from the list would become NaN.
# NOTE(review): adata.obs['celltype'] is not created in any cell shown in this
# section -- it has to exist already when this cell runs.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['B cell', 'dendritic cell', 'enteric smooth muscle cell', 'enterocyte',
       'enteroendocrine cell', 'erythroid lineage cell', 'fibroblast',
       'glial cell', 'group 3 innate lymphoid cell', 'gut absorptive cell',
       'gut endothelial cell', 'inhibitory motor neuron', 'interneuron',
       'interstitial cell of Cajal',
       'intestinal crypt stem cell of large intestine',
       'intestinal crypt stem cell of small intestine',
       'intestinal epithelial cell', 'intestine goblet cell', 'leukocyte',
       'macrophage', 'mast cell', 'mesothelial cell', 'monocyte',
       'motor neuron', 'myofibroblast cell', 'naive T cell',
       'natural killer cell', 'neural cell', 'neuroendocrine cell',
       'pericyte cell', 'plasmacytoid dendritic cell', 'precursor B cell',
       'progenitor cell', 'secretory cell',
       'smooth muscle cell of large intestine',
       'smooth muscle cell of small intestine',
       'transit amplifying cell of colon',
       'transit amplifying cell of small intestine'])

In [70]:
# Collapse fine-grained annotations into broader groups. Each entry maps the
# label that is kept to the labels folded into it; entries are applied in the
# order written.
celltype_merges = {
    'dendritic cell': ['plasmacytoid dendritic cell'],
    'enterocyte': ['gut absorptive cell', 'intestinal epithelial cell'],
    'inhibitory motor neuron': ['interneuron', 'interstitial cell of Cajal', 'motor neuron', 'neural cell'],
    'intestinal crypt stem cell of small intestine': ['intestinal crypt stem cell of large intestine',
                                                      'transit amplifying cell of colon',
                                                      'transit amplifying cell of small intestine'],
    'B cell': ['precursor B cell'],
    'secretory cell': ['intestine goblet cell', 'enteroendocrine cell'],
    'enteric smooth muscle cell': ['smooth muscle cell of small intestine',
                                   'smooth muscle cell of large intestine'],
}
for merged_label, source_labels in celltype_merges.items():
    # boolean mask of the cells that currently carry one of the source labels
    ix = np.isin(ref_cluster, source_labels)
    ref_cluster[ix] = merged_label

In [71]:
# Write the merged labels back, restricting the categories to the labels that
# remain after the merges above (the merged-away names are left out of the list).
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['B cell', 'dendritic cell', 'enteric smooth muscle cell', 'enterocyte',
     'erythroid lineage cell', 'fibroblast',
       'glial cell', 'group 3 innate lymphoid cell',
       'gut endothelial cell', 'inhibitory motor neuron',


       'intestinal crypt stem cell of small intestine',
       'leukocyte',
       'macrophage', 'mast cell', 'mesothelial cell', 'monocyte',
      'myofibroblast cell', 'naive T cell',
       'natural killer cell',  'neuroendocrine cell',
       'pericyte cell', 
       'progenitor cell', 'secretory cell'])

In [75]:
adata.obs['id'].cat.categories

Index(['2d66790a-6621-4a49-8f0d-4002db5cc98d.h5ad',
       '04b0eb97-d816-44bb-93a5-8b2968791aa0.h5ad',
       '4d2e0563-cf4a-48bd-aa7f-efc26025b53a.h5ad',
       '08e94873-c2a6-4f7d-ab72-aeaff3e3f929.h5ad',
       '9d5df009-eb76-43a3-b6cd-22017cc53700.h5ad',
       '9dfd2243-74d6-4924-86bd-c206ca9287b1.h5ad',
       '774de9c6-9752-4e39-89a9-2a88c869d52a.h5ad',
       '4506d9e3-4543-4464-aeae-b0b04eee1cea.h5ad',
       '4269074c-f2c1-4d88-b2c3-0946f59d5449.h5ad',
       '58679288-9ecc-4647-9781-12a3a8f8c6fd.h5ad',
       'aa0b5adb-957d-4f15-ab83-2c5cc2843f77.h5ad',
       'abd889c6-f60a-4fbd-924e-ee1e9dcf909b.h5ad',
       'b9b4cf27-9c22-410d-8bd8-5d43e379485b.h5ad',
       'bbd16004-09e8-4b6c-b465-73ff83a52837.h5ad',
       'c42c8ad3-9761-49e5-b9bf-ee8ebd50416f.h5ad',
       'e006d4e3-35fa-44b4-9981-09a66c4322e5.h5ad',
       'fd89be61-2869-4342-a86e-e1fce3a8f269.h5ad'],
      dtype='object')

In [76]:
# Map the sex ontology ids to readable labels by name instead of by position,
# so the result does not depend on the order of the categories and re-running
# the cell is harmless. Categories missing from the mapping are left untouched.
sex_labels = {'PATO:0000384': 'Male', 'unknown': 'NaN'}
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].cat.rename_categories(sex_labels)

In [77]:
# Map the tissue names to the organ labels used in this notebook, by name
# rather than by category position. Unlisted categories are left untouched.
tissue_labels = {'colon': 'Gut_Colon', 'intestine': 'Gut_SmallIntestine'}
adata.obs['tissue'] = adata.obs['tissue'].cat.rename_categories(tissue_labels)

In [78]:
# Shorten the development stage names, mapping by name rather than by
# category position. Unlisted categories are left untouched.
stage_labels = {'66-year-old human stage': '66', 'fetal stage': 'Fetal'}
adata.obs['development_stage'] = adata.obs['development_stage'].cat.rename_categories(stage_labels)

In [79]:
# Derive a donor label from the source file of each cell. The mapping is
# spelled out per file name, so it no longer depends on the order in which
# the categories happen to be stored.
# NOTE(review): every file is treated as one donor here -- confirm that the
# files of this collection really correspond to distinct donors.
donor_labels = {
    '2d66790a-6621-4a49-8f0d-4002db5cc98d.h5ad': 'Simmons_Gut_Colon-Donor1',
    '04b0eb97-d816-44bb-93a5-8b2968791aa0.h5ad': 'Simmons_Gut_Colon-Donor2',
    '4d2e0563-cf4a-48bd-aa7f-efc26025b53a.h5ad': 'Simmons_Gut_Colon-Donor3',
    '08e94873-c2a6-4f7d-ab72-aeaff3e3f929.h5ad': 'Simmons_Gut_Colon-Donor4',
    '9d5df009-eb76-43a3-b6cd-22017cc53700.h5ad': 'Simmons_Gut_Colon-Donor5',
    '9dfd2243-74d6-4924-86bd-c206ca9287b1.h5ad': 'Simmons_Gut_Colon-Donor6',
    '774de9c6-9752-4e39-89a9-2a88c869d52a.h5ad': 'Simmons_Gut_Colon-Donor7',
    '4506d9e3-4543-4464-aeae-b0b04eee1cea.h5ad': 'Simmons_Gut_Colon-Donor8',
    '4269074c-f2c1-4d88-b2c3-0946f59d5449.h5ad': 'Simmons_Gut_Colon-Donor9',
    '58679288-9ecc-4647-9781-12a3a8f8c6fd.h5ad': 'Simmons_Gut_Colon-Donor10',
    'aa0b5adb-957d-4f15-ab83-2c5cc2843f77.h5ad': 'Simmons_Gut_Colon-Donor11',
    'abd889c6-f60a-4fbd-924e-ee1e9dcf909b.h5ad': 'Simmons_Gut_Colon-Donor12',
    'b9b4cf27-9c22-410d-8bd8-5d43e379485b.h5ad': 'Simmons_Gut_Colon-Donor13',
    'bbd16004-09e8-4b6c-b465-73ff83a52837.h5ad': 'Simmons_Gut_Colon-Donor14',
    'c42c8ad3-9761-49e5-b9bf-ee8ebd50416f.h5ad': 'Simmons_Gut_Colon-Donor15',
    'e006d4e3-35fa-44b4-9981-09a66c4322e5.h5ad': 'Simmons_Gut_Colon-Donor16',
    'fd89be61-2869-4342-a86e-e1fce3a8f269.h5ad': 'Simmons_Gut_Colon-Donor17',
}
# obs['id'] is categorical at this point (its categories are inspected above)
adata.obs['donor'] = adata.obs['id'].cat.rename_categories(donor_labels)

In [82]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [83]:
adata.obs_names_make_unique()

In [None]:
adata.obs['InternDatasetNumber'] ='04-3-Gut_Colon_SmallIntestine-Simmons-2021'

In [None]:
adata.write(writepath+'04-3-Gut_Colon_SmallIntestine-Simmons-2021-processed.h5ad')

## 04-4-Gut_Colon-Wang-2019

In [698]:
ID = 'homosapiens_colon_2019_10x3transcriptionprofiling_wang_001_10.1084/jem.20191130'

In [699]:
# Set this path to your local sfaira data repository
basedir = '.'
datadir = os.path.join(basedir, 'raw')
metadir = os.path.join(basedir, 'meta')
cachedir = os.path.join(basedir, 'cache')

In [700]:
# Build the sfaira universe on the local raw / meta / cache folders
ds = sfaira.data.Universe(data_path=datadir, meta_path=metadir, cache_path=cachedir)
# subset to the selected dataset
ds.subset(key="id", values=[ID])  # keeps only the dataset whose id equals ID
# download and load the specific dataset
ds.download()
ds.load(verbose=1)
# get the unmodified adata object of the dataset
adata = ds.datasets[ID].adata

Downloading: wang20_colon.processed.h5ad
loading homosapiens_colon_2019_10x3transcriptionprofiling_wang_001_10.1084/jem.20191130


In [701]:
adata.obs['InternDatasetNumber'] ='04-4-Gut_Colon-Wang-2019'

In [714]:
adata_pp=adata.copy()

In [715]:
#Perform a clustering for scran normalization in clusters
# Everything here runs on the temporary copy adata_pp, not on adata.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

#Preprocess variables for scran normalization
# The R cell that follows reads `input_groups` and `data_mat`. They were not
# defined anywhere in this section, so on a fresh kernel the cell failed and
# in a used kernel it silently picked up the values of another dataset.
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T
if sp.sparse.issparse(data_mat):
    # hand a dense matrix to R, as the other dataset sections do
    data_mat = data_mat.toarray()

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)



These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.


computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 8 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [718]:
%%R -i data_mat -i input_groups -o size_factors
# data_mat and input_groups must already exist in the Python session;
# size_factors is handed back to Python when the cell finishes.
require(scran)
# Size factors from scran, computed with the cluster labels as `clusters`
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [719]:
#Delete adata_pp
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [720]:
adata.strings_to_categoricals()

In [721]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [722]:
adata.raw = adata

In [723]:
#Normalize data
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [724]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [725]:
# extract highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [728]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['CellType'].copy()

In [729]:
adata.obs['celltype'].cat.categories

Index(['Enteriendocrine', 'Enterocyte', 'Goblet', 'Paneth-like', 'Progenitor',
       'Stem Cell', 'TA'],
      dtype='object')

In [730]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories= ['Enteriendocrine', 'Enterocyte', 'Goblet', 'Paneth-like', 'Progenitor',
       'Stem Cell', 'TA'])

In [731]:
# Fold the finer labels into the label that is kept (applied in this order).
celltype_merges = {
    'Stem Cell': ['TA'],
    'Enteriendocrine': ['Paneth-like', 'Goblet'],
}
for merged_label, source_labels in celltype_merges.items():
    # boolean mask of the cells that currently carry one of the source labels
    ix = np.isin(ref_cluster, source_labels)
    ref_cluster[ix] = merged_label

In [732]:
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Enteriendocrine', 'Enterocyte', 'Progenitor',
       'Stem Cell'])

In [733]:
adata.rename_categories('celltype', ['Intestinal secretory cells', 'Intestinal epithelial cells', 'Intestinal progenitor cells',
       'Intestinal stem cells'])

In [737]:
# Donor labels are derived from the sample ids. The mapping is by name, so it
# does not depend on the order of the categories and can be re-run safely.
donor_labels = {'Colon-1': 'Wang_Gut_Colon-Donor1', 'Colon-2': 'Wang_Gut_Colon-Donor2'}
adata.obs['donor'] = adata.obs['Sample_ID'].cat.rename_categories(donor_labels)

In [738]:
# Harmonised per-cell metadata columns for this dataset, written in the order
# listed. Plain strings are broadcast to every cell; the string 'NaN' marks
# fields for which no value is set here.
# NOTE(review): 'Dataset_status' is 'HealthyProject' here while another
# dataset section of this notebook uses 'Healthy_Dataset' -- confirm which
# spelling downstream code expects.
obs_annotations = {
    'Organ': 'Gut_Colon',
    'Organ_Specific': 'Gut_Colon',
    'Dataset': 'Wang_Gut_Colon',
    'InternDatasetNumber': '04-4-Gut_Colon-Wang-2019',
    'Dataset_status': 'HealthyProject',

    'celltype': adata.obs['celltype'],
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',

    'Patient': 'NaN',
    'Patient_Number': adata.obs['Sample_ID'],
    'age': 'NaN',
    'sex': 'NaN',
    'ethnicity': 'NaN',
    'health_status': 'NaN',

    'original_celltype_1': adata.obs['CellType'],
    'original_celltype_2': 'NaN',
    'original_celltype_3': 'NaN',
}
for column, value in obs_annotations.items():
    adata.obs[column] = value

In [740]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [741]:
adata.obs_names_make_unique()

In [742]:
adata.write(writepath+ '04-4-Gut_Colon-Wang-2019-processed.h5ad')

## 04-6-Gut_Colon-Pisco-2022

In [93]:
# Keep only the cells annotated as 'large intestine' and work on a copy.
# NOTE(review): adata_pisco is not created in this section; it has to exist
# in the kernel already when this cell runs.
ix=np.isin(adata_pisco.obs['tissue'],['large intestine']) 
adata=adata_pisco[ix].copy()



In [96]:
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [97]:
# Expose the QC metrics computed above under the column names used by the
# other dataset sections.
adata.obs['n_counts'] = adata.obs['total_counts']
# log_counts is the log of the counts per cell, as in the other sections; it
# was previously filled from 'log1p_n_genes_by_counts', a gene-count metric.
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = adata.obs['n_genes_by_counts']

In [102]:
# FILTER PARAMETERS
#Filter out cells
# (the per-cell upper bounds are disabled for this dataset)
#sc.pp.filter_cells(adata, max_counts = 12000)
#sc.pp.filter_cells(adata, max_genes = 7000)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20) # only the min_cells=20 gene filter is active

filtered out 37317 genes that are detected in less than 20 cells


In [105]:
# get mt genes (identified by the 'MT-' prefix of the gene name)
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense (the in-place normalization further down works on a dense
# matrix); skip the conversion if X is already dense
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.toarray()

# mt score: fraction of each cell's counts that comes from mt genes
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with 20% mito fraction or more
# .copy() turns the filtered view into a real object, so later writes to
# .obs / .X do not act on a view of the unfiltered data
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [106]:
adata_pp=adata.copy()

In [107]:
#Perform a clustering for scran normalization in clusters
# Everything here runs on the temporary copy adata_pp, not on adata.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Cluster labels are written to adata_pp.obs['groups']
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 12 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:01)


In [108]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [109]:
%%R -i data_mat -i input_groups -o size_factors
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [110]:
#Delete adata_pp
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [111]:
adata.strings_to_categoricals()

In [112]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [113]:
adata.raw = adata

In [114]:
#Normalize data
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [115]:
# extract highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:01)


In [116]:
# Calculate the visualizations
# PCA restricted to the highly variable genes, then the neighbour graph and
# the UMAP embedding (stored as 'X_umap', see the log below).
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:11)


In [118]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [119]:
adata.obs['celltype'].cat.categories

Index(['B cell', 'CD4-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta T cell', 'enterocyte',
       'enterocyte of epithelium of large intestine', 'fibroblast',
       'goblet cell', 'gut endothelial cell', 'intestinal crypt stem cell',
       'intestinal crypt stem cell of large intestine',
       'intestinal enteroendocrine cell', 'intestinal tuft cell',
       'large intestine goblet cell', 'mast cell', 'monocyte', 'neutrophil',
       'paneth cell of colon', 'plasma cell',
       'transit amplifying cell of colon'],
      dtype='object')

In [122]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['B cell', 'CD4-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta T cell', 'enterocyte',
       'enterocyte of epithelium of large intestine', 'fibroblast',
       'goblet cell', 'gut endothelial cell', 'intestinal crypt stem cell',
       'intestinal crypt stem cell of large intestine',
       'intestinal enteroendocrine cell', 'intestinal tuft cell',
       'large intestine goblet cell', 'mast cell', 'monocyte', 'neutrophil',
       'paneth cell of colon', 'plasma cell',
       'transit amplifying cell of colon'])

In [123]:
# Fold the finer labels into the label that is kept (applied in this order).
celltype_merges = {
    'CD4-positive, alpha-beta T cell': ['CD8-positive, alpha-beta T cell'],
    'enterocyte of epithelium of large intestine': ['enterocyte', 'intestinal tuft cell'],
    'intestinal crypt stem cell': ['intestinal crypt stem cell of large intestine',
                                   'transit amplifying cell of colon'],
    'goblet cell': ['intestinal enteroendocrine cell', 'large intestine goblet cell',
                    'paneth cell of colon'],
}
for merged_label, source_labels in celltype_merges.items():
    # boolean mask of the cells that currently carry one of the source labels
    ix = np.isin(ref_cluster, source_labels)
    ref_cluster[ix] = merged_label

In [124]:
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['B cell', 'CD4-positive, alpha-beta T cell',
        'enterocyte of epithelium of large intestine', 'fibroblast',
       'goblet cell', 'gut endothelial cell', 'intestinal crypt stem cell',
         'mast cell', 'monocyte', 'neutrophil',
      'plasma cell'])

In [125]:
# Positional rename: the i-th name below replaces the i-th category of
# obs['celltype'], so this list has to stay aligned with the category list
# used when obs['celltype'] was (re)built.
adata.rename_categories('celltype', ['B cells', 'T cells',
        'Intestinal epithelial cells', 'Fibroblast cells',
       'Intestinal secretory cells', 'Endothelial cells', 'Intestinal stem cells',
         'Mast cells', 'Monocytes', 'Neutrophils',
      'Plasma cells'])

  res = method(*args, **kwargs)


In [130]:
# Map the tissue name to the organ label used in this notebook, by name rather
# than by category position. Unlisted categories are left untouched.
tissue_labels = {'large intestine': 'Gut_Colon'}
adata.obs['tissue'] = adata.obs['tissue'].cat.rename_categories(tissue_labels)

  res = method(*args, **kwargs)


In [131]:
# Capitalise the sex labels, mapping by name rather than by category position.
# Unlisted categories are left untouched.
sex_labels = {'female': 'Female', 'male': 'Male'}
adata.obs['sex'] = adata.obs['sex'].cat.rename_categories(sex_labels)

In [132]:
# The names passed to rename_categories are identical to the category list
# written in this cell, so the labels stay as they are when the column holds
# exactly these two categories. `ref_cluster` is not read in this cell.
adata.obs['ethnicity'].cat.categories
ref_cluster=pd.Categorical(adata.obs['ethnicity'],
                           categories=['African-American or Afro-Caribbean', 'European'])
adata.rename_categories('ethnicity', ['African-American or Afro-Caribbean', 'European'])

In [133]:
# Shorten the development stage names, mapping by name rather than by
# category position. Unlisted categories are left untouched.
stage_labels = {'59-year-old human stage': '59', '61-year-old human stage': '61'}
adata.obs['development_stage'] = adata.obs['development_stage'].cat.rename_categories(stage_labels)

In [134]:
# The names passed to rename_categories are identical to the category list
# written in this cell, so the donor labels stay as they are when the column
# holds exactly these two categories. `ref_cluster` is not read in this cell.
adata.obs['donor'].cat.categories
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['TSP2', 'TSP14'])
adata.rename_categories('donor', ['TSP2', 'TSP14'])

In [135]:
# Harmonised per-cell metadata columns for this dataset, written in the order
# listed. Plain strings are broadcast to every cell; the string 'NaN' marks
# fields for which no value is set here.
# NOTE(review): 'Dataset_status' is 'Healthy_Dataset' here while another
# dataset section of this notebook uses 'HealthyProject' -- confirm which
# spelling downstream code expects.
obs_annotations = {
    'Organ': 'Gut_Colon',
    'Organ_Specific': adata.obs['tissue'],
    'Dataset': 'Pisco_Gut_Colon',
    'InternDatasetNumber': '04-6-Gut_Colon-Pisco-2022',
    'Dataset_status': 'Healthy_Dataset',

    'celltype': adata.obs['celltype'],
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',

    'Patient': adata.obs['donor'],
    'Patient_Number': 'NaN',
    'age': adata.obs['development_stage'],
    'sex': adata.obs['sex'],
    'ethnicity': adata.obs['ethnicity'],
    'health_status': 'NaN',

    'original_celltype_1': adata.obs['cell_type'],
    'original_celltype_2': 'NaN',
    'original_celltype_3': 'NaN',
}
for column, value in obs_annotations.items():
    adata.obs[column] = value

In [137]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [138]:
adata.obs_names_make_unique(join='_')

In [141]:
# NOTE(review): the neighbouring dataset sections write '<name>-processed.h5ad';
# this file name has no '-processed' suffix -- confirm that downstream readers
# expect exactly this name. `writepath` is defined outside this section.
adata.write(writepath + '04-6-Gut_Colon-Pisco-2022.h5ad')

## 04-9-Oesophagus-Han-2020

In [202]:
# Keep only the cells whose sub_tissue is 'AdultEsophagus' and work on a copy.
# NOTE(review): adata_han is not created in this section; it has to exist in
# the kernel already when this cell runs.
ix=np.isin(adata_han.obs['sub_tissue'],['AdultEsophagus']) 
adata=adata_han[ix].copy()

In [203]:
adata.obs['InternDatasetNumber'] ='04-9-Oesophagus-Han-2020'

In [207]:
#calculate QC covariates
# Row sums of a sparse matrix come back as an (n_cells, 1) np.matrix; flatten
# them to 1-D arrays so the obs columns hold plain numeric vectors. This also
# works unchanged when adata.X is dense.
adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
# number of genes with a non-zero entry per cell
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()

In [210]:
# FILTER PARAMETERS
#Filter out cells
# Upper bounds only: drop cells above 3100 counts, then cells above 1700 genes
sc.pp.filter_cells(adata, max_counts = 3100)
sc.pp.filter_cells(adata, max_genes = 1700)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20) # genes detected in fewer than 20 cells are removed

filtered out 51 cells that have more than 3100 counts
filtered out 13547 genes that are detected in less than 20 cells


In [211]:
# get mt genes (identified by the 'MT-' prefix of the gene name)
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense (the in-place normalization further down works on a dense
# matrix); skip the conversion if X is already dense
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.toarray()

# mt score: fraction of each cell's counts that comes from mt genes
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with 20% mito fraction or more
# .copy() turns the filtered view into a real object, so later writes to
# .obs / .X do not act on a view of the unfiltered data
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [212]:
adata_pp=adata.copy()

In [213]:
#Perform a clustering for scran normalization in clusters
# Everything here runs on the temporary copy adata_pp, not on adata.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Cluster labels are written to adata_pp.obs['groups']
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 10 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:01)


In [214]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [215]:
%%R -i data_mat -i input_groups -o size_factors
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [216]:
#Delete adata_pp
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [217]:
adata.strings_to_categoricals()

In [218]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [219]:
adata.raw = adata

In [220]:
#Normalize data
# Divide each cell (row of X) by its size factor; `[:, None]` reshapes the factors
# into a column so the in-place division broadcasts across all genes.
adata.X /= adata.obs['size_factors'].values[:, None]
# log1p-transform the scaled values in place.
sc.pp.log1p(adata)

In [221]:
# extract highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [222]:
# Calculate the visualizations
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:08)


In [224]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['celltype_specific'].copy()
adata.obs['celltype'].cat.categories

Index(['Antigen-presenting cell', 'B cell', 'B cell (Plasmocyte)_IGHA high',
       'B cell_JCHAIN high', 'Endothelial cell', 'Endothelial cell_ACKR1 high',
       'Endothelial cell_CCL21 high', 'Endothelial cell_IGFBP3 high',
       'Epithelial cell_KRT4 high', 'Epithelial cell_KRT7 high',
       'Epithelial cell_KRT13 high', 'Epithelial cell_KRT14 high',
       'Epithelial cell_KRT16 high', 'Epithelial cell_KRT17 high',
       'Epithelial cell_MMP7 high', 'Fibroblast', 'Goblet cell',
       'Kerationcyte', 'MT-gene high cell', 'Macrophage_RGS1 high',
       'Macrophage_RNASE1 high', 'Macrophage_TPSB2 high', 'Mast cell',
       'Mucosal aquamous Epithelial cell', 'Neutrophil _S100A8 high',
       'Neutrophil_IL1B high', 'Smooth muscle cell', 'Stromal cell',
       'Stromal cell_PLA2G2A high', 'Stromal cell_PTGDS high'],
      dtype='object')

In [225]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Antigen-presenting cell', 'B cell', 'B cell (Plasmocyte)_IGHA high',
       'B cell_JCHAIN high', 'Endothelial cell', 'Endothelial cell_ACKR1 high',
       'Endothelial cell_CCL21 high', 'Endothelial cell_IGFBP3 high',
       'Epithelial cell_KRT4 high', 'Epithelial cell_KRT7 high',
       'Epithelial cell_KRT13 high', 'Epithelial cell_KRT14 high',
       'Epithelial cell_KRT16 high', 'Epithelial cell_KRT17 high',
       'Epithelial cell_MMP7 high', 'Fibroblast', 'Goblet cell',
       'Kerationcyte', 'MT-gene high cell', 'Macrophage_RGS1 high',
       'Macrophage_RNASE1 high', 'Macrophage_TPSB2 high', 'Mast cell',
       'Mucosal aquamous Epithelial cell', 'Neutrophil _S100A8 high',
       'Neutrophil_IL1B high', 'Smooth muscle cell', 'Stromal cell',
       'Stromal cell_PLA2G2A high', 'Stromal cell_PTGDS high'])

In [226]:
# Collapse fine-grained annotations into one representative label per cell type.
# Each entry reads: target label -> labels that are folded into it. The entries are
# applied in the order written.
merge_table = {
    'B cell': ['B cell_JCHAIN high'],
    'Endothelial cell': ['Endothelial cell_ACKR1 high', 'Endothelial cell_CCL21 high',
                         'Endothelial cell_IGFBP3 high'],
    'Macrophage_RGS1 high': ['Macrophage_RNASE1 high', 'Macrophage_TPSB2 high'],
    'Neutrophil _S100A8 high': ['Neutrophil_IL1B high'],
    'Stromal cell': ['Stromal cell_PLA2G2A high', 'Stromal cell_PTGDS high'],
    'Epithelial cell_KRT4 high': ['Epithelial cell_KRT7 high',
                                  'Epithelial cell_KRT13 high', 'Epithelial cell_KRT14 high',
                                  'Epithelial cell_KRT16 high', 'Epithelial cell_KRT17 high',
                                  'Epithelial cell_MMP7 high'],
}
for merged_label, absorbed_labels in merge_table.items():
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = merged_label

In [227]:
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Antigen-presenting cell', 'B cell', 'B cell (Plasmocyte)_IGHA high',
      'Endothelial cell',
         'Epithelial cell_KRT4 high',
        'Fibroblast', 'Goblet cell',
       'Kerationcyte', 'MT-gene high cell', 'Macrophage_RGS1 high',
        'Mast cell',
       'Mucosal aquamous Epithelial cell', 'Neutrophil _S100A8 high',
       'Smooth muscle cell', 'Stromal cell'])

In [228]:
adata.rename_categories('celltype', ['Unknown', 'B cells', 'Plasma cells',
      'Endothelial cells',
         'Oesophageal epithelial cells',
        'Fibroblast cells', 'Intestinal secretory cells',
       'Keratinocytes', 'Oesophageal MT high cells', 'Macrophages',
        'Mast cells',
       'Mucosal squamous cells', 'Neutrophils',
       'Smooth muscle cells', 'Mesenchymal stromal cells'])

In [232]:
# Harmonise the tissue label. The mapping is keyed on the original category name, so
# the result does not depend on the order in which the categories are stored.
sub_tissue_labels = {'AdultEsophagus': 'Oesophagus'}
sub_tissue_old = list(adata.obs['sub_tissue'].cat.categories)
unmapped = [cat for cat in sub_tissue_old if cat not in sub_tissue_labels]
if unmapped:
    # Fail loudly instead of silently keeping (or mislabelling) an unknown category.
    raise ValueError("Unexpected 'sub_tissue' categories: {}".format(unmapped))
adata.rename_categories('sub_tissue', [sub_tissue_labels[cat] for cat in sub_tissue_old])

In [233]:
# Harmonise the sex label; keyed on the original category name rather than on its
# position in the category list.
sex_labels = {'male': 'Male'}
sex_old = list(adata.obs['sex'].cat.categories)
unmapped = [cat for cat in sex_old if cat not in sex_labels]
if unmapped:
    # Fail loudly instead of silently keeping (or mislabelling) an unknown category.
    raise ValueError("Unexpected 'sex' categories: {}".format(unmapped))
adata.rename_categories('sex', [sex_labels[cat] for cat in sex_old])

In [234]:
# Strip the 'Y' suffix from the age labels; keyed on the original category name so a
# different category order cannot swap the ages.
age_labels = {'45Y': '45', '56Y': '56'}
age_old = list(adata.obs['age'].cat.categories)
unmapped = [cat for cat in age_old if cat not in age_labels]
if unmapped:
    # Fail loudly instead of silently keeping (or mislabelling) an unknown category.
    raise ValueError("Unexpected 'age' categories: {}".format(unmapped))
adata.rename_categories('age', [age_labels[cat] for cat in age_old])

In [235]:
# Prefix the donor ids with the study name; keyed on the original category name so a
# different category order cannot swap the donors.
donor_labels = {'Donor31': 'Han-Donor31', 'Donor32': 'Han-Donor32'}
donor_old = list(adata.obs['donor'].cat.categories)
unmapped = [cat for cat in donor_old if cat not in donor_labels]
if unmapped:
    # Fail loudly instead of silently keeping (or mislabelling) an unknown category.
    raise ValueError("Unexpected 'donor' categories: {}".format(unmapped))
adata.rename_categories('donor', [donor_labels[cat] for cat in donor_old])

In [236]:
# Fill the per-cell metadata columns for this dataset.
# Fields without information are filled with the literal string 'NaN' (not a float NaN).

# --- dataset / tissue identifiers ---
adata.obs['Organ'] = 'Oesophagus'
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
adata.obs['Dataset'] = 'Han_Oesophagus'
adata.obs['InternDatasetNumber'] ='04-9-Oesophagus-Han-2020'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# --- cell-type annotation ---
# Self-assignment: leaves the harmonised 'celltype' column as it is.
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# --- donor information ---
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
# Self-assignments: 'age' and 'sex' are kept unchanged.
adata.obs['age'] = adata.obs['age']
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

# --- copies of the annotation columns 'celltype_specific' and 'celltype_global' ---
adata.obs['original_celltype_1'] = adata.obs['celltype_specific']
adata.obs['original_celltype_2'] = adata.obs['celltype_global']
adata.obs['original_celltype_3'] = 'NaN'

In [238]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [239]:
adata.write(writepath + '04-9-Oesophagus-Han-2020-processed.h5ad')

# 05-Liver

## 05-1-Liver-MacParland-2018

In [204]:
# here we use sfaira to import available datasets with annotations
# note that the following steps may change depending on the current sfaira version and the path to your repository

datadir = '/path/to/repo/'

ds = sfaira.data.human.DatasetGroupLiver(path=datadir)  # This links all data sets available

In [205]:
ds.ids 

['human_liver_2018_10x_macparland_001_10.1038/s41467-018-06318-7',
 'human_liver_2019_10x_popescu_001_10.1038/s41586-019-1652-y',
 'human_liver_2019_10x_ramachandran_001_10.1038/s41586-019-1631-3',
 'human_liver_2019_mCELSeq2_aizarani_001_10.1038/s41586-019-1373-2',
 'human_liver_2020_microwell_han_001_10.1038/s41586-020-2157-4',
 'human_liver_2020_microwell_han_002_10.1038/s41586-020-2157-4',
 'human_liver_2020_microwell_han_003_10.1038/s41586-020-2157-4',
 'human_liver_2020_microwell_han_004_10.1038/s41586-020-2157-4',
 'human_liver_2020_microwell_han_005_10.1038/s41586-020-2157-4']

In [206]:
# pick first one
idx = ds.ids[0]

In [212]:
idx

'human_liver_2018_10x_macparland_001_10.1038/s41467-018-06318-7'

In [207]:
ds.datasets[idx].load()



In [209]:
adata=ds.datasets[idx].adata

In [216]:
adata.var.index=np.array(adata.var.names)

In [185]:
adata.obs['InternDatasetNumber'] ='05-1-Liver-MacParland-2018'

In [190]:
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [191]:
adata.var_names[[gene.startswith('MT-') for gene in adata.var_names]]

Index(['MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3',
       'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB'],
      dtype='object')

In [192]:
# Boolean mask over genes: True for genes whose name starts with 'MT-'.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
# Per-cell fraction of counts coming from the masked genes. The row sums are flattened
# to a plain 1-D array first: on a sparse matrix `.sum(1)` returns an (n_cells, 1)
# np.matrix, which does not divide cleanly against the 1-D obs column.
mt_counts = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
adata.obs['mt_frac'] = mt_counts / adata.obs['total_counts'].values

In [193]:
# FILTER PARAMETERS
print('Total number of cells: {:d}'.format(adata.n_obs))

# Drop cells with more than 25000 total counts
sc.pp.filter_cells(adata, max_counts = 25000)
print('Number of cells after max count filter: {:d}'.format(adata.n_obs))

# MT filter: keep cells with less than 20% mitochondrial counts.
# `.copy()` turns the subset into a real AnnData object: the filter_cells call below
# writes to `.obs`, which on a view triggers an implicit copy and a warning.
adata = adata[adata.obs['mt_frac'] < 0.2].copy()
print('Number of cells after MT filter: {:d}'.format(adata.n_obs))

# Drop cells (not genes) in which more than 4000 genes are detected
sc.pp.filter_cells(adata, max_genes = 4000)
print('Number of cells after gene filter: {:d}'.format(adata.n_obs))

Total number of cells: 8444
filtered out 94 cells that have more than 25000 counts
Number of cells after max count filter: 8350
Number of cells after MT filter: 6289
filtered out 18 cells that have more than 4000 genes expressed


  adata.obs['n_genes'] = number


In [194]:
#Filter genes:
print('Total number of genes: {:d}'.format(adata.n_vars))

# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20)
print('Number of genes after cell filter: {:d}'.format(adata.n_vars))

Total number of genes: 20007
filtered out 5140 genes that are detected in less than 20 cells
Number of genes after cell filter: 14867


In [196]:
adata_pp=adata.copy()

In [197]:
#Perform a clustering for scran normalization in clusters
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 15 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [198]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [199]:
%%R -i data_mat -i input_groups -o size_factors
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [200]:
#Delete adata_pp
del adata_pp
adata.obs['size_factors'] = size_factors

In [201]:
adata.strings_to_categoricals()

In [202]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [203]:
adata.raw = adata

In [204]:
#Normalize data
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [205]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [206]:
# extract highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [207]:
# Calculate the visualizations
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:12)


In [209]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['cell_ontology_class'].copy()

In [210]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Alpha beta T cells', 'Central venous LSECs', 'Cholangiocytes',
       'Endothelial cell', 'Erythroid cells', 'Gamma delta T cells 1',
       'Gamma delta T cells 2', 'Hepatic stellate cells', 'Hepatocyte 1',
       'Hepatocyte 2', 'Hepatocyte 3', 'Hepatocyte 4', 'Hepatocyte 5',
       'Hepatocyte 6', 'Inflammatory macrophages', 'Mature B cells', 'NK cell',
       'Non inflammatory macrophages', 'Periportal LSECs', 'Plasma cells'])

In [211]:
# Fold sub-populations into one representative label each.
# (labels to absorb, label that absorbs them), applied top to bottom.
merge_steps = [
    (['Gamma delta T cells 1', 'Gamma delta T cells 2'], 'Alpha beta T cells'),
    (['Central venous LSECs', 'Periportal LSECs'], 'Endothelial cell'),
    (['Hepatocyte 2', 'Hepatocyte 3', 'Hepatocyte 4', 'Hepatocyte 5', 'Hepatocyte 6'],
     'Hepatocyte 1'),
    (['Non inflammatory macrophages'], 'Inflammatory macrophages'),
]
for absorbed_labels, merged_label in merge_steps:
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = merged_label

In [212]:
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Alpha beta T cells', 'Cholangiocytes',
      'Endothelial cell', 'Erythroid cells', 'Hepatic stellate cells', 'Hepatocyte 1',
      'Inflammatory macrophages', 'Mature B cells', 'NK cell', 'Plasma cells'])

In [213]:
adata.rename_categories('celltype', ['T cells', 'Cholangiocytes',
       'Endothelial cells', 'Erythroid cells', 'Hepatic stellate cells', 'Hepatocytes',
       'Macrophages', 'B cells', 'NK cells', 'Plasma cells'])

In [216]:
adata.obs['Organ'] = 'Liver'
adata.obs['Organ_Specific'] = 'Liver'
adata.obs['Dataset'] = 'MacParland_Liver'
adata.obs['InternDatasetNumber'] ='05-1-Liver-MacParland-2018'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

adata.obs['Patient'] = 'MacParland_Liver-Donor1'
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = 'NaN'
adata.obs['sex'] = 'NaN'
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

adata.obs['original_celltype_1'] = adata.obs['cell_ontology_class']
adata.obs['original_celltype_2'] = 'NaN'
adata.obs['original_celltype_3'] = 'NaN'

In [218]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [219]:
adata.obs_names_make_unique()

In [220]:
adata.write(writepath + '05-1-Liver-MacParland-2018-processed.h5ad')

##  05-2-Liver-Ramachandran-2019

In [204]:
# here we use sfaira to import available datasets with annotations
# note that the following steps may change depending on the current sfaira version and the path to your repository

datadir = '/path/to/repo/'

ds = sfaira.data.human.DatasetGroupLiver(path=datadir)  # This links all data sets available

In [224]:
# pick
idx = ds.ids[2]

In [225]:
idx

'human_liver_2019_10x_ramachandran_001_10.1038/s41586-019-1631-3'

In [226]:
ds.datasets[idx].load()

In [228]:
adata=ds.datasets[idx].adata

In [233]:
ix=np.isin(adata.obs['aetiology'],['Uninjured']) 
adata=adata[ix].copy()

In [242]:
adata.var.index=np.array(adata.var.names)

In [275]:
adata.obs['InternDatasetNumber'] ='05-2-Liver-Ramachandran-2019'

In [278]:
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [280]:
# Expose the scanpy QC metrics under the column names used in the rest of the notebook.
adata.obs['n_counts'] = adata.obs['total_counts']
# log_counts describes the per-cell count depth, so it is taken from
# 'log1p_total_counts'. The previously used 'log1p_n_genes_by_counts' is the log of the
# number of detected genes, not of the counts.
adata.obs['log_counts'] = adata.obs['log1p_total_counts']
adata.obs['n_genes'] = adata.obs['n_genes_by_counts']

In [284]:
# FILTER PARAMETERS
#Filter out cells
#sc.pp.filter_cells(adata, max_counts = 23000)
#sc.pp.filter_cells(adata, max_genes = 6700)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20)

filtered out 4933 genes that are detected in less than 20 cells


In [286]:
# get mt genes: boolean mask over genes whose name starts with 'MT-'
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense (only if X is still sparse, so re-running the cell is harmless)
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.toarray()

# manually define mt score: per-cell MT counts divided by the per-cell total counts.
# np.asarray(...).ravel() gives a 1-D vector whether the row sums come back as a
# 1-D array or as an (n_cells, 1) matrix.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 20% mito fraction
# .copy() materialises the subset, so later writes to .obs do not hit a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [287]:
adata_pp=adata.copy()

In [288]:
#Perform a clustering for scran normalization in clusters
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:02): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:08)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:03)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 16 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:03)


In [289]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [290]:
%%R -i data_mat -i input_groups -o size_factors
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [291]:
#Delete adata_pp
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [292]:
adata.strings_to_categoricals()

In [293]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [294]:
#Normalize data
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [295]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [296]:
# extract highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [297]:
# Calculate the visualizations
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:07)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:06)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:26)


In [299]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['annotation_indepth'].copy()

In [300]:
adata.obs['celltype'].cat.categories

Index(['MPs (1)', 'MPs (2)', 'MPs (3)', 'MPs (4)', 'MPs (5)', 'MPs (6)',
       'MPs (7)', 'MPs (8)', 'MPs (9)', 'Cycling MPs (1)', 'Cycling MPs (2)',
       'Cycling MPs (3)', 'Cycling MPs (4)', 'pDCs', 'ILCs (1)', 'ILCs (2)',
       'ILCs (3)', 'Cycling ILCs (1)', 'Cycling ILCs (2)', 'Tcells (1)',
       'Tcells (2)', 'Tcells (3)', 'Tcells (4)', 'Tcells (5)',
       'Cycling Tcells', 'Bcells (1)', 'Bcells (2)', 'Plasma Bcells (1)',
       'Plasma Bcells (2)', 'Mast cells', 'Endothelia (1)', 'Endothelia (2)',
       'Endothelia (3)', 'Endothelia (4)', 'Endothelia (5)', 'Endothelia (6)',
       'Endothelia (7)', 'Mesenchyme (1)', 'Mesenchyme (2)', 'Myofibroblasts',
       'Mesothelia', 'Hepatocytes', 'Cholangiocytes (1)', 'Cholangiocytes (2)',
       'Cholangiocytes (3)'],
      dtype='object')

In [301]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['MPs (1)', 'MPs (2)', 'MPs (3)', 'MPs (4)', 'MPs (5)', 'MPs (6)',
       'MPs (7)', 'MPs (8)', 'MPs (9)', 'Cycling MPs (1)', 'Cycling MPs (2)',
       'Cycling MPs (3)', 'Cycling MPs (4)', 'pDCs', 'ILCs (1)', 'ILCs (2)',
       'ILCs (3)', 'Cycling ILCs (1)', 'Cycling ILCs (2)', 'Tcells (1)',
       'Tcells (2)', 'Tcells (3)', 'Tcells (4)', 'Tcells (5)',
       'Cycling Tcells', 'Bcells (1)', 'Bcells (2)', 'Plasma Bcells (1)',
       'Plasma Bcells (2)', 'Mast cells', 'Endothelia (1)', 'Endothelia (2)',
       'Endothelia (3)', 'Endothelia (4)', 'Endothelia (5)', 'Endothelia (6)',
       'Endothelia (7)', 'Mesenchyme (1)', 'Mesenchyme (2)', 'Myofibroblasts',
       'Mesothelia', 'Hepatocytes', 'Cholangiocytes (1)', 'Cholangiocytes (2)',
       'Cholangiocytes (3)'])

In [302]:
# Fold the numbered / cycling sub-clusters into the first cluster of each lineage.
# Each entry reads: target label -> labels folded into it; applied in the order written.
merge_table = {
    'MPs (1)': ['MPs (2)', 'MPs (3)', 'MPs (4)', 'MPs (5)', 'MPs (6)',
                'MPs (7)', 'MPs (8)', 'MPs (9)', 'Cycling MPs (1)', 'Cycling MPs (2)',
                'Cycling MPs (3)', 'Cycling MPs (4)'],
    'ILCs (1)': ['ILCs (2)', 'ILCs (3)', 'Cycling ILCs (1)', 'Cycling ILCs (2)'],
    'Tcells (1)': ['Tcells (2)', 'Tcells (3)', 'Tcells (4)', 'Tcells (5)', 'Cycling Tcells'],
    'Bcells (1)': ['Bcells (2)'],
    'Plasma Bcells (1)': ['Plasma Bcells (2)'],
    'Endothelia (1)': ['Endothelia (2)', 'Endothelia (3)', 'Endothelia (4)', 'Endothelia (5)',
                       'Endothelia (6)', 'Endothelia (7)'],
    'Mesenchyme (1)': ['Mesenchyme (2)'],
    'Cholangiocytes (1)': ['Cholangiocytes (2)', 'Cholangiocytes (3)'],
}
for merged_label, absorbed_labels in merge_table.items():
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = merged_label



In [303]:
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['MPs (1)', 'pDCs', 'ILCs (1)','Tcells (1)', 'Bcells (1)', 'Plasma Bcells (1)',
                                                       'Mast cells', 'Endothelia (1)', 'Mesenchyme (1)',
                                                       'Myofibroblasts','Mesothelia', 'Hepatocytes', 'Cholangiocytes (1)'])

In [304]:
adata.obs['celltype'].cat.categories

Index(['MPs (1)', 'pDCs', 'ILCs (1)', 'Tcells (1)', 'Bcells (1)',
       'Plasma Bcells (1)', 'Mast cells', 'Endothelia (1)', 'Mesenchyme (1)',
       'Myofibroblasts', 'Mesothelia', 'Hepatocytes', 'Cholangiocytes (1)'],
      dtype='object')

In [305]:
# Give the merged clusters their final names (positional: the order must match the
# category order set in the previous cell).
# 'Mesenchymal stromal cells' is written without a trailing space so the label is
# identical to the one used for the other datasets.
adata.rename_categories('celltype', ['Macrophages', 'Dendritic cells', 'Innate lymphoid cells', 'T cells', 'B cells', 'Plasma cells',
        'Mast cells','Endothelial cells', 'Mesenchymal stromal cells',
        'Myofibroblast cells', 'Mesothelial cells', 'Hepatocytes', 'Cholangiocytes'])

  res = method(*args, **kwargs)


In [310]:
# The donor id is taken from the 'liver' column and prefixed with the study name.
adata.obs['donor'] = adata.obs['liver']
# Keyed on the original category name so a different category order cannot swap donors.
donor_labels = {
    'Healthy1': 'Ramachandran_Liver-Donor1',
    'Healthy2': 'Ramachandran_Liver-Donor2',
    'Healthy3': 'Ramachandran_Liver-Donor3',
    'Healthy4': 'Ramachandran_Liver-Donor4',
    'Healthy5': 'Ramachandran_Liver-Donor5',
}
donor_old = list(adata.obs['donor'].cat.categories)
unmapped = [cat for cat in donor_old if cat not in donor_labels]
if unmapped:
    # Fail loudly instead of silently keeping (or mislabelling) an unknown category.
    raise ValueError("Unexpected 'donor' categories: {}".format(unmapped))
adata.rename_categories('donor', [donor_labels[cat] for cat in donor_old])

  res = method(*args, **kwargs)


In [311]:
adata.obs['Organ'] = 'Liver'
adata.obs['Organ_Specific'] = 'Liver'
adata.obs['Dataset'] = 'Ramachandran_Liver'
adata.obs['InternDatasetNumber'] ='05-2-Liver-Ramachandran-2019'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = 'NaN'
adata.obs['sex'] = 'NaN'
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

adata.obs['original_celltype_1'] = adata.obs['annotation_indepth']
adata.obs['original_celltype_2'] = adata.obs['annotation_lineage']
adata.obs['original_celltype_3'] = adata.obs['cell_ontology_class']

In [313]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [6]:
adata.obs['celltype'].cat.categories

Index(['Macrophages', 'Dendritic cells', 'Innate lymphoid cells', 'T cells',
       'B cells', 'Plasma cells', 'Mast cells', 'Endothelial cells',
       'Mesenchymal stromal cells ', 'Myofibroblast cells',
       'Mesothelial cells', 'Hepatocytes', 'Cholangiocytes'],
      dtype='object')

In [7]:
adata.rename_categories('celltype',['Macrophages', 'Dendritic cells', 'Innate lymphoid cells', 'T cells',
       'B cells', 'Plasma cells', 'Mast cells', 'Endothelial cells',
       'Mesenchymal stromal cells', 'Myofibroblast cells',
       'Mesothelial cells', 'Hepatocytes', 'Cholangiocytes'])

  res = method(*args, **kwargs)


In [8]:
adata.write(writepath + '05-2-Liver-Ramachandran-2019-processed.h5ad')

## 05-3-Liver-Andrews-2021

In [315]:
target_collections = ["44531dd9-1388-4416-a117-af0a99de2294"]
cache_path = os.path.join(".", "data")
dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)
dsg.subset(key="collection_id", values=target_collections)
dsg.datasets
dsg.download()

In [424]:
path = '/path/to/repo/44531dd9-1388-4416-a117-af0a99de2294/'
files = [f for f in listdir(path) if isfile(join(path, f))]

In [425]:
# Read every downloaded file and remember, per cell, which file it came from.
# listdir() returns files in arbitrary order; sorting makes the cell order reproducible.
adatas = []
for fname in sorted(files):
    u = sc.read_h5ad(join(path, fname))
    u.obs['id'] = fname
    adatas.append(u)

if not adatas:
    # Without this guard an empty directory would silently leave a stale `adata`
    # from an earlier section in place.
    raise FileNotFoundError('No files found in {}'.format(path))

# Merge everything in ONE outer-join concatenation. Concatenating pairwise inside the
# loop re-copies the growing object on every iteration, overwrites the 'batch' column
# each time and stacks one more '-0' suffix onto the earlier cell names per round.
# With a single call each file gets its own batch index and one suffix.
if len(adatas) == 1:
    adata = adatas[0]
else:
    adata = adatas[0].concatenate(*adatas[1:], join='outer')

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [427]:
adata.obs['InternDatasetNumber'] ='05-3-Liver-2021-Andrews'

In [432]:
adata.var.index = adata.var['feature_name']

In [434]:
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [435]:
# Expose the scanpy QC metrics under the column names used in the rest of the notebook.
adata.obs['n_counts'] = adata.obs['total_counts']
# log_counts describes the per-cell count depth, so it is taken from
# 'log1p_total_counts'. The previously used 'log1p_n_genes_by_counts' is the log of the
# number of detected genes, not of the counts.
adata.obs['log_counts'] = adata.obs['log1p_total_counts']
adata.obs['n_genes'] = adata.obs['n_genes_by_counts']

In [439]:
# FILTER PARAMETERS
#Filter out cells
sc.pp.filter_cells(adata, max_counts = 4200)
sc.pp.filter_cells(adata, max_genes = 9000)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20)

filtered out 308 cells that have more than 4200 counts




filtered out 23025 genes that are detected in less than 20 cells


In [441]:
# get mt genes: boolean mask over genes whose name starts with 'MT-'
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense (only if X is still sparse, so re-running the cell is harmless)
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.toarray()

# manually define mt score: per-cell MT counts divided by the per-cell total counts.
# np.asarray(...).ravel() gives a 1-D vector whether the row sums come back as a
# 1-D array or as an (n_cells, 1) matrix.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 20% mito fraction
# .copy() materialises the subset, so later writes to .obs do not hit a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [442]:
adata_pp=adata.copy()

In [443]:
#Perform a clustering for scran normalization in clusters
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:06): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:18)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:21)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 28 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:25)


In [444]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [445]:
%%R -i data_mat -i input_groups -o size_factors
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [446]:
#Delete adata_pp
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [447]:
adata.strings_to_categoricals()

In [448]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [449]:
adata.raw = adata

In [450]:
#Normalize data
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [451]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [452]:
# extract highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [453]:
# Calculate the visualizations
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:54)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:38)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:02:21)


In [455]:
adata.obs['celltype'] = adata.obs['cell_type']

In [456]:
adata.obs['celltype'].cat.categories

Index(['B cell', 'Kupffer cell', 'alpha-beta T cell',
       'blood vessel endothelial cell', 'centrilobular region hepatocyte',
       'cholangiocyte', 'endothelial cell of pericentral hepatic sinusoid',
       'endothelial cell of periportal hepatic sinusoid',
       'erythroid lineage cell', 'fibroblast', 'gamma-delta T cell',
       'hepatic stellate cell', 'hepatocyte', 'inflammatory macrophage',
       'mature B cell', 'midzonal region hepatocyte', 'natural killer cell',
       'periportal region hepatocyte', 'plasma cell', 'progenitor cell',
       'vascular associated smooth muscle cell'],
      dtype='object')

In [457]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['B cell', 'Kupffer cell', 'alpha-beta T cell',
       'blood vessel endothelial cell', 'centrilobular region hepatocyte',
       'cholangiocyte', 'endothelial cell of pericentral hepatic sinusoid',
       'endothelial cell of periportal hepatic sinusoid',
       'erythroid lineage cell', 'fibroblast', 'gamma-delta T cell',
       'hepatic stellate cell', 'hepatocyte', 'inflammatory macrophage',
       'mature B cell', 'midzonal region hepatocyte', 'natural killer cell',
       'periportal region hepatocyte', 'plasma cell', 'progenitor cell',
       'vascular associated smooth muscle cell'])

In [458]:
# Fold the fine-grained annotations into one label per cell type.
# (labels to absorb, label that absorbs them), applied top to bottom.
merge_steps = (
    (['gamma-delta T cell'], 'alpha-beta T cell'),
    (['endothelial cell of pericentral hepatic sinusoid',
      'endothelial cell of periportal hepatic sinusoid'], 'blood vessel endothelial cell'),
    (['centrilobular region hepatocyte', 'midzonal region hepatocyte',
      'periportal region hepatocyte'], 'hepatocyte'),
    (['mature B cell'], 'B cell'),
)
for absorbed_labels, merged_label in merge_steps:
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = merged_label

In [459]:
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['B cell', 'Kupffer cell', 'alpha-beta T cell',
       'blood vessel endothelial cell',
       'cholangiocyte',
       'erythroid lineage cell', 'fibroblast', 
       'hepatic stellate cell', 'hepatocyte', 'inflammatory macrophage',
        'natural killer cell',
        'plasma cell', 'progenitor cell',
       'vascular associated smooth muscle cell'])

In [460]:
adata.rename_categories('celltype', ['B cells', 'Kupffer cells', 'T cells',
       'Endothelial cells',
       'Cholangiocytes',
       'Erythroid cells', 'Fibroblast cells', 
       'Hepatic stellate cells', 'Hepatocytes', 'Macrophages',
        'NK cells',
        'Plasma cells', 'Hepatic progenitor cells',
       'Smooth muscle cells'])

  res = method(*args, **kwargs)


In [463]:
# Replace the PATO ontology ids with the labels used in this notebook. Keyed on the id
# itself, so the labels cannot be swapped by a different category order.
sex_labels = {'PATO:0000383': 'Female', 'PATO:0000384': 'Male'}
sex_old = list(adata.obs['sex_ontology_term_id'].cat.categories)
unmapped = [cat for cat in sex_old if cat not in sex_labels]
if unmapped:
    # Fail loudly instead of silently keeping (or mislabelling) an unknown category.
    raise ValueError("Unexpected 'sex_ontology_term_id' categories: {}".format(unmapped))
adata.rename_categories('sex_ontology_term_id', [sex_labels[cat] for cat in sex_old])

In [464]:
# Harmonise the tissue label; keyed on the original category name rather than on its
# position in the category list.
tissue_labels = {'caudate lobe of liver': 'Liver_CaudateLobe'}
tissue_old = list(adata.obs['tissue'].cat.categories)
unmapped = [cat for cat in tissue_old if cat not in tissue_labels]
if unmapped:
    # Fail loudly instead of silently keeping (or mislabelling) an unknown category.
    raise ValueError("Unexpected 'tissue' categories: {}".format(unmapped))
adata.rename_categories('tissue', [tissue_labels[cat] for cat in tissue_old])

In [465]:
# Collapse the two development-stage labels into a single 'Adult' label.
# (The bare expression below only had an effect when run interactively as a cell's last line.)
adata.obs['development_stage'].cat.categories
# Re-declare the column with both stages as allowed categories so entries can be reassigned.
ref_cluster=pd.Categorical(adata.obs['development_stage'],
                           categories=['human adult stage', 'mature stage'])
# Fold 'mature stage' into 'human adult stage'.
ix=np.isin(ref_cluster,['mature stage'])
ref_cluster[ix]='human adult stage'
# Store the merged column with the single remaining category, then rename that category.
adata.obs['development_stage']=pd.Categorical(ref_cluster,
                                           categories=['human adult stage'])
adata.rename_categories('development_stage', ['Adult'])

In [466]:
# The donor id is taken from 'donor_uuid' and replaced by a readable study-prefixed label.
adata.obs['donor'] = adata.obs['donor_uuid']
# Keyed on the UUID itself: with opaque ids a positional rename would mislabel donors
# without any visible sign if the category order ever differed.
donor_labels = {
    '3c1a1b1e-8b4c-45cf-a0c1-b8208c6b1f72': 'Andrews_Liver-Donor1',
    '9bac7886-bfb5-4b90-903c-2b5834bf0408': 'Andrews_Liver-Donor2',
    '19b27937-e5aa-4f8d-bb54-da04d451308a': 'Andrews_Liver-Donor3',
    'bd0d1069-676f-4469-8241-5854a7d5e111': 'Andrews_Liver-Donor4',
}
donor_old = list(adata.obs['donor'].cat.categories)
unmapped = [cat for cat in donor_old if cat not in donor_labels]
if unmapped:
    # Fail loudly instead of silently keeping (or mislabelling) an unknown category.
    raise ValueError("Unexpected 'donor' categories: {}".format(unmapped))
adata.rename_categories('donor', [donor_labels[cat] for cat in donor_old])

In [467]:
adata.obs['Organ'] = 'Liver'
adata.obs['Organ_Specific'] = adata.obs['tissue']
adata.obs['Dataset'] = 'Andrews_Liver'
adata.obs['InternDatasetNumber'] ='05-3-Liver-2021-Andrews'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = adata.obs['sample']
adata.obs['age'] = adata.obs['development_stage']
adata.obs['sex'] = adata.obs['sex_ontology_term_id']
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = 'NaN'
adata.obs['original_celltype_3'] = 'NaN'

In [469]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [470]:
adata.obs_names_make_unique()

In [475]:
adata.write(writepath + '05-3-Liver-Andrews-2021-processed.h5ad')

##  05-4-Liver-Pisco-2022

In [149]:
ix=np.isin(adata_pisco.obs['tissue'],['liver']) 
adata=adata_pisco[ix].copy()

In [154]:
# Per-cell QC covariates: total counts, natural log of total counts, and
# number of genes with a non-zero count.
adata.obs['n_counts'] = adata.X.sum(1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1)

In [158]:
# Filter parameters.
# Drop cells with more than 12000 counts or more than 7000 detected genes.
sc.pp.filter_cells(adata, max_counts = 12000)
sc.pp.filter_cells(adata, max_genes = 7000)
# Drop genes detected in fewer than 20 cells.
sc.pp.filter_genes(adata, min_cells=20)

filtered out 17 cells that have more than 12000 counts
filtered out 2 cells that have more than 7000 genes expressed
filtered out 40476 genes that are detected in less than 20 cells


In [160]:
# Flag mitochondrial genes (gene symbols starting with 'MT-').
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (the following normalization cells
# operate on a dense matrix). Skipped if the matrix is already dense.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# Fraction of each cell's counts that come from mitochondrial genes.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values
adata.obs['mt_frac'] = mt_frac

# Keep cells with less than 20% mitochondrial counts. Copy so `adata` is a
# real object rather than a view; assigning new obs columns to a view
# triggers an implicit copy and a warning.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [161]:
# Scratch copy used only to derive clusters for scran; `adata` keeps the counts.
adata_pp=adata.copy()

In [162]:
# Coarse clustering of the scratch copy; the cluster labels are passed to
# scran as the `clusters` argument. Steps: scale each cell to 1e6 total
# counts, log1p, 15-component PCA, neighbor graph, Louvain (resolution 0.5)
# stored in obs['groups'].
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 14 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [163]:
# Inputs for the R cell: cluster label per cell and the count matrix
# transposed (so genes become rows and cells become columns).
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [164]:
%%R -i data_mat -i input_groups -o size_factors
# library() stops with a clear error when scran is not installed; require()
# would only return FALSE and let computeSumFactors() fail afterwards.
library(scran)
# Size factors computed within the supplied clusters (min.mean = 0.1).
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [165]:
# The scratch copy is no longer needed; free it and attach the size factors
# returned by the R cell to the cells.
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [166]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [167]:
# Keep a copy of the not-yet-normalized adata.X in the 'counts' layer for
# downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [168]:
# Snapshot the current (pre-normalization) state into .raw.
adata.raw = adata

In [169]:
# Normalize in place: divide every cell (row) by its size factor, then log1p.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [170]:
# Flag the top 4000 highly variable genes (cell_ranger flavor) without
# subsetting the matrix (subset=False).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [171]:
# Embedding for visualization: 50-component PCA on the highly variable genes
# (ARPACK solver), neighbor graph with default settings, then UMAP.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)


In [173]:
# Start the harmonized 'celltype' column from a copy of the dataset's own
# 'cell_type' annotation.
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [174]:
# Inspect the labels that need to be harmonized.
adata.obs['celltype'].cat.categories

Index(['T cell', 'endothelial cell', 'endothelial cell of hepatic sinusoid',
       'erythrocyte', 'fibroblast', 'hepatocyte', 'intrahepatic cholangiocyte',
       'liver dendritic cell', 'macrophage', 'mature NK T cell', 'monocyte',
       'neutrophil', 'plasma cell'],
      dtype='object')

In [175]:
# Working categorical with all original labels declared, so labels can be
# reassigned in the next cell.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['T cell', 'endothelial cell', 'endothelial cell of hepatic sinusoid',
       'erythrocyte', 'fibroblast', 'hepatocyte', 'intrahepatic cholangiocyte',
       'liver dendritic cell', 'macrophage', 'mature NK T cell', 'monocyte',
       'neutrophil', 'plasma cell'])

In [176]:
# Merge sinusoidal endothelial cells into the generic endothelial label.
ix=np.isin(ref_cluster,[ 'endothelial cell of hepatic sinusoid',])
ref_cluster[ix]='endothelial cell'

In [177]:
# Write the merged labels back, declaring only the categories that remain.
# This order fixes the positions used by the rename in the next cell.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['T cell', 'endothelial cell', 
       'erythrocyte', 'fibroblast', 'hepatocyte', 'intrahepatic cholangiocyte',
       'liver dendritic cell', 'macrophage', 'mature NK T cell', 'monocyte',
       'neutrophil', 'plasma cell'])

In [178]:
# Positional rename to the harmonized vocabulary; entry i replaces category i
# of the list declared in the previous cell.
adata.rename_categories('celltype',['T cells', 'Endothelial cells',
       'Erythroid cells', 'Fibroblast cells', 'Hepatocytes', 'Cholangiocytes',
       'Dendritic cells', 'Macrophages', 'NK cells', 'Monocytes',
       'Neutrophils', 'Plasma cells'])

In [181]:
# Inspect the donor labels.
adata.obs['donor'].cat.categories

Index(['TSP6', 'TSP14'], dtype='object')

In [182]:
# Relabel the tissue category by name. The earlier version renamed by
# position and built a `ref_cluster` that was never used; looking the label
# up per category fails loudly (KeyError) on an unexpected value.
tissue_labels = {'liver': 'Liver'}
adata.rename_categories(
    'tissue',
    [tissue_labels[name] for name in adata.obs['tissue'].cat.categories],
)

In [183]:
# Capitalize the sex label by name rather than by position (the previously
# built `ref_cluster` was never used). Unexpected values raise KeyError.
sex_labels = {'male': 'Male'}
adata.rename_categories(
    'sex',
    [sex_labels[name] for name in adata.obs['sex'].cat.categories],
)

In [184]:
# Ethnicity: the rename maps 'European' to 'European', so labels are left
# unchanged. `ref_cluster` is built here but not used by the rename.
adata.obs['ethnicity'].cat.categories  # not the last expression, so nothing is displayed
ref_cluster=pd.Categorical(adata.obs['ethnicity'],
                           categories=['European'])
adata.rename_categories('ethnicity', ['European'])

In [185]:
# Reduce the development-stage labels to the age in years. Labels are looked
# up by name instead of by position (the previously built `ref_cluster` was
# never used); an unexpected stage raises KeyError.
stage_to_age = {
    '59-year-old human stage': '59',
    '67-year-old human stage': '67',
}
adata.rename_categories(
    'development_stage',
    [stage_to_age[name] for name in adata.obs['development_stage'].cat.categories],
)

In [186]:
# Donor: the new names equal the old ones ('TSP6', 'TSP14'), so labels are
# left unchanged. `ref_cluster` is built here but not used by the rename.
adata.obs['donor'].cat.categories  # not the last expression, so nothing is displayed
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['TSP6', 'TSP14'])
adata.rename_categories('donor', ['TSP6', 'TSP14'])

In [187]:
# Fill the harmonized metadata columns shared by all processed datasets.
# The literal string 'NaN' (not np.nan) marks fields that are unavailable.
adata.obs['Organ'] = 'Liver'
adata.obs['Organ_Specific'] = adata.obs['tissue']
adata.obs['Dataset'] = 'Pisco_Liver'
adata.obs['InternDatasetNumber'] = '05-4-Liver-Pisco-2022'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

adata.obs['celltype'] = adata.obs['celltype']  # self-assignment; column is left unchanged
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['development_stage']
adata.obs['sex'] = adata.obs['sex']  # self-assignment
adata.obs['ethnicity'] = adata.obs['ethnicity']  # self-assignment
adata.obs['health_status'] = 'NaN'

# Keep the dataset's original annotation next to the harmonized one.
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = 'NaN'
adata.obs['original_celltype_3'] = 'NaN'

In [189]:
# Store the (dense, normalized) expression matrix as SciPy CSR again.
adata.X = sp.sparse.csr_matrix(adata.X)

In [190]:
# Make observation (cell) names unique.
adata.obs_names_make_unique()

In [191]:
# Write the processed object as .h5ad; `writepath` is defined outside this section.
adata.write(writepath + '05-4-Liver-Pisco-2022-processed.h5ad')

## 05-6-Liver-Han-2020

In [585]:
# Select the adult liver cells from `adata_han` (loaded outside this section)
# and work on an independent copy.
ix=np.isin(adata_han.obs['sub_tissue'],['AdultLiver']) 
adata=adata_han[ix].copy()

In [586]:
# Tag the subset with its internal dataset ID (set again in the metadata cell below).
adata.obs['InternDatasetNumber'] ='05-6-Liver-Han-2020'

In [590]:
# Per-cell QC covariates: total counts, natural log of total counts, and
# number of genes with a non-zero count.
adata.obs['n_counts'] = adata.X.sum(1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1)

In [593]:
# Filter parameters.
# Drop cells with more than 2400 counts or more than 1200 detected genes.
sc.pp.filter_cells(adata, max_counts = 2400)
sc.pp.filter_cells(adata, max_genes = 1200)
# Drop genes detected in fewer than 10 cells.
sc.pp.filter_genes(adata, min_cells=10)

filtered out 20 cells that have more than 2400 counts
filtered out 3 cells that have more than 1200 genes expressed
filtered out 13390 genes that are detected in less than 10 cells


In [594]:
# Flag mitochondrial genes (gene symbols starting with 'MT-').
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (the following normalization cells
# operate on a dense matrix). Skipped if the matrix is already dense.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# Fraction of each cell's counts that come from mitochondrial genes.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values
adata.obs['mt_frac'] = mt_frac

# Keep cells with less than 20% mitochondrial counts. Copy so `adata` is a
# real object rather than a view; assigning new obs columns to a view
# triggers an implicit copy and a warning.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [595]:
# Scratch copy used only to derive clusters for scran; `adata` keeps the counts.
adata_pp=adata.copy()

In [596]:
# Coarse clustering of the scratch copy; the cluster labels are passed to
# scran as the `clusters` argument. Steps: scale each cell to 1e6 total
# counts, log1p, 15-component PCA, neighbor graph, Louvain (resolution 0.5)
# stored in obs['groups'].
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 13 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [597]:
# Inputs for the R cell: cluster label per cell and the count matrix
# transposed (so genes become rows and cells become columns).
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [598]:
%%R -i data_mat -i input_groups -o size_factors
# library() stops with a clear error when scran is not installed; require()
# would only return FALSE and let computeSumFactors() fail afterwards.
library(scran)
# Size factors computed within the supplied clusters (min.mean = 0.1).
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [599]:
# The scratch copy is no longer needed; free it and attach the size factors
# returned by the R cell to the cells.
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [600]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [601]:
# Keep a copy of the not-yet-normalized adata.X in the 'counts' layer for
# downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [602]:
# Snapshot the current (pre-normalization) state into .raw.
adata.raw = adata

In [603]:
# Normalize in place: divide every cell (row) by its size factor, then log1p.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [604]:
# Flag the top 4000 highly variable genes (cell_ranger flavor) without
# subsetting the matrix (subset=False).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [605]:
# Embedding for visualization: 50-component PCA on the highly variable genes
# (ARPACK solver), neighbor graph with default settings, then UMAP.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)


In [607]:
# Start the harmonized 'celltype' column from a copy of the fine-grained
# 'celltype_specific' annotation, then list its labels.
adata.obs['celltype']=adata.obs['celltype_specific'].copy()
adata.obs['celltype'].cat.categories

Index(['Activated T cell', 'B cell (Plasmocyte)_IGHA/HM high',
       'B cell (Plasmocyte)_IGHA/HM_IGK high',
       'B cell (Plasmocyte)_IGHA/HM_IGL high', 'B cell (Plasmocyte)_IGHG high',
       'B cell (Plasmocyte)_IGHG_IGK high',
       'B cell (Plasmocyte)_IGHG_IGL high', 'B cell (Plasmocyte)_IGHM/HG high',
       'Conventional dendritic cell',
       'Conventional dendritic cell_FECER1A high', 'Dendritic cell',
       'Epithelial cell', 'Epithelial cell_SCGB3A1 high',
       'Epithelial cell_TM4SF4 high', 'Hepatocyte', 'Hepatocyte_FGB high',
       'Hepatocyte_GSTA1 high', 'Hepatocyte_HP high', 'Hepatocyte_TF high',
       'Kuppfer Cell', 'Kuppfer cell', 'Macrophage', 'Mast cell',
       'Motile liver macrophage', 'Myeloid cell', 'Neutrophil',
       'Neutrophil_CAMP high', 'Neutrophil_CD177 high',
       'Neutrophil_ELANE high', 'Neutrophil_LCN2 high', 'Proliferating cell',
       'Sinusoidal endothelial cell', 'Sinusoidal endothelial cell_FCN1 high',
       'Smooth muscle cell'

In [608]:
# Working categorical with all original labels declared, so labels can be
# reassigned in the next cell.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Activated T cell', 'B cell (Plasmocyte)_IGHA/HM high',
       'B cell (Plasmocyte)_IGHA/HM_IGK high',
       'B cell (Plasmocyte)_IGHA/HM_IGL high', 'B cell (Plasmocyte)_IGHG high',
       'B cell (Plasmocyte)_IGHG_IGK high',
       'B cell (Plasmocyte)_IGHG_IGL high', 'B cell (Plasmocyte)_IGHM/HG high',
       'Conventional dendritic cell',
       'Conventional dendritic cell_FECER1A high', 'Dendritic cell',
       'Epithelial cell', 'Epithelial cell_SCGB3A1 high',
       'Epithelial cell_TM4SF4 high', 'Hepatocyte', 'Hepatocyte_FGB high',
       'Hepatocyte_GSTA1 high', 'Hepatocyte_HP high', 'Hepatocyte_TF high',
       'Kuppfer Cell', 'Kuppfer cell', 'Macrophage', 'Mast cell',
       'Motile liver macrophage', 'Myeloid cell', 'Neutrophil',
       'Neutrophil_CAMP high', 'Neutrophil_CD177 high',
       'Neutrophil_ELANE high', 'Neutrophil_LCN2 high', 'Proliferating cell',
       'Sinusoidal endothelial cell', 'Sinusoidal endothelial cell_FCN1 high',
       'Smooth muscle cell', 'Vascular endothelial cell'])

In [609]:
# Collapse the fine-grained labels into one representative label per group.
# Key = label that is kept, value = labels folded into it. The groups are
# disjoint, so the order of application does not affect the result.
# (The former leading "'x' -> 'Activated T cell'" stanza matched no label and
# was removed.)
celltype_merges = {
    'B cell (Plasmocyte)_IGHA/HM high': [
        'B cell (Plasmocyte)_IGHA/HM_IGK high',
        'B cell (Plasmocyte)_IGHA/HM_IGL high',
        'B cell (Plasmocyte)_IGHG high',
        'B cell (Plasmocyte)_IGHG_IGK high',
        'B cell (Plasmocyte)_IGHG_IGL high',
        'B cell (Plasmocyte)_IGHM/HG high',
    ],
    'Conventional dendritic cell': [
        'Conventional dendritic cell_FECER1A high',
        'Dendritic cell',
    ],
    'Hepatocyte': [
        'Hepatocyte_FGB high',
        'Hepatocyte_GSTA1 high',
        'Hepatocyte_HP high',
        'Hepatocyte_TF high',
    ],
    'Kuppfer Cell': [
        'Kuppfer cell',
        'Macrophage',
        'Motile liver macrophage',
    ],
    'Neutrophil': [
        'Neutrophil_CAMP high',
        'Neutrophil_CD177 high',
        'Neutrophil_ELANE high',
        'Neutrophil_LCN2 high',
    ],
    'Sinusoidal endothelial cell': [
        'Sinusoidal endothelial cell_FCN1 high',
        'Vascular endothelial cell',
    ],
    'Epithelial cell': [
        'Epithelial cell_SCGB3A1 high',
        'Epithelial cell_TM4SF4 high',
        'Proliferating cell',
    ],
}
for merged_label, source_labels in celltype_merges.items():
    ix = np.isin(ref_cluster, source_labels)
    ref_cluster[ix] = merged_label

In [610]:
# Write the merged labels back, declaring only the categories that remain.
# This order fixes the positions used by the rename in the next cell.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Activated T cell', 'B cell (Plasmocyte)_IGHA/HM high',
       'Conventional dendritic cell',
      
       'Epithelial cell',  'Hepatocyte', 
       'Kuppfer Cell',  'Mast cell',
       'Myeloid cell', 'Neutrophil',
         
       'Sinusoidal endothelial cell', 
       'Smooth muscle cell'])

In [611]:
# Positional rename to the harmonized vocabulary; entry i replaces category i
# of the list declared in the previous cell. Note that 'Epithelial cell'
# (which by now also holds 'Proliferating cell') becomes 'Unknown'.
adata.rename_categories('celltype', ['T cells', 'Plasma cells',
       'Dendritic cells',
      
       'Unknown',  'Hepatocytes', 
       'Macrophages',  'Mast cells',
       'Myeloid cells', 'Neutrophils',
         
       'Endothelial cells', 
       'Smooth muscle cells'])

In [615]:
# Relabel the sub-tissue category by name. The earlier version renamed by
# position and built a `ref_cluster` that was never used; looking the label
# up per category fails loudly (KeyError) on an unexpected value.
sub_tissue_labels = {'AdultLiver': 'Liver'}
adata.rename_categories(
    'sub_tissue',
    [sub_tissue_labels[name] for name in adata.obs['sub_tissue'].cat.categories],
)

In [616]:
# Capitalize the sex labels by name rather than by position (the previously
# built `ref_cluster` was never used). Unexpected values raise KeyError.
sex_labels = {'female': 'Female', 'male': 'Male'}
adata.rename_categories(
    'sex',
    [sex_labels[name] for name in adata.obs['sex'].cat.categories],
)

In [617]:
# Strip the trailing 'Y' from the age labels, looked up by name rather than
# by position (the previously built `ref_cluster` was never used).
# Unexpected values raise KeyError.
age_labels = {'21Y': '21', '23Y': '23', '52Y': '52'}
adata.rename_categories(
    'age',
    [age_labels[name] for name in adata.obs['age'].cat.categories],
)

In [618]:
# Prefix the donor labels with the dataset name, looked up by name rather
# than by position (the previously built `ref_cluster` was never used).
# Unexpected values raise KeyError.
donor_labels = {
    'Donor38': 'Han-Donor38',
    'Donor39': 'Han-Donor39',
    'Donor40': 'Han-Donor40',
}
adata.rename_categories(
    'donor',
    [donor_labels[name] for name in adata.obs['donor'].cat.categories],
)

In [619]:
# Fill the harmonized metadata columns shared by all processed datasets.
# The literal string 'NaN' (not np.nan) marks fields that are unavailable.
adata.obs['Organ'] = 'Liver'
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
adata.obs['Dataset'] = 'Han_Liver'
adata.obs['InternDatasetNumber'] ='05-6-Liver-Han-2020'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

adata.obs['celltype'] = adata.obs['celltype']  # self-assignment; column is left unchanged
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['age']  # self-assignment
adata.obs['sex'] = adata.obs['sex']  # self-assignment
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

# Keep both original annotation levels next to the harmonized one.
adata.obs['original_celltype_1'] = adata.obs['celltype_specific']
adata.obs['original_celltype_2'] = adata.obs['celltype_global']
adata.obs['original_celltype_3'] = 'NaN'

In [621]:
# Store the (dense, normalized) expression matrix as SciPy CSR again.
adata.X = sp.sparse.csr_matrix(adata.X)

In [622]:
# Write the processed object as .h5ad; `writepath` is defined outside this section.
# NOTE(review): unlike the other liver sections, obs_names_make_unique() is
# not called before writing - confirm the cell names are already unique.
adata.write(writepath + '05-6-Liver-Han-2020-processed.h5ad')

## 05-7-Liver_ImmuneCells-Teichmann-2022

In [188]:
# Select the liver cells from `adata_analysis` (prepared outside this
# section) and work on an independent copy.
ix=np.isin(adata_analysis.obs['tissue_major'], ['Liver'])
adata=adata_analysis[ix].copy()

In [193]:
# Fill the harmonized metadata columns shared by all processed datasets.
# The literal string 'NaN' (not np.nan) marks fields that are unavailable.
adata.obs['Organ'] =  adata.obs['tissue_major']
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
adata.obs['Dataset'] = adata.obs['Dataset']  # self-assignment; column is left unchanged
adata.obs['InternDatasetNumber'] = '05-7-Liver_ImmuneCells-Teichmann-2022'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

adata.obs['celltype'] = adata.obs['celltype']  # self-assignment
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['development_stage']
adata.obs['sex'] = adata.obs['sex']  # self-assignment
adata.obs['ethnicity'] = adata.obs['ethnicity']  # self-assignment
adata.obs['health_status'] = 'NaN'

# Keep the original annotations next to the harmonized one.
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']
adata.obs['original_celltype_3'] = 'NaN'

In [194]:
# Store the expression matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [195]:
# Make observation (cell) names unique.
adata.obs_names_make_unique()

In [196]:
# Write the processed object as .h5ad; `writepath` is defined outside this section.
adata.write(writepath + '05-7-Liver_ImmuneCells-Teichmann-2022-processed.h5ad')

# 06-Pancreas

## 06-1-Pancreas-Baron-2016

In [20]:
# Load the human1 count table and turn it into an AnnData object.
sample1 = pd.read_csv(writepath + 'GSM2230757_human1_umifm_counts.csv', sep=',')

# Move the annotation columns out of the table so only gene counts remain;
# the barcode column becomes the row index (and thereby the cell names).
cells_1 = sample1.pop('assigned_cluster').to_numpy()
id_1 = sample1.pop('Unnamed: 0').to_numpy()
sample1.index = sample1.pop('barcode')

adata1 = anndata.AnnData(X=sample1)

# Re-attach the annotations as per-cell metadata.
adata1.obs['assigned_cluster'] = cells_1
adata1.obs['id'] = id_1

In [39]:
# Load the human2 count table and turn it into an AnnData object.
sample2 = pd.read_csv(writepath + 'GSM2230758_human2_umifm_counts.csv', sep=',')

# Move the annotation columns out of the table so only gene counts remain;
# the barcode column becomes the row index (and thereby the cell names).
cells_2 = sample2.pop('assigned_cluster').to_numpy()
id_2 = sample2.pop('Unnamed: 0').to_numpy()
sample2.index = sample2.pop('barcode')

adata2 = anndata.AnnData(X=sample2)

# Re-attach the annotations as per-cell metadata.
adata2.obs['assigned_cluster'] = cells_2
adata2.obs['id'] = id_2

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [45]:
# Load the human3 count table and turn it into an AnnData object.
sample3 = pd.read_csv(writepath + 'GSM2230759_human3_umifm_counts.csv', sep=',')

# Move the annotation columns out of the table so only gene counts remain;
# the barcode column becomes the row index (and thereby the cell names).
cells_3 = sample3.pop('assigned_cluster').to_numpy()
id_3 = sample3.pop('Unnamed: 0').to_numpy()
sample3.index = sample3.pop('barcode')

adata3 = anndata.AnnData(X=sample3)

# Re-attach the annotations as per-cell metadata.
adata3.obs['assigned_cluster'] = cells_3
adata3.obs['id'] = id_3

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [48]:
# Load the human4 count table and turn it into an AnnData object.
sample4 = pd.read_csv(writepath + 'GSM2230760_human4_umifm_counts.csv', sep=',')

# Move the annotation columns out of the table so only gene counts remain;
# the barcode column becomes the row index (and thereby the cell names).
cells_4 = sample4.pop('assigned_cluster').to_numpy()
id_4 = sample4.pop('Unnamed: 0').to_numpy()
sample4.index = sample4.pop('barcode')

adata4 = anndata.AnnData(X=sample4)

# Re-attach the annotations as per-cell metadata.
adata4.obs['assigned_cluster'] = cells_4
adata4.obs['id'] = id_4

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [49]:
# Stack the four donors into one object; obs['sample'] records which input
# object each cell came from.
adata=adata1.concatenate(adata2, adata3, adata4, batch_key='sample')

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Or pass `index_unique!=None` to `.concatenate`.


In [506]:
# Tag the cells with the internal dataset ID (set again in the metadata cell below).
adata.obs['InternDatasetNumber'] ='06-1-Pancreas-Baron-2016'

In [510]:
# Per-cell QC covariates: total counts, natural log of total counts, and
# number of genes with a non-zero count.
adata.obs['n_counts'] = adata.X.sum(1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1)

In [513]:
# Filter parameters.
# Cell-level upper bounds are disabled for this dataset:
#sc.pp.filter_cells(adata, max_counts = 4200)
#sc.pp.filter_cells(adata, max_genes = 9000)
# Drop genes detected in fewer than 10 cells.
sc.pp.filter_genes(adata, min_cells=10)

  utils.warn_names_duplicates("obs")


In [515]:
# Flag mitochondrial genes (gene symbols starting with 'MT-').
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (the following normalization cells
# operate on a dense matrix). Skipped if the matrix is already dense.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# Fraction of each cell's counts that come from mitochondrial genes.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values
adata.obs['mt_frac'] = mt_frac

# Keep cells with less than 20% mitochondrial counts (the old comment said
# 25%, but the threshold in the code is 0.20). Copy so `adata` is a real
# object rather than a view; assigning new obs columns to a view triggers an
# implicit copy and a warning.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [516]:
# Scratch copy used only to derive clusters for scran; `adata` keeps the counts.
adata_pp=adata.copy()

  utils.warn_names_duplicates("obs")


In [517]:
# Coarse clustering of the scratch copy; the cluster labels are passed to
# scran as the `clusters` argument. Steps: scale each cell to 1e6 total
# counts, log1p, 15-component PCA, neighbor graph, Louvain (resolution 0.5)
# stored in obs['groups'].
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 12 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [518]:
# Inputs for the R cell: cluster label per cell and the count matrix
# transposed (so genes become rows and cells become columns).
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [519]:
%%R -i data_mat -i input_groups -o size_factors
# library() stops with a clear error when scran is not installed; require()
# would only return FALSE and let computeSumFactors() fail afterwards.
library(scran)
# Size factors computed within the supplied clusters (min.mean = 0.1).
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [520]:
# The scratch copy is no longer needed; free it and attach the size factors
# returned by the R cell to the cells.
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until
  utils.warn_names_duplicates("obs")


In [521]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [522]:
# Keep a copy of the not-yet-normalized adata.X in the 'counts' layer for
# downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [523]:
# Snapshot the current (pre-normalization) state into .raw.
adata.raw = adata

In [524]:
# Normalize in place: divide every cell (row) by its size factor, then log1p.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [525]:
# Convert the normalized matrix back to SciPy CSR before the HVG/PCA steps.
adata.X = sp.sparse.csr_matrix(adata.X)

In [526]:
# Flag the top 4000 highly variable genes (cell_ranger flavor) without
# subsetting the matrix (subset=False).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [527]:
# Embedding for visualization: 50-component PCA on the highly variable genes
# (ARPACK solver), neighbor graph with default settings, then UMAP.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:03)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:16)


In [529]:
# Start the harmonized 'celltype' column from a copy of 'annotations_final',
# then list its labels.
# NOTE(review): 'annotations_final' is not created in the visible cells of
# this section (the loaders only add 'assigned_cluster' and 'id');
# presumably it is added elsewhere - confirm for a clean top-to-bottom run.
adata.obs['celltype'] = adata.obs['annotations_final'].copy()
adata.obs['celltype'].cat.categories

Index(['Acinar', 'Alpha', 'Beta', 'Delta', 'Ductal', 'Endothelial', 'Epsilon',
       'Gamma', 'Macrophages', 'Mast cells', 'Stellates', 'Schwann',
       'T cells'],
      dtype='object')

In [530]:
# Working categorical with all original labels declared, so labels can be
# reassigned in the next cell.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Acinar', 'Alpha', 'Beta', 'Delta', 'Ductal', 'Endothelial', 'Epsilon',
       'Gamma', 'Macrophages', 'Mast cells', 'Stellates', 'Schwann',
       'T cells'])

In [531]:
# Pool Beta, Delta, Epsilon and Gamma cells under the 'Alpha' label, which is
# renamed to 'Pancreatic endocrine cells' further below.
ix=np.isin(ref_cluster,['Beta', 'Delta', 'Epsilon', 'Gamma'])
ref_cluster[ix]='Alpha'

In [532]:
# Write the merged labels back, declaring only the categories that remain.
# This order fixes the positions used by the rename below.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Acinar', 'Alpha', 'Ductal', 'Endothelial',
                                                       'Macrophages', 'Mast cells', 'Stellates', 'Schwann',
                                                       'T cells'])

In [533]:
# Check the remaining categories and their order before renaming.
adata.obs['celltype'].cat.categories

Index(['Acinar', 'Alpha', 'Ductal', 'Endothelial', 'Macrophages', 'Mast cells',
       'Stellates', 'Schwann', 'T cells'],
      dtype='object')

In [534]:
# Positional rename to the harmonized vocabulary; entry i replaces category i
# of the list shown in the previous cell.
adata.rename_categories('celltype',['Pancreatic acinar cells', 'Pancreatic endocrine cells', 'Pancreatic ductal cells', 'Endothelial cells', 'Macrophages', 'Mast cells',
       'Pancreatic stellate cells', 'Glial cells', 'T cells'])

  res = method(*args, **kwargs)


In [538]:
# Derive donor labels from the per-sample batch column.
adata.obs['donor'] = adata.obs['sample']
# NOTE(review): `ref_cluster` is built here but never applied, so the rename
# below is purely positional on whatever categories obs['sample'] holds.
# The categories listed here ('1'..'4') presumably reflect what the column
# contained when the cell was run - verify against the actual data.
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['1', '2', '3', '4'])
adata.rename_categories('donor', ['Baron_Pancreas-Donor1', 'Baron_Pancreas-Donor2', 'Baron_Pancreas-Donor3', 'Baron_Pancreas-Donor4'])

  res = method(*args, **kwargs)


In [539]:
# Fill the harmonized metadata columns shared by all processed datasets.
# The literal string 'NaN' (not np.nan) marks fields that are unavailable.
adata.obs['Organ'] = 'Pancreas'
adata.obs['Organ_Specific'] = 'Pancreas'
adata.obs['Dataset'] = 'Baron_Pancreas'
adata.obs['InternDatasetNumber'] ='06-1-Pancreas-Baron-2016'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

adata.obs['celltype'] = adata.obs['celltype']  # self-assignment; column is left unchanged
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = adata.obs['sample']
adata.obs['age'] = 'NaN'
adata.obs['sex'] = 'NaN'
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

# Keep the original annotation next to the harmonized one.
adata.obs['original_celltype_1'] = adata.obs['annotations_final']
adata.obs['original_celltype_2'] = 'NaN'
adata.obs['original_celltype_3'] = 'NaN'

In [541]:
# Ensure adata.X is SciPy CSR before writing (it was already converted after
# normalization earlier in this section).
adata.X = sp.sparse.csr_matrix(adata.X)

In [542]:
# Make observation (cell) names unique; duplicate-name warnings were raised
# while loading and processing this dataset.
adata.obs_names_make_unique()

In [543]:
# Write the processed object as .h5ad; `writepath` is defined outside this section.
adata.write(writepath + '06-1-Pancreas-Baron-2016-processed.h5ad')

## 06-2-Pancreas-Peng-2019

In [19]:
# Read the space-delimited count matrix as float32 and transpose it; the
# per-cell annotations are attached to the transposed object below.
adata = sc.read_text(
    writepath + 'count-matrix_peng.txt',
    delimiter=' ',
    dtype='float32',
).T

In [21]:
# Attach the published cell-type annotation.
# NOTE(review): values are assigned by position, which assumes the rows of
# the annotation file are in the same order as the cells in `adata` -
# confirm (e.g. by comparing barcodes) before relying on it.
annotations=pd.read_csv(writepath + 'all_celltype_peng.txt', delimiter='\t')
adata.obs['cell_types']=np.array(annotations['cluster'])

In [23]:
# Accumulator for the per-cell sample prefix, filled by the loop below.
samples=[]

In [24]:
# Cell barcodes; their first three characters are used as the sample prefix below.
barcodes=adata.obs.index

In [25]:
# Sample prefix = first three characters of every barcode.
# The list is rebuilt from scratch instead of appended to, so re-running this
# cell does not double its length.
samples = [barcode[0:3] for barcode in barcodes]

In [26]:
# Convert the prefix list to a NumPy array.
samples=np.array(samples)

In [27]:
# String representation of the whole array with underscores removed.
# NOTE(review): `string_samples` is not used in the visible cells, and str()
# of an array is a printable summary rather than per-element data -
# presumably a leftover experiment; confirm it can be dropped.
string_samples = str(samples).replace('_', '')

In [28]:
# Store the 3-character sample prefix per cell.
adata.obs['sample']=samples

In [29]:
# Convert string annotation columns ('cell_types', 'sample') to categoricals.
adata.strings_to_categoricals()

... storing 'cell_types' as categorical
... storing 'sample' as categorical


In [31]:
# Turn the 3-character barcode prefixes into clean sample names by stripping
# the trailing underscore ('N1_' -> 'N1', 'N10' stays 'N10').
# The labels are derived from each category instead of being assigned by
# position: categories are stored in lexicographic order, in which e.g.
# 'N10' and 'N11' sort before 'N1_', so a positional rename with
# ['N1', 'N2', ...] attaches the wrong name to most samples.
expected_samples = (['N%d' % k for k in range(1, 12)]
                    + ['T%d' % k for k in range(1, 25)])
sample_labels = [prefix.rstrip('_') for prefix in adata.obs['sample'].cat.categories]
# Fail loudly if the prefixes are not the 11 normal + 24 tumor samples the
# following cells rely on.
if sorted(sample_labels) != sorted(expected_samples):
    raise ValueError(
        'Unexpected sample prefixes: %r' % list(adata.obs['sample'].cat.categories)
    )
adata.rename_categories('sample', sample_labels)

In [34]:
# Start the 'condition' column as a copy of the sample labels; it is
# collapsed to Healthy/Tumor in the following cells.
adata.obs['condition']=adata.obs['sample'].copy()

In [36]:
# Working categorical with all sample labels declared, so labels can be
# reassigned in the next cell.
ref_cluster=pd.Categorical(adata.obs['condition'],
                           categories=['N1', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8', 'N9', 'N10', 'N11',
       'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11',
       'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20', 'T21',
       'T22', 'T23', 'T24'])

In [37]:
# Collapse all N* samples onto 'N1' and all T* samples onto 'T1'; these two
# labels are renamed to 'Healthy' and 'Tumor' below.
ix=np.isin(ref_cluster,['N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8', 'N9', 'N10', 'N11'])
ref_cluster[ix]='N1'

ix=np.isin(ref_cluster,['T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11',
       'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20', 'T21',
       'T22', 'T23', 'T24'])
ref_cluster[ix]='T1'

In [38]:
# Write the two-level condition back, declaring only the remaining categories.
adata.obs['condition']=pd.Categorical(ref_cluster,
                                            categories=['N1','T1'])

In [39]:
# Positional rename: 'N1' -> 'Healthy', 'T1' -> 'Tumor' (order declared in
# the previous cell).
adata.rename_categories('condition',
                        ['Healthy','Tumor'])

In [548]:
# Tag the cells with the internal dataset ID.
adata.obs['InternDatasetNumber'] ='06-2-Pancreas-Peng-2019'

In [551]:
# Keep only the cells labelled 'Healthy'; tumor samples are discarded.
ix=np.isin(adata.obs['condition'],['Healthy']) 
adata=adata[ix].copy()

In [554]:
# Per-cell QC covariates: total counts, natural log of total counts, and
# number of genes with a non-zero count.
adata.obs['n_counts'] = adata.X.sum(1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(1)

In [557]:
# Filter parameters.
# Cell-level upper bounds are disabled for this dataset:
#sc.pp.filter_cells(adata, max_counts = 4200)
#sc.pp.filter_cells(adata, max_genes = 9000)
# Drop genes detected in fewer than 10 cells.
sc.pp.filter_genes(adata, min_cells=10)

filtered out 2242 genes that are detected in less than 10 cells


In [559]:
# Flag mitochondrial genes (gene symbols starting with 'MT-').
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (the following normalization cells
# operate on a dense matrix). Skipped if the matrix is already dense.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# Fraction of each cell's counts that come from mitochondrial genes.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values
adata.obs['mt_frac'] = mt_frac

# Keep cells with less than 20% mitochondrial counts (the old comment said
# 25%, but the threshold in the code is 0.20). Copy so `adata` is a real
# object rather than a view; assigning new obs columns to a view triggers an
# implicit copy and a warning.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [560]:
# Scratch copy used only to derive clusters for scran; `adata` keeps the counts.
adata_pp=adata.copy()

In [561]:
# Coarse clustering of the scratch copy; the cluster labels are passed to
# scran as the `clusters` argument. Steps: scale each cell to 1e6 total
# counts, log1p, 15-component PCA, neighbor graph, Louvain (resolution 0.5)
# stored in obs['groups'].
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:02): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 13 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:01)


In [562]:
# Inputs for the R cell: cluster label per cell and the count matrix
# transposed (so genes become rows and cells become columns).
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [563]:
%%R -i data_mat -i input_groups -o size_factors
# library() stops with a clear error when scran is not installed; require()
# would only return FALSE and let computeSumFactors() fail afterwards.
library(scran)
# Size factors computed within the supplied clusters (min.mean = 0.1).
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [564]:
# The scratch copy is no longer needed; free it and attach the size factors
# returned by the R cell to the cells.
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [565]:
# Convert any remaining string annotation columns to categoricals.
adata.strings_to_categoricals()

In [566]:
# Keep a copy of the not-yet-normalized adata.X in the 'counts' layer for
# downstream analysis.
# NOTE(review): unlike the liver and Baron sections, `adata.raw` is not set
# before normalization here - confirm this is intended.
adata.layers["counts"] = adata.X.copy()

In [567]:
# Divide every cell (row) by its size factor in place, then log1p-transform.
per_cell_scale = adata.obs['size_factors'].values.reshape(-1, 1)
adata.X /= per_cell_scale
sc.pp.log1p(adata)

In [568]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [569]:
# extract highly variable genes
# subset=False keeps every gene in adata; the PCA that follows restricts
# itself to the selected genes via use_highly_variable=True.
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [570]:
# Calculate the visualizations
# PCA with 50 components on the highly variable genes, then a neighbor graph
# and a UMAP embedding, both with scanpy's default settings.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:07)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:13)


In [571]:
#sc.pl.umap(adata, color='cell_type')

In [572]:
# make consistent annotations across datasets
adata.obs['celltype'] = adata.obs['cell_types'].copy()
adata.obs['celltype'].cat.categories

Index(['Acinar cell', 'B cell', 'Ductal cell type 1', 'Endocrine cell',
       'Endothelial cell', 'Fibroblast cell', 'Macrophage cell',
       'Stellate cell', 'T cell'],
      dtype='object')

In [573]:
# Re-encode the labels with an additional 'Ductal cell type 2' level.
peng_levels = [
    'Acinar cell', 'B cell', 'Ductal cell type 1', 'Ductal cell type 2',
    'Endocrine cell', 'Endothelial cell', 'Fibroblast cell',
    'Macrophage cell', 'Stellate cell', 'T cell',
]
ref_cluster = pd.Categorical(adata.obs['celltype'], categories=peng_levels)

In [574]:
# Fold 'Ductal cell type 2' into 'Ductal cell type 1'.
is_ductal_type_2 = ref_cluster.isin(['Ductal cell type 2'])
ref_cluster[is_ductal_type_2] = 'Ductal cell type 1'

In [575]:
# Write the merged labels back with the reduced set of levels.
peng_merged_levels = [
    'Acinar cell', 'B cell', 'Ductal cell type 1', 'Endocrine cell',
    'Endothelial cell', 'Fibroblast cell', 'Macrophage cell',
    'Stellate cell', 'T cell',
]
adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=peng_merged_levels)

In [576]:
# Harmonized cell-type names, listed in the same order as the current
# 'celltype' categories (the rename is positional).
peng_harmonized_names = [
    'Pancreatic acinar cells', 'B cells', 'Pancreatic ductal cells',
    'Pancreatic endocrine cells', 'Endothelial cells', 'Fibroblast cells',
    'Macrophages', 'Pancreatic stellate cells', 'T cells',
]
adata.rename_categories('celltype', peng_harmonized_names)

  res = method(*args, **kwargs)


In [579]:
# Label each donor from its sample ID (N1 -> Donor1, ..., N11 -> Donor11).
# The mapping is spelled out by name so the result does not depend on the
# order in which the 'sample' categories happen to be stored: a positional
# rename would pair 'N10' with 'Donor2' if the categories are sorted as strings.
donor_labels = {'N%d' % i: 'Peng_Pancreas-Donor%d' % i for i in range(1, 12)}
sample_ids = adata.obs['sample'].astype('category')
unexpected_ids = set(sample_ids.cat.categories) - set(donor_labels)
if unexpected_ids:
    # Fail loudly rather than leave some donors with their raw sample ID.
    raise ValueError('Unexpected sample IDs: %s' % sorted(unexpected_ids))
adata.obs['donor'] = sample_ids.cat.rename_categories(donor_labels)

In [580]:
# Harmonized per-cell annotation columns, written in the order listed.
# Plain strings are broadcast to every cell; Series are taken from existing
# obs columns. 'NaN' is the string placeholder used for unavailable fields.
peng_annotations = [
    ('Organ', 'Pancreas'),
    ('Organ_Specific', 'Pancreas'),
    ('Dataset', 'Peng_Pancreas'),
    ('InternDatasetNumber', '06-2-Pancreas-Peng-2019'),
    ('Dataset_status', 'Healthy_Dataset'),
    ('celltype', adata.obs['celltype']),
    ('sub_celltype', 'NaN'),
    ('Malignant', 'NonMalignant'),
    ('Patient', adata.obs['donor']),
    ('Patient_Number', adata.obs['sample']),
    ('age', 'NaN'),
    ('sex', 'NaN'),
    ('ethnicity', 'NaN'),
    ('health_status', 'NaN'),
    ('original_celltype_1', adata.obs['cell_types']),
    ('original_celltype_2', 'NaN'),
    ('original_celltype_3', 'NaN'),
]
for column, value in peng_annotations:
    adata.obs[column] = value

In [582]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [583]:
adata.obs_names_make_unique()

In [239]:
adata.write(writepath + '06-2-Pancreas-Peng-2019-processed.h5ad')

## 06-3-Pancreas-Enge-2017

In [221]:
# Collection to fetch through sfaira, selected by its collection_id.
target_collections = ["a238e9fa-2bdf-41df-8522-69046f99baff"]
# Local directory handed to sfaira as data_path.
cache_path = os.path.join(".", "data")
dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)
# Narrow the database listing to the target collection, then download it.
dsg.subset(key="collection_id", values=target_collections)
dsg.datasets
dsg.download()

In [222]:
path_X = 'path/to/repo/a238e9fa-2bdf-41df-8522-69046f99baff/'
files = [f for f in listdir(path_X) if isfile(join(path_X, f))]

In [223]:
files

['66d15835-5dc8-4e96-b0eb-f48971cb65e8.h5ad']

In [224]:
# Read each downloaded .h5ad file and tag its cells with the file name.
# Every iteration rebinds `adata`, so it ends up holding the last file read.
for file_name in files:
    print(file_name)
    h5ad_path = path_X + file_name
    loaded = sc.read_h5ad(h5ad_path)
    loaded.obs['id'] = file_name
    adata = loaded

66d15835-5dc8-4e96-b0eb-f48971cb65e8.h5ad


In [226]:
adata.obs['InternDatasetNumber'] ='06-3-Pancreas-Enge-2017'

In [230]:
# Per-cell QC covariates: summed expression, its natural log, and the number
# of genes with a value above zero.
adata.obs['n_counts'] = adata.X.sum(axis=1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [233]:
# FILTER PARAMETERS
# Only the detected-genes cutoff is applied to cells; the max_counts cutoff
# below is left disabled.
#sc.pp.filter_cells(adata, max_counts = 14000)
sc.pp.filter_cells(adata, max_genes = 7500)
# Drop genes detected in fewer than 20 cells.
sc.pp.filter_genes(adata, min_cells=20)

filtered out 8 cells that have more than 7500 genes expressed
filtered out 6897 genes that are detected in less than 20 cells


In [235]:
adata.var.index = adata.var['feature_name']

In [236]:
# Flag mitochondrial genes by their 'MT-' symbol prefix.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense; it is sliced with the boolean gene
# mask below.
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# Mitochondrial fraction per cell = counts in MT genes / adata.obs['n_counts'].
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep cells whose mitochondrial fraction is below 20%. The .copy() makes the
# result a standalone AnnData instead of a view of the unfiltered object, so
# later assignments to adata.obs do not act on a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

  app.launch_new_instance()


In [238]:
adata_pp=adata.copy()

In [239]:
#Perform a clustering for scran normalization in clusters
# Works on adata_pp only: scale each cell to 1e6 counts, log1p-transform,
# reduce to 15 principal components, build the neighbor graph and run Louvain
# (resolution 0.5). The cluster labels are stored in adata_pp.obs['groups'].
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 19 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [240]:
#Preprocess variables for scran normalization
# Cluster labels come from the pre-clustered copy (adata_pp) ...
input_groups = adata_pp.obs['groups']
# ... while the matrix is the transpose of adata.X, i.e. taken from adata
# itself and not from the normalized adata_pp.
data_mat = adata.X.T

In [242]:
%%R -i data_mat -i input_groups -o size_factors
# data_mat and input_groups are passed in from Python (-i); size_factors is
# handed back to Python (-o).
require(scran)
# Size factors computed by scran with the Louvain groups as clusters.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [243]:
#Delete adata_pp
# The pre-clustered copy is not referenced again after this point.
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [244]:
adata.strings_to_categoricals()

In [245]:
# Keep a copy of the current adata.X (before the size-factor normalization
# that follows) in the "counts" layer for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [246]:
adata.raw = adata

In [247]:
# Divide every cell (row) by its size factor in place, then log1p-transform.
per_cell_scale = adata.obs['size_factors'].values.reshape(-1, 1)
adata.X /= per_cell_scale
sc.pp.log1p(adata)

In [248]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [249]:
# extract highly variable genes
# subset=False keeps every gene in adata; the PCA that follows restricts
# itself to the selected genes via use_highly_variable=True.
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:01)


In [250]:
# Calculate the visualizations
# PCA with 50 components on the highly variable genes, then a neighbor graph
# and a UMAP embedding, both with scanpy's default settings.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:05)


In [252]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [253]:
adata.obs['celltype'].cat.categories

Index(['acinar cell', 'mesenchymal cell', 'native cell',
       'pancreatic ductal cell', 'type A enteroendocrine cell',
       'type B pancreatic cell', 'type D enteroendocrine cell'],
      dtype='object')

In [254]:
# Working copy of the labels with the dataset's original levels.
enge_levels = [
    'acinar cell', 'mesenchymal cell', 'native cell',
    'pancreatic ductal cell', 'type A enteroendocrine cell',
    'type B pancreatic cell', 'type D enteroendocrine cell',
]
ref_cluster = pd.Categorical(adata.obs['celltype'], categories=enge_levels)

In [255]:
# Fold the type B and type D cells into the 'type A enteroendocrine cell' level.
enge_merged_away = ['type B pancreatic cell', 'type D enteroendocrine cell']
ref_cluster[ref_cluster.isin(enge_merged_away)] = 'type A enteroendocrine cell'

In [256]:
# Write the merged labels back with the reduced set of levels.
enge_merged_levels = [
    'acinar cell', 'mesenchymal cell', 'native cell',
    'pancreatic ductal cell', 'type A enteroendocrine cell',
]
adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=enge_merged_levels)

In [257]:
# Harmonized cell-type names, listed in the same order as the current
# 'celltype' categories (the rename is positional).
enge_harmonized_names = [
    'Pancreatic acinar cells', 'Mesenchymal stromal cells', 'Unknown',
    'Pancreatic ductal cells', 'Pancreatic endocrine cells',
]
adata.rename_categories('celltype', enge_harmonized_names)

In [261]:
# Shorten the age labels. The rename is positional, so the stored category
# order is assumed to be: '1-month-old human stage', '21-year-old human stage',
# '22-year-old human stage', '38-year-old human stage',
# '44-year-old human stage', '5-year-old human stage',
# '54-year-old human stage', '6-year-old human stage', 'unknown'.
adata.rename_categories('development_stage', ['1 Month', '21',
       '22', '38',
       '44', '5',
       '54', '6', 'NaN'])

In [262]:
# Harmonize the ethnicity labels. The rename is positional, so the stored
# category order is assumed to be 'African American', 'Asian', 'European',
# 'unknown'.
adata.rename_categories('ethnicity', ['African-American', 'Asian', 'European', 'NaN'])

In [263]:
# Capitalize the sex labels. The rename is positional, so the stored category
# order is assumed to be 'female', 'male', 'unknown'.
adata.rename_categories('sex', ['Female', 'Male', 'NaN'])

In [264]:
# One donor label per 'development_stage' category. The rename is positional
# (the i-th stored category receives the i-th name), so it does not matter
# whether 'development_stage' still carries its original labels or has
# already been shortened.
# NOTE(review): cells from different donors of the same age receive the same
# donor label here — confirm that this is intended.
adata.obs['donor'] = adata.obs['development_stage']
adata.rename_categories('donor', ['Enge_Pancreas-Donor1', 'Enge_Pancreas-Donor2',
       'Enge_Pancreas-Donor3', 'Enge_Pancreas-Donor4',
       'Enge_Pancreas-Donor5', 'Enge_Pancreas-Donor6',
       'Enge_Pancreas-Donor7', 'Enge_Pancreas-Donor8', 'Enge_Pancreas-Donor9'])

In [265]:
# Harmonized per-cell annotation columns, written in the order listed.
# Plain strings are broadcast to every cell; Series are taken from existing
# obs columns. 'NaN' is the string placeholder used for unavailable fields.
enge_annotations = [
    ('Organ', 'Pancreas'),
    ('Organ_Specific', 'Pancreas'),
    ('Dataset', 'Enge_Pancreas'),
    ('InternDatasetNumber', '06-3-Pancreas-Enge-2017'),
    ('Dataset_status', 'Healthy_Dataset'),
    ('celltype', adata.obs['celltype']),
    ('sub_celltype', 'NaN'),
    ('Malignant', 'NonMalignant'),
    ('Patient', adata.obs['donor']),
    ('Patient_Number', 'NaN'),
    ('age', adata.obs['development_stage']),
    ('sex', adata.obs['sex']),
    ('ethnicity', adata.obs['ethnicity']),
    ('health_status', 'NaN'),
    ('original_celltype_1', adata.obs['cell_type']),
    ('original_celltype_2', 'NaN'),
    ('original_celltype_3', 'NaN'),
]
for column, value in enge_annotations:
    adata.obs[column] = value

In [266]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [268]:
adata.obs_names_make_unique()

In [269]:
adata.write(writepath + '06-3-Pancreas-Enge-2017-processed.h5ad')

## 06-4-Pancreas-Oudenaarden-2016

In [270]:
# Collection to fetch through sfaira, selected by its collection_id.
target_collections = ["6e8c5415-302c-492a-a5f9-f29c57ff18fb"]
# Local directory handed to sfaira as data_path.
cache_path = os.path.join(".", "data")
dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)
# Narrow the database listing to the target collection, then download it.
dsg.subset(key="collection_id", values=target_collections)
dsg.datasets
dsg.download()

In [271]:
path_X = '/path/to/repo/6e8c5415-302c-492a-a5f9-f29c57ff18fb/'
files = [f for f in listdir(path_X) if isfile(join(path_X, f))]

In [272]:
files

['b07e5164-baf6-43d2-bdba-5a249d0da879.h5ad']

In [273]:
# Read each downloaded .h5ad file and tag its cells with the file name.
# Every iteration rebinds `adata`, so it ends up holding the last file read.
for file_name in files:
    print(file_name)
    h5ad_path = path_X + file_name
    loaded = sc.read_h5ad(h5ad_path)
    loaded.obs['id'] = file_name
    adata = loaded

b07e5164-baf6-43d2-bdba-5a249d0da879.h5ad


In [275]:
adata.obs['InternDatasetNumber'] ='06-4-Pancreas-Oudenaarden-2016'

In [278]:
# Per-cell QC covariates: summed expression, its natural log, and the number
# of genes with a value above zero.
adata.obs['n_counts'] = adata.X.sum(axis=1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)

In [281]:
# FILTER PARAMETERS
# No cell-level cutoffs are applied to this dataset; both thresholds below
# are left disabled.
#sc.pp.filter_cells(adata, max_counts = 23000)
#sc.pp.filter_cells(adata, max_genes = 6700)
# Drop genes detected in fewer than 20 cells.
sc.pp.filter_genes(adata, min_cells=20)

filtered out 9439 genes that are detected in less than 20 cells


In [283]:
adata.var.index = adata.var['feature_name']

In [284]:
# Flag mitochondrial genes by their 'MT-' symbol prefix.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense; it is sliced with the boolean gene
# mask below.
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# Mitochondrial fraction per cell = counts in MT genes / adata.obs['n_counts'].
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep cells whose mitochondrial fraction is below 20%. The .copy() makes the
# result a standalone AnnData instead of a view of the unfiltered object, so
# later assignments to adata.obs do not act on a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [285]:
adata_pp=adata.copy()

In [286]:
#Perform a clustering for scran normalization in clusters
# Works on adata_pp only: scale each cell to 1e6 counts, log1p-transform,
# reduce to 15 principal components, build the neighbor graph and run Louvain
# (resolution 0.5). The cluster labels are stored in adata_pp.obs['groups'].
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 10 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [287]:
#Preprocess variables for scran normalization
# Cluster labels come from the pre-clustered copy (adata_pp) ...
input_groups = adata_pp.obs['groups']
# ... while the matrix is the transpose of adata.X, i.e. taken from adata
# itself and not from the normalized adata_pp.
data_mat = adata.X.T

In [288]:
%%R -i data_mat -i input_groups -o size_factors
# data_mat and input_groups are passed in from Python (-i); size_factors is
# handed back to Python (-o).
require(scran)
# Size factors computed by scran with the Louvain groups as clusters.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [289]:
#Delete adata_pp
# The pre-clustered copy is not referenced again after this point.
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [290]:
adata.strings_to_categoricals()

In [291]:
# Keep a copy of the current adata.X (before the size-factor normalization
# that follows) in the "counts" layer for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [292]:
# Divide every cell (row) by its size factor in place, then log1p-transform.
per_cell_scale = adata.obs['size_factors'].values.reshape(-1, 1)
adata.X /= per_cell_scale
sc.pp.log1p(adata)

In [293]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [294]:
# extract highly variable genes
# subset=False keeps every gene in adata; the PCA that follows restricts
# itself to the selected genes via use_highly_variable=True.
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [None]:
# Calculate the visualizations
# PCA with 50 components on the highly variable genes, then a neighbor graph
# and a UMAP embedding, both with scanpy's default settings.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [297]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [298]:
adata.obs['celltype'].cat.categories

Index(['endothelial cell', 'mesenchymal cell', 'pancreatic A cell',
       'pancreatic D cell', 'pancreatic PP cell', 'pancreatic acinar cell',
       'pancreatic ductal cell', 'pancreatic endocrine cell',
       'pancreatic epsilon cell', 'type B pancreatic cell'],
      dtype='object')

In [299]:
# Working copy of the labels with the dataset's original levels.
oudenaarden_levels = [
    'endothelial cell', 'mesenchymal cell', 'pancreatic A cell',
    'pancreatic D cell', 'pancreatic PP cell', 'pancreatic acinar cell',
    'pancreatic ductal cell', 'pancreatic endocrine cell',
    'pancreatic epsilon cell', 'type B pancreatic cell',
]
ref_cluster = pd.Categorical(adata.obs['celltype'], categories=oudenaarden_levels)

In [300]:
# Fold all other endocrine levels into 'pancreatic A cell'.
oudenaarden_merged_away = [
    'pancreatic D cell', 'pancreatic PP cell', 'pancreatic endocrine cell',
    'pancreatic epsilon cell', 'type B pancreatic cell',
]
ref_cluster[ref_cluster.isin(oudenaarden_merged_away)] = 'pancreatic A cell'

In [301]:
# Write the merged labels back with the reduced set of levels ...
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['endothelial cell', 'mesenchymal cell', 'pancreatic A cell',
       'pancreatic acinar cell',
       'pancreatic ductal cell'])
# ... and give them the harmonized names that the other pancreas datasets in
# this notebook use for the same cell types. The rename is positional and
# follows the category order set just above.
adata.rename_categories('celltype', ['Endothelial cells', 'Mesenchymal stromal cells',
       'Pancreatic endocrine cells', 'Pancreatic acinar cells',
       'Pancreatic ductal cells'])

In [305]:
adata.obs['id'].cat.categories

Index(['b07e5164-baf6-43d2-bdba-5a249d0da879.h5ad'], dtype='object')

In [306]:
# Shorten the age labels. The rename is positional, so the stored category
# order is assumed to be '23-year-old human stage', '48-year-old human stage',
# '54-year-old human stage', '59-year-old human stage'.
adata.rename_categories('development_stage', ['23', '48',
       '54', '59'])

In [307]:
# Capitalize the sex labels. The rename is positional, so the stored category
# order is assumed to be 'female', 'male'.
adata.rename_categories('sex', ['Female', 'Male'])

In [308]:
# One donor label per 'development_stage' category. The rename is positional
# (the i-th stored category receives the i-th name), so it does not matter
# whether 'development_stage' still carries its original labels or has
# already been shortened.
# NOTE(review): cells from different donors of the same age would receive the
# same donor label here — confirm that this is intended.
adata.obs['donor'] = adata.obs['development_stage']
adata.rename_categories('donor', ['Oudenaarden_Pancreas-Donor1', 'Oudenaarden_Pancreas-Donor2',
       'Oudenaarden_Pancreas-Donor3', 'Oudenaarden_Pancreas-Donor4'])

In [312]:
adata.obs_names_make_unique()

In [None]:
adata.obs['InternDatasetNumber'] ='06-4-Pancreas-Oudenaarden-2016'

In [None]:
adata.write(writepath + '06-4-Pancreas-Oudenaarden-2016-processed.h5ad')

## 06-5-Pancreas-Pisco-2022

In [386]:
# Restrict adata_pisco to the cells annotated with one of the two pancreas
# tissues and continue with an independent copy.
pancreas_tissues = ['endocrine pancreas', 'exocrine pancreas']
is_pancreas = adata_pisco.obs['tissue'].isin(pancreas_tissues).values
adata = adata_pisco[is_pancreas].copy()

In [387]:
adata.obs['InternDatasetNumber'] = '06-5-Pancreas-Pisco-2022'

In [392]:
# Per-cell QC covariates: summed expression, its natural log, and the number
# of genes with a value above zero.
adata.obs['n_counts'] = adata.X.sum(axis=1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)

In [396]:
# FILTER PARAMETERS
# Remove cells with more than 16000 counts or more than 11000 detected genes.
sc.pp.filter_cells(adata, max_counts = 16000)
sc.pp.filter_cells(adata, max_genes = 11000)
# Drop genes detected in fewer than 20 cells.
sc.pp.filter_genes(adata, min_cells=20)

filtered out 6 cells that have more than 16000 counts
filtered out 3 cells that have more than 11000 genes expressed
filtered out 34384 genes that are detected in less than 20 cells


In [398]:
# Flag mitochondrial genes by their 'MT-' symbol prefix.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense; it is sliced with the boolean gene
# mask below.
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# Mitochondrial fraction per cell = counts in MT genes / adata.obs['n_counts'].
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep cells whose mitochondrial fraction is below 20%. The .copy() makes the
# result a standalone AnnData instead of a view of the unfiltered object, so
# later assignments to adata.obs do not act on a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [399]:
adata_pp=adata.copy()

In [400]:
#Perform a clustering for scran normalization in clusters
# Works on adata_pp only: scale each cell to 1e6 counts, log1p-transform,
# reduce to 15 principal components, build the neighbor graph and run Louvain
# (resolution 0.5). The cluster labels are stored in adata_pp.obs['groups'].
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 17 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [401]:
#Preprocess variables for scran normalization
# Cluster labels come from the pre-clustered copy (adata_pp) ...
input_groups = adata_pp.obs['groups']
# ... while the matrix is the transpose of adata.X, i.e. taken from adata
# itself and not from the normalized adata_pp.
data_mat = adata.X.T

In [402]:
%%R -i data_mat -i input_groups -o size_factors
# data_mat and input_groups are passed in from Python (-i); size_factors is
# handed back to Python (-o).
require(scran)
# Size factors computed by scran with the Louvain groups as clusters.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [403]:
#Delete adata_pp
# The pre-clustered copy is not referenced again after this point.
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [404]:
adata.strings_to_categoricals()

In [405]:
# Keep a copy of the current adata.X (before the size-factor normalization
# that follows) in the "counts" layer for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [406]:
adata.raw = adata

In [407]:
# Divide every cell (row) by its size factor in place, then log1p-transform.
per_cell_scale = adata.obs['size_factors'].values.reshape(-1, 1)
adata.X /= per_cell_scale
sc.pp.log1p(adata)

In [408]:
# extract highly variable genes
# subset=False keeps every gene in adata; the PCA that follows restricts
# itself to the selected genes via use_highly_variable=True.
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:01)


In [409]:
# Calculate the visualizations
# PCA with 50 components on the highly variable genes, then a neighbor graph
# and a UMAP embedding, both with scanpy's default settings.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:11)


In [411]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [412]:
adata.obs['celltype'].cat.categories

Index(['B cell', 'T cell', 'endothelial cell', 'fibroblast', 'mast cell',
       'mature NK T cell', 'myeloid cell', 'pancreatic A cell',
       'pancreatic D cell', 'pancreatic PP cell', 'pancreatic acinar cell',
       'pancreatic ductal cell', 'pancreatic stellate cell', 'plasma cell',
       'type B pancreatic cell'],
      dtype='object')

In [413]:
# Working copy of the labels with the dataset's original levels.
pisco_levels = [
    'B cell', 'T cell', 'endothelial cell', 'fibroblast', 'mast cell',
    'mature NK T cell', 'myeloid cell', 'pancreatic A cell',
    'pancreatic D cell', 'pancreatic PP cell', 'pancreatic acinar cell',
    'pancreatic ductal cell', 'pancreatic stellate cell', 'plasma cell',
    'type B pancreatic cell',
]
ref_cluster = pd.Categorical(adata.obs['celltype'], categories=pisco_levels)

In [414]:
# Fold the D, PP and type B cells into the 'pancreatic A cell' level.
pisco_merged_away = ['pancreatic D cell', 'pancreatic PP cell', 'type B pancreatic cell']
ref_cluster[ref_cluster.isin(pisco_merged_away)] = 'pancreatic A cell'

In [415]:
# Write the merged labels back with the reduced set of levels.
pisco_merged_levels = [
    'B cell', 'T cell', 'endothelial cell', 'fibroblast', 'mast cell',
    'mature NK T cell', 'myeloid cell', 'pancreatic A cell',
    'pancreatic acinar cell',
    'pancreatic ductal cell', 'pancreatic stellate cell', 'plasma cell',
]
adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=pisco_merged_levels)

In [416]:
# Harmonized cell-type names, listed in the same order as the current
# 'celltype' categories (the rename is positional).
pisco_harmonized_names = [
    'B cells', 'T cells', 'Endothelial cells', 'Fibroblast cells', 'Mast cells',
    'NK cells', 'Myeloid cells', 'Pancreatic endocrine cells',
    'Pancreatic acinar cells',
    'Pancreatic ductal cells', 'Pancreatic stellate cells', 'Plasma cells',
]
adata.rename_categories('celltype', pisco_harmonized_names)

In [420]:
# Relabel the two pancreas tissues. The rename is positional, so the stored
# category order is assumed to be 'endocrine pancreas', 'exocrine pancreas'.
adata.rename_categories('tissue', ['Pancreas_Endocrine','Pancreas_Exocrine' ])

In [421]:
# Capitalize the sex labels. The rename is positional, so the stored category
# order is assumed to be 'female', 'male'.
adata.rename_categories('sex', ['Female', 'Male'])

In [422]:
# Harmonize the ethnicity labels. The rename is positional, so the stored
# category order is assumed to be 'European', 'Hispanic or Latin American'.
adata.rename_categories('ethnicity', ['European', 'Hispanic or Latin-American'])

In [423]:
# Shorten the age labels. The rename is positional, so the stored category
# order is assumed to be '37-year-old human stage', '59-year-old human stage'.
adata.rename_categories('development_stage',['37', '59'])

In [424]:
# The donor IDs are kept as they are. The positional rename below maps the two
# stored categories (assumed to be 'TSP1', 'TSP9') onto the same names, so it
# only fails if 'donor' does not have exactly two categories.
adata.rename_categories('donor', ['TSP1', 'TSP9'])

In [425]:
# Harmonized per-cell annotation columns, written in the order listed.
# Plain strings are broadcast to every cell; Series are taken from existing
# obs columns. 'NaN' is the string placeholder used for unavailable fields.
pisco_annotations = [
    ('Organ', 'Pancreas'),
    ('Organ_Specific', adata.obs['tissue']),
    ('Dataset', 'Pisco_Pancreas'),
    ('InternDatasetNumber', '06-5-Pancreas-Pisco-2022'),
    ('Dataset_status', 'Healthy_Dataset'),
    ('celltype', adata.obs['celltype']),
    ('sub_celltype', 'NaN'),
    ('Malignant', 'NonMalignant'),
    ('Patient', adata.obs['donor']),
    ('Patient_Number', 'NaN'),
    ('age', adata.obs['development_stage']),
    ('sex', adata.obs['sex']),
    ('ethnicity', adata.obs['ethnicity']),
    ('health_status', 'NaN'),
    ('original_celltype_1', adata.obs['cell_type']),
    ('original_celltype_2', 'NaN'),
    ('original_celltype_3', 'NaN'),
]
for column, value in pisco_annotations:
    adata.obs[column] = value

In [427]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [428]:
adata.obs_names_make_unique()

In [429]:
adata.write(writepath + '06-5-Pancreas-Pisco-2022-processed.h5ad')

## 06-6-Pancreas-Han-2020

In [733]:
# Restrict adata_han to the cells whose sub_tissue is 'AdultPancreas' and
# continue with an independent copy.
is_adult_pancreas = adata_han.obs['sub_tissue'].isin(['AdultPancreas']).values
adata = adata_han[is_adult_pancreas].copy()

In [734]:
adata.obs['InternDatasetNumber'] ='06-6-Pancreas-Han-2020'

In [738]:
# Per-cell QC covariates: summed expression, its natural log, and the number
# of genes with a value above zero.
adata.obs['n_counts'] = adata.X.sum(axis=1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)

In [741]:
# FILTER PARAMETERS
# Remove cells with more than 2500 counts or more than 1100 detected genes.
sc.pp.filter_cells(adata, max_counts = 2500)
sc.pp.filter_cells(adata, max_genes = 1100)
# Drop genes detected in fewer than 10 cells.
sc.pp.filter_genes(adata, min_cells=10)

filtered out 25 cells that have more than 2500 counts
filtered out 1 cells that have more than 1100 genes expressed
filtered out 14979 genes that are detected in less than 10 cells


In [742]:
# Flag mitochondrial genes by their 'MT-' symbol prefix.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense; it is sliced with the boolean gene
# mask below.
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# Mitochondrial fraction per cell = counts in MT genes / adata.obs['n_counts'].
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep cells whose mitochondrial fraction is below 20%. The .copy() makes the
# result a standalone AnnData instead of a view of the unfiltered object, so
# later assignments to adata.obs do not act on a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [743]:
adata_pp=adata.copy()

In [744]:
#Perform a clustering for scran normalization in clusters
# Works on adata_pp only: scale each cell to 1e6 counts, log1p-transform,
# reduce to 15 principal components, build the neighbor graph and run Louvain
# (resolution 0.5). The cluster labels are stored in adata_pp.obs['groups'].
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 8 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [745]:
#Preprocess variables for scran normalization
# Cluster labels come from the pre-clustered copy (adata_pp) ...
input_groups = adata_pp.obs['groups']
# ... while the matrix is the transpose of adata.X, i.e. taken from adata
# itself and not from the normalized adata_pp.
data_mat = adata.X.T

In [746]:
%%R -i data_mat -i input_groups -o size_factors
# data_mat and input_groups are passed in from Python (-i); size_factors is
# handed back to Python (-o).
require(scran)
# Size factors computed by scran with the Louvain groups as clusters.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [747]:
#Delete adata_pp
# The pre-clustered copy is not referenced again after this point.
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [748]:
adata.strings_to_categoricals()

In [749]:
# Keep a copy of the current adata.X (before the size-factor normalization
# that follows) in the "counts" layer for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [750]:
adata.raw = adata

In [751]:
#Normalize data
# Divide every row of X in place by that cell's size factor
# ([:, None] makes the factors a column vector), then apply log1p.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [752]:
# extract highly variable genes
# subset=False: genes are not removed here; the PCA below selects the
# flagged genes through use_highly_variable=True.
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:01)


In [753]:
# Calculate the visualizations
# PCA (50 components, highly variable genes only) -> neighbor graph -> UMAP
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:19)


In [755]:
# make consistent annotations across datasets
# Work on a copy so 'celltype_specific' keeps the original labels
# (it is stored as 'original_celltype_1' further down).
adata.obs['celltype']=adata.obs['celltype_specific'].copy()
# Displayed so the category names can be used in the mapping cells below
adata.obs['celltype'].cat.categories

Index(['Acinar cell_CPA1 high', 'Acinar cell_REG1B high',
       'Acniar cell_ANXA4 high', 'Alpha cell', 'Beta cell', 'Ductal cell',
       'Endothelial cell', 'Exocrine cell', 'Exocrine cell_SAA1 high',
       'Fibroblast', 'M2 Macrophage', 'Smooth muscle cell'],
      dtype='object')

In [756]:
# Standalone Categorical of the labels with every original category listed,
# so that entries can be reassigned between categories in the next cell.
# The spellings (including 'Acniar') must match the dataset's own labels.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Acinar cell_CPA1 high', 'Acinar cell_REG1B high',
       'Acniar cell_ANXA4 high', 'Alpha cell', 'Beta cell', 'Ductal cell',
       'Endothelial cell', 'Exocrine cell', 'Exocrine cell_SAA1 high',
       'Fibroblast', 'M2 Macrophage', 'Smooth muscle cell'])

In [757]:
# Fold fine-grained labels into one representative label per group; the
# representative labels receive their final names in a later cell.
label_merges = [
    ('Acinar cell_CPA1 high',
     ['Acinar cell_REG1B high', 'Acniar cell_ANXA4 high',
      'Exocrine cell', 'Exocrine cell_SAA1 high']),
    ('Alpha cell', ['Beta cell']),
]
for kept_label, absorbed_labels in label_merges:
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = kept_label

In [758]:
# Rebuild the column with only the categories left after merging; the order
# given here is the order the positional renaming in the next cell relies on.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Acinar cell_CPA1 high', 
        'Alpha cell', 'Ductal cell',
       'Endothelial cell', 
       'Fibroblast', 'M2 Macrophage', 'Smooth muscle cell'])

In [759]:
# New names are matched to the existing categories by position, so this list
# follows the category order defined in the previous cell (7 entries).
adata.rename_categories('celltype', ['Pancreatic acinar cells', 
        'Pancreatic endocrine cells', 'Pancreatic ductal cells',
       'Endothelial cells',
       'Fibroblast cells', 'Macrophages', 'Smooth muscle cells'])

In [763]:
# Only the rename changes adata: the first expression is not displayed (it is
# not the last statement) and `ref_cluster` is not read by rename_categories.
adata.obs['sub_tissue'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sub_tissue'],
                           categories=['AdultPancreas'])
adata.rename_categories('sub_tissue', ['Pancreas'])

In [764]:
# Only the rename changes adata; the first two statements do not affect it.
adata.obs['sex'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['female'])
adata.rename_categories('sex', ['Female'])

In [765]:
# Only the rename changes adata; the age label '43Y' becomes '43' (still a category string).
adata.obs['age'].cat.categories
ref_cluster=pd.Categorical(adata.obs['age'],
                           categories=['43Y'])
adata.rename_categories('age',['43'])

In [766]:
# Only the rename changes adata; the donor id gets a 'Han-' prefix.
adata.obs['donor'].cat.categories
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['Donor44'])
adata.rename_categories('donor', ['Han-Donor44'])

In [767]:
# Harmonised per-cell metadata columns (the same column set is filled in for
# the other datasets in this notebook). Fields set to the string 'NaN' hold a
# text placeholder, not a missing value. Lines that assign a column to itself
# leave it unchanged.

# Dataset-level constants, identical for every cell
adata.obs['Organ'] = 'Pancreas'
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
adata.obs['Dataset'] = 'Han_Pancreas'
adata.obs['InternDatasetNumber'] ='06-6-Pancreas-Han-2020'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# Cell type annotation
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# Donor information
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['age']
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

# Labels as provided by the source dataset, before harmonisation
adata.obs['original_celltype_1'] = adata.obs['celltype_specific']
adata.obs['original_celltype_2'] = adata.obs['celltype_global']
adata.obs['original_celltype_3'] = 'NaN'

In [769]:
# Store X as a sparse CSR matrix again before writing to disk
adata.X = sp.sparse.csr_matrix(adata.X)

In [770]:
# Save the processed object; `writepath` is defined outside this section
adata.write(writepath + '06-6-Pancreas-Han-2020-processed.h5ad')

# 07-Spleen

## 07-1-Spleen-Pisco-2022

In [607]:
# Select the spleen cells of the Pisco object (loaded outside this section) and
# copy them so the following steps work on an independent AnnData
ix=np.isin(adata_pisco.obs['tissue'],['spleen']) 
adata=adata_pisco[ix].copy()

In [612]:
#calculate QC covariates
# Row sums of a sparse matrix come back as an (n_cells, 1) np.matrix; flatten
# them so every .obs column receives a plain 1-D array (a dense X, whose row
# sums are already 1-D, passes through unchanged).
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [616]:
# FILTER PARAMETERS
#Filter out cells
# Upper bounds only: cells with more than 12000 counts or more than 8000
# detected genes are removed; no lower bounds are applied in this cell.
sc.pp.filter_cells(adata, max_counts = 12000)
sc.pp.filter_cells(adata, max_genes = 8000)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not

filtered out 54 cells that have more than 12000 counts
filtered out 9 cells that have more than 8000 genes expressed
filtered out 31568 genes that are detected in less than 20 cells


In [618]:
# get mt genes
# Boolean mask over adata.var_names marking mitochondrial genes ('MT-' prefix)
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense: the following cells slice, transpose and divide X in place
adata.X = adata.X.todense()

# manually define mt score: summed MT-gene values per cell divided by the
# cell's total counts stored in .obs['n_counts']
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 20% mito fraction
# .copy() makes the filtered subset a standalone AnnData, so the later
# assignments to .obs and .X do not act on a view of the unfiltered object.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [619]:
# Scratch copy used only to derive clusters for scran
adata_pp=adata.copy()

In [620]:
#Perform a clustering for scran normalization in clusters
# Runs on the scratch copy only; the Louvain labels are stored in
# adata_pp.obs['groups'] and passed to scran as clusters.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:04): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:04)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 22 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:05)


In [621]:
#Preprocess variables for scran normalization
# Labels from the scratch copy; matrix from `adata`, transposed for R
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [622]:
%%R -i data_mat -i input_groups -o size_factors
# data_mat and input_groups are passed in from Python; size_factors is passed back.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [623]:
#Delete adata_pp
# Drop the scratch copy and keep the scran size factors per cell
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [624]:
# Turn string-valued annotation columns into categoricals (needed for the `.cat` accessors below)
adata.strings_to_categoricals()

In [625]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
# Taken before the size-factor division and log1p below.
adata.layers["counts"] = adata.X.copy()

In [626]:
# Store the current state (X not yet size-factor normalized) in .raw
adata.raw = adata

In [627]:
#Normalize data
# In-place division of each row by that cell's size factor, then log1p
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [628]:
# extract highly variable genes
# subset=False: genes are not removed; the PCA below uses the flagged genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:03)


In [629]:
# Calculate the visualizations
# PCA (50 components, highly variable genes only) -> neighbor graph -> UMAP
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:06)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:06)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:27)


In [630]:
#sc.pl.umap(adata, color='cell_type')

In [631]:
# make consistent annotations across datasets
# Copy so that 'cell_type' keeps the original labels (stored as 'original_celltype_1' further down)
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [632]:
# Display the original categories; they are listed verbatim in the mapping cells below
adata.obs['celltype'].cat.categories

Index(['CD141-positive myeloid dendritic cell',
       'CD1c-positive myeloid dendritic cell',
       'CD4-positive, alpha-beta memory T cell',
       'CD8-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta memory T cell', 'classical monocyte',
       'endothelial cell', 'erythrocyte', 'hematopoietic stem cell',
       'innate lymphoid cell', 'intermediate monocyte', 'macrophage',
       'mature NK T cell', 'memory B cell', 'naive B cell',
       'naive thymus-derived CD4-positive, alpha-beta T cell',
       'naive thymus-derived CD8-positive, alpha-beta T cell', 'neutrophil',
       'plasma cell', 'plasmacytoid dendritic cell', 'platelet',
       'regulatory T cell', 'type I NK T cell'],
      dtype='object')

In [633]:
# Standalone Categorical of the labels with every original category listed,
# so entries can be reassigned between categories in the next cell.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['CD141-positive myeloid dendritic cell',
       'CD1c-positive myeloid dendritic cell',
       'CD4-positive, alpha-beta memory T cell',
       'CD8-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta memory T cell', 'classical monocyte',
       'endothelial cell', 'erythrocyte', 'hematopoietic stem cell',
       'innate lymphoid cell', 'intermediate monocyte', 'macrophage',
       'mature NK T cell', 'memory B cell', 'naive B cell',
       'naive thymus-derived CD4-positive, alpha-beta T cell',
       'naive thymus-derived CD8-positive, alpha-beta T cell', 'neutrophil',
       'plasma cell', 'plasmacytoid dendritic cell', 'platelet',
       'regulatory T cell', 'type I NK T cell'])

In [634]:
# Fold fine-grained labels into one representative label per group; the
# representative labels receive their final names in a later cell.
label_merges = [
    ('CD141-positive myeloid dendritic cell',
     ['CD1c-positive myeloid dendritic cell', 'plasmacytoid dendritic cell']),
    ('CD4-positive, alpha-beta memory T cell',
     ['CD8-positive, alpha-beta T cell',
      'CD8-positive, alpha-beta memory T cell',
      'naive thymus-derived CD4-positive, alpha-beta T cell',
      'naive thymus-derived CD8-positive, alpha-beta T cell',
      'regulatory T cell', 'type I NK T cell']),
    ('classical monocyte', ['intermediate monocyte']),
    ('memory B cell', ['naive B cell']),
]
for kept_label, absorbed_labels in label_merges:
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = kept_label

In [635]:
# Rebuild the column with only the categories left after merging; the order
# given here is the order the positional renaming in the next cell relies on.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['CD141-positive myeloid dendritic cell',
        'CD4-positive, alpha-beta memory T cell',
      'classical monocyte',
       'endothelial cell', 'erythrocyte', 'hematopoietic stem cell',
       'innate lymphoid cell', 'macrophage',
       'mature NK T cell', 'memory B cell',
      'neutrophil',
       'plasma cell', 'platelet'])

In [636]:
# New names are matched to the existing categories by position (13 entries,
# same order as the category list in the previous cell).
adata.rename_categories('celltype',['Dendritic cells',
        'T cells',
      'Monocytes',
       'Endothelial cells', 'Erythroid cells', 'Hematopoietic stem cells',
       'Innate lymphoid cells', 'Macrophages',
       'NK cells', 'B cells',
      'Neutrophils',
       'Plasma cells', 'Thrombocytes'])

In [640]:
# Only the rename changes adata: the first expression is not displayed and
# `ref_cluster` is not read by rename_categories.
adata.obs['tissue'].cat.categories
ref_cluster=pd.Categorical(adata.obs['tissue'],
                           categories=['spleen'])
adata.rename_categories('tissue', ['Spleen'])

In [641]:
# Only the rename changes adata; categories are capitalised by position.
adata.obs['sex'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['female', 'male'])
adata.rename_categories('sex', ['Female', 'Male'])

In [642]:
# Only the rename changes adata; the first category gains a hyphen ('African-American ...').
adata.obs['ethnicity'].cat.categories
ref_cluster=pd.Categorical(adata.obs['ethnicity'],
                           categories=['African American or Afro-Caribbean', 'European'])
adata.rename_categories('ethnicity', ['African-American or Afro-Caribbean', 'European'])

In [643]:
# Only the rename changes adata; stage labels are shortened to the age in years (still category strings).
adata.obs['development_stage'].cat.categories
ref_cluster=pd.Categorical(adata.obs['development_stage'],
                           categories=['59-year-old human stage', '61-year-old human stage',
       '69-year-old human stage'])
adata.rename_categories('development_stage',['59', '61',
       '69'])

In [644]:
# The new names equal the names listed for the old categories, so this rename keeps the donor ids as they are.
adata.obs['donor'].cat.categories
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['TSP2', 'TSP7', 'TSP14'])
adata.rename_categories('donor', ['TSP2', 'TSP7', 'TSP14'])

In [645]:
# Harmonised per-cell metadata columns (same column set as for the other
# datasets in this notebook). 'NaN' is a text placeholder, not a missing
# value. Lines that assign a column to itself leave it unchanged.

# Dataset-level constants, identical for every cell
adata.obs['Organ'] = 'Spleen'
adata.obs['Organ_Specific'] = adata.obs['tissue']
adata.obs['Dataset'] = 'Pisco_Spleen'
adata.obs['InternDatasetNumber'] = '07-1-Spleen-Pisco-2022'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# Cell type annotation
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# Donor information; 'age' takes the renamed development_stage labels
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['development_stage']
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = adata.obs['ethnicity']
adata.obs['health_status'] = 'NaN'

# Labels as provided by the source dataset, before harmonisation
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = 'NaN'
adata.obs['original_celltype_3'] = 'NaN'

In [647]:
# Store X as a sparse CSR matrix again before writing to disk
adata.X = sp.sparse.csr_matrix(adata.X)

In [648]:
# Make the cell identifiers unique before saving
adata.obs_names_make_unique()

In [649]:
# Save the processed object; `writepath` is defined outside this section
adata.write(writepath + '07-1-Spleen-Pisco-2022-processed.h5ad')

## 07-2-Spleen-Han-2020

In [926]:
# Select the adult spleen cells of the Han object (loaded outside this
# section) and copy them into an independent AnnData
ix=np.isin(adata_han.obs['sub_tissue'],['AdultSpleen']) 
adata=adata_han[ix].copy()

In [927]:
# Internal dataset identifier (assigned again with the same value in the metadata cell further down)
adata.obs['InternDatasetNumber'] ='07-2-Spleen-Han-2020'

In [931]:
#calculate QC covariates
# Row sums of a sparse matrix come back as an (n_cells, 1) np.matrix; flatten
# them so every .obs column receives a plain 1-D array (a dense X, whose row
# sums are already 1-D, passes through unchanged).
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [934]:
# FILTER PARAMETERS
# Filter out cells
# Upper bounds only: cells with more than 4000 counts or more than 2000
# detected genes are removed; no lower bounds are applied in this cell.
sc.pp.filter_cells(adata, max_counts = 4000)
sc.pp.filter_cells(adata, max_genes = 2000)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not

filtered out 25 cells that have more than 4000 counts
filtered out 14680 genes that are detected in less than 20 cells


In [935]:
# get mt genes
# Boolean mask over adata.var_names marking mitochondrial genes ('MT-' prefix)
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense: the following cells slice, transpose and divide X in place
adata.X = adata.X.todense()

# manually define mt score: summed MT-gene values per cell divided by the
# cell's total counts stored in .obs['n_counts']
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 20% mito fraction
# .copy() makes the filtered subset a standalone AnnData, so the later
# assignments to .obs and .X do not act on a view of the unfiltered object.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [936]:
# Scratch copy used only to derive clusters for scran
adata_pp=adata.copy()

In [937]:
#Perform a clustering for scran normalization in clusters
# Runs on the scratch copy only; the Louvain labels are stored in
# adata_pp.obs['groups'] and passed to scran as clusters.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 11 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:01)


In [938]:
#Preprocess variables for scran normalization
# Labels from the scratch copy; matrix from `adata`, transposed for R
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [939]:
%%R -i data_mat -i input_groups -o size_factors
# data_mat and input_groups are passed in from Python; size_factors is passed back.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [940]:
#Delete adata_pp
# Drop the scratch copy and keep the scran size factors per cell
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [941]:
# Turn string-valued annotation columns into categoricals (needed for the `.cat` accessors below)
adata.strings_to_categoricals()

In [942]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
# Taken before the size-factor division and log1p below.
adata.layers["counts"] = adata.X.copy()

In [943]:
# Store the current state (X not yet size-factor normalized) in .raw
adata.raw = adata

In [944]:
#Normalize data
# In-place division of each row by that cell's size factor, then log1p
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [945]:
# extract highly variable genes
# subset=False: genes are not removed; the PCA below uses the flagged genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [946]:
# Calculate the visualizations
# PCA (50 components, highly variable genes only) -> neighbor graph -> UMAP
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:12)


In [947]:
#sc.pl.umap(adata, color='celltype_specific')

In [948]:
# make consistent annotations across datasets
# Copy so that 'celltype_specific' keeps the original labels (stored as 'original_celltype_1' further down)
adata.obs['celltype']=adata.obs['celltype_specific'].copy()
# Displayed so the category names can be used in the mapping cells below
adata.obs['celltype'].cat.categories

Index(['B cell (Plasmocyte)_IGHA/HM_IGK high',
       'B cell (Plasmocyte)_IGHA/HM_IGL high',
       'B cell (Plasmocyte)_IGHG_IGK high',
       'B cell (Plasmocyte)_IGHG_IGL high', 'B cell (centrocyte)',
       'CD8_T cell', 'Endothelial cell', 'Erythroid cell',
       'Lymphoid progenitor cell', 'M2 macrophage_CXCL8 high',
       'M2 macrophage_MALAT1 high', 'Neutrophil', 'Neutrophil_DEFA3 high',
       'Neutrophil_OLFM4 high', 'Neutrophil_S100A12 high', 'T cell'],
      dtype='object')

In [949]:
# Standalone Categorical of the labels with every original category listed,
# so entries can be reassigned between categories in the next cell.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['B cell (Plasmocyte)_IGHA/HM_IGK high',
       'B cell (Plasmocyte)_IGHA/HM_IGL high',
       'B cell (Plasmocyte)_IGHG_IGK high',
       'B cell (Plasmocyte)_IGHG_IGL high', 'B cell (centrocyte)',
       'CD8_T cell', 'Endothelial cell', 'Erythroid cell',
       'Lymphoid progenitor cell', 'M2 macrophage_CXCL8 high',
       'M2 macrophage_MALAT1 high', 'Neutrophil', 'Neutrophil_DEFA3 high',
       'Neutrophil_OLFM4 high', 'Neutrophil_S100A12 high', 'T cell'])

In [950]:
# Fold fine-grained labels into one representative label per group; the
# representative labels receive their final names in a later cell.
label_merges = [
    ('B cell (Plasmocyte)_IGHA/HM_IGK high',
     ['B cell (Plasmocyte)_IGHA/HM_IGL high',
      'B cell (Plasmocyte)_IGHG_IGK high',
      'B cell (Plasmocyte)_IGHG_IGL high']),
    ('CD8_T cell', ['T cell']),
    ('M2 macrophage_CXCL8 high', ['M2 macrophage_MALAT1 high']),
    ('Neutrophil',
     ['Neutrophil_DEFA3 high', 'Neutrophil_OLFM4 high',
      'Neutrophil_S100A12 high']),
]
for kept_label, absorbed_labels in label_merges:
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = kept_label

In [951]:
# Rebuild the column with only the categories left after merging; the order
# given here is the order the positional renaming in the next cell relies on.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['B cell (Plasmocyte)_IGHA/HM_IGK high',
      'B cell (centrocyte)',
       'CD8_T cell', 'Endothelial cell', 'Erythroid cell',
       'Lymphoid progenitor cell', 'M2 macrophage_CXCL8 high',
      'Neutrophil'])

In [952]:
# New names are matched to the existing categories by position (8 entries,
# same order as the category list in the previous cell).
adata.rename_categories('celltype', ['Plasma cells',
      'B cells',
       'T cells', 'Endothelial cells', 'Erythroid cells',
       'Common lymphoid progenitor cells', 'Macrophages',
      'Neutrophils'])

In [956]:
# Only the rename changes adata: the first expression is not displayed and
# `ref_cluster` is not read by rename_categories.
adata.obs['sub_tissue'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sub_tissue'],
                           categories=['AdultSpleen'])
adata.rename_categories('sub_tissue', ['Spleen'])

In [957]:
# Only the rename changes adata; the first two statements do not affect it.
adata.obs['sex'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['female'])
adata.rename_categories('sex', ['Female'])

In [958]:
# Only the rename changes adata; the age label '51Years' becomes '51' (still a category string).
adata.obs['age'].cat.categories
ref_cluster=pd.Categorical(adata.obs['age'],
                           categories=['51Years'])
adata.rename_categories('age',['51'])

In [959]:
# The new name equals the name listed for the old category, so this rename keeps the donor id as it is.
adata.obs['donor'].cat.categories
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['Donor49'])
adata.rename_categories('donor', ['Donor49'])

In [960]:
# Harmonised per-cell metadata columns (same column set as for the other
# datasets in this notebook). 'NaN' is a text placeholder, not a missing
# value. Lines that assign a column to itself leave it unchanged.

# Dataset-level constants, identical for every cell
adata.obs['Organ'] = 'Spleen'
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
adata.obs['Dataset'] = 'Han_Spleen'
adata.obs['InternDatasetNumber'] ='07-2-Spleen-Han-2020'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# Cell type annotation
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# Donor information
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['age']
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

# Labels as provided by the source dataset, before harmonisation
adata.obs['original_celltype_1'] = adata.obs['celltype_specific']
adata.obs['original_celltype_2'] = adata.obs['celltype_global']
adata.obs['original_celltype_3'] = 'NaN'

In [962]:
# Store X as a sparse CSR matrix again before writing to disk
adata.X = sp.sparse.csr_matrix(adata.X)

In [963]:
# Make the cell identifiers unique before saving
adata.obs_names_make_unique()

In [964]:
# Save the processed object; `writepath` is defined outside this section
adata.write(writepath + '07-2-Spleen-Han-2020-processed.h5ad')

##  07-3-Spleen_ImmuneCells-Teichmann-2022

In [247]:
# Select the spleen cells of `adata_analysis` (prepared outside this section)
# and copy them into an independent AnnData
ix=np.isin(adata_analysis.obs['tissue_major'], ['Spleen'])
adata=adata_analysis[ix].copy()

In [252]:
# Harmonised per-cell metadata columns (same column set as for the other
# datasets in this notebook). Most values are taken from columns that already
# exist on the object; 'NaN' is a text placeholder, not a missing value.
# Lines that assign a column to itself leave it unchanged.

# Dataset-level information
adata.obs['Organ'] =  adata.obs['tissue_major']
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
adata.obs['Dataset'] = adata.obs['Dataset']
adata.obs['InternDatasetNumber'] = '07-3-Spleen_ImmuneCells-Teichmann-2022'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# Cell type annotation
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# Donor information
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['development_stage']
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = adata.obs['ethnicity']
adata.obs['health_status'] = 'NaN'

# Labels as provided by the source dataset
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']
adata.obs['original_celltype_3'] = 'NaN'

In [253]:
# Make sure X is stored as a sparse CSR matrix before writing to disk
adata.X = sp.sparse.csr_matrix(adata.X)

In [254]:
# Make the cell identifiers unique before saving
adata.obs_names_make_unique()

In [255]:
# Save the processed object; `writepath` is defined outside this section
adata.write(writepath + '07-3-Spleen_ImmuneCells-Teichmann-2022-processed.h5ad')

# 08-Lymph nodes

## 08-1-LymphNode-Kim-2020

For the Kim lymph node dataset, the data were obtained from the original study (https://doi.org/10.1038/s41467-020-16164-1).

In [80]:
# Read the expression matrix from the text file as float32.
# NOTE(review): the file name says 'normalized_log2TPM', so X presumably holds
# normalized log2 TPM values rather than raw counts - confirm.
adata=sc.read_text(writepath + 'GSE131907_Lung_Cancer_normalized_log2TPM_matrix.txt', 
                   delimiter=None, 
                   first_column_names=None, 
                   dtype='float32')
# Swap the axes so that the entries of the file's other axis become .obs
# (presumably the file is genes x cells - verify)
adata=adata.transpose()

In [83]:
#import annotations
# Tab-separated table; its 'Index' column is used as the row index in the next cell
annotations=pd.DataFrame(pd.read_csv(writepath + 'GSE131907_Lung_Cancer_cell_annotation.txt', delimiter='\t'))

In [84]:
#merge the two dfs
# df1: the per-cell table of adata; df2: the annotation table, re-indexed by
# its 'Index' column so both can be joined on their row index
df1=pd.DataFrame(adata.obs)
annotations.index=annotations['Index']
df2=annotations

In [85]:
# Join on the row index of both tables (default inner join: only indices present in both are kept)
df_merged=pd.merge(df1, df2, left_index=True, right_index=True)

In [87]:
# Copy the annotation columns onto adata.obs (assignment aligns on the row index)
adata.obs['sample']=df_merged['Sample']
adata.obs['origin']=df_merged['Sample_Origin']
adata.obs['cell type']=df_merged['Cell_type']
adata.obs['cell type refined']=df_merged['Cell_type.refined']
adata.obs['cell subtype']=df_merged['Cell_subtype']

In [89]:
# Number of cells per sample origin; 'nLN' is the origin selected in the next cell
adata.obs['origin'].value_counts()

tLung     45149
nLung     42995
nLN       37446
mBrain    29060
mLN       21479
PE        20304
tL/B      12073
Name: origin, dtype: int64

In [90]:
# subset to the lymph node samples with origin 'nLN' (the only origin kept here)
ix=np.isin(adata.obs['origin'],['nLN']) 
adata_nLN=adata[ix].copy()

In [332]:
# FILTER PARAMETERS
#Filter out cells
# Upper bounds only: cells above 5000 "counts" or 6000 detected genes are removed.
# NOTE(review): X was loaded from a file named '...normalized_log2TPM...', so
# the per-cell totals here are sums of those values - confirm the thresholds.
# Presumably filter_cells also provides the obs['n_counts'] column that the
# next cell reads - verify.
sc.pp.filter_cells(adata_nLN, max_counts = 5000)
sc.pp.filter_cells(adata_nLN, max_genes = 6000)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata_nLN, min_cells=20)

filtered out 2 cells that have more than 5000 counts
filtered out 5 cells that have more than 6000 genes expressed
filtered out 13378 genes that are detected in less than 20 cells


In [333]:
# Boolean mask over adata_nLN.var_names marking mitochondrial genes ('MT-' prefix)
mt_gene_mask = [gene.startswith('MT-') for gene in adata_nLN.var_names]
mt_genes = adata_nLN.var_names[mt_gene_mask]
# Mito fraction per cell: summed MT-gene values over the cell's total in
# .obs['n_counts'] (that column must already exist; it is not computed here)
adata_nLN.obs['mt_frac'] = adata_nLN.X[:, mt_gene_mask].sum(1)/adata_nLN.obs['n_counts']
#Filter out cells with over 20% mito fraction
# .copy() makes the filtered subset a standalone AnnData, so the later
# assignments to .obs, .layers and .X do not act on a view.
adata_nLN = adata_nLN[adata_nLN.obs['mt_frac'] < 0.20].copy()

In [334]:
# Scratch copy used only to derive clusters for scran
adata_pp = adata_nLN.copy()

In [335]:
#Perform a clustering for scran normalization in clusters
# Runs on the scratch copy only; the Louvain labels are stored in
# adata_pp.obs['groups'] and passed to scran as clusters.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:08)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:06)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 13 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:06)


In [336]:
#Preprocess variables for scran normalization
# Labels from the scratch copy; matrix from `adata_nLN`, transposed for R
input_groups = adata_pp.obs['groups']
data_mat = adata_nLN.X.T

In [337]:
%%R -i data_mat -i input_groups -o size_factors
# data_mat and input_groups are passed in from Python; size_factors is passed back.
require(scran)

size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [338]:
# Drop the scratch copy and keep the scran size factors per cell
del adata_pp
adata_nLN.obs['size_factors'] = size_factors

  


In [339]:
# Turn string-valued annotation columns into categoricals (needed for the `.cat` accessors below)
adata_nLN.strings_to_categoricals()

In [340]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
# Taken before the size-factor division and log1p below.
adata_nLN.layers["counts"] = adata_nLN.X.copy()

In [341]:
#Normalize data
# In-place division of each row by that cell's size factor, then log1p.
# NOTE(review): the input file is named '...normalized_log2TPM...'; confirm
# that size-factor normalization plus log1p is intended for this matrix.
adata_nLN.X /= adata_nLN.obs['size_factors'].values[:, None]
sc.pp.log1p(adata_nLN)

In [342]:
# extract highly variable genes
# Flags the top 4000 genes in adata_nLN.var['highly_variable'] (see the log output below); no genes are removed
sc.pp.highly_variable_genes(adata_nLN, n_top_genes=4000, flavor='seurat')

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:05)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)


In [345]:
# make consistent annotations across datasets
# Combine the broad and the refined label of every cell into one string and
# store it as a categorical column 'fin' of the helper frame.
broad_labels = adata_nLN.obs['cell type']
refined_labels = adata_nLN.obs['cell subtype']
testdf = pd.DataFrame({'celltype': broad_labels, 'sub_celltype': refined_labels})
combined_labels = (
    'broad: ' + testdf['celltype'].astype(str)
    + '___refined: ' + testdf['sub_celltype'].astype(str)
)
testdf['fin'] = combined_labels.astype('category')

In [346]:
# Assign the categorical values themselves: wrapping them in np.array() would
# turn them into plain strings, while rename_categories (next cell) requires a
# categorical column. The categories keep their sorted order.
adata_nLN.obs['celltype'] = testdf['fin'].values

In [349]:
# New names are matched to the existing categories by position (29 entries),
# so the list must follow the current category order of the combined
# 'broad: ...___refined: ...' labels. The numbered names are temporary and are
# merged into their unnumbered group in the cells below.
adata_nLN.rename_categories('celltype', ['B cells',
       'B cells 2',
       'B cells 3',
       'B cells 4',
       'B cells 5',
        'Plasma cells',
       'B cells 6',
                                           
       'Dendritics',
       'Dendritics 1',
       'Dendritics 2',
       'Dendritics 3',
                                           
       'Monocytes',
                                           
       'Macrophages',                       
       'Macrophages 2',                   
       'Macrophages 3',
                                           
       'Dendritics 4',
                                           
       'T cells',
       'T cells 1',
       'T cells 2',
       'T cells 3',
       'T cells 4',
       'T cells 5',
       'NK cells',
       'T cells 6',
       'T cells 7',
       'T cells 8',
       'T cells 9',
       'T cells 10',
       'Unknown'])

In [350]:
# Display the renamed categories; they are listed verbatim in the mapping cells below
adata_nLN.obs['celltype'].cat.categories

Index(['B cells', 'B cells 2', 'B cells 3', 'B cells 4', 'B cells 5',
       'Plasma cells', 'B cells 6', 'Dendritics', 'Dendritics 1',
       'Dendritics 2', 'Dendritics 3', 'Monocytes', 'Macrophages',
       'Macrophages 2', 'Macrophages 3', 'Dendritics 4', 'T cells',
       'T cells 1', 'T cells 2', 'T cells 3', 'T cells 4', 'T cells 5',
       'NK cells', 'T cells 6', 'T cells 7', 'T cells 8', 'T cells 9',
       'T cells 10', 'Unknown'],
      dtype='object')

In [351]:
# Standalone Categorical of the labels with every current category listed,
# so entries can be reassigned between categories in the next cell.
ref_cluster=pd.Categorical(adata_nLN.obs['celltype'],
                           categories=['B cells', 'B cells 2', 'B cells 3', 'B cells 4', 'B cells 5',
       'Plasma cells', 'B cells 6', 'Dendritics', 'Dendritics 1',
       'Dendritics 2', 'Dendritics 3', 'Monocytes', 'Macrophages',
       'Macrophages 2', 'Macrophages 3', 'Dendritics 4', 'T cells',
       'T cells 1', 'T cells 2', 'T cells 3', 'T cells 4', 'T cells 5',
       'NK cells', 'T cells 6', 'T cells 7', 'T cells 8', 'T cells 9',
       'T cells 10', 'Unknown'])

In [352]:
# Fold the numbered temporary labels into their unnumbered group label.
label_merges = [
    ('B cells',
     ['B cells 2', 'B cells 3', 'B cells 4', 'B cells 5', 'B cells 6']),
    ('Dendritics',
     ['Dendritics 1', 'Dendritics 2', 'Dendritics 3', 'Dendritics 4']),
    ('Macrophages', ['Macrophages 2', 'Macrophages 3']),
    ('T cells',
     ['T cells 1', 'T cells 2', 'T cells 3', 'T cells 4', 'T cells 5',
      'T cells 6', 'T cells 7', 'T cells 8', 'T cells 9', 'T cells 10']),
]
for kept_label, absorbed_labels in label_merges:
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = kept_label

In [353]:
# Rebuild the column with only the categories left after merging
adata_nLN.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['B cells', 'Plasma cells', 'Dendritics', 'Monocytes', 'Macrophages', 
                                                       'T cells', 'NK cells','Unknown'])

In [357]:
# Display the sample ids; they are used as donor ids in the next cell
adata_nLN.obs['sample'].cat.categories

Index(['LN_01', 'LN_02', 'LN_03', 'LN_04', 'LN_05', 'LN_06', 'LN_07', 'LN_08',
       'LN_11', 'LN_12'],
      dtype='object')

In [358]:
# Use the sample id as donor id. `ref_cluster` is not read by
# rename_categories, and the new names equal the names listed for the old
# categories, so the rename keeps the donor ids as they are.
adata_nLN.obs['donor'] = adata_nLN.obs['sample']
ref_cluster=pd.Categorical(adata_nLN.obs['donor'],
                           categories=['LN_01', 'LN_02', 'LN_03', 'LN_04', 'LN_05', 'LN_06', 'LN_07', 'LN_08',
       'LN_11', 'LN_12'])
adata_nLN.rename_categories('donor', ['LN_01', 'LN_02', 'LN_03', 'LN_04', 'LN_05', 'LN_06', 'LN_07', 'LN_08',
       'LN_11', 'LN_12'])

In [361]:
# Store X as a sparse CSR matrix before writing to disk
adata_nLN.X = sp.sparse.csr_matrix(adata_nLN.X)

In [None]:
# Internal dataset identifier, identical for every cell of this dataset
adata_nLN.obs['InternDatasetNumber'] = '08-1-LymphNode-Kim-2020'

In [363]:
# Save the processed object; `writepath` is defined outside this section
adata_nLN.write(writepath + '08-1-LymphNode-Kim-2020-processed.h5ad')

## 08-2-LymphNode-Butcher-2020

In [364]:
# Restrict the sfaira database loader to one collection id and download its
# files; the loader is created with ./data as its data path.
target_collections = ["9c8808ce-1138-4dbe-818c-171cff10e650"]
cache_path = os.path.join(".", "data")
dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)
dsg.subset(key="collection_id", values=target_collections)
# Bare expression in the middle of the cell: its value is not displayed
dsg.datasets
dsg.download()

In [365]:
# Placeholder directory of the downloaded collection; adjust to the local setup.
# NOTE(review): the download cell uses ./data as data path - confirm that this
# directory points to the same files.
path = '/path/to/repo/9c8808ce-1138-4dbe-818c-171cff10e650/'
# Plain files only; sub-directories are skipped
files = [f for f in listdir(path) if isfile(join(path, f))]

In [366]:
# Read every downloaded file of the collection and keep the one whose number
# of cells (4355) identifies the dataset processed in this section.
adata_selected = None
for fname in files:
    print(fname)
    path_2 = join(path, fname)
    u = sc.read_h5ad(path_2)
    # Remember which file each object came from
    u.obs['id'] = fname
    if u.n_obs == 4355:
        adata_selected = u

# Fail loudly instead of silently continuing with whatever `adata` an earlier
# section left in the kernel.
if adata_selected is None:
    raise ValueError('No file in ' + path + ' contains 4355 cells')
adata = adata_selected

26ae14da-9e5f-4d18-abae-18a5a328feef.h5ad
cfa3c355-ee77-4fc8-9a00-78e61d23024c.h5ad


In [376]:
#calculate QC covariates
# Row sums of a sparse matrix come back as an (n_cells, 1) np.matrix; flatten
# them so every .obs column receives a plain 1-D array (a dense X, whose row
# sums are already 1-D, passes through unchanged).
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [383]:
# get mt genes
# Boolean mask over adata.var_names marking mitochondrial genes ('MT-' prefix)
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense: the following cells slice and transpose X
adata.X = adata.X.todense()

# manually define mt score: summed MT-gene values per cell divided by the
# cell's total counts stored in .obs['n_counts']
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 25% mito fraction
# .copy() makes the filtered subset a standalone AnnData, so the later
# assignments to .obs and .X do not act on a view of the unfiltered object.
adata = adata[adata.obs['mt_frac'] < 0.25].copy()

In [385]:
#Perform a clustering for scran normalization in clusters
# Work on a throw-away copy so that `adata` itself is not normalised here.
# This copy was missing in this section: the previous dataset ends with `del adata_pp`,
# so the cell raised a NameError on a fresh run.
adata_pp = adata.copy()
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)  # scale every cell to 1e6 counts
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Cluster labels are stored in adata_pp.obs['groups'].
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 11 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [386]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']  # Louvain labels written by sc.tl.louvain(key_added='groups')
data_mat = adata.X.T  # transpose of adata.X, handed to the R cell

In [387]:
%%R -i data_mat -i input_groups -o size_factors
# rpy2 cell: data_mat and input_groups come from Python (-i); size_factors is returned (-o).
require(scran)
# Compute scran sum factors using the supplied cluster labels.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [388]:
#Delete adata_pp
del adata_pp
# Attach the size factors returned by the R cell to the observations.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [389]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [390]:
# Keep a copy of the current adata.X in the "counts" layer for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [391]:
# Snapshot the current state of the object in .raw.
adata.raw = adata

In [393]:
#Normalize data
# Divide every cell (row) by its scran size factor and log-transform, exactly as done
# for the other datasets in this notebook. This step was missing here, so the size
# factors were computed but never applied before HVG selection and PCA.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

# Store the normalised matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [394]:
# extract highly variable genes
# n_top_genes=4000 selects the top 4000 genes; subset=False keeps all genes in the object.
# NOTE(review): filter_genes_dispersion is deprecated in newer scanpy (see sc.pp.highly_variable_genes).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:01)


In [395]:
# Calculate the visualizations
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')  # 50 PCs on the highly variable genes
sc.pp.neighbors(adata)  # neighbour graph with scanpy defaults
sc.tl.umap(adata)  # UMAP coordinates end up in adata.obsm['X_umap']

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:07)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:08)


In [397]:
# make consistent annotations across datasets
# Start the harmonised 'celltype' column from a copy of the original 'cell_type' column.
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [400]:
# Rename the categories of 'celltype'. The rename is positional, so the single new
# name assumes the column has exactly one category.
adata.rename_categories('celltype', ['Lymphatic endothelial cells'])

In [403]:
# Harmonise the 'sex' labels.
adata.obs['sex'].cat.categories  # bare expression, not displayed because it is not the last line
# NOTE(review): ref_cluster is assigned but not used in this cell.
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['male'])
# Positional rename: assumes 'male' is the only category.
adata.rename_categories('sex', ['Male'])

In [406]:
# Store the expression matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [407]:
# Make observation names unique.
adata.obs_names_make_unique()

In [None]:
# Tag every cell with the internal identifier of this dataset.
adata.obs['InternDatasetNumber'] = '08-2-LymphNode-Butcher-2020'

In [None]:
# Write the processed object to disk; `writepath` is not set in this section and must already exist.
adata.write(writepath + '08-2-LymphNode-Butcher-2020-processed.h5ad')

## 08-4-LymphNode-Pisco-2022

In [151]:
# Select the lymph-node cells from `adata_pisco` (loaded outside this section) by their 'tissue' label.
ix=np.isin(adata_pisco.obs['tissue'],['inguinal lymph node',  'lymph node']) 
# Independent copy, so later edits do not touch adata_pisco.
adata=adata_pisco[ix].copy()



In [155]:
# Compute scanpy's QC metrics and store them on the object (inplace=True).
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [156]:
# Mirror scanpy's QC metrics under the column names used for the other datasets.
adata.obs['n_counts'] = adata.obs['total_counts']
# log_counts is the natural log of the total counts, as computed for the other datasets
# (np.log(n_counts)). It was previously filled from 'log1p_n_genes_by_counts', which is
# a gene-count metric, not a count-depth metric.
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = adata.obs['n_genes_by_counts']

In [161]:
# FILTER PARAMETERS
# The two cell filters below are disabled (commented out); only the gene filter runs.
#Filter out cells
#sc.pp.filter_cells(adata, max_counts = 12000)
#sc.pp.filter_cells(adata, max_genes = 7000)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not

filtered out 30929 genes that are detected in less than 20 cells


In [163]:
# Compute the mitochondrial fraction per cell and keep cells below 20 %.
# Mitochondrial genes are identified by the 'MT-' prefix of the gene name.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert the sparse matrix to dense
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# manually define mt score: sum the 'MT-' columns per cell (computed once) and
# flatten the (n_cells, 1) result to a 1-D array
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
# 'n_counts' must already be present in adata.obs
mt_frac = mt_sum/adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with a mito fraction of 20% or more.
# .copy() makes `adata` a real AnnData instead of a view, so later in-place
# assignments do not trigger an implicit copy.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [164]:
# Throw-away copy used only for the clustering that feeds scran.
adata_pp=adata.copy()

In [165]:
#Perform a clustering for scran normalization in clusters
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)  # scale every cell to 1e6 counts
sc.pp.log1p(adata_pp)
# NOTE(review): the recorded output reports this PCA ran "on highly variable genes",
# so a 'highly_variable' annotation was apparently already present on the object.
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Cluster labels are stored in adata_pp.obs['groups'].
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:06): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:03)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:08)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 16 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:08)


In [166]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']  # Louvain labels written by sc.tl.louvain(key_added='groups')
data_mat = adata.X.T  # transpose of adata.X, handed to the R cell

In [167]:
%%R -i data_mat -i input_groups -o size_factors
# rpy2 cell: data_mat and input_groups come from Python (-i); size_factors is returned (-o).
require(scran)
# Compute scran sum factors using the supplied cluster labels.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [168]:
#Delete adata_pp
del adata_pp
# Attach the size factors returned by the R cell to the observations.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [169]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [170]:
# Keep a copy of the current adata.X in the "counts" layer for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [171]:
# Snapshot the current (not yet normalised) state of the object in .raw.
adata.raw = adata

In [172]:
#Normalize data
adata.X /= adata.obs['size_factors'].values[:, None]  # divide each row by its size factor (column-vector broadcast)
sc.pp.log1p(adata)  # then log(1 + x)

In [173]:
# extract highly variable genes
# n_top_genes=4000 selects the top 4000 genes; subset=False keeps all genes in the object.
# NOTE(review): filter_genes_dispersion is deprecated in newer scanpy (see sc.pp.highly_variable_genes).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:04)


In [174]:
# Calculate the visualizations
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')  # 50 PCs on the highly variable genes
sc.pp.neighbors(adata)  # neighbour graph with scanpy defaults
sc.tl.umap(adata)  # UMAP coordinates end up in adata.obsm['X_umap']

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:10)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:14)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:42)


In [197]:
# make consistent annotations across datasets
# Start the harmonised 'celltype' column from a copy of the original 'cell_type' column.
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [200]:
# Working copy of the cell-type labels with every original category listed explicitly.
# Labels not contained in `categories` would become NaN.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['B cell', 'CD141-positive myeloid dendritic cell',
       'CD1c-positive myeloid dendritic cell',
       'CD4-positive, alpha-beta memory T cell',
       'CD8-positive, alpha-beta memory T cell', 'T cell',
       'classical monocyte', 'effector CD4-positive, alpha-beta T cell',
       'effector CD8-positive, alpha-beta T cell', 'endothelial cell',
       'erythrocyte', 'hematopoietic stem cell', 'innate lymphoid cell',
       'intermediate monocyte', 'macrophage', 'mast cell', 'mature NK T cell',
       'mature conventional dendritic cell', 'memory B cell', 'naive B cell',
       'naive thymus-derived CD4-positive, alpha-beta T cell', 'neutrophil',
       'non-classical monocyte', 'plasma cell', 'plasmacytoid dendritic cell',
       'regulatory T cell', 'stromal cell', 'type I NK T cell'])

In [201]:
# Collapse fine-grained labels into one representative label per group.
# Key: the label that is kept; value: the labels folded into it.
label_merges = {
    'CD141-positive myeloid dendritic cell': [
        'CD1c-positive myeloid dendritic cell',
        'mature conventional dendritic cell',
        'plasmacytoid dendritic cell',
    ],
    'T cell': [
        'CD4-positive, alpha-beta memory T cell',
        'CD8-positive, alpha-beta memory T cell',
        'effector CD4-positive, alpha-beta T cell',
        'effector CD8-positive, alpha-beta T cell',
        'naive thymus-derived CD4-positive, alpha-beta T cell',
        'regulatory T cell',
    ],
    'classical monocyte': ['intermediate monocyte', 'non-classical monocyte'],
    'B cell': ['memory B cell', 'naive B cell'],
    'mature NK T cell': ['type I NK T cell'],
}
# Merges are applied in the order listed above.
for kept_label, merged_labels in label_merges.items():
    ix = np.isin(ref_cluster, merged_labels)
    ref_cluster[ix] = kept_label

In [202]:
# Write the merged labels back with a category list reduced to the surviving labels.
# The order of this list matters: the next cell renames the categories by position.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['B cell', 'CD141-positive myeloid dendritic cell',
       
      'T cell',
       'classical monocyte',
        'endothelial cell',
       'erythrocyte','hematopoietic stem cell', 'innate lymphoid cell',
      'macrophage', 'mast cell', 'mature NK T cell',
       'neutrophil',
        'plasma cell', 'stromal cell'])

In [203]:
# Rename the 14 'celltype' categories to the harmonised names.
# The rename is positional: entry i replaces the i-th existing category.
adata.rename_categories('celltype',['B cells', 'Dendritic cells',
        'T cells',
       'Monocytes',
        'Endothelial cells',
       'Erythroid cells', 'Hematopoietic stem cells', 'Innate lymphoid cells',
       'Macrophages', 'Mast cells', 'NK cells',
       'Neutrophils',
       'Plasma cells', 'Mesenchymal stromal cells'])

In [210]:
# Harmonise the 'tissue' labels.
adata.obs['tissue'].cat.categories  # bare expression, not displayed because it is not the last line
# NOTE(review): ref_cluster is assigned but not used in this cell.
ref_cluster=pd.Categorical(adata.obs['tissue'],
                           categories=['inguinal lymph node', 'lymph node'])
# Positional rename: assumes the existing categories are ordered as listed above.
adata.rename_categories('tissue', ['LymphNode_Inguinal', 'LymphNode'])

  res = method(*args, **kwargs)


In [211]:
# Harmonise the 'sex' labels.
adata.obs['sex'].cat.categories  # bare expression, not displayed because it is not the last line
# NOTE(review): ref_cluster is assigned but not used in this cell.
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['female', 'male'])
# Positional rename: assumes the existing categories are ordered as listed above.
adata.rename_categories('sex', ['Female', 'Male'])

In [212]:
# Harmonise the 'ethnicity' labels (only the hyphenation of the first label changes).
adata.obs['ethnicity'].cat.categories  # bare expression, not displayed because it is not the last line
# NOTE(review): ref_cluster is assigned but not used in this cell.
ref_cluster=pd.Categorical(adata.obs['ethnicity'],
                           categories=['African American or Afro-Caribbean', 'European'])
# Positional rename: assumes the existing categories are ordered as listed above.
adata.rename_categories('ethnicity', ['African-American or Afro-Caribbean', 'European'])

In [213]:
# Reduce the 'development_stage' labels to the bare age in years (kept as strings).
adata.obs['development_stage'].cat.categories  # bare expression, not displayed because it is not the last line
# NOTE(review): ref_cluster is assigned but not used in this cell.
ref_cluster=pd.Categorical(adata.obs['development_stage'],
                           categories=['59-year-old human stage', '61-year-old human stage',
       '69-year-old human stage'])
# Positional rename: assumes the existing categories are ordered as listed above.
adata.rename_categories('development_stage',['59', '61',
       '69'])

In [214]:
# Donor IDs need no harmonisation and are kept exactly as provided.
adata.obs['donor'].cat.categories  # bare expression, not displayed because it is not the last line
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['TSP2', 'TSP7', 'TSP14'])
# The rename is driven by an explicit old -> new mapping instead of a positional list.
# The previous positional list ['TSP1', 'TSP2', 'TSP14'] relabelled TSP2 as TSP1 and
# TSP7 as TSP2 (assuming the categories are ordered as listed above).
# An unexpected donor raises a KeyError instead of being mislabelled.
donor_names = {'TSP2': 'TSP2', 'TSP7': 'TSP7', 'TSP14': 'TSP14'}
adata.rename_categories('donor', [donor_names[donor] for donor in adata.obs['donor'].cat.categories])

In [215]:
# Fill the harmonised metadata columns shared by all datasets.
# Note: 'NaN' below is the literal string 'NaN', not a missing value.
adata.obs['Organ'] = 'LymphNode'
adata.obs['Organ_Specific'] = adata.obs['tissue']
adata.obs['Dataset'] = 'Pisco_LymphNode'
adata.obs['InternDatasetNumber'] = '08-4-LymphNode-Pisco-2022'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

adata.obs['celltype'] = adata.obs['celltype']  # self-assignment; column already has its final values
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['development_stage']
adata.obs['sex'] = adata.obs['sex']  # self-assignment
adata.obs['ethnicity'] = adata.obs['ethnicity']  # self-assignment
adata.obs['health_status'] = 'NaN'

# Keep the untouched source annotation next to the harmonised one.
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = 'NaN'
adata.obs['original_celltype_3'] = 'NaN'

In [217]:
# Store the expression matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [218]:
# Make observation names unique.
adata.obs_names_make_unique()

In [219]:
# Write the processed object to disk; `writepath` is not set in this section and must already exist.
adata.write(writepath + '08-4-LymphNode-Pisco-2022-processed.h5ad')

## 08-6-LymphNode_ImmuneCells-Teichmann-2022

In [209]:
# Select the lymph-node cells from `adata_analysis` (created outside this section) by 'tissue_major'.
ix=np.isin(adata_analysis.obs['tissue_major'], ['LymphNode'])
# Independent copy, so later edits do not touch adata_analysis.
adata=adata_analysis[ix].copy()

In [214]:
# Fill the harmonised metadata columns shared by all datasets.
# Note: 'NaN' below is the literal string 'NaN', not a missing value.
adata.obs['Organ'] =  adata.obs['tissue_major']
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
adata.obs['Dataset'] = adata.obs['Dataset']  # self-assignment
adata.obs['InternDatasetNumber'] = '08-6-LymphNode_ImmuneCells-Teichmann-2022'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

adata.obs['celltype'] = adata.obs['celltype']  # self-assignment
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['development_stage']
adata.obs['sex'] = adata.obs['sex']  # self-assignment
adata.obs['ethnicity'] = adata.obs['ethnicity']  # self-assignment
adata.obs['health_status'] = 'NaN'

# Keep the source annotations next to the harmonised one.
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']
adata.obs['original_celltype_3'] = 'NaN'

In [215]:
# Store the expression matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [216]:
# Make observation names unique.
adata.obs_names_make_unique()

In [217]:
# Write the processed object to disk; `writepath` is not set in this section and must already exist.
adata.write(writepath + '08-6-LymphNode_ImmuneCells-Teichmann-2022-processed.h5ad')

# 09-Lung

For the lung datasets of Travaglini, Madissoon and Reyfman, the data and cell annotations were obtained from a study that integrates various lung scRNA-seq datasets (https://doi.org/10.1038/s41591-020-01227-z).

In [6]:
# Load the combined lung object; the three lung datasets below are subsets of it.
lung_data=anndata.read_h5ad(writepath + 'Muus2021_raw.h5ad')

In [8]:
# Number of cells per source dataset.
lung_data.obs.dataset.value_counts()

Stanford_Krasnow_bioRxivTravaglini    60993
Sanger_Meyer_2019Madissoon            57020
Northwestern_Misharin_2018Reyfman     41778
Name: dataset, dtype: int64

## 09-1-Lung-Travaglini-2020

In [9]:
# Select the cells of the Travaglini dataset by their 'dataset' label.
ix=np.isin(lung_data.obs['dataset'],['Stanford_Krasnow_bioRxivTravaglini']) 
# Independent copy, so later edits do not touch lung_data.
adata=lung_data[ix].copy()

In [11]:
# FILTER PARAMETERS
print('Total number of cells: {:d}'.format(adata.n_obs))

#Filter out cells with more than 50000 counts
sc.pp.filter_cells(adata, max_counts = 50000)
print('Number of cells after max count filter: {:d}'.format(adata.n_obs))

#Mito filter: keep cells with a mitochondrial fraction below 20%
# .copy() turns the boolean-indexed view into a real AnnData, so the in-place
# filter_cells call below does not have to copy it implicitly.
adata = adata[adata.obs['mito_frac'] < 0.2].copy()
print('Number of cells after MT filter: {:d}'.format(adata.n_obs))

#Filter out cells with more than 6000 detected genes
sc.pp.filter_cells(adata, max_genes = 6000)
print('Number of cells after gene filter: {:d}'.format(adata.n_obs))

Total number of cells: 60993
filtered out 252 cells that have more than 50000 counts
Number of cells after max count filter: 60741
Number of cells after MT filter: 60741
filtered out 108 cells that have more than 6000 genes expressed


Trying to set attribute `.obs` of view, copying.


Number of cells after gene filter: 60633


In [12]:
# Gene filter: report the number of genes before and after.
print(f'Total number of genes: {adata.n_vars:d}')

# Drop genes detected in fewer than 20 cells.
sc.pp.filter_genes(adata, min_cells=20)
print(f'Number of genes after cell filter: {adata.n_vars:d}')

Total number of genes: 33704
filtered out 15129 genes that are detected in less than 20 cells
Number of genes after cell filter: 18575


In [13]:
# Display a summary of the filtered object.
adata

AnnData object with n_obs × n_vars = 60633 × 18575
    obs: 'dataset', 'donor', 'last_author/PI', 'original_celltype_ann', 'sample', 'total_counts', 'log10_total_counts', 'n_genes_detected', 'mito_frac', 'ribo_frac', 'compl', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'n_counts', 'n_genes'
    var: 'n_cells'

In [14]:
# Convert the sparse expression matrix to a dense matrix.
adata.X=sp.sparse.csr_matrix.todense(adata.X)

In [15]:
# Throw-away copy used only for the clustering that feeds scran.
adata_pp=adata.copy()

In [16]:
#Perform a clustering for scran normalization in clusters
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)  # scale every cell to 1e6 counts
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Cluster labels are stored in adata_pp.obs['groups'].
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:08): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:29)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:17)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 17 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:20)


In [17]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']  # Louvain labels written by sc.tl.louvain(key_added='groups')
data_mat = adata.X.T  # transpose of adata.X, handed to the R cell

In [18]:
%%R -i data_mat -i input_groups -o size_factors
# rpy2 cell: data_mat and input_groups come from Python (-i); size_factors is returned (-o).
require(scran)

# Compute scran sum factors using the supplied cluster labels.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

R[write to console]: Loading required package: scran

R[write to console]: Loading required package: SingleCellExperiment

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    d

In [19]:
#Delete adata_pp
del adata_pp
# Attach the size factors returned by the R cell to the observations.
adata.obs['size_factors'] = size_factors

In [21]:
# Keep a copy of the current adata.X for downstream analysis.

#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [22]:
# Snapshot the current (not yet normalised) state of the object in .raw.
adata.raw = adata

In [23]:
#Normalize data

adata.X /= adata.obs['size_factors'].values[:, None]  # divide each row by its size factor (column-vector broadcast)
sc.pp.log1p(adata)  # then log(1 + x)

In [24]:
# Display a summary of the normalised object.
adata

AnnData object with n_obs × n_vars = 60633 × 18575
    obs: 'dataset', 'donor', 'last_author/PI', 'original_celltype_ann', 'sample', 'total_counts', 'log10_total_counts', 'n_genes_detected', 'mito_frac', 'ribo_frac', 'compl', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'n_counts', 'n_genes', 'size_factors'
    var: 'n_cells'
    uns: 'log1p'
    layers: 'counts'

In [25]:
# Store the expression matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [28]:
# extract highly variable genes
# n_top_genes=4000 selects the top 4000 genes; subset=False keeps all genes in the object.
# NOTE(review): filter_genes_dispersion is deprecated in newer scanpy (see sc.pp.highly_variable_genes).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', 
                              n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [30]:
# Calculate the visualizations
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')  # 50 PCs on the highly variable genes
sc.pp.neighbors(adata)  # neighbour graph with scanpy defaults

sc.tl.umap(adata)  # UMAP coordinates end up in adata.obsm['X_umap']

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:18)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:15)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:06)


In [53]:
# make consistent annotations across datasets
# Start the harmonised 'CellType_Atlas' column from a copy of 'ann_level_3'.
adata.obs['CellType_Atlas']=adata.obs['ann_level_3'].copy()
# Show the current categories.
adata.obs['CellType_Atlas'].cat.categories

Index(['2_Mesothelium', '2_Smooth Muscle', 'AT1', 'AT2',
       'Airway smooth muscle', 'Arterial', 'B cell lineage', 'Basal',
       'Bronchial Vessel 1', 'Bronchial Vessel 2', 'Capillary',
       'Capillary Intermediate 1', 'Capillary Intermediate 2',
       'Dendritic cells', 'Fibroblasts', 'Fibromyocyte',
       'Innate lymphoid cells', 'Lymphatic EC', 'Macrophages', 'Mast cells',
       'Megakaryocytes', 'Monocytes', 'Multiciliated lineage',
       'Myofibroblasts', 'Rare', 'Secretory', 'Submucosal Secretory',
       'T cell lineage', 'Venous'],
      dtype='object')

In [54]:
# Working copy of the labels with every original category listed explicitly.
# Labels not contained in `categories` would become NaN.
ref_cluster=pd.Categorical(adata.obs['CellType_Atlas'],
                           categories=['2_Mesothelium', '2_Smooth Muscle', 'AT1', 'AT2',
       'Airway smooth muscle', 'Arterial', 'B cell lineage', 'Basal',
       'Bronchial Vessel 1', 'Bronchial Vessel 2', 'Capillary',
       'Capillary Intermediate 1', 'Capillary Intermediate 2',
       'Dendritic cells', 'Fibroblasts', 'Fibromyocyte',
       'Innate lymphoid cells', 'Lymphatic EC', 'Macrophages', 'Mast cells',
       'Megakaryocytes', 'Monocytes', 'Multiciliated lineage',
       'Myofibroblasts', 'Rare', 'Secretory', 'Submucosal Secretory',
       'T cell lineage', 'Venous'])

In [55]:
# Collapse fine-grained labels into one representative label per group.
# Key: the label that is kept; value: the labels folded into it.
label_merges = {
    'AT1': ['AT2'],
    '2_Smooth Muscle': ['Airway smooth muscle'],
    'Bronchial Vessel 1': ['Bronchial Vessel 2'],
    'Capillary': ['Capillary Intermediate 1', 'Capillary Intermediate 2'],
    'Fibroblasts': ['Fibromyocyte'],
    'Secretory': ['Submucosal Secretory'],
}
# Merges are applied in the order listed above.
for kept_label, merged_labels in label_merges.items():
    ix = np.isin(ref_cluster, merged_labels)
    ref_cluster[ix] = kept_label

In [56]:
# Write the merged labels back with a category list reduced to the surviving labels.
# The order of this list matters: the categories are later renamed by position.
adata.obs['CellType_Atlas']=pd.Categorical(ref_cluster,
                                           categories=['2_Mesothelium', '2_Smooth Muscle', 'AT1', 
                                                       'Arterial', 'B cell lineage', 'Basal',
       'Bronchial Vessel 1',  'Capillary',
       'Dendritic cells', 'Fibroblasts',
       'Innate lymphoid cells', 'Lymphatic EC', 'Macrophages', 'Mast cells',
       'Megakaryocytes', 'Monocytes', 'Multiciliated lineage',
       'Myofibroblasts', 'Rare', 'Secretory',
       'T cell lineage', 'Venous'])

In [57]:
# Show the categories that remain after merging.
adata.obs['CellType_Atlas'].cat.categories

Index(['2_Mesothelium', '2_Smooth Muscle', 'AT1', 'Arterial', 'B cell lineage',
       'Basal', 'Bronchial Vessel 1', 'Capillary', 'Dendritic cells',
       'Fibroblasts', 'Innate lymphoid cells', 'Lymphatic EC', 'Macrophages',
       'Mast cells', 'Megakaryocytes', 'Monocytes', 'Multiciliated lineage',
       'Myofibroblasts', 'Rare', 'Secretory', 'T cell lineage', 'Venous'],
      dtype='object')

In [58]:
# Map the merged atlas labels to the harmonised cell-type names.
# An explicit old -> new mapping replaces the positional list, so a name cannot
# silently land on the wrong category. The positional list assigned 'Lymphoid' to
# 'Myofibroblasts'; that category now keeps its own name.
# An unexpected category raises a KeyError instead of being mislabelled.
celltype_atlas_names = {
    '2_Mesothelium': 'Mesothelial',
    '2_Smooth Muscle': 'Smooth muscle',
    'AT1': 'Alveolar',
    'Arterial': 'Arterial',
    'B cell lineage': 'B cells',
    'Basal': 'Basal',
    'Bronchial Vessel 1': 'Bronchial',
    'Capillary': 'Capillary',
    'Dendritic cells': 'Dendritics',
    'Fibroblasts': 'Fibroblasts',
    'Innate lymphoid cells': 'Innate Lymphoid',
    'Lymphatic EC': 'Endothelial',
    'Macrophages': 'Macrophages',
    'Mast cells': 'Mast',
    'Megakaryocytes': 'Megakaryocytes',
    'Monocytes': 'Monocytes',
    'Multiciliated lineage': 'Multiciliated',
    'Myofibroblasts': 'Myofibroblasts',
    'Rare': 'Remove',  # presumably marks cells to be dropped later — confirm
    'Secretory': 'Secretory',
    'T cell lineage': 'T cells',
    'Venous': 'Venous',
}
adata.rename_categories(
    'CellType_Atlas',
    [celltype_atlas_names[label] for label in adata.obs['CellType_Atlas'].cat.categories],
)

In [None]:
# Tag every cell with the internal identifier of this dataset.
adata.obs['InternDatasetNumber'] = '09-1-Lung-Travaglini-2020'

In [60]:
# Write the processed object to disk.
adata.write(writepath + '09-1-Lung-Travaglini-2020-processed.h5ad')

## 09-2-Lung-Madissoon-2019

In [33]:
# Select the cells of the Madissoon dataset by their 'dataset' label.
ix=np.isin(lung_data.obs['dataset'],['Sanger_Meyer_2019Madissoon']) 
# Independent copy, so later edits do not touch lung_data.
adata=lung_data[ix].copy()

In [35]:
# FILTER PARAMETERS
print('Total number of cells: {:d}'.format(adata.n_obs))

#Filter out cells with more than 30000 counts
sc.pp.filter_cells(adata, max_counts = 30000)
print('Number of cells after max count filter: {:d}'.format(adata.n_obs))

#Mito filter: keep cells with a mitochondrial fraction below 20%
# .copy() turns the boolean-indexed view into a real AnnData, so the in-place
# filter_cells call below does not have to copy it implicitly.
adata = adata[adata.obs['mito_frac'] < 0.2].copy()
print('Number of cells after MT filter: {:d}'.format(adata.n_obs))

#Filter out cells with more than 5500 detected genes
sc.pp.filter_cells(adata, max_genes = 5500)
print('Number of cells after gene filter: {:d}'.format(adata.n_obs))

Total number of cells: 57020
filtered out 706 cells that have more than 30000 counts
Number of cells after max count filter: 56314
Number of cells after MT filter: 56314
filtered out 10 cells that have more than 5500 genes expressed


Trying to set attribute `.obs` of view, copying.


Number of cells after gene filter: 56304


In [36]:
# Gene filter: report the number of genes before and after.
print(f'Total number of genes: {adata.n_vars:d}')

# Drop genes detected in fewer than 20 cells.
sc.pp.filter_genes(adata, min_cells=20)
print(f'Number of genes after cell filter: {adata.n_vars:d}')

Total number of genes: 33704
filtered out 13275 genes that are detected in less than 20 cells
Number of genes after cell filter: 20429


In [38]:
# Convert the sparse expression matrix to a dense matrix.
adata.X=sp.sparse.csr_matrix.todense(adata.X)

In [40]:
# Throw-away copy used only for the clustering that feeds scran.
adata_pp=adata.copy()

In [41]:
#Perform a clustering for scran normalization in clusters
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)  # scale every cell to 1e6 counts
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Cluster labels are stored in adata_pp.obs['groups'].
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:09): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:30)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:09)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 16 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:17)


In [42]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']  # Louvain labels written by sc.tl.louvain(key_added='groups')
data_mat = adata.X.T  # transpose of adata.X, handed to the R cell

In [43]:
%%R -i data_mat -i input_groups -o size_factors
# rpy2 cell: data_mat and input_groups come from Python (-i); size_factors is returned (-o).
require(scran)

# Compute scran sum factors using the supplied cluster labels.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [44]:
#Delete adata_pp
del adata_pp
# Attach the size factors returned by the R cell to the observations.
adata.obs['size_factors'] = size_factors

In [46]:
# Keep a copy of the current adata.X for downstream analysis.

#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [47]:
# Snapshot the current (not yet normalised) state of the object in .raw.
adata.raw = adata

In [48]:
#Normalize data

adata.X /= adata.obs['size_factors'].values[:, None]  # divide each row by its size factor (column-vector broadcast)
sc.pp.log1p(adata)  # then log(1 + x)

In [49]:
# Display a summary of the normalised object.
adata

AnnData object with n_obs × n_vars = 56304 × 20429
    obs: 'dataset', 'donor', 'last_author/PI', 'original_celltype_ann', 'sample', 'total_counts', 'log10_total_counts', 'n_genes_detected', 'mito_frac', 'ribo_frac', 'compl', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'n_counts', 'n_genes', 'size_factors'
    var: 'n_cells'
    uns: 'log1p'
    layers: 'counts'

In [50]:
# Store the expression matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [52]:
# extract highly variable genes
# n_top_genes=4000 selects the top 4000 genes; subset=False keeps all genes in the object.
# NOTE(review): filter_genes_dispersion is deprecated in newer scanpy (see sc.pp.highly_variable_genes).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', 
                              n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [53]:
# Calculate the visualizations
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')  # 50 PCs on the highly variable genes
sc.pp.neighbors(adata)  # neighbour graph with scanpy defaults

sc.tl.umap(adata)  # UMAP coordinates end up in adata.obsm['X_umap']

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:15)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:18)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:55)


In [64]:
# make consistent annotations across datasets
# Start the harmonised 'CellType_Atlas' column from a copy of 'ann_level_3'.
adata.obs['CellType_Atlas']=adata.obs['ann_level_3'].copy()
# Show the current categories.
adata.obs['CellType_Atlas'].cat.categories

Index(['2_Blood vessels', '2_Fibroblast lineage', '2_Smooth Muscle', 'AT1',
       'AT2', 'B cell lineage', 'Dendritic cells', 'Innate lymphoid cells',
       'Lymphatic EC', 'Macrophages', 'Mast cells', 'Monocytes',
       'Multiciliated lineage', 'T cell lineage'],
      dtype='object')

In [65]:
# Working copy of the labels with every original category listed explicitly.
# Labels not contained in `categories` would become NaN.
ref_cluster=pd.Categorical(adata.obs['CellType_Atlas'],
                           categories=['2_Blood vessels', '2_Fibroblast lineage', '2_Smooth Muscle', 'AT1',
       'AT2', 'B cell lineage', 'Dendritic cells', 'Innate lymphoid cells',
       'Lymphatic EC', 'Macrophages', 'Mast cells', 'Monocytes',
       'Multiciliated lineage', 'T cell lineage'])

In [66]:
# Collapse fine-grained labels into one representative label per group.
# Key: the label that is kept; value: the labels folded into it.
label_merges = {
    '2_Blood vessels': ['Lymphatic EC'],
    'AT1': ['AT2'],
}
# Merges are applied in the order listed above.
for kept_label, merged_labels in label_merges.items():
    ix = np.isin(ref_cluster, merged_labels)
    ref_cluster[ix] = kept_label

In [67]:
# Write the merged labels back with a category list reduced to the surviving labels.
# The order of this list matters: the categories are later renamed by position.
adata.obs['CellType_Atlas']=pd.Categorical(ref_cluster,
                                           categories=['2_Blood vessels', '2_Fibroblast lineage', '2_Smooth Muscle', 'AT1',
                                                       'B cell lineage', 'Dendritic cells', 'Innate lymphoid cells',
                                                       'Macrophages', 'Mast cells', 'Monocytes',
                                                       'Multiciliated lineage', 'T cell lineage'])

In [68]:
# Show the categories that remain after merging.
adata.obs['CellType_Atlas'].cat.categories

Index(['2_Blood vessels', '2_Fibroblast lineage', '2_Smooth Muscle', 'AT1',
       'B cell lineage', 'Dendritic cells', 'Innate lymphoid cells',
       'Macrophages', 'Mast cells', 'Monocytes', 'Multiciliated lineage',
       'T cell lineage'],
      dtype='object')

In [69]:
# Rename the 12 'CellType_Atlas' categories to the harmonised names.
# The rename is positional: entry i replaces the i-th existing category.
adata.rename_categories('CellType_Atlas', ['Endothelial', 'Fibroblasts', 'Smooth muscle', 'Alveolar',
       'B cells', 'Dendritics', 'Innate Lymphoid',
       'Macrophages', 'Mast', 'Monocytes', 'Multiciliated',
       'T cells'])

In [None]:
# Tag every cell with the internal identifier of this dataset.
adata.obs['InternDatasetNumber'] = '09-2-Lung-Madissoon-2019'

In [103]:
# Write the processed object to disk.
adata.write(writepath + '09-2-Lung-Madissoon-2019-processed.h5ad')

## 09-3-Lung-Reyfman-2019

In [57]:
# Select the cells of the Reyfman dataset by their 'dataset' label.
ix=np.isin(lung_data.obs['dataset'],['Northwestern_Misharin_2018Reyfman']) 
# Independent copy, so later edits do not touch lung_data.
adata=lung_data[ix].copy()

In [59]:
# FILTER PARAMETERS
print('Total number of cells: {:d}'.format(adata.n_obs))

#Filter out cells with more than 35000 counts
sc.pp.filter_cells(adata, max_counts = 35000)
print('Number of cells after max count filter: {:d}'.format(adata.n_obs))

#Mito filter: keep cells with a mitochondrial fraction below 20%
# .copy() turns the boolean-indexed view into a real AnnData, so the in-place
# filter_cells call below does not have to copy it implicitly.
adata = adata[adata.obs['mito_frac'] < 0.2].copy()
print('Number of cells after MT filter: {:d}'.format(adata.n_obs))

#Filter out cells with more than 6000 detected genes
sc.pp.filter_cells(adata, max_genes = 6000)
print('Number of cells after gene filter: {:d}'.format(adata.n_obs))

Total number of cells: 41778
filtered out 206 cells that have more than 35000 counts
Number of cells after max count filter: 41572
Number of cells after MT filter: 41517
filtered out 14 cells that have more than 6000 genes expressed


Trying to set attribute `.obs` of view, copying.


Number of cells after gene filter: 41503


In [60]:
# Gene filter: report the number of genes before and after.
print(f'Total number of genes: {adata.n_vars:d}')

# Drop genes detected in fewer than 20 cells.
sc.pp.filter_genes(adata, min_cells=20)
print(f'Number of genes after cell filter: {adata.n_vars:d}')

Total number of genes: 33704
filtered out 14289 genes that are detected in less than 20 cells
Number of genes after cell filter: 19415


In [61]:
# Display a summary of the filtered object.
adata

AnnData object with n_obs × n_vars = 41503 × 19415
    obs: 'dataset', 'donor', 'last_author/PI', 'original_celltype_ann', 'sample', 'total_counts', 'log10_total_counts', 'n_genes_detected', 'mito_frac', 'ribo_frac', 'compl', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'n_counts', 'n_genes'
    var: 'n_cells'

In [62]:
# Convert the sparse expression matrix to a dense matrix.
adata.X=sp.sparse.csr_matrix.todense(adata.X)

In [63]:
# Throw-away copy used only for the clustering that feeds scran.
adata_pp=adata.copy()

In [64]:
#Perform a clustering for scran normalization in clusters
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)  # scale every cell to 1e6 counts
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Cluster labels are stored in adata_pp.obs['groups'].
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:05): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:21)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 20 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:08)


In [65]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']  # Louvain labels written by sc.tl.louvain(key_added='groups')
data_mat = adata.X.T  # transpose of adata.X, handed to the R cell

In [66]:
%%R -i data_mat -i input_groups -o size_factors
# rpy2 cell: data_mat and input_groups come from Python (-i); size_factors is returned (-o).
require(scran)

# Compute scran sum factors using the supplied cluster labels.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [67]:
#Delete adata_pp
del adata_pp
# Attach the size factors returned by the R cell to the observations.
adata.obs['size_factors'] = size_factors

In [69]:
# Keep a copy of the un-normalized matrix for downstream analysis

#Keep the count data in a counts layer
# (X itself is normalized in place a few cells further down)
adata.layers["counts"] = adata.X.copy()

In [70]:
# Snapshot the current state in .raw before X is normalized and log-transformed below
adata.raw = adata

In [71]:
# Normalize: divide every cell (row) by its scran size factor, then log1p
size_factor_column = adata.obs['size_factors'].values.reshape(-1, 1)
adata.X /= size_factor_column
sc.pp.log1p(adata)

In [72]:
adata

AnnData object with n_obs × n_vars = 41503 × 19415
    obs: 'dataset', 'donor', 'last_author/PI', 'original_celltype_ann', 'sample', 'total_counts', 'log10_total_counts', 'n_genes_detected', 'mito_frac', 'ribo_frac', 'compl', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'n_counts', 'n_genes', 'size_factors'
    var: 'n_cells'
    uns: 'log1p'
    layers: 'counts'

In [73]:
# Store the normalized matrix as CSR sparse again
adata.X = sp.sparse.csr_matrix(adata.X)

In [75]:
# extract highly variable genes
# log=False because X was already log1p-transformed above; subset=False keeps all
# genes (the PCA below selects the flagged ones via use_highly_variable=True).
# NOTE(review): filter_genes_dispersion is a legacy scanpy function - confirm it is
# still available in the pinned scanpy version (successor: sc.pp.highly_variable_genes).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', 
                              n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [76]:
# Calculate the visualizations
# 50-component PCA restricted to the highly variable genes, then the neighbor
# graph (scanpy defaults) and the UMAP embedding built on top of it.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)

sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:15)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:12)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:39)


In [75]:
# make consistent annotations across datasets
adata.obs['CellType_Atlas']=adata.obs['ann_level_3'].copy()
adata.obs['CellType_Atlas'].cat.categories

Index(['1_Epithelial', '1_Stroma', '1_Unicorns and artifacts',
       '2_Blood vessels', '2_Lymphoid', 'AT1', 'AT2', 'Dendritic cells',
       'Epithelial cells, proliferating', 'Lymphatic EC', 'Macrophages',
       'Mast cells', 'Monocytes', 'Multiciliated lineage', 'Secretory'],
      dtype='object')

In [76]:
# Mutable categorical with the full set of original labels; merged in the next cell
reyfman_levels = [
    '1_Epithelial', '1_Stroma', '1_Unicorns and artifacts',
    '2_Blood vessels', '2_Lymphoid', 'AT1', 'AT2', 'Dendritic cells',
    'Epithelial cells, proliferating', 'Lymphatic EC', 'Macrophages',
    'Mast cells', 'Monocytes', 'Multiciliated lineage', 'Secretory',
]
ref_cluster = pd.Categorical(adata.obs['CellType_Atlas'], categories=reyfman_levels)

In [77]:
# Fold fine-grained labels into their coarser target label (target, absorbed labels)
label_merges = [
    ('1_Epithelial', ['Epithelial cells, proliferating']),
    ('2_Blood vessels', ['Lymphatic EC']),
    ('AT1', ['AT2']),
]
for merged_label, absorbed_labels in label_merges:
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = merged_label

In [78]:
# Write the merged labels back, keeping only the categories that are still in use
reyfman_merged_levels = [
    '1_Epithelial', '1_Stroma', '1_Unicorns and artifacts',
    '2_Blood vessels', '2_Lymphoid', 'AT1', 'Dendritic cells',
    'Macrophages',
    'Mast cells', 'Monocytes', 'Multiciliated lineage', 'Secretory',
]
adata.obs['CellType_Atlas'] = pd.Categorical(ref_cluster, categories=reyfman_merged_levels)

In [79]:
adata.obs['CellType_Atlas'].cat.categories

Index(['1_Epithelial', '1_Stroma', '1_Unicorns and artifacts',
       '2_Blood vessels', '2_Lymphoid', 'AT1', 'Dendritic cells',
       'Macrophages', 'Mast cells', 'Monocytes', 'Multiciliated lineage',
       'Secretory'],
      dtype='object')

In [80]:
# Positional rename: the i-th new name replaces the i-th existing category
# (order as assigned in the previous cell):
#   1_Epithelial -> Epithelial, 1_Stroma -> Stroma, 1_Unicorns and artifacts -> Remove,
#   2_Blood vessels -> Endothelial, 2_Lymphoid -> Lymphoid, AT1 -> Alveolar,
#   Dendritic cells -> Dendritics, Macrophages -> Macrophages, Mast cells -> Mast,
#   Monocytes -> Monocytes, Multiciliated lineage -> Multiciliated, Secretory -> Secretory
adata.rename_categories('CellType_Atlas', ['Epithelial', 'Stroma', 'Remove','Endothelial', 'Lymphoid', 'Alveolar', 
                                           'Dendritics','Macrophages', 'Mast', 'Monocytes', 'Multiciliated', 'Secretory'])

In [None]:
adata.obs['InternDatasetNumber'] = '09-3-Lung-Reyfman-2019'

In [142]:
adata.write(writepath + '09-3-Lung-Reyfman-2019-processed.h5ad')

## 09-4-Lung-Kim-2020

For the Kim et al. lung dataset, the data were obtained from the original study (https://doi.org/10.1038/s41467-020-16164-1).

In [80]:
# Read the expression matrix of the Kim et al. dataset from a delimited text file.
# NOTE(review): the file name says "normalized_log2TPM", yet the cells below treat X
# like counts (n_counts, per-cell normalization, scran size factors, log1p).
# Confirm this is intended, or start from the raw UMI matrix instead.
adata=sc.read_text(writepath + 'GSE131907_Lung_Cancer_normalized_log2TPM_matrix.txt', 
                   delimiter=None, 
                   first_column_names=None, 
                   dtype='float32')
# Transpose so that cells are observations and genes are variables
adata=adata.transpose()

In [83]:
# Load the per-cell annotation table (tab-separated)
annotation_path = writepath + 'GSE131907_Lung_Cancer_cell_annotation.txt'
annotations = pd.read_csv(annotation_path, sep='\t')

In [84]:
# Prepare both tables for an index-based merge:
# key the annotation table by its 'Index' column (kept as a column too)
annotations.index = annotations['Index']
df1, df2 = pd.DataFrame(adata.obs), annotations

In [85]:
# Join obs and annotations on their indices (pd.merge default: inner join)
df_merged=pd.merge(df1, df2, left_index=True, right_index=True)

In [87]:
# obs column -> source column in the merged annotation table
obs_from_annotation = {
    'sample': 'Sample',
    'origin': 'Sample_Origin',
    'cell type': 'Cell_type',
    'cell type refined': 'Cell_type.refined',
    'cell subtype': 'Cell_subtype',
}
for obs_key, annotation_col in obs_from_annotation.items():
    adata.obs[obs_key] = df_merged[annotation_col]

In [90]:
# keep only the cells whose sample origin is 'nLung'
ix = adata.obs['origin'].isin(['nLung']).values
adata_nLung = adata[ix].copy()

In [91]:
# QC covariates per cell: summed expression, its natural log, and number of detected genes
expression_per_cell = adata_nLung.X.sum(axis=1)
adata_nLung.obs['n_counts'] = expression_per_cell
adata_nLung.obs['log_counts'] = np.log(adata_nLung.obs['n_counts'])
genes_per_cell = (adata_nLung.X > 0).sum(axis=1)
adata_nLung.obs['n_genes'] = genes_per_cell

In [92]:
# List the genes whose symbol starts with 'MT-'
is_mt_gene = adata_nLung.var_names.str.startswith('MT-')
mt_genes = adata_nLung.var_names[is_mt_gene]
np.array(mt_genes)

array(['MT-ATP6', 'MT-ATP8', 'MT-CO1', 'MT-CO2', 'MT-CO3', 'MT-CYB',
       'MT-ND1', 'MT-ND2', 'MT-ND3', 'MT-ND4', 'MT-ND4L', 'MT-ND5',
       'MT-ND6'], dtype=object)

In [93]:
# Fraction of each cell's summed expression that comes from 'MT-' genes
mt_gene_mask = [name.startswith('MT-') for name in adata_nLung.var_names]
mt_expression = adata_nLung.X[:, mt_gene_mask].sum(1)
adata_nLung.obs['mt_frac'] = mt_expression/adata_nLung.obs['n_counts']

In [94]:
adata_pp = adata_nLung.copy()

In [95]:
#Perform a clustering for scran normalization in clusters
# Runs on the scratch copy `adata_pp` only: coarse per-cell normalization, log1p,
# 15-component PCA and neighbor graph (Louvain follows in the next cell).
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)

normalizing by total count per cell
    finished (0:00:07): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:32)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)


In [96]:
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 16 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:08)


In [97]:
#Preprocess variables for scran normalization
# Cluster labels come from the scratch copy, the matrix from `adata_nLung`.
input_groups = adata_pp.obs['groups']
# Transposed so rows are genes when handed to R
data_mat = adata_nLung.X.T

In [98]:
%%R -i data_mat -i input_groups -o size_factors
# Computes scran size factors within the given clusters and exports
# `size_factors` back to Python.
require(scran)

size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [99]:
#Delete adata_pp
del adata_pp

In [100]:
adata_nLung.obs['size_factors'] = size_factors

In [102]:
adata_nLung.strings_to_categoricals()

In [103]:
# Keep a copy of the un-normalized matrix for downstream analysis

#Keep the count data in a counts layer
# (X itself is normalized in place in the next cell)
adata_nLung.layers["counts"] = adata_nLung.X.copy()

In [104]:
# Normalize: divide every cell (row) by its scran size factor, then log1p
size_factor_column = adata_nLung.obs['size_factors'].values.reshape(-1, 1)
adata_nLung.X /= size_factor_column
sc.pp.log1p(adata_nLung)

In [105]:
adata_nLung

AnnData object with n_obs × n_vars = 42995 × 29634
    obs: 'sample', 'origin', 'cell type', 'cell type refined', 'cell subtype', 'n_counts', 'log_counts', 'n_genes', 'mt_frac', 'size_factors'
    uns: 'log1p'
    layers: 'counts'

In [107]:
# extract highly variable genes
# log=False because X was already log1p-transformed above; subset=False keeps all
# genes (the PCA below selects the flagged ones via use_highly_variable=True).
sc.pp.filter_genes_dispersion(adata_nLung, flavor='cell_ranger', 
                              n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:06)


  / disp_mad_bin[df['mean_bin'].values].values
  gene_subset = df['dispersion_norm'].values >= disp_cut_off


In [108]:
# 50-component PCA restricted to the highly variable genes, then the neighbor
# graph (scanpy defaults) and the UMAP embedding built on top of it.
sc.pp.pca(adata_nLung, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata_nLung)

sc.tl.umap(adata_nLung)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:19)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:11)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:40)


In [None]:
adata=adata_nLung.copy()

In [86]:
# make consistent annotations across datasets
adata.obs['CellType_Atlas']=adata.obs['cell subtype'].copy()
adata.obs['CellType_Atlas'].cat.categories

Index(['AT1', 'AT2', 'Activated DCs', 'Alveolar Mac', 'CD1c+ DCs', 'CD4+ Th',
       'CD8 low T', 'CD8+/CD4+ Mixed Th', 'CD141+ DCs', 'CD163+CD14+ DCs',
       'CD207+CD1a+ LCs', 'COL13A1+ matrix FBs', 'COL14A1+ matrix FBs',
       'Ciliated', 'Club', 'Cytotoxic CD8+ T', 'EPCs', 'Exhausted CD8+ T',
       'Exhausted Tfh', 'FB-like cells', 'Follicular B cells',
       'GC B cells in the DZ', 'GrB-secreting B cells', 'Lymphatic ECs',
       'MALT B cells', 'MAST', 'Mesothelial cells', 'Monocytes',
       'Myofibroblasts', 'NK', 'Naive CD4+ T', 'Naive CD8+ T', 'Pericytes',
       'Plasma cells', 'Pleural Mac', 'Smooth muscle cells', 'Stalk-like ECs',
       'Tip-like ECs', 'Treg', 'Tumor ECs', 'Undetermined', 'mo-Mac', 'nan',
       'pDCs'],
      dtype='object')

In [87]:
# Mutable categorical with the full set of original subtype labels; merged in the next cell
kim_subtype_levels = [
    'AT1', 'AT2', 'Activated DCs', 'Alveolar Mac', 'CD1c+ DCs', 'CD4+ Th',
    'CD8 low T', 'CD8+/CD4+ Mixed Th', 'CD141+ DCs', 'CD163+CD14+ DCs',
    'CD207+CD1a+ LCs', 'COL13A1+ matrix FBs', 'COL14A1+ matrix FBs',
    'Ciliated', 'Club', 'Cytotoxic CD8+ T', 'EPCs', 'Exhausted CD8+ T',
    'Exhausted Tfh', 'FB-like cells', 'Follicular B cells',
    'GC B cells in the DZ', 'GrB-secreting B cells', 'Lymphatic ECs',
    'MALT B cells', 'MAST', 'Mesothelial cells', 'Monocytes',
    'Myofibroblasts', 'NK', 'Naive CD4+ T', 'Naive CD8+ T', 'Pericytes',
    'Plasma cells', 'Pleural Mac', 'Smooth muscle cells', 'Stalk-like ECs',
    'Tip-like ECs', 'Treg', 'Tumor ECs', 'Undetermined', 'mo-Mac', 'nan',
    'pDCs',
]
ref_cluster = pd.Categorical(adata.obs['CellType_Atlas'], categories=kim_subtype_levels)

In [88]:
# Fold fine-grained subtypes into one representative label each, applied in order:
# (target label, labels absorbed into it)
subtype_merges = [
    ('AT1', ['AT2']),
    ('Activated DCs', ['CD1c+ DCs','CD141+ DCs', 'CD163+CD14+ DCs','CD207+CD1a+ LCs','pDCs']),
    ('Alveolar Mac', ['Pleural Mac','mo-Mac']),
    ('CD4+ Th', ['CD8 low T', 'CD8+/CD4+ Mixed Th','Cytotoxic CD8+ T', 'Exhausted CD8+ T','Exhausted Tfh',
                 'Naive CD4+ T', 'Naive CD8+ T','Treg']),
    ('COL13A1+ matrix FBs', ['COL14A1+ matrix FBs','FB-like cells','Pericytes']),
    ('Ciliated', ['Club']),
    ('EPCs', ['Lymphatic ECs','Stalk-like ECs','Tip-like ECs',  'Tumor ECs']),
    ('Follicular B cells', ['GC B cells in the DZ','GrB-secreting B cells','MALT B cells']),
]
for merged_label, absorbed_labels in subtype_merges:
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = merged_label

In [89]:
# Write the merged labels back with a reduced category list.
# NOTE(review): 'Plasma cells' is among the original categories and is not merged
# anywhere above, but it is missing from this list, so those cells become NaN here.
# Confirm whether that is intended. Adding a category changes the number/order of
# categories that the positional renames in the following cells rely on.
adata.obs['CellType_Atlas']=pd.Categorical(ref_cluster,
                                           categories=['AT1','Activated DCs', 'Alveolar Mac', 'CD4+ Th',
                                                       'COL13A1+ matrix FBs', 'Ciliated', 'EPCs','Follicular B cells',
                                                       'MAST','Mesothelial cells', 'Monocytes','Myofibroblasts', 'NK', 
                                                       'Smooth muscle cells', 'Undetermined','nan'])

In [90]:
adata.obs['CellType_Atlas'].cat.categories

Index(['AT1', 'Activated DCs', 'Alveolar Mac', 'CD4+ Th',
       'COL13A1+ matrix FBs', 'Ciliated', 'EPCs', 'Follicular B cells', 'MAST',
       'Mesothelial cells', 'Monocytes', 'Myofibroblasts', 'NK',
       'Smooth muscle cells', 'Undetermined', 'nan'],
      dtype='object')

In [91]:
# Positional rename: the i-th new name replaces the i-th existing category:
#   AT1 -> Alveolar, Activated DCs -> Dendritics, Alveolar Mac -> Macrophages,
#   CD4+ Th -> T cells, COL13A1+ matrix FBs -> Fibroblasts, Ciliated -> Epithelial,
#   EPCs -> Endothelial, Follicular B cells -> B cells, MAST -> Mast,
#   Mesothelial cells -> Mesothelial, Monocytes -> Monocytes, Myofibroblasts -> Lymphoid,
#   NK -> NK, Smooth muscle cells -> Smooth muscle, Undetermined -> Unknown, nan -> toassign
# NOTE(review): 'Myofibroblasts' ends up labelled 'Lymphoid' - confirm this is intended.
# Changing a name here alters the sort order that the later positional rename depends on.
adata.rename_categories('CellType_Atlas', ['Alveolar', 'Dendritics', 'Macrophages', 'T cells',
       'Fibroblasts', 'Epithelial', 'Endothelial', 'B cells', 'Mast',
       'Mesothelial', 'Monocytes', 'Lymphoid', 'NK',
       'Smooth muscle','Unknown','toassign'])

In [97]:
# Combine the atlas label with the coarse 'cell type' annotation into one
# '<atlas>__<cell type>' label per cell
testdf = pd.DataFrame({
    'Atlas': adata.obs['CellType_Atlas'],
    'celltype': adata.obs['cell type'],
})
combined_label = testdf['Atlas'].astype(str) + '__' + testdf['celltype'].astype(str)
testdf['fin'] = combined_label.astype('category')

In [99]:
adata.obs['CellType_Atlas']=np.array(testdf['fin'])

In [100]:
adata.strings_to_categoricals()

... storing 'CellType_Atlas' as categorical


In [103]:
# Positional rename of the combined '<atlas>__<cell type>' categories to 27 working
# names; the '... MERGE' entries are collapsed into their base label two cells below.
# NOTE(review): the categories being renamed are derived from the data and are not
# shown in the notebook, so this mapping cannot be checked by reading the code -
# display adata.obs['CellType_Atlas'].cat.categories before this cell and verify.
adata.rename_categories('CellType_Atlas', ['Alveolar', 'B cells',
       'Dendritics', 'Endothelial',
       'Epithelial', 'Fibroblasts',
       'Lymphoid', 'Macrophages',
       'Mast', 'Mesothelial',
       'Monocytes', 'NK', 'NK MERGE',
       'Smooth muscle', 'T cells',
       'T cells MERGE', 'Unknown',
       'Unknown MERGE', 'Unknown MERGE2',
       'Unknown MERGE3', 'Unknown MERGE4', 'Unknown MERGE5',
       'Unknown MERGE6', 'Endothelial MERGE',
       'Fibroblasts MERGE', 'NK MERGE2',
       'T cells MERGE2'])

In [104]:
adata.obs['CellType_Atlas'].cat.categories

Index(['Alveolar', 'B cells', 'Dendritics', 'Endothelial', 'Epithelial',
       'Fibroblasts', 'Lymphoid', 'Macrophages', 'Mast', 'Mesothelial',
       'Monocytes', 'NK', 'NK MERGE', 'Smooth muscle', 'T cells',
       'T cells MERGE', 'Unknown', 'Unknown MERGE', 'Unknown MERGE2',
       'Unknown MERGE3', 'Unknown MERGE4', 'Unknown MERGE5', 'Unknown MERGE6',
       'Endothelial MERGE', 'Fibroblasts MERGE', 'NK MERGE2',
       'T cells MERGE2'],
      dtype='object')

In [105]:
# Mutable categorical over the 27 working labels; the '... MERGE' ones are collapsed next
kim_working_levels = [
    'Alveolar', 'B cells', 'Dendritics', 'Endothelial', 'Epithelial',
    'Fibroblasts', 'Lymphoid', 'Macrophages', 'Mast', 'Mesothelial',
    'Monocytes', 'NK', 'NK MERGE', 'Smooth muscle', 'T cells',
    'T cells MERGE', 'Unknown', 'Unknown MERGE', 'Unknown MERGE2',
    'Unknown MERGE3', 'Unknown MERGE4', 'Unknown MERGE5', 'Unknown MERGE6',
    'Endothelial MERGE', 'Fibroblasts MERGE', 'NK MERGE2',
    'T cells MERGE2',
]
ref_cluster = pd.Categorical(adata.obs['CellType_Atlas'], categories=kim_working_levels)

In [106]:
# Collapse every '... MERGE' working label into its base label, applied in order:
# (base label, working labels absorbed into it)
working_label_merges = [
    ('Endothelial', ['Endothelial MERGE']),
    ('Fibroblasts', ['Fibroblasts MERGE']),
    ('NK', ['NK MERGE','NK MERGE2']),
    ('T cells', ['T cells MERGE','T cells MERGE2']),
    ('Unknown', ['Unknown MERGE', 'Unknown MERGE2',
                 'Unknown MERGE3', 'Unknown MERGE4', 'Unknown MERGE5', 'Unknown MERGE6']),
]
for base_label, absorbed_labels in working_label_merges:
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = base_label

In [107]:
# Write the collapsed labels back with the final category list
kim_final_levels = [
    'Alveolar', 'B cells', 'Dendritics', 'Endothelial', 'Epithelial',
    'Fibroblasts', 'Lymphoid', 'Macrophages', 'Mast', 'Mesothelial',
    'Monocytes', 'NK',  'Smooth muscle', 'T cells','Unknown',
]
adata.obs['CellType_Atlas'] = pd.Categorical(ref_cluster, categories=kim_final_levels)

In [108]:
adata.obs['CellType_Atlas'].cat.categories

Index(['Alveolar', 'B cells', 'Dendritics', 'Endothelial', 'Epithelial',
       'Fibroblasts', 'Lymphoid', 'Macrophages', 'Mast', 'Mesothelial',
       'Monocytes', 'NK', 'Smooth muscle', 'T cells', 'Unknown'],
      dtype='object')

In [109]:
# Positional rename over the categories assigned two cells above; the only name
# that actually changes is 'NK' -> 'NK cells'.
adata.rename_categories('CellType_Atlas', ['Alveolar', 'B cells', 'Dendritics', 'Endothelial', 'Epithelial',
       'Fibroblasts', 'Lymphoid', 'Macrophages', 'Mast', 'Mesothelial',
       'Monocytes', 'NK cells', 'Smooth muscle', 'T cells', 'Unknown'])

In [None]:
adata.obs['InternDatasetNumber'] = '09-4-Lung-Kim-2020'

In [None]:
adata.write(writepath + '09-4-Lung-Kim-2020-processed.h5ad')

##  09-5-Lung-Pisco-2022

In [193]:
# Keep only the cells whose tissue is 'lung' (independent copy of the subset)
ix = adata_pisco.obs['tissue'].isin(['lung']).values
adata = adata_pisco[ix].copy()

In [198]:
#calculate QC covariates
# `.sum(1)` on a sparse matrix returns an (n_cells, 1) np.matrix; flatten it so a
# plain 1-D array is stored in obs, whether X is sparse or dense.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [202]:
# FILTER PARAMETERS
#Filter out cells
# Upper bounds only: drop cells with more than 15000 counts or more than 9000 detected genes
sc.pp.filter_cells(adata, max_counts = 15000)
sc.pp.filter_cells(adata, max_genes = 9000)
# Min 20 cells - filters out low count genes
# NOTE(review): the trailing numbers on the next line look like leftover notes from
# threshold experiments - confirm their meaning or remove them.
sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not

filtered out 11 cells that have more than 15000 counts
filtered out 17 cells that have more than 9000 genes expressed
filtered out 30257 genes that are detected in less than 20 cells


In [204]:
# get mt genes (symbols starting with 'MT-')
mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names])
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense (plain ndarray); the scran step below receives a dense matrix
adata.X = adata.X.toarray()

# mitochondrial fraction per cell: summed MT expression / n_counts stored in obs
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 20% mito fraction
# .copy() turns the subset into an independent object; without it `adata` is a
# view and the later obs/X assignments trigger an implicit copy with a warning.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [205]:
adata_pp=adata.copy()

In [206]:
#Perform a clustering for scran normalization in clusters
# Runs on the scratch copy `adata_pp` only; just the Louvain labels
# (adata_pp.obs['groups']) are used afterwards.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:05): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:05)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 20 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:04)


In [207]:
#Preprocess variables for scran normalization
# Cluster labels come from the scratch copy, the matrix from `adata`.
input_groups = adata_pp.obs['groups']
# Transposed so rows are genes when handed to R
data_mat = adata.X.T

In [208]:
%%R -i data_mat -i input_groups -o size_factors
# Computes scran size factors within the given clusters and exports
# `size_factors` back to Python.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [209]:
#Delete adata_pp
# The scratch copy is no longer needed once the size factors have been computed.
del adata_pp
# One size factor per cell, as returned by the R cell
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [210]:
adata.strings_to_categoricals()

In [211]:
# Keep a copy of the un-normalized matrix for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [212]:
adata.raw = adata

In [213]:
# Normalize: divide every cell (row) by its scran size factor, then log1p
size_factor_column = adata.obs['size_factors'].values.reshape(-1, 1)
adata.X /= size_factor_column
sc.pp.log1p(adata)

In [214]:
# extract highly variable genes
# log=False because X was already log1p-transformed above; subset=False keeps all
# genes (the PCA below selects the flagged ones via use_highly_variable=True).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:03)


In [215]:
# Calculate the visualizations
# 50-component PCA restricted to the highly variable genes, then the neighbor
# graph (scanpy defaults) and the UMAP embedding built on top of it.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:04)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:28)


In [234]:
# make consistent annotations across datasets
# Work on a copy of the original 'cell_type' column; the original stays untouched
# and is stored as 'original_celltype_1' further down.
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [235]:
adata.obs['celltype'].cat.categories

Index(['B cell', 'CD4-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta T cell', 'adventitial cell', 'basal cell',
       'basophil', 'blood vessel endothelial cell',
       'bronchial smooth muscle cell', 'capillary endothelial cell',
       'classical monocyte', 'club cell', 'dendritic cell',
       'effector CD4-positive, alpha-beta T cell',
       'effector CD8-positive, alpha-beta T cell',
       'endothelial cell of artery', 'endothelial cell of lymphatic vessel',
       'fibroblast', 'intermediate monocyte', 'lung ciliated cell',
       'lung microvascular endothelial cell', 'macrophage', 'mature NK T cell',
       'mesothelial cell', 'myofibroblast cell', 'neutrophil',
       'non-classical monocyte', 'pericyte cell', 'plasma cell',
       'plasmacytoid dendritic cell', 'pulmonary ionocyte',
       'respiratory goblet cell', 'serous cell of epithelium of bronchus',
       'smooth muscle cell', 'type I pneumocyte', 'type II pneumocyte',
       'vascular associated s

In [236]:
# Mutable categorical with the full set of original cell-type labels; merged in the next cell
pisco_celltype_levels = [
    'B cell', 'CD4-positive, alpha-beta T cell',
    'CD8-positive, alpha-beta T cell', 'adventitial cell', 'basal cell',
    'basophil', 'blood vessel endothelial cell',
    'bronchial smooth muscle cell', 'capillary endothelial cell',
    'classical monocyte', 'club cell', 'dendritic cell',
    'effector CD4-positive, alpha-beta T cell',
    'effector CD8-positive, alpha-beta T cell',
    'endothelial cell of artery', 'endothelial cell of lymphatic vessel',
    'fibroblast', 'intermediate monocyte', 'lung ciliated cell',
    'lung microvascular endothelial cell', 'macrophage', 'mature NK T cell',
    'mesothelial cell', 'myofibroblast cell', 'neutrophil',
    'non-classical monocyte', 'pericyte cell', 'plasma cell',
    'plasmacytoid dendritic cell', 'pulmonary ionocyte',
    'respiratory goblet cell', 'serous cell of epithelium of bronchus',
    'smooth muscle cell', 'type I pneumocyte', 'type II pneumocyte',
    'vascular associated smooth muscle cell', 'vein endothelial cell',
]
ref_cluster = pd.Categorical(adata.obs['celltype'], categories=pisco_celltype_levels)

In [237]:
# Fold fine-grained cell types into one representative label each, applied in order:
# (target label, labels absorbed into it)
celltype_merges = [
    ('CD4-positive, alpha-beta T cell',
     ['CD8-positive, alpha-beta T cell','effector CD4-positive, alpha-beta T cell','effector CD8-positive, alpha-beta T cell',]),
    ('adventitial cell',
     ['blood vessel endothelial cell','capillary endothelial cell', 'endothelial cell of artery', 'lung microvascular endothelial cell', 'vein endothelial cell']),
    ('classical monocyte', ['intermediate monocyte',  'non-classical monocyte']),
    ('bronchial smooth muscle cell', ['smooth muscle cell','vascular associated smooth muscle cell']),
    ('club cell', ['respiratory goblet cell']),
    ('type I pneumocyte', ['type II pneumocyte']),
    ('club cell', ['serous cell of epithelium of bronchus']),
    ('dendritic cell', ['plasmacytoid dendritic cell']),
]
for merged_label, absorbed_labels in celltype_merges:
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = merged_label

In [238]:
# Write the merged labels back, keeping only the categories that are still in use
pisco_merged_levels = [
    'B cell', 'CD4-positive, alpha-beta T cell',
    'adventitial cell', 'basal cell',
    'basophil',
    'bronchial smooth muscle cell',
    'classical monocyte', 'club cell', 'dendritic cell',
    'endothelial cell of lymphatic vessel',
    'fibroblast','lung ciliated cell',
    'macrophage', 'mature NK T cell',
    'mesothelial cell', 'myofibroblast cell', 'neutrophil',
    'pericyte cell', 'plasma cell',
    'pulmonary ionocyte',
    'type I pneumocyte',
]
adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=pisco_merged_levels)

In [239]:
# Positional rename: the i-th new name replaces the i-th existing category:
#   B cell -> B cells, CD4-positive, alpha-beta T cell -> T cells,
#   adventitial cell -> Endothelial cells, basal cell -> Basal cells,
#   basophil -> Basophil cells, bronchial smooth muscle cell -> Smooth muscle cells,
#   classical monocyte -> Monocytes, club cell -> Airway epithelial cells,
#   dendritic cell -> Dendritic cells,
#   endothelial cell of lymphatic vessel -> Lymphatic endothelial cells,
#   fibroblast -> Fibroblast cells, lung ciliated cell -> Multiciliated cells,
#   macrophage -> Macrophages, mature NK T cell -> NK cells,
#   mesothelial cell -> Mesothelial cells, myofibroblast cell -> Myofibroblast cells,
#   neutrophil -> Neutrophils, pericyte cell -> Pericytes, plasma cell -> Plasma cells,
#   pulmonary ionocyte -> Unknown, type I pneumocyte -> Alveolar cells
# NOTE(review): 'adventitial cell' also holds the cells originally annotated as
# adventitial cells, so they are labelled 'Endothelial cells' together with the
# endothelial subtypes merged into that label - confirm this is intended.
adata.rename_categories('celltype',['B cells', 'T cells',
       'Endothelial cells', 'Basal cells',
       'Basophil cells',
       'Smooth muscle cells',
       'Monocytes', 'Airway epithelial cells', 'Dendritic cells',                                               
        'Lymphatic endothelial cells',
       'Fibroblast cells','Multiciliated cells',
       'Macrophages', 'NK cells',
       'Mesothelial cells', 'Myofibroblast cells', 'Neutrophils',
       'Pericytes', 'Plasma cells',
        'Unknown',
        'Alveolar cells'])

In [243]:
# Capitalize the tissue label. The mapping is by name rather than by position, so
# the result does not depend on how the categories happen to be ordered in the
# input file. (The unused `ref_cluster` scratch assignment was dropped.)
adata.obs['tissue'] = adata.obs['tissue'].cat.rename_categories({'lung': 'Lung'})

In [244]:
# Capitalize the sex labels. Mapping by name (not by position) keeps the labels
# correct regardless of the stored category order.
adata.obs['sex'] = adata.obs['sex'].cat.rename_categories({'female': 'Female', 'male': 'Male'})

In [245]:
# Harmonize the spelling of the ethnicity label. Mapping by name (not by position)
# keeps the labels correct regardless of the stored category order; categories
# not listed here are left unchanged.
adata.obs['ethnicity'] = adata.obs['ethnicity'].cat.rename_categories(
    {'African American or Afro-Caribbean': 'African-American or Afro-Caribbean'}
)

In [246]:
# Reduce the development stage to the age in years. Mapping by name (not by
# position) keeps the labels correct regardless of the stored category order.
adata.obs['development_stage'] = adata.obs['development_stage'].cat.rename_categories(
    {'59-year-old human stage': '59', '61-year-old human stage': '61'}
)

In [247]:
# Donor IDs already have their final names, so nothing is renamed here.
# The former positional rename to ['TSP1', 'TSP2', 'TSP14'] was a no-op at best and
# would swap the TSP2/TSP14 labels if the categories are stored in lexical order
# ('TSP1', 'TSP14', 'TSP2'). Display the categories for inspection instead.
adata.obs['donor'].cat.categories

In [248]:
# Harmonized metadata columns shared across datasets, written in this order.
# Values are either constants or existing obs columns; the string 'NaN' marks
# fields that are not available for this dataset.
obs = adata.obs
harmonized_columns = [
    ('Organ', 'Lung'),
    ('Organ_Specific', obs['tissue']),
    ('Dataset', 'Pisco_Lung'),
    ('InternDatasetNumber', '09-5-Lung-Pisco-2022'),
    ('Dataset_status', 'Healthy_Dataset'),

    ('celltype', obs['celltype']),
    ('sub_celltype', 'NaN'),
    ('Malignant', 'NonMalignant'),

    ('Patient', obs['donor']),
    ('Patient_Number', 'NaN'),
    ('age', obs['development_stage']),
    ('sex', obs['sex']),
    ('ethnicity', obs['ethnicity']),
    ('health_status', 'NaN'),

    ('original_celltype_1', obs['cell_type']),
    ('original_celltype_2', 'NaN'),
    ('original_celltype_3', 'NaN'),
]
for column, value in harmonized_columns:
    adata.obs[column] = value

In [250]:
# Store the normalized matrix as CSR sparse again before writing to disk
adata.X = sp.sparse.csr_matrix(adata.X)

In [251]:
adata.obs_names_make_unique()

In [252]:
adata.write(writepath + '09-5-Lung-Pisco-2022-processed.h5ad')

## 09-7-Lung-Han-2020

In [None]:
# Keep only the cells whose sub_tissue is 'AdultLung' (independent copy of the subset)
ix = adata_han.obs['sub_tissue'].isin(['AdultLung']).values
adata = adata_han[ix].copy()

In [624]:
adata.obs['InternDatasetNumber'] ='09-7-Lung-Han-2020'

In [None]:
#calculate QC covariates
# `.sum(1)` on a sparse matrix returns an (n_cells, 1) np.matrix; flatten it so a
# plain 1-D array is stored in obs, whether X is sparse or dense.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [None]:
# FILTER PARAMETERS
# Filter out cells
# Upper bounds only: drop cells with more than 5000 counts or more than 2000 detected genes
sc.pp.filter_cells(adata, max_counts = 5000)
sc.pp.filter_cells(adata, max_genes = 2000)
# Min 20 cells - filters out low count genes
# NOTE(review): the trailing numbers on the next line look like leftover notes from
# threshold experiments - confirm their meaning or remove them.
sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not

In [None]:
# get mt genes (symbols starting with 'MT-')
mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names])
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense (plain ndarray); the scran step below receives a dense matrix
adata.X = adata.X.toarray()

# mitochondrial fraction per cell: summed MT expression / n_counts stored in obs
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 20% mito fraction
# .copy() turns the subset into an independent object; without it `adata` is a
# view and the later obs/X assignments trigger an implicit copy with a warning.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [None]:
adata_pp=adata.copy()

In [None]:
#Perform a clustering for scran normalization in clusters
# Runs on the scratch copy `adata_pp` only; just the Louvain labels
# (adata_pp.obs['groups']) are used afterwards.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

In [None]:
#Preprocess variables for scran normalization
# Cluster labels come from the scratch copy, the matrix from `adata`.
input_groups = adata_pp.obs['groups']
# Transposed so rows are genes when handed to R
data_mat = adata.X.T

In [None]:
%%R -i data_mat -i input_groups -o size_factors
# Computes scran size factors within the given clusters and exports
# `size_factors` back to Python.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [None]:
#Delete adata_pp
# The scratch copy is no longer needed once the size factors have been computed.
del adata_pp
# One size factor per cell, as returned by the R cell
adata.obs['size_factors'] = size_factors

In [None]:
adata.strings_to_categoricals()

In [None]:
# Keep a copy of the un-normalized matrix for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [None]:
adata.raw = adata

In [None]:
#Normalize data
# Divide every cell (row of X) by its size factor in place, then log1p-transform
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [None]:
# extract highly variable genes
# subset=False: no genes are removed here; the PCA in the next cell selects the flagged genes via use_highly_variable=True
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

In [None]:
# Calculate the visualizations
# PCA (50 components, highly variable genes only) -> neighbour graph -> UMAP embedding
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# make consistent annotations across datasets
# Start the harmonised 'celltype' column as a copy of the 'celltype_specific' labels
adata.obs['celltype']=adata.obs['celltype_specific'].copy()
# Show the labels that have to be mapped
adata.obs['celltype'].cat.categories

In [None]:
# Working copy of the labels as a pandas Categorical that lists every source label as a category.
# The strings must match the values in obs['celltype'] exactly (values not listed become NaN),
# so spellings such as 'Actived T cell' and 'Artry endothelial cell' are kept verbatim.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['AT1 cell', 'AT2 cell', 'Actived T cell',
       'Alveolar bipotent/intermediate cell', 'Arterial endothelial cell',
       'Artry endothelial cell', 'B cell', 'B cell (Plasmocyte)',
       'B cell (Plasmocyte)_IGHA/HM high', 'B cell (Plasmocyte)_IGHG high',
       'Basal/Epithelial cell', 'Chondrocyte', 'Ciliated cell', 'Club cell',
       'Club cell_BPIFB1 high', 'Club cell_KLK11 high',
       'Conventional dendritic cell', 'Dendritic cell',
       'Endothelial cell_ACKR1 high', 'Endothelial cell_SELE high',
       'Endothelial cell_SPARCL1 high', 'Endothelial cell_TMEM100 high',
       'Endothelial cell_VWF high', 'Epithelial cell_PLA2G2A high',
       'Epithelial cell_S100A2 high', 'Fibroblast', 'Fibroblast_A2M high',
       'Fibroblast_SFRP high', 'Lymphatic endothelial cell', 'M2 macrophage',
       'Macrophage', 'Macrophage_CCL20 high', 'Macrophage_M2',
       'Macrophage_VSIG4 high', 'Mast cell', 'Megakaryocyte', 'Monocyte',
       'Myeloid cell', 'Natural killer cell', 'Neutrophil',
       'Proliferating T cell',
       'Proliferating alveolar bipotent progenitor cell', 'Proliferating cell',
       'Smooth muscle cell', 'T cell'])

In [647]:
# Collapse fine-grained source labels into one representative label per group.
# Each entry is (label that is kept, labels folded into it); the groups are
# disjoint, so every merge is applied exactly once (the original cell repeated
# the 'Dendritic cell' merge three times, the last two being no-ops).
label_merges = [
    ('AT1 cell', ['AT2 cell', 'Alveolar bipotent/intermediate cell',
                  'Proliferating alveolar bipotent progenitor cell']),
    ('Actived T cell', ['Proliferating T cell', 'T cell']),
    ('Arterial endothelial cell', ['Artry endothelial cell',
                                   'Endothelial cell_ACKR1 high',
                                   'Endothelial cell_SELE high',
                                   'Endothelial cell_SPARCL1 high',
                                   'Endothelial cell_TMEM100 high',
                                   'Endothelial cell_VWF high']),
    ('B cell (Plasmocyte)', ['B cell (Plasmocyte)_IGHA/HM high',
                             'B cell (Plasmocyte)_IGHG high']),
    ('Club cell', ['Club cell_BPIFB1 high', 'Club cell_KLK11 high',
                   'Epithelial cell_PLA2G2A high',
                   'Epithelial cell_S100A2 high']),
    ('Conventional dendritic cell', ['Dendritic cell']),
    ('Fibroblast', ['Fibroblast_A2M high', 'Fibroblast_SFRP high']),
    ('M2 macrophage', ['Macrophage', 'Macrophage_CCL20 high', 'Macrophage_M2',
                       'Macrophage_VSIG4 high']),
]

for kept_label, merged_labels in label_merges:
    ix = np.isin(ref_cluster, merged_labels)
    ref_cluster[ix] = kept_label

In [648]:
# Rebuild obs['celltype'] from the merged labels. Only the categories listed
# here are kept; a label missing from the list would turn into NaN.
adata.obs['celltype'] = pd.Categorical(
    ref_cluster,
    categories=[
        'AT1 cell', 'Actived T cell', 'Arterial endothelial cell',
        'B cell', 'B cell (Plasmocyte)',
        'Basal/Epithelial cell', 'Chondrocyte', 'Ciliated cell', 'Club cell',
        'Conventional dendritic cell',
        'Fibroblast', 'Fibroblast_A2M high', 'Fibroblast_SFRP high',
        'Lymphatic endothelial cell', 'M2 macrophage',
        'Mast cell', 'Megakaryocyte', 'Monocyte',
        'Myeloid cell', 'Natural killer cell', 'Neutrophil',
        'Proliferating cell',
        'Smooth muscle cell',
    ],
)

In [649]:
# Rename the categories to the harmonised names. The rename is positional:
# the i-th name replaces the i-th existing category, so the order below must
# follow the category order set in the previous cell.
# 'Fibroblast' (the category that actually holds the merged fibroblasts) now
# receives the harmonised name 'Fibroblast cells'. Previously that name was
# given to the empty 'Fibroblast_SFRP high' category while the populated
# category stayed 'Fibroblast'. The two leftover (empty) sub-type categories
# keep their names and are dropped by the later clean-up cells.
adata.rename_categories('celltype', [
    'Alveolar cells', 'T cells', 'Endothelial cells',
    'B cells', 'Plasma cells',
    'Basal cells', 'Chondrocytes', 'Multiciliated cells', 'Airway epithelial cells',
    'Dendritic cells',
    'Fibroblast cells', 'Fibroblast_A2M high', 'Fibroblast_SFRP high',
    'Lymphatic endothelial cells', 'Macrophages',
    'Mast cells', 'Megakaryocytes', 'Monocytes',
    'Myeloid cells', 'NK cells', 'Neutrophils',
    'Unknown',
    'Smooth muscle cells',
])

In [653]:
# Harmonise the tissue label: 'AdultLung' -> 'Lung'
adata.obs['sub_tissue'].cat.categories
# NOTE(review): ref_cluster is rebuilt here but not used by the rename below,
# which replaces the existing categories of obs['sub_tissue'] by position
ref_cluster=pd.Categorical(adata.obs['sub_tissue'],
                           categories=['AdultLung'])
adata.rename_categories('sub_tissue', ['Lung'])

In [654]:
# Harmonise the sex labels; 'unknown' becomes the literal string 'NaN'
adata.obs['sex'].cat.categories
# NOTE(review): ref_cluster is not used by the rename below; the rename is positional,
# so it relies on the existing category order being female, male, unknown
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['female', 'male', 'unknown'])
adata.rename_categories('sex', ['Female', 'Male', 'NaN'])

In [655]:
# Harmonise the age labels by dropping the 'Y' suffix ('21Y' -> '21', '49Y' -> '49')
adata.obs['age'].cat.categories
# NOTE(review): ref_cluster is not used by the positional rename below
ref_cluster=pd.Categorical(adata.obs['age'],
                           categories=['21Y', '49Y'])
adata.rename_categories('age',['21', '49'])

In [656]:
# Prefix the donor ids with 'Han-' (e.g. 'Donor38' -> 'Han-Donor38')
adata.obs['donor'].cat.categories
# NOTE(review): ref_cluster is not used by the positional rename below
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['Donor38', 'Donor41', 'Donor42'])
adata.rename_categories('donor', ['Han-Donor38', 'Han-Donor41', 'Han-Donor42'])

In [657]:
# Fill the harmonised metadata columns (the same column set is filled in for each dataset section).
# 'NaN' is stored as a literal string placeholder; self-assignments are no-ops that keep the template complete.

# --- dataset / tissue ---
adata.obs['Organ'] = 'Lung'
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
adata.obs['Dataset'] = 'Han_Lung'
adata.obs['InternDatasetNumber'] = '09-7-Lung-Han-2020'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# --- cell annotation ---
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# --- donor information ---
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['age']
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

# --- annotations as provided with the dataset ---
adata.obs['original_celltype_1'] = adata.obs['celltype_specific']
adata.obs['original_celltype_2'] = adata.obs['celltype_global']
adata.obs['original_celltype_3'] = 'NaN'

In [45]:
# make consistent annotations across datasets
adata.obs['celltype'].cat.categories

Index(['Alveolar cells', 'T cells', 'Endothelial cells', 'B cells',
       'Plasma cells', 'Basal cells', 'Chondrocytes', 'Multiciliated cells',
       'Airway epithelial cells', 'Dendritic cells', 'Fibroblast cells',
       'Lymphatic endothelial cells', 'Macrophages', 'Mast cells',
       'Megakaryocytes', 'Monocytes', 'Myeloid cells', 'NK cells',
       'Neutrophils', 'Unknown', 'Smooth muscle cells'],
      dtype='object')

In [46]:
# Second clean-up pass for the fibroblast labels: the category list contains
# 'Fibroblast' and 'Fibroblast_A2M high' in addition to 'Fibroblast cells'
# so that they can be folded into 'Fibroblast cells' in the next cell
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Alveolar cells', 'T cells', 'Endothelial cells', 'B cells',
       'Plasma cells', 'Basal cells', 'Chondrocytes', 'Multiciliated cells',
       'Airway epithelial cells', 'Dendritic cells', 'Fibroblast',
       'Fibroblast_A2M high', 'Fibroblast cells',
       'Lymphatic endothelial cells', 'Macrophages', 'Mast cells',
       'Megakaryocytes', 'Monocytes', 'Myeloid cells', 'NK cells',
       'Neutrophils', 'Unknown', 'Smooth muscle cells'])

In [47]:
# Relabel cells still carrying 'Fibroblast' or 'Fibroblast_A2M high' as 'Fibroblast cells'
ix=np.isin(ref_cluster,[ 'Fibroblast', 'Fibroblast_A2M high'])
ref_cluster[ix]= 'Fibroblast cells'

In [48]:
# Write the cleaned labels back with the final category list (the two leftover fibroblast categories are gone)
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Alveolar cells', 'T cells', 'Endothelial cells', 'B cells',
       'Plasma cells', 'Basal cells', 'Chondrocytes', 'Multiciliated cells',
       'Airway epithelial cells', 'Dendritic cells',  'Fibroblast cells',
       'Lymphatic endothelial cells', 'Macrophages', 'Mast cells',
       'Megakaryocytes', 'Monocytes', 'Myeloid cells', 'NK cells',
       'Neutrophils', 'Unknown', 'Smooth muscle cells'])

In [49]:
# NOTE(review): the names passed here are identical, in the same order, to the categories
# set in the previous cell, so this positional rename leaves the labels unchanged
adata.rename_categories('celltype', ['Alveolar cells', 'T cells', 'Endothelial cells', 'B cells',
       'Plasma cells', 'Basal cells', 'Chondrocytes', 'Multiciliated cells',
       'Airway epithelial cells', 'Dendritic cells',  'Fibroblast cells',
       'Lymphatic endothelial cells', 'Macrophages', 'Mast cells',
       'Megakaryocytes', 'Monocytes', 'Myeloid cells', 'NK cells',
       'Neutrophils', 'Unknown', 'Smooth muscle cells'])

In [54]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [55]:
adata.write(writepath + '09-7-Lung-Han-2020-processed.h5ad')

## 09-9-Lung_ImmuneCells-Teichmann-2022

In [198]:
# Select the cells whose 'tissue_major' is 'Lung' and work on an independent copy.
# NOTE(review): adata_analysis is not created in this section - presumably it is loaded
# earlier in the notebook; verify that it exists when running from a fresh kernel
ix=np.isin(adata_analysis.obs['tissue_major'], ['Lung'])
adata=adata_analysis[ix].copy()

In [204]:
# Fill the harmonised metadata columns (the same column set is filled in for each dataset section).
# 'NaN' is stored as a literal string placeholder; self-assignments are no-ops that keep the template complete.

# --- dataset / tissue ---
adata.obs['Organ'] =  adata.obs['tissue_major']
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
adata.obs['Dataset'] = adata.obs['Dataset']
adata.obs['InternDatasetNumber'] = '09-9-Lung_ImmuneCells-Teichmann-2022'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# --- cell annotation ---
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# --- donor information ---
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['development_stage']
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = adata.obs['ethnicity']
adata.obs['health_status'] = 'NaN'

# --- annotations as provided with the dataset ---
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']
adata.obs['original_celltype_3'] = 'NaN'

In [205]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [206]:
adata.obs_names_make_unique()

In [207]:
adata.write(writepath + '09-9-Lung_ImmuneCells-Teichmann-2022-processed.h5ad')

# 10-Heart

## 10-1-Heart-Han-2020

In [150]:
# here we use sfaira to import available datasets with annotations
# note that the following steps may change depending on the current sfaira version and the path to your repository

datadir = 'path/to/repo/'

ds = sfaira.data.human.DatasetGroupHeart(path=datadir)  # This links all data sets available

In [176]:
ds.ids

['human_heart_2020_microwell_han_001_10.1038/s41586-020-2157-4',
 'human_heart_2020_microwell_han_002_10.1038/s41586-020-2157-4',
 'human_heart_2020_microwell_han_003_10.1038/s41586-020-2157-4',
 'human_heart_2020_microwell_han_004_10.1038/s41586-020-2157-4']

In [177]:
# pick
idx = ds.ids[1]

In [178]:
idx

'human_heart_2020_microwell_han_002_10.1038/s41586-020-2157-4'

In [179]:
ds.datasets[idx].load()

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [181]:
adata1=ds.datasets[idx].adata

In [183]:
# pick
idx = ds.ids[2]

In [184]:
idx

'human_heart_2020_microwell_han_003_10.1038/s41586-020-2157-4'

In [185]:
ds.datasets[idx].load()

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [187]:
adata2=ds.datasets[idx].adata

In [192]:
# Combine the two loaded heart samples into one object; batch_key='batch' names the obs column that records the source object
adata=adata1.concatenate(adata2, batch_key='batch')

In [246]:
adata.var.index=adata.var['names'].tolist()

In [249]:
adata.X=sp.sparse.csr_matrix.todense(adata.X)

In [34]:
# Calculate per-cell QC covariates from the current expression matrix.
# Row sums of a scipy sparse matrix (or np.matrix) come back with shape
# (n_cells, 1), so flatten them to 1-D vectors before storing them in .obs.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [35]:
# FILTER PARAMETERS
# Filter out cells: drop cells with more than 1000 expressed genes
# (the max_counts cut-off below is disabled for this dataset)
#sc.pp.filter_cells(adata, max_counts = 4000)
sc.pp.filter_cells(adata, max_genes = 1000)
# Min 20 cells - filters out low count genes (genes detected in fewer than 20 cells)
sc.pp.filter_genes(adata, min_cells=20)

filtered out 33 cells that have more than 1000 genes expressed
filtered out 18573 genes that are detected in less than 20 cells


In [36]:
adata.var.index=adata.var['names'].copy()

In [38]:
# Flag mitochondrial genes by their 'MT-' symbol prefix.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert X to a dense matrix (the following cells slice, divide and transpose it
# as a dense matrix). X has already been densified earlier in this section, and
# calling csr_matrix.todense on a dense matrix raises an AttributeError, so the
# conversion is only done when X is still sparse.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# Mitochondrial fraction per cell = counts in MT genes / total counts of the cell.
# ravel() gives a 1-D vector whether X is an np.matrix or a plain ndarray.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep only cells with a mitochondrial fraction below 25%.
# .copy() turns the selection into a real AnnData object instead of a view,
# so later assignments to .obs / .X do not act on a view of the unfiltered data.
adata = adata[adata.obs['mt_frac'] < 0.25].copy()

In [39]:
adata_pp=adata.copy()

In [40]:
#Perform a clustering for scran normalization in clusters
# Provisional normalisation to 1e6 counts per cell followed by log1p, applied to the scratch copy only
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
# 15 principal components -> neighbour graph -> Louvain clusters stored in adata_pp.obs['groups']
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 8 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [42]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [43]:
%%R -i data_mat -i input_groups -o size_factors
require(scran)

size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

R[write to console]: Loading required package: scran

R[write to console]: Loading required package: SingleCellExperiment

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    d

In [44]:
#Delete adata_pp
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [45]:
adata.strings_to_categoricals()

In [46]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [47]:
adata.raw = adata

In [48]:
#Normalize data
# Divide every cell (row of X) by its size factor in place, then log1p-transform
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [49]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [50]:
# extract highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [51]:
# Calculate the visualizations
# PCA (50 components, highly variable genes only) -> neighbour graph -> UMAP embedding
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:05)


In [53]:
# make consistent annotations across datasets
# NOTE(review): 'sub_celltype' set here is overwritten with the placeholder 'NaN' in the metadata cell of this section
adata.obs['sub_celltype']=adata.obs['celltype_specific'].copy()
# Start the harmonised 'celltype' column as a copy of the 'celltype_specific' labels
adata.obs['celltype']=adata.obs['celltype_specific'].copy()

In [54]:
# Working copy of the labels as a pandas Categorical that lists every source label as a category
# (values that are not listed would become NaN)
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Apoptotic cell', 'Cardiomyocyte', 'Conventional dendritic cell',
       'Dendritic cell', 'Endothelial cell', 'Fibroblast', 'M1 Macrophage',
       'M2 Macrophage', 'Macrophage', 'Mast cell', 'Neutrophil',
       'Smooth muscle cell', 'T cell', 'Vascular endothelial cell',
       'Ventricle cardiomyocyte'])

In [55]:
# Collapse fine-grained source labels into one representative label per group.
ix=np.isin(ref_cluster,['Ventricle cardiomyocyte'])
ref_cluster[ix]='Cardiomyocyte'

ix=np.isin(ref_cluster,['Dendritic cell'])
ref_cluster[ix]='Conventional dendritic cell'

ix=np.isin(ref_cluster,['Vascular endothelial cell'])
ref_cluster[ix]='Endothelial cell'

# All macrophage labels are pooled under 'M1 Macrophage'.
# The value must be a plain string: a trailing comma here would assign the
# one-element tuple ('M1 Macrophage',) instead of the label itself.
ix=np.isin(ref_cluster,[ 'M2 Macrophage', 'Macrophage'])
ref_cluster[ix]='M1 Macrophage'

In [56]:
# Write the merged labels back; only the representative labels remain as categories
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Apoptotic cell', 'Cardiomyocyte', 'Conventional dendritic cell',
    'Endothelial cell', 'Fibroblast', 'M1 Macrophage',
        'Mast cell', 'Neutrophil',
       'Smooth muscle cell', 'T cell'])

In [57]:
# Positional rename to the harmonised names, in the category order of the previous cell
# (e.g. 'Apoptotic cell' -> 'Unknown', 'M1 Macrophage' -> 'Macrophages')
adata.rename_categories('celltype', ['Unknown', 'Cardiomyocytes', 'Dendritic cells',
       'Endothelial cells', 'Fibroblast cells', 'Macrophages', 'Mast cells','Neutrophils', 'Smooth muscle', 'T cells'])

In [61]:
# Fill the harmonised metadata columns (the same column set is filled in for each dataset section).
# 'NaN' is stored as a literal string placeholder; self-assignments are no-ops that keep the template complete.

# --- dataset / tissue ---
adata.obs['Organ'] = 'Heart'
adata.obs['Organ_Specific'] = 'Heart'
adata.obs['Dataset'] = 'Han_Heart'
adata.obs['InternDatasetNumber'] = '10-1-Heart-Han-2020'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# --- cell annotation ---
adata.obs['celltype'] = adata.obs['celltype']
# NOTE(review): this replaces the 'sub_celltype' values copied from 'celltype_specific' earlier in this section
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# --- donor information ---
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = adata.obs['sample']
adata.obs['age'] = adata.obs['dev_stage']
adata.obs['sex'] = adata.obs['gender']
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

# --- annotations as provided with the dataset ---
adata.obs['original_celltype_1'] = adata.obs['celltype_specific']
adata.obs['original_celltype_2'] = adata.obs['celltype_global']
adata.obs['original_celltype_3'] = 'NaN'

In [62]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [63]:
adata.write(writepath + '10-1-Heart-Han-2020-processed.h5ad')

## 10-2_1-Heart-Teichmann-2020

In [64]:
# Restrict the sfaira database loader to a single collection id and download it
# (presumably into cache_path, i.e. ./data - TODO confirm against the sfaira version in use)
target_collections = ["b52eb423-5d0d-4645-b217-e1c6d38b2e72"]
cache_path = os.path.join(".", "data")
dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)
dsg.subset(key="collection_id", values=target_collections)
# NOTE(review): bare expression in the middle of the cell - its value is not displayed
dsg.datasets
dsg.download()

Ontology <class 'sfaira.versions.metadata.base.OntologyUberonLifecyclestage'> is not a DAG, treat child-parent reasoning with care.
Ontology <class 'sfaira.versions.metadata.base.OntologyMondo'> is not a DAG, treat child-parent reasoning with care.
Ontology <class 'sfaira.versions.metadata.base.OntologyUberon'> is not a DAG, treat child-parent reasoning with care.


In [65]:
path = '/path/to/repo/b52eb423-5d0d-4645-b217-e1c6d38b2e72/'
files = [f for f in listdir(path) if isfile(join(path, f))]

In [66]:
files

['ed852810-a003-4386-9846-1638362cee39.h5ad',
 '1009f384-b12d-448e-ba9f-1b7d2ecfbb4e.h5ad',
 '84f1a631-910b-4fbb-9f76-d915a07316d2.h5ad',
 'd4e69e01-3ba2-4d6b-a15d-e7048f78f22e.h5ad',
 'f75f2ff4-2884-4c2d-b375-70de37a34507.h5ad',
 '572f3f3e-d3e4-4d13-8e2b-88215e508481.h5ad',
 '78fd69d2-75e4-4207-819a-563139f273c6.h5ad',
 '9d584fcb-a28a-4b91-a886-ceb66a88ef81.h5ad']

In [67]:
path = '/path/to/repo/b52eb423-5d0d-4645-b217-e1c6d38b2e72/'

In [68]:
files = ['ed852810-a003-4386-9846-1638362cee39.h5ad',
 '1009f384-b12d-448e-ba9f-1b7d2ecfbb4e.h5ad',
 '84f1a631-910b-4fbb-9f76-d915a07316d2.h5ad']

In [69]:
# Read every listed .h5ad file, tag its cells with the source file name in
# obs['id'], and fold the files one after another into a single AnnData object
# (outer join on the genes).
for position, file_name in enumerate(files):
    print(file_name)
    loaded = sc.read_h5ad(path + file_name)
    loaded.obs['id'] = file_name
    adata = loaded if position == 0 else adata.concatenate(loaded, join='outer')

ed852810-a003-4386-9846-1638362cee39.h5ad
1009f384-b12d-448e-ba9f-1b7d2ecfbb4e.h5ad


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


84f1a631-910b-4fbb-9f76-d915a07316d2.h5ad


In [75]:
# Calculate per-cell QC covariates from the current expression matrix.
# Row sums of a scipy sparse matrix (or np.matrix) come back with shape
# (n_cells, 1), so flatten them to 1-D vectors before storing them in .obs.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [78]:
# FILTER PARAMETERS
# Filter out cells: drop cells with more than 1600 counts or more than 4500 expressed genes
sc.pp.filter_cells(adata, max_counts = 1600)
sc.pp.filter_cells(adata, max_genes = 4500)
# Min 30 cells - filters out low count genes (genes detected in fewer than 30 cells)
sc.pp.filter_genes(adata, min_cells=30)

filtered out 159 cells that have more than 1600 counts
filtered out 2 cells that have more than 4500 genes expressed
filtered out 9356 genes that are detected in less than 30 cells


In [79]:
adata.strings_to_categoricals()

In [80]:
# Use the values of var['feature_name'] as the var index, converted to plain
# strings: the next cell calls str.startswith on every var name to find the
# 'MT-' genes.
adata.var.index = adata.var['feature_name'].astype(str)

In [81]:
# Flag mitochondrial genes by their 'MT-' symbol prefix.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert X to a dense matrix (the following cells slice, divide and transpose it
# as a dense matrix). Skip the conversion if X is already dense: calling
# csr_matrix.todense on a dense matrix raises an AttributeError.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# Mitochondrial fraction per cell = counts in MT genes / total counts of the cell.
# ravel() gives a 1-D vector whether X is an np.matrix or a plain ndarray.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep only cells with a mitochondrial fraction below 10%.
# .copy() turns the selection into a real AnnData object instead of a view,
# so later assignments to .obs / .X do not act on a view of the unfiltered data.
adata = adata[adata.obs['mt_frac'] < 0.10].copy()

In [82]:
adata_pp=adata.copy()

In [83]:
#Perform a clustering for scran normalization in clusters
# Provisional normalisation to 1e6 counts per cell followed by log1p, applied to the scratch copy only
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
# 15 principal components -> neighbour graph -> Louvain clusters stored in adata_pp.obs['groups']
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:06): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:21)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:12)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 15 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:16)


In [84]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [85]:
%%R -i data_mat -i input_groups -o size_factors
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [86]:
#Delete adata_pp
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [87]:
adata.strings_to_categoricals()

In [88]:
#make  (adata.X) copy of counts of raw data for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [89]:
adata.raw = adata

In [90]:
#Normalize data
# Divide every cell (row of X) by its size factor in place, then log1p-transform
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [91]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [92]:
# extract highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [93]:
# Calculate the visualizations
# PCA (50 components, highly variable genes only) -> neighbour graph -> UMAP embedding
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:10)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:19)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:00)


In [95]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [96]:
adata.obs['celltype'].cat.categories

Index(['B cell', 'CD4-positive, alpha-beta cytotoxic T cell',
       'CD8-positive, alpha-beta cytotoxic T cell',
       'CD14-positive, CD16-positive monocyte',
       'activated CD4-positive, alpha-beta T cell',
       'activated CD8-positive, alpha-beta T cell', 'dendritic cell',
       'macrophage', 'mast cell', 'mature NK T cell', 'monocyte',
       'native cell', 'natural killer cell', 'neural cell',
       'regular atrial cardiac myocyte'],
      dtype='object')

In [97]:
# Working copy of the labels as a pandas Categorical that lists every source label as a category
# (values that are not listed would become NaN)
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['B cell', 'CD4-positive, alpha-beta cytotoxic T cell',
       'CD8-positive, alpha-beta cytotoxic T cell',
       'CD14-positive, CD16-positive monocyte',
       'activated CD4-positive, alpha-beta T cell',
       'activated CD8-positive, alpha-beta T cell', 'dendritic cell',
       'macrophage', 'mast cell', 'mature NK T cell', 'monocyte',
       'native cell', 'natural killer cell', 'neural cell',
       'regular atrial cardiac myocyte'])

In [98]:
# Fold related source labels into one representative label each; the
# representative labels receive their harmonised names in a later cell.
label_merges = [
    ('CD4-positive, alpha-beta cytotoxic T cell',
     ['CD8-positive, alpha-beta cytotoxic T cell',
      'activated CD4-positive, alpha-beta T cell',
      'activated CD8-positive, alpha-beta T cell']),
    ('monocyte', ['CD14-positive, CD16-positive monocyte']),
    ('natural killer cell', ['mature NK T cell']),
]

for target_label, source_labels in label_merges:
    ix = np.isin(ref_cluster, source_labels)
    ref_cluster[ix] = target_label

In [99]:
# Write the merged labels back; only the representative labels remain as categories
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['B cell', 'CD4-positive, alpha-beta cytotoxic T cell',
         'dendritic cell',
       'macrophage', 'mast cell', 'monocyte',
       'native cell', 'natural killer cell', 'neural cell',
       'regular atrial cardiac myocyte'])

In [100]:
# Positional rename to the harmonised names, in the category order of the previous cell
# (e.g. 'native cell' -> 'Unknown', 'neural cell' -> 'Neurons',
#  'regular atrial cardiac myocyte' -> 'Cardiomyocytes')
adata.rename_categories('celltype', ['B cells', 'T cells',
         'Dendritic cells',
       'Macrophages', 'Mast cells', 'Monocytes',
       'Unknown', 'NK cells', 'Neurons',
       'Cardiomyocytes'])

In [104]:
# Capitalise the sex labels ('female' -> 'Female', 'male' -> 'Male')
adata.obs['sex'].cat.categories
# NOTE(review): ref_cluster is not used by the rename below; the rename is positional,
# so it relies on the existing category order being female, male
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['female', 'male'])
adata.rename_categories('sex', ['Female', 'Male'])

In [105]:
# Collapse every listed development stage into a single category and call it 'Adult'
adata.obs['development_stage'].cat.categories
ref_cluster=pd.Categorical(adata.obs['development_stage'],
                           categories=['eighth decade human stage', 'fifth decade human stage',
       'human late adulthood stage', 'human middle aged stage',
       'seventh decade human stage', 'sixth decade human stage'])

# Fold all other stages into the first category ...
ix=np.isin(ref_cluster,['fifth decade human stage',
       'human late adulthood stage', 'human middle aged stage',
       'seventh decade human stage', 'sixth decade human stage'])
ref_cluster[ix]='eighth decade human stage'

# ... keep only that category and rename it
adata.obs['development_stage']=pd.Categorical(ref_cluster,
                                           categories=['eighth decade human stage'])
adata.rename_categories('development_stage', ['Adult'])

In [106]:
# Reduce the six sampled heart regions to three: apex, ventricle, atrium
adata.obs['tissue'].cat.categories
ref_cluster=pd.Categorical(adata.obs['tissue'],
                           categories=['apex of heart', 'heart left ventricle', 'heart right ventricle',
       'interventricular septum', 'left cardiac atrium',
       'right cardiac atrium'])

# Right ventricle and interventricular septum are pooled with the left ventricle
ix=np.isin(ref_cluster,['heart right ventricle','interventricular septum'])
ref_cluster[ix]='heart left ventricle'

# Right atrium is pooled with the left atrium
ix=np.isin(ref_cluster,['right cardiac atrium'])
ref_cluster[ix]='left cardiac atrium'

# Keep the three representative categories and give them their harmonised names (positional rename)
adata.obs['tissue']=pd.Categorical(ref_cluster,
                                           categories=['apex of heart', 'heart left ventricle', 'left cardiac atrium'])
adata.rename_categories('tissue', ['Heart_Apex', 'Heart_Ventricle', 'Heart_Atrium'])

In [107]:
adata.obs['donor'].cat.categories
# NOTE(review): ref_cluster is not used afterwards in this cell, and the names passed to
# rename_categories are the same as the categories listed here, so the donor ids stay as they are
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11', 'H2', 'H3', 'H4', 'H5',
       'H6', 'H7'])
adata.rename_categories('donor', ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11', 'H2', 'H3', 'H4', 'H5',
       'H6', 'H7'])

In [108]:
# Fill the harmonised metadata columns (the same column set is filled in for each dataset section).
# 'NaN' is stored as a literal string placeholder; self-assignments are no-ops that keep the template complete.

# --- dataset / tissue ---
adata.obs['Organ'] = 'Heart'
adata.obs['Organ_Specific'] = adata.obs['tissue']
adata.obs['Dataset'] = 'Teichmann_Heart'
adata.obs['InternDatasetNumber'] = '10-2_1-Heart-Teichmann-2020'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# --- cell annotation ---
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# --- donor information ---
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = adata.obs['sample']
adata.obs['age'] = adata.obs['development_stage']
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = adata.obs['ethnicity']
adata.obs['health_status'] = 'NaN'

# --- annotations as provided with the dataset ---
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = 'NaN'
adata.obs['original_celltype_3'] = 'NaN'

In [110]:
adata.X = sp.sparse.csr_matrix(adata.X)

In [111]:
adata.obs_names_make_unique()
# Persist the processed object: the next section reassigns `adata`, so without
# this write the result of this section would be lost. The file name follows
# the '<InternDatasetNumber>-processed.h5ad' convention of the preceding sections.
adata.write(writepath + '10-2_1-Heart-Teichmann-2020-processed.h5ad')

## 10-2_2-Heart-Teichmann-2020

In [115]:
path = '/path/to/repo/b52eb423-5d0d-4645-b217-e1c6d38b2e72/'

In [116]:
files = ['d4e69e01-3ba2-4d6b-a15d-e7048f78f22e.h5ad']

In [117]:
# Read every listed .h5ad file, tag its cells with the source file name in
# obs['id'], and fold the files one after another into a single AnnData object
# (outer join on the genes). With the single file listed above, the loop just
# loads that file into `adata`.
for position, file_name in enumerate(files):
    print(file_name)
    loaded = sc.read_h5ad(path + file_name)
    loaded.obs['id'] = file_name
    adata = loaded if position == 0 else adata.concatenate(loaded, join='outer')

d4e69e01-3ba2-4d6b-a15d-e7048f78f22e.h5ad


In [123]:
#calculate QC covariates
# Row sums of a scipy sparse matrix (or np.matrix) come back with shape
# (n_cells, 1), so flatten them to 1-D vectors before storing them in .obs.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [126]:
# FILTER PARAMETERS
#Filter out cells
# Drop cells with more than 1500 counts or more than 5000 expressed genes
sc.pp.filter_cells(adata, max_counts = 1500)
sc.pp.filter_cells(adata, max_genes = 5000)
# Min 100 cells - filters out low count genes (genes detected in fewer than 100 cells)
sc.pp.filter_genes(adata, min_cells=100)

filtered out 184 cells that have more than 5000 genes expressed
filtered out 7608 genes that are detected in less than 100 cells


In [127]:
adata.strings_to_categoricals()

In [128]:
# Use the values of var['feature_name'] as the var index, converted to plain
# strings: the next cell calls str.startswith on every var name to find the
# 'MT-' genes.
adata.var.index = adata.var['feature_name'].astype(str)

In [129]:
# Flag mitochondrial genes by their 'MT-' symbol prefix.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert X to a dense matrix (the following cells slice, divide and transpose it
# as a dense matrix). Skip the conversion if X is already dense: calling
# csr_matrix.todense on a dense matrix raises an AttributeError.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# Mitochondrial fraction per cell = counts in MT genes / total counts of the cell.
# ravel() gives a 1-D vector whether X is an np.matrix or a plain ndarray.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep only cells with a mitochondrial fraction below 10%.
# .copy() turns the selection into a real AnnData object instead of a view,
# so later assignments to .obs / .X do not act on a view of the unfiltered data.
adata = adata[adata.obs['mt_frac'] < 0.10].copy()

In [130]:
adata_pp=adata.copy()

In [131]:
#Perform a clustering for scran normalization in clusters
# Provisional normalisation to 1e6 counts per cell followed by log1p, applied to the scratch copy only
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
# 15 principal components -> neighbour graph -> Louvain clusters stored in adata_pp.obs['groups']
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:47): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:02:30)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:01:46)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 28 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:04:28)


In [132]:
#Preprocess variables for scran normalization
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [133]:
%%R -i data_mat -i input_groups -o size_factors
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [134]:
#Delete adata_pp
# The pre-clustered copy is no longer needed once the size factors exist.
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [135]:
# Convert string annotation columns into categoricals.
adata.strings_to_categoricals()

In [136]:
# Keep a copy of adata.X as it is before size-factor normalization in the
# "counts" layer for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [137]:
# Snapshot the current state in .raw before adata.X is normalized in the next cell.
adata.raw = adata

In [138]:
#Normalize data
# Divide each cell (row of X) by its size factor in place, then log1p-transform.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [139]:
# Store the normalized matrix in CSR sparse format again.
adata.X = sp.sparse.csr_matrix(adata.X)

In [140]:
# extract highly variable genes
# Top 4000 genes; subset=False keeps all genes in adata (the PCA cell below
# restricts itself via use_highly_variable=True).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:03)


In [141]:
# Calculate the visualizations
# 50-component PCA on the highly variable genes, neighbour graph, then UMAP.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:45)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:03:48)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:09:33)


In [143]:
# make consistent annotations across datasets
# Start from a copy of the original 'cell_type' column; 'cell_type' itself stays untouched.
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [144]:
adata.obs['celltype'].cat.categories

Index(['B cell', 'CD14-positive monocyte',
       'CD14-positive, CD16-positive monocyte',
       'CD4-positive, alpha-beta cytotoxic T cell',
       'CD8-positive, alpha-beta cytotoxic T cell',
       'activated CD4-positive, alpha-beta T cell',
       'activated CD8-positive, alpha-beta T cell',
       'capillary endothelial cell', 'dendritic cell', 'endothelial cell',
       'endothelial cell of artery', 'endothelial cell of lymphatic vessel',
       'epicardial adipocyte', 'fibroblast', 'macrophage', 'mast cell',
       'mature NK T cell', 'mesothelial cell', 'monocyte', 'native cell',
       'natural killer cell', 'neural cell', 'pericyte cell',
       'regular atrial cardiac myocyte', 'regular ventricular cardiac myocyte',
       'smooth muscle cell', 'vein endothelial cell'],
      dtype='object')

In [145]:
# Rebuild the annotation as a standalone Categorical with the full, explicitly
# ordered list of original labels, so labels can be merged in the next cell.
celltype_labels = [
    'B cell', 'CD14-positive monocyte',
    'CD14-positive, CD16-positive monocyte',
    'CD4-positive, alpha-beta cytotoxic T cell',
    'CD8-positive, alpha-beta cytotoxic T cell',
    'activated CD4-positive, alpha-beta T cell',
    'activated CD8-positive, alpha-beta T cell',
    'capillary endothelial cell', 'dendritic cell', 'endothelial cell',
    'endothelial cell of artery', 'endothelial cell of lymphatic vessel',
    'epicardial adipocyte', 'fibroblast', 'macrophage', 'mast cell',
    'mature NK T cell', 'mesothelial cell', 'monocyte', 'native cell',
    'natural killer cell', 'neural cell', 'pericyte cell',
    'regular atrial cardiac myocyte', 'regular ventricular cardiac myocyte',
    'smooth muscle cell', 'vein endothelial cell',
]
ref_cluster = pd.Categorical(adata.obs['celltype'], categories=celltype_labels)

In [146]:
# Collapse fine-grained labels onto one representative label per group.
# Each rule is (label that is kept, labels folded into it); rules run in order.
merge_rules = [
    ('CD14-positive monocyte',
     ['CD14-positive, CD16-positive monocyte', 'monocyte']),
    ('CD4-positive, alpha-beta cytotoxic T cell',
     ['CD8-positive, alpha-beta cytotoxic T cell',
      'activated CD4-positive, alpha-beta T cell',
      'activated CD8-positive, alpha-beta T cell']),
    ('capillary endothelial cell',
     ['endothelial cell', 'endothelial cell of artery', 'vein endothelial cell']),
    ('regular atrial cardiac myocyte',
     ['regular ventricular cardiac myocyte']),
    ('natural killer cell',
     ['mature NK T cell']),
]
for kept_label, folded_labels in merge_rules:
    ix = np.isin(ref_cluster, folded_labels)
    ref_cluster[ix] = kept_label

In [147]:
# Write the merged labels back, restricted to the representative labels that
# remain after merging (this order is what the positional rename below relies on).
kept_celltypes = [
    'B cell',
    'CD14-positive monocyte',
    'CD4-positive, alpha-beta cytotoxic T cell',
    'capillary endothelial cell',
    'dendritic cell',
    'endothelial cell of lymphatic vessel',
    'epicardial adipocyte',
    'fibroblast',
    'macrophage',
    'mast cell',
    'mesothelial cell',
    'native cell',
    'natural killer cell',
    'neural cell',
    'pericyte cell',
    'regular atrial cardiac myocyte',
    'smooth muscle cell',
]
adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=kept_celltypes)

In [148]:
# Harmonised names, matched by position to the categories set in the previous cell.
harmonised_celltypes = [
    'B cells',
    'Monocytes',
    'T cells',
    'Endothelial cells',
    'Dendritic cells',
    'Lymphatic endothelial cells',
    'Adipocytes',
    'Fibroblast cells',
    'Macrophages',
    'Mast cells',
    'Mesothelial cells',
    'Unknown',
    'NK cells',
    'Neurons',
    'Pericytes',
    'Cardiomyocytes',
    'Smooth muscle cells',
]
adata.rename_categories('celltype', harmonised_celltypes)

In [152]:
# Capitalise the sex labels. The rename is positional, so the new names must
# follow the existing category order (presumably 'female', 'male' - verify).
adata.obs['sex'].cat.categories
sex_labels = ['female', 'male']
ref_cluster = pd.Categorical(adata.obs['sex'], categories=sex_labels)
adata.rename_categories('sex', ['Female', 'Male'])

In [153]:
# Collapse all donor age bins into a single 'Adult' category.
adata.obs['development_stage'].cat.categories
stage_labels = ['eighth decade human stage', 'fifth decade human stage',
                'human late adulthood stage', 'human middle aged stage',
                'seventh decade human stage', 'sixth decade human stage']
ref_cluster = pd.Categorical(adata.obs['development_stage'], categories=stage_labels)

# Fold every other bin into the first label, which is the only category kept.
ix = np.isin(ref_cluster, stage_labels[1:])
ref_cluster[ix] = 'eighth decade human stage'

adata.obs['development_stage'] = pd.Categorical(ref_cluster, categories=stage_labels[:1])
adata.rename_categories('development_stage', ['Adult'])

In [154]:
# Reduce the six sampled heart regions to apex / ventricle / atrium.
adata.obs['tissue'].cat.categories
region_labels = ['apex of heart', 'heart left ventricle', 'heart right ventricle',
                 'interventricular septum', 'left cardiac atrium',
                 'right cardiac atrium']
ref_cluster = pd.Categorical(adata.obs['tissue'], categories=region_labels)

# (label that is kept, labels folded into it)
region_rules = [
    ('heart left ventricle', ['heart right ventricle', 'interventricular septum']),
    ('left cardiac atrium', ['right cardiac atrium']),
]
for kept_region, folded_regions in region_rules:
    ix = np.isin(ref_cluster, folded_regions)
    ref_cluster[ix] = kept_region

kept_regions = ['apex of heart', 'heart left ventricle', 'left cardiac atrium']
adata.obs['tissue'] = pd.Categorical(ref_cluster, categories=kept_regions)
# Positional rename onto the harmonised region names.
adata.rename_categories('tissue', ['Heart_Apex', 'Heart_Ventricle', 'Heart_Atrium'])

In [155]:
# Donor identifiers are renamed onto the same names, i.e. they stay as they are.
adata.obs['donor'].cat.categories
donor_ids = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11',
             'H2', 'H3', 'H4', 'H5', 'H6', 'H7']
ref_cluster = pd.Categorical(adata.obs['donor'], categories=donor_ids)
adata.rename_categories('donor', ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11',
                                  'H2', 'H3', 'H4', 'H5', 'H6', 'H7'])

In [156]:
# Fill the obs schema used for every dataset: (column, constant or source column).
# 'NaN' is a literal string placeholder; columns assigned from themselves are unchanged.
obs_schema = [
    # dataset identity
    ('Organ', 'Heart'),
    ('Organ_Specific', adata.obs['tissue']),
    ('Dataset', 'Teichmann_Heart'),
    ('InternDatasetNumber', '10-2_2-Heart-Teichmann-2020'),
    ('Dataset_status', 'Healthy_Dataset'),
    # cell annotation
    ('celltype', adata.obs['celltype']),
    ('sub_celltype', 'NaN'),
    ('Malignant', 'NonMalignant'),
    # donor information
    ('Patient', adata.obs['donor']),
    ('Patient_Number', adata.obs['sample']),
    ('age', adata.obs['development_stage']),
    ('sex', adata.obs['sex']),
    ('ethnicity', adata.obs['ethnicity']),
    ('health_status', 'NaN'),
    # original annotation levels
    ('original_celltype_1', adata.obs['cell_type']),
    ('original_celltype_2', 'NaN'),
    ('original_celltype_3', 'NaN'),
]
for obs_key, obs_value in obs_schema:
    adata.obs[obs_key] = obs_value

In [158]:
# Make sure X is stored as a CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [159]:
# De-duplicate cell identifiers (obs_names).
adata.obs_names_make_unique()

## 10-2_3-Heart-Teichmann-2020

## 10-2_3-Heart-Teichmann-2020

In [161]:
# Directory of this dataset collection (placeholder path; adapt to the local setup).
path = '/path/to/repo/b52eb423-5d0d-4645-b217-e1c6d38b2e72/'

In [162]:
# .h5ad files of the collection (presumably located under `path`).
# NOTE(review): the cell that reads these files into `adata` is not shown in this notebook.
files = ['f75f2ff4-2884-4c2d-b375-70de37a34507.h5ad',
 '572f3f3e-d3e4-4d13-8e2b-88215e508481.h5ad',
 '78fd69d2-75e4-4207-819a-563139f273c6.h5ad',
 '9d584fcb-a28a-4b91-a886-ceb66a88ef81.h5ad']

In [164]:
adata

AnnData object with n_obs × n_vars = 383824 × 33178
    obs: 'NRP', 'cell_source', 'donor', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'sample', 'scrublet_score', 'type', 'cell_states', 'Used', 'disease_ontology_term_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_type', 'ethnicity_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'id', 'source', 'cell_type_original', 'batch'
    var: 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference'
    obsm: 'X_pca', 'X_umap'

In [166]:
# Tag every cell with the internal dataset identifier.
adata.obs['InternDatasetNumber'] ='10-2_3-Heart-Teichmann-2020'

In [171]:
#calculate QC covariates
# For a sparse X, .sum(1) returns an (n_cells, 1) np.matrix; flatten it so that
# the obs columns receive plain 1-D vectors.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
# Number of genes with a non-zero value per cell.
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [174]:
# FILTER PARAMETERS
#Filter out cells
# Drop cells with more than 2000 counts or more than 5000 detected genes.
sc.pp.filter_cells(adata, max_counts = 2000)
sc.pp.filter_cells(adata, max_genes = 5000)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20)

filtered out 650 cells that have more than 2000 counts
filtered out 69 cells that have more than 5000 genes expressed
filtered out 5535 genes that are detected in less than 20 cells


In [175]:
# Convert string annotation columns into categoricals.
adata.strings_to_categoricals()

In [176]:
# Use var['feature_name'] as the var index (the 'MT-' prefix test below runs on it).
adata.var['feature_name'] = adata.var['feature_name']
adata.var.index = adata.var['feature_name']
# Cast the index to plain strings before assigning it back.
list_1 = adata.var.index.astype(str)
adata.var.index = list_1

In [177]:
# Flag mitochondrial genes: gene symbols carrying the 'MT-' prefix.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert X to a dense matrix (todense), as the following cells work on dense X.
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# Per-cell mitochondrial fraction = summed MT counts / total counts of the cell.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 10% mito fraction
# .copy() materialises the filtered view, so later writes to adata.obs do not
# trigger an implicit copy of the view (and the accompanying warning).
adata = adata[adata.obs['mt_frac'] < 0.10].copy()

In [178]:
# Separate copy for the coarse pre-clustering; adata itself is not modified by it.
adata_pp=adata.copy()

In [179]:
#Perform a clustering for scran normalization in clusters
# Steps on the copy: scale each cell to 1e6 total counts, log1p, 15-component PCA,
# neighbour graph, Louvain clustering (resolution 0.5) stored in obs['groups'].
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:01:47): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:04:03)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:01:30)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 23 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:02:23)


In [180]:
#Preprocess variables for scran normalization
# input_groups: Louvain labels of the pre-clustered copy.
# data_mat: adata.X transposed (genes x cells); both are passed to the R cell below.
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [181]:
%%R -i data_mat -i input_groups -o size_factors
# Imports data_mat and input_groups from Python; exports size_factors back to Python.
require(scran)
# scran size factors, computed with the Louvain groups as clusters.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [182]:
#Delete adata_pp
# The pre-clustered copy is no longer needed once the size factors exist.
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [183]:
# Convert string annotation columns into categoricals.
adata.strings_to_categoricals()

In [184]:
# Keep a copy of adata.X as it is before size-factor normalization in the
# "counts" layer for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [185]:
# Snapshot the current state in .raw before adata.X is normalized in the next cell.
adata.raw = adata

In [186]:
#Normalize data
# Divide each cell (row of X) by its size factor in place, then log1p-transform.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [187]:
# Store the normalized matrix in CSR sparse format again.
adata.X = sp.sparse.csr_matrix(adata.X)

In [188]:
# extract highly variable genes
# Top 4000 genes; subset=False keeps all genes in adata (the PCA cell below
# restricts itself via use_highly_variable=True).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:05)


In [189]:
# Calculate the visualizations
# 50-component PCA on the highly variable genes, neighbour graph, then UMAP.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:40)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:02:49)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:07:03)


In [191]:
# make consistent annotations across datasets
# Start from a copy of the original 'cell_type' column; 'cell_type' itself stays untouched.
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [192]:
adata.obs['celltype'].cat.categories

Index(['capillary endothelial cell', 'endothelial cell',
       'endothelial cell of artery', 'endothelial cell of lymphatic vessel',
       'epicardial adipocyte', 'fibroblast', 'mesothelial cell',
       'pericyte cell', 'regular ventricular cardiac myocyte',
       'smooth muscle cell', 'vein endothelial cell'],
      dtype='object')

In [193]:
# Rebuild the annotation as a standalone Categorical with the full, explicitly
# ordered list of original labels, so labels can be merged in the next cell.
celltype_labels = [
    'capillary endothelial cell', 'endothelial cell',
    'endothelial cell of artery', 'endothelial cell of lymphatic vessel',
    'epicardial adipocyte', 'fibroblast', 'mesothelial cell',
    'pericyte cell', 'regular ventricular cardiac myocyte',
    'smooth muscle cell', 'vein endothelial cell',
]
ref_cluster = pd.Categorical(adata.obs['celltype'], categories=celltype_labels)

In [194]:
# Fold the generic, arterial and venous endothelial labels into the capillary label.
folded_endothelial = ['endothelial cell',  'endothelial cell of artery',  'vein endothelial cell']
ix = np.isin(ref_cluster, folded_endothelial)
ref_cluster[ix] = 'capillary endothelial cell'

In [195]:
# Write the merged labels back, restricted to the labels that remain after
# merging (this order is what the positional rename below relies on).
kept_celltypes = [
    'capillary endothelial cell',
    'endothelial cell of lymphatic vessel',
    'epicardial adipocyte',
    'fibroblast',
    'mesothelial cell',
    'pericyte cell',
    'regular ventricular cardiac myocyte',
    'smooth muscle cell',
]
adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=kept_celltypes)

In [196]:
# Harmonised names, matched by position to the categories set in the previous cell:
# capillary endothelial cell, endothelial cell of lymphatic vessel, epicardial adipocyte,
# fibroblast, mesothelial cell, pericyte cell, regular ventricular cardiac myocyte,
# smooth muscle cell.
# 'mesothelial cell' maps to 'Mesothelial cells', the same name the 10-2_2 subset of
# this atlas uses for that label, so both subsets agree after harmonisation.
adata.rename_categories('celltype', ['Endothelial cells',
       'Lymphatic endothelial cells',
       'Adipocytes', 'Fibroblast cells', 'Mesothelial cells',
       'Pericytes', 'Cardiomyocytes',
       'Smooth muscle cells'])

In [200]:
# Capitalise the sex labels. The rename is positional, so the new names must
# follow the existing category order (presumably 'female', 'male' - verify).
adata.obs['sex'].cat.categories
sex_labels = ['female', 'male']
ref_cluster = pd.Categorical(adata.obs['sex'], categories=sex_labels)
adata.rename_categories('sex', ['Female', 'Male'])

In [201]:
# Collapse all donor age bins into a single 'Adult' category.
adata.obs['development_stage'].cat.categories
stage_labels = ['eighth decade human stage', 'fifth decade human stage',
                'human late adulthood stage', 'human middle aged stage',
                'seventh decade human stage', 'sixth decade human stage']
ref_cluster = pd.Categorical(adata.obs['development_stage'], categories=stage_labels)

# Fold every other bin into the first label, which is the only category kept.
ix = np.isin(ref_cluster, stage_labels[1:])
ref_cluster[ix] = 'eighth decade human stage'

adata.obs['development_stage'] = pd.Categorical(ref_cluster, categories=stage_labels[:1])
adata.rename_categories('development_stage', ['Adult'])

In [202]:
# Reduce the six sampled heart regions to apex / ventricle / atrium.
adata.obs['tissue'].cat.categories
region_labels = ['apex of heart', 'heart left ventricle', 'heart right ventricle',
                 'interventricular septum', 'left cardiac atrium',
                 'right cardiac atrium']
ref_cluster = pd.Categorical(adata.obs['tissue'], categories=region_labels)

# (label that is kept, labels folded into it)
region_rules = [
    ('heart left ventricle', ['heart right ventricle', 'interventricular septum']),
    ('left cardiac atrium', ['right cardiac atrium']),
]
for kept_region, folded_regions in region_rules:
    ix = np.isin(ref_cluster, folded_regions)
    ref_cluster[ix] = kept_region

kept_regions = ['apex of heart', 'heart left ventricle', 'left cardiac atrium']
adata.obs['tissue'] = pd.Categorical(ref_cluster, categories=kept_regions)
# Positional rename onto the harmonised region names.
adata.rename_categories('tissue', ['Heart_Apex', 'Heart_Ventricle', 'Heart_Atrium'])

In [203]:
# Donor identifiers are renamed onto the same names, i.e. they stay as they are.
adata.obs['donor'].cat.categories
donor_ids = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11',
             'H2', 'H3', 'H4', 'H5', 'H6', 'H7']
ref_cluster = pd.Categorical(adata.obs['donor'], categories=donor_ids)
adata.rename_categories('donor', ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11',
                                  'H2', 'H3', 'H4', 'H5', 'H6', 'H7'])

In [204]:
# Fill the obs schema used for every dataset: (column, constant or source column).
# 'NaN' is a literal string placeholder; columns assigned from themselves are unchanged.
obs_schema = [
    # dataset identity
    ('Organ', 'Heart'),
    ('Organ_Specific', adata.obs['tissue']),
    ('Dataset', 'Teichmann_Heart'),
    ('InternDatasetNumber', '10-2_3-Heart-Teichmann-2020'),
    ('Dataset_status', 'Healthy_Dataset'),
    # cell annotation
    ('celltype', adata.obs['celltype']),
    ('sub_celltype', 'NaN'),
    ('Malignant', 'NonMalignant'),
    # donor information
    ('Patient', adata.obs['donor']),
    ('Patient_Number', adata.obs['sample']),
    ('age', adata.obs['development_stage']),
    ('sex', adata.obs['sex']),
    ('ethnicity', adata.obs['ethnicity']),
    ('health_status', 'NaN'),
    # original annotation levels
    ('original_celltype_1', adata.obs['cell_type']),
    ('original_celltype_2', 'NaN'),
    ('original_celltype_3', 'NaN'),
]
for obs_key, obs_value in obs_schema:
    adata.obs[obs_key] = obs_value

In [206]:
# Make sure X is stored as a CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [207]:
# De-duplicate cell identifiers (obs_names).
adata.obs_names_make_unique()

## 10-3-Heart-Pisco-2022

## 10-3-Heart-Pisco-2022 (human) - Pisco Multiple Organs

In [17]:
# Select the atrium and ventricle cells from the multi-organ object and work on a copy.
ix=np.isin(adata_pisco.obs['tissue'],['cardiac atrium', 'cardiac ventricle'])
adata=adata_pisco[ix].copy()

In [18]:
# Tag every cell with the internal dataset identifier.
adata.obs['InternDatasetNumber'] ='10-3-Heart-Pisco-2022'

In [22]:
# Compute scanpy's QC metrics and store them on adata (inplace=True).
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [23]:
#calculate QC covariates
# For a sparse X, .sum(1) returns an (n_cells, 1) np.matrix; flatten it so that
# the obs columns receive plain 1-D vectors.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
# Number of genes with a non-zero value per cell.
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [27]:
# FILTER PARAMETERS
#Filter out cells
# The cell-level count/gene filters are disabled for this dataset:
#sc.pp.filter_cells(adata, max_counts = 13000)
#sc.pp.filter_cells(adata, max_genes = 7500)
# Min 20 cells - filters out low count genes
# NOTE(review): the trailing numbers look like leftover threshold-tuning notes - confirm or remove.
sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not

filtered out 36195 genes that are detected in less than 20 cells


In [30]:
# Flag mitochondrial genes: gene symbols carrying the 'MT-' prefix.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert X to a dense matrix (todense), as the following cells work on dense X.
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# Per-cell mitochondrial fraction = summed MT counts / total counts of the cell.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 20% mito fraction
# .copy() materialises the filtered view, so later writes to adata.obs do not
# trigger an implicit copy of the view (and the accompanying warning).
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [31]:
# Separate copy for the coarse pre-clustering; adata itself is not modified by it.
adata_pp=adata.copy()

In [32]:
#Perform a clustering for scran normalization in clusters
# Steps on the copy: scale each cell to 1e6 total counts, log1p, 15-component PCA,
# neighbour graph, Louvain clustering (resolution 0.5) stored in obs['groups'].
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 13 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:01)


In [33]:
#Preprocess variables for scran normalization
# input_groups: Louvain labels of the pre-clustered copy.
# data_mat: adata.X transposed (genes x cells); both are passed to the R cell below.
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [34]:
%%R -i data_mat -i input_groups -o size_factors
# Imports data_mat and input_groups from Python; exports size_factors back to Python.
require(scran)
# scran size factors, computed with the Louvain groups as clusters.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

R[write to console]: Loading required package: scran

R[write to console]: Loading required package: SingleCellExperiment

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: GenomicRanges

R[write to console]: Loading required package: stats4

R[write to console]: Loading required package: BiocGenerics

R[write to console]: Loading required package: parallel

R[write to console]: 
Attaching package: ‘BiocGenerics’


R[write to console]: The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


R[write to console]: The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


R[write to console]: The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    d

In [35]:
#Delete adata_pp
# The pre-clustered copy is no longer needed once the size factors exist.
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [36]:
# Convert string annotation columns into categoricals.
adata.strings_to_categoricals()

In [37]:
# Keep a copy of adata.X as it is before size-factor normalization in the
# "counts" layer for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [38]:
# Snapshot the current state in .raw before adata.X is normalized in the next cell.
adata.raw = adata

In [39]:
#Normalize data
# Divide each cell (row of X) by its size factor in place, then log1p-transform.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [40]:
# extract highly variable genes
# Top 4000 genes; subset=False keeps all genes in adata (the PCA cell below
# restricts itself via use_highly_variable=True).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:01)


In [41]:
# Calculate the visualizations
# 50-component PCA on the highly variable genes, neighbour graph, then UMAP.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)


In [43]:
# make consistent annotations across datasets
# Start from a copy of the original 'cell_type' column; 'cell_type' itself stays untouched.
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [44]:
adata.obs['celltype'].cat.categories

Index(['cardiac endothelial cell', 'cardiac muscle cell',
       'fibroblast of cardiac tissue', 'hepatocyte', 'macrophage',
       'smooth muscle cell'],
      dtype='object')

In [45]:
# Rebuild the annotation as a standalone Categorical with an explicit label order.
celltype_labels = [
    'cardiac endothelial cell', 'cardiac muscle cell',
    'fibroblast of cardiac tissue', 'hepatocyte', 'macrophage',
    'smooth muscle cell',
]
ref_cluster = pd.Categorical(adata.obs['celltype'], categories=celltype_labels)

In [47]:
# No labels are merged for this dataset; write the annotation back with the
# same six categories (this order is what the positional rename below relies on).
kept_celltypes = [
    'cardiac endothelial cell',
    'cardiac muscle cell',
    'fibroblast of cardiac tissue',
    'hepatocyte',
    'macrophage',
    'smooth muscle cell',
]
adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=kept_celltypes)

In [48]:
# Harmonised names, matched by position to the categories set in the previous cell
# (the 'hepatocyte' label of this heart subset becomes 'Unknown').
harmonised_celltypes = [
    'Endothelial cells',
    'Cardiomyocytes',
    'Fibroblast cells',
    'Unknown',
    'Macrophages',
    'Smooth muscle cells',
]
adata.rename_categories('celltype', harmonised_celltypes)

In [52]:
# Positional rename of the two heart regions onto the harmonised region names.
adata.obs['tissue'].cat.categories
region_labels = ['cardiac atrium', 'cardiac ventricle']
ref_cluster = pd.Categorical(adata.obs['tissue'], categories=region_labels)
adata.rename_categories('tissue', ['Heart_Atrium', 'Heart_Ventricle'])

In [53]:
# Capitalise the single sex label present in this subset.
adata.obs['sex'].cat.categories
sex_labels = ['male']
ref_cluster = pd.Categorical(adata.obs['sex'], categories=sex_labels)
adata.rename_categories('sex', ['Male'])

In [54]:
# Use the hyphenated spelling for the single ethnicity label of this subset.
adata.obs['ethnicity'].cat.categories
ethnicity_labels = ['Hispanic or Latin American']
ref_cluster = pd.Categorical(adata.obs['ethnicity'], categories=ethnicity_labels)
adata.rename_categories('ethnicity', ['Hispanic or Latin-American'])

In [55]:
# Reduce the development stage label to the bare age (kept as a string).
adata.obs['development_stage'].cat.categories
stage_labels = ['74-year-old human stage']
ref_cluster = pd.Categorical(adata.obs['development_stage'], categories=stage_labels)
adata.rename_categories('development_stage', ['74'])

In [56]:
# The donor identifier is renamed onto the same name, i.e. it stays as it is.
adata.obs['donor'].cat.categories
donor_ids = ['TSP12']
ref_cluster = pd.Categorical(adata.obs['donor'], categories=donor_ids)
adata.rename_categories('donor', ['TSP12'])

In [57]:
# Fill the obs schema used for every dataset: (column, constant or source column).
# 'NaN' is a literal string placeholder; columns assigned from themselves are unchanged.
obs_schema = [
    # dataset identity
    ('Organ', 'Heart'),
    ('Organ_Specific', adata.obs['tissue']),
    ('Dataset', 'Pisco_Heart'),
    ('InternDatasetNumber', '10-3-Heart-Pisco-2022'),
    ('Dataset_status', 'Healthy_Dataset'),
    # cell annotation
    ('celltype', adata.obs['celltype']),
    ('sub_celltype', 'NaN'),
    ('Malignant', 'NonMalignant'),
    # donor information
    ('Patient', adata.obs['donor']),
    ('Patient_Number', 'NaN'),
    ('age', adata.obs['development_stage']),
    ('sex', adata.obs['sex']),
    ('ethnicity', adata.obs['ethnicity']),
    ('health_status', 'NaN'),
    # original annotation levels
    ('original_celltype_1', adata.obs['cell_type']),
    ('original_celltype_2', 'NaN'),
    ('original_celltype_3', 'NaN'),
]
for obs_key, obs_value in obs_schema:
    adata.obs[obs_key] = obs_value

In [59]:
# Store the normalized matrix in CSR sparse format.
adata.X = sp.sparse.csr_matrix(adata.X)

In [60]:
# De-duplicate cell identifiers (obs_names).
adata.obs_names_make_unique()

# 11-Blood

## 11-2-Blood-Pisco-2022

## 11-2-Blood-Pisco-2022 (human) - Pisco Multiple Organs

In [121]:
#sc.pp.calculate_qc_metrics(adata, inplace=True)

In [125]:
adata.X

<50115x58559 sparse matrix of type '<class 'numpy.float32'>'
	with 94755802 stored elements in Compressed Sparse Row format>

In [127]:
adata.X

<50054x23807 sparse matrix of type '<class 'numpy.float32'>'
	with 94045592 stored elements in Compressed Sparse Row format>

In [128]:
# Flag mitochondrial genes: gene symbols carrying the 'MT-' prefix.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert X to a dense matrix (todense), as the following cells work on dense X.
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# Per-cell mitochondrial fraction = summed MT counts / total counts of the cell.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 5% mito fraction
# NOTE(review): other datasets in this notebook use 10% or 20%; confirm that 5% is
# the intended cut-off for this blood dataset.
# .copy() materialises the filtered view, so later writes to adata.obs do not
# trigger an implicit copy of the view (and the accompanying warning).
adata = adata[adata.obs['mt_frac'] < 0.05].copy()

In [129]:
# Separate copy for the coarse pre-clustering; adata itself is not modified by it.
adata_pp=adata.copy()

In [130]:
#Perform a clustering for scran normalization in clusters
# Steps on the copy: scale each cell to 1e6 total counts, log1p, 15-component PCA,
# neighbour graph, Louvain clustering (resolution 0.5) stored in obs['groups'].
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:05): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 15 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:05)


In [131]:
#Preprocess variables for scran normalization
# input_groups: Louvain labels of the pre-clustered copy.
# data_mat: adata.X transposed (genes x cells); both are passed to the R cell below.
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [132]:
%%R -i data_mat -i input_groups -o size_factors
# Imports data_mat and input_groups from Python; exports size_factors back to Python.
require(scran)
# scran size factors, computed with the Louvain groups as clusters.
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [133]:
#Delete adata_pp
# The pre-clustered copy is no longer needed once the size factors exist.
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [134]:
# Convert string annotation columns into categoricals.
adata.strings_to_categoricals()

In [135]:
# Keep a copy of adata.X as it is before size-factor normalization in the
# "counts" layer for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [136]:
# Snapshot the current state in .raw before adata.X is normalized in the next cell.
adata.raw = adata

In [137]:
#Normalize data
# Divide each cell (row of X) by its size factor in place, then log1p-transform.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [138]:
# extract highly variable genes
# Top 4000 genes; subset=False keeps all genes in adata (the PCA cell below
# restricts itself via use_highly_variable=True).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:04)


In [139]:
# Calculate the visualizations
# 50-component PCA on the highly variable genes, neighbour graph, then UMAP.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:08)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:12)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:40)


In [159]:
# make consistent annotations across datasets
# Start from a copy of the original 'cell_type' column; 'cell_type' itself stays untouched.
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [160]:
adata.obs['celltype'].cat.categories

Index(['CD141-positive myeloid dendritic cell',
       'CD4-positive, alpha-beta T cell',
       'CD4-positive, alpha-beta memory T cell',
       'CD8-positive, alpha-beta T cell',
       'CD8-positive, alpha-beta cytokine secreting effector T cell', 'T cell',
       'basophil', 'classical monocyte', 'common myeloid progenitor',
       'erythrocyte', 'granulocyte', 'hematopoietic stem cell', 'macrophage',
       'mature NK T cell', 'memory B cell', 'monocyte', 'naive B cell',
       'naive thymus-derived CD4-positive, alpha-beta T cell', 'neutrophil',
       'non-classical monocyte', 'plasma cell', 'plasmablast',
       'plasmacytoid dendritic cell', 'platelet', 'type I NK T cell'],
      dtype='object')

In [162]:
# Rebuild the annotation as a standalone Categorical with the full, explicitly
# ordered list of original labels, so labels can be merged in the next cell.
celltype_labels = [
    'CD141-positive myeloid dendritic cell',
    'CD4-positive, alpha-beta T cell',
    'CD4-positive, alpha-beta memory T cell',
    'CD8-positive, alpha-beta T cell',
    'CD8-positive, alpha-beta cytokine secreting effector T cell', 'T cell',
    'basophil', 'classical monocyte', 'common myeloid progenitor',
    'erythrocyte', 'granulocyte', 'hematopoietic stem cell', 'macrophage',
    'mature NK T cell', 'memory B cell', 'monocyte', 'naive B cell',
    'naive thymus-derived CD4-positive, alpha-beta T cell', 'neutrophil',
    'non-classical monocyte', 'plasma cell', 'plasmablast',
    'plasmacytoid dendritic cell', 'platelet', 'type I NK T cell',
]
ref_cluster = pd.Categorical(adata.obs['celltype'], categories=celltype_labels)

In [163]:
# Collapse fine-grained labels onto one representative label per group.
# Each rule is (label that is kept, labels folded into it); rules run in order.
merge_rules = [
    ('CD141-positive myeloid dendritic cell',
     ['plasmacytoid dendritic cell']),
    ('CD4-positive, alpha-beta T cell',
     ['CD4-positive, alpha-beta memory T cell',
      'CD8-positive, alpha-beta T cell',
      'CD8-positive, alpha-beta cytokine secreting effector T cell',
      'T cell',
      'naive thymus-derived CD4-positive, alpha-beta T cell',
      'type I NK T cell']),
    ('classical monocyte',
     ['monocyte', 'non-classical monocyte']),
    ('memory B cell',
     ['naive B cell']),
    ('plasma cell',
     ['plasmablast']),
]
for kept_label, folded_labels in merge_rules:
    ix = np.isin(ref_cluster, folded_labels)
    ref_cluster[ix] = kept_label

In [164]:
# Write the merged labels back, restricted to the representative labels that
# remain after merging (this order is what the positional rename below relies on).
kept_celltypes = [
    'CD141-positive myeloid dendritic cell',
    'CD4-positive, alpha-beta T cell',
    'basophil',
    'classical monocyte',
    'common myeloid progenitor',
    'erythrocyte',
    'granulocyte',
    'hematopoietic stem cell',
    'macrophage',
    'mature NK T cell',
    'memory B cell',
    'neutrophil',
    'plasma cell',
    'platelet',
]
adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=kept_celltypes)

In [165]:
# Harmonised names, matched by position to the categories set in the previous cell.
harmonised_celltypes = [
    'Dendritic cells',
    'T cells',
    'Basophil cells',
    'Monocytes',
    'Common myeloid progenitor cells',
    'Erythroid cells',
    'Granulocyte cells',
    'Hematopoietic stem cells',
    'Macrophages',
    'NK cells',
    'B cells',
    'Neutrophils',
    'Plasma cells',
    'Thrombocytes',
]
adata.rename_categories('celltype', harmonised_celltypes)

In [169]:
# Capitalise the single tissue label.
adata.obs['tissue'].cat.categories
tissue_labels = ['blood']
ref_cluster = pd.Categorical(adata.obs['tissue'], categories=tissue_labels)
adata.rename_categories('tissue', ['Blood'])

In [170]:
# Capitalise the sex labels. The rename is positional, so the new names must
# follow the existing category order (presumably 'female', 'male' - verify).
adata.obs['sex'].cat.categories
sex_labels = ['female', 'male']
ref_cluster = pd.Categorical(adata.obs['sex'], categories=sex_labels)
adata.rename_categories('sex', ['Female', 'Male'])

In [171]:
# Positional rename onto the hyphenated ethnicity spellings.
adata.obs['ethnicity'].cat.categories
ethnicity_labels = ['African American or Afro-Caribbean', 'European',
                    'Hispanic or Latin-American']
ref_cluster = pd.Categorical(adata.obs['ethnicity'], categories=ethnicity_labels)
harmonised_ethnicities = ['African-American or Afro-Caribbean', 'European', 'Hispanic or Latin-American']
adata.rename_categories('ethnicity', harmonised_ethnicities)

In [172]:
# Reduce the development stage labels to the bare ages (kept as strings),
# matched by position to the existing category order.
adata.obs['development_stage'].cat.categories
stage_labels = ['33-year-old human stage', '56-year-old human stage',
                '59-year-old human stage', '61-year-old human stage',
                '69-year-old human stage']
ref_cluster = pd.Categorical(adata.obs['development_stage'], categories=stage_labels)
adata.rename_categories('development_stage', ['33', '56', '59', '61', '69'])

In [173]:
# Donor identifiers are renamed onto the same names, i.e. they stay as they are.
adata.obs['donor'].cat.categories
donor_ids = ['TSP1', 'TSP2', 'TSP7', 'TSP8', 'TSP10', 'TSP14']
ref_cluster = pd.Categorical(adata.obs['donor'], categories=donor_ids)
adata.rename_categories('donor', ['TSP1', 'TSP2', 'TSP7', 'TSP8', 'TSP10', 'TSP14'])

In [174]:
# Fill the obs schema used for every dataset: (column, constant or source column).
# 'NaN' is a literal string placeholder; columns assigned from themselves are unchanged.
obs_schema = [
    # dataset identity
    ('Organ', 'Blood'),
    ('Organ_Specific', adata.obs['tissue']),
    ('Dataset', 'Pisco_Blood'),
    ('InternDatasetNumber', '11-2-Blood-Pisco-2022'),
    ('Dataset_status', 'Healthy_Dataset'),
    # cell annotation
    ('celltype', adata.obs['celltype']),
    ('sub_celltype', 'NaN'),
    ('Malignant', 'NonMalignant'),
    # donor information
    ('Patient', adata.obs['donor']),
    ('Patient_Number', 'NaN'),
    ('age', adata.obs['development_stage']),
    ('sex', adata.obs['sex']),
    ('ethnicity', adata.obs['ethnicity']),
    ('health_status', 'NaN'),
    # original annotation levels
    ('original_celltype_1', adata.obs['cell_type']),
    ('original_celltype_2', 'NaN'),
    ('original_celltype_3', 'NaN'),
]
for obs_key, obs_value in obs_schema:
    adata.obs[obs_key] = obs_value

In [176]:
# Store the normalized matrix in CSR sparse format.
adata.X = sp.sparse.csr_matrix(adata.X)

In [177]:
# De-duplicate cell identifiers (obs_names).
adata.obs_names_make_unique()

## 11-5-Blood-Han-2020

## 11-5-Blood-Han-2020-RubenBrabenec_HealthyProject

In [771]:
# Select the 'AdultPeripheralBlood' cells from adata_han and work on a copy.
ix=np.isin(adata_han.obs['sub_tissue'],['AdultPeripheralBlood']) 
adata=adata_han[ix].copy()

In [772]:
# Tag every cell with the internal dataset identifier.
adata.obs['InternDatasetNumber'] ='11-5-Blood-Han-2020'

In [776]:
# Calculate per-cell QC covariates: total counts, log total counts, number of detected genes.
# On a SciPy sparse matrix .sum(1) returns an (n_cells, 1) np.matrix; flatten it so each
# obs column receives a plain 1-D array (a no-op if X is already a dense ndarray).
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [779]:
# FILTER PARAMETERS
# Filter out cells with more than 3000 counts or more than 1500 detected genes (in place)
sc.pp.filter_cells(adata, max_counts = 3000)
sc.pp.filter_cells(adata, max_genes = 1500)
# Filter out genes detected in fewer than 20 cells
sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not

filtered out 11 cells that have more than 3000 counts
filtered out 1 cells that have more than 1500 genes expressed
filtered out 16679 genes that are detected in less than 20 cells


In [780]:
# Mitochondrial genes: symbols starting with 'MT-'
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (later cells divide adata.X in place)
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# Per-cell mitochondrial fraction = summed mitochondrial counts / total counts ('n_counts')
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum/adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep only cells below 20% mitochondrial fraction.
# .copy() turns the subset into a real AnnData object: without it `adata` is a view and the
# later `adata.obs['size_factors'] = ...` assignment would write into a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [781]:
# Work on a copy for the preliminary clustering so `adata` itself stays un-normalized.
adata_pp=adata.copy()

In [782]:
# Coarse clustering of the copy; the resulting 'groups' labels are passed to scran as clusters.
# Steps: scale each cell to 1e6 total counts, log1p, 15-component PCA, neighbor graph,
# Louvain clustering at resolution 0.5.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 11 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:01)


In [783]:
# Inputs for scran: cluster label per cell and the transposed expression matrix
# (adata.X is cells x genes, so data_mat is genes x cells).
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [784]:
%%R -i data_mat -i input_groups -o size_factors
# R cell: data_mat and input_groups are passed in from Python (-i), size_factors is returned (-o).
# scran estimates per-cell size factors, pooling cells within the supplied clusters.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [785]:
# The clustering copy is no longer needed; free it and store the scran size factors per cell.
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [786]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [787]:
# Keep a copy of the current adata.X (not yet size-factor normalized) in the 'counts' layer
# for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [788]:
# Freeze the current (pre-normalization) state in .raw.
adata.raw = adata

In [789]:
# Normalize: divide each cell (row) by its scran size factor in place, then log1p-transform.
# [:, None] turns the size factors into a column so they broadcast across genes.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [None]:
# Flag the 4000 most variable genes (cell_ranger flavor); subset=False keeps all genes in adata.
# log=False because the data were already log-transformed in the previous cell.
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes


In [791]:
# Calculate the visualizations: 50-component PCA on the highly variable genes,
# neighbor graph and UMAP embedding (default parameters).
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)


In [793]:
# Make annotations consistent across datasets: start 'celltype' from the dataset's own
# 'celltype_specific' labels and display the categories that need to be mapped.
adata.obs['celltype']=adata.obs['celltype_specific'].copy()
adata.obs['celltype'].cat.categories

Index(['B cell', 'B cell(Centrocyte)', 'B cell(Plasmocyte)',
       'B cell(Plasmocyte)_IGHG4 high', 'B cell(Plasmocyte)_IGHM high',
       'CD4_T cell', 'CD8+ T cell', 'CD8_T cell',
       'Conventional dendritic cell', 'Dendritic cell',
       'Dendritic cell_FCER1A high', 'Dendritic cell_LGALS2 high',
       'Dendritic cell_WDFY4 high', 'Eosinophil', 'Macrophage',
       'Macrophage_FCGR3A high', 'Monocyte', 'Monocyte_CXCR2 high',
       'Monocyte_IGHG4 high', 'Monocyte_ISG15 high', 'Monocyte_S100A12 high',
       'NK cell', 'Neutrophil_CAMP high', 'Neutrophil_DEFA3 high',
       'Plasmacytoid dendritic cell', 'Proliferating  B cell',
       'Proliferating T cell', 'T cell', 'T cell_GNLY high',
       'T cell_TRAC high', 'activative T cell'],
      dtype='object')

In [794]:
# Working copy of the labels as a Categorical with an explicit category list.
# Values not contained in `categories` become NaN.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['B cell', 'B cell(Centrocyte)', 'B cell(Plasmocyte)',
       'B cell(Plasmocyte)_IGHG4 high', 'B cell(Plasmocyte)_IGHM high',
       'CD4_T cell', 'CD8+ T cell', 'CD8_T cell',
       'Conventional dendritic cell', 'Dendritic cell',
       'Dendritic cell_FCER1A high', 'Dendritic cell_LGALS2 high',
       'Dendritic cell_WDFY4 high', 'Eosinophil', 'Macrophage',
       'Macrophage_FCGR3A high', 'Monocyte', 'Monocyte_CXCR2 high',
       'Monocyte_IGHG4 high', 'Monocyte_ISG15 high', 'Monocyte_S100A12 high',
       'NK cell', 'Neutrophil_CAMP high', 'Neutrophil_DEFA3 high',
       'Plasmacytoid dendritic cell', 'Proliferating  B cell',
       'Proliferating T cell', 'T cell', 'T cell_GNLY high',
       'T cell_TRAC high', 'activative T cell'])

In [795]:
# Collapse fine-grained labels into their parent cell type.
# Key = label that is kept, value = labels folded into it (applied in the listed order).
merged_labels = {
    'B cell': ['B cell(Centrocyte)', 'Proliferating  B cell'],
    'B cell(Plasmocyte)': ['B cell(Plasmocyte)_IGHG4 high', 'B cell(Plasmocyte)_IGHM high'],
    'CD4_T cell': ['CD8+ T cell', 'CD8_T cell', 'Proliferating T cell', 'T cell',
                   'T cell_GNLY high', 'T cell_TRAC high', 'activative T cell'],
    'Conventional dendritic cell': ['Dendritic cell', 'Dendritic cell_FCER1A high',
                                    'Dendritic cell_LGALS2 high', 'Dendritic cell_WDFY4 high',
                                    'Plasmacytoid dendritic cell'],
    'Macrophage': ['Macrophage_FCGR3A high'],
    'Monocyte': ['Monocyte_CXCR2 high', 'Monocyte_IGHG4 high', 'Monocyte_ISG15 high',
                 'Monocyte_S100A12 high'],
    'Neutrophil_CAMP high': ['Neutrophil_DEFA3 high'],
}
for kept_label, folded_labels in merged_labels.items():
    # Boolean mask of the cells carrying one of the labels to fold
    ix = np.isin(ref_cluster, folded_labels)
    ref_cluster[ix] = kept_label

In [796]:
# Write the merged labels back, keeping only the categories that remain after merging.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['B cell', 'B cell(Plasmocyte)',
       
       'CD4_T cell', 
       'Conventional dendritic cell', 
        'Eosinophil', 'Macrophage',
        'Monocyte',
       'NK cell', 'Neutrophil_CAMP high'])

In [797]:
# Rename to the cross-dataset cell type names. New names are matched by position to the
# category order set in the previous cell (e.g. 'CD4_T cell' now holds all T cells -> 'T cells').
adata.rename_categories('celltype', ['B cells', 'Plasma cells',
       
       'T cells', 
       'Dendritic cells', 
        'Eosinophils', 'Macrophages',
        'Monocytes',
       'NK cells', 'Neutrophils'])

In [801]:
# Harmonise the tissue label: 'AdultPeripheralBlood' -> 'Blood' (positional rename).
adata.obs['sub_tissue'].cat.categories
# NOTE(review): ref_cluster is assigned here but the rename below does not use it.
ref_cluster=pd.Categorical(adata.obs['sub_tissue'],
                           categories=['AdultPeripheralBlood'])
adata.rename_categories('sub_tissue', ['Blood'])

In [802]:
# Harmonise the sex label: 'male' -> 'Male' (positional rename).
adata.obs['sex'].cat.categories
# NOTE(review): ref_cluster is assigned here but the rename below does not use it.
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=[ 'male'])
adata.rename_categories('sex', [ 'Male'])

In [803]:
# Harmonise the age labels: drop the trailing 'Y' (positional rename).
adata.obs['age'].cat.categories
# NOTE(review): ref_cluster is assigned here but the rename below does not use it.
ref_cluster=pd.Categorical(adata.obs['age'],
                           categories=['25Y', '27Y', '34Y'])
adata.rename_categories('age',['25', '27', '34'])

In [804]:
# Prefix the donor IDs with 'Han-' (positional rename).
adata.obs['donor'].cat.categories
# NOTE(review): ref_cluster is assigned here but the rename below does not use it.
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['Donor45', 'Donor47', 'Donor48'])
adata.rename_categories('donor', ['Han-Donor45', 'Han-Donor47', 'Han-Donor48'])

In [805]:
# Write the shared metadata columns used across datasets.
# Fields this dataset does not provide are filled with the literal string 'NaN' (not a float NaN).

# Dataset / tissue identifiers
adata.obs['Organ'] = 'Blood'
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
adata.obs['Dataset'] = 'Han_Blood'
adata.obs['InternDatasetNumber'] = '11-5-Blood-Han-2020'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# Cell type annotation (the self-assignment keeps the already harmonised column as is)
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# Donor information
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['age']
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

# Keep the source annotations in 'original_celltype_*'
adata.obs['original_celltype_1'] = adata.obs['celltype_specific']
adata.obs['original_celltype_2'] = adata.obs['celltype_global']
adata.obs['original_celltype_3'] = 'NaN'

In [807]:
# Store the expression matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

## 11-6-Blood_ImmuneCells-Teichmann-2022

## 11-6-Blood_ImmuneCells-Teichmann-2022-RubenBrabenec

In [121]:
# Select the cells of adata_analysis (prepared elsewhere in the notebook) whose
# 'tissue_major' is 'Blood' and copy them into an independent AnnData object.
ix=np.isin(adata_analysis.obs['tissue_major'], ['Blood'])
adata=adata_analysis[ix].copy()

In [125]:
# Write the shared metadata columns used across datasets.
# Fields this dataset does not provide are filled with the literal string 'NaN' (not a float NaN).

# Dataset / tissue identifiers ('Dataset' is kept as it already is in the source object)
adata.obs['Organ'] =  adata.obs['tissue_major']
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
adata.obs['Dataset'] = adata.obs['Dataset']
adata.obs['InternDatasetNumber'] = '11-6-Blood_ImmuneCells-Teichmann-2022'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# Cell type annotation (the self-assignment keeps the existing column as is)
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# Donor information
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['development_stage']
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = adata.obs['ethnicity']
adata.obs['health_status'] = 'NaN'

# Keep the source annotations in 'original_celltype_*'
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']
adata.obs['original_celltype_3'] = 'NaN'

In [126]:
# Store the expression matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [127]:
# Make the cell (observation) names unique.
adata.obs_names_make_unique()

# 12-BoneMarrow

## 12-1-BoneMarrow-VanGalen-2019

In [4]:
# Directory holding the raw VanGalen 2019 files.
# NOTE(review): absolute, machine-specific path; adjust to the local data location.
path = '/storage/groups/ml01/workspace/moritz.thomas/VanGalen2019/raw_data/'

In [5]:
# File name stems of the expression matrices and of the matching annotation tables,
# in the same sample order. Note the accession prefixes differ between the two lists
# for BM3 (GSM3587998 vs GSM3587999) and BM4 (GSM3588000 vs GSM3588001).
matrices = ['GSM3587996_BM1','GSM3587997_BM2','GSM3587998_BM3','GSM3588000_BM4',
            'GSM3588002_BM5-34p','GSM3588003_BM5-34p38n']
annotations = ['GSM3587996_BM1','GSM3587997_BM2','GSM3587999_BM3','GSM3588001_BM4',
               'GSM3588002_BM5-34p','GSM3588003_BM5-34p38n']
# File name suffixes appended to the stems above
matrix_file_end = '.dem.txt'
anno_file_end = '.anno.txt'

In [6]:
# Load first data set & annotation
# pop(0) removes the first entry, so the loop further down only sees the remaining samples.
# Re-running this cell without re-creating the lists would therefore skip a sample.
sample = matrices.pop(0)
# Build the file names from `path` (the raw-data directory defined for this dataset);
# `writepath` is not defined anywhere in this section.
data_file = path+sample+matrix_file_end

anno = annotations.pop(0)
anno_file = path+anno+anno_file_end

In [9]:
# Load the tab-delimited matrix as float32 and transpose it.
# NOTE(review): presumably the file is genes x cells, so the transpose yields cells x genes.
adata=sc.read_text(data_file, delimiter='\t', dtype='float32')
adata=adata.transpose()

In [10]:
# Load the annotation table, index it by cell barcode (column 'Cell' renamed to 'barcode')
# and use it as the observation metadata.

annotation = pd.read_csv(anno_file, delimiter='\t')
annotation.rename(columns={'Cell':'barcode'}, inplace=True)
annotation.set_index('barcode', inplace=True)

# NOTE(review): replaces .obs wholesale; assumes the annotation rows are in the same
# order as the cells in the matrix (confirm).
adata.obs = annotation
adata.obs['sample'] = 'BM1'

In [None]:
# Loop to load the rest of the data sets and append each one to `adata`.
# `matrices` / `annotations` hold the remaining samples (the first was popped above).

for sample, anno in zip(matrices, annotations):

    # Parse filenames. Built from `path` (the raw-data directory defined for this dataset);
    # `writepath` is not defined anywhere in this section.
    data_file = path+sample+matrix_file_end
    anno_file = path+anno+anno_file_end

    # Load data: tab-delimited matrix as float32, transposed
    adata_tmp = sc.read_text(data_file, delimiter='\t', dtype='float32')
    adata_tmp = adata_tmp.transpose()
    #adata_tmp.X = adata_tmp.X.toarray()

    # Annotate data: annotation table indexed by cell barcode becomes .obs
    annotation_tmp = pd.read_csv(anno_file, delimiter='\t')
    annotation_tmp.rename(columns={'Cell':'barcode'}, inplace=True)
    annotation_tmp.set_index('barcode', inplace=True)
    adata_tmp.obs = annotation_tmp
    # Sample name = part of the file stem after the accession, e.g. 'GSM3587997_BM2' -> 'BM2'
    adata_tmp.obs['sample'] = str(sample).split("_")[1]

    # Concatenate to main adata object. Each call appends a batch suffix to the cell names;
    # the barcode clean-up cells below rely on that naming.
    adata = adata.concatenate(adata_tmp, batch_key='sample_id')

    # The temporary batch column is not needed ('sample' already identifies the origin)
    adata.obs.drop(columns=['sample_id'], inplace=True)
    adata.obs_names_make_unique(join='_')

In [188]:
# Current cell names (including the suffixes added by the repeated concatenation).
barcodes = adata.obs_names

In [189]:
# First 4677 cells: keep the part of the name before the first '-'.
# NOTE(review): 4677 is a hard-coded row offset; presumably the number of BM1-BM4 cells (confirm).
first_barcodes=barcodes[0:4677]
first_barcodes_true = [c.split("-")[0] for c in first_barcodes]

In [190]:
# Cells 4677..7697: keep the part between the first and second '-' and prefix it with 'BM5_'.
# NOTE(review): both bounds are hard-coded; cells beyond row 7698 would be dropped from the
# new index (confirm that 7698 equals the total number of cells).
last_barcodes=barcodes[4677:7698]
last_barcodes_true = [c.split("-")[1] for c in last_barcodes]
last_barcodes_true = ['BM5_' + s for s in last_barcodes_true]

In [191]:
# Cleaned names in the original cell order (first block followed by the BM5 block).
new_barcodes=np.append(first_barcodes_true, last_barcodes_true)

In [194]:
# Replace the cell names with the cleaned barcodes (length must equal the number of cells).
adata.obs.index=new_barcodes

In [213]:
# Tag every cell with the internal dataset identifier.
adata.obs['InternDatasetNumber'] ='12-1-BoneMarrow-VanGalen-2019'

In [215]:
# Store the expression matrix as a SciPy CSR sparse matrix before computing QC metrics.
adata.X = sp.sparse.csr_matrix(adata.X)

In [216]:
# Calculate per-cell QC covariates: total counts, log total counts, number of detected genes.
# On a SciPy sparse matrix .sum(1) returns an (n_cells, 1) np.matrix; flatten it so each
# obs column receives a plain 1-D array.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [None]:
# FILTER PARAMETERS
# Filter out cells with more than 1700 counts or more than 4200 detected genes (in place)
# NOTE(review): if X holds integer counts, a cell with <= 1700 counts cannot have more than
# 1700 detected genes, so max_genes=4200 would never trigger - confirm the thresholds.
sc.pp.filter_cells(adata, max_counts = 1700)
sc.pp.filter_cells(adata, max_genes = 4200)
# Filter out genes detected in fewer than 10 cells
sc.pp.filter_genes(adata, min_cells=10)

In [220]:
# Mitochondrial genes: symbols starting with 'MT-'
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (later cells divide adata.X in place)
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# Per-cell mitochondrial fraction = summed mitochondrial counts / total counts ('n_counts')
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum/adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep only cells below 20% mitochondrial fraction.
# .copy() turns the subset into a real AnnData object: without it `adata` is a view and the
# later `adata.obs['size_factors'] = ...` assignment would write into a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [222]:
# Work on a copy for the preliminary clustering so `adata` itself stays un-normalized.
adata_pp=adata.copy()

In [223]:
# Coarse clustering of the copy; the resulting 'groups' labels are passed to scran as clusters.
# Steps: scale each cell to 1e6 total counts, log1p, 15-component PCA, neighbor graph,
# Louvain clustering at resolution 0.5.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 8 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [224]:
# Inputs for scran: cluster label per cell and the transposed expression matrix
# (adata.X is cells x genes, so data_mat is genes x cells).
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [225]:
%%R -i data_mat -i input_groups -o size_factors
# R cell: data_mat and input_groups are passed in from Python (-i), size_factors is returned (-o).
# scran estimates per-cell size factors, pooling cells within the supplied clusters.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [226]:
# Delete adata_pp (the clustering copy is no longer needed) and store the scran size factors per cell.
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [227]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [228]:
# Keep a copy of the current adata.X (not yet size-factor normalized) in the 'counts' layer
# for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [229]:
# Freeze the current (pre-normalization) state in .raw.
adata.raw = adata

In [230]:
# Normalize: divide each cell (row) by its scran size factor in place, then log1p-transform.
# [:, None] turns the size factors into a column so they broadcast across genes.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [231]:
# Store the normalized, log-transformed matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [232]:
# Flag the 4000 most variable genes (cell_ranger flavor); subset=False keeps all genes in adata.
# log=False because the data were already log-transformed above.
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [233]:
# Calculate the visualizations: 50-component PCA on the highly variable genes,
# neighbor graph and UMAP embedding (default parameters).
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:16)


In [237]:
#sc.pl.umap(adata, color='celltype', palette=palette)

In [242]:
# Make annotations consistent across datasets: start 'celltype' from the dataset's own
# 'CellType' labels and display the categories that need to be mapped.
adata.obs['celltype'] = adata.obs['CellType']
adata.obs['celltype'].cat.categories

Index(['B', 'CTL', 'GMP', 'Mono', 'NK', 'Plasma', 'ProB', 'ProMono', 'Prog',
       'T', 'cDC', 'earlyEry', 'lateEry', 'HSC', 'pDC'],
      dtype='object')

In [243]:
# Working copy of the labels as a Categorical with an explicit category list.
# Values not contained in `categories` become NaN.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['B', 'CTL', 'GMP', 'Mono', 'NK', 'Plasma', 'ProB', 'ProMono', 'Prog',
       'T', 'cDC', 'earlyEry', 'lateEry', 'HSC', 'pDC'])

In [244]:
# Collapse fine-grained labels into their parent cell type.
# Key = label that is kept, value = labels folded into it (applied in the listed order).
merged_labels = {
    'B': ['ProB'],
    'T': ['CTL'],
    'Mono': ['ProMono'],
    'lateEry': ['earlyEry'],
    'cDC': ['pDC'],
}
for kept_label, folded_labels in merged_labels.items():
    # Boolean mask of the cells carrying one of the labels to fold
    ix = np.isin(ref_cluster, folded_labels)
    ref_cluster[ix] = kept_label

In [245]:
# Write the merged labels back, keeping only the categories that remain after merging.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['B', 'GMP', 'Mono', 'NK', 'Plasma', 'Prog',
       'T', 'cDC', 'lateEry', 'HSC'])

In [246]:
# Display the remaining categories; their order determines the positional rename that follows.
adata.obs['celltype'].cat.categories

Index(['B', 'GMP', 'Mono', 'NK', 'Plasma', 'Prog', 'T', 'cDC', 'lateEry',
       'HSC'],
      dtype='object')

In [247]:
# Rename to the cross-dataset cell type names. New names are matched by position to the
# category order shown above ('B' -> 'B cells', 'GMP' -> 'Granulocyte-monocyte progenitor cells', ...).
adata.rename_categories('celltype', ['B cells', 'Granulocyte-monocyte progenitor cells', 'Monocytes', 'NK cells', 'Plasma cells', 'Multipotent progenitor cells',
                                          'T cells', 'Dendritic cells', 'Erythroid cells', 'Hematopoietic stem cells'])

In [251]:
# Write the shared metadata columns used across datasets.
# Fields this dataset does not provide are filled with the literal string 'NaN' (not a float NaN).

# Dataset / tissue identifiers
adata.obs['Organ'] = 'BoneMarrow'
adata.obs['Organ_Specific'] = 'BoneMarrow'
# NOTE(review): 'XX_BoneMarrow' looks like an unfilled placeholder (other datasets use
# '<Author>_<Organ>'); left unchanged because later cells may select on this value.
adata.obs['Dataset'] = 'XX_BoneMarrow'
adata.obs['InternDatasetNumber'] ='12-1-BoneMarrow-VanGalen-2019'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# Cell type annotation (the self-assignment keeps the already harmonised column as is)
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] ='NaN'
adata.obs['Malignant'] = 'NonMalignant'

# Donor information
# NOTE(review): every cell gets the same placeholder patient label; the sample of origin
# (BM1 ... BM5-34p38n) is kept in 'Patient_Number'.
adata.obs['Patient'] = 'NameXX-Donor1'
adata.obs['Patient_Number'] = adata.obs['sample']
adata.obs['age'] = 'NaN'
adata.obs['sex'] = 'NaN'
# 'ethnicity' is set by every other dataset; add it here so the column exists in all of them.
adata.obs['ethnicity'] = 'NaN'
adata.obs['health_status'] = 'NaN'

# Keep the source annotations in 'original_celltype_*'
adata.obs['original_celltype_1'] = adata.obs['CellType_Atlas']
adata.obs['original_celltype_2'] = adata.obs['CellType']
adata.obs['original_celltype_3'] = 'NaN'

In [253]:
# Store the expression matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [254]:
# Make the cell (observation) names unique.
adata.obs_names_make_unique()

## 12-3-BoneMarrow-Pisco-2022

## 12-3-BoneMarrow-Pisco-2022 (human) - Pisco Multiple Organs

In [179]:
# Select the cells of adata_pisco (loaded elsewhere in the notebook) whose 'tissue'
# is 'bone marrow' and copy them into an independent AnnData object.
ix=np.isin(adata_pisco.obs['tissue'],['bone marrow'])
adata=adata_pisco[ix].copy()

In [184]:
# Calculate per-cell QC covariates: total counts, log total counts, number of detected genes.
# On a SciPy sparse matrix .sum(1) returns an (n_cells, 1) np.matrix; flatten it so each
# obs column receives a plain 1-D array (a no-op if X is already a dense ndarray).
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [188]:
# FILTER PARAMETERS
# Filter out cells with more than 15000 counts or more than 10000 detected genes (in place)
sc.pp.filter_cells(adata, max_counts = 15000)
sc.pp.filter_cells(adata, max_genes = 10000)
# Filter out genes detected in fewer than 30 cells
sc.pp.filter_genes(adata, min_cells=30) #500 works # # 450(30494) #400(29837) #300(28268) not

filtered out 1 cells that have more than 15000 counts
filtered out 2 cells that have more than 10000 genes expressed
filtered out 36049 genes that are detected in less than 30 cells


In [190]:
# Mitochondrial genes: symbols starting with 'MT-'
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (later cells divide adata.X in place)
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# Per-cell mitochondrial fraction = summed mitochondrial counts / total counts ('n_counts')
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum/adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep only cells below 20% mitochondrial fraction.
# .copy() turns the subset into a real AnnData object: without it `adata` is a view and the
# later `adata.obs['size_factors'] = ...` assignment would write into a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [191]:
# Work on a copy for the preliminary clustering so `adata` itself stays un-normalized.
adata_pp=adata.copy()

In [192]:
# Coarse clustering of the copy; the resulting 'groups' labels are passed to scran as clusters.
# Steps: scale each cell to 1e6 total counts, log1p, 15-component PCA, neighbor graph,
# Louvain clustering at resolution 0.5.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 22 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [193]:
# Inputs for scran: cluster label per cell and the transposed expression matrix
# (adata.X is cells x genes, so data_mat is genes x cells).
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [194]:
%%R -i data_mat -i input_groups -o size_factors
# R cell: data_mat and input_groups are passed in from Python (-i), size_factors is returned (-o).
# scran estimates per-cell size factors, pooling cells within the supplied clusters.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [195]:
# The clustering copy is no longer needed; free it and store the scran size factors per cell.
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [196]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [197]:
# Keep a copy of the current adata.X (not yet size-factor normalized) in the 'counts' layer
# for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [198]:
# Freeze the current (pre-normalization) state in .raw.
adata.raw = adata

In [199]:
# Normalize: divide each cell (row) by its scran size factor in place, then log1p-transform.
# [:, None] turns the size factors into a column so they broadcast across genes.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [200]:
# Flag the 4000 most variable genes (cell_ranger flavor); subset=False keeps all genes in adata.
# log=False because the data were already log-transformed in the previous cell.
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:01)


In [201]:
# Calculate the visualizations: 50-component PCA on the highly variable genes,
# neighbor graph and UMAP embedding (default parameters).
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)


In [203]:
# Make annotations consistent across datasets: start 'celltype' from the dataset's own
# 'cell_type' labels.
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [204]:
# Display the categories that need to be mapped to the cross-dataset names.
adata.obs['celltype'].cat.categories

Index(['CD4-positive, alpha-beta T cell', 'CD8-positive, alpha-beta T cell',
       'common myeloid progenitor', 'erythrocyte', 'erythroid progenitor cell',
       'granulocyte', 'hematopoietic stem cell', 'macrophage',
       'mature NK T cell', 'memory B cell', 'monocyte', 'naive B cell',
       'neutrophil', 'plasma cell', 'plasmablast'],
      dtype='object')

In [205]:
# Working copy of the labels as a Categorical with an explicit category list.
# Values not contained in `categories` become NaN.
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['CD4-positive, alpha-beta T cell', 'CD8-positive, alpha-beta T cell',
       'common myeloid progenitor', 'erythrocyte', 'erythroid progenitor cell',
       'granulocyte', 'hematopoietic stem cell', 'macrophage',
       'mature NK T cell', 'memory B cell', 'monocyte', 'naive B cell',
       'neutrophil', 'plasma cell', 'plasmablast'])

In [206]:
# Collapse fine-grained labels into their parent cell type.
# Key = label that is kept, value = labels folded into it (applied in the listed order).
merged_labels = {
    'CD4-positive, alpha-beta T cell': ['CD8-positive, alpha-beta T cell'],
    'memory B cell': ['naive B cell'],
    'plasma cell': ['plasmablast'],
}
for kept_label, folded_labels in merged_labels.items():
    # Boolean mask of the cells carrying one of the labels to fold
    ix = np.isin(ref_cluster, folded_labels)
    ref_cluster[ix] = kept_label

In [207]:
# Write the merged labels back, keeping only the categories that remain after merging.
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['CD4-positive, alpha-beta T cell',
       'common myeloid progenitor', 'erythrocyte', 'erythroid progenitor cell',
       'granulocyte', 'hematopoietic stem cell', 'macrophage',
       'mature NK T cell', 'memory B cell', 'monocyte',
       'neutrophil', 'plasma cell'])

In [208]:
# Rename to the cross-dataset cell type names. New names are matched by position to the
# category order set in the previous cell (e.g. 'mature NK T cell' -> 'NK cells',
# 'memory B cell' -> 'B cells').
adata.rename_categories('celltype', ['T cells',
       'Common myeloid progenitor cells', 'Erythroid cells', 'Erythroid progenitor cells',
       'Granulocyte cells', 'Hematopoietic stem cells', 'Macrophages',
       'NK cells', 'B cells', 'Monocytes',
       'Neutrophils', 'Plasma cells'])

In [211]:
# Display the donor IDs present in this subset.
adata.obs['donor'].cat.categories

Index(['TSP2', 'TSP11', 'TSP13', 'TSP14'], dtype='object')

In [212]:
# Harmonise the tissue label: 'bone marrow' -> 'BoneMarrow' (positional rename).
adata.obs['tissue'].cat.categories
# NOTE(review): ref_cluster is assigned here but the rename below does not use it.
ref_cluster=pd.Categorical(adata.obs['tissue'],
                           categories=['bone marrow'])
adata.rename_categories('tissue', ['BoneMarrow'])

In [213]:
# Harmonise the sex labels: capitalise 'female' / 'male' (positional rename).
adata.obs['sex'].cat.categories
# NOTE(review): ref_cluster is assigned here but the rename below does not use it.
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['female', 'male'])
adata.rename_categories('sex', ['Female', 'Male'])

In [214]:
# Harmonise the ethnicity labels to the hyphenated spelling (positional rename).
adata.obs['ethnicity'].cat.categories
# NOTE(review): ref_cluster is assigned here but the rename below does not use it.
ref_cluster=pd.Categorical(adata.obs['ethnicity'],
                           categories=['African American or Afro-Caribbean', 'Asian', 'European',
       'Hispanic or Latin American'])
adata.rename_categories('ethnicity', ['African-American or Afro-Caribbean', 'Asian', 'European', 'Hispanic or Latin-American'])

In [215]:
# Harmonise donor age: replace the "<N>-year-old human stage" labels with the bare age string
# (positional rename).
adata.obs['development_stage'].cat.categories
# NOTE(review): ref_cluster is assigned here but the rename below does not use it.
ref_cluster=pd.Categorical(adata.obs['development_stage'],
                           categories=['22-year-old human stage', '42-year-old human stage',
       '59-year-old human stage', '61-year-old human stage'])
adata.rename_categories('development_stage',['22', '42',
       '59', '61'])

In [216]:
# Donor labels: the new names equal the listed old ones, so this rename leaves them unchanged.
adata.obs['donor'].cat.categories
# NOTE(review): ref_cluster is assigned here but the rename below does not use it.
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['TSP2', 'TSP11', 'TSP13', 'TSP14'])
adata.rename_categories('donor', ['TSP2', 'TSP11', 'TSP13', 'TSP14'])

In [None]:
# Write the shared metadata columns used across datasets.
# Fields this dataset does not provide are filled with the literal string 'NaN' (not a float NaN).

# Dataset / tissue identifiers
adata.obs['Organ'] = 'BoneMarrow'
adata.obs['Organ_Specific'] = adata.obs['tissue']
adata.obs['Dataset'] = 'Pisco_BoneMarrow'
adata.obs['InternDatasetNumber'] = '12-3-BoneMarrow-Pisco-2022'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# Cell type annotation (the self-assignment keeps the already harmonised column as is)
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

# Donor information
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['development_stage']
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = adata.obs['ethnicity']
adata.obs['health_status'] = 'NaN'

# Keep the source annotation in 'original_celltype_*'
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = 'NaN'
adata.obs['original_celltype_3'] = 'NaN'

In [219]:
# Store the expression matrix as a SciPy CSR sparse matrix.
adata.X = sp.sparse.csr_matrix(adata.X)

In [220]:
# Make the cell (observation) names unique.
adata.obs_names_make_unique()

## 12-4-BoneMarrow-Han-2020

## 12-4-BoneMarrow-Han-2020-RubenBrabenec_HealthyProject

In [317]:
# Select the cells of adata_han (loaded elsewhere in the notebook) whose 'sub_tissue'
# is 'AdultBoneMarrow' and copy them into an independent AnnData object.
ix=np.isin(adata_han.obs['sub_tissue'],['AdultBoneMarrow'])
adata=adata_han[ix].copy()

In [318]:
# Tag every cell with the internal dataset identifier.
adata.obs['InternDatasetNumber'] ='12-4-BoneMarrow-Han-2020'

In [322]:
# Calculate per-cell QC covariates: total counts, log total counts, number of detected genes.
# On a SciPy sparse matrix .sum(1) returns an (n_cells, 1) np.matrix; flatten it so each
# obs column receives a plain 1-D array (a no-op if X is already a dense ndarray).
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [325]:
# FILTER PARAMETERS
# Filter out cells with more than 4200 counts or more than 2200 detected genes (in place)
sc.pp.filter_cells(adata, max_counts = 4200)
sc.pp.filter_cells(adata, max_genes = 2200)
# Filter out genes detected in fewer than 10 cells
sc.pp.filter_genes(adata, min_cells=10) #500 works # # 450(30494) #400(29837) #300(28268) not

filtered out 37 cells that have more than 4200 counts
filtered out 14569 genes that are detected in less than 10 cells


In [326]:
# Mitochondrial genes: symbols starting with 'MT-'
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# Convert the expression matrix to dense (later cells divide adata.X in place)
adata.X=sp.sparse.csr_matrix.todense(adata.X)

# Per-cell mitochondrial fraction = summed mitochondrial counts / total counts ('n_counts')
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum/adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep only cells below 20% mitochondrial fraction.
# .copy() turns the subset into a real AnnData object: without it `adata` is a view and the
# later `adata.obs['size_factors'] = ...` assignment would write into a view.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [327]:
# Work on a copy for the preliminary clustering so `adata` itself stays un-normalized.
adata_pp=adata.copy()

In [328]:
# Coarse clustering of the copy; the resulting 'groups' labels are passed to scran as clusters.
# Steps: scale each cell to 1e6 total counts, log1p, 15-component PCA, neighbor graph,
# Louvain clustering at resolution 0.5.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 12 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [329]:
# Inputs for scran: cluster label per cell and the transposed expression matrix
# (adata.X is cells x genes, so data_mat is genes x cells).
input_groups = adata_pp.obs['groups']
data_mat = adata.X.T

In [330]:
%%R -i data_mat -i input_groups -o size_factors
# R cell: data_mat and input_groups are passed in from Python (-i), size_factors is returned (-o).
# scran estimates per-cell size factors, pooling cells within the supplied clusters.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [331]:
# The clustering copy is no longer needed; free it and store the scran size factors per cell.
del adata_pp
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [332]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [333]:
# Keep a copy of the current adata.X (not yet size-factor normalized) in the 'counts' layer
# for downstream analysis.
adata.layers["counts"] = adata.X.copy()

In [334]:
# Freeze the current (pre-normalization) state in .raw.
adata.raw = adata

In [335]:
#Normalize data
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [336]:
# extract highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [337]:
# Calculate the visualizations
# PCA restricted to the genes flagged as highly variable (50 components, ARPACK solver),
# then the neighbourhood graph and the UMAP embedding computed from it.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:17)


In [339]:
# make consistent annotations across datasets
# Start the harmonised 'celltype' column from a copy of the dataset-specific labels.
adata.obs['celltype']=adata.obs['celltype_specific'].copy()
# Display the available labels (cell output) as a reference for the merging below.
adata.obs['celltype'].cat.categories

Index(['B cell', 'B cell (Centrocyte)', 'B cell (Plasmocyte)',
       'Dendritic cell', 'Erythroid cell', 'Erythroid progenitor cell', 'HSPC',
       'M2 Macrophage', 'Monocyte', 'Monocyte/DC progenitor',
       'Neutrophil_DEFA3 high', 'Neutrophil_LTF high', 'Neutrophil_PRTN3 high',
       'Neutrophil_S100A9 high', 'Neutrophil_S100A12 high', 'T cell'],
      dtype='object')

In [340]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['B cell', 'B cell (Centrocyte)', 'B cell (Plasmocyte)',
       'Dendritic cell', 'Erythroid cell', 'Erythroid progenitor cell', 'HSPC',
       'M2 Macrophage', 'Monocyte', 'Monocyte/DC progenitor',
       'Neutrophil_DEFA3 high', 'Neutrophil_LTF high', 'Neutrophil_PRTN3 high',
       'Neutrophil_S100A9 high', 'Neutrophil_S100A12 high', 'T cell'])

In [341]:
ix=np.isin(ref_cluster,[ 'B cell (Centrocyte)'])
ref_cluster[ix]='B cell (Plasmocyte)'

ix=np.isin(ref_cluster,[ 'Monocyte/DC progenitor'])
ref_cluster[ix]='Monocyte'

ix=np.isin(ref_cluster,[ 'Neutrophil_LTF high', 'Neutrophil_PRTN3 high','Neutrophil_S100A9 high', 'Neutrophil_S100A12 high'])
ref_cluster[ix]='Neutrophil_DEFA3 high'

In [342]:
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['B cell','B cell (Plasmocyte)',
       'Dendritic cell', 'Erythroid cell', 'Erythroid progenitor cell', 'HSPC',
       'M2 Macrophage', 'Monocyte', 
       'Neutrophil_DEFA3 high', 'T cell'])

In [343]:
adata.rename_categories('celltype', ['B cells', 'Plasma cells',
       'Dendritic cells', 'Erythroid cells', 'Erythroid progenitor cells', 'Hematopoietic stem cells',
       'Macrophages', 'Monocytes',
       'Neutrophils', 'T cells'])

In [347]:
# Rename the categories of several obs columns to the naming used across datasets.
# NOTE(review): in each cell below the bare `.cat.categories` expression has no effect (it is not the
# last line of the cell), and `ref_cluster` is rebuilt but not used by the rename that follows.
# rename_categories assigns the new names by position, so each list must follow the existing category order.
adata.obs['sub_tissue'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sub_tissue'],
                           categories=['AdultBoneMarrow'])
adata.rename_categories('sub_tissue', ['BoneMarrow'])

In [348]:
adata.obs['sex'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['female'])
adata.rename_categories('sex', ['Female'])

In [349]:
adata.obs['age'].cat.categories
ref_cluster=pd.Categorical(adata.obs['age'],
                           categories=['49Y', '60Y'])
# Drop the 'Y' suffix from the age labels.
adata.rename_categories('age',['49', '60'])

In [350]:
adata.obs['donor'].cat.categories
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['Donor27', 'Donor28'])
# Prefix donor ids with the dataset name.
adata.rename_categories('donor', ['Han-Donor27', 'Han-Donor28'])

In [353]:
# Convert X to CSR sparse format before writing.
adata.X = sp.sparse.csr_matrix(adata.X)

In [354]:
# Save the processed object under the write directory.
adata.write(writepath + '12-4-BoneMarrow-Han-2020-processed.h5ad')

## 12-5-BoneMarrow_ImmuneCells-Teichmann-2022

In [152]:
# Restrict adata_analysis to cells whose 'tissue_major' is 'BoneMarrow' and detach the subset with .copy().
ix=np.isin(adata_analysis.obs['tissue_major'], ['BoneMarrow'])
adata=adata_analysis[ix].copy()

In [156]:
# Fill the obs columns that the other dataset sections of this notebook also populate.
# Constant strings are written to every cell; the literal string 'NaN' is used where
# no value is set for this dataset (presumably "not available" -- confirm with downstream code).
adata.obs['Organ'] =  adata.obs['tissue_major']
adata.obs['Organ_Specific'] = adata.obs['sub_tissue']
# Self-assignment: the column is kept as is.
adata.obs['Dataset'] = adata.obs['Dataset']
adata.obs['InternDatasetNumber'] = '12-5-BoneMarrow_ImmuneCells-Teichmann-2022'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# Self-assignment: 'celltype' is kept as is.
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['development_stage']
# Self-assignments: 'sex' and 'ethnicity' are kept as is.
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = adata.obs['ethnicity']
adata.obs['health_status'] = 'NaN'

# Keep the source annotations next to the harmonised 'celltype' column.
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']
adata.obs['original_celltype_3'] = 'NaN'

In [157]:
# Convert X to CSR sparse format before writing.
adata.X = sp.sparse.csr_matrix(adata.X)

In [158]:
# Make cell names (obs index) unique.
adata.obs_names_make_unique()

In [159]:
adata.write(writepath + '12-5-BoneMarrow_ImmuneCells-Teichmann-2022-processed.h5ad')

# 13-Skin

## 13-1-Skin-Cheng-2018

In [111]:
# here we use sfaira to import available datasets with annotations
# note that the following steps may change depending on the current sfaira version and the path to your repository

# Placeholder path: point this at the local sfaira data repository.
datadir = '/path/to/repo/'

ds = sfaira.data.human.DatasetGroupSkin(path=datadir)  # This links all data sets available

In [112]:
# List the dataset ids available in the skin group.
ds.ids

['human_skin_2018_10x_cheng_001_10.1016/j.celrep.2018.09.006',
 'human_skin_2020_microwell_han_001_10.1038/s41586-020-2157-4',
 'human_skin_2020_microwell_han_002_10.1038/s41586-020-2157-4']

In [113]:
# pick first one
# ds.ids[0] is the Cheng 2018 10x dataset according to the id list printed above.
idx = ds.ids[0]

In [114]:
# Echo the chosen id.
idx

'human_skin_2018_10x_cheng_001_10.1016/j.celrep.2018.09.006'

In [115]:
# Load the selected dataset through sfaira.
ds.datasets[idx].load()



In [117]:
# The loaded AnnData object is exposed as `.adata` on the dataset.
adata=ds.datasets[idx].adata

In [121]:
# Re-index the genes by the values of adata.var.names (presumably the 'names' column of var -- confirm).
adata.var.index=np.array(adata.var.names)

In [267]:
# Internal dataset identifier used in this notebook.
adata.obs['InternDatasetNumber'] ='13-1-Skin-Cheng-2018'

In [270]:
# Ensure X is a CSR sparse matrix for the QC steps below.
adata.X = sp.sparse.csr_matrix(adata.X)

In [271]:
#calculate QC covariates
# Summing a scipy sparse matrix along axis 1 returns an (n_cells, 1) np.matrix.
# Flatten it to a 1-D array so every obs column holds one plain value per cell,
# independent of how the installed pandas version treats 2-D input.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
# Number of genes with a non-zero count per cell.
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [274]:
# FILTER PARAMETERS
# Filter out cells -- both cell-level filters are disabled for this dataset.
#sc.pp.filter_cells(adata, max_counts = 1700)
#sc.pp.filter_cells(adata, max_genes = 4200)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20)

filtered out 424 genes that are detected in less than 20 cells


In [275]:
# get mt genes
# Boolean mask over adata.var_names flagging genes whose symbol starts with 'MT-'.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense: the in-place size-factor division and the matrix handed to R
# further down operate on a dense X. Guarded so that re-running the cell is harmless.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# manually define mt score
# Per-cell sum over the MT genes, flattened to 1-D, divided by the total counts per cell.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 20% mito fraction
# .copy() turns the slice into an independent object; a bare slice is only a view,
# and the later write to adata.obs would trigger an implicit copy with a warning.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()


These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.

These matrices should now be stored in the .obsp attribute.
This slicing behavior will be removed in anndata 0.8.


In [277]:
# Work on a separate copy so the coarse normalisation below never modifies `adata` itself.
adata_pp=adata.copy()

In [278]:
#Perform a clustering for scran normalization in clusters
# Quick per-cell normalisation (scaled to 1e6) + log1p; used only to derive the clusters.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
# 15 principal components feed the neighbourhood graph.
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Louvain cluster labels are written to adata_pp.obs['groups'].
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:07): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:02)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:10)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 17 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:11)


In [279]:
#Preprocess variables for scran normalization
# Cluster labels from the pre-clustering; passed to scran as `clusters`.
input_groups = adata_pp.obs['groups']
# Transpose of adata.X (taken from `adata`, not from the normalised adata_pp copy).
data_mat = adata.X.T

In [280]:
%%R -i data_mat -i input_groups -o size_factors
# rpy2 cell magic: -i passes the two Python objects into R, -o returns size_factors to Python.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [281]:
#Delete adata_pp
# The pre-clustered copy was only needed for its cluster labels.
del adata_pp
# Store the scran size factors as a per-cell annotation.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [282]:
# Convert string-valued annotation columns to categoricals.
adata.strings_to_categoricals()

In [283]:
# Keep a copy of the current (not yet size-factor-normalised) adata.X for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [284]:
# Store the current state of adata in .raw before X is normalised in the next cell.
adata.raw = adata

In [285]:
#Normalize data
# Divide each cell (row) by its size factor in place, then log1p-transform.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [286]:
# Back to CSR sparse storage after the dense in-place normalisation.
adata.X = sp.sparse.csr_matrix(adata.X)

In [287]:
# extract highly variable genes
# subset=False keeps all genes and only flags the top 4000; log=False is passed because X
# was already log1p-transformed above (NOTE(review): confirm for the scanpy version in use).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [288]:
# Calculate the visualizations
# PCA restricted to the genes flagged as highly variable (50 components, ARPACK solver),
# then the neighbourhood graph and the UMAP embedding computed from it.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:18)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:15)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:01:00)


In [290]:
# make consistent annotations across datasets
adata.obs['celltype']=adata.obs['cell_ontology_class'].copy()

In [291]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Basal cell 1', 'Basal cell 2', 'WNT1', 'channel', 'folicular',
       'granular', 'immune', 'melanocyte', 'mitotic', 'spinous'])

In [292]:
ix=np.isin(ref_cluster,['Basal cell 2'])
ref_cluster[ix]='Basal cell 1'

ix=np.isin(ref_cluster,['channel', 'folicular','granular', 'mitotic', 'spinous'])
ref_cluster[ix]='WNT1'

In [293]:
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['Basal cell 1', 'WNT1', 'immune', 'melanocyte'])

In [294]:
adata.rename_categories('celltype', ['Basal cells', 'Keratinocytes', 'Monocytes', 'Melanocytes'])

In [298]:
# NOTE(review): the bare `.cat.categories` expression has no effect (not the last line of the cell) and
# `ref_cluster` is rebuilt but not used by the rename. rename_categories assigns names by position,
# so the new list must follow the existing category order.
adata.obs['tissue'].cat.categories
ref_cluster=pd.Categorical(adata.obs['tissue'],
                           categories=['foreskin', 'scalp', 'trunk'])
adata.rename_categories('tissue', ['Skin_Foreskin', 'Skin_Scalp', 'Skin_Trunk'])

In [299]:
# Donor labels are derived from the 'sample' column and then replaced, by position, with anonymised ids.
adata.obs['donor'] = adata.obs['sample']
# NOTE(review): `ref_cluster` is again unused; the list documents the expected order of the sample categories.
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['abd4', 'br41epi', 'br53epi', 'fore8', 'fore9', 'fore12', 's11',
       'scalp26', 'scalp32'])
adata.rename_categories('donor', ['Cheng_Skin-Donor1', 'Cheng_Skin-Donor2', 'Cheng_Skin-Donor3', 'Cheng_Skin-Donor4', 'Cheng_Skin-Donor5', 'Cheng_Skin-Donor6', 'Cheng_Skin-Donor7',
       'Cheng_Skin-Donor8', 'Cheng_Skin-Donor9'])

In [300]:
# Fill the obs columns that the other dataset sections of this notebook also populate.
# Constant strings are written to every cell; the literal string 'NaN' is used where
# no value is set for this dataset (presumably "not available" -- confirm with downstream code).
adata.obs['Organ'] = 'Skin'
adata.obs['Organ_Specific'] = adata.obs['tissue']
adata.obs['Dataset'] = 'Cheng_Skin'
adata.obs['InternDatasetNumber'] = '13-1-Skin-Cheng-2018'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# Self-assignment: 'celltype' is kept as is.
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] ='NaN'
adata.obs['Malignant'] = 'NonMalignant'

# 'Patient' carries the renamed donor ids, 'Patient_Number' the original sample labels.
adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = adata.obs['sample']
adata.obs['age'] = 'NaN'
adata.obs['sex'] = 'NaN'
adata.obs['health_status'] = 'NaN'

# Keep the source annotations next to the harmonised 'celltype' column.
adata.obs['original_celltype_1'] = adata.obs['cell_ontology_class']
adata.obs['original_celltype_2'] = adata.obs['CellType']
adata.obs['original_celltype_3'] = 'NaN'

In [302]:
# Convert X to CSR sparse format before writing.
adata.X = sp.sparse.csr_matrix(adata.X)

In [303]:
# Make cell names (obs index) unique.
adata.obs_names_make_unique()

In [304]:
# Save the processed object under the write directory.
adata.write(writepath + '13-1-Skin-Cheng-2018-processed.h5ad')

## 13-2-Skin-Pisco-2022

In [519]:
# Select the cells of adata_pisco annotated with one of the three skin tissues and detach them with .copy().
ix=np.isin(adata_pisco.obs['tissue'],['skin of abdomen', 'skin of body', 'skin of chest']) 
adata=adata_pisco[ix].copy()

In [524]:
#calculate QC covariates
# Row sums of a scipy sparse matrix come back as an (n_cells, 1) np.matrix.
# Flatten to 1-D (also a no-op for dense input) so every obs column holds one plain
# value per cell, independent of how the installed pandas version treats 2-D input.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
# Number of genes with a non-zero count per cell.
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [528]:
# FILTER PARAMETERS
#Filter out cells
# Upper bounds on total counts and on detected genes per cell.
sc.pp.filter_cells(adata, max_counts = 10000)
sc.pp.filter_cells(adata, max_genes = 7000)
# Min 20 cells - filters out low count genes
# NOTE(review): the trailing comment looks like leftover notes from trying other min_cells thresholds.
sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not

filtered out 1 cells that have more than 10000 counts
filtered out 1 cells that have more than 7000 genes expressed
filtered out 37950 genes that are detected in less than 20 cells


In [530]:
# get mt genes
# Boolean mask over adata.var_names flagging genes whose symbol starts with 'MT-'.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense: the in-place size-factor division and the matrix handed to R
# further down operate on a dense X. Guarded so that re-running the cell is harmless.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# manually define mt score
# Per-cell sum over the MT genes, flattened to 1-D, divided by the total counts per cell.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 20% mito fraction
# .copy() turns the slice into an independent object; a bare slice is only a view,
# and the later write to adata.obs would trigger an implicit copy with a warning.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [531]:
# Work on a separate copy so the coarse normalisation below never modifies `adata` itself.
adata_pp=adata.copy()

In [532]:
#Perform a clustering for scran normalization in clusters
# Quick per-cell normalisation (scaled to 1e6) + log1p; used only to derive the clusters.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
# 15 principal components feed the neighbourhood graph.
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Louvain cluster labels are written to adata_pp.obs['groups'].
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 10 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [533]:
#Preprocess variables for scran normalization
# Cluster labels from the pre-clustering; passed to scran as `clusters`.
input_groups = adata_pp.obs['groups']
# Transpose of adata.X (taken from `adata`, not from the normalised adata_pp copy).
data_mat = adata.X.T

In [534]:
%%R -i data_mat -i input_groups -o size_factors
# rpy2 cell magic: -i passes the two Python objects into R, -o returns size_factors to Python.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [535]:
#Delete adata_pp
# The pre-clustered copy was only needed for its cluster labels.
del adata_pp
# Store the scran size factors as a per-cell annotation.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [536]:
# Convert string-valued annotation columns to categoricals.
adata.strings_to_categoricals()

In [537]:
# Keep a copy of the current (not yet size-factor-normalised) adata.X for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [538]:
# Store the current state of adata in .raw before X is normalised in the next cell.
adata.raw = adata

In [539]:
#Normalize data
# Divide each cell (row) by its size factor in place, then log1p-transform.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [540]:
# extract highly variable genes
# subset=False keeps all genes and only flags the top 4000; log=False is passed because X
# was already log1p-transformed in the previous cell (NOTE(review): confirm for the scanpy version in use).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:01)


In [541]:
# Calculate the visualizations
# PCA restricted to the genes flagged as highly variable (50 components, ARPACK solver),
# then the neighbourhood graph and the UMAP embedding computed from it.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:18)


In [542]:
# Optional check of the source annotation on the UMAP (disabled).
#sc.pl.umap(adata, color='cell_type')

In [543]:
# make consistent annotations across datasets
# Start the harmonised 'celltype' column from a copy of the source 'cell_type' labels.
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [544]:
# Display the available labels as a reference for the merging below.
adata.obs['celltype'].cat.categories

Index(['CD141-positive myeloid dendritic cell',
       'CD1c-positive myeloid dendritic cell', 'CD4-positive helper T cell',
       'CD4-positive, alpha-beta memory T cell',
       'CD8-positive, alpha-beta cytotoxic T cell',
       'CD8-positive, alpha-beta memory T cell', 'Langerhans cell', 'T cell',
       'cell of skeletal muscle', 'endothelial cell', 'epithelial cell',
       'macrophage', 'mast cell', 'mature NK T cell', 'melanocyte',
       'memory B cell', 'muscle cell', 'naive B cell',
       'naive thymus-derived CD4-positive, alpha-beta T cell',
       'naive thymus-derived CD8-positive, alpha-beta T cell', 'plasma cell',
       'regulatory T cell', 'smooth muscle cell', 'stromal cell'],
      dtype='object')

In [545]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['CD141-positive myeloid dendritic cell',
       'CD1c-positive myeloid dendritic cell', 'CD4-positive helper T cell',
       'CD4-positive, alpha-beta memory T cell',
       'CD8-positive, alpha-beta cytotoxic T cell',
       'CD8-positive, alpha-beta memory T cell', 'Langerhans cell', 'T cell',
       'cell of skeletal muscle', 'endothelial cell', 'epithelial cell',
       'macrophage', 'mast cell', 'mature NK T cell', 'melanocyte',
       'memory B cell', 'muscle cell', 'naive B cell',
       'naive thymus-derived CD4-positive, alpha-beta T cell',
       'naive thymus-derived CD8-positive, alpha-beta T cell', 'plasma cell',
       'regulatory T cell', 'smooth muscle cell', 'stromal cell'])

In [546]:
ix=np.isin(ref_cluster,[  'CD1c-positive myeloid dendritic cell'])
ref_cluster[ix]='CD141-positive myeloid dendritic cell'

ix=np.isin(ref_cluster,[ 'CD4-positive, alpha-beta memory T cell','CD8-positive, alpha-beta cytotoxic T cell', 'CD8-positive, alpha-beta memory T cell', 'T cell', 
                        'naive thymus-derived CD4-positive, alpha-beta T cell', 'naive thymus-derived CD8-positive, alpha-beta T cell',  'regulatory T cell'])
ref_cluster[ix]='CD4-positive helper T cell'

ix=np.isin(ref_cluster,['naive B cell'])
ref_cluster[ix]= 'memory B cell'

ix=np.isin(ref_cluster,['Langerhans cell'])
ref_cluster[ix]= 'macrophage'

ix=np.isin(ref_cluster,['muscle cell'])
ref_cluster[ix]= 'smooth muscle cell'

In [547]:
adata.obs['celltype']=pd.Categorical(ref_cluster,
                                           categories=['CD141-positive myeloid dendritic cell',
      'CD4-positive helper T cell',
       'cell of skeletal muscle', 'endothelial cell', 'epithelial cell',
       'macrophage', 'mast cell', 'mature NK T cell', 'melanocyte',
       'memory B cell',
      'plasma cell',
        'smooth muscle cell', 'stromal cell'])

In [548]:
adata.rename_categories('celltype',['Dendritic cells',
      'T cells',
       'Skeletal muscle cells', 'Endothelial cells', 'Keratinocytes',
       'Macrophages', 'Mast cells', 'NK cells', 'Melanocytes',
       'B cells',
      'Plasma cells',
        'Smooth muscle cells', 'Mesenchymal stromal cells'])

In [552]:
# Rename the categories of several obs columns to the naming used across datasets.
# NOTE(review): in each cell below the bare `.cat.categories` expression has no effect (it is not the
# last line of the cell), and `ref_cluster` is rebuilt but not used by the rename that follows.
# rename_categories assigns the new names by position, so each list must follow the existing category order.
adata.obs['tissue'].cat.categories
ref_cluster=pd.Categorical(adata.obs['tissue'],
                           categories=['skin of abdomen', 'skin of body', 'skin of chest'])
adata.rename_categories('tissue', ['Skin_Abdomen', 'Skin', 'Skin_Chest'])

In [553]:
adata.obs['sex'].cat.categories
ref_cluster=pd.Categorical(adata.obs['sex'],
                           categories=['male'])
adata.rename_categories('sex', ['Male'])

In [554]:
adata.obs['ethnicity'].cat.categories
ref_cluster=pd.Categorical(adata.obs['ethnicity'],
                           categories=['European', 'Hispanic or Latin American'])
# Only the second label changes (hyphenated spelling).
adata.rename_categories('ethnicity', ['European', 'Hispanic or Latin-American'])

In [555]:
adata.obs['development_stage'].cat.categories
ref_cluster=pd.Categorical(adata.obs['development_stage'],
                           categories=['33-year-old human stage', '59-year-old human stage'])
# Reduce the development-stage labels to the bare age.
adata.rename_categories('development_stage',['33', '59'])

In [556]:
adata.obs['donor'].cat.categories
ref_cluster=pd.Categorical(adata.obs['donor'],
                           categories=['TSP10', 'TSP14'])
# Donor ids are renamed to the same names, i.e. left unchanged.
adata.rename_categories('donor', ['TSP10', 'TSP14'])

In [557]:
# Fill the obs columns that the other dataset sections of this notebook also populate.
# Constant strings are written to every cell; the literal string 'NaN' is used where
# no value is set for this dataset.
adata.obs['Organ'] = 'Skin'
adata.obs['Organ_Specific'] = adata.obs['tissue']
adata.obs['Dataset'] = 'Pisco_Skin'
# This is dataset 13-2 (section header and output file name); 13-1 is the Cheng skin dataset.
adata.obs['InternDatasetNumber'] = '13-2-Skin-Pisco-2022'
adata.obs['Dataset_status'] = 'Healthy_Dataset'

# Self-assignment: 'celltype' is kept as is.
adata.obs['celltype'] = adata.obs['celltype']
adata.obs['sub_celltype'] = 'NaN'
adata.obs['Malignant'] = 'NonMalignant'

adata.obs['Patient'] = adata.obs['donor']
adata.obs['Patient_Number'] = 'NaN'
adata.obs['age'] = adata.obs['development_stage']
# Self-assignments: 'sex' and 'ethnicity' are kept as is.
adata.obs['sex'] = adata.obs['sex']
adata.obs['ethnicity'] = adata.obs['ethnicity']
adata.obs['health_status'] = 'NaN'

# Keep the source annotation next to the harmonised 'celltype' column.
adata.obs['original_celltype_1'] = adata.obs['cell_type']
adata.obs['original_celltype_2'] = 'NaN'
adata.obs['original_celltype_3'] = 'NaN'

In [559]:
# Convert X to CSR sparse format before writing.
adata.X = sp.sparse.csr_matrix(adata.X)

In [560]:
# Make cell names (obs index) unique.
adata.obs_names_make_unique()

In [561]:
# Save the processed object under the write directory.
adata.write(writepath + '13-2-Skin-Pisco-2022-processed.h5ad')

# 15-Kidney

## 15-1-Kidney-Stewart-2019

In [None]:
# here we use sfaira to import available datasets with annotations
# note that the following steps may change depending on the current sfaira version and the path to your repository

In [136]:
# Placeholder path: point this at the local sfaira data repository.
datadir = '/path/to/repo/'

In [6]:
ds = sfaira.data.human.DatasetGroupKidney(path=datadir)  # This links all data sets available
# List the dataset ids available in the kidney group.
ds.ids 

['human_kidney_2019_10xSn_lake_001_10.1038/s41467-019-10861-2',
 'human_kidney_2019_10x_stewart_001_10.1126/science.aat5031',
 'human_kidney_2020_10x_liao_001_10.1038/s41597-019-0351-8',
 'human_kidney_2020_microwell_han_001_10.1038/s41586-020-2157-4',
 'human_kidney_2020_microwell_han_002_10.1038/s41586-020-2157-4',
 'human_kidney_2020_microwell_han_003_10.1038/s41586-020-2157-4',
 'human_kidney_2020_microwell_han_004_10.1038/s41586-020-2157-4',
 'human_kidney_2020_microwell_han_005_10.1038/s41586-020-2157-4',
 'human_kidney_2020_microwell_han_006_10.1038/s41586-020-2157-4',
 'human_kidney_2020_microwell_han_007_10.1038/s41586-020-2157-4']

In [27]:
# Second entry of the id list printed above: the Stewart 2019 10x dataset.
idx = ds.ids[1]

In [28]:
# Echo the chosen id.
idx

'human_kidney_2019_10x_stewart_001_10.1126/science.aat5031'

In [29]:
# Load the selected dataset through sfaira.
ds.datasets[idx].load()



In [31]:
# The loaded AnnData object is exposed as `.adata` on the dataset.
adata=ds.datasets[idx].adata

In [315]:
#subset to only 40268 mature kidney cells (batch 1)
# 'batch' is compared against the string '1'; .copy() detaches the subset from the loaded object.
ix=np.isin(adata.obs['batch'],['1']) 
adata=adata[ix].copy()

In [317]:
# Re-index the genes by the values of the 'names' column of var.
adata.var.index = adata.var['names'].tolist()

In [318]:
# Ensure X is a CSR sparse matrix for the QC steps below.
adata.X = sp.sparse.csr_matrix(adata.X)

In [319]:
#calculate QC covariates
# Summing a scipy sparse matrix along axis 1 returns an (n_cells, 1) np.matrix.
# Flatten it to a 1-D array so every obs column holds one plain value per cell,
# independent of how the installed pandas version treats 2-D input.
adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
# Number of genes with a non-zero count per cell.
adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()

In [322]:
# FILTER PARAMETERS
# Filter out cells -- the max_counts filter is disabled; only the gene-count ceiling is applied.
#sc.pp.filter_cells(adata, max_counts = 1700)
sc.pp.filter_cells(adata, max_genes = 7500)
# Min 20 cells - filters out low count genes
sc.pp.filter_genes(adata, min_cells=20)

filtered out 2 cells that have more than 7500 genes expressed
filtered out 14602 genes that are detected in less than 20 cells


In [323]:
# get mt genes
# Boolean mask over adata.var_names flagging genes whose symbol starts with 'MT-'.
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense: the in-place size-factor division and the matrix handed to R
# further down operate on a dense X. Guarded so that re-running the cell is harmless.
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# manually define mt score
# Per-cell sum over the MT genes, flattened to 1-D, divided by the total counts per cell.
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

#Filter out cells with over 20% mito fraction
# .copy() turns the slice into an independent object; a bare slice is only a view,
# and the later write to adata.obs would trigger an implicit copy with a warning.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [325]:
# Work on a separate copy so the coarse normalisation below never modifies `adata` itself.
adata_pp=adata.copy()

In [326]:
#Perform a clustering for scran normalization in clusters
# Quick per-cell normalisation (scaled to 1e6) + log1p; used only to derive the clusters.
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
# 15 principal components feed the neighbourhood graph.
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Louvain cluster labels are written to adata_pp.obs['groups'].
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:02): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:10)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:05)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 14 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:06)


In [327]:
#Preprocess variables for scran normalization
# Cluster labels from the pre-clustering; passed to scran as `clusters`.
input_groups = adata_pp.obs['groups']
# Transpose of adata.X (taken from `adata`, not from the normalised adata_pp copy).
data_mat = adata.X.T

In [328]:
%%R -i data_mat -i input_groups -o size_factors
# rpy2 cell magic: -i passes the two Python objects into R, -o returns size_factors to Python.
require(scran)
size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)

In [329]:
#Delete adata_pp
# The pre-clustered copy was only needed for its cluster labels.
del adata_pp
# Store the scran size factors as a per-cell annotation.
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [330]:
# Convert string-valued annotation columns to categoricals.
adata.strings_to_categoricals()

In [331]:
# Keep a copy of the current (not yet size-factor-normalised) adata.X for downstream analysis
#Keep the count data in a counts layer
adata.layers["counts"] = adata.X.copy()

In [332]:
# Store the current state of adata in .raw before X is normalised in the next cell.
adata.raw = adata

In [333]:
#Normalize data
# Divide each cell (row) by its size factor in place, then log1p-transform.
adata.X /= adata.obs['size_factors'].values[:, None]
sc.pp.log1p(adata)

In [334]:
# Back to CSR sparse storage after the dense in-place normalisation.
adata.X = sp.sparse.csr_matrix(adata.X)

In [335]:
# extract highly variable genes
# subset=False keeps all genes and only flags the top 4000; log=False is passed because X
# was already log1p-transformed above (NOTE(review): confirm for the scanpy version in use).
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:00)


In [336]:
# Calculate the visualizations
# PCA restricted to the genes flagged as highly variable (50 components, ARPACK solver),
# then the neighbourhood graph and the UMAP embedding computed from it.
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:07)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:09)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:31)


In [338]:
adata.obs['cell_ontology_class_broad']=adata.obs['cell_ontology_class']

In [339]:
ref_cluster=pd.Categorical(adata.obs['cell_ontology_class_broad'],
                           categories=['B cell', 'CD4 T cell', 'CD8 T cell',
       'Collecting Duct - Intercalated Cells Type B', 'Connecting tubule',
       'Distinct proximal tubule 1', 'Distinct proximal tubule 2',
       'Endothelial Cells - AEA & DVR', 'Endothelial Cells - AVR',
       'Endothelial Cells - glomerular capillaries', 'Epithelial progenitor',
       'Fibroblast', 'Indistinct intercalated cell',
       'MNP-a/classical monocyte derived',
       'MNP-b/non-classical monocyte derived', 'MNP-c/dendritic cell',
       'MNP-d/Tissue macrophage', 'Mast cell', 'Myofibroblast', 'NK cell',
       'NKT cell', 'Neutrophil', 'Pelvic epithelium',
       'Peritubular capillary endothelium 1',
       'Peritubular capillary endothelium 2', 'Plasmacytoid dendritic cell',
       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule',
       'Proximal tubule', 'Thick ascending limb of Loop of Henle',
       'Transitional urothelium', 'Type A intercalated cell'])

In [340]:
ix=np.isin(ref_cluster,['Distinct proximal tubule 2'])
ref_cluster[ix]='Distinct proximal tubule 1'

ix=np.isin(ref_cluster,['Endothelial Cells - AVR',
       'Endothelial Cells - glomerular capillaries'])
ref_cluster[ix]= 'Endothelial Cells - AEA & DVR'

ix=np.isin(ref_cluster,['MNP-b/non-classical monocyte derived', 'MNP-c/dendritic cell',
       'MNP-d/Tissue macrophage'])
ref_cluster[ix]='MNP-a/classical monocyte derived'

ix=np.isin(ref_cluster,[ 'Peritubular capillary endothelium 2'])
ref_cluster[ix]= 'Peritubular capillary endothelium 1'

In [341]:
adata.obs['cell_ontology_class_broad']=pd.Categorical(ref_cluster,
                                                        categories=['B cell', 'CD4 T cell', 'CD8 T cell',
       'Collecting Duct - Intercalated Cells Type B', 'Connecting tubule',
       'Distinct proximal tubule 1',
       'Endothelial Cells - AEA & DVR', 'Epithelial progenitor',
       'Fibroblast', 'Indistinct intercalated cell',
       'MNP-a/classical monocyte derived','Mast cell', 'Myofibroblast', 'NK cell',
       'NKT cell', 'Neutrophil', 'Pelvic epithelium',
       'Peritubular capillary endothelium 1','Plasmacytoid dendritic cell',
       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule',
       'Proximal tubule', 'Thick ascending limb of Loop of Henle',
       'Transitional urothelium', 'Type A intercalated cell'])

In [342]:
adata.rename_categories('cell_ontology_class_broad',
                        ['B cell', 'CD4 T cell', 'CD8 T cell',
       'Collecting Duct - Intercalated Cells Type B', 'Connecting tubule',
       'Distinct proximal tubule', 'Endothelial Cells',
       'Epithelial progenitor', 'Fibroblast', 'Indistinct intercalated cell',
       'MNP', 'Mast cell', 'Myofibroblast',
       'NK cell', 'NKT cell', 'Neutrophil', 'Pelvic epithelium',
       'Peritubular capillary endothelium', 'Plasmacytoid dendritic cell',
       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule',
       'Proximal tubule', 'Thick ascending limb of Loop of Henle',
       'Transitional urothelium', 'Type A intercalated cell'])

In [344]:
# make consistent annotations across datasets
# 'celltype' already exists in this dataset; it is replaced by a copy of itself before being edited below.
adata.obs['celltype']=adata.obs['celltype'].copy()
# Display the available labels (cell output) as a reference for the merging below.
adata.obs['celltype'].cat.categories

Index(['Ascending vasa recta endothelium', 'B cell', 'CD4 T cell',
       'CD8 T cell', 'Connecting tubule', 'Descending vasa recta endothelium',
       'Distinct proximal tubule 1', 'Distinct proximal tubule 2',
       'Epithelial progenitor cell', 'Fibroblast', 'Glomerular endothelium',
       'Indistinct intercalated cell', 'MNP-a/classical monocyte derived',
       'MNP-b/non-classical monocyte derived', 'MNP-c/dendritic cell',
       'MNP-d/Tissue macrophage', 'Mast cell', 'Myofibroblast', 'NK cell',
       'NKT cell', 'Neutrophil', 'Pelvic epithelium',
       'Peritubular capillary endothelium 1',
       'Peritubular capillary endothelium 2', 'Plasmacytoid dendritic cell',
       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule',
       'Proximal tubule', 'Thick ascending limb of Loop of Henle',
       'Transitional urothelium', 'Type A intercalated cell',
       'Type B intercalated cell'],
      dtype='object')

In [345]:
ref_cluster=pd.Categorical(adata.obs['celltype'],
                           categories=['Ascending vasa recta endothelium', 'B cell', 'CD4 T cell',
       'CD8 T cell', 'Connecting tubule', 'Descending vasa recta endothelium',
       'Distinct proximal tubule 1', 'Distinct proximal tubule 2',
       'Epithelial progenitor cell', 'Fibroblast', 'Glomerular endothelium',
       'Indistinct intercalated cell', 'MNP-a/classical monocyte derived',
       'MNP-b/non-classical monocyte derived', 'MNP-c/dendritic cell',
       'MNP-d/Tissue macrophage', 'Mast cell', 'Myofibroblast', 'NK cell',
       'NKT cell', 'Neutrophil', 'Pelvic epithelium',
       'Peritubular capillary endothelium 1',
       'Peritubular capillary endothelium 2', 'Plasmacytoid dendritic cell',
       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule',
       'Proximal tubule', 'Thick ascending limb of Loop of Henle',
       'Transitional urothelium', 'Type A intercalated cell',
       'Type B intercalated cell'])

In [346]:
# Collapse fine-grained annotations into broader groups: every label in a
# value list is overwritten with the label used as key. The surviving labels
# receive their final display names in a later cell.
merge_into = {
    'Ascending vasa recta endothelium': [
        'Descending vasa recta endothelium',
        'Glomerular endothelium',
        'Peritubular capillary endothelium 1',
        'Peritubular capillary endothelium 2',
    ],
    'CD4 T cell': ['CD8 T cell', 'NKT cell'],
    'Pelvic epithelium': ['Transitional urothelium'],
    'Principal cell': [
        'Type A intercalated cell',
        'Type B intercalated cell',
        'Indistinct intercalated cell',
    ],
    'Proliferating Proximal Tubule': [
        'Distinct proximal tubule 1',
        'Distinct proximal tubule 2',
        'Proximal tubule',
        'Connecting tubule',
        'Epithelial progenitor cell',
        'Thick ascending limb of Loop of Henle',
    ],
    'MNP-a/classical monocyte derived': ['MNP-b/non-classical monocyte derived'],
    'MNP-c/dendritic cell': ['Plasmacytoid dendritic cell'],
}
for kept_label, absorbed_labels in merge_into.items():
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = kept_label

In [347]:
# Write the merged labels back to the object. Only the labels that survived
# the merge are kept as categories, in the order the rename step relies on.
adata.obs['celltype'] = pd.Categorical(
    ref_cluster,
    categories=[
        'Ascending vasa recta endothelium',
        'B cell',
        'CD4 T cell',
        'Fibroblast',
        'MNP-a/classical monocyte derived',
        'MNP-c/dendritic cell',
        'MNP-d/Tissue macrophage',
        'Mast cell',
        'Myofibroblast',
        'NK cell',
        'Neutrophil',
        'Pelvic epithelium',
        'Podocyte',
        'Principal cell',
        'Proliferating Proximal Tubule',
    ],
)

In [348]:
# Check the category list after merging (this order is what the positional rename below uses).
adata.obs['celltype'].cat.categories

Index(['Ascending vasa recta endothelium', 'B cell', 'CD4 T cell',
       'Fibroblast', 'MNP-a/classical monocyte derived',
       'MNP-c/dendritic cell', 'MNP-d/Tissue macrophage', 'Mast cell',
       'Myofibroblast', 'NK cell', 'Neutrophil', 'Pelvic epithelium',
       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule'],
      dtype='object')

In [349]:
# Positional rename to the harmonized display names; entries line up one-to-one
# with the current category order of adata.obs['celltype'].
adata.rename_categories(
    'celltype',
    [
        'Endothelial cells',             # Ascending vasa recta endothelium
        'B cells',                       # B cell
        'T cells',                       # CD4 T cell
        'Fibroblast cells',              # Fibroblast
        'Monocytes',                     # MNP-a/classical monocyte derived
        'Dendritic cells',               # MNP-c/dendritic cell
        'Macrophages',                   # MNP-d/Tissue macrophage
        'Mast cells',                    # Mast cell
        'Myofibroblast cells',           # Myofibroblast
        'NK cells',                      # NK cell
        'Neutrophils',                   # Neutrophil
        'Urothelial cells',              # Pelvic epithelium
        'Podocytes',                     # Podocyte
        'Collecting duct system cells',  # Principal cell
        'Tubule cells',                  # Proliferating Proximal Tubule
    ],
)

In [350]:
# Confirm the harmonized cell-type names.
adata.obs['celltype'].cat.categories

Index(['Endothelial cells', 'B cells', 'T cells', 'Fibroblast cells',
       'Monocytes', 'Dendritic cells', 'Macrophages', 'Mast cells',
       'Myofibroblast cells', 'NK cells', 'Neutrophils', 'Urothelial cells',
       'Podocytes', 'Collecting duct system cells', 'Tubule cells'],
      dtype='object')

In [354]:
# Harmonized per-cell metadata columns. The string 'NaN' marks fields that
# are not filled in for this dataset. Values taken from existing obs columns
# are read before any column is written.
steward_metadata = {
    'Organ': 'Kidney',
    'Organ_Specific': 'Kidney',
    'Dataset': 'Steward_Kidney',
    'InternDatasetNumber': '15-1-Kidney-Steward-2019',
    'Dataset_status': 'Healthy_Dataset',

    'celltype': adata.obs['celltype'],
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',

    'Patient': 'Steward_Kidney-Donor1',
    'Patient_Number': 'NaN',
    'age': 'NaN',
    'sex': 'NaN',
    'ethnicity': 'NaN',
    'health_status': 'NaN',

    'original_celltype_1': adata.obs['cell_ontology_class'],
    'original_celltype_2': adata.obs['cell_ontology_class_broad'],
    'original_celltype_3': 'NaN',
}
for column, value in steward_metadata.items():
    adata.obs[column] = value

In [356]:
# Store the expression matrix in compressed sparse row format before saving.
adata.X = sp.sparse.csr_matrix(adata.X)

In [357]:
# Make the observation (cell) names unique.
adata.obs_names_make_unique()

In [358]:
# Save the processed object; `writepath` is expected to be defined earlier in the notebook.
adata.write(writepath + '15-1-Kidney-Steward-2019-processed.h5ad')

## 15-2-Kidney-Wilson-2021

In [361]:
# Collection to fetch (selected by its collection id) and the local cache folder ./data
target_collections = ["9b02383a-9358-4f0f-9795-a891ec523bcc"]
cache_path = os.path.join(".", "data")
dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)
# Restrict the dataset group to the collection ids listed above
dsg.subset(key="collection_id", values=target_collections)
dsg.datasets
# Download the selected datasets
# NOTE(review): the next cell reads from a separately hard-coded folder; confirm it matches cache_path
dsg.download()

In [362]:
# Folder holding the downloaded collection (placeholder path). The trailing
# slash matters because file names are appended to this string directly.
path = '/path/to/repo/9b02383a-9358-4f0f-9795-a891ec523bcc/'
# Collect the regular files in that folder (sub-directories are skipped).
files = []
for entry in listdir(path):
    if isfile(join(path, entry)):
        files.append(entry)

In [363]:
# Show the file names found in the collection folder.
files

['13a027de-ea3e-432b-9a5e-6bc7048498fc.h5ad',
 '9df60c57-fdf3-4e93-828e-fe9303f20438.h5ad']

In [364]:
# Read every file of the collection and keep the dataset used in this section.
# The wanted dataset is recognised by its number of cells (19985). For each
# file the gene symbols in var['feature_name'] replace the original var index
# and the source file name is recorded in obs['id'].
selected = None
for fname in files:
    print(fname)
    path_2 = join(path, fname)
    u = sc.read_h5ad(path_2)
    u.obs['id'] = fname
    u.var.index = u.var['feature_name']
    if u.n_obs == 19985:
        selected = u

# Fail loudly when nothing matched; otherwise `adata` would silently keep
# pointing at the object left over from the previous section.
if selected is None:
    raise ValueError('No file in ' + path + ' contains the expected 19985 cells')
adata = selected

13a027de-ea3e-432b-9a5e-6bc7048498fc.h5ad
9df60c57-fdf3-4e93-828e-fe9303f20438.h5ad


In [368]:
# Tag every cell with the internal identifier of this dataset.
adata.obs['InternDatasetNumber'] ='15-2-Kidney-Wilson-2021'

In [372]:
# QC covariates per cell:
#   n_counts   - sum of the expression matrix along each row
#   log_counts - natural log of n_counts
#   n_genes    - number of entries greater than zero in each row
adata.obs['n_counts'] = adata.X.sum(axis=1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)

In [376]:
# FILTER PARAMETERS
# Cells: drop those above the count ceiling, then those above the gene-number ceiling.
for cell_limit in ({'max_counts': 2000}, {'max_genes': 2500}):
    sc.pp.filter_cells(adata, **cell_limit)
# Genes: keep only genes detected in at least 20 cells.
sc.pp.filter_genes(adata, min_cells=20)

filtered out 1 cells that have more than 2000 counts
filtered out 2 cells that have more than 2500 genes expressed
filtered out 14058 genes that are detected in less than 20 cells


In [378]:
# get mt genes: boolean mask and names of genes whose symbol starts with 'MT-'
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense (skipped when X is already dense, so the cell can be re-run)
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# tally of the mask values: index 0 = non-mitochondrial, index 1 = mitochondrial genes
y = np.bincount(mt_gene_mask)
ii = np.nonzero(y)[0]

# manually define mt score: summed mitochondrial counts divided by total counts per cell
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep cells with a mito fraction below 20%. The explicit .copy() turns the
# subset into a real AnnData object instead of a view, so later in-place
# writes (size factors, layers, normalization) do not trigger an implicit copy.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [379]:
# Temporary copy used only for the coarse clustering below; adata itself is not normalized yet.
adata_pp=adata.copy()

In [380]:
#Perform a clustering for scran normalization in clusters
# Scale each cell of the temporary copy to 1e6 total counts, then log-transform
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
# 15 principal components, neighbor graph with default settings
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Louvain clusters at resolution 0.5, stored in adata_pp.obs['groups']
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:05)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 18 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:01)


In [381]:
#Preprocess variables for scran normalization
# Cluster labels of the temporary copy, handed to the R cell as `input_groups`
input_groups = adata_pp.obs['groups']
# Transposed expression matrix of adata, handed to the R cell as `data_mat`
data_mat = adata.X.T

In [382]:
%%R -i data_mat -i input_groups -o size_factors
# scran size factors for the matrix passed in from Python, with the Python-side
# cluster labels supplied as `clusters`; the result is exported back to Python.
require(scran)
size_factors <- computeSumFactors(
    data_mat,
    clusters = input_groups,
    min.mean = 0.1
)

In [383]:
#Delete adata_pp
# The temporary clustering copy is no longer needed once the size factors exist
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [384]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [385]:
# Keep a copy of adata.X as it is at this point (before the size-factor
# normalization below) in the "counts" layer for downstream analysis
adata.layers["counts"] = adata.X.copy()

In [386]:
# Store the current (not yet normalized) state of the object in .raw
adata.raw = adata

In [387]:
# Normalize data: divide every cell (row) by its size factor in place,
# then apply log1p to the result.
size_factor_column = adata.obs['size_factors'].values[:, None]
adata.X /= size_factor_column
sc.pp.log1p(adata)

In [388]:
# extract highly variable genes
# Top 4000 genes with the 'cell_ranger' flavor; subset=False, so no genes are removed from the object
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:02)


In [389]:
# Calculate the visualizations
# 50-component PCA restricted to the highly variable genes
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
# Neighbor graph (default settings) and UMAP embedding built on it
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:03)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:03)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:16)


In [391]:
# make consistent annotations across datasets
# Start from a copy of the dataset's own 'cell_type' column
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [392]:
# List the original cell-type categories before merging.
adata.obs['celltype'].cat.categories

Index(['epithelial cell of proximal tubule', 'fibroblast',
       'glomerular visceral epithelial cell',
       'kidney capillary endothelial cell',
       'kidney connecting tubule epithelial cell',
       'kidney distal convoluted tubule epithelial cell',
       'kidney loop of Henle thick ascending limb epithelial cell',
       'leukocyte', 'mesangial cell', 'parietal epithelial cell',
       'renal alpha-intercalated cell', 'renal beta-intercalated cell',
       'renal principal cell'],
      dtype='object')

In [393]:
# Categorical holding the original labels with the full list of original
# categories; merged groups are relabelled on this object in the next step.
ref_cluster = pd.Categorical(
    adata.obs['celltype'],
    categories=[
        'epithelial cell of proximal tubule',
        'fibroblast',
        'glomerular visceral epithelial cell',
        'kidney capillary endothelial cell',
        'kidney connecting tubule epithelial cell',
        'kidney distal convoluted tubule epithelial cell',
        'kidney loop of Henle thick ascending limb epithelial cell',
        'leukocyte',
        'mesangial cell',
        'parietal epithelial cell',
        'renal alpha-intercalated cell',
        'renal beta-intercalated cell',
        'renal principal cell',
    ],
)

In [394]:
# Collapse fine-grained annotations: every label in a value list is
# overwritten with the label used as key.
merge_into = {
    'epithelial cell of proximal tubule': [
        'kidney connecting tubule epithelial cell',
        'kidney distal convoluted tubule epithelial cell',
        'kidney loop of Henle thick ascending limb epithelial cell',
    ],
    'renal alpha-intercalated cell': [
        'renal beta-intercalated cell',
        'renal principal cell',
    ],
}
for kept_label, absorbed_labels in merge_into.items():
    ix = np.isin(ref_cluster, absorbed_labels)
    ref_cluster[ix] = kept_label

In [395]:
# Write the merged labels back to the object. Only the labels that survived
# the merge are kept as categories, in the order the rename step relies on.
adata.obs['celltype'] = pd.Categorical(
    ref_cluster,
    categories=[
        'epithelial cell of proximal tubule',
        'fibroblast',
        'glomerular visceral epithelial cell',
        'kidney capillary endothelial cell',
        'leukocyte',
        'mesangial cell',
        'parietal epithelial cell',
        'renal alpha-intercalated cell',
    ],
)

In [396]:
# Positional rename to the harmonized display names; entries line up one-to-one
# with the current category order of adata.obs['celltype'].
adata.rename_categories(
    'celltype',
    [
        'Tubule cells',                          # epithelial cell of proximal tubule
        'Fibroblast cells',                      # fibroblast
        'Glomerular visceral epithelial cells',  # glomerular visceral epithelial cell
        'Endothelial cells',                     # kidney capillary endothelial cell
        'Leukocytes',                            # leukocyte
        'Mesangial cells',                       # mesangial cell
        'Parietal epithelial cells',             # parietal epithelial cell
        'Collecting duct system cells',          # renal alpha-intercalated cell
    ],
)

In [400]:
# Drop the bracket notation from the reported diseases and capitalize each entry.
# Categories are renamed by name instead of by position, so the result does not
# depend on the order in which the categories happen to be stored. Labels that
# are not listed (e.g. already renamed on a re-run) are left unchanged.
disease_labels = {
    '[Lyme disease,anxiety disorder,depressive disorder,kidney cancer,diverticulitis,gastroesophageal reflux disease,arthritic joint disease]':
        'Lyme disease,Anxiety disorder,Depressive disorder,Kidney cancer,Diverticulitis,Gastroesophageal reflux disease,Arthritic joint disease',
    '[benign prostatic hyperplasia,pericardial effusion,acute kidney tubular necrosis,essential hypertension,kidney cancer]':
        'Benign prostatic hyperplasia,Pericardial effusion,Acute kidney tubular necrosis,Essential hypertension,Kidney cancer',
    '[kidney cancer,obstructive sleep apnea syndrome,acoustic neuroma]':
        'Kidney cancer,Obstructive sleep apnea syndrome,Acoustic neuroma',
    '[kidney cancer]':
        'Kidney cancer',
}
adata.rename_categories(
    'reported_diseases',
    [disease_labels.get(c, c) for c in adata.obs['reported_diseases'].cat.categories],
)

In [401]:
# Harmonize the tissue label. Renaming is done by name, so it does not depend
# on the stored category order; labels that are not listed are left unchanged.
tissue_labels = {'cortex of kidney': 'Kidney_Cortex'}
adata.rename_categories(
    'tissue',
    [tissue_labels.get(c, c) for c in adata.obs['tissue'].cat.categories],
)

In [402]:
# Capitalize the sex labels. Renaming is done by name, so it does not depend
# on the stored category order; labels that are not listed are left unchanged.
sex_labels = {'female': 'Female', 'male': 'Male'}
adata.rename_categories(
    'sex',
    [sex_labels.get(c, c) for c in adata.obs['sex'].cat.categories],
)

In [403]:
# Reduce the development stage to the age in years (kept as a string label).
# Renaming is done by name, so it does not depend on the stored category
# order; labels that are not listed are left unchanged.
age_labels = {
    '50-year-old human stage': '50',
    '52-year-old human stage': '52',
    '54-year-old human stage': '54',
    '61-year-old human stage': '61',
    '62-year-old human stage': '62',
}
adata.rename_categories(
    'development_stage',
    [age_labels.get(c, c) for c in adata.obs['development_stage'].cat.categories],
)

In [404]:
# Donor labels start from the donor UUIDs and are replaced by readable names.
adata.obs['donor'] = adata.obs['donor_uuid']
# Each UUID is mapped to its name explicitly, so the donor numbering does not
# depend on the order in which the UUID categories happen to be stored.
# Labels that are not listed (e.g. already renamed) are left unchanged.
donor_labels = {
    '8c570254-4bef-48d8-bd79-c812f60835a5': 'Wilson_Kidney-Donor1',
    '5028f75a-8c09-4155-a232-ad7dbfa6042e': 'Wilson_Kidney-Donor2',
    '8213a3f7-2437-4e8a-b836-caec33df901d': 'Wilson_Kidney-Donor3',
    'e0def004-9e30-4a3b-9a65-007110f3a1f2': 'Wilson_Kidney-Donor4',
    'f6c0f811-2fb8-4989-b796-37c14b055517': 'Wilson_Kidney-Donor5',
}
adata.rename_categories(
    'donor',
    [donor_labels.get(c, c) for c in adata.obs['donor'].cat.categories],
)

In [405]:
# Harmonized per-cell metadata columns. The string 'NaN' marks fields that
# are not filled in for this dataset. Values taken from existing obs columns
# are read before any column is written.
wilson_metadata = {
    'Organ': 'Kidney',
    'Organ_Specific': adata.obs['tissue'],
    'Dataset': 'Wilson_Kidney',
    'InternDatasetNumber': '15-2-Kidney-Wilson-2021',
    'Dataset_status': 'Ill_Dataset',

    'celltype': adata.obs['celltype'],
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',

    'Patient': adata.obs['donor'],
    'Patient_Number': adata.obs['sample_uuid'],
    'age': adata.obs['development_stage'],
    'sex': adata.obs['sex'],
    'ethnicity': adata.obs['ethnicity'],
    'health_status': adata.obs['reported_diseases'],

    'original_celltype_1': adata.obs['cell_type'],
    'original_celltype_2': 'NaN',
    'original_celltype_3': 'NaN',
}
for column, value in wilson_metadata.items():
    adata.obs[column] = value

In [407]:
# Store the expression matrix in compressed sparse row format before saving.
adata.X = sp.sparse.csr_matrix(adata.X)

In [408]:
# Make the observation (cell) names unique.
adata.obs_names_make_unique()

In [411]:
# Save the processed object; `writepath` is expected to be defined earlier in the notebook.
adata.write(writepath + '15-2-Kidney-Wilson-2021-processed.h5ad')

## 15-3-Kidney-Pisco-2022

In [62]:
# Select the cells annotated with tissue 'kidney' from the multi-tissue
# object and continue with an independent copy of that subset.
ix = (adata_pisco.obs['tissue'] == 'kidney').values
adata = adata_pisco[ix].copy()

In [67]:
# QC covariates per cell:
#   n_counts   - sum of the expression matrix along each row
#   log_counts - natural log of n_counts
#   n_genes    - number of entries greater than zero in each row
adata.obs['n_counts'] = adata.X.sum(axis=1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)

In [71]:
# FILTER PARAMETERS
# Cells: drop those above the count ceiling, then those above the gene-number ceiling.
for cell_limit in ({'max_counts': 13000}, {'max_genes': 7500}):
    sc.pp.filter_cells(adata, **cell_limit)
# Genes: keep only genes detected in at least 20 cells.
sc.pp.filter_genes(adata, min_cells=20)

filtered out 4 cells that have more than 13000 counts
filtered out 3 cells that have more than 7500 genes expressed
filtered out 40007 genes that are detected in less than 20 cells


In [73]:
# get mt genes: boolean mask and names of genes whose symbol starts with 'MT-'
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense (skipped when X is already dense, so the cell can be re-run)
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# tally of the mask values: index 0 = non-mitochondrial, index 1 = mitochondrial genes
y = np.bincount(mt_gene_mask)
ii = np.nonzero(y)[0]

# manually define mt score: summed mitochondrial counts divided by total counts per cell
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep cells with a mito fraction below 20%. The explicit .copy() turns the
# subset into a real AnnData object instead of a view, so later in-place
# writes (size factors, layers, normalization) do not trigger an implicit copy.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [74]:
# Temporary copy used only for the coarse clustering below; adata itself is not normalized yet.
adata_pp=adata.copy()

In [75]:
#Perform a clustering for scran normalization in clusters
# Scale each cell of the temporary copy to 1e6 total counts, then log-transform
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
# 15 principal components, neighbor graph with default settings
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Louvain clusters at resolution 0.5, stored in adata_pp.obs['groups']
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    on highly variable genes
    with n_comps=15
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 12 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:00)


In [76]:
#Preprocess variables for scran normalization
# Cluster labels of the temporary copy, handed to the R cell as `input_groups`
input_groups = adata_pp.obs['groups']
# Transposed expression matrix of adata, handed to the R cell as `data_mat`
data_mat = adata.X.T

In [77]:
%%R -i data_mat -i input_groups -o size_factors
# scran size factors for the matrix passed in from Python, with the Python-side
# cluster labels supplied as `clusters`; the result is exported back to Python.
require(scran)
size_factors <- computeSumFactors(
    data_mat,
    clusters = input_groups,
    min.mean = 0.1
)

In [78]:
#Delete adata_pp
# The temporary clustering copy is no longer needed once the size factors exist
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [79]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [80]:
# Keep a copy of adata.X as it is at this point (before the size-factor
# normalization below) in the "counts" layer for downstream analysis
adata.layers["counts"] = adata.X.copy()

In [81]:
# Store the current (not yet normalized) state of the object in .raw
adata.raw = adata

In [82]:
# Normalize data: divide every cell (row) by its size factor in place,
# then apply log1p to the result.
size_factor_column = adata.obs['size_factors'].values[:, None]
adata.X /= size_factor_column
sc.pp.log1p(adata)

In [83]:
# extract highly variable genes
# Top 4000 genes with the 'cell_ranger' flavor; subset=False, so no genes are removed from the object
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:01)


In [84]:
# Calculate the visualizations
# 50-component PCA restricted to the highly variable genes
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
# Neighbor graph (default settings) and UMAP embedding built on it
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:19)


In [86]:
# make consistent annotations across datasets
# Start from a copy of the dataset's own 'cell_type' column
adata.obs['celltype']=adata.obs['cell_type'].copy()

In [87]:
# List the original cell-type categories before merging.
adata.obs['celltype'].cat.categories

Index(['B cell', 'CD4-positive helper T cell',
       'CD8-positive, alpha-beta T cell', 'endothelial cell',
       'kidney epithelial cell', 'macrophage', 'mature NK T cell'],
      dtype='object')

In [88]:
# Categorical holding the original labels with the full list of original
# categories; merged groups are relabelled on this object in the next step.
ref_cluster = pd.Categorical(
    adata.obs['celltype'],
    categories=[
        'B cell',
        'CD4-positive helper T cell',
        'CD8-positive, alpha-beta T cell',
        'endothelial cell',
        'kidney epithelial cell',
        'macrophage',
        'mature NK T cell',
    ],
)

In [89]:
# Fold the CD8 T cells into the CD4 label so both end up in one T-cell group
ix=np.isin(ref_cluster,['CD8-positive, alpha-beta T cell'])
ref_cluster[ix]='CD4-positive helper T cell'

In [90]:
# Write the merged labels back to the object. Only the labels that survived
# the merge are kept as categories, in the order the rename step relies on.
adata.obs['celltype'] = pd.Categorical(
    ref_cluster,
    categories=[
        'B cell',
        'CD4-positive helper T cell',
        'endothelial cell',
        'kidney epithelial cell',
        'macrophage',
        'mature NK T cell',
    ],
)

In [91]:
# Positional rename to the harmonized display names; entries line up one-to-one
# with the current category order of adata.obs['celltype'].
adata.rename_categories(
    'celltype',
    [
        'B cells',            # B cell
        'T cells',            # CD4-positive helper T cell
        'Endothelial cells',  # endothelial cell
        'Tubule cells',       # kidney epithelial cell
        'Macrophages',        # macrophage
        'NK cells',           # mature NK T cell
    ],
)

In [95]:
# Harmonize the tissue label. Renaming is done by name, so it does not depend
# on the stored category order; labels that are not listed are left unchanged.
tissue_labels = {'kidney': 'Kidney'}
adata.rename_categories(
    'tissue',
    [tissue_labels.get(c, c) for c in adata.obs['tissue'].cat.categories],
)

In [96]:
# Capitalize the sex label. Renaming is done by name, so it does not depend
# on the stored category order; labels that are not listed are left unchanged.
sex_labels = {'female': 'Female'}
adata.rename_categories(
    'sex',
    [sex_labels.get(c, c) for c in adata.obs['sex'].cat.categories],
)

In [97]:
# Harmonize the spelling of the ethnicity label. Renaming is done by name, so
# it does not depend on the stored category order; labels that are not listed
# are left unchanged.
ethnicity_labels = {
    'African American or Afro-Caribbean': 'African-American or Afro-Caribbean',
}
adata.rename_categories(
    'ethnicity',
    [ethnicity_labels.get(c, c) for c in adata.obs['ethnicity'].cat.categories],
)

In [98]:
# Reduce the development stage to the age in years (kept as a string label).
# Renaming is done by name, so it does not depend on the stored category
# order; labels that are not listed are left unchanged.
age_labels = {'61-year-old human stage': '61'}
adata.rename_categories(
    'development_stage',
    [age_labels.get(c, c) for c in adata.obs['development_stage'].cat.categories],
)

In [99]:
# The donor id is kept as it is ('TSP2' maps to itself). The step is retained
# so this section has the same shape as the other datasets. Renaming is done
# by name; labels that are not listed are left unchanged.
donor_labels = {'TSP2': 'TSP2'}
adata.rename_categories(
    'donor',
    [donor_labels.get(c, c) for c in adata.obs['donor'].cat.categories],
)

In [100]:
# Harmonized per-cell metadata columns. The string 'NaN' marks fields that
# are not filled in for this dataset. Values taken from existing obs columns
# are read before any column is written.
# NOTE(review): 'Dataset' is set to 'Kidney_Heart', whereas the other kidney
# sections use '<Author>_Kidney' - confirm this is intended and not a leftover
# from another section.
pisco_metadata = {
    'Organ': 'Kidney',
    'Organ_Specific': adata.obs['tissue'],
    'Dataset': 'Kidney_Heart',
    'InternDatasetNumber': '15-3-Kidney-Pisco-2022',
    'Dataset_status': 'Healthy_Dataset',

    'celltype': adata.obs['celltype'],
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',

    'Patient': adata.obs['donor'],
    'Patient_Number': 'NaN',
    'age': adata.obs['development_stage'],
    'sex': adata.obs['sex'],
    'ethnicity': adata.obs['ethnicity'],
    'health_status': 'NaN',

    'original_celltype_1': adata.obs['cell_type'],
    'original_celltype_2': 'NaN',
    'original_celltype_3': 'NaN',
}
for column, value in pisco_metadata.items():
    adata.obs[column] = value

In [102]:
# Store the expression matrix in compressed sparse row format before saving.
adata.X = sp.sparse.csr_matrix(adata.X)

In [103]:
# Make the observation (cell) names unique.
adata.obs_names_make_unique()

In [104]:
# Save the processed object; `writepath` is expected to be defined earlier in the notebook.
adata.write(writepath + '15-3-Kidney-Pisco-2022-processed.h5ad')

## 15-4-Kidney-Han-2020

In [547]:
# Tag every cell with the internal identifier of this dataset.
# NOTE(review): no cell loading the Han data into `adata` is visible before this
# point - confirm `adata` no longer refers to the previous section's object.
adata.obs['InternDatasetNumber'] ='15-4-Kidney-Han-2020'

In [551]:
# QC covariates per cell:
#   n_counts   - sum of the expression matrix along each row
#   log_counts - natural log of n_counts
#   n_genes    - number of entries greater than zero in each row
adata.obs['n_counts'] = adata.X.sum(axis=1)
adata.obs['log_counts'] = np.log(adata.obs['n_counts'])
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)

In [554]:
# FILTER PARAMETERS
# Cells: drop those above the count ceiling, then those above the gene-number ceiling.
for cell_limit in ({'max_counts': 4400}, {'max_genes': 2200}):
    sc.pp.filter_cells(adata, **cell_limit)
# Genes: keep only genes detected in at least 20 cells.
sc.pp.filter_genes(adata, min_cells=20)

filtered out 39 cells that have more than 4400 counts
filtered out 10454 genes that are detected in less than 20 cells


In [555]:
# get mt genes: boolean mask and names of genes whose symbol starts with 'MT-'
mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]
mt_genes = adata.var_names[mt_gene_mask]

# convert to dense (skipped when X is already dense, so the cell can be re-run)
if sp.sparse.issparse(adata.X):
    adata.X = adata.X.todense()

# tally of the mask values: index 0 = non-mitochondrial, index 1 = mitochondrial genes
y = np.bincount(mt_gene_mask)
ii = np.nonzero(y)[0]

# manually define mt score: summed mitochondrial counts divided by total counts per cell
mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()
mt_frac = mt_sum / adata.obs['n_counts'].values

adata.obs['mt_frac'] = mt_frac

# Keep cells with a mito fraction below 20%. The explicit .copy() turns the
# subset into a real AnnData object instead of a view, so later in-place
# writes (size factors, layers, normalization) do not trigger an implicit copy.
adata = adata[adata.obs['mt_frac'] < 0.20].copy()

In [556]:
# Temporary copy used only for the coarse clustering below; adata itself is not normalized yet.
adata_pp=adata.copy()

In [557]:
#Perform a clustering for scran normalization in clusters
# Scale each cell of the temporary copy to 1e6 total counts, then log-transform
sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)
sc.pp.log1p(adata_pp)
# 15 principal components, neighbor graph with default settings
sc.pp.pca(adata_pp, n_comps=15)
sc.pp.neighbors(adata_pp)
# Louvain clusters at resolution 0.5, stored in adata_pp.obs['groups']
sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)

normalizing by total count per cell
    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
computing PCA
    with n_comps=15
    finished (0:00:05)
computing neighbors
    using 'X_pca' with n_pcs = 15
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)
running Louvain clustering
    using the "louvain" package of Traag (2017)
    finished: found 16 clusters and added
    'groups', the cluster labels (adata.obs, categorical) (0:00:03)


In [558]:
#Preprocess variables for scran normalization
# Cluster labels of the temporary copy, handed to the R cell as `input_groups`
input_groups = adata_pp.obs['groups']
# Transposed expression matrix of adata, handed to the R cell as `data_mat`
data_mat = adata.X.T

In [559]:
%%R -i data_mat -i input_groups -o size_factors
# scran size factors for the matrix passed in from Python, with the Python-side
# cluster labels supplied as `clusters`; the result is exported back to Python.
require(scran)
size_factors <- computeSumFactors(
    data_mat,
    clusters = input_groups,
    min.mean = 0.1
)

In [560]:
#Delete adata_pp
# The temporary clustering copy is no longer needed once the size factors exist
del adata_pp
# Store the size factors returned by the R cell as a per-cell annotation
adata.obs['size_factors'] = size_factors

  This is separate from the ipykernel package so we can avoid doing imports until


In [561]:
# Convert string annotation columns to categoricals.
adata.strings_to_categoricals()

In [562]:
# Keep a copy of adata.X as it is at this point (before the size-factor
# normalization below) in the "counts" layer for downstream analysis
adata.layers["counts"] = adata.X.copy()

In [563]:
# Store the current (not yet normalized) state of the object in .raw
adata.raw = adata

In [564]:
#Normalize data
# Divide each row (cell) of adata.X in place by that cell's size factor;
# [:, None] turns the factor vector into a column so it broadcasts across genes
adata.X /= adata.obs['size_factors'].values[:, None]
# Then apply the log1p transform to adata
sc.pp.log1p(adata)

In [565]:
# extract highly variable genes
# n_top_genes=4000 selects by rank (per the printed note, all cutoffs are ignored when
# n_top_genes is passed); subset=False keeps every gene in adata. The PCA cell that
# follows is restricted to the flagged genes via use_highly_variable=True.
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)

If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:02)


In [566]:
# Calculate the visualizations
# PCA with 50 components on the highly variable genes, neighbour graph
# (computed on the PCA representation, see the log output), then the UMAP embedding
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
sc.pp.neighbors(adata)
sc.tl.umap(adata)

computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:03)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:04)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:18)


In [568]:
# make consistent annotations across datasets
# Start the harmonized 'celltype' column as a copy of the original 'celltype_specific' labels
adata.obs['celltype']=adata.obs['celltype_specific'].copy()
# Display the labels that are merged in the following cells
adata.obs['celltype'].cat.categories

Index(['B cell', 'B cell (Plasmocyte)', 'B cell(Plasmocyte)',
       'Conventional dendritic cell', 'Dendritic cell', 'Distal tubule cell',
       'Distal tubule cell_SLC12A3 high', 'Endothelial cell',
       'Endothelial cell_EMCN high', 'Endothelial cell_IGFBP5 high',
       'Epithelial_cell_NUPR1 high', 'Fenestrated endothelial cell_EMCN high',
       'Fenestrated endothelial cell_SELE high', 'Fibroblast',
       'Glomerular endothelial cell_AQP1 high', 'IC-tran-PC',
       'Intercalated cell', 'Intercalated cell_SLC26A4 high',
       'Intercalated cell_SPINK1 high', 'Kidney Epithelial cell',
       'Loop of Henle (Thick ascending limb)',
       'Loop of Henle(Thick ascending limb)', 'Loop of henle _ANXA1 high',
       'Loop of henle _KNG1 high', 'Loop of henle _UMOD high',
       'Loop of henle_SFN high', 'Loop of henle_SLPI high',
       'Loop of henle_SOD3 high', 'Loop of henle_SPP1 high', 'Macrophage',
       'Macrophage_APOC1 high', 'Macrophage_GPR183 high', 'Mast cell',
      

In [569]:
# The source labels contain 'B cell (Plasmocyte)' and 'B cell(Plasmocyte)', which
# differ only by a space. Tag the unspaced variant with '_2' so the two are easy
# to tell apart when they are merged later.
# The new names are built from the column's own categories, so only this one
# label changes and nothing depends on a retyped, position-sensitive list.
celltype_names = [
    'B cell(Plasmocyte)_2' if name == 'B cell(Plasmocyte)' else name
    for name in adata.obs['celltype'].cat.categories
]
adata.rename_categories('celltype', celltype_names)

In [570]:
# Editable copy of the cell-type labels; the merging cell below rewrites entries of
# ref_cluster and leaves adata.obs['celltype'] untouched until the result is assigned back.
# The categories are taken from the column itself instead of a second hand-typed
# list, so a label can never fall outside the category set and turn into NaN.
ref_cluster = pd.Categorical(adata.obs['celltype'],
                             categories=adata.obs['celltype'].cat.categories)

In [571]:
# Collapse the fine-grained source labels into coarse groups.
# Each entry is (coarse label to assign, fine labels that are folded into it).
# NOTE(review): 'Glomerular endothelial cell_AQP1 high' is folded into the tubule
# group here, not into 'Endothelial cell' - confirm that this is intended.
merge_plan = [
    ('B cell (Plasmocyte)', ['B cell(Plasmocyte)_2']),
    ('Dendritic cell', ['Conventional dendritic cell']),
    ('Endothelial cell', [
        'Endothelial cell_EMCN high',
        'Endothelial cell_IGFBP5 high',
        'Fenestrated endothelial cell_EMCN high',
        'Fenestrated endothelial cell_SELE high',
    ]),
    ('Distal tubule cell', [
        'Distal tubule cell_SLC12A3 high',
        'Glomerular endothelial cell_AQP1 high',
        'Loop of Henle (Thick ascending limb)',
        'Loop of Henle(Thick ascending limb)',
        'Loop of henle _ANXA1 high',
        'Loop of henle _KNG1 high',
        'Loop of henle _UMOD high',
        'Loop of henle_SFN high',
        'Loop of henle_SLPI high',
        'Loop of henle_SOD3 high',
        'Loop of henle_SPP1 high',
        'Proximal tubule cell',
        'Proximal tubule cell_ALDOB high',
        'Proximal tubule cell_MT1G high',
        'Proximal tubule cell_SOX4 high',
        'Kidney Epithelial cell',
        'Epithelial_cell_NUPR1 high',
    ]),
    ('Intercalated cell', [
        'Intercalated cell_SLC26A4 high',
        'Intercalated cell_SPINK1 high',
        'IC-tran-PC',
        'Principle cell',
    ]),
    ('Macrophage', ['Macrophage_APOC1 high', 'Macrophage_GPR183 high']),
    ('Smooth muscle cell', ['Myocyte']),
]

for coarse_label, fine_labels in merge_plan:
    # Boolean mask of the cells that currently carry one of the fine labels
    ix = np.isin(ref_cluster, fine_labels)
    ref_cluster[ix] = coarse_label

In [572]:
# Coarse labels that remain after merging, in the order the categorical should use
merged_celltypes = [
    'B cell',
    'B cell (Plasmocyte)',
    'Dendritic cell',
    'Distal tubule cell',
    'Endothelial cell',
    'Fibroblast',
    'Intercalated cell',
    'Macrophage',
    'Mast cell',
    'Myofibroblast',
    'Neutrophil',
    'Smooth muscle cell',
    'T cell',
    'Unknown',
    'Ureteric Epithelial cell',
]
# Write the merged labels back, restricted to exactly these categories
adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=merged_celltypes)

In [573]:
# Final, dataset-independent cell-type names, written as explicit old -> new pairs.
# The rename list is built by looking up each current category, so the result does
# not depend on the order of the categories; a label missing from the mapping is
# kept as it is instead of silently receiving a neighbour's name.
final_celltype_names = {
    'B cell': 'B cells',
    'B cell (Plasmocyte)': 'Plasma cells',
    'Dendritic cell': 'Dendritic cells',
    'Distal tubule cell': 'Tubule cells',
    'Endothelial cell': 'Endothelial cells',
    'Fibroblast': 'Fibroblast cells',
    'Intercalated cell': 'Collecting duct system cells',
    'Macrophage': 'Macrophages',
    'Mast cell': 'Mast cells',
    'Myofibroblast': 'Myofibroblast cells',
    'Neutrophil': 'Neutrophils',
    'Smooth muscle cell': 'Smooth muscle cells',
    'T cell': 'T cells',
    'Unknown': 'Unknown',
    'Ureteric Epithelial cell': 'Ureteric epithelial cells',
}
adata.rename_categories(
    'celltype',
    [final_celltype_names.get(name, name)
     for name in adata.obs['celltype'].cat.categories],
)

In [577]:
# Harmonize the tissue label: 'AdultKidney' -> 'Kidney'.
# The rename list is derived from the current categories through an explicit
# mapping; labels not present in the mapping are left unchanged.
sub_tissue_names = {'AdultKidney': 'Kidney'}
adata.rename_categories(
    'sub_tissue',
    [sub_tissue_names.get(name, name)
     for name in adata.obs['sub_tissue'].cat.categories],
)

In [578]:
# Harmonize the sex label: 'male' -> 'Male'.
# The rename list is derived from the current categories through an explicit
# mapping; labels not present in the mapping are left unchanged.
sex_names = {'male': 'Male'}
adata.rename_categories(
    'sex',
    [sex_names.get(name, name) for name in adata.obs['sex'].cat.categories],
)

In [579]:
# Harmonize the age labels by dropping the trailing 'Y' ('41Y' -> '41', ...).
# The rename list is derived from the current categories through an explicit
# mapping; labels not present in the mapping are left unchanged.
age_names = {'41Y': '41', '57Y': '57', '66Y': '66'}
adata.rename_categories(
    'age',
    [age_names.get(name, name) for name in adata.obs['age'].cat.categories],
)

In [580]:
# Prefix the donor ids with the dataset name ('Donor34' -> 'Han-Donor34', ...).
# The rename list is derived from the current categories through an explicit
# mapping; labels not present in the mapping are left unchanged.
donor_names = {
    'Donor34': 'Han-Donor34',
    'Donor36': 'Han-Donor36',
    'Donor37': 'Han-Donor37',
}
adata.rename_categories(
    'donor',
    [donor_names.get(name, name) for name in adata.obs['donor'].cat.categories],
)

In [581]:
# Harmonized per-cell metadata for this dataset, listed in the order the columns
# are written. Plain strings are broadcast to every cell; Series values copy an
# existing column. Fields with no information use the string 'NaN'.
harmonized_obs = {
    # dataset / tissue identifiers
    'Organ': 'Kidney',
    'Organ_Specific': adata.obs['sub_tissue'],
    'Dataset': 'Han_Kidney',
    'InternDatasetNumber': '15-4-Kidney-Han-2020',
    'Dataset_status': 'Healthy_Dataset',
    # cell-type annotation
    'celltype': adata.obs['celltype'],
    'sub_celltype': 'NaN',
    'Malignant': 'NonMalignant',
    # donor information
    'Patient': adata.obs['donor'],
    'Patient_Number': 'NaN',
    'age': adata.obs['age'],
    'sex': adata.obs['sex'],
    'ethnicity': 'NaN',
    'health_status': 'NaN',
    # labels as provided by the original dataset
    'original_celltype_1': adata.obs['celltype_specific'],
    'original_celltype_2': adata.obs['celltype_global'],
    'original_celltype_3': 'NaN',
}

for column, value in harmonized_obs.items():
    adata.obs[column] = value

In [583]:
# Convert the expression matrix to a SciPy CSR sparse matrix (the next cell writes the object to disk)
adata.X = sp.sparse.csr_matrix(adata.X)

In [584]:
# Write the processed kidney dataset to an .h5ad file.
# writepath is joined by plain string concatenation, so it has to end with a path separator.
adata.write(writepath + '15-4-Kidney-Han-2020-processed.h5ad')