HL-off-tumor-antigen-expression / Preprocessing.ipynb
Preprocessing.ipynb
Raw
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: The number and quality of accessible scRNA-seq datasets are rapidly increasing, with more and more platforms, such as sfaira and cellxgene, providing streamlined access to single-cell omics data.\n",
    "\n",
    "The following notebook shows how we used several scRNA Seq datasets in order to screen for off-tumor toxicities of CAR T target epitopes.\n",
    "\n",
    "Since the analysis in this notebook depends heavily on the version of specific packages, current versions may require adapting the code provided below.\n",
    "\n",
    "The notebook reproduces the preprocessing of multiple scRNA Seq datasets from various healthy tissues and the main figures from the corresponding paper.\n",
    "\n",
    "First we import required packages:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:14: FutureWarning: pandas.core.index is deprecated and will be removed in a future version.  The public classes are available in the top-level namespace.\n",
      "  from pandas.core.index import Index as PandasIndex\n",
      "/opt/python/lib/python3.7/site-packages/rpy2/robjects/pandas2ri.py:34: UserWarning: pandas >= 1.0 is not supported.\n",
      "  warnings.warn('pandas >= 1.0 is not supported.')\n"
     ]
    }
   ],
   "source": [
    "import scanpy as sc\n",
    "import numpy as np\n",
    "import scipy as sp\n",
    "import pandas as pd\n",
    "import anndata\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib import rcParams\n",
    "from matplotlib import colors\n",
    "import seaborn as sb\n",
    "import os\n",
    "import sfaira\n",
    "\n",
    "from os import listdir\n",
    "from os.path import isfile, join\n",
    "\n",
    "import warnings\n",
    "from rpy2.rinterface import RRuntimeWarning\n",
    "from rpy2.robjects import pandas2ri\n",
    "\n",
    "%load_ext rpy2.ipython\n",
    "\n",
    "warnings.filterwarnings(\"ignore\", category=RRuntimeWarning)\n",
    "pandas2ri.activate()\n",
    "pd.set_option('display.max_rows', 500)\n",
    "pd.set_option('display.max_columns', 500)\n",
    "sc.settings.verbosity = 3\n",
    "\n",
    "#Define a nice colour map for gene expression\n",
    "colors2 = plt.cm.Reds(np.linspace(0, 1, 128))\n",
    "colors3 = plt.cm.Greys_r(np.linspace(0.7,0.8,20))\n",
    "colorsComb = np.vstack([colors3, colors2])\n",
    "mymap = colors.LinearSegmentedColormap.from_list('my_colormap', colorsComb)\n",
    "\n",
    "sc.set_figure_params(vector_friendly=True, color_map='Reds',\n",
    "                     dpi=200,transparent=True, fontsize=14)\n",
    "\n",
    "palette=['#3519ED','#EDFBA8', '#4D2E6A', '#0ECDB4', '#0C23B5', '#92BF0D', '#51F2E4', '#1CB09D', '#BFC1D4', '#A98BF3', '#98D1C6', '#9A5E72', '#B89327', '#6704A5', '#0F64B2', '#A69CF3', '#D97C2E',\n",
    "        '#321795', '#7E2649', '#65A813', '#734E0F', '#D14FBC', '#2CFA50', '#83610A', '#841032', '#04D8E3', '#8605BA', '#1EF60C', '#602587', '#F9AD1B', '#2BD738', '#C8E239', '#46D1BC',\n",
    "        '#465A01', '#70C51E', '#924DFB', '#5D28E4', '#712E8D', '#30F465', '#5287E1', '#D16B7F', '#B8EF72', '#03B15D', '#4290F3', '#5E9063', '#B0E39F', '#3C6205', '#ACE603', '#2DE57A',\n",
    "         '#052B18', '#69BE75', '#8BE509', '#C37041', '#E218BA', '#5AC097', '#A597C4','#2A13B5', '#823FA7', '#C8F349', '#7C482A', '#A104E5', '#8E3C27', '#1DB457', '#36FDC7', '#60A934', '#F43B78',\n",
    "        '#74EF61', '#810527', '#6D293F', '#F1A709', '#274F6C', '#104EB5', '#6D02CE', '#B36AE5', '#13F9BA', '#C7281F', '#31572B', '#07D94B', '#B45E18', '#9F73D1', '#3CA059', '#4D6B1C', \n",
    "        '#3BA785', '#E3084C', '#A61BF4', '#F3D50A', '#640A32', '#FB0D73', '#AD0126', '#279136', '#480EB1', '#634158', '#FD80E5', '#CF26AE', '#046DB9', '#15F6B8', '#1A047C', '#D14826', \n",
    "        '#45D160', '#C6039B', '#D9C23F', '#70829B', '#940ECF', '#9FAB37', '#BA84C7', '#68F275', '#C79EA1', '#6E89B7', '#712D6C', '#F51D8C', '#D564E0', '#24EA18', '#459C71', '#23F50B', \n",
    "        '#3E57C8', '#6D78F9', '#07EC9F', '#3078C5', '#53EC0A','#D283E5', '#16039B', '#61E0A8', '#10A659', '#52374A', '#B31EC4', '#254D10', '#D5B0F4', '#A79E35', '#2D0F45', '#562D18', \n",
    "        '#ABE562', '#9A0842', '#92165C', '#FCD98A', '#B384DA', '#3CB108', '#2FE04C', '#386CA2', '#423719', '#E540C2', '#C58DE9', '#F6B8E4','#87592B', '#94D53F', '#4AB5E3', '#B0E96C',\n",
    "        '#AD94E6', '#E237B8', '#21F0B9', '#FB9C73']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "writepath = '/path/to/directory/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-----\n",
      "anndata     0.8.0\n",
      "scanpy      1.9.1\n",
      "-----\n",
      "PIL                                 7.1.1\n",
      "absl                                NA\n",
      "asciitree                           NA\n",
      "astor                               0.8.1\n",
      "b77c949143755c3209ad0c9f1475a050    NA\n",
      "backcall                            0.1.0\n",
      "biothings_client                    0.2.6\n",
      "botocore                            1.15.39\n",
      "certifi                             2020.04.05.1\n",
      "cffi                                1.14.0\n",
      "chardet                             3.0.4\n",
      "charset_normalizer                  2.1.0\n",
      "cloudpickle                         1.3.0\n",
      "cycler                              0.10.0\n",
      "cython_runtime                      NA\n",
      "dask                                2.14.0\n",
      "dateutil                            2.8.1\n",
      "decorator                           4.4.2\n",
      "entrypoints                         0.3\n",
      "fasteners                           0.17.3\n",
      "fsspec                              0.7.2\n",
      "gast                                NA\n",
      "google                              NA\n",
      "gprofiler                           1.0.0\n",
      "h5py                                3.7.0\n",
      "idna                                2.9\n",
      "igraph                              0.8.0\n",
      "ipykernel                           5.2.0\n",
      "ipython_genutils                    0.2.0\n",
      "ipywidgets                          7.5.1\n",
      "jedi                                0.16.0\n",
      "jinja2                              2.11.1\n",
      "joblib                              0.14.1\n",
      "keras_applications                  1.0.8\n",
      "keras_preprocessing                 1.1.0\n",
      "kiwisolver                          1.2.0\n",
      "leidenalg                           0.7.0\n",
      "llvmlite                            0.31.0\n",
      "louvain                             0.6.1\n",
      "lxml                                4.5.0\n",
      "markupsafe                          1.1.1\n",
      "matplotlib                          3.2.1\n",
      "mpl_toolkits                        NA\n",
      "mygene                              3.2.2\n",
      "natsort                             7.0.1\n",
      "networkx                            2.4\n",
      "numba                               0.48.0\n",
      "numcodecs                           0.10.0\n",
      "numexpr                             2.7.1\n",
      "numpy                               1.18.2\n",
      "obonet                              0.3.0\n",
      "opt_einsum                          v3.2.0\n",
      "owlready2                           0.38\n",
      "owlready2_optimized                 NA\n",
      "packaging                           21.3\n",
      "pandas                              1.3.5\n",
      "parso                               0.6.2\n",
      "pexpect                             4.8.0\n",
      "pickleshare                         0.7.5\n",
      "pkg_resources                       NA\n",
      "prompt_toolkit                      3.0.5\n",
      "psutil                              5.9.1\n",
      "ptyprocess                          0.6.0\n",
      "pyarrow                             8.0.0\n",
      "pygments                            2.6.1\n",
      "pyparsing                           2.4.7\n",
      "pytz                                2019.3\n",
      "requests                            2.28.1\n",
      "rpy2                                3.2.7\n",
      "scipy                               1.4.1\n",
      "seaborn                             0.10.0\n",
      "session_info                        1.0.0\n",
      "setuptools_scm                      NA\n",
      "sfaira                              v0.3.12\n",
      "six                                 1.14.0\n",
      "sklearn                             0.22.2.post1\n",
      "statsmodels                         0.11.1\n",
      "storemagic                          NA\n",
      "swig_runtime_data4                  NA\n",
      "tensorboard                         2.1.1\n",
      "tensorflow                          2.1.0\n",
      "tensorflow_core                     2.1.0\n",
      "tensorflow_estimator                NA\n",
      "termcolor                           1.1.0\n",
      "texttable                           1.6.2\n",
      "tlz                                 0.10.0\n",
      "toolz                               0.10.0\n",
      "tornado                             6.0.4\n",
      "tqdm                                4.45.0\n",
      "traitlets                           4.3.3\n",
      "typing_extensions                   NA\n",
      "tzlocal                             NA\n",
      "urllib3                             1.25.8\n",
      "wcwidth                             NA\n",
      "wrapt                               1.12.1\n",
      "yaml                                5.3.1\n",
      "zarr                                2.12.0\n",
      "zipp                                NA\n",
      "zmq                                 19.0.0\n",
      "-----\n",
      "IPython             7.13.0\n",
      "jupyter_client      6.1.2\n",
      "jupyter_core        4.6.3\n",
      "jupyterlab          2.1.0\n",
      "notebook            6.0.3\n",
      "-----\n",
      "Python 3.7.7 (default, Apr 10 2020, 15:22:44) [GCC 8.3.0]\n",
      "Linux-3.10.0-1160.42.2.el7.x86_64-x86_64-with-debian-10.3\n",
      "-----\n",
      "Session information updated at 2022-07-07 09:11\n"
     ]
    }
   ],
   "source": [
    "sc.logging.print_versions()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# 00-Datasets with multiple organs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we use sfaira to import available datasets with annotations.\n",
    "Note that the following steps may change depending on the current sfaira version and the path to your repository."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 00-1-MultipleOrgans-Pisco-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading: ncbitaxon_v2021-06-10.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/ncbitaxon\n",
      "Downloading: efo.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/efo\n",
      "Downloading: hsapdv_master.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/hsapdv\n",
      "Downloading: mmusdv.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/mmusdv\n",
      "Downloading: uberon_v2021-07-27.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/uberon\n",
      "Ontology <class 'sfaira.versions.metadata.base.OntologyUberonLifecyclestage'> is not a DAG, treat child-parent reasoning with care.\n",
      "Downloading: mondo_v2021-08-11.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/mondo\n",
      "Ontology <class 'sfaira.versions.metadata.base.OntologyMondo'> is not a DAG, treat child-parent reasoning with care.\n",
      "Ontology <class 'sfaira.versions.metadata.base.OntologyUberon'> is not a DAG, treat child-parent reasoning with care.\n",
      "Downloading: pato_v2021-08-06.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/pato\n"
     ]
    }
   ],
   "source": [
    "target_collections = [\"e5f58829-1a66-40b5-a624-9046778e74f5\"]\n",
    "cache_path = os.path.join(\".\", \"data\")\n",
    "dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)\n",
    "dsg.subset(key=\"collection_id\", values=target_collections)\n",
    "dsg.datasets\n",
    "dsg.download()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/path/to/repo/e5f58829-1a66-40b5-a624-9046778e74f5/'\n",
    "files = [f for f in listdir(path) if isfile(join(path, f))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['53d208b0-2cfd-4366-9866-c3c6114081bc.h5ad',\n",
       " 'a68b64d8-aee3-4947-81b7-36b8fe5a44d2.h5ad',\n",
       " 'c5d88abe-f23a-45fa-a534-788985e93dad.h5ad',\n",
       " '97a17473-e2b1-4f31-a544-44a60773e2dd.h5ad',\n",
       " '5a11f879-d1ef-458a-910c-9b0bdfca5ebf.h5ad']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = ['53d208b0-2cfd-4366-9866-c3c6114081bc.h5ad']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "53d208b0-2cfd-4366-9866-c3c6114081bc.h5ad\n",
      "AnnData object with n_obs × n_vars = 483152 × 58559\n",
      "    obs: 'tissue_in_publication', 'assay_ontology_term_id', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'disease_ontology_term_id', 'ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'id'\n",
      "    var: 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std', 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference'\n",
      "    uns: 'X_normalization', '_scvi', '_training_mode', 'compartment_colors', 'default_embedding', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'schema_version', 'sex_colors', 'tissue_in_publication_colors', 'title', 'umap'\n",
      "    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'\n",
      "    layers: 'decontXcounts'\n",
      "    obsp: 'connectivities', 'distances'\n"
     ]
    }
   ],
   "source": [
    "# Read each selected .h5ad file, tag its cells with the source file name and\n",
    "# collect everything in a single AnnData object (adata_pisco).\n",
    "for i, fname in enumerate(files):\n",
    "    print(fname)\n",
    "    # os.path.join works whether or not `path` ends with a path separator\n",
    "    path_2 = join(path, fname)\n",
    "    u = sc.read_h5ad(path_2)\n",
    "    u.obs['id'] = fname\n",
    "    print(u)\n",
    "    # the first file seeds the object; every further file is concatenated with join='outer'\n",
    "    adata_pisco = u if i == 0 else adata_pisco.concatenate(u, join='outer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pisco.var.index = adata_pisco.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pisco.obs['InternDatasetNumber'] ='00-1-MultipleOrgans-Pisco-2022'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pisco.write(writepath + '00-1-MultipleOrgans-Pisco-2022-raw.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 00-2-MultipleOrgans-Han-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "ID = 'homosapiens_None_2020_microwellseq_han_001_10.1038/s41586-020-2157-4'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set this path to your local sfaira data repository\n",
    "basedir = '.'\n",
    "datadir = os.path.join(basedir, 'raw')\n",
    "metadir = os.path.join(basedir, 'meta')\n",
    "cachedir = os.path.join(basedir, 'cache')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = sfaira.data.Universe(data_path=datadir, meta_path=metadir, cache_path=cachedir)\n",
    "# subset to the selected dataset\n",
    "ds.subset(key=\"id\", values=[ID])  # keeps only the dataset whose id equals ID\n",
    "# download and load the specific dataset\n",
    "ds.download()\n",
    "ds.load(verbose=1)\n",
    "# get the unmodified adata object of the dataset\n",
    "adata = ds.datasets[ID].adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_han = adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_han.obs['InternDatasetNumber'] ='00-2-MultipleOrgans-Han-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_han.write(writepath + '00-2-MultipleOrgans-Han-2020-raw.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 00-3-MultipleOrgans-ImmuneCells-Teichmann-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ontology <class 'sfaira.versions.metadata.base.OntologyUberonLifecyclestage'> is not a DAG, treat child-parent reasoning with care.\n",
      "Ontology <class 'sfaira.versions.metadata.base.OntologyMondo'> is not a DAG, treat child-parent reasoning with care.\n",
      "Ontology <class 'sfaira.versions.metadata.base.OntologyUberon'> is not a DAG, treat child-parent reasoning with care.\n"
     ]
    }
   ],
   "source": [
    "target_collections = [\"62ef75e4-cbea-454e-a0ce-998ec40223d3\"]\n",
    "cache_path = os.path.join(\".\", \"data\")\n",
    "dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)\n",
    "dsg.subset(key=\"collection_id\", values=target_collections)\n",
    "dsg.datasets\n",
    "dsg.download()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/path/to/repo/62ef75e4-cbea-454e-a0ce-998ec40223d3/'\n",
    "files = [f for f in listdir(path) if isfile(join(path, f))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ae29ebd0-1973-40a4-a6af-d15a5f77a80f.h5ad',\n",
       " 'fe52003e-1460-4a65-a213-2bb1a508332f.h5ad',\n",
       " '71be997d-ff75-41b9-8a9f-1288c865f921.h5ad',\n",
       " '1b9d8702-5af8-4142-85ed-020eb06ec4f6.h5ad']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = ['1b9d8702-5af8-4142-85ed-020eb06ec4f6.h5ad']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1b9d8702-5af8-4142-85ed-020eb06ec4f6.h5ad\n",
      "329762\n"
     ]
    }
   ],
   "source": [
    "# Read the selected .h5ad file(s), tag the cells with the source file name and\n",
    "# print the number of cells; after the loop `u` holds the last file that was read.\n",
    "for i, fname in enumerate(files):\n",
    "    print(fname)\n",
    "    # os.path.join works whether or not `path` ends with a path separator\n",
    "    path_2 = join(path, fname)\n",
    "    u = sc.read_h5ad(path_2)\n",
    "    u.obs['id'] = fname\n",
    "    print(len(u.obs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_teichmann = u.copy()\n",
    "%reset_selective -f \"^u$\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_teichmann.obs['InternDatasetNumber'] = '00-3-MultipleOrgans_ImmuneCells-Teichmann-2022'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_teichmann.var.index = adata_teichmann.var['gene_symbols']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata = adata_teichmann.copy()\n",
    "%reset_selective -f \"^adata_teichmann$\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "# A row sum over a scipy sparse matrix (or np.matrix) comes back as an (n_cells, 1) matrix;\n",
    "# np.asarray(...).ravel() turns it into a plain 1-D array so every .obs column holds one scalar per cell.\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 61 cells that have more than 8000 genes expressed\n",
      "filtered out 10098 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 4300)\n",
    "sc.pp.filter_cells(adata, max_genes = 8000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag mitochondrial genes by their 'MT-' symbol prefix\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names])\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert the sparse matrix to a dense one\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score: summed expression of the MT genes divided by the total per cell\n",
    "# (np.asarray(...).ravel() flattens the (n_cells, 1) row sum into a 1-D array)\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 20% mito fraction\n",
    "# .copy() turns the subset into a real AnnData object instead of a view, so later writes to\n",
    "# .obs / .X do not have to convert it implicitly (ImplicitModificationWarning)\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:34): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:01:49)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:01:06)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 26 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:01:47)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "# library() stops with an error if scran is not installed; require() would only warn\n",
    "# and return FALSE, and the failure would surface later as a missing function.\n",
    "library(scran)\n",
    "# size factors computed from data_mat, with input_groups passed as the cluster assignment\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Make a copy of the current count data (adata.X) for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:45)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:46)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:02:01)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:05:52)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['CD16-negative, CD56-bright natural killer cell, human',\n",
    "       'CD16-positive, CD56-dim natural killer cell, human',\n",
    "       'CD4-positive helper T cell', 'CD8-positive, alpha-beta memory T cell',\n",
    "       'CD8-positive, alpha-beta memory T cell, CD45RO-positive',\n",
    "       'T follicular helper cell', 'alpha-beta T cell', 'alveolar macrophage',\n",
    "       'animal cell', 'classical monocyte', 'conventional dendritic cell',\n",
    "       'dendritic cell, human',\n",
    "       'effector memory CD4-positive, alpha-beta T cell',\n",
    "       'effector memory CD8-positive, alpha-beta T cell, terminally differentiated',\n",
    "       'erythroid lineage cell', 'gamma-delta T cell',\n",
    "       'germinal center B cell', 'group 3 innate lymphoid cell', 'lymphocyte',\n",
    "       'macrophage', 'mast cell', 'megakaryocyte', 'memory B cell',\n",
    "       'mucosal invariant T cell', 'naive B cell',\n",
    "       'naive thymus-derived CD4-positive, alpha-beta T cell',\n",
    "       'naive thymus-derived CD8-positive, alpha-beta T cell',\n",
    "       'non-classical monocyte', 'plasma cell', 'plasmablast',\n",
    "       'plasmacytoid dendritic cell', 'precursor B cell', 'pro-B cell',\n",
    "       'progenitor cell', 'regulatory T cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained cell-type labels into one representative label per group.\n",
    "# Keys are the labels that are kept; values are the labels that are folded into them.\n",
    "# NOTE(review): 'erythroid lineage cell' is folded into the T-cell label - confirm this is intended.\n",
    "merged_labels = {\n",
    "    'CD16-negative, CD56-bright natural killer cell, human': [\n",
    "        'CD16-positive, CD56-dim natural killer cell, human',\n",
    "    ],\n",
    "    'CD4-positive helper T cell': [\n",
    "        'CD8-positive, alpha-beta memory T cell',\n",
    "        'CD8-positive, alpha-beta memory T cell, CD45RO-positive',\n",
    "        'T follicular helper cell',\n",
    "        'alpha-beta T cell',\n",
    "        'effector memory CD4-positive, alpha-beta T cell',\n",
    "        'effector memory CD8-positive, alpha-beta T cell, terminally differentiated',\n",
    "        'erythroid lineage cell',\n",
    "        'gamma-delta T cell',\n",
    "        'mucosal invariant T cell',\n",
    "        'naive thymus-derived CD4-positive, alpha-beta T cell',\n",
    "        'naive thymus-derived CD8-positive, alpha-beta T cell',\n",
    "        'regulatory T cell',\n",
    "    ],\n",
    "    'conventional dendritic cell': ['dendritic cell, human', 'plasmacytoid dendritic cell'],\n",
    "    'germinal center B cell': ['memory B cell', 'naive B cell', 'precursor B cell', 'pro-B cell'],\n",
    "    'alveolar macrophage': ['macrophage'],\n",
    "    'classical monocyte': ['non-classical monocyte'],\n",
    "    'plasma cell': ['plasmablast'],\n",
    "    'animal cell': ['progenitor cell'],\n",
    "}\n",
    "\n",
    "for kept_label, folded_labels in merged_labels.items():\n",
    "    ix = np.isin(ref_cluster, folded_labels)\n",
    "    # always assign a plain string (never a 1-tuple) to the selected entries\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['CD16-negative, CD56-bright natural killer cell, human',\n",
    "       \n",
    "       'CD4-positive helper T cell', \n",
    "        'alveolar macrophage',\n",
    "       'animal cell', 'classical monocyte', 'conventional dendritic cell',\n",
    "     \n",
    "       \n",
    "       'germinal center B cell', 'group 3 innate lymphoid cell', 'lymphocyte',\n",
    "     'mast cell', 'megakaryocyte', \n",
    "     \n",
    "      \n",
    "       'plasma cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['NK cells',\n",
    "       \n",
    "       'T cells', \n",
    "        'Macrophages',\n",
    "       'Remove', 'Monocytes', 'Dendritic cells',\n",
    "       'B cells', 'Innate lymphoid cells', 'Lymphocytes',\n",
    "     'Mast cells', 'Megakaryocytes',     \n",
    "       'Plasma cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata.obs['organism'], ['Homo sapiens'])\n",
    "adata=adata[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['NK cells', 'T cells', 'Macrophages', 'Remove', 'Monocytes',\n",
       "       'Dendritic cells', 'B cells', 'Innate lymphoid cells', 'Lymphocytes',\n",
       "       'Mast cells', 'Megakaryocytes', 'Plasma cells'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata.obs['celltype'],['NK cells', 'T cells', 'Macrophages', 'Monocytes',\n",
    "       'Dendritic cells', 'B cells', 'Innate lymphoid cells', 'Lymphocytes',\n",
    "       'Mast cells', 'Megakaryocytes', 'Plasma cells'])\n",
    "adata=adata[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['blood', 'bone marrow', 'caecum', 'duodenum', 'ileum',\n",
       "       'jejunal epithelium', 'lamina propria', 'liver', 'lung',\n",
       "       'mesenteric lymph node', 'omentum', 'sigmoid colon',\n",
       "       'skeletal muscle tissue', 'spleen', 'thoracic lymph node', 'thymus',\n",
       "       'transverse colon'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['tissue'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sub_tissue'] = adata.obs['tissue']\n",
    "ref_cluster=pd.Categorical(adata.obs['sub_tissue'],\n",
    "                           categories=['blood', 'bone marrow', 'caecum', 'duodenum', 'ileum',\n",
    "       'jejunal epithelium', 'lamina propria', 'liver', 'lung',\n",
    "       'mesenteric lymph node', 'omentum', 'sigmoid colon',\n",
    "       'skeletal muscle tissue', 'spleen', 'thoracic lymph node', 'thymus',\n",
    "       'transverse colon'])\n",
    "adata.rename_categories('sub_tissue', ['Blood', 'BoneMarrow', 'Gut_Colon_Ceacum', 'Gut_SmallIntestine_Duodenum', 'Gut_SmallIntestine_Ileum',\n",
    "       'Gut_SmallIntestine_Jejunum', 'Gut_SmallIntestine', 'Liver', 'Lung',\n",
    "       'LymphNode_Mesenteric', 'Omentum', 'Gut_Colon_Sigmoid',\n",
    "       'Muscle', 'Spleen', 'LymphNode_Thoracic', 'Thymus',\n",
    "       'Gut_Colon_Transverse'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['tissue_major'] = adata.obs['tissue']\n",
    "ref_cluster=pd.Categorical(adata.obs['tissue_major'],\n",
    "                           categories=['blood', 'bone marrow', 'caecum', 'duodenum', 'ileum',\n",
    "       'jejunal epithelium', 'lamina propria', 'liver', 'lung',\n",
    "       'mesenteric lymph node', 'omentum', 'sigmoid colon',\n",
    "       'skeletal muscle tissue', 'spleen', 'thoracic lymph node', 'thymus',\n",
    "       'transverse colon'])\n",
    "\n",
    "ix=np.isin(ref_cluster,['ileum',  'jejunal epithelium', 'lamina propria',])\n",
    "ref_cluster[ix]='duodenum'\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'thoracic lymph node'])\n",
    "ref_cluster[ix]='mesenteric lymph node'\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'sigmoid colon', 'transverse colon'])\n",
    "ref_cluster[ix]='caecum'\n",
    "\n",
    "\n",
    "\n",
    "adata.obs['tissue_major']=pd.Categorical(ref_cluster,\n",
    "                                           categories= ['blood', 'bone marrow', 'caecum', 'duodenum',\n",
    "       'liver', 'lung',\n",
    "       'mesenteric lymph node', 'omentum',\n",
    "       'skeletal muscle tissue', 'spleen',  'thymus'])\n",
    "\n",
    "adata.rename_categories('tissue_major', ['Blood', 'BoneMarrow', 'Gut_Colon', 'Gut_SmallIntestine',\n",
    "       'Liver', 'Lung',\n",
    "       'LymphNode', 'Omentum',\n",
    "       'Muscle', 'Spleen', 'Thymus'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex'],\n",
    "                           categories=['female', 'male'])\n",
    "adata.rename_categories('sex', ['Female', 'Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['ethnicity'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['ethnicity'],\n",
    "                           categories=['unknown'])\n",
    "adata.rename_categories('ethnicity', ['NaN'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['development_stage'],\n",
    "                           categories=['eighth decade human stage', 'seventh decade human stage',\n",
    "       'sixth decade human stage'])\n",
    "adata.rename_categories('development_stage', ['80', '70',\n",
    "       '60'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['donor'] = adata.obs['Donor']\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['582C', '621B', '637C', '640C', 'A29', 'A31', 'A35', 'A36', 'A37',\n",
    "       'A52', 'D496', 'D503'])\n",
    "adata.rename_categories('donor', ['582C', '621B', '637C', '640C', 'A29', 'A31', 'A35', 'A36', 'A37',\n",
    "       'A52', 'D496', 'D503'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] =  adata.obs['tissue_major']\n",
    "adata.obs['Organ_Specific'] = adata.obs['sub_tissue']\n",
    "adata.obs['Dataset'] = 'Teichmann_MultipleOrgans_ImmuneCells'\n",
    "adata.obs['InternDatasetNumber'] = '00-3-MultipleOrgans_ImmuneCells-Teichmann-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_analysis = adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_analysis.obs['Dataset'] = adata_analysis.obs['tissue_major']\n",
    "ref_cluster=pd.Categorical(adata_analysis.obs['Dataset'],\n",
    "                           categories=['Blood', 'BoneMarrow', 'Gut_Colon', 'Gut_SmallIntestine',\n",
    "       'Liver', 'Lung',\n",
    "       'LymphNode', 'Omentum',\n",
    "       'Muscle', 'Spleen', 'Thymus'])\n",
    "adata_analysis.rename_categories('Dataset', ['Teichmann_Blood', 'Teichmann_BoneMarrow', 'Teichmann_Gut_Colon', 'Teichmann_Gut_SmallIntestine',\n",
    "       'Teichmann_Liver', 'Teichmann_Lung',\n",
    "       'Teichmann_LymphNode', 'Teichmann_Omentum',\n",
    "       'Teichmann_Muscle', 'Teichmann_Spleen', 'Teichmann_Thymus'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_analysis.write(writepath + '00-3-MultipleOrgans_ImmuneCells-Teichmann-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# 01-Brain"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 01-1-Brain-Habib-2017"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# here we use sfaira to import available datasets with annotations\n",
    "# note that the following steps may change depending on the current sfaira version and the path to your repository\n",
    "\n",
    "datadir = '/path/to/repo/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = sfaira.data.human.DatasetGroupBrain(path=datadir)  # This links all data sets available"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['human_brain_2017_DroNcSeq_habib_001_10.1038/nmeth.4407',\n",
       " 'human_brain_2020_microwell_han_001_10.1038/s41586-020-2157-4',\n",
       " 'human_brain_2020_microwell_han_002_10.1038/s41586-020-2157-4',\n",
       " 'human_brain_2020_microwell_han_003_10.1038/s41586-020-2157-4',\n",
       " 'human_brain_2020_microwell_han_004_10.1038/s41586-020-2157-4',\n",
       " 'human_brain_2020_microwell_han_005_10.1038/s41586-020-2157-4',\n",
       " 'human_brain_2020_microwell_han_006_10.1038/s41586-020-2157-4']"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds.ids "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pick first one (Habib2017)\n",
    "idx = ds.ids[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/icb/moritz.thomas/miniconda3/lib/python3.7/site-packages/sfaira-master/sfaira/data/base.py:84: UserWarning: using default genomes Homo_sapiens_GRCh38_97\n",
      "  warnings.warn(f\"using default genomes {genome}\")\n"
     ]
    }
   ],
   "source": [
    "ds.datasets[idx].load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AnnData object with n_obs × n_vars = 13067 × 25587\n",
      "    obs: 'CellType', 'n_counts', 'log1p_n_counts', 'n_genes', 'log1p_n_genes', 'percent_mito', 'percent_ribo', 'percent_hb', 'percent_top50', 'cell_ontology_class', 'healthy', 'state_exact', 'cell_ontology_id'\n",
      "    var: 'names', 'gene_ids', 'mito', 'ribo', 'hb', 'n_counts', 'n_cells', 'n_genes', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'ensembl'\n",
      "    uns: 'leiden', 'neighbors', 'pca', 'lab', 'year', 'doi', 'protocol', 'organ', 'subtissue', 'animal', 'id', 'wget_download', 'has_celltypes', 'counts', 'mapped_features'\n",
      "    obsm: 'X_umap'\n",
      "    varm: 'PCs'\n",
      "    obsp: 'connectivities', 'distances'\n"
     ]
    }
   ],
   "source": [
    "print(ds.datasets[idx].adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata=ds.datasets[idx].adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='01-1-Brain-Habib-2017'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.calculate_qc_metrics(adata, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 7876 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 14000)\n",
    "sc.pp.filter_cells(adata, max_genes = 4000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes\n",
    "mt_genes = adata.var_names[[gene.startswith('MT-') for gene in adata.var_names]]\n",
    "np.array(mt_genes)\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "\n",
    "# convert to spare\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score\n",
    "y = np.bincount(mt_gene_mask)\n",
    "ii = np.nonzero(y)[0]\n",
    "np.vstack((ii,y[ii])).T\n",
    "adata.X[:, mt_gene_mask].sum(1)\n",
    "mt_sum=np.array(adata.X[:, mt_gene_mask].sum(1))\n",
    "mt_sum=np.array(pd.DataFrame(mt_sum)[0])\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 25% mito fraction\n",
    "adata = adata[adata.obs['mt_frac'] < 0.25]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:08)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 9 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "R[write to console]: Loading required package: scran\n",
      "\n",
      "R[write to console]: Loading required package: SingleCellExperiment\n",
      "\n",
      "R[write to console]: Loading required package: SummarizedExperiment\n",
      "\n",
      "R[write to console]: Loading required package: GenomicRanges\n",
      "\n",
      "R[write to console]: Loading required package: stats4\n",
      "\n",
      "R[write to console]: Loading required package: BiocGenerics\n",
      "\n",
      "R[write to console]: Loading required package: parallel\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘BiocGenerics’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:parallel’:\n",
      "\n",
      "    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n",
      "    clusterExport, clusterMap, parApply, parCapply, parLapply,\n",
      "    parLapplyLB, parRapply, parSapply, parSapplyLB\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:stats’:\n",
      "\n",
      "    IQR, mad, sd, var, xtabs\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:base’:\n",
      "\n",
      "    anyDuplicated, append, as.data.frame, basename, cbind, colnames,\n",
      "    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,\n",
      "    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,\n",
      "    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,\n",
      "    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,\n",
      "    union, unique, unsplit, which, which.max, which.min\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: S4Vectors\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘S4Vectors’\n",
      "\n",
      "\n",
      "R[write to console]: The following object is masked from ‘package:base’:\n",
      "\n",
      "    expand.grid\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: IRanges\n",
      "\n",
      "R[write to console]: Loading required package: GenomeInfoDb\n",
      "\n",
      "R[write to console]: Loading required package: Biobase\n",
      "\n",
      "R[write to console]: Welcome to Bioconductor\n",
      "\n",
      "    Vignettes contain introductory material; view with\n",
      "    'browseVignettes()'. To cite Bioconductor, see\n",
      "    'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: DelayedArray\n",
      "\n",
      "R[write to console]: Loading required package: matrixStats\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘matrixStats’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:Biobase’:\n",
      "\n",
      "    anyMissing, rowMedians\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: BiocParallel\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘DelayedArray’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:matrixStats’:\n",
      "\n",
      "    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:base’:\n",
      "\n",
      "    aperm, apply, rowsum\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:11)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_ontology_class'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Astrocytes 1', 'Astrocytes 2', 'Endothelial cells',\n",
    "       'GABAergic interneurons 1', 'GABAergic interneurons 2',\n",
    "       'Glutamatergic neurons from the PFC 1',\n",
    "       'Glutamatergic neurons from the PFC 2',\n",
    "       'Granule neurons from the hip dentate gyrus region', 'Microglia',\n",
    "       'Neuronal stem cells', 'Oligodendrocyte precursors', 'Oligodendrocytes',\n",
    "       'Pyramidal neurons from the hip CA region 1',\n",
    "       'Pyramidal neurons from the hip CA region 2', 'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['Astrocytes 2'])\n",
    "ref_cluster[ix]='Astrocytes 1'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Glutamatergic neurons from the PFC 2','Granule neurons from the hip dentate gyrus region',\n",
    "                        'Pyramidal neurons from the hip CA region 1','Pyramidal neurons from the hip CA region 2',\n",
    "                       'GABAergic interneurons 1', 'GABAergic interneurons 2'])\n",
    "ref_cluster[ix]='Glutamatergic neurons from the PFC 1'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Oligodendrocytes'])\n",
    "ref_cluster[ix]='Oligodendrocyte precursors'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Astrocytes 1', 'Endothelial cells',\n",
    "       'Glutamatergic neurons from the PFC 1','Microglia',\n",
    "       'Neuronal stem cells', 'Oligodendrocyte precursors',\n",
    "       'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Astrocytes', 'Endothelial cells',\n",
    "       'Neurons', 'Microglial cells',\n",
    "       'Neuronal stem cells', 'Oligodendrocytes',\n",
    "       'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Brain'\n",
    "adata.obs['Organ_Specific'] = 'Brain_Hippocampus_PrefrontalCortex'\n",
    "adata.obs['Dataset'] = 'Habib_Brain'\n",
    "adata.obs['InternDatasetNumber'] = '01-1-Brain-Habib-2017'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = 'Habib_Brain-Donor1'\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = 'NaN'\n",
    "adata.obs['sex'] = 'NaN'\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_ontology_class']\n",
    "adata.obs['original_celltype_2'] = adata.obs['CellType']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '01-1-Brain-Habib-2017-processed.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## 01-2-Brain_Cerebellum-Han-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 355,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_han.obs['sub_tissue'],['AdultCerebellum']) \n",
    "adata=adata_han[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 356,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='01-2-Brain_Cerebellum-Han-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 360,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 363,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 19 cells that have more than 4000 counts\n",
      "filtered out 11878 genes that are detected in less than 10 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 4000)\n",
    "sc.pp.filter_cells(adata, max_genes = 2500)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=10) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 364,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes\n",
    "mt_genes = adata.var_names[[gene.startswith('MT-') for gene in adata.var_names]]\n",
    "np.array(mt_genes)\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "\n",
    "# convert to spare\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score\n",
    "y = np.bincount(mt_gene_mask)\n",
    "ii = np.nonzero(y)[0]\n",
    "np.vstack((ii,y[ii])).T\n",
    "adata.X[:, mt_gene_mask].sum(1)\n",
    "mt_sum=np.array(adata.X[:, mt_gene_mask].sum(1))\n",
    "mt_sum=np.array(pd.DataFrame(mt_sum)[0])\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 20% mito fraction\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 365,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 366,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 11 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 367,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 368,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "# data_mat and input_groups are passed in from Python (-i); size_factors is\n",
    "# handed back to Python (-o).\n",
    "# library() stops with an error if scran is not installed, whereas require()\n",
    "# would only warn and return FALSE, deferring the failure to the next line.\n",
    "library(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 369,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 370,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 371,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 372,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 373,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize data: divide every cell (row of X) by its size factor, then log1p-transform.\n",
    "# Both steps modify adata.X in place, so running this cell a second time would\n",
    "# normalize the already-normalized values again.\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 374,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 375,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:14)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 376,
   "metadata": {},
   "outputs": [],
   "source": [
    "#sc.pl.umap(adata, color='celltype_specific')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 377,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Astrocyte', 'Astrocyte(Bergmann glia)', 'B cell', 'Endothelial cell',\n",
       "       'Epithelial cell', 'Excitatory neuron', 'Inhibitory neuron',\n",
       "       'Interneuron', 'Macrophage', 'Neutrophil_DEFA3 high',\n",
       "       'Neutrophil_FCGR3B high', 'Neutrophil_LYZ high', 'Oligodendrocyte',\n",
       "       'Oligodendrocyte progenitor cell', 'Pericyte', 'Smooth muscle cell',\n",
       "       'T cell', 'Unknown'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 377,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['celltype_specific'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 378,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Astrocyte', 'Astrocyte(Bergmann glia)', 'B cell', 'Endothelial cell',\n",
    "       'Epithelial cell', 'Excitatory neuron', 'Inhibitory neuron',\n",
    "       'Interneuron', 'Macrophage', 'Neutrophil_DEFA3 high',\n",
    "       'Neutrophil_FCGR3B high', 'Neutrophil_LYZ high', 'Oligodendrocyte',\n",
    "       'Oligodendrocyte progenitor cell', 'Pericyte', 'Smooth muscle cell',\n",
    "       'T cell', 'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 379,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained annotations onto one representative label per group.\n",
    "# Keys are the labels that are kept, values are the labels folded into them.\n",
    "merged_labels = {\n",
    "    'Astrocyte': ['Astrocyte(Bergmann glia)'],\n",
    "    'Excitatory neuron': ['Inhibitory neuron', 'Interneuron'],\n",
    "    'Neutrophil_DEFA3 high': ['Neutrophil_FCGR3B high', 'Neutrophil_LYZ high'],\n",
    "    'Oligodendrocyte': ['Oligodendrocyte progenitor cell'],\n",
    "}\n",
    "for kept_label, folded_labels in merged_labels.items():\n",
    "    ix = np.isin(ref_cluster, folded_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 380,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Astrocyte', 'B cell', 'Endothelial cell',\n",
    "       'Epithelial cell', 'Excitatory neuron', \n",
    "        'Macrophage', 'Neutrophil_DEFA3 high',\n",
    "        'Oligodendrocyte',\n",
    "        'Pericyte', 'Smooth muscle cell',\n",
    "       'T cell', 'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 381,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Astrocytes', 'B cells', 'Endothelial cells',\n",
    "       'Cerebellar epithelial cells', 'Neurons', \n",
    "        'Macrophages', 'Neutrophils',\n",
    "        'Oligodendrocytes',\n",
    "        'Pericytes', 'Smooth muscle cells',\n",
    "       'T cells', 'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 385,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonise the sub-tissue label ('AdultCerebellum' -> 'Brain_Cerebellum').\n",
    "# rename_categories assigns the new names by position, so this relies on\n",
    "# 'AdultCerebellum' being the only category left in this subset.\n",
    "adata.rename_categories('sub_tissue', ['Brain_Cerebellum'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 386,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonise the sex label ('female' -> 'Female').\n",
    "# rename_categories assigns the new names by position, so this relies on\n",
    "# 'female' being the only category left in this subset.\n",
    "adata.rename_categories('sex', ['Female'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 387,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonise the age label ('55Y' -> '55').\n",
    "# rename_categories assigns the new names by position, so this relies on\n",
    "# '55Y' being the only category left in this subset.\n",
    "adata.rename_categories('age',['55'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 388,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prefix the donor id with the study name ('Donor29' -> 'Han-Donor29').\n",
    "# rename_categories assigns the new names by position, so this relies on\n",
    "# 'Donor29' being the only category left in this subset.\n",
    "adata.rename_categories('donor', ['Han-Donor29'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 389,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill the unified annotation columns for this dataset.\n",
    "# 'NaN' is stored as a literal string placeholder where no value is set here.\n",
    "\n",
    "# Dataset / tissue level\n",
    "adata.obs['Organ'] = 'Brain'\n",
    "adata.obs['Organ_Specific'] = adata.obs['sub_tissue']\n",
    "adata.obs['Dataset'] = 'Han_Brain_Cerebellum'\n",
    "adata.obs['InternDatasetNumber'] = '01-2-Brain_Cerebellum-Han-2020'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "# Cell level; celltype was already harmonised above, so the self-assignment is a no-op\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "# Donor level; age and sex are assigned to themselves (no-op)\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['age']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "# Keep the annotation columns the dataset came with under generic names\n",
    "adata.obs['original_celltype_1'] = adata.obs['celltype_specific']\n",
    "adata.obs['original_celltype_2'] = adata.obs['celltype_global']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 391,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 392,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '01-2-Brain_Cerebellum-Han-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 01-3-Brain_TemporalLobe-Han-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 965,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_han.obs['sub_tissue'],['AdultTemporalLobe']) \n",
    "adata=adata_han[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 966,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='01-3-Brain_TemporalLobe-Han-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 970,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 973,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 24 cells that have more than 2100 counts\n",
      "filtered out 12218 genes that are detected in less than 10 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "# Filter out cells with more than 2100 counts or more than 1500 detected genes\n",
    "sc.pp.filter_cells(adata, max_counts = 2100)\n",
    "sc.pp.filter_cells(adata, max_genes = 1500)\n",
    "# Filter out genes that are detected in fewer than 10 cells\n",
    "# (author's tuning note, meaning unclear: 500 works; 450(30494), 400(29837), 300(28268) not)\n",
    "sc.pp.filter_genes(adata, min_cells=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 974,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify mitochondrial genes by their 'MT-' symbol prefix\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names])\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert the count matrix from sparse to dense; it is turned back into a\n",
    "# sparse matrix further down, before the file is written.\n",
    "adata.X = adata.X.todense()\n",
    "\n",
    "# Fraction of each cell's counts that comes from mitochondrial genes.\n",
    "# n_counts was computed in the QC-covariates cell above.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Filter out cells with over 20% mito fraction.\n",
    "# Subsetting an AnnData returns a view of the unfiltered object; copy it so that\n",
    "# later writes to adata.obs do not trigger an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 975,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 976,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:03)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 8 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 977,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 978,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "# data_mat and input_groups are passed in from Python (-i); size_factors is\n",
    "# handed back to Python (-o).\n",
    "# library() stops with an error if scran is not installed, whereas require()\n",
    "# would only warn and return FALSE, deferring the failure to the next line.\n",
    "library(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 979,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 980,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 981,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 982,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 983,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize data: divide every cell (row of X) by its size factor, then log1p-transform.\n",
    "# Both steps modify adata.X in place, so running this cell a second time would\n",
    "# normalize the already-normalized values again.\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 984,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 985,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:18)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 987,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Astrocyte', 'Endothelial cell', 'Glial cell', 'Inhibitory neuron',\n",
       "       'Macrophage', 'Microglia_ALOX5AP high', 'Microglia_C3 high',\n",
       "       'Neutrophil', 'Oligodendrocyte progenitor cell',\n",
       "       'Oligodendrocyte_MT gene high', 'Oligodendrocyte_TF high',\n",
       "       'Proliferating cell', 'Smooth muscle cell', 'T cell', 'Unknown'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 987,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['celltype_specific'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 988,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Astrocyte', 'Endothelial cell', 'Glial cell', 'Inhibitory neuron',\n",
    "       'Macrophage', 'Microglia_ALOX5AP high', 'Microglia_C3 high',\n",
    "       'Neutrophil', 'Oligodendrocyte progenitor cell',\n",
    "       'Oligodendrocyte_MT gene high', 'Oligodendrocyte_TF high',\n",
    "       'Proliferating cell', 'Smooth muscle cell', 'T cell', 'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 989,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained annotations onto one representative label per group.\n",
    "# Keys are the labels that are kept, values are the labels folded into them.\n",
    "merged_labels = {\n",
    "    'Microglia_ALOX5AP high': ['Microglia_C3 high'],\n",
    "    'Oligodendrocyte progenitor cell': ['Oligodendrocyte_MT gene high', 'Oligodendrocyte_TF high'],\n",
    "    'Unknown': ['Proliferating cell'],\n",
    "}\n",
    "for kept_label, folded_labels in merged_labels.items():\n",
    "    ix = np.isin(ref_cluster, folded_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 990,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Astrocyte', 'Endothelial cell', 'Glial cell', 'Inhibitory neuron',\n",
    "       'Macrophage', 'Microglia_ALOX5AP high', \n",
    "       'Neutrophil', 'Oligodendrocyte progenitor cell',\n",
    "       'Smooth muscle cell', 'T cell', 'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 991,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Astrocytes', 'Endothelial cells', 'Glial cells', 'Neurons',\n",
    "       'Macrophages', 'Microglial cells', \n",
    "       'Neutrophils', 'Oligodendrocytes',\n",
    "       'Smooth muscle cells', 'T cells', 'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 995,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonise the sub-tissue label ('AdultTemporalLobe' -> 'Brain_TemporalLobe').\n",
    "# rename_categories assigns the new names by position, so this relies on\n",
    "# 'AdultTemporalLobe' being the only category left in this subset.\n",
    "adata.rename_categories('sub_tissue', ['Brain_TemporalLobe'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 996,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonise the sex label ('female' -> 'Female').\n",
    "# rename_categories assigns the new names by position, so this relies on\n",
    "# 'female' being the only category left in this subset.\n",
    "adata.rename_categories('sex', ['Female'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 997,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonise the age label ('61Y' -> '61').\n",
    "# rename_categories assigns the new names by position, so this relies on\n",
    "# '61Y' being the only category left in this subset.\n",
    "adata.rename_categories('age',['61'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 998,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prefix the donor id with the study name ('Donor52' -> 'Han-Donor52'), matching\n",
    "# the 'Han-Donor29' label used for the cerebellum subset of the same study.\n",
    "# The previous target name was 'Donor52', which made the rename a no-op.\n",
    "# rename_categories assigns the new names by position, so this relies on\n",
    "# 'Donor52' being the only category left in this subset.\n",
    "adata.rename_categories('donor', ['Han-Donor52'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 999,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill the unified annotation columns for this dataset.\n",
    "# 'NaN' is stored as a literal string placeholder where no value is set here.\n",
    "\n",
    "# Dataset / tissue level\n",
    "adata.obs['Organ'] = 'Brain'\n",
    "adata.obs['Organ_Specific'] = adata.obs['sub_tissue']\n",
    "adata.obs['Dataset'] = 'Han_Brain_TemporalLobe'\n",
    "adata.obs['InternDatasetNumber'] ='01-3-Brain_TemporalLobe-Han-2020'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "# Cell level; celltype was already harmonised above, so the self-assignment is a no-op\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "# Donor level; age and sex are assigned to themselves (no-op)\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['age']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "# Keep the annotation columns the dataset came with under generic names\n",
    "adata.obs['original_celltype_1'] = adata.obs['celltype_specific']\n",
    "adata.obs['original_celltype_2'] = adata.obs['celltype_global']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1001,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1002,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1003,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '01-3-Brain_TemporalLobe-Han-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# 04-Gut"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 04-1-Oesophagus-Madissoon-2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 652,
   "metadata": {},
   "outputs": [],
   "source": [
    "# here we use sfaira to import available datasets with annotations\n",
    "# note that the following steps may change depending on the current sfaira version and the path to your repository\n",
    "\n",
    "datadir = '/path/to/repo/'\n",
    "\n",
    "ds = sfaira.data.human.DatasetGroupEsophagus(path=datadir)  # This links all data sets available"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 653,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds.ids "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 654,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pick first one\n",
    "idx = ds.ids[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 656,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds.datasets[idx].load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 657,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata=ds.datasets[idx].adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 658,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.var.index=np.array(adata.var.names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 660,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='04-1-Oesophagus-Madissoon-2019'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 663,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<87947x24245 sparse matrix of type '<class 'numpy.float32'>'\n",
       "\twith 155926526 stored elements in Compressed Sparse Column format>"
      ]
     },
     "execution_count": 663,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 664,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.calculate_qc_metrics(adata, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 667,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 5797 genes that are detected in less than 30 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "# Cell-level filters are disabled for this dataset (kept commented out):\n",
    "#sc.pp.filter_cells(adata, max_counts = 14000)\n",
    "#sc.pp.filter_cells(adata, max_genes = 4000)\n",
    "# Filter out genes that are detected in fewer than 30 cells\n",
    "sc.pp.filter_genes(adata, min_cells=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 668,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<87947x18448 sparse matrix of type '<class 'numpy.float32'>'\n",
       "\twith 155870612 stored elements in Compressed Sparse Column format>"
      ]
     },
     "execution_count": 668,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 669,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify mitochondrial genes by their 'MT-' symbol prefix\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names])\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert the count matrix from sparse to dense; it is turned back into a\n",
    "# sparse matrix further down, after normalization.\n",
    "adata.X = adata.X.todense()\n",
    "\n",
    "# Fraction of each cell's counts that comes from mitochondrial genes.\n",
    "# NOTE(review): no visible cell in this section creates 'n_counts' (the\n",
    "# calculate_qc_metrics call above is documented to add 'total_counts');\n",
    "# presumably the column ships with the loaded dataset - confirm.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Filter out cells with over 20% mito fraction.\n",
    "# Subsetting an AnnData returns a view of the unfiltered object; copy it so that\n",
    "# later writes to adata.obs do not trigger an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 671,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 672,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:12): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:21)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:11)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 15 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:24)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 673,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 674,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "# data_mat and input_groups are passed in from Python (-i); size_factors is\n",
    "# handed back to Python (-o).\n",
    "# library() stops with an error if scran is not installed, whereas require()\n",
    "# would only warn and return FALSE, deferring the failure to the next line.\n",
    "library(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 675,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "# Delete adata_pp (only needed for the clustering above) and store the\n",
    "# size factors returned by the R cell as a per-cell annotation\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 676,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 677,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 678,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 679,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize data: divide every cell (row of X) by its size factor, then log1p-transform.\n",
    "# Both steps modify adata.X in place, so running this cell a second time would\n",
    "# normalize the already-normalized values again.\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 680,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 681,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 682,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:17)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:16)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:01:09)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 684,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_ontology_class'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 685,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B_CD27neg', 'B_CD27pos', 'Basal cell', 'Blood_vessel',\n",
    "       'Dendritic cell', 'Epi_dividing', 'Epi_suprabasal', 'Epi_upper',\n",
    "       'Glands_duct', 'Glands_mucous', 'Lymph_vessel', 'Mast cell',\n",
    "       'Mono_macro', 'NK_T_CD8_Cytotoxic', 'Stratified epithelial cell',\n",
    "       'Stromal cell', 'T_CD4', 'T_CD8'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 686,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['B_CD27pos'])\n",
    "ref_cluster[ix]='B_CD27neg'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Epi_suprabasal', 'Epi_upper'])\n",
    "ref_cluster[ix]='Epi_dividing'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Glands_mucous'])\n",
    "ref_cluster[ix]='Glands_duct'\n",
    "\n",
    "ix=np.isin(ref_cluster,['T_CD4'])\n",
    "ref_cluster[ix]='T_CD8'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 687,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B_CD27neg', 'Basal cell', 'Blood_vessel',\n",
    "       'Dendritic cell', 'Epi_dividing',\n",
    "       'Glands_duct', 'Lymph_vessel', 'Mast cell',\n",
    "       'Mono_macro', 'NK_T_CD8_Cytotoxic','Stratified epithelial cell',\n",
    "       'Stromal cell', 'T_CD8'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 688,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['B cells', 'Basal cells', 'Endothelial cells', 'Dendritic cells',\n",
    "        'Oesophageal epithelial cells', 'Intestinal secretory cells', 'Lymphatic endothelial cells', 'Mast cells', 'Macrophages','NK cells', 'Mucosal squamous cells',\n",
    "        'Mesenchymal stromal cells', 'T cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 691,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['296C', '325C', '328C', '356C', '362C', '367C'], dtype='object')"
      ]
     },
     "execution_count": 691,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['patient'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 692,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['donor'] = adata.obs['patient']\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['296C', '325C', '328C', '356C', '362C', '367C'])\n",
    "adata.rename_categories('donor', ['Madissoon_Oesophagus-Donor1', 'Madissoon_Oesophagus-Donor2', 'Madissoon_Oesophagus-Donor3', 'Madissoon_Oesophagus-Donor4', 'Madissoon_Oesophagus-Donor5', 'Madissoon_Oesophagus-Donor6'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 693,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Oesophagus'\n",
    "adata.obs['Organ_Specific'] = 'Oesophagus'\n",
    "adata.obs['Dataset'] = 'Madissoon_Oesophagus'\n",
    "adata.obs['InternDatasetNumber'] ='04-1-Oesophagus-Madissoon-2019'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = adata.obs['sample']\n",
    "adata.obs['age'] = 'NaN'\n",
    "adata.obs['sex'] = 'NaN'\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_ontology_class']\n",
    "adata.obs['original_celltype_2'] = adata.obs['Celltypes']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 695,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 696,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 697,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath+'04-1-Oesophagus-Madissoon-2019-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 04-2-Gut_Colon-James-2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For the colon dataset of James, data was obtained from the original study (https://doi.org/10.1038/s41590-020-0602-z)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata=sc.read(writepath + 'james2020_raw.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 41650 × 18927\n",
       "    obs: 'donor', 'region', 'n_genes', 'percent_mito', 'n_counts', 'cell_type', 'BCR_ChainCombination', 'BCR_SEQUENCE_ID', 'IGH_V_CALL_GENOTYPED', 'IGH_D_CALL', 'IGH_J_CALL', 'BCR_ISOTYPE', 'BCR_CLONE', 'BCR_PANDONOR_CLONE', 'IGL_C_Gene', 'IGL_VDJ_Gene', 'IGH_READS', 'IGH_UMIS', 'IGH_MU_FREQ', 'IGK_READS', 'IGK_UMIS', 'IGL_READS', 'IGL_UMIS', 'TCR_v_gene', 'TCR_d_gene', 'TCR_j_gene', 'TCR_c_gene', 'TCR_cdr3', 'TCR_cdr3_nt', 'TCR_reads', 'TCR_umis', 'TCR_Clone'\n",
       "    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'gene_ids', 'feature_types'\n",
       "    uns: 'leiden', 'neighbors', 'pca', 'rank_genes_groups'\n",
       "    obsm: 'X_umap'\n",
       "    obsp: 'connectivities', 'distances'"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='04-2-Gut_Colon-James-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.calculate_qc_metrics(adata, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 10 cells that have more than 5000 genes expressed\n",
      "filtered out 3755 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 60000)\n",
    "sc.pp.filter_cells(adata, max_genes = 5000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<41640x15172 sparse matrix of type '<class 'numpy.float32'>'\n",
       "\twith 57674816 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boolean mask and names of the mitochondrial genes (gene symbols starting with 'MT-')\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense: the size-factor normalisation further down divides adata.X in place\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# mt score: per-cell sum over the MT genes, flattened to 1-D, divided by the cell's total counts\n",
    "mt_sum=np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Keep only cells with a mito fraction below 20%\n",
    "# .copy() turns the filtered view into a real AnnData, so later writes to .obs do not modify a view\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:02): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 15 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:03)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:14)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:08)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:35)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "#sc.pl.umap(adata, color='cell_type', palette=palette)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Activated CD4 T', 'B cell IgA Plasma', 'B cell IgG Plasma',\n",
    "       'B cell cycling', 'Follicular B cell', 'B cell memory', 'CD8 T', 'ILC',\n",
    "       'Lymphoid DC', 'Monocyte', 'Mast', 'Macrophage', 'LYVE1 Macrophage',\n",
    "       'NK', 'Tcm', 'Tfh', 'Th1', 'Th17', 'Treg', 'cDC1', 'cDC2',\n",
    "       'cycling DCs', 'pDC', 'gd T', 'cycling gd T'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['CD8 T','Tcm', 'Tfh', 'Th1', 'Th17', 'Treg','gd T', 'cycling gd T'])\n",
    "ref_cluster[ix]='Activated CD4 T'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Follicular B cell', 'B cell memory'])\n",
    "ref_cluster[ix]='B cell cycling'\n",
    "\n",
    "ix=np.isin(ref_cluster,['B cell IgG Plasma'])\n",
    "ref_cluster[ix]='B cell IgA Plasma'\n",
    "\n",
    "ix=np.isin(ref_cluster,['cDC1', 'cDC2','cycling DCs', 'pDC'])\n",
    "ref_cluster[ix]='Lymphoid DC'\n",
    "\n",
    "ix=np.isin(ref_cluster,['LYVE1 Macrophage'])\n",
    "ref_cluster[ix]='Macrophage'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Activated CD4 T', 'B cell IgA Plasma', 'B cell cycling', 'ILC',\n",
    "       'Lymphoid DC', 'Monocyte', 'Mast', 'Macrophage','NK'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "adata.rename_categories('celltype', ['T cells', 'Plasma cells', 'B cells', 'Innate lymphoid cells', 'Dendritic cells',\n",
    "       'Monocytes', 'Mast cells', 'Macrophages', 'NK cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "adata.obs['donor'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['290b', '298c', '302c', '390c', '417c'])\n",
    "adata.rename_categories('donor', ['James_Gut_Colon-Donor1', 'James_Gut_Colon-Donor2', 'James_Gut_Colon-Donor3', 'James_Gut_Colon-Donor4', 'James_Gut_Colon-Donor5'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Gut_Colon'\n",
    "adata.obs['Organ_Specific'] = 'Gut_Colon'\n",
    "adata.obs['Dataset'] = 'James_Gut_Colon'\n",
    "adata.obs['InternDatasetNumber'] ='04-2-Gut_Colon-James-2020'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] ='NaN'\n",
    "adata.obs['sex'] = 'NaN'\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '04-2-Gut_Colon-James-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 04-3-Gut_Colon-Simmons-2021"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading: ncbitaxon_v2021-06-10.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/ncbitaxon\n",
      "Downloading: efo.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/efo\n",
      "Downloading: hsapdv_master.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/hsapdv\n",
      "Downloading: mmusdv.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/mmusdv\n",
      "Downloading: uberon_v2021-07-27.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/uberon\n",
      "Ontology <class 'sfaira.versions.metadata.base.OntologyUberonLifecyclestage'> is not a DAG, treat child-parent reasoning with care.\n",
      "Downloading: mondo_v2021-08-11.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/mondo\n",
      "Ontology <class 'sfaira.versions.metadata.base.OntologyMondo'> is not a DAG, treat child-parent reasoning with care.\n",
      "Ontology <class 'sfaira.versions.metadata.base.OntologyUberon'> is not a DAG, treat child-parent reasoning with care.\n",
      "Downloading: pato_v2021-08-06.obo to /home/ruben.brabenec/.cache/sfaira/ontologies/pato\n"
     ]
    }
   ],
   "source": [
    "target_collections = [\"60358420-6055-411d-ba4f-e8ac80682a2e\"]\n",
    "cache_path = os.path.join(\".\", \"data\")\n",
    "dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)\n",
    "dsg.subset(key=\"collection_id\", values=target_collections)\n",
    "dsg.datasets\n",
    "dsg.download()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/path/to/repo/60358420-6055-411d-ba4f-e8ac80682a2e/'\n",
    "files = [f for f in listdir(path) if isfile(join(path, f))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "#1: 774de9c6-9752-4e39-89a9-2a88c869d52a.h5ad\n",
    "path_2 = path + '774de9c6-9752-4e39-89a9-2a88c869d52a.h5ad'\n",
    "u1 = sc.read_h5ad(path_2)\n",
    "u1.var.index = u1.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "#2: 08e94873-c2a6-4f7d-ab72-aeaff3e3f929.h5ad\n",
    "path_2 = path + '08e94873-c2a6-4f7d-ab72-aeaff3e3f929.h5ad'\n",
    "u2 = sc.read_h5ad(path_2)\n",
    "u2.var.index = u2.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "#3: '2d66790a-6621-4a49-8f0d-4002db5cc98d.h5ad'\n",
    "path_2 = path +  '2d66790a-6621-4a49-8f0d-4002db5cc98d.h5ad'\n",
    "u3 = sc.read_h5ad(path_2)\n",
    "u3.var.index = u3.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "#4: '4d2e0563-cf4a-48bd-aa7f-efc26025b53a.h5ad'\n",
    "path_2 = path +  '4d2e0563-cf4a-48bd-aa7f-efc26025b53a.h5ad'\n",
    "u4 = sc.read_h5ad(path_2)\n",
    "u4.var.index = u4.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "#5: '58679288-9ecc-4647-9781-12a3a8f8c6fd.h5ad'\n",
    "path_2 = path +  '58679288-9ecc-4647-9781-12a3a8f8c6fd.h5ad'\n",
    "u5 = sc.read_h5ad(path_2)\n",
    "u5.var.index = u5.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#6: 'fd89be61-2869-4342-a86e-e1fce3a8f269.h5ad'\n",
    "path_2 = path +  'fd89be61-2869-4342-a86e-e1fce3a8f269.h5ad'\n",
    "u6 = sc.read_h5ad(path_2)\n",
    "u6.var.index = u6.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "#7: c42c8ad3-9761-49e5-b9bf-ee8ebd50416f.h5ad'\n",
    "path_2 = path +  'c42c8ad3-9761-49e5-b9bf-ee8ebd50416f.h5ad'\n",
    "u7 = sc.read_h5ad(path_2)\n",
    "u7.var.index = u7.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "#8: 'aa0b5adb-957d-4f15-ab83-2c5cc2843f77.h5ad'\n",
    "# read file #8 itself; 'fd89be61-...' is already loaded as u6\n",
    "path_2 = path +  'aa0b5adb-957d-4f15-ab83-2c5cc2843f77.h5ad'\n",
    "u8 = sc.read_h5ad(path_2)\n",
    "u8.var.index = u8.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "#9: 'abd889c6-f60a-4fbd-924e-ee1e9dcf909b.h5ad'\n",
    "path_2 = path +  'abd889c6-f60a-4fbd-924e-ee1e9dcf909b.h5ad'\n",
    "u9 = sc.read_h5ad(path_2)\n",
    "u9.var.index = u9.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "#10: '4269074c-f2c1-4d88-b2c3-0946f59d5449.h5ad'\n",
    "path_2 = path +  '4269074c-f2c1-4d88-b2c3-0946f59d5449.h5ad'\n",
    "u10 = sc.read_h5ad(path_2)\n",
    "u10.var.index = u10.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#11: 'b9b4cf27-9c22-410d-8bd8-5d43e379485b.h5ad'\n",
    "path_2 = path +  'b9b4cf27-9c22-410d-8bd8-5d43e379485b.h5ad'\n",
    "u11 = sc.read_h5ad(path_2)\n",
    "u11.var.index = u11.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "#11: 'e006d4e3-35fa-44b4-9981-09a66c4322e5.h5ad'\n",
    "path_2 = path +  'e006d4e3-35fa-44b4-9981-09a66c4322e5.h5ad'\n",
    "u11 = sc.read_h5ad(path_2)\n",
    "u11.var.index = u11.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "#12: '4506d9e3-4543-4464-aeae-b0b04eee1cea.h5ad'\n",
    "path_2 = path +  '4506d9e3-4543-4464-aeae-b0b04eee1cea.h5ad'\n",
    "u12 = sc.read_h5ad(path_2)\n",
    "u12.var.index = u12.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "#13: 'bbd16004-09e8-4b6c-b465-73ff83a52837.h5ad'\n",
    "path_2 = path +  'bbd16004-09e8-4b6c-b465-73ff83a52837.h5ad'\n",
    "u13 = sc.read_h5ad(path_2)\n",
    "u13.var.index = u13.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "#14: '9dfd2243-74d6-4924-86bd-c206ca9287b1.h5ad',\n",
    "path_2 = path +  '9dfd2243-74d6-4924-86bd-c206ca9287b1.h5ad'\n",
    "u14 = sc.read_h5ad(path_2)\n",
    "u14.var.index = u14.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "#15: '04b0eb97-d816-44bb-93a5-8b2968791aa0.h5ad'\n",
    "path_2 = path +  '04b0eb97-d816-44bb-93a5-8b2968791aa0.h5ad'\n",
    "u15 = sc.read_h5ad(path_2)\n",
    "u15.var.index = u15.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "#16:  '9d5df009-eb76-43a3-b6cd-22017cc53700.h5ad'\n",
    "path_2 = path +   '9d5df009-eb76-43a3-b6cd-22017cc53700.h5ad'\n",
    "u16 = sc.read_h5ad(path_2)\n",
    "u16.var.index = u16.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
      "  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],\n"
     ]
    }
   ],
   "source": [
    "adata = u1.concatenate(u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15, u16, join='outer', index_unique=\"_\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "774de9c6-9752-4e39-89a9-2a88c869d52a.h5ad\n",
      "08e94873-c2a6-4f7d-ab72-aeaff3e3f929.h5ad\n",
      "2d66790a-6621-4a49-8f0d-4002db5cc98d.h5ad\n",
      "4d2e0563-cf4a-48bd-aa7f-efc26025b53a.h5ad\n",
      "58679288-9ecc-4647-9781-12a3a8f8c6fd.h5ad\n",
      "fd89be61-2869-4342-a86e-e1fce3a8f269.h5ad\n",
      "c42c8ad3-9761-49e5-b9bf-ee8ebd50416f.h5ad\n",
      "aa0b5adb-957d-4f15-ab83-2c5cc2843f77.h5ad\n",
      "abd889c6-f60a-4fbd-924e-ee1e9dcf909b.h5ad\n",
      "4269074c-f2c1-4d88-b2c3-0946f59d5449.h5ad\n",
      "b9b4cf27-9c22-410d-8bd8-5d43e379485b.h5ad\n",
      "e006d4e3-35fa-44b4-9981-09a66c4322e5.h5ad\n",
      "4506d9e3-4543-4464-aeae-b0b04eee1cea.h5ad\n",
      "bbd16004-09e8-4b6c-b465-73ff83a52837.h5ad\n",
      "9dfd2243-74d6-4924-86bd-c206ca9287b1.h5ad\n",
      "04b0eb97-d816-44bb-93a5-8b2968791aa0.h5ad\n",
      "9d5df009-eb76-43a3-b6cd-22017cc53700.h5ad\n"
     ]
    }
   ],
   "source": [
    "# Read every file found in `path`, tag its cells with the source file name and\n",
    "# merge the files one at a time, keeping only the genes shared by all of them (inner join).\n",
    "for i, fname in enumerate(files):\n",
    "    print(fname)\n",
    "    path_2 = path + fname\n",
    "    u = sc.read_h5ad(path_2)\n",
    "    u.obs['id'] = fname\n",
    "    u.var['Gene'] = u.var['feature_name']\n",
    "    if i == 0:\n",
    "        # the first file seeds the merged object\n",
    "        adata = u\n",
    "    else:\n",
    "        adata = adata.concatenate(u, join='inner', index_unique=\"_\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='04-3-Gut_Colon_SmallIntestine-Simmons-2021'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.var.index = adata.var['feature_name-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.calculate_qc_metrics(adata, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "# .sum(1) on a sparse matrix returns an (n_obs, 1) np.matrix; flatten to 1-D before storing in .obs\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<149329x13796 sparse matrix of type '<class 'numpy.float32'>'\n",
       "\twith 325064862 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 38 cells that have more than 6000 counts\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 6000)\n",
    "sc.pp.filter_cells(adata, max_genes = 9000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<149291x13796 sparse matrix of type '<class 'numpy.float32'>'\n",
       "\twith 324831268 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boolean mask and names of the mitochondrial genes (gene symbols starting with 'MT-')\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# mt score: per-cell sum over the MT genes, flattened to 1-D, divided by the cell's total counts\n",
    "mt_sum=np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Keep only cells with a mito fraction below 20%\n",
    "# .copy() turns the filtered view into a real AnnData, so later writes to .obs do not modify a view\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:15): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:33)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:31)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 19 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:29)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "R[write to console]: Loading required package: scran\n",
      "\n",
      "R[write to console]: Loading required package: SingleCellExperiment\n",
      "\n",
      "R[write to console]: Loading required package: SummarizedExperiment\n",
      "\n",
      "R[write to console]: Loading required package: GenomicRanges\n",
      "\n",
      "R[write to console]: Loading required package: stats4\n",
      "\n",
      "R[write to console]: Loading required package: BiocGenerics\n",
      "\n",
      "R[write to console]: Loading required package: parallel\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘BiocGenerics’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:parallel’:\n",
      "\n",
      "    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n",
      "    clusterExport, clusterMap, parApply, parCapply, parLapply,\n",
      "    parLapplyLB, parRapply, parSapply, parSapplyLB\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:stats’:\n",
      "\n",
      "    IQR, mad, sd, var, xtabs\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:base’:\n",
      "\n",
      "    anyDuplicated, append, as.data.frame, basename, cbind, colnames,\n",
      "    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,\n",
      "    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,\n",
      "    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,\n",
      "    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,\n",
      "    union, unique, unsplit, which, which.max, which.min\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: S4Vectors\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘S4Vectors’\n",
      "\n",
      "\n",
      "R[write to console]: The following object is masked from ‘package:base’:\n",
      "\n",
      "    expand.grid\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: IRanges\n",
      "\n",
      "R[write to console]: Loading required package: GenomeInfoDb\n",
      "\n",
      "R[write to console]: Loading required package: Biobase\n",
      "\n",
      "R[write to console]: Welcome to Bioconductor\n",
      "\n",
      "    Vignettes contain introductory material; view with\n",
      "    'browseVignettes()'. To cite Bioconductor, see\n",
      "    'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: DelayedArray\n",
      "\n",
      "R[write to console]: Loading required package: matrixStats\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘matrixStats’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:Biobase’:\n",
      "\n",
      "    anyMissing, rowMedians\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: BiocParallel\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘DelayedArray’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:matrixStats’:\n",
      "\n",
      "    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:base’:\n",
      "\n",
      "    aperm, apply, rowsum\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the not-yet-normalized adata.X in a 'counts' layer\n",
    "# so the count data stays available for downstream analysis\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B cell', 'dendritic cell', 'enteric smooth muscle cell', 'enterocyte',\n",
    "       'enteroendocrine cell', 'erythroid lineage cell', 'fibroblast',\n",
    "       'glial cell', 'group 3 innate lymphoid cell', 'gut absorptive cell',\n",
    "       'gut endothelial cell', 'inhibitory motor neuron', 'interneuron',\n",
    "       'interstitial cell of Cajal',\n",
    "       'intestinal crypt stem cell of large intestine',\n",
    "       'intestinal crypt stem cell of small intestine',\n",
    "       'intestinal epithelial cell', 'intestine goblet cell', 'leukocyte',\n",
    "       'macrophage', 'mast cell', 'mesothelial cell', 'monocyte',\n",
    "       'motor neuron', 'myofibroblast cell', 'naive T cell',\n",
    "       'natural killer cell', 'neural cell', 'neuroendocrine cell',\n",
    "       'pericyte cell', 'plasmacytoid dendritic cell', 'precursor B cell',\n",
    "       'progenitor cell', 'secretory cell',\n",
    "       'smooth muscle cell of large intestine',\n",
    "       'smooth muscle cell of small intestine',\n",
    "       'transit amplifying cell of colon',\n",
    "       'transit amplifying cell of small intestine'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained annotations into broader cell types.\n",
    "# Keys are the labels that are kept; values are the labels folded into them.\n",
    "merged_into = {\n",
    "    'dendritic cell': ['plasmacytoid dendritic cell'],\n",
    "    'enterocyte': ['gut absorptive cell', 'intestinal epithelial cell'],\n",
    "    'inhibitory motor neuron': ['interneuron', 'interstitial cell of Cajal', 'motor neuron', 'neural cell'],\n",
    "    'intestinal crypt stem cell of small intestine': ['intestinal crypt stem cell of large intestine',\n",
    "                                                      'transit amplifying cell of colon',\n",
    "                                                      'transit amplifying cell of small intestine'],\n",
    "    'B cell': ['precursor B cell'],\n",
    "    'secretory cell': ['intestine goblet cell', 'enteroendocrine cell'],\n",
    "    'enteric smooth muscle cell': ['smooth muscle cell of small intestine',\n",
    "                                   'smooth muscle cell of large intestine'],\n",
    "}\n",
    "for broad_label, fine_labels in merged_into.items():\n",
    "    ix = np.isin(ref_cluster, fine_labels)\n",
    "    ref_cluster[ix] = broad_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B cell', 'dendritic cell', 'enteric smooth muscle cell', 'enterocyte',\n",
    "     'erythroid lineage cell', 'fibroblast',\n",
    "       'glial cell', 'group 3 innate lymphoid cell',\n",
    "       'gut endothelial cell', 'inhibitory motor neuron',\n",
    "\n",
    "\n",
    "       'intestinal crypt stem cell of small intestine',\n",
    "       'leukocyte',\n",
    "       'macrophage', 'mast cell', 'mesothelial cell', 'monocyte',\n",
    "      'myofibroblast cell', 'naive T cell',\n",
    "       'natural killer cell',  'neuroendocrine cell',\n",
    "       'pericyte cell', \n",
    "       'progenitor cell', 'secretory cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['2d66790a-6621-4a49-8f0d-4002db5cc98d.h5ad',\n",
       "       '04b0eb97-d816-44bb-93a5-8b2968791aa0.h5ad',\n",
       "       '4d2e0563-cf4a-48bd-aa7f-efc26025b53a.h5ad',\n",
       "       '08e94873-c2a6-4f7d-ab72-aeaff3e3f929.h5ad',\n",
       "       '9d5df009-eb76-43a3-b6cd-22017cc53700.h5ad',\n",
       "       '9dfd2243-74d6-4924-86bd-c206ca9287b1.h5ad',\n",
       "       '774de9c6-9752-4e39-89a9-2a88c869d52a.h5ad',\n",
       "       '4506d9e3-4543-4464-aeae-b0b04eee1cea.h5ad',\n",
       "       '4269074c-f2c1-4d88-b2c3-0946f59d5449.h5ad',\n",
       "       '58679288-9ecc-4647-9781-12a3a8f8c6fd.h5ad',\n",
       "       'aa0b5adb-957d-4f15-ab83-2c5cc2843f77.h5ad',\n",
       "       'abd889c6-f60a-4fbd-924e-ee1e9dcf909b.h5ad',\n",
       "       'b9b4cf27-9c22-410d-8bd8-5d43e379485b.h5ad',\n",
       "       'bbd16004-09e8-4b6c-b465-73ff83a52837.h5ad',\n",
       "       'c42c8ad3-9761-49e5-b9bf-ee8ebd50416f.h5ad',\n",
       "       'e006d4e3-35fa-44b4-9981-09a66c4322e5.h5ad',\n",
       "       'fd89be61-2869-4342-a86e-e1fce3a8f269.h5ad'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['id'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relabel the sex ontology terms by name, so the result does not depend on category order.\n",
    "adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].cat.rename_categories(\n",
    "    {'PATO:0000384': 'Male', 'unknown': 'NaN'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map tissue names to the organ labels by name rather than by category position.\n",
    "adata.obs['tissue'] = adata.obs['tissue'].cat.rename_categories(\n",
    "    {'colon': 'Gut_Colon', 'intestine': 'Gut_SmallIntestine'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shorten the development-stage labels, mapping each one explicitly by name.\n",
    "adata.obs['development_stage'] = adata.obs['development_stage'].cat.rename_categories(\n",
    "    {'66-year-old human stage': '66', 'fetal stage': 'Fetal'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive a donor label from the source-file id.\n",
    "# Each id is mapped explicitly, so the labels do not depend on the order of the categories.\n",
    "donor_labels = {\n",
    "    '2d66790a-6621-4a49-8f0d-4002db5cc98d.h5ad': 'Simmons_Gut_Colon-Donor1',\n",
    "    '04b0eb97-d816-44bb-93a5-8b2968791aa0.h5ad': 'Simmons_Gut_Colon-Donor2',\n",
    "    '4d2e0563-cf4a-48bd-aa7f-efc26025b53a.h5ad': 'Simmons_Gut_Colon-Donor3',\n",
    "    '08e94873-c2a6-4f7d-ab72-aeaff3e3f929.h5ad': 'Simmons_Gut_Colon-Donor4',\n",
    "    '9d5df009-eb76-43a3-b6cd-22017cc53700.h5ad': 'Simmons_Gut_Colon-Donor5',\n",
    "    '9dfd2243-74d6-4924-86bd-c206ca9287b1.h5ad': 'Simmons_Gut_Colon-Donor6',\n",
    "    '774de9c6-9752-4e39-89a9-2a88c869d52a.h5ad': 'Simmons_Gut_Colon-Donor7',\n",
    "    '4506d9e3-4543-4464-aeae-b0b04eee1cea.h5ad': 'Simmons_Gut_Colon-Donor8',\n",
    "    '4269074c-f2c1-4d88-b2c3-0946f59d5449.h5ad': 'Simmons_Gut_Colon-Donor9',\n",
    "    '58679288-9ecc-4647-9781-12a3a8f8c6fd.h5ad': 'Simmons_Gut_Colon-Donor10',\n",
    "    'aa0b5adb-957d-4f15-ab83-2c5cc2843f77.h5ad': 'Simmons_Gut_Colon-Donor11',\n",
    "    'abd889c6-f60a-4fbd-924e-ee1e9dcf909b.h5ad': 'Simmons_Gut_Colon-Donor12',\n",
    "    'b9b4cf27-9c22-410d-8bd8-5d43e379485b.h5ad': 'Simmons_Gut_Colon-Donor13',\n",
    "    'bbd16004-09e8-4b6c-b465-73ff83a52837.h5ad': 'Simmons_Gut_Colon-Donor14',\n",
    "    'c42c8ad3-9761-49e5-b9bf-ee8ebd50416f.h5ad': 'Simmons_Gut_Colon-Donor15',\n",
    "    'e006d4e3-35fa-44b4-9981-09a66c4322e5.h5ad': 'Simmons_Gut_Colon-Donor16',\n",
    "    'fd89be61-2869-4342-a86e-e1fce3a8f269.h5ad': 'Simmons_Gut_Colon-Donor17',\n",
    "}\n",
    "adata.obs['donor'] = adata.obs['id'].cat.rename_categories(donor_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='04-3-Gut_Colon_SmallIntestine-Simmons-2021'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath+'04-3-Gut_Colon_SmallIntestine-Simmons-2021-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 04-4-Gut_Colon-Wang-2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 698,
   "metadata": {},
   "outputs": [],
   "source": [
    "ID = 'homosapiens_colon_2019_10x3transcriptionprofiling_wang_001_10.1084/jem.20191130'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 699,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set this path to your local sfaira data repository\n",
    "basedir = '.'\n",
    "datadir = os.path.join(basedir, 'raw')\n",
    "metadir = os.path.join(basedir, 'meta')\n",
    "cachedir = os.path.join(basedir, 'cache')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 700,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading: wang20_colon.processed.h5ad\n",
      "loading homosapiens_colon_2019_10x3transcriptionprofiling_wang_001_10.1084/jem.20191130\n"
     ]
    }
   ],
   "source": [
    "ds = sfaira.data.Universe(data_path=datadir, meta_path=metadir, cache_path=cachedir)\n",
    "# subset to the selected dataset\n",
    "ds.subset(key=\"id\", values=[ID])  # keep only the dataset whose id matches ID\n",
    "# download and load the specific dataset\n",
    "ds.download()\n",
    "ds.load(verbose=1)\n",
    "# get the unmodified adata object of the dataset\n",
    "adata = ds.datasets[ID].adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 701,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='04-4-Gut_Colon-Wang-2019'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 714,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 715,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/compat/__init__.py:268: FutureWarning: During AnnData slicing, found matrix at .uns['neighbors_hm']['connectivities'] that happens to be dimensioned at n_obs×n_obs (3638×3638).\n",
      "\n",
      "These matrices should now be stored in the .obsp attribute.\n",
      "This slicing behavior will be removed in anndata 0.8.\n",
      "  FutureWarning,\n",
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/compat/__init__.py:268: FutureWarning: During AnnData slicing, found matrix at .uns['neighbors_hm']['distances'] that happens to be dimensioned at n_obs×n_obs (3638×3638).\n",
      "\n",
      "These matrices should now be stored in the .obsp attribute.\n",
      "This slicing behavior will be removed in anndata 0.8.\n",
      "  FutureWarning,\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 8 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "# The clustering runs on a counts-per-million, log-transformed copy (adata_pp); adata itself is untouched.\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)\n",
    "\n",
    "#Preprocess variables for scran normalization\n",
    "# Defined here so the %%R cell below reads this dataset's clusters and counts\n",
    "# instead of stale values left in the kernel by a previous dataset section.\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 718,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 719,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 720,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 721,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the not-yet-normalized adata.X in a 'counts' layer\n",
    "# so the count data stays available for downstream analysis\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 722,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 723,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 724,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 725,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 728,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['CellType'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 729,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Enteriendocrine', 'Enterocyte', 'Goblet', 'Paneth-like', 'Progenitor',\n",
       "       'Stem Cell', 'TA'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 729,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 730,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories= ['Enteriendocrine', 'Enterocyte', 'Goblet', 'Paneth-like', 'Progenitor',\n",
    "       'Stem Cell', 'TA'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 731,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['TA'])\n",
    "ref_cluster[ix]='Stem Cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Paneth-like', 'Goblet'])\n",
    "ref_cluster[ix]='Enteriendocrine'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 732,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Enteriendocrine', 'Enterocyte', 'Progenitor',\n",
    "       'Stem Cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 733,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Intestinal secretory cells', 'Intestinal epithelial cells', 'Intestinal progenitor cells',\n",
    "       'Intestinal stem cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 737,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive donor labels from Sample_ID, mapping each sample explicitly by name\n",
    "# so the result does not depend on the order of the categories.\n",
    "adata.obs['donor'] = adata.obs['Sample_ID'].astype('category').cat.rename_categories(\n",
    "    {'Colon-1': 'Wang_Gut_Colon-Donor1', 'Colon-2': 'Wang_Gut_Colon-Donor2'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 738,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonized per-cell metadata columns for this dataset.\n",
    "# Values are either a constant for every cell or a copy of an existing obs column;\n",
    "# the string 'NaN' marks information that is not available.\n",
    "# 'celltype' already holds the harmonized labels and is left as is.\n",
    "obs_annotations = {\n",
    "    'Organ': 'Gut_Colon',\n",
    "    'Organ_Specific': 'Gut_Colon',\n",
    "    'Dataset': 'Wang_Gut_Colon',\n",
    "    'InternDatasetNumber': '04-4-Gut_Colon-Wang-2019',\n",
    "    'Dataset_status': 'HealthyProject',\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "    'Patient': 'NaN',\n",
    "    'Patient_Number': adata.obs['Sample_ID'],\n",
    "    'age': 'NaN',\n",
    "    'sex': 'NaN',\n",
    "    'ethnicity': 'NaN',\n",
    "    'health_status': 'NaN',\n",
    "    'original_celltype_1': adata.obs['CellType'],\n",
    "    'original_celltype_2': 'NaN',\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "for column, value in obs_annotations.items():\n",
    "    adata.obs[column] = value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 740,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 741,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 742,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath+ '04-4-Gut_Colon-Wang-2019-processed.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "##   04-6-Gut_Colon-Pisco-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.\n",
      "  warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n"
     ]
    }
   ],
   "source": [
    "ix=np.isin(adata_pisco.obs['tissue'],['large intestine']) \n",
    "adata=adata_pisco[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.calculate_qc_metrics(adata, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alias the scanpy QC metrics to shorter column names\n",
    "# (n_counts is read by the mitochondrial-fraction cell further down).\n",
    "adata.obs['n_counts'] = adata.obs['total_counts']\n",
    "# log_counts must come from the total counts, not from the number of detected genes.\n",
    "adata.obs['log_counts'] = adata.obs['log1p_total_counts']\n",
    "adata.obs['n_genes'] = adata.obs['n_genes_by_counts']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 37317 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "#sc.pp.filter_cells(adata, max_counts = 12000)\n",
    "#sc.pp.filter_cells(adata, max_genes = 7000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mitochondrial QC: flag MT- genes and compute each cell's mitochondrial count fraction.\n",
    "mt_gene_mask = adata.var_names.str.startswith('MT-')\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Densify adata.X as an ndarray (np.matrix from .todense() is discouraged).\n",
    "adata.X = adata.X.toarray()\n",
    "\n",
    "# Per-cell sum over the MT- genes, flattened to 1-D so it lines up with adata.obs.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep cells whose mitochondrial fraction is below 20%; copy so adata is not left as a view.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 12 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "# The clustered scratch copy is not needed any more once R has returned the size factors.\n",
    "del adata_pp\n",
    "# One size factor per cell, in the cell order of adata (data_mat was built from adata.X).\n",
    "# NOTE(review): the recorded output shows an ImplicitModificationWarning here, i.e.\n",
    "# adata was a view when this cell was run and this write turned it into a real object.\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert string-valued annotation columns to pandas categoricals; the `.cat`\n",
    "# accessor and the adata.rename_categories calls further down rely on this.\n",
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the current adata.X (the count data, before size-factor\n",
    "# normalization) in a separate `counts` layer for downstream analysis.\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Snapshot the current, not yet normalized state in adata.raw before adata.X is\n",
    "# rescaled and log-transformed in the next step.\n",
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "# Divide every cell (row of adata.X) by its size factor, in place; [:, None] turns the\n",
    "# per-cell vector into a column so that it broadcasts across all genes.\n",
    "# NOTE(review): the in-place broadcasting division assumes a dense adata.X -- confirm.\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "# log(1 + x) transform of the normalized values.\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "# subset=False keeps all genes in adata; n_top_genes=4000 requests the 4000 most\n",
    "# variable genes (the recorded output notes that all other cutoffs are then ignored).\n",
    "# log=False -- presumably because adata.X was already log-transformed above; TODO confirm.\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:11)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "# 50-component PCA restricted to the highly variable genes (arpack solver), then the\n",
    "# neighbor graph with default settings and a UMAP embedding computed from it.\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "# Start from the dataset's own `cell_type` labels; the copy in `celltype` is harmonized\n",
    "# below, while `cell_type` itself stays untouched (it is kept as original_celltype_1).\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cell', 'CD4-positive, alpha-beta T cell',\n",
       "       'CD8-positive, alpha-beta T cell', 'enterocyte',\n",
       "       'enterocyte of epithelium of large intestine', 'fibroblast',\n",
       "       'goblet cell', 'gut endothelial cell', 'intestinal crypt stem cell',\n",
       "       'intestinal crypt stem cell of large intestine',\n",
       "       'intestinal enteroendocrine cell', 'intestinal tuft cell',\n",
       "       'large intestine goblet cell', 'mast cell', 'monocyte', 'neutrophil',\n",
       "       'paneth cell of colon', 'plasma cell',\n",
       "       'transit amplifying cell of colon'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the original cell type labels that have to be mapped.\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Working copy of the labels as a plain pandas Categorical. The category list repeats\n",
    "# the labels displayed above; any value that is not listed here would become NaN.\n",
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B cell', 'CD4-positive, alpha-beta T cell',\n",
    "       'CD8-positive, alpha-beta T cell', 'enterocyte',\n",
    "       'enterocyte of epithelium of large intestine', 'fibroblast',\n",
    "       'goblet cell', 'gut endothelial cell', 'intestinal crypt stem cell',\n",
    "       'intestinal crypt stem cell of large intestine',\n",
    "       'intestinal enteroendocrine cell', 'intestinal tuft cell',\n",
    "       'large intestine goblet cell', 'mast cell', 'monocyte', 'neutrophil',\n",
    "       'paneth cell of colon', 'plasma cell',\n",
    "       'transit amplifying cell of colon'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained annotations into broader groups. Each entry reads\n",
    "# (label that is kept, labels that are folded into it); entries are applied in order.\n",
    "merge_plan = [\n",
    "    ('CD4-positive, alpha-beta T cell',\n",
    "     ['CD8-positive, alpha-beta T cell']),\n",
    "    ('enterocyte of epithelium of large intestine',\n",
    "     ['enterocyte', 'intestinal tuft cell']),\n",
    "    ('intestinal crypt stem cell',\n",
    "     ['intestinal crypt stem cell of large intestine', 'transit amplifying cell of colon']),\n",
    "    ('goblet cell',\n",
    "     ['intestinal enteroendocrine cell', 'large intestine goblet cell', 'paneth cell of colon']),\n",
    "]\n",
    "for kept_label, folded_labels in merge_plan:\n",
    "    # Boolean mask of the cells that currently carry one of the folded labels.\n",
    "    ix = np.isin(ref_cluster, folded_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the merged labels back with a reduced category list that contains only the\n",
    "# labels kept after merging. The order given here fixes the category order that the\n",
    "# positional rename in the next cell depends on.\n",
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B cell', 'CD4-positive, alpha-beta T cell',\n",
    "        'enterocyte of epithelium of large intestine', 'fibroblast',\n",
    "       'goblet cell', 'gut endothelial cell', 'intestinal crypt stem cell',\n",
    "         'mast cell', 'monocyte', 'neutrophil',\n",
    "      'plasma cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "# Positional rename to the harmonized names: the i-th new name replaces the i-th\n",
    "# category of the list set in the previous cell (e.g. 'B cell' -> 'B cells',\n",
    "# 'goblet cell' -> 'Intestinal secretory cells').\n",
    "adata.rename_categories('celltype', ['B cells', 'T cells',\n",
    "        'Intestinal epithelial cells', 'Fibroblast cells',\n",
    "       'Intestinal secretory cells', 'Endothelial cells', 'Intestinal stem cells',\n",
    "         'Mast cells', 'Monocytes', 'Neutrophils',\n",
    "      'Plasma cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "# Rename via an explicit old -> new mapping so the result does not depend on the order\n",
    "# of the existing categories; an unexpected category raises a KeyError instead of\n",
    "# being silently mislabelled.\n",
    "tissue_names = {'large intestine': 'Gut_Colon'}\n",
    "adata.rename_categories('tissue', [tissue_names[c] for c in adata.obs['tissue'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename via an explicit old -> new mapping so the result does not depend on the order\n",
    "# of the existing categories; an unexpected category raises a KeyError instead of\n",
    "# being silently mislabelled.\n",
    "sex_names = {'female': 'Female', 'male': 'Male'}\n",
    "adata.rename_categories('sex', [sex_names[c] for c in adata.obs['sex'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The ethnicity labels are kept as they are. The identity mapping documents the\n",
    "# expected values and raises a KeyError if the dataset contains any other category.\n",
    "ethnicity_names = {'African-American or Afro-Caribbean': 'African-American or Afro-Caribbean',\n",
    "                   'European': 'European'}\n",
    "adata.rename_categories('ethnicity', [ethnicity_names[c] for c in adata.obs['ethnicity'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reduce the development stage labels to the age in years. The explicit old -> new\n",
    "# mapping makes the rename independent of the order of the existing categories and\n",
    "# raises a KeyError for an unexpected category.\n",
    "stage_names = {'59-year-old human stage': '59', '61-year-old human stage': '61'}\n",
    "adata.rename_categories('development_stage',\n",
    "                        [stage_names[c] for c in adata.obs['development_stage'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Donor ids are kept unchanged. A positional rename with ['TSP2', 'TSP14'] would swap\n",
    "# the two donors whenever the categories are stored in sorted order\n",
    "# (['TSP14', 'TSP2']), so the names are looked up per existing category instead;\n",
    "# an unexpected donor id raises a KeyError.\n",
    "donor_names = {'TSP2': 'TSP2', 'TSP14': 'TSP14'}\n",
    "adata.rename_categories('donor', [donor_names[c] for c in adata.obs['donor'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonized metadata columns for this dataset. Fields without information are\n",
    "# filled with the string 'NaN' (a text placeholder, not a missing value).\n",
    "adata.obs['Organ'] = 'Gut_Colon'\n",
    "adata.obs['Organ_Specific'] = adata.obs['tissue']\n",
    "adata.obs['Dataset'] = 'Pisco_Gut_Colon'\n",
    "adata.obs['InternDatasetNumber'] = '04-6-Gut_Colon-Pisco-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "# Self-assignments (celltype here, sex and ethnicity below) leave the data unchanged;\n",
    "# they only fail with a KeyError if the column does not exist.\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "# Original, un-merged annotation of the source dataset.\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the normalized expression matrix as a scipy CSR sparse matrix before the\n",
    "# object is written to disk.\n",
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make the cell names unique; duplicates get a numeric suffix joined with '_'.\n",
    "adata.obs_names_make_unique(join='_')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the processed dataset as h5ad; `writepath` is defined outside this section.\n",
    "# NOTE(review): unlike the oesophagus file below, this name has no '-processed' suffix.\n",
    "adata.write(writepath + '04-6-Gut_Colon-Pisco-2022.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 04-9-Oesophagus-Han-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select the cells annotated as 'AdultEsophagus' from adata_han (defined outside this\n",
    "# section); .copy() detaches the subset from adata_han.\n",
    "ix=np.isin(adata_han.obs['sub_tissue'],['AdultEsophagus']) \n",
    "adata=adata_han[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag every cell with the internal dataset identifier of this section.\n",
    "adata.obs['InternDatasetNumber'] ='04-9-Oesophagus-Han-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 51 cells that have more than 3100 counts\n",
      "filtered out 13547 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "# Upper bounds only: cells with more than 3100 counts or more than 1700 detected\n",
    "# genes are removed.\n",
    "sc.pp.filter_cells(adata, max_counts = 3100)\n",
    "sc.pp.filter_cells(adata, max_genes = 1700)\n",
    "# Min 20 cells - filters out low count genes\n",
    "# NOTE(review): the trailing note on the next line looks like a leftover from earlier\n",
    "# threshold experiments and does not describe min_cells=20 -- confirm and remove.\n",
    "sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes\n",
    "# Boolean mask over adata.var_names that selects the mitochondrial genes (prefix 'MT-').\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense: the in-place size-factor division further below needs a dense\n",
    "# matrix. The check makes the cell safe to re-run when adata.X is already dense.\n",
    "if sp.sparse.issparse(adata.X):\n",
    "    adata.X = adata.X.todense()\n",
    "\n",
    "# manually define mt score: mitochondrial counts per cell divided by the total counts\n",
    "# per cell (n_counts from the QC covariates above). ravel() flattens the (n_cells, 1)\n",
    "# row sums into one value per cell.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 20% mito fraction\n",
    "# .copy() makes adata a real AnnData object instead of a view, so later writes to\n",
    "# adata.obs / adata.X do not trigger an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scratch copy used only for the coarse clustering below; adata itself keeps the\n",
    "# unnormalized values.\n",
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 10 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "# adata_pp is only a scratch object: it is normalized to 1e6 counts per cell,\n",
    "# log1p-transformed, reduced to 15 principal components and Louvain-clustered\n",
    "# (resolution 0.5). The labels stored in adata_pp.obs['groups'] serve as the\n",
    "# cluster input of the scran size-factor estimation.\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "# input_groups: one Louvain cluster label per cell, taken from the scratch copy.\n",
    "input_groups = adata_pp.obs['groups']\n",
    "# data_mat: adata.X transposed (AnnData stores cells x genes, so this is genes x cells).\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "# rpy2 cell magic: data_mat and input_groups are passed from Python into R (-i),\n",
    "# size_factors is handed back to the Python namespace afterwards (-o).\n",
    "require(scran)\n",
    "# Size factors are estimated within the supplied clusters; min.mean=0.1 is passed\n",
    "# through to scran as the abundance threshold for the genes that are used.\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "# The clustered scratch copy is not needed any more once R has returned the size factors.\n",
    "del adata_pp\n",
    "# One size factor per cell, in the cell order of adata (data_mat was built from adata.X).\n",
    "# NOTE(review): the recorded output shows an ImplicitModificationWarning here, i.e.\n",
    "# adata was a view when this cell was run and this write turned it into a real object.\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert string-valued annotation columns to pandas categoricals; the `.cat`\n",
    "# accessor and the adata.rename_categories calls further down rely on this.\n",
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the current adata.X (the count data, before size-factor\n",
    "# normalization) in a separate `counts` layer for downstream analysis.\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Snapshot the current, not yet normalized state in adata.raw before adata.X is\n",
    "# rescaled and log-transformed in the next step.\n",
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 220,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "# Divide every cell (row of adata.X) by its size factor, in place; [:, None] turns the\n",
    "# per-cell vector into a column so that it broadcasts across all genes. adata.X was\n",
    "# converted to a dense matrix in the MT-fraction cell above, which this relies on.\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "# log(1 + x) transform of the normalized values.\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 221,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "# subset=False keeps all genes in adata; n_top_genes=4000 requests the 4000 most\n",
    "# variable genes (the recorded output notes that all other cutoffs are then ignored).\n",
    "# log=False -- presumably because adata.X was already log-transformed above; TODO confirm.\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:08)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "# 50-component PCA restricted to the highly variable genes (arpack solver), then the\n",
    "# neighbor graph with default settings and a UMAP embedding computed from it.\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Antigen-presenting cell', 'B cell', 'B cell (Plasmocyte)_IGHA high',\n",
       "       'B cell_JCHAIN high', 'Endothelial cell', 'Endothelial cell_ACKR1 high',\n",
       "       'Endothelial cell_CCL21 high', 'Endothelial cell_IGFBP3 high',\n",
       "       'Epithelial cell_KRT4 high', 'Epithelial cell_KRT7 high',\n",
       "       'Epithelial cell_KRT13 high', 'Epithelial cell_KRT14 high',\n",
       "       'Epithelial cell_KRT16 high', 'Epithelial cell_KRT17 high',\n",
       "       'Epithelial cell_MMP7 high', 'Fibroblast', 'Goblet cell',\n",
       "       'Kerationcyte', 'MT-gene high cell', 'Macrophage_RGS1 high',\n",
       "       'Macrophage_RNASE1 high', 'Macrophage_TPSB2 high', 'Mast cell',\n",
       "       'Mucosal aquamous Epithelial cell', 'Neutrophil _S100A8 high',\n",
       "       'Neutrophil_IL1B high', 'Smooth muscle cell', 'Stromal cell',\n",
       "       'Stromal cell_PLA2G2A high', 'Stromal cell_PTGDS high'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 224,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "# Start from the fine-grained `celltype_specific` labels; the copy in `celltype` is\n",
    "# harmonized below. The last line displays the labels that have to be mapped.\n",
    "adata.obs['celltype']=adata.obs['celltype_specific'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Working copy of the labels as a plain pandas Categorical. The category list repeats\n",
    "# the labels displayed above; any value that is not listed here would become NaN.\n",
    "# The spelling ('Kerationcyte', 'Mucosal aquamous ...', 'Neutrophil _S100A8 high')\n",
    "# has to match the labels shown above exactly -- do not correct it here.\n",
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Antigen-presenting cell', 'B cell', 'B cell (Plasmocyte)_IGHA high',\n",
    "       'B cell_JCHAIN high', 'Endothelial cell', 'Endothelial cell_ACKR1 high',\n",
    "       'Endothelial cell_CCL21 high', 'Endothelial cell_IGFBP3 high',\n",
    "       'Epithelial cell_KRT4 high', 'Epithelial cell_KRT7 high',\n",
    "       'Epithelial cell_KRT13 high', 'Epithelial cell_KRT14 high',\n",
    "       'Epithelial cell_KRT16 high', 'Epithelial cell_KRT17 high',\n",
    "       'Epithelial cell_MMP7 high', 'Fibroblast', 'Goblet cell',\n",
    "       'Kerationcyte', 'MT-gene high cell', 'Macrophage_RGS1 high',\n",
    "       'Macrophage_RNASE1 high', 'Macrophage_TPSB2 high', 'Mast cell',\n",
    "       'Mucosal aquamous Epithelial cell', 'Neutrophil _S100A8 high',\n",
    "       'Neutrophil_IL1B high', 'Smooth muscle cell', 'Stromal cell',\n",
    "       'Stromal cell_PLA2G2A high', 'Stromal cell_PTGDS high'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse marker-specific sub-clusters into one label per cell type. Each entry reads\n",
    "# (label that is kept, labels that are folded into it); entries are applied in order.\n",
    "merge_plan = [\n",
    "    ('B cell',\n",
    "     ['B cell_JCHAIN high']),\n",
    "    ('Endothelial cell',\n",
    "     ['Endothelial cell_ACKR1 high', 'Endothelial cell_CCL21 high', 'Endothelial cell_IGFBP3 high']),\n",
    "    ('Macrophage_RGS1 high',\n",
    "     ['Macrophage_RNASE1 high', 'Macrophage_TPSB2 high']),\n",
    "    ('Neutrophil _S100A8 high',\n",
    "     ['Neutrophil_IL1B high']),\n",
    "    ('Stromal cell',\n",
    "     ['Stromal cell_PLA2G2A high', 'Stromal cell_PTGDS high']),\n",
    "    ('Epithelial cell_KRT4 high',\n",
    "     ['Epithelial cell_KRT7 high', 'Epithelial cell_KRT13 high', 'Epithelial cell_KRT14 high',\n",
    "      'Epithelial cell_KRT16 high', 'Epithelial cell_KRT17 high', 'Epithelial cell_MMP7 high']),\n",
    "]\n",
    "for kept_label, folded_labels in merge_plan:\n",
    "    # Boolean mask of the cells that currently carry one of the folded labels.\n",
    "    ix = np.isin(ref_cluster, folded_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 227,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the merged labels back with a reduced category list that contains only the\n",
    "# labels kept after merging. The order given here fixes the category order that the\n",
    "# positional rename in the next cell depends on.\n",
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Antigen-presenting cell', 'B cell', 'B cell (Plasmocyte)_IGHA high',\n",
    "      'Endothelial cell',\n",
    "         'Epithelial cell_KRT4 high',\n",
    "        'Fibroblast', 'Goblet cell',\n",
    "       'Kerationcyte', 'MT-gene high cell', 'Macrophage_RGS1 high',\n",
    "        'Mast cell',\n",
    "       'Mucosal aquamous Epithelial cell', 'Neutrophil _S100A8 high',\n",
    "       'Smooth muscle cell', 'Stromal cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positional rename to the harmonized names: the i-th new name replaces the i-th\n",
    "# category of the list set in the previous cell (e.g. 'Goblet cell' ->\n",
    "# 'Intestinal secretory cells', 'Stromal cell' -> 'Mesenchymal stromal cells').\n",
    "# NOTE(review): 'Antigen-presenting cell' is relabelled 'Unknown' -- confirm intended.\n",
    "adata.rename_categories('celltype', ['Unknown', 'B cells', 'Plasma cells',\n",
    "      'Endothelial cells',\n",
    "         'Oesophageal epithelial cells',\n",
    "        'Fibroblast cells', 'Intestinal secretory cells',\n",
    "       'Keratinocytes', 'Oesophageal MT high cells', 'Macrophages',\n",
    "        'Mast cells',\n",
    "       'Mucosal squamous cells', 'Neutrophils',\n",
    "       'Smooth muscle cells', 'Mesenchymal stromal cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename via an explicit old -> new mapping so the result does not depend on the order\n",
    "# of the existing categories; an unexpected category raises a KeyError instead of\n",
    "# being silently mislabelled.\n",
    "sub_tissue_names = {'AdultEsophagus': 'Oesophagus'}\n",
    "adata.rename_categories('sub_tissue', [sub_tissue_names[c] for c in adata.obs['sub_tissue'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 233,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename via an explicit old -> new mapping so the result does not depend on the order\n",
    "# of the existing categories; an unexpected category raises a KeyError instead of\n",
    "# being silently mislabelled.\n",
    "sex_names = {'male': 'Male'}\n",
    "adata.rename_categories('sex', [sex_names[c] for c in adata.obs['sex'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip the 'Y' suffix from the age labels. The explicit old -> new mapping makes the\n",
    "# rename independent of the order of the existing categories and raises a KeyError\n",
    "# for an unexpected category.\n",
    "age_names = {'45Y': '45', '56Y': '56'}\n",
    "adata.rename_categories('age', [age_names[c] for c in adata.obs['age'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prefix the donor ids with the study name. The explicit old -> new mapping makes the\n",
    "# rename independent of the order of the existing categories and raises a KeyError\n",
    "# for an unexpected donor id.\n",
    "donor_names = {'Donor31': 'Han-Donor31', 'Donor32': 'Han-Donor32'}\n",
    "adata.rename_categories('donor', [donor_names[c] for c in adata.obs['donor'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonized metadata columns for this dataset. Fields without information are\n",
    "# filled with the string 'NaN' (a text placeholder, not a missing value).\n",
    "adata.obs['Organ'] = 'Oesophagus'\n",
    "adata.obs['Organ_Specific'] = adata.obs['sub_tissue']\n",
    "adata.obs['Dataset'] = 'Han_Oesophagus'\n",
    "adata.obs['InternDatasetNumber'] ='04-9-Oesophagus-Han-2020'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "# Self-assignments (celltype here, age and sex below) leave the data unchanged;\n",
    "# they only fail with a KeyError if the column does not exist.\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['age']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "# Original annotations of the source dataset: fine-grained and global labels.\n",
    "adata.obs['original_celltype_1'] = adata.obs['celltype_specific']\n",
    "adata.obs['original_celltype_2'] = adata.obs['celltype_global']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 238,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the normalized expression matrix as a scipy CSR sparse matrix before the\n",
    "# object is written to disk.\n",
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 239,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the processed dataset as h5ad; `writepath` is defined outside this section.\n",
    "adata.write(writepath + '04-9-Oesophagus-Han-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# 05-Liver"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 05-1-Liver-MacParland-2018"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "metadata": {},
   "outputs": [],
   "source": [
    "# here we use sfaira to import available datasets with annotations\n",
    "# note that the following steps may change depending on the current sfaira version and the path to your repository\n",
    "\n",
    "# Placeholder path: replace it with the local directory that is handed to sfaira.\n",
    "datadir = '/path/to/repo/'\n",
    "\n",
    "ds = sfaira.data.human.DatasetGroupLiver(path=datadir)  # This links all data sets available"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 205,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['human_liver_2018_10x_macparland_001_10.1038/s41467-018-06318-7',\n",
       " 'human_liver_2019_10x_popescu_001_10.1038/s41586-019-1652-y',\n",
       " 'human_liver_2019_10x_ramachandran_001_10.1038/s41586-019-1631-3',\n",
       " 'human_liver_2019_mCELSeq2_aizarani_001_10.1038/s41586-019-1373-2',\n",
       " 'human_liver_2020_microwell_han_001_10.1038/s41586-020-2157-4',\n",
       " 'human_liver_2020_microwell_han_002_10.1038/s41586-020-2157-4',\n",
       " 'human_liver_2020_microwell_han_003_10.1038/s41586-020-2157-4',\n",
       " 'human_liver_2020_microwell_han_004_10.1038/s41586-020-2157-4',\n",
       " 'human_liver_2020_microwell_han_005_10.1038/s41586-020-2157-4']"
      ]
     },
     "execution_count": 205,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the ids of all liver datasets in the group.\n",
    "ds.ids "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 206,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pick the MacParland 2018 dataset by its id instead of by list position, so that a\n",
    "# different ordering of ds.ids (e.g. in another sfaira version) cannot silently select\n",
    "# another study. The assertion fails if the id is missing or ambiguous.\n",
    "macparland_ids = [dataset_id for dataset_id in ds.ids if 'macparland' in dataset_id]\n",
    "assert len(macparland_ids) == 1, macparland_ids\n",
    "idx = macparland_ids[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'human_liver_2018_10x_macparland_001_10.1038/s41467-018-06318-7'"
      ]
     },
     "execution_count": 212,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show which dataset id was selected.\n",
    "idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/icb/moritz.thomas/miniconda3/lib/python3.7/site-packages/sfaira-master/sfaira/data/base.py:84: UserWarning: using default genomes Homo_sapiens_GRCh38_97\n",
      "  warnings.warn(f\"using default genomes {genome}\")\n"
     ]
    }
   ],
   "source": [
    "# Load the selected dataset; its AnnData object is read from ds.datasets[idx].adata below.\n",
    "ds.datasets[idx].load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on the AnnData object of the loaded dataset.\n",
    "# NOTE(review): presumably a reference rather than a copy, in which case the in-place\n",
    "# filters below also change the object held by ds -- confirm.\n",
    "adata=ds.datasets[idx].adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the values of the `names` column of adata.var as the gene index, so that\n",
    "# adata.var_names holds those names (the 'MT-' prefix lookup below works on them).\n",
    "adata.var.index=np.array(adata.var.names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag every cell with the internal dataset identifier of this section.\n",
    "adata.obs['InternDatasetNumber'] ='05-1-Liver-MacParland-2018'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compute scanpy's standard QC metrics in place; the `total_counts` column in\n",
    "# adata.obs is used for the mitochondrial fraction below.\n",
    "sc.pp.calculate_qc_metrics(adata, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-CO2', 'MT-ATP8', 'MT-ATP6', 'MT-CO3',\n",
       "       'MT-ND3', 'MT-ND4L', 'MT-ND4', 'MT-ND5', 'MT-ND6', 'MT-CYB'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 191,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the mitochondrial genes, i.e. the gene names that start with 'MT-'.\n",
    "adata.var_names[[gene.startswith('MT-') for gene in adata.var_names]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fraction of each cell's counts that comes from mitochondrial genes (prefix 'MT-').\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "# .sum(1) returns an (n_cells, 1) np.matrix for a sparse adata.X but a flat array for a\n",
    "# dense one; flatten explicitly so the division is element-wise per cell in both cases.\n",
    "mt_counts = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "adata.obs['mt_frac'] = mt_counts / adata.obs['total_counts'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of cells: 8444\n",
      "filtered out 94 cells that have more than 25000 counts\n",
      "Number of cells after max count filter: 8350\n",
      "Number of cells after MT filter: 6289\n",
      "filtered out 18 cells that have more than 4000 genes expressed\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/scanpy/preprocessing/_simple.py:140: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  adata.obs['n_genes'] = number\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "print('Total number of cells: {:d}'.format(adata.n_obs))\n",
    "\n",
    "#Filter out cells with more than 25000 counts\n",
    "sc.pp.filter_cells(adata, max_counts = 25000)\n",
    "print('Number of cells after max count filter: {:d}'.format(adata.n_obs))\n",
    "\n",
    "#MT filter: keep cells with less than 20% mitochondrial counts.\n",
    "# .copy() turns the subset into a real AnnData object; filtering a view in place is\n",
    "# what produced the ImplicitModificationWarning in the recorded output.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.2].copy()\n",
    "print('Number of cells after MT filter: {:d}'.format(adata.n_obs))\n",
    "\n",
    "#Filter out cells with more than 4000 detected genes\n",
    "sc.pp.filter_cells(adata, max_genes = 4000)\n",
    "print('Number of cells after gene filter: {:d}'.format(adata.n_obs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of genes: 20007\n",
      "filtered out 5140 genes that are detected in less than 20 cells\n",
      "Number of genes after cell filter: 14867\n"
     ]
    }
   ],
   "source": [
    "#Filter genes:\n",
    "print('Total number of genes: {:d}'.format(adata.n_vars))\n",
    "\n",
    "# Min 20 cells - filters out low count genes\n",
    "# (genes detected in fewer than 20 cells are removed from adata in place)\n",
    "sc.pp.filter_genes(adata, min_cells=20)\n",
    "print('Number of genes after cell filter: {:d}'.format(adata.n_vars))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 15 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 205,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 206,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:12)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_ontology_class'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Alpha beta T cells', 'Central venous LSECs', 'Cholangiocytes',\n",
    "       'Endothelial cell', 'Erythroid cells', 'Gamma delta T cells 1',\n",
    "       'Gamma delta T cells 2', 'Hepatic stellate cells', 'Hepatocyte 1',\n",
    "       'Hepatocyte 2', 'Hepatocyte 3', 'Hepatocyte 4', 'Hepatocyte 5',\n",
    "       'Hepatocyte 6', 'Inflammatory macrophages', 'Mature B cells', 'NK cell',\n",
    "       'Non inflammatory macrophages', 'Periportal LSECs', 'Plasma cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['Gamma delta T cells 1','Gamma delta T cells 2'])\n",
    "ref_cluster[ix]='Alpha beta T cells'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Central venous LSECs', 'Periportal LSECs'])\n",
    "ref_cluster[ix]= 'Endothelial cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Hepatocyte 2', 'Hepatocyte 3', 'Hepatocyte 4', 'Hepatocyte 5','Hepatocyte 6'])\n",
    "ref_cluster[ix]='Hepatocyte 1'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Non inflammatory macrophages'])\n",
    "ref_cluster[ix]='Inflammatory macrophages'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Alpha beta T cells', 'Cholangiocytes',\n",
    "      'Endothelial cell', 'Erythroid cells', 'Hepatic stellate cells', 'Hepatocyte 1',\n",
    "      'Inflammatory macrophages', 'Mature B cells', 'NK cell', 'Plasma cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['T cells', 'Cholangiocytes',\n",
    "       'Endothelial cells', 'Erythroid cells', 'Hepatic stellate cells', 'Hepatocytes',\n",
    "       'Macrophages', 'B cells', 'NK cells', 'Plasma cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Liver'\n",
    "adata.obs['Organ_Specific'] = 'Liver'\n",
    "adata.obs['Dataset'] = 'MacParland_Liver'\n",
    "adata.obs['InternDatasetNumber'] ='05-1-Liver-MacParland-2018'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = 'MacParland_Liver-Donor1'\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = 'NaN'\n",
    "adata.obs['sex'] = 'NaN'\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_ontology_class']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 220,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '05-1-Liver-MacParland-2018-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "##  05-2-Liver-Ramachandran-2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "metadata": {},
   "outputs": [],
   "source": [
    "# here we use sfaira to import available datasets with annotations\n",
    "# note that the following steps may change depending on the current sfaira version and the path to your repository\n",
    "\n",
    "datadir = '/path/to/repo/'\n",
    "\n",
    "ds = sfaira.data.human.DatasetGroupLiver(path=datadir)  # This links all data sets available"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pick\n",
    "idx = ds.ids[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'human_liver_2019_10x_ramachandran_001_10.1038/s41586-019-1631-3'"
      ]
     },
     "execution_count": 225,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "metadata": {},
   "outputs": [],
   "source": [
    "ds.datasets[idx].load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata=ds.datasets[idx].adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 233,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata.obs['aetiology'],['Uninjured']) \n",
    "adata=adata[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.var.index=np.array(adata.var.names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 275,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='05-2-Liver-Ramachandran-2019'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 278,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.calculate_qc_metrics(adata, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 280,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alias the scanpy QC metrics (computed by sc.pp.calculate_qc_metrics above) to shorter column names\n",
    "adata.obs['n_counts'] = adata.obs['total_counts']\n",
    "# log_counts must be the log of the per-cell total counts, not of the number of detected genes\n",
    "adata.obs['log_counts'] = adata.obs['log1p_total_counts']\n",
    "adata.obs['n_genes'] = adata.obs['n_genes_by_counts']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 4933 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "#sc.pp.filter_cells(adata, max_counts = 23000)\n",
    "#sc.pp.filter_cells(adata, max_genes = 6700)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 286,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes (gene symbols starting with 'MT-')\n",
    "mt_gene_mask = adata.var_names.str.startswith('MT-')\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense (works for any sparse format; an already dense matrix is left untouched)\n",
    "if sp.sparse.issparse(adata.X):\n",
    "    adata.X = adata.X.toarray()\n",
    "\n",
    "# manually define mt score: fraction of each cell's total counts that comes from mt genes\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Keep only cells with less than 20% mito fraction\n",
    "#.copy() turns the view returned by the subsetting into a real AnnData object, so that later\n",
    "#writes to adata.obs do not trigger an ImplicitModificationWarning\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 287,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 288,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:02): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:08)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:03)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 16 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:03)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 289,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 290,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 291,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 292,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 293,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 294,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 295,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 296,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 297,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:07)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:06)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:26)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 299,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['annotation_indepth'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 300,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['MPs (1)', 'MPs (2)', 'MPs (3)', 'MPs (4)', 'MPs (5)', 'MPs (6)',\n",
       "       'MPs (7)', 'MPs (8)', 'MPs (9)', 'Cycling MPs (1)', 'Cycling MPs (2)',\n",
       "       'Cycling MPs (3)', 'Cycling MPs (4)', 'pDCs', 'ILCs (1)', 'ILCs (2)',\n",
       "       'ILCs (3)', 'Cycling ILCs (1)', 'Cycling ILCs (2)', 'Tcells (1)',\n",
       "       'Tcells (2)', 'Tcells (3)', 'Tcells (4)', 'Tcells (5)',\n",
       "       'Cycling Tcells', 'Bcells (1)', 'Bcells (2)', 'Plasma Bcells (1)',\n",
       "       'Plasma Bcells (2)', 'Mast cells', 'Endothelia (1)', 'Endothelia (2)',\n",
       "       'Endothelia (3)', 'Endothelia (4)', 'Endothelia (5)', 'Endothelia (6)',\n",
       "       'Endothelia (7)', 'Mesenchyme (1)', 'Mesenchyme (2)', 'Myofibroblasts',\n",
       "       'Mesothelia', 'Hepatocytes', 'Cholangiocytes (1)', 'Cholangiocytes (2)',\n",
       "       'Cholangiocytes (3)'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 300,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 301,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['MPs (1)', 'MPs (2)', 'MPs (3)', 'MPs (4)', 'MPs (5)', 'MPs (6)',\n",
    "       'MPs (7)', 'MPs (8)', 'MPs (9)', 'Cycling MPs (1)', 'Cycling MPs (2)',\n",
    "       'Cycling MPs (3)', 'Cycling MPs (4)', 'pDCs', 'ILCs (1)', 'ILCs (2)',\n",
    "       'ILCs (3)', 'Cycling ILCs (1)', 'Cycling ILCs (2)', 'Tcells (1)',\n",
    "       'Tcells (2)', 'Tcells (3)', 'Tcells (4)', 'Tcells (5)',\n",
    "       'Cycling Tcells', 'Bcells (1)', 'Bcells (2)', 'Plasma Bcells (1)',\n",
    "       'Plasma Bcells (2)', 'Mast cells', 'Endothelia (1)', 'Endothelia (2)',\n",
    "       'Endothelia (3)', 'Endothelia (4)', 'Endothelia (5)', 'Endothelia (6)',\n",
    "       'Endothelia (7)', 'Mesenchyme (1)', 'Mesenchyme (2)', 'Myofibroblasts',\n",
    "       'Mesothelia', 'Hepatocytes', 'Cholangiocytes (1)', 'Cholangiocytes (2)',\n",
    "       'Cholangiocytes (3)'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 302,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['MPs (2)', 'MPs (3)', 'MPs (4)', 'MPs (5)', 'MPs (6)',\n",
    "       'MPs (7)', 'MPs (8)', 'MPs (9)', 'Cycling MPs (1)', 'Cycling MPs (2)',\n",
    "       'Cycling MPs (3)', 'Cycling MPs (4)'])\n",
    "ref_cluster[ix]='MPs (1)'\n",
    "\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'ILCs (2)','ILCs (3)', 'Cycling ILCs (1)', 'Cycling ILCs (2)'])\n",
    "ref_cluster[ix]='ILCs (1)'\n",
    "\n",
    "\n",
    "ix=np.isin(ref_cluster,['Tcells (2)', 'Tcells (3)', 'Tcells (4)', 'Tcells (5)','Cycling Tcells'])\n",
    "ref_cluster[ix]='Tcells (1)'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Bcells (2)'])\n",
    "ref_cluster[ix]='Bcells (1)'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Plasma Bcells (2)'])\n",
    "ref_cluster[ix]='Plasma Bcells (1)'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Endothelia (2)','Endothelia (3)', 'Endothelia (4)', 'Endothelia (5)', \n",
    "                        'Endothelia (6)','Endothelia (7)'])\n",
    "ref_cluster[ix]='Endothelia (1)'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Mesenchyme (2)'])\n",
    "ref_cluster[ix]= 'Mesenchyme (1)'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Cholangiocytes (2)','Cholangiocytes (3)'])\n",
    "ref_cluster[ix]= 'Cholangiocytes (1)'\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 303,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['MPs (1)', 'pDCs', 'ILCs (1)','Tcells (1)', 'Bcells (1)', 'Plasma Bcells (1)',\n",
    "                                                       'Mast cells', 'Endothelia (1)', 'Mesenchyme (1)',\n",
    "                                                       'Myofibroblasts','Mesothelia', 'Hepatocytes', 'Cholangiocytes (1)'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 304,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['MPs (1)', 'pDCs', 'ILCs (1)', 'Tcells (1)', 'Bcells (1)',\n",
       "       'Plasma Bcells (1)', 'Mast cells', 'Endothelia (1)', 'Mesenchyme (1)',\n",
       "       'Myofibroblasts', 'Mesothelia', 'Hepatocytes', 'Cholangiocytes (1)'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 304,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 305,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "# Rename the merged clusters to the cell type names used consistently across datasets\n",
    "# (the new names are matched to the existing categories by position)\n",
    "adata.rename_categories('celltype', ['Macrophages', 'Dendritic cells', 'Innate lymphoid cells', 'T cells', 'B cells', 'Plasma cells',\n",
    "        'Mast cells','Endothelial cells', 'Mesenchymal stromal cells',\n",
    "        'Myofibroblast cells', 'Mesothelial cells', 'Hepatocytes', 'Cholangiocytes'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 310,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "# Build the donor column with an explicit category order, so that the positional renaming below\n",
    "# cannot pair a label with the wrong donor\n",
    "ref_cluster=pd.Categorical(adata.obs['liver'],\n",
    "                           categories=['Healthy1', 'Healthy2', 'Healthy3', 'Healthy4', 'Healthy5'])\n",
    "adata.obs['donor'] = ref_cluster\n",
    "# values outside the listed categories become NaN; fail loudly instead of writing them out\n",
    "assert not adata.obs['donor'].isna().any(), 'unexpected donor label in the liver column'\n",
    "adata.rename_categories('donor', ['Ramachandran_Liver-Donor1', 'Ramachandran_Liver-Donor2', 'Ramachandran_Liver-Donor3', 'Ramachandran_Liver-Donor4', 'Ramachandran_Liver-Donor5'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 311,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Liver'\n",
    "adata.obs['Organ_Specific'] = 'Liver'\n",
    "adata.obs['Dataset'] = 'Ramachandran_Liver'\n",
    "adata.obs['InternDatasetNumber'] ='05-2-Liver-Ramachandran-2019'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = 'NaN'\n",
    "adata.obs['sex'] = 'NaN'\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['annotation_indepth']\n",
    "adata.obs['original_celltype_2'] = adata.obs['annotation_lineage']\n",
    "adata.obs['original_celltype_3'] = adata.obs['cell_ontology_class']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 313,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Macrophages', 'Dendritic cells', 'Innate lymphoid cells', 'T cells',\n",
       "       'B cells', 'Plasma cells', 'Mast cells', 'Endothelial cells',\n",
       "       'Mesenchymal stromal cells ', 'Myofibroblast cells',\n",
       "       'Mesothelial cells', 'Hepatocytes', 'Cholangiocytes'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "adata.rename_categories('celltype',['Macrophages', 'Dendritic cells', 'Innate lymphoid cells', 'T cells',\n",
    "       'B cells', 'Plasma cells', 'Mast cells', 'Endothelial cells',\n",
    "       'Mesenchymal stromal cells', 'Myofibroblast cells',\n",
    "       'Mesothelial cells', 'Hepatocytes', 'Cholangiocytes'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '05-2-Liver-Ramachandran-2019-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 05-3-Liver-Andrews-2021"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 315,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_collections = [\"44531dd9-1388-4416-a117-af0a99de2294\"]\n",
    "cache_path = os.path.join(\".\", \"data\")\n",
    "dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)\n",
    "dsg.subset(key=\"collection_id\", values=target_collections)\n",
    "dsg.datasets\n",
    "dsg.download()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 424,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/path/to/repo/44531dd9-1388-4416-a117-af0a99de2294/'\n",
    "files = [f for f in listdir(path) if isfile(join(path, f))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 425,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
      "  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],\n"
     ]
    }
   ],
   "source": [
    "# Read every file of the collection and remember which file each cell came from\n",
    "adatas = []\n",
    "for fname in files:\n",
    "    u = sc.read_h5ad(join(path, fname))\n",
    "    u.obs['id'] = fname\n",
    "    adatas.append(u)\n",
    "\n",
    "# Fail loudly rather than silently continuing with the adata of the previous dataset\n",
    "if not adatas:\n",
    "    raise FileNotFoundError('no files found in ' + path)\n",
    "\n",
    "# Concatenate once instead of pairwise inside the loop: this avoids re-copying the growing object\n",
    "# and stacking one index suffix per file onto the cell names\n",
    "adata = adatas[0] if len(adatas) == 1 else adatas[0].concatenate(*adatas[1:], join='outer')\n",
    "del adatas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 427,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='05-3-Liver-2021-Andrews'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 432,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.var.index = adata.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 434,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.calculate_qc_metrics(adata, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 435,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alias the scanpy QC metrics (computed by sc.pp.calculate_qc_metrics above) to shorter column names\n",
    "adata.obs['n_counts'] = adata.obs['total_counts']\n",
    "# log_counts must be the log of the per-cell total counts, not of the number of detected genes\n",
    "adata.obs['log_counts'] = adata.obs['log1p_total_counts']\n",
    "adata.obs['n_genes'] = adata.obs['n_genes_by_counts']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 439,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 308 cells that have more than 4200 counts\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.\n",
      "  warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 23025 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 4200)\n",
    "sc.pp.filter_cells(adata, max_genes = 9000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 441,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes (gene symbols starting with 'MT-')\n",
    "mt_gene_mask = adata.var_names.str.startswith('MT-')\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense (works for any sparse format; an already dense matrix is left untouched)\n",
    "if sp.sparse.issparse(adata.X):\n",
    "    adata.X = adata.X.toarray()\n",
    "\n",
    "# manually define mt score: fraction of each cell's total counts that comes from mt genes\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Keep only cells with less than 20% mito fraction\n",
    "#.copy() turns the view returned by the subsetting into a real AnnData object, so that later\n",
    "#writes to adata.obs do not trigger an ImplicitModificationWarning\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 442,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 443,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:06): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:18)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:21)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 28 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:25)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 444,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 445,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 446,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 447,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 448,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 449,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 450,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 451,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 452,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 453,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:54)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:38)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:02:21)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 455,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype'] = adata.obs['cell_type']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 456,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cell', 'Kupffer cell', 'alpha-beta T cell',\n",
       "       'blood vessel endothelial cell', 'centrilobular region hepatocyte',\n",
       "       'cholangiocyte', 'endothelial cell of pericentral hepatic sinusoid',\n",
       "       'endothelial cell of periportal hepatic sinusoid',\n",
       "       'erythroid lineage cell', 'fibroblast', 'gamma-delta T cell',\n",
       "       'hepatic stellate cell', 'hepatocyte', 'inflammatory macrophage',\n",
       "       'mature B cell', 'midzonal region hepatocyte', 'natural killer cell',\n",
       "       'periportal region hepatocyte', 'plasma cell', 'progenitor cell',\n",
       "       'vascular associated smooth muscle cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 456,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 457,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B cell', 'Kupffer cell', 'alpha-beta T cell',\n",
    "       'blood vessel endothelial cell', 'centrilobular region hepatocyte',\n",
    "       'cholangiocyte', 'endothelial cell of pericentral hepatic sinusoid',\n",
    "       'endothelial cell of periportal hepatic sinusoid',\n",
    "       'erythroid lineage cell', 'fibroblast', 'gamma-delta T cell',\n",
    "       'hepatic stellate cell', 'hepatocyte', 'inflammatory macrophage',\n",
    "       'mature B cell', 'midzonal region hepatocyte', 'natural killer cell',\n",
    "       'periportal region hepatocyte', 'plasma cell', 'progenitor cell',\n",
    "       'vascular associated smooth muscle cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 458,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['gamma-delta T cell'])\n",
    "ref_cluster[ix]='alpha-beta T cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['endothelial cell of pericentral hepatic sinusoid', 'endothelial cell of periportal hepatic sinusoid',])\n",
    "ref_cluster[ix]=  'blood vessel endothelial cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['centrilobular region hepatocyte', 'midzonal region hepatocyte', 'periportal region hepatocyte'])\n",
    "ref_cluster[ix]='hepatocyte'\n",
    "\n",
    "ix=np.isin(ref_cluster,['mature B cell'])\n",
    "ref_cluster[ix]='B cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 459,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B cell', 'Kupffer cell', 'alpha-beta T cell',\n",
    "       'blood vessel endothelial cell',\n",
    "       'cholangiocyte',\n",
    "       'erythroid lineage cell', 'fibroblast', \n",
    "       'hepatic stellate cell', 'hepatocyte', 'inflammatory macrophage',\n",
    "        'natural killer cell',\n",
    "        'plasma cell', 'progenitor cell',\n",
    "       'vascular associated smooth muscle cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 460,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "adata.rename_categories('celltype', ['B cells', 'Kupffer cells', 'T cells',\n",
    "       'Endothelial cells',\n",
    "       'Cholangiocytes',\n",
    "       'Erythroid cells', 'Fibroblast cells', \n",
    "       'Hepatic stellate cells', 'Hepatocytes', 'Macrophages',\n",
    "        'NK cells',\n",
    "        'Plasma cells', 'Hepatic progenitor cells',\n",
    "       'Smooth muscle cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 463,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex_ontology_term_id'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex_ontology_term_id'],\n",
    "                           categories=['PATO:0000383', 'PATO:0000384'])\n",
    "adata.rename_categories('sex_ontology_term_id', ['Female', 'Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 464,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['tissue'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['tissue'],\n",
    "                           categories=['caudate lobe of liver'])\n",
    "adata.rename_categories('tissue', ['Liver_CaudateLobe'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 465,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['development_stage'],\n",
    "                           categories=['human adult stage', 'mature stage'])\n",
    "ix=np.isin(ref_cluster,['mature stage'])\n",
    "ref_cluster[ix]='human adult stage'\n",
    "adata.obs['development_stage']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['human adult stage'])\n",
    "adata.rename_categories('development_stage', ['Adult'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 466,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['donor'] = adata.obs['donor_uuid']\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['3c1a1b1e-8b4c-45cf-a0c1-b8208c6b1f72',\n",
    "       '9bac7886-bfb5-4b90-903c-2b5834bf0408',\n",
    "       '19b27937-e5aa-4f8d-bb54-da04d451308a',\n",
    "       'bd0d1069-676f-4469-8241-5854a7d5e111'])\n",
    "adata.rename_categories('donor', ['Andrews_Liver-Donor1',\n",
    "       'Andrews_Liver-Donor2',\n",
    "       'Andrews_Liver-Donor3',\n",
    "       'Andrews_Liver-Donor4'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 467,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Liver'\n",
    "adata.obs['Organ_Specific'] = adata.obs['tissue']\n",
    "adata.obs['Dataset'] = 'Andrews_Liver'\n",
    "adata.obs['InternDatasetNumber'] ='05-3-Liver-2021-Andrews'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = adata.obs['sample']\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex_ontology_term_id']\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 469,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 470,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 475,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '05-3-Liver-Andrews-2021-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "##  05-4-Liver-Pisco-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_pisco.obs['tissue'],['liver']) \n",
    "adata=adata_pisco[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 17 cells that have more than 12000 counts\n",
      "filtered out 2 cells that have more than 7000 genes expressed\n",
      "filtered out 40476 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 12000)\n",
    "sc.pp.filter_cells(adata, max_genes = 7000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mitochondrial QC: flag genes whose name starts with 'MT-', compute each cell's\n",
    "# mitochondrial count fraction and keep only cells below the 20% threshold.\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert the count matrix to dense\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score: summed counts of the MT- genes per cell,\n",
    "# flattened to 1-D and divided by adata.obs['n_counts']\n",
    "mt_sum=np.array(adata.X[:, mt_gene_mask].sum(1))\n",
    "mt_sum=np.array(pd.DataFrame(mt_sum)[0])\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep only cells with a mito fraction below 20%.\n",
    "# .copy() turns the subset into a real AnnData object; a bare subset is a view and\n",
    "# writing to its .obs afterwards raises an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 14 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['T cell', 'endothelial cell', 'endothelial cell of hepatic sinusoid',\n",
       "       'erythrocyte', 'fibroblast', 'hepatocyte', 'intrahepatic cholangiocyte',\n",
       "       'liver dendritic cell', 'macrophage', 'mature NK T cell', 'monocyte',\n",
       "       'neutrophil', 'plasma cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 174,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['T cell', 'endothelial cell', 'endothelial cell of hepatic sinusoid',\n",
    "       'erythrocyte', 'fibroblast', 'hepatocyte', 'intrahepatic cholangiocyte',\n",
    "       'liver dendritic cell', 'macrophage', 'mature NK T cell', 'monocyte',\n",
    "       'neutrophil', 'plasma cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,[ 'endothelial cell of hepatic sinusoid',])\n",
    "ref_cluster[ix]='endothelial cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['T cell', 'endothelial cell', \n",
    "       'erythrocyte', 'fibroblast', 'hepatocyte', 'intrahepatic cholangiocyte',\n",
    "       'liver dendritic cell', 'macrophage', 'mature NK T cell', 'monocyte',\n",
    "       'neutrophil', 'plasma cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype',['T cells', 'Endothelial cells',\n",
    "       'Erythroid cells', 'Fibroblast cells', 'Hepatocytes', 'Cholangiocytes',\n",
    "       'Dendritic cells', 'Macrophages', 'NK cells', 'Monocytes',\n",
    "       'Neutrophils', 'Plasma cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['TSP6', 'TSP14'], dtype='object')"
      ]
     },
     "execution_count": 181,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['donor'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['tissue'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['tissue'],\n",
    "                           categories=['liver'])\n",
    "adata.rename_categories('tissue', ['Liver'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex'],\n",
    "                           categories=['male'])\n",
    "adata.rename_categories('sex', ['Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['ethnicity'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['ethnicity'],\n",
    "                           categories=['European'])\n",
    "adata.rename_categories('ethnicity', ['European'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['development_stage'],\n",
    "                           categories=['59-year-old human stage', '67-year-old human stage'])\n",
    "adata.rename_categories('development_stage',['59', '67'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['donor'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['TSP6', 'TSP14'])\n",
    "adata.rename_categories('donor', ['TSP6', 'TSP14'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Liver'\n",
    "adata.obs['Organ_Specific'] = adata.obs['tissue']\n",
    "adata.obs['Dataset'] = 'Pisco_Liver'\n",
    "adata.obs['InternDatasetNumber'] = '05-4-Liver-Pisco-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '05-4-Liver-Pisco-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 05-6-Liver-Han-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 585,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_han.obs['sub_tissue'],['AdultLiver']) \n",
    "adata=adata_han[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 586,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='05-6-Liver-Han-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 590,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 593,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 20 cells that have more than 2400 counts\n",
      "filtered out 3 cells that have more than 1200 genes expressed\n",
      "filtered out 13390 genes that are detected in less than 10 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 2400)\n",
    "sc.pp.filter_cells(adata, max_genes = 1200)\n",
    "# Min 10 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=10) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 594,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mitochondrial QC: flag genes whose name starts with 'MT-', compute each cell's\n",
    "# mitochondrial count fraction and keep only cells below the 20% threshold.\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert the count matrix to dense\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score: summed counts of the MT- genes per cell,\n",
    "# flattened to 1-D and divided by adata.obs['n_counts']\n",
    "mt_sum=np.array(adata.X[:, mt_gene_mask].sum(1))\n",
    "mt_sum=np.array(pd.DataFrame(mt_sum)[0])\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep only cells with a mito fraction below 20%.\n",
    "# .copy() turns the subset into a real AnnData object; a bare subset is a view and\n",
    "# writing to its .obs afterwards raises an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 595,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 596,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 13 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 597,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 598,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 599,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 600,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 601,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 602,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 603,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 604,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 605,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 607,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Activated T cell', 'B cell (Plasmocyte)_IGHA/HM high',\n",
       "       'B cell (Plasmocyte)_IGHA/HM_IGK high',\n",
       "       'B cell (Plasmocyte)_IGHA/HM_IGL high', 'B cell (Plasmocyte)_IGHG high',\n",
       "       'B cell (Plasmocyte)_IGHG_IGK high',\n",
       "       'B cell (Plasmocyte)_IGHG_IGL high', 'B cell (Plasmocyte)_IGHM/HG high',\n",
       "       'Conventional dendritic cell',\n",
       "       'Conventional dendritic cell_FECER1A high', 'Dendritic cell',\n",
       "       'Epithelial cell', 'Epithelial cell_SCGB3A1 high',\n",
       "       'Epithelial cell_TM4SF4 high', 'Hepatocyte', 'Hepatocyte_FGB high',\n",
       "       'Hepatocyte_GSTA1 high', 'Hepatocyte_HP high', 'Hepatocyte_TF high',\n",
       "       'Kuppfer Cell', 'Kuppfer cell', 'Macrophage', 'Mast cell',\n",
       "       'Motile liver macrophage', 'Myeloid cell', 'Neutrophil',\n",
       "       'Neutrophil_CAMP high', 'Neutrophil_CD177 high',\n",
       "       'Neutrophil_ELANE high', 'Neutrophil_LCN2 high', 'Proliferating cell',\n",
       "       'Sinusoidal endothelial cell', 'Sinusoidal endothelial cell_FCN1 high',\n",
       "       'Smooth muscle cell', 'Vascular endothelial cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 607,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['celltype_specific'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 608,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Activated T cell', 'B cell (Plasmocyte)_IGHA/HM high',\n",
    "       'B cell (Plasmocyte)_IGHA/HM_IGK high',\n",
    "       'B cell (Plasmocyte)_IGHA/HM_IGL high', 'B cell (Plasmocyte)_IGHG high',\n",
    "       'B cell (Plasmocyte)_IGHG_IGK high',\n",
    "       'B cell (Plasmocyte)_IGHG_IGL high', 'B cell (Plasmocyte)_IGHM/HG high',\n",
    "       'Conventional dendritic cell',\n",
    "       'Conventional dendritic cell_FECER1A high', 'Dendritic cell',\n",
    "       'Epithelial cell', 'Epithelial cell_SCGB3A1 high',\n",
    "       'Epithelial cell_TM4SF4 high', 'Hepatocyte', 'Hepatocyte_FGB high',\n",
    "       'Hepatocyte_GSTA1 high', 'Hepatocyte_HP high', 'Hepatocyte_TF high',\n",
    "       'Kuppfer Cell', 'Kuppfer cell', 'Macrophage', 'Mast cell',\n",
    "       'Motile liver macrophage', 'Myeloid cell', 'Neutrophil',\n",
    "       'Neutrophil_CAMP high', 'Neutrophil_CD177 high',\n",
    "       'Neutrophil_ELANE high', 'Neutrophil_LCN2 high', 'Proliferating cell',\n",
    "       'Sinusoidal endothelial cell', 'Sinusoidal endothelial cell_FCN1 high',\n",
    "       'Smooth muscle cell', 'Vascular endothelial cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 609,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the fine-grained labels into one representative label per lineage.\n",
    "# Keys are the labels that are kept, values list the labels folded into them.\n",
    "# Labels not mentioned here ('Activated T cell', 'Mast cell', 'Myeloid cell',\n",
    "# 'Smooth muscle cell') are left unchanged.\n",
    "merge_into = {\n",
    "    'B cell (Plasmocyte)_IGHA/HM high': [\n",
    "        'B cell (Plasmocyte)_IGHA/HM_IGK high',\n",
    "        'B cell (Plasmocyte)_IGHA/HM_IGL high',\n",
    "        'B cell (Plasmocyte)_IGHG high',\n",
    "        'B cell (Plasmocyte)_IGHG_IGK high',\n",
    "        'B cell (Plasmocyte)_IGHG_IGL high',\n",
    "        'B cell (Plasmocyte)_IGHM/HG high',\n",
    "    ],\n",
    "    'Conventional dendritic cell': [\n",
    "        'Conventional dendritic cell_FECER1A high',\n",
    "        'Dendritic cell',\n",
    "    ],\n",
    "    'Hepatocyte': [\n",
    "        'Hepatocyte_FGB high',\n",
    "        'Hepatocyte_GSTA1 high',\n",
    "        'Hepatocyte_HP high',\n",
    "        'Hepatocyte_TF high',\n",
    "    ],\n",
    "    'Kuppfer Cell': [\n",
    "        'Kuppfer cell',\n",
    "        'Macrophage',\n",
    "        'Motile liver macrophage',\n",
    "    ],\n",
    "    'Neutrophil': [\n",
    "        'Neutrophil_CAMP high',\n",
    "        'Neutrophil_CD177 high',\n",
    "        'Neutrophil_ELANE high',\n",
    "        'Neutrophil_LCN2 high',\n",
    "    ],\n",
    "    'Sinusoidal endothelial cell': [\n",
    "        'Sinusoidal endothelial cell_FCN1 high',\n",
    "        'Vascular endothelial cell',\n",
    "    ],\n",
    "    'Epithelial cell': [\n",
    "        'Epithelial cell_SCGB3A1 high',\n",
    "        'Epithelial cell_TM4SF4 high',\n",
    "        'Proliferating cell',\n",
    "    ],\n",
    "}\n",
    "\n",
    "for kept_label, merged_labels in merge_into.items():\n",
    "    ix = np.isin(ref_cluster, merged_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 610,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Activated T cell', 'B cell (Plasmocyte)_IGHA/HM high',\n",
    "       'Conventional dendritic cell',\n",
    "      \n",
    "       'Epithelial cell',  'Hepatocyte', \n",
    "       'Kuppfer Cell',  'Mast cell',\n",
    "       'Myeloid cell', 'Neutrophil',\n",
    "         \n",
    "       'Sinusoidal endothelial cell', \n",
    "       'Smooth muscle cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 611,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['T cells', 'Plasma cells',\n",
    "       'Dendritic cells',\n",
    "      \n",
    "       'Unknown',  'Hepatocytes', \n",
    "       'Macrophages',  'Mast cells',\n",
    "       'Myeloid cells', 'Neutrophils',\n",
    "         \n",
    "       'Endothelial cells', \n",
    "       'Smooth muscle cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 615,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rename_categories maps names by position; the original category listed by the\n",
    "# author for this column was ['AdultLiver'].\n",
    "adata.rename_categories('sub_tissue', ['Liver'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 616,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rename_categories maps names by position; the original categories listed by the\n",
    "# author for this column were ['female', 'male'].\n",
    "adata.rename_categories('sex', ['Female', 'Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 617,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rename_categories maps names by position; the original categories listed by the\n",
    "# author for this column were ['21Y', '23Y', '52Y'].\n",
    "adata.rename_categories('age',['21', '23', '52'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 618,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rename_categories maps names by position; the original categories listed by the\n",
    "# author for this column were ['Donor38', 'Donor39', 'Donor40'].\n",
    "adata.rename_categories('donor',['Han-Donor38', 'Han-Donor39', 'Han-Donor40'] )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 619,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Liver'\n",
    "adata.obs['Organ_Specific'] = adata.obs['sub_tissue']\n",
    "adata.obs['Dataset'] = 'Han_Liver'\n",
    "adata.obs['InternDatasetNumber'] ='05-6-Liver-Han-2020'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['age']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['celltype_specific']\n",
    "adata.obs['original_celltype_2'] = adata.obs['celltype_global']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 621,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 622,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '05-6-Liver-Han-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 05-7-Liver_ImmuneCells-Teichmann-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_analysis.obs['tissue_major'], ['Liver'])\n",
    "adata=adata_analysis[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] =  adata.obs['tissue_major']\n",
    "adata.obs['Organ_Specific'] = adata.obs['sub_tissue']\n",
    "adata.obs['Dataset'] = adata.obs['Dataset']\n",
    "adata.obs['InternDatasetNumber'] = '05-7-Liver_ImmuneCells-Teichmann-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '05-7-Liver_ImmuneCells-Teichmann-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# 06-Pancreas"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 06-1-Pancreas-Baron-2016"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample1=pd.read_csv(writepath + 'GSM2230757_human1_umifm_counts.csv',\n",
    "                    delimiter=',')\n",
    "\n",
    "sample1.index=sample1['barcode']\n",
    "\n",
    "id_1=np.array(sample1['Unnamed: 0'])\n",
    "id_1\n",
    "\n",
    "cells_1=np.array(sample1['assigned_cluster'])\n",
    "cells_1\n",
    "\n",
    "del sample1['assigned_cluster']\n",
    "del sample1['Unnamed: 0']\n",
    "del sample1['barcode']\n",
    "\n",
    "adata1=anndata.AnnData(X=sample1)\n",
    "\n",
    "adata1.obs['assigned_cluster']=cells_1\n",
    "adata1.obs['id']=id_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n"
     ]
    }
   ],
   "source": [
    "sample2=pd.read_csv(writepath + 'GSM2230758_human2_umifm_counts.csv',\n",
    "                    delimiter=',')\n",
    "\n",
    "sample2.index=sample2['barcode']\n",
    "\n",
    "id_2=np.array(sample2['Unnamed: 0'])\n",
    "cells_2=np.array(sample2['assigned_cluster'])\n",
    "\n",
    "\n",
    "del sample2['assigned_cluster']\n",
    "del sample2['Unnamed: 0']\n",
    "del sample2['barcode']\n",
    "\n",
    "adata2=anndata.AnnData(X=sample2)\n",
    "\n",
    "adata2.obs['assigned_cluster']=cells_2\n",
    "adata2.obs['id']=id_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n"
     ]
    }
   ],
   "source": [
    "sample3=pd.read_csv(writepath + 'GSM2230759_human3_umifm_counts.csv',\n",
    "                    delimiter=',')\n",
    "\n",
    "sample3.index=sample3['barcode']\n",
    "\n",
    "id_3=np.array(sample3['Unnamed: 0'])\n",
    "cells_3=np.array(sample3['assigned_cluster'])\n",
    "\n",
    "\n",
    "del sample3['assigned_cluster']\n",
    "del sample3['Unnamed: 0']\n",
    "del sample3['barcode']\n",
    "\n",
    "adata3=anndata.AnnData(X=sample3)\n",
    "\n",
    "adata3.obs['assigned_cluster']=cells_3\n",
    "adata3.obs['id']=id_3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n"
     ]
    }
   ],
   "source": [
    "sample4=pd.read_csv(writepath + 'GSM2230760_human4_umifm_counts.csv',\n",
    "                    delimiter=',')\n",
    "\n",
    "sample4.index=sample4['barcode']\n",
    "\n",
    "id_4=np.array(sample4['Unnamed: 0'])\n",
    "cells_4=np.array(sample4['assigned_cluster'])\n",
    "\n",
    "\n",
    "del sample4['assigned_cluster']\n",
    "del sample4['Unnamed: 0']\n",
    "del sample4['barcode']\n",
    "\n",
    "adata4=anndata.AnnData(X=sample4)\n",
    "\n",
    "adata4.obs['assigned_cluster']=cells_4\n",
    "adata4.obs['id']=id_4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n",
      "Or pass `index_unique!=None` to `.concatenate`.\n"
     ]
    }
   ],
   "source": [
    "adata=adata1.concatenate(adata2, adata3, adata4, batch_key='sample')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 506,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='06-1-Pancreas-Baron-2016'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 510,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 513,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/_core/anndata.py:1828: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n",
      "  utils.warn_names_duplicates(\"obs\")\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "#sc.pp.filter_cells(adata, max_counts = 4200)\n",
    "#sc.pp.filter_cells(adata, max_genes = 9000)\n",
    "# Keep only genes detected in at least 10 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 515,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify mitochondrial genes (gene symbols starting with 'MT-')\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert a sparse count matrix to dense; an already dense matrix is left untouched\n",
    "if sp.sparse.issparse(adata.X):\n",
    "    adata.X = adata.X.todense()\n",
    "\n",
    "# Fraction of each cell's total counts that comes from mitochondrial genes\n",
    "# (asarray + ravel flattens the (n_cells, 1) row sums of a dense matrix to 1-D)\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Filter out cells with a mitochondrial fraction of 20% or more.\n",
    "# .copy() materializes the subset so that later writes to adata.obs do not hit a view.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 516,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/_core/anndata.py:1828: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n",
      "  utils.warn_names_duplicates(\"obs\")\n"
     ]
    }
   ],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 517,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 12 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 518,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 519,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 520,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n",
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/_core/anndata.py:1828: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n",
      "  utils.warn_names_duplicates(\"obs\")\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 521,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 522,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the raw counts (adata.X) for downstream analysis\n",
    "# by storing them in a dedicated 'counts' layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 523,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Freeze the current state in .raw. An explicit copy guarantees that .raw does not share\n",
    "# memory with adata.X, which the next cell normalizes and log-transforms in place.\n",
    "adata.raw = adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 524,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 525,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 526,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 527,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:03)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:16)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
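  {
   "cell_type": "code",
   "execution_count": 529,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Acinar', 'Alpha', 'Beta', 'Delta', 'Ductal', 'Endothelial', 'Epsilon',\n",
       "       'Gamma', 'Macrophages', 'Mast cells', 'Stellates', 'Schwann',\n",
       "       'T cells'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 529,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Make cell type annotations consistent across datasets.\n",
    "# NOTE(review): 'annotations_final' is not created by any cell of this section (only\n",
    "# 'assigned_cluster' is loaded from the count files); it presumably stems from a removed\n",
    "# cell - confirm, otherwise this line raises a KeyError after Restart & Run All.\n",
    "adata.obs['celltype'] = adata.obs['annotations_final'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },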
  {
   "cell_type": "code",
   "execution_count": 530,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Acinar', 'Alpha', 'Beta', 'Delta', 'Ductal', 'Endothelial', 'Epsilon',\n",
    "       'Gamma', 'Macrophages', 'Mast cells', 'Stellates', 'Schwann',\n",
    "       'T cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 531,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['Beta', 'Delta', 'Epsilon', 'Gamma'])\n",
    "ref_cluster[ix]='Alpha'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 532,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Acinar', 'Alpha', 'Ductal', 'Endothelial',\n",
    "                                                       'Macrophages', 'Mast cells', 'Stellates', 'Schwann',\n",
    "                                                       'T cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 533,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Acinar', 'Alpha', 'Ductal', 'Endothelial', 'Macrophages', 'Mast cells',\n",
       "       'Stellates', 'Schwann', 'T cells'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 533,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 534,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "adata.rename_categories('celltype',['Pancreatic acinar cells', 'Pancreatic endocrine cells', 'Pancreatic ductal cells', 'Endothelial cells', 'Macrophages', 'Mast cells',\n",
    "       'Pancreatic stellate cells', 'Glial cells', 'T cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 538,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "# One donor per input sample; copy so that renaming the donors leaves 'sample' untouched\n",
    "adata.obs['donor'] = adata.obs['sample'].copy()\n",
    "# Positional rename of the per-sample batch categories created by concatenate(batch_key='sample')\n",
    "adata.rename_categories('donor', ['Baron_Pancreas-Donor1', 'Baron_Pancreas-Donor2', 'Baron_Pancreas-Donor3', 'Baron_Pancreas-Donor4'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 539,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Pancreas'\n",
    "adata.obs['Organ_Specific'] = 'Pancreas'\n",
    "adata.obs['Dataset'] = 'Baron_Pancreas'\n",
    "adata.obs['InternDatasetNumber'] ='06-1-Pancreas-Baron-2016'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = adata.obs['sample']\n",
    "adata.obs['age'] = 'NaN'\n",
    "adata.obs['sex'] = 'NaN'\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['annotations_final']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 541,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 542,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 543,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '06-1-Pancreas-Baron-2016-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 06-2-Pancreas-Peng-2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata=sc.read_text(writepath+'count-matrix_peng.txt', \n",
    "                   delimiter=' ', \n",
    "                   dtype='float32')\n",
    "adata=adata.transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Get cell type annotations\n",
    "annotations=pd.read_csv(writepath + 'all_celltype_peng.txt', delimiter='\\t')\n",
    "adata.obs['cell_types']=np.array(annotations['cluster'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "samples=[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "barcodes=adata.obs.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample ID = first three characters of each barcode. Building the list in a single\n",
    "# expression keeps the cell idempotent (re-running it cannot append duplicates).\n",
    "samples = [barcode[0:3] for barcode in barcodes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "samples=np.array(samples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "string_samples = str(samples).replace('_', '')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sample']=samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "... storing 'cell_types' as categorical\n",
      "... storing 'sample' as categorical\n"
     ]
    }
   ],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('sample',['N1', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8', 'N9', 'N10',\n",
    "       'N11', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9',\n",
    "       'T10', 'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19',\n",
    "       'T20', 'T21', 'T22', 'T23', 'T24'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['condition']=adata.obs['sample'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['condition'],\n",
    "                           categories=['N1', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8', 'N9', 'N10', 'N11',\n",
    "       'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T10', 'T11',\n",
    "       'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19', 'T20', 'T21',\n",
    "       'T22', 'T23', 'T24'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse all N* samples onto 'N1' and all T* samples onto 'T1'\n",
    "other_n_samples = ['N%d' % k for k in range(2, 12)]\n",
    "ix = np.isin(ref_cluster, other_n_samples)\n",
    "ref_cluster[ix] = 'N1'\n",
    "\n",
    "other_t_samples = ['T%d' % k for k in range(2, 25)]\n",
    "ix = np.isin(ref_cluster, other_t_samples)\n",
    "ref_cluster[ix] = 'T1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['condition']=pd.Categorical(ref_cluster,\n",
    "                                            categories=['N1','T1'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('condition',\n",
    "                        ['Healthy','Tumor'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 548,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='06-2-Pancreas-Peng-2019'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 551,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the cells whose condition is 'Healthy'\n",
    "ix = adata.obs['condition'].isin(['Healthy']).values\n",
    "adata = adata[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 554,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 557,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 2242 genes that are detected in less than 10 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "#sc.pp.filter_cells(adata, max_counts = 4200)\n",
    "#sc.pp.filter_cells(adata, max_genes = 9000)\n",
    "# Min 10 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 559,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag mitochondrial genes by their 'MT-' symbol prefix\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names], dtype=bool)\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert X to a dense array (skipped when it is already dense)\n",
    "if sp.sparse.issparse(adata.X):\n",
    "    adata.X = adata.X.toarray()\n",
    "\n",
    "# Fraction of each cell's counts coming from mitochondrial genes.\n",
    "# Cells with n_counts == 0 yield NaN, fail the comparison below and are dropped.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep cells with less than 20% mitochondrial counts.\n",
    "# .copy() makes adata a real object instead of a view, which avoids the\n",
    "# ImplicitModificationWarning raised when .obs is modified afterwards.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 560,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 561,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:02): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 13 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 562,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 563,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 564,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 565,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 566,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 567,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 568,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 569,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 570,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:07)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:13)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 571,
   "metadata": {},
   "outputs": [],
   "source": [
    "#sc.pl.umap(adata, color='cell_type')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 572,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Acinar cell', 'B cell', 'Ductal cell type 1', 'Endocrine cell',\n",
       "       'Endothelial cell', 'Fibroblast cell', 'Macrophage cell',\n",
       "       'Stellate cell', 'T cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 572,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype'] = adata.obs['cell_types'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 573,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Acinar cell', 'B cell', 'Ductal cell type 1', 'Ductal cell type 2',\n",
    "       'Endocrine cell', 'Endothelial cell', 'Fibroblast cell',\n",
    "       'Macrophage cell', 'Stellate cell', 'T cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 574,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['Ductal cell type 2'])\n",
    "ref_cluster[ix]='Ductal cell type 1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 575,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Acinar cell', 'B cell', 'Ductal cell type 1', 'Endocrine cell',\n",
    "       'Endothelial cell', 'Fibroblast cell', 'Macrophage cell',\n",
    "       'Stellate cell', 'T cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 576,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "adata.rename_categories('celltype',['Pancreatic acinar cells', 'B cells', 'Pancreatic ductal cells',\n",
    "       'Pancreatic endocrine cells', 'Endothelial cells', 'Fibroblast cells',\n",
    "       'Macrophages', 'Pancreatic stellate cells', 'T cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 579,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Give each healthy sample (N1..N11) a dataset-specific donor name.\n",
    "# The explicit old -> new mapping replaces a positional rename, so the result\n",
    "# does not depend on the order of the categories; .copy() keeps 'donor'\n",
    "# independent of the 'sample' column.\n",
    "donor_names = {'N%d' % k: 'Peng_Pancreas-Donor%d' % k for k in range(1, 12)}\n",
    "adata.obs['donor'] = adata.obs['sample'].copy().cat.rename_categories(donor_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 580,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised per-cell annotation columns for this dataset.\n",
    "# The string 'NaN' is the placeholder written for fields without a value here.\n",
    "# Columns are written in list order, so the obs column order is unchanged.\n",
    "annotations = [\n",
    "    ('Organ', 'Pancreas'),\n",
    "    ('Organ_Specific', 'Pancreas'),\n",
    "    ('Dataset', 'Peng_Pancreas'),\n",
    "    ('InternDatasetNumber', '06-2-Pancreas-Peng-2019'),\n",
    "    ('Dataset_status', 'Healthy_Dataset'),\n",
    "    ('celltype', adata.obs['celltype']),\n",
    "    ('sub_celltype', 'NaN'),\n",
    "    ('Malignant', 'NonMalignant'),\n",
    "    ('Patient', adata.obs['donor']),\n",
    "    ('Patient_Number', adata.obs['sample']),\n",
    "    ('age', 'NaN'),\n",
    "    ('sex', 'NaN'),\n",
    "    ('ethnicity', 'NaN'),\n",
    "    ('health_status', 'NaN'),\n",
    "    ('original_celltype_1', adata.obs['cell_types']),\n",
    "    ('original_celltype_2', 'NaN'),\n",
    "    ('original_celltype_3', 'NaN'),\n",
    "]\n",
    "for column, value in annotations:\n",
    "    adata.obs[column] = value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 582,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 583,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 239,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '06-2-Pancreas-Peng-2019-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 06-3-Pancreas-Enge-2017"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 221,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_collections = [\"a238e9fa-2bdf-41df-8522-69046f99baff\"]\n",
    "cache_path = os.path.join(\".\", \"data\")\n",
    "dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)\n",
    "dsg.subset(key=\"collection_id\", values=target_collections)\n",
    "dsg.datasets\n",
    "dsg.download()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_X = 'path/to/repo/a238e9fa-2bdf-41df-8522-69046f99baff/'\n",
    "files = [f for f in listdir(path_X) if isfile(join(path_X, f))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['66d15835-5dc8-4e96-b0eb-f48971cb65e8.h5ad']"
      ]
     },
     "execution_count": 223,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "66d15835-5dc8-4e96-b0eb-f48971cb65e8.h5ad\n"
     ]
    }
   ],
   "source": [
    "# Guard against an empty directory: otherwise the loop body never runs and the\n",
    "# `adata` left over from the previous dataset would silently be processed again.\n",
    "if not files:\n",
    "    raise FileNotFoundError('No files found in ' + path_X)\n",
    "\n",
    "# Read each downloaded file and tag its cells with the file name.\n",
    "# If several files are present, only the last one read is kept in `adata`.\n",
    "for file_name in files:\n",
    "    print(file_name)\n",
    "    path_2 = join(path_X, file_name)\n",
    "    u = sc.read_h5ad(path_2)\n",
    "    u.obs['id'] = file_name\n",
    "    adata = u"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='06-3-Pancreas-Enge-2017'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/pandas/core/series.py:679: RuntimeWarning: divide by zero encountered in log\n",
      "  result = getattr(ufunc, method)(*inputs, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 233,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 8 cells that have more than 7500 genes expressed\n",
      "filtered out 6897 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "#sc.pp.filter_cells(adata, max_counts = 14000)\n",
    "sc.pp.filter_cells(adata, max_genes = 7500)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.var.index = adata.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:16: RuntimeWarning: invalid value encountered in true_divide\n",
      "  app.launch_new_instance()\n"
     ]
    }
   ],
   "source": [
    "# Flag mitochondrial genes by their 'MT-' symbol prefix\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names], dtype=bool)\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert X to a dense array (skipped when it is already dense)\n",
    "if sp.sparse.issparse(adata.X):\n",
    "    adata.X = adata.X.toarray()\n",
    "\n",
    "# Fraction of each cell's counts coming from mitochondrial genes.\n",
    "# Cells with n_counts == 0 yield NaN, fail the comparison below and are dropped.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep cells with less than 20% mitochondrial counts.\n",
    "# .copy() makes adata a real object instead of a view, which avoids the\n",
    "# ImplicitModificationWarning raised when .obs is modified afterwards.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 238,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 239,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 19 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 243,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 244,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 245,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 246,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 248,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 249,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 250,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:05)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 252,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 253,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['acinar cell', 'mesenchymal cell', 'native cell',\n",
       "       'pancreatic ductal cell', 'type A enteroendocrine cell',\n",
       "       'type B pancreatic cell', 'type D enteroendocrine cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 253,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['acinar cell', 'mesenchymal cell', 'native cell',\n",
    "       'pancreatic ductal cell', 'type A enteroendocrine cell',\n",
    "       'type B pancreatic cell', 'type D enteroendocrine cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 255,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['type B pancreatic cell', 'type D enteroendocrine cell'])\n",
    "ref_cluster[ix]='type A enteroendocrine cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 256,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['acinar cell', 'mesenchymal cell', 'native cell',\n",
    "       'pancreatic ductal cell', 'type A enteroendocrine cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 257,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Pancreatic acinar cells', 'Mesenchymal stromal cells', 'Unknown',\n",
    "       'Pancreatic ductal cells', 'Pancreatic endocrine cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shorten the development-stage labels to plain ages; 'unknown' becomes the string 'NaN'.\n",
    "# An explicit old -> new mapping is used instead of a positional list, so the\n",
    "# result does not depend on the order in which the categories are stored.\n",
    "age_labels = {\n",
    "    '1-month-old human stage': '1 Month',\n",
    "    '21-year-old human stage': '21',\n",
    "    '22-year-old human stage': '22',\n",
    "    '38-year-old human stage': '38',\n",
    "    '44-year-old human stage': '44',\n",
    "    '5-year-old human stage': '5',\n",
    "    '54-year-old human stage': '54',\n",
    "    '6-year-old human stage': '6',\n",
    "    'unknown': 'NaN',\n",
    "}\n",
    "adata.obs['development_stage'] = adata.obs['development_stage'].cat.rename_categories(age_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonise the ethnicity labels; 'unknown' becomes the string 'NaN'.\n",
    "# The explicit mapping makes the rename independent of category order.\n",
    "ethnicity_labels = {\n",
    "    'African American': 'African-American',\n",
    "    'Asian': 'Asian',\n",
    "    'European': 'European',\n",
    "    'unknown': 'NaN',\n",
    "}\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity'].cat.rename_categories(ethnicity_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Capitalise the sex labels; 'unknown' becomes the string 'NaN'.\n",
    "# The explicit mapping makes the rename independent of category order.\n",
    "sex_labels = {'female': 'Female', 'male': 'Male', 'unknown': 'NaN'}\n",
    "adata.obs['sex'] = adata.obs['sex'].cat.rename_categories(sex_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 264,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Donor identifiers are derived from the (already shortened) age labels.\n",
    "# NOTE(review): this treats every distinct age as one donor - confirm that no\n",
    "# two donors in this dataset share the same age.\n",
    "# .copy() keeps 'donor' independent of 'development_stage', which is reused as 'age' later.\n",
    "donor_names = {\n",
    "    '1 Month': 'Enge_Pancreas-Donor1',\n",
    "    '21': 'Enge_Pancreas-Donor2',\n",
    "    '22': 'Enge_Pancreas-Donor3',\n",
    "    '38': 'Enge_Pancreas-Donor4',\n",
    "    '44': 'Enge_Pancreas-Donor5',\n",
    "    '5': 'Enge_Pancreas-Donor6',\n",
    "    '54': 'Enge_Pancreas-Donor7',\n",
    "    '6': 'Enge_Pancreas-Donor8',\n",
    "    'NaN': 'Enge_Pancreas-Donor9',\n",
    "}\n",
    "adata.obs['donor'] = adata.obs['development_stage'].copy().cat.rename_categories(donor_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 265,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised per-cell annotation columns for this dataset.\n",
    "# The string 'NaN' is the placeholder written for fields without a value here.\n",
    "# Columns are written in list order, so the obs column order is unchanged.\n",
    "annotations = [\n",
    "    ('Organ', 'Pancreas'),\n",
    "    ('Organ_Specific', 'Pancreas'),\n",
    "    ('Dataset', 'Enge_Pancreas'),\n",
    "    ('InternDatasetNumber', '06-3-Pancreas-Enge-2017'),\n",
    "    ('Dataset_status', 'Healthy_Dataset'),\n",
    "    ('celltype', adata.obs['celltype']),\n",
    "    ('sub_celltype', 'NaN'),\n",
    "    ('Malignant', 'NonMalignant'),\n",
    "    ('Patient', adata.obs['donor']),\n",
    "    ('Patient_Number', 'NaN'),\n",
    "    ('age', adata.obs['development_stage']),\n",
    "    ('sex', adata.obs['sex']),\n",
    "    ('ethnicity', adata.obs['ethnicity']),\n",
    "    ('health_status', 'NaN'),\n",
    "    ('original_celltype_1', adata.obs['cell_type']),\n",
    "    ('original_celltype_2', 'NaN'),\n",
    "    ('original_celltype_3', 'NaN'),\n",
    "]\n",
    "for column, value in annotations:\n",
    "    adata.obs[column] = value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 268,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 269,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '06-3-Pancreas-Enge-2017-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 06-4-Pancreas-Oudenaarden-2016"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 270,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_collections = [\"6e8c5415-302c-492a-a5f9-f29c57ff18fb\"]\n",
    "cache_path = os.path.join(\".\", \"data\")\n",
    "dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)\n",
    "dsg.subset(key=\"collection_id\", values=target_collections)\n",
    "dsg.datasets\n",
    "dsg.download()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 271,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_X = '/path/to/repo/6e8c5415-302c-492a-a5f9-f29c57ff18fb/'\n",
    "files = [f for f in listdir(path_X) if isfile(join(path_X, f))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 272,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['b07e5164-baf6-43d2-bdba-5a249d0da879.h5ad']"
      ]
     },
     "execution_count": 272,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 273,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "b07e5164-baf6-43d2-bdba-5a249d0da879.h5ad\n"
     ]
    }
   ],
   "source": [
    "# Guard against an empty directory: otherwise the loop body never runs and the\n",
    "# `adata` left over from the previous dataset would silently be processed again.\n",
    "if not files:\n",
    "    raise FileNotFoundError('No files found in ' + path_X)\n",
    "\n",
    "# Read each downloaded file and tag its cells with the file name.\n",
    "# If several files are present, only the last one read is kept in `adata`.\n",
    "for file_name in files:\n",
    "    print(file_name)\n",
    "    path_2 = join(path_X, file_name)\n",
    "    u = sc.read_h5ad(path_2)\n",
    "    u.obs['id'] = file_name\n",
    "    adata = u"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 275,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='06-4-Pancreas-Oudenaarden-2016'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 278,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 281,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 9439 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "#sc.pp.filter_cells(adata, max_counts = 23000)\n",
    "#sc.pp.filter_cells(adata, max_genes = 6700)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 283,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.var.index = adata.var['feature_name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag mitochondrial genes by their 'MT-' symbol prefix\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names], dtype=bool)\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert X to a dense array (skipped when it is already dense)\n",
    "if sp.sparse.issparse(adata.X):\n",
    "    adata.X = adata.X.toarray()\n",
    "\n",
    "# Fraction of each cell's counts coming from mitochondrial genes.\n",
    "# Cells with n_counts == 0 yield NaN, fail the comparison below and are dropped.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep cells with less than 20% mitochondrial counts.\n",
    "# .copy() makes adata a real object instead of a view, which avoids the\n",
    "# ImplicitModificationWarning raised when .obs is modified afterwards.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 286,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 10 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 287,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 288,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 289,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 290,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 291,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 292,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 293,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 294,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 297,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 298,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['endothelial cell', 'mesenchymal cell', 'pancreatic A cell',\n",
       "       'pancreatic D cell', 'pancreatic PP cell', 'pancreatic acinar cell',\n",
       "       'pancreatic ductal cell', 'pancreatic endocrine cell',\n",
       "       'pancreatic epsilon cell', 'type B pancreatic cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 298,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 299,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['endothelial cell', 'mesenchymal cell', 'pancreatic A cell',\n",
    "       'pancreatic D cell', 'pancreatic PP cell', 'pancreatic acinar cell',\n",
    "       'pancreatic ductal cell', 'pancreatic endocrine cell',\n",
    "       'pancreatic epsilon cell', 'type B pancreatic cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 300,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,[ 'pancreatic D cell', 'pancreatic PP cell','pancreatic endocrine cell',\n",
    "       'pancreatic epsilon cell', 'type B pancreatic cell'])\n",
    "ref_cluster[ix]='pancreatic A cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 301,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['endothelial cell', 'mesenchymal cell', 'pancreatic A cell',\n",
    "       'pancreatic acinar cell',\n",
    "       'pancreatic ductal cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 305,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['b07e5164-baf6-43d2-bdba-5a249d0da879.h5ad'], dtype='object')"
      ]
     },
     "execution_count": 305,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['id'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 306,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['development_stage'],\n",
    "                           categories=['23-year-old human stage', '48-year-old human stage',\n",
    "       '54-year-old human stage', '59-year-old human stage'])\n",
    "adata.rename_categories('development_stage', ['23', '48',\n",
    "       '54', '59'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 307,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex'],\n",
    "                           categories=['female', 'male'])\n",
    "adata.rename_categories('sex', ['Female', 'Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 308,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['donor'] = adata.obs['development_stage']\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['23-year-old human stage', '48-year-old human stage',\n",
    "       '54-year-old human stage', '59-year-old human stage'])\n",
    "adata.rename_categories('donor', ['Oudenaarden_Pancreas-Donor1', 'Oudenaarden_Pancreas-Donor2',\n",
    "       'Oudenaarden_Pancreas-Donor3', 'Oudenaarden_Pancreas-Donor4'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 312,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='06-4-Pancreas-Oudenaarden-2016'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '06-4-Pancreas-Oudenaarden-2016-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 06-5-Pancreas-Pisco-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 386,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_pisco.obs['tissue'],[ 'endocrine pancreas', 'exocrine pancreas']) \n",
    "adata=adata_pisco[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 387,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] = '06-5-Pancreas-Pisco-2022'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 392,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 396,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 6 cells that have more than 16000 counts\n",
      "filtered out 3 cells that have more than 11000 genes expressed\n",
      "filtered out 34384 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 16000)\n",
    "sc.pp.filter_cells(adata, max_genes = 11000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 398,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes\n",
    "mt_genes = adata.var_names[[gene.startswith('MT-') for gene in adata.var_names]]\n",
    "np.array(mt_genes)\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "\n",
    "# convert to spare\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score\n",
    "y = np.bincount(mt_gene_mask)\n",
    "ii = np.nonzero(y)[0]\n",
    "np.vstack((ii,y[ii])).T\n",
    "adata.X[:, mt_gene_mask].sum(1)\n",
    "mt_sum=np.array(adata.X[:, mt_gene_mask].sum(1))\n",
    "mt_sum=np.array(pd.DataFrame(mt_sum)[0])\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 20% mito fraction\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 399,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 400,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 17 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 401,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 402,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 403,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 404,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 405,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 406,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 407,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 408,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 409,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:11)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 411,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 412,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cell', 'T cell', 'endothelial cell', 'fibroblast', 'mast cell',\n",
       "       'mature NK T cell', 'myeloid cell', 'pancreatic A cell',\n",
       "       'pancreatic D cell', 'pancreatic PP cell', 'pancreatic acinar cell',\n",
       "       'pancreatic ductal cell', 'pancreatic stellate cell', 'plasma cell',\n",
       "       'type B pancreatic cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 412,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 413,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B cell', 'T cell', 'endothelial cell', 'fibroblast', 'mast cell',\n",
    "       'mature NK T cell', 'myeloid cell', 'pancreatic A cell',\n",
    "       'pancreatic D cell', 'pancreatic PP cell', 'pancreatic acinar cell',\n",
    "       'pancreatic ductal cell', 'pancreatic stellate cell', 'plasma cell',\n",
    "       'type B pancreatic cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 414,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['pancreatic D cell', 'pancreatic PP cell', 'type B pancreatic cell'])\n",
    "ref_cluster[ix]='pancreatic A cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 415,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B cell', 'T cell', 'endothelial cell', 'fibroblast', 'mast cell',\n",
    "       'mature NK T cell', 'myeloid cell', 'pancreatic A cell',\n",
    "       'pancreatic acinar cell',\n",
    "       'pancreatic ductal cell', 'pancreatic stellate cell', 'plasma cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 416,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype',['B cells', 'T cells', 'Endothelial cells', 'Fibroblast cells', 'Mast cells',\n",
    "       'NK cells', 'Myeloid cells', 'Pancreatic endocrine cells',\n",
    "       'Pancreatic acinar cells',\n",
    "       'Pancreatic ductal cells', 'Pancreatic stellate cells', 'Plasma cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 420,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['tissue'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['tissue'],\n",
    "                           categories=['endocrine pancreas', 'exocrine pancreas'])\n",
    "adata.rename_categories('tissue', ['Pancreas_Endocrine','Pancreas_Exocrine' ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 421,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex'],\n",
    "                           categories=['female', 'male'])\n",
    "adata.rename_categories('sex', ['Female', 'Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 422,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['ethnicity'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['ethnicity'],\n",
    "                           categories=['European', 'Hispanic or Latin American'])\n",
    "adata.rename_categories('ethnicity', ['European', 'Hispanic or Latin-American'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 423,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['development_stage'],\n",
    "                           categories=['37-year-old human stage', '59-year-old human stage'])\n",
    "adata.rename_categories('development_stage',['37', '59'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 424,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['donor'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['TSP1', 'TSP9'])\n",
    "adata.rename_categories('donor', ['TSP1', 'TSP9'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 425,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Pancreas'\n",
    "adata.obs['Organ_Specific'] = adata.obs['tissue']\n",
    "adata.obs['Dataset'] = 'Pisco_Pancreas'\n",
    "adata.obs['InternDatasetNumber'] = '06-5-Pancreas-Pisco-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 427,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 428,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 429,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '06-5-Pancreas-Pisco-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 06-6-Pancreas-Han-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 733,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_han.obs['sub_tissue'],['AdultPancreas']) \n",
    "adata=adata_han[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 734,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='06-6-Pancreas-Han-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 738,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 741,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 25 cells that have more than 2500 counts\n",
      "filtered out 1 cells that have more than 1100 genes expressed\n",
      "filtered out 14979 genes that are detected in less than 10 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 2500)\n",
    "sc.pp.filter_cells(adata, max_genes = 1100)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=10) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 742,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes\n",
    "mt_genes = adata.var_names[[gene.startswith('MT-') for gene in adata.var_names]]\n",
    "np.array(mt_genes)\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "\n",
    "# convert to spare\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score\n",
    "y = np.bincount(mt_gene_mask)\n",
    "ii = np.nonzero(y)[0]\n",
    "np.vstack((ii,y[ii])).T\n",
    "adata.X[:, mt_gene_mask].sum(1)\n",
    "mt_sum=np.array(adata.X[:, mt_gene_mask].sum(1))\n",
    "mt_sum=np.array(pd.DataFrame(mt_sum)[0])\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 20% mito fraction\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 743,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 744,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 8 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 745,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 746,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 747,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 748,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 749,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 750,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 751,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 752,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 753,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:19)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 755,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Acinar cell_CPA1 high', 'Acinar cell_REG1B high',\n",
       "       'Acniar cell_ANXA4 high', 'Alpha cell', 'Beta cell', 'Ductal cell',\n",
       "       'Endothelial cell', 'Exocrine cell', 'Exocrine cell_SAA1 high',\n",
       "       'Fibroblast', 'M2 Macrophage', 'Smooth muscle cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 755,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['celltype_specific'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 756,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Acinar cell_CPA1 high', 'Acinar cell_REG1B high',\n",
    "       'Acniar cell_ANXA4 high', 'Alpha cell', 'Beta cell', 'Ductal cell',\n",
    "       'Endothelial cell', 'Exocrine cell', 'Exocrine cell_SAA1 high',\n",
    "       'Fibroblast', 'M2 Macrophage', 'Smooth muscle cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 757,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,[ 'Acinar cell_REG1B high', 'Acniar cell_ANXA4 high', 'Exocrine cell', 'Exocrine cell_SAA1 high'])\n",
    "ref_cluster[ix]='Acinar cell_CPA1 high'\n",
    "\n",
    "ix=np.isin(ref_cluster,[  'Beta cell'])\n",
    "ref_cluster[ix]=   'Alpha cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 758,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Acinar cell_CPA1 high', \n",
    "        'Alpha cell', 'Ductal cell',\n",
    "       'Endothelial cell', \n",
    "       'Fibroblast', 'M2 Macrophage', 'Smooth muscle cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 759,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Pancreatic acinar cells', \n",
    "        'Pancreatic endocrine cells', 'Pancreatic ductal cells',\n",
    "       'Endothelial cells',\n",
    "       'Fibroblast cells', 'Macrophages', 'Smooth muscle cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 763,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relabel by category name instead of by position, so the result does not depend on\n",
    "# the order in which the categories happen to be stored.\n",
    "adata.obs['sub_tissue'] = adata.obs['sub_tissue'].cat.rename_categories({'AdultPancreas': 'Pancreas'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 764,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relabel by category name instead of by position (order-independent)\n",
    "adata.obs['sex'] = adata.obs['sex'].cat.rename_categories({'female': 'Female'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 765,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip the unit suffix from the age label; mapping by name keeps this order-independent\n",
    "adata.obs['age'] = adata.obs['age'].cat.rename_categories({'43Y': '43'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 766,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prefix the donor ID with the study name; mapping by name keeps this order-independent\n",
    "adata.obs['donor'] = adata.obs['donor'].cat.rename_categories({'Donor44': 'Han-Donor44'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 767,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised obs annotation for this dataset (column -> value), written in the listed order.\n",
    "# Columns mapped onto themselves are kept unchanged; 'NaN' marks information that is not available.\n",
    "harmonised_obs = {\n",
    "    'Organ': 'Pancreas',\n",
    "    'Organ_Specific': adata.obs['sub_tissue'],\n",
    "    'Dataset': 'Han_Pancreas',\n",
    "    'InternDatasetNumber': '06-6-Pancreas-Han-2020',\n",
    "    'Dataset_status': 'Healthy_Dataset',\n",
    "\n",
    "    'celltype': adata.obs['celltype'],\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "\n",
    "    'Patient': adata.obs['donor'],\n",
    "    'Patient_Number': 'NaN',\n",
    "    'age': adata.obs['age'],\n",
    "    'sex': adata.obs['sex'],\n",
    "    'ethnicity': 'NaN',\n",
    "    'health_status': 'NaN',\n",
    "\n",
    "    'original_celltype_1': adata.obs['celltype_specific'],\n",
    "    'original_celltype_2': adata.obs['celltype_global'],\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "for obs_column, obs_value in harmonised_obs.items():\n",
    "    adata.obs[obs_column] = obs_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 769,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 770,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '06-6-Pancreas-Han-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# 07-Spleen"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 07-1-Spleen-Pisco-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 607,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_pisco.obs['tissue'],['spleen']) \n",
    "adata=adata_pisco[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 612,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "# When adata.X is a scipy sparse matrix, row sums come back as an (n_cells, 1) matrix;\n",
    "# flatten them so every obs column is a plain 1-D array.\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 616,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 54 cells that have more than 12000 counts\n",
      "filtered out 9 cells that have more than 8000 genes expressed\n",
      "filtered out 31568 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "# Remove cells with more than 12000 counts or more than 8000 detected genes\n",
    "sc.pp.filter_cells(adata, max_counts = 12000)\n",
    "sc.pp.filter_cells(adata, max_genes = 8000)\n",
    "# Min 20 cells - removes genes detected in fewer than 20 cells\n",
    "sc.pp.filter_genes(adata, min_cells=20) # NOTE(review): leftover tuning notes - 500 works; 450 (30494), 400 (29837), 300 (28268) do not - meaning unclear, confirm or remove"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 618,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes: boolean mask over adata.var_names for symbols starting with 'MT-'\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names])\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score: per-cell sum over the MT- genes divided by the stored n_counts\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Keep cells with a mito fraction below 20%\n",
    "# .copy() turns the subset into a real AnnData instead of a view, so later writes to\n",
    "# adata.obs no longer trigger an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 619,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 620,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:04): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:04)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 22 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:05)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 621,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 622,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 623,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 624,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 625,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 626,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 627,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 628,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:03)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 629,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:06)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:06)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:27)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 630,
   "metadata": {},
   "outputs": [],
   "source": [
    "#sc.pl.umap(adata, color='cell_type')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 631,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 632,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['CD141-positive myeloid dendritic cell',\n",
       "       'CD1c-positive myeloid dendritic cell',\n",
       "       'CD4-positive, alpha-beta memory T cell',\n",
       "       'CD8-positive, alpha-beta T cell',\n",
       "       'CD8-positive, alpha-beta memory T cell', 'classical monocyte',\n",
       "       'endothelial cell', 'erythrocyte', 'hematopoietic stem cell',\n",
       "       'innate lymphoid cell', 'intermediate monocyte', 'macrophage',\n",
       "       'mature NK T cell', 'memory B cell', 'naive B cell',\n",
       "       'naive thymus-derived CD4-positive, alpha-beta T cell',\n",
       "       'naive thymus-derived CD8-positive, alpha-beta T cell', 'neutrophil',\n",
       "       'plasma cell', 'plasmacytoid dendritic cell', 'platelet',\n",
       "       'regulatory T cell', 'type I NK T cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 632,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 633,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['CD141-positive myeloid dendritic cell',\n",
    "       'CD1c-positive myeloid dendritic cell',\n",
    "       'CD4-positive, alpha-beta memory T cell',\n",
    "       'CD8-positive, alpha-beta T cell',\n",
    "       'CD8-positive, alpha-beta memory T cell', 'classical monocyte',\n",
    "       'endothelial cell', 'erythrocyte', 'hematopoietic stem cell',\n",
    "       'innate lymphoid cell', 'intermediate monocyte', 'macrophage',\n",
    "       'mature NK T cell', 'memory B cell', 'naive B cell',\n",
    "       'naive thymus-derived CD4-positive, alpha-beta T cell',\n",
    "       'naive thymus-derived CD8-positive, alpha-beta T cell', 'neutrophil',\n",
    "       'plasma cell', 'plasmacytoid dendritic cell', 'platelet',\n",
    "       'regulatory T cell', 'type I NK T cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 634,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained labels onto one kept label per group (kept label -> labels folded into it)\n",
    "merge_into = {\n",
    "    'CD141-positive myeloid dendritic cell': [\n",
    "        'CD1c-positive myeloid dendritic cell',\n",
    "        'plasmacytoid dendritic cell',\n",
    "    ],\n",
    "    'CD4-positive, alpha-beta memory T cell': [\n",
    "        'CD8-positive, alpha-beta T cell',\n",
    "        'CD8-positive, alpha-beta memory T cell',\n",
    "        'naive thymus-derived CD4-positive, alpha-beta T cell',\n",
    "        'naive thymus-derived CD8-positive, alpha-beta T cell',\n",
    "        'regulatory T cell',\n",
    "        'type I NK T cell',\n",
    "    ],\n",
    "    'classical monocyte': ['intermediate monocyte'],\n",
    "    'memory B cell': ['naive B cell'],\n",
    "}\n",
    "for kept_label, merged_labels in merge_into.items():\n",
    "    ix = np.isin(ref_cluster, merged_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 635,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['CD141-positive myeloid dendritic cell',\n",
    "        'CD4-positive, alpha-beta memory T cell',\n",
    "      'classical monocyte',\n",
    "       'endothelial cell', 'erythrocyte', 'hematopoietic stem cell',\n",
    "       'innate lymphoid cell', 'macrophage',\n",
    "       'mature NK T cell', 'memory B cell',\n",
    "      'neutrophil',\n",
    "       'plasma cell', 'platelet'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 636,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype',['Dendritic cells',\n",
    "        'T cells',\n",
    "      'Monocytes',\n",
    "       'Endothelial cells', 'Erythroid cells', 'Hematopoietic stem cells',\n",
    "       'Innate lymphoid cells', 'Macrophages',\n",
    "       'NK cells', 'B cells',\n",
    "      'Neutrophils',\n",
    "       'Plasma cells', 'Thrombocytes'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 640,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relabel by category name instead of by position (order-independent)\n",
    "adata.obs['tissue'] = adata.obs['tissue'].cat.rename_categories({'spleen': 'Spleen'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 641,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relabel by category name instead of by position, so 'female'/'male' cannot be swapped\n",
    "# if the stored category order differs from the one assumed here.\n",
    "adata.obs['sex'] = adata.obs['sex'].cat.rename_categories({'female': 'Female', 'male': 'Male'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 642,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only the first label changes (hyphenated spelling); categories missing from the mapping,\n",
    "# such as 'European', are passed through unchanged.\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity'].cat.rename_categories(\n",
    "    {'African American or Afro-Caribbean': 'African-American or Afro-Caribbean'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 643,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reduce the development-stage labels to the age in years; mapping by name keeps each\n",
    "# age attached to the right cells regardless of category order.\n",
    "adata.obs['development_stage'] = adata.obs['development_stage'].cat.rename_categories({\n",
    "    '59-year-old human stage': '59',\n",
    "    '61-year-old human stage': '61',\n",
    "    '69-year-old human stage': '69',\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 644,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Donor IDs are kept as provided (expected: 'TSP2', 'TSP7', 'TSP14'). The earlier positional\n",
    "# rename to the same three names changed nothing when the category order matched and would\n",
    "# have swapped donor labels otherwise, so it is dropped; the categories are shown for checking.\n",
    "adata.obs['donor'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 645,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised obs annotation for this dataset (column -> value), written in the listed order.\n",
    "# Columns mapped onto themselves are kept unchanged; 'NaN' marks information that is not available.\n",
    "harmonised_obs = {\n",
    "    'Organ': 'Spleen',\n",
    "    'Organ_Specific': adata.obs['tissue'],\n",
    "    'Dataset': 'Pisco_Spleen',\n",
    "    'InternDatasetNumber': '07-1-Spleen-Pisco-2022',\n",
    "    'Dataset_status': 'Healthy_Dataset',\n",
    "\n",
    "    'celltype': adata.obs['celltype'],\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "\n",
    "    'Patient': adata.obs['donor'],\n",
    "    'Patient_Number': 'NaN',\n",
    "    'age': adata.obs['development_stage'],\n",
    "    'sex': adata.obs['sex'],\n",
    "    'ethnicity': adata.obs['ethnicity'],\n",
    "    'health_status': 'NaN',\n",
    "\n",
    "    'original_celltype_1': adata.obs['cell_type'],\n",
    "    'original_celltype_2': 'NaN',\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "for obs_column, obs_value in harmonised_obs.items():\n",
    "    adata.obs[obs_column] = obs_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 647,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 648,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 649,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '07-1-Spleen-Pisco-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 07-2-Spleen-Han-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 926,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_han.obs['sub_tissue'],['AdultSpleen']) \n",
    "adata=adata_han[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 927,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='07-2-Spleen-Han-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 931,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "# When adata.X is a scipy sparse matrix, row sums come back as an (n_cells, 1) matrix;\n",
    "# flatten them so every obs column is a plain 1-D array.\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 934,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 25 cells that have more than 4000 counts\n",
      "filtered out 14680 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "# Remove cells with more than 4000 counts or more than 2000 detected genes\n",
    "sc.pp.filter_cells(adata, max_counts = 4000)\n",
    "sc.pp.filter_cells(adata, max_genes = 2000)\n",
    "# Min 20 cells - removes genes detected in fewer than 20 cells\n",
    "sc.pp.filter_genes(adata, min_cells=20) # NOTE(review): leftover tuning notes - 500 works; 450 (30494), 400 (29837), 300 (28268) do not - meaning unclear, confirm or remove"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 935,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes: boolean mask over adata.var_names for symbols starting with 'MT-'\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names])\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score: per-cell sum over the MT- genes divided by the stored n_counts\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Keep cells with a mito fraction below 20%\n",
    "# .copy() turns the subset into a real AnnData instead of a view, so later writes to\n",
    "# adata.obs no longer trigger an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 936,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 937,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 11 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 938,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 939,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 940,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 941,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 942,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 943,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 944,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 945,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 946,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:12)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 947,
   "metadata": {},
   "outputs": [],
   "source": [
    "#sc.pl.umap(adata, color='celltype_specific')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 948,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cell (Plasmocyte)_IGHA/HM_IGK high',\n",
       "       'B cell (Plasmocyte)_IGHA/HM_IGL high',\n",
       "       'B cell (Plasmocyte)_IGHG_IGK high',\n",
       "       'B cell (Plasmocyte)_IGHG_IGL high', 'B cell (centrocyte)',\n",
       "       'CD8_T cell', 'Endothelial cell', 'Erythroid cell',\n",
       "       'Lymphoid progenitor cell', 'M2 macrophage_CXCL8 high',\n",
       "       'M2 macrophage_MALAT1 high', 'Neutrophil', 'Neutrophil_DEFA3 high',\n",
       "       'Neutrophil_OLFM4 high', 'Neutrophil_S100A12 high', 'T cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 948,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['celltype_specific'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 949,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B cell (Plasmocyte)_IGHA/HM_IGK high',\n",
    "       'B cell (Plasmocyte)_IGHA/HM_IGL high',\n",
    "       'B cell (Plasmocyte)_IGHG_IGK high',\n",
    "       'B cell (Plasmocyte)_IGHG_IGL high', 'B cell (centrocyte)',\n",
    "       'CD8_T cell', 'Endothelial cell', 'Erythroid cell',\n",
    "       'Lymphoid progenitor cell', 'M2 macrophage_CXCL8 high',\n",
    "       'M2 macrophage_MALAT1 high', 'Neutrophil', 'Neutrophil_DEFA3 high',\n",
    "       'Neutrophil_OLFM4 high', 'Neutrophil_S100A12 high', 'T cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 950,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained labels onto one kept label per group (kept label -> labels folded into it)\n",
    "merge_into = {\n",
    "    'B cell (Plasmocyte)_IGHA/HM_IGK high': [\n",
    "        'B cell (Plasmocyte)_IGHA/HM_IGL high',\n",
    "        'B cell (Plasmocyte)_IGHG_IGK high',\n",
    "        'B cell (Plasmocyte)_IGHG_IGL high',\n",
    "    ],\n",
    "    'CD8_T cell': ['T cell'],\n",
    "    'M2 macrophage_CXCL8 high': ['M2 macrophage_MALAT1 high'],\n",
    "    'Neutrophil': ['Neutrophil_DEFA3 high', 'Neutrophil_OLFM4 high', 'Neutrophil_S100A12 high'],\n",
    "}\n",
    "for kept_label, merged_labels in merge_into.items():\n",
    "    ix = np.isin(ref_cluster, merged_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 951,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B cell (Plasmocyte)_IGHA/HM_IGK high',\n",
    "      'B cell (centrocyte)',\n",
    "       'CD8_T cell', 'Endothelial cell', 'Erythroid cell',\n",
    "       'Lymphoid progenitor cell', 'M2 macrophage_CXCL8 high',\n",
    "      'Neutrophil'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 952,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Plasma cells',\n",
    "      'B cells',\n",
    "       'T cells', 'Endothelial cells', 'Erythroid cells',\n",
    "       'Common lymphoid progenitor cells', 'Macrophages',\n",
    "      'Neutrophils'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 956,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relabel by category name instead of by position (order-independent)\n",
    "adata.obs['sub_tissue'] = adata.obs['sub_tissue'].cat.rename_categories({'AdultSpleen': 'Spleen'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 957,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relabel by category name instead of by position (order-independent)\n",
    "adata.obs['sex'] = adata.obs['sex'].cat.rename_categories({'female': 'Female'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 958,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip the unit suffix from the age label; mapping by name keeps this order-independent\n",
    "adata.obs['age'] = adata.obs['age'].cat.rename_categories({'51Years': '51'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 959,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Donor ID is kept as provided (expected: 'Donor49'); the earlier rename to the same name\n",
    "# changed nothing, so it is dropped and the categories are shown for checking.\n",
    "# NOTE(review): the Han pancreas section relabels its donor as 'Han-Donor44' - confirm whether\n",
    "# the spleen donor should carry the same 'Han-' prefix.\n",
    "adata.obs['donor'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 960,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised obs annotation for this dataset (column -> value), written in the listed order.\n",
    "# Columns mapped onto themselves are kept unchanged; 'NaN' marks information that is not available.\n",
    "harmonised_obs = {\n",
    "    'Organ': 'Spleen',\n",
    "    'Organ_Specific': adata.obs['sub_tissue'],\n",
    "    'Dataset': 'Han_Spleen',\n",
    "    'InternDatasetNumber': '07-2-Spleen-Han-2020',\n",
    "    'Dataset_status': 'Healthy_Dataset',\n",
    "\n",
    "    'celltype': adata.obs['celltype'],\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "\n",
    "    'Patient': adata.obs['donor'],\n",
    "    'Patient_Number': 'NaN',\n",
    "    'age': adata.obs['age'],\n",
    "    'sex': adata.obs['sex'],\n",
    "    'ethnicity': 'NaN',\n",
    "    'health_status': 'NaN',\n",
    "\n",
    "    'original_celltype_1': adata.obs['celltype_specific'],\n",
    "    'original_celltype_2': adata.obs['celltype_global'],\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "for obs_column, obs_value in harmonised_obs.items():\n",
    "    adata.obs[obs_column] = obs_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 962,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 963,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 964,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '07-2-Spleen-Han-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "##  07-3-Spleen_ImmuneCells-Teichmann-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_analysis.obs['tissue_major'], ['Spleen'])\n",
    "adata=adata_analysis[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 252,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] =  adata.obs['tissue_major']\n",
    "adata.obs['Organ_Specific'] = adata.obs['sub_tissue']\n",
    "adata.obs['Dataset'] = adata.obs['Dataset']\n",
    "adata.obs['InternDatasetNumber'] = '07-3-Spleen_ImmuneCells-Teichmann-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 253,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 255,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '07-3-Spleen_ImmuneCells-Teichmann-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# 08-Lymph nodes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 08-1-LymphNode-Kim-2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For the lymph node dataset of Kim et al. (2020), the data were obtained from the original study (https://doi.org/10.1038/s41467-020-16164-1); the files loaded below carry the accession GSE131907 in their names."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the expression table and transpose it in one step\n",
    "# (presumably the file stores genes as rows and cells as columns - TODO confirm)\n",
    "matrix_file = writepath + 'GSE131907_Lung_Cancer_normalized_log2TPM_matrix.txt'\n",
    "adata = sc.read_text(matrix_file, delimiter=None, first_column_names=None, dtype='float32').transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the per-cell annotation table (tab-separated text file)\n",
    "annotation_file = writepath + 'GSE131907_Lung_Cancer_cell_annotation.txt'\n",
    "annotations = pd.read_csv(annotation_file, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "#merge the two dfs\n",
    "df1=pd.DataFrame(adata.obs)\n",
    "annotations.index=annotations['Index']\n",
    "df2=annotations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_merged=pd.merge(df1, df2, left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sample']=df_merged['Sample']\n",
    "adata.obs['origin']=df_merged['Sample_Origin']\n",
    "adata.obs['cell type']=df_merged['Cell_type']\n",
    "adata.obs['cell type refined']=df_merged['Cell_type.refined']\n",
    "adata.obs['cell subtype']=df_merged['Cell_subtype']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tLung     45149\n",
       "nLung     42995\n",
       "nLN       37446\n",
       "mBrain    29060\n",
       "mLN       21479\n",
       "PE        20304\n",
       "tL/B      12073\n",
       "Name: origin, dtype: int64"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['origin'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# subset to the lymph node samples labelled 'nLN' (lung samples are not kept)\n",
    "ix=np.isin(adata.obs['origin'],['nLN']) \n",
    "adata_nLN=adata[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 332,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 2 cells that have more than 5000 counts\n",
      "filtered out 5 cells that have more than 6000 genes expressed\n",
      "filtered out 13378 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata_nLN, max_counts = 5000)\n",
    "sc.pp.filter_cells(adata_nLN, max_genes = 6000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata_nLN, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 333,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag mitochondrial genes (symbols starting with 'MT-'); build the mask once and reuse it\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata_nLN.var_names]\n",
    "mt_genes = adata_nLN.var_names[mt_gene_mask]\n",
    "# Per-cell mt score: sum over the mt genes divided by the stored per-cell total ('n_counts')\n",
    "adata_nLN.obs['mt_frac'] = adata_nLN.X[:, mt_gene_mask].sum(1)/adata_nLN.obs['n_counts']\n",
    "# Filter out cells with a mito fraction of 20% or more (the threshold is 0.20, not 25%).\n",
    "# .copy() turns the subset into a real AnnData instead of a view, so later .obs\n",
    "# assignments no longer trigger an ImplicitModificationWarning.\n",
    "adata_nLN = adata_nLN[adata_nLN.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 334,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp = adata_nLN.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 335,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:08)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:06)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 13 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:06)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 336,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata_nLN.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 337,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 338,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:2: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "del adata_pp\n",
    "adata_nLN.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 339,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_nLN.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 340,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the pre-normalization matrix (.X) in a 'counts' layer\n",
    "# so it stays available for downstream analysis\n",
    "adata_nLN.layers[\"counts\"] = adata_nLN.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 341,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata_nLN.X /= adata_nLN.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata_nLN)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 342,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:05)\n",
      "--> added\n",
      "    'highly_variable', boolean vector (adata.var)\n",
      "    'means', float vector (adata.var)\n",
      "    'dispersions', float vector (adata.var)\n",
      "    'dispersions_norm', float vector (adata.var)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.highly_variable_genes(adata_nLN, n_top_genes=4000, flavor='seurat')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 345,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets:\n",
    "# combine the broad and the refined label into one categorical string per cell\n",
    "testdf = pd.DataFrame({\n",
    "    'celltype': adata_nLN.obs['cell type'],\n",
    "    'sub_celltype': adata_nLN.obs['cell subtype'],\n",
    "})\n",
    "combined_label = 'broad: ' + testdf['celltype'].astype(str) + '___refined: ' + testdf['sub_celltype'].astype(str)\n",
    "testdf['fin'] = combined_label.astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 346,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_nLN.obs['celltype']=np.array(testdf['fin'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 349,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_nLN.rename_categories('celltype', ['B cells',\n",
    "       'B cells 2',\n",
    "       'B cells 3',\n",
    "       'B cells 4',\n",
    "       'B cells 5',\n",
    "        'Plasma cells',\n",
    "       'B cells 6',\n",
    "                                           \n",
    "       'Dendritics',\n",
    "       'Dendritics 1',\n",
    "       'Dendritics 2',\n",
    "       'Dendritics 3',\n",
    "                                           \n",
    "       'Monocytes',\n",
    "                                           \n",
    "       'Macrophages',                       \n",
    "       'Macrophages 2',                   \n",
    "       'Macrophages 3',\n",
    "                                           \n",
    "       'Dendritics 4',\n",
    "                                           \n",
    "       'T cells',\n",
    "       'T cells 1',\n",
    "       'T cells 2',\n",
    "       'T cells 3',\n",
    "       'T cells 4',\n",
    "       'T cells 5',\n",
    "       'NK cells',\n",
    "       'T cells 6',\n",
    "       'T cells 7',\n",
    "       'T cells 8',\n",
    "       'T cells 9',\n",
    "       'T cells 10',\n",
    "       'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 350,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cells', 'B cells 2', 'B cells 3', 'B cells 4', 'B cells 5',\n",
       "       'Plasma cells', 'B cells 6', 'Dendritics', 'Dendritics 1',\n",
       "       'Dendritics 2', 'Dendritics 3', 'Monocytes', 'Macrophages',\n",
       "       'Macrophages 2', 'Macrophages 3', 'Dendritics 4', 'T cells',\n",
       "       'T cells 1', 'T cells 2', 'T cells 3', 'T cells 4', 'T cells 5',\n",
       "       'NK cells', 'T cells 6', 'T cells 7', 'T cells 8', 'T cells 9',\n",
       "       'T cells 10', 'Unknown'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 350,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata_nLN.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 351,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata_nLN.obs['celltype'],\n",
    "                           categories=['B cells', 'B cells 2', 'B cells 3', 'B cells 4', 'B cells 5',\n",
    "       'Plasma cells', 'B cells 6', 'Dendritics', 'Dendritics 1',\n",
    "       'Dendritics 2', 'Dendritics 3', 'Monocytes', 'Macrophages',\n",
    "       'Macrophages 2', 'Macrophages 3', 'Dendritics 4', 'T cells',\n",
    "       'T cells 1', 'T cells 2', 'T cells 3', 'T cells 4', 'T cells 5',\n",
    "       'NK cells', 'T cells 6', 'T cells 7', 'T cells 8', 'T cells 9',\n",
    "       'T cells 10', 'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 352,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the numbered sub-clusters into their parent cell type\n",
    "merge_into_parent = {\n",
    "    'B cells': ['B cells 2', 'B cells 3', 'B cells 4', 'B cells 5', 'B cells 6'],\n",
    "    'Dendritics': ['Dendritics 1', 'Dendritics 2', 'Dendritics 3', 'Dendritics 4'],\n",
    "    'Macrophages': ['Macrophages 2', 'Macrophages 3'],\n",
    "    'T cells': ['T cells 1', 'T cells 2', 'T cells 3', 'T cells 4', 'T cells 5',\n",
    "                'T cells 6', 'T cells 7', 'T cells 8', 'T cells 9', 'T cells 10'],\n",
    "}\n",
    "for parent_label, sub_labels in merge_into_parent.items():\n",
    "    ix = np.isin(ref_cluster, sub_labels)\n",
    "    ref_cluster[ix] = parent_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 353,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_nLN.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B cells', 'Plasma cells', 'Dendritics', 'Monocytes', 'Macrophages', \n",
    "                                                       'T cells', 'NK cells','Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 357,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['LN_01', 'LN_02', 'LN_03', 'LN_04', 'LN_05', 'LN_06', 'LN_07', 'LN_08',\n",
       "       'LN_11', 'LN_12'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 357,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata_nLN.obs['sample'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 358,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The donor column mirrors the sample id; the category list is written once and reused\n",
    "donor_categories = ['LN_01', 'LN_02', 'LN_03', 'LN_04', 'LN_05', 'LN_06', 'LN_07', 'LN_08',\n",
    "                    'LN_11', 'LN_12']\n",
    "adata_nLN.obs['donor'] = adata_nLN.obs['sample']\n",
    "ref_cluster = pd.Categorical(adata_nLN.obs['donor'], categories=donor_categories)\n",
    "adata_nLN.rename_categories('donor', donor_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 361,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_nLN.X = sp.sparse.csr_matrix(adata_nLN.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_nLN.obs['InternDatasetNumber'] = '08-1-LymphNode-Kim-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 363,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_nLN.write(writepath + '08-1-LymphNode-Kim-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 08-2-LymphNode-Butcher-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 364,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_collections = [\"9c8808ce-1138-4dbe-818c-171cff10e650\"]\n",
    "cache_path = os.path.join(\".\", \"data\")\n",
    "dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)\n",
    "dsg.subset(key=\"collection_id\", values=target_collections)\n",
    "dsg.datasets\n",
    "dsg.download()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 365,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/path/to/repo/9c8808ce-1138-4dbe-818c-171cff10e650/'\n",
    "files = [f for f in listdir(path) if isfile(join(path, f))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 366,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "26ae14da-9e5f-4d18-abae-18a5a328feef.h5ad\n",
      "cfa3c355-ee77-4fc8-9a00-78e61d23024c.h5ad\n"
     ]
    }
   ],
   "source": [
    "# The collection folder holds several .h5ad files; keep the one whose number of\n",
    "# cells equals EXPECTED_N_OBS (the file is identified by its cell count).\n",
    "EXPECTED_N_OBS = 4355\n",
    "selected = None\n",
    "for fname in files:\n",
    "    print(fname)\n",
    "    path_2 = join(path, fname)\n",
    "    u = sc.read_h5ad(path_2)\n",
    "    u.obs['id'] = fname\n",
    "    if u.n_obs == EXPECTED_N_OBS:\n",
    "        selected = u\n",
    "# Fail loudly instead of silently continuing with `adata` from a previous section\n",
    "if selected is None:\n",
    "    raise ValueError('No .h5ad file with %d cells found in %s' % (EXPECTED_N_OBS, path))\n",
    "adata = selected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 376,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 383,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes (symbols starting with 'MT-'); build the mask once and reuse it\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert the sparse matrix to dense\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score: per-cell sum over the mt genes divided by 'n_counts'\n",
    "mt_sum=np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with a mito fraction of 25% or more.\n",
    "# .copy() turns the subset into a real AnnData instead of a view, so later .obs\n",
    "# assignments no longer trigger an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.25].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 385,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 11 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "# adata_pp is deleted at the end of the previous dataset's section, so the scratch\n",
    "# copy has to be recreated here; adata itself stays untouched by the steps below.\n",
    "adata_pp = adata.copy()\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 386,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 387,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 388,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 389,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 390,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the pre-normalization matrix (adata.X) in a 'counts' layer\n",
    "# so it stays available for downstream analysis\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 391,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 393,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 394,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "# NOTE(review): unlike the 08-1 and 08-4 sections, no cell in this section divides adata.X by\n",
    "# the scran size_factors or applies log1p before this step - confirm whether that is intended.\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 395,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:07)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:08)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 397,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 400,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Lymphatic endothelial cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 403,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex'],\n",
    "                           categories=['male'])\n",
    "adata.rename_categories('sex', ['Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 406,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 407,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] = '08-2-LymphNode-Butcher-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '08-2-LymphNode-Butcher-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 08-4-LymphNode-Pisco-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/_core/anndata.py:121: ImplicitModificationWarning: Transforming to str index.\n",
      "  warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n"
     ]
    }
   ],
   "source": [
    "ix=np.isin(adata_pisco.obs['tissue'],['inguinal lymph node',  'lymph node']) \n",
    "adata=adata_pisco[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.calculate_qc_metrics(adata, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mirror the scanpy QC metrics under the column names used elsewhere in this notebook\n",
    "adata.obs['n_counts'] = adata.obs['total_counts']\n",
    "# log_counts is the log of the total counts per cell, not of the number of detected genes\n",
    "adata.obs['log_counts'] = adata.obs['log1p_total_counts']\n",
    "adata.obs['n_genes'] = adata.obs['n_genes_by_counts']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 30929 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "#sc.pp.filter_cells(adata, max_counts = 12000)\n",
    "#sc.pp.filter_cells(adata, max_genes = 7000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes (symbols starting with 'MT-'); build the mask once and reuse it\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert the sparse matrix to dense\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score: per-cell sum over the mt genes divided by 'n_counts'\n",
    "mt_sum=np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with a mito fraction of 20% or more.\n",
    "# .copy() turns the subset into a real AnnData instead of a view, so later .obs\n",
    "# assignments no longer trigger an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:06): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:03)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:08)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 16 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:08)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the pre-normalization matrix (adata.X) in a 'counts' layer\n",
    "# so it stays available for downstream analysis\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:04)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:10)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:14)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:42)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B cell', 'CD141-positive myeloid dendritic cell',\n",
    "       'CD1c-positive myeloid dendritic cell',\n",
    "       'CD4-positive, alpha-beta memory T cell',\n",
    "       'CD8-positive, alpha-beta memory T cell', 'T cell',\n",
    "       'classical monocyte', 'effector CD4-positive, alpha-beta T cell',\n",
    "       'effector CD8-positive, alpha-beta T cell', 'endothelial cell',\n",
    "       'erythrocyte', 'hematopoietic stem cell', 'innate lymphoid cell',\n",
    "       'intermediate monocyte', 'macrophage', 'mast cell', 'mature NK T cell',\n",
    "       'mature conventional dendritic cell', 'memory B cell', 'naive B cell',\n",
    "       'naive thymus-derived CD4-positive, alpha-beta T cell', 'neutrophil',\n",
    "       'non-classical monocyte', 'plasma cell', 'plasmacytoid dendritic cell',\n",
    "       'regulatory T cell', 'stromal cell', 'type I NK T cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained annotations into one representative label per lineage.\n",
    "# Keys are the labels that are kept, values are the labels folded into them;\n",
    "# the entries are applied in the order in which they are written.\n",
    "merged_labels = {\n",
    "    'CD141-positive myeloid dendritic cell': ['CD1c-positive myeloid dendritic cell',\n",
    "                                              'mature conventional dendritic cell',\n",
    "                                              'plasmacytoid dendritic cell'],\n",
    "    'T cell': ['CD4-positive, alpha-beta memory T cell',\n",
    "               'CD8-positive, alpha-beta memory T cell',\n",
    "               'effector CD4-positive, alpha-beta T cell',\n",
    "               'effector CD8-positive, alpha-beta T cell',\n",
    "               'naive thymus-derived CD4-positive, alpha-beta T cell',\n",
    "               'regulatory T cell'],\n",
    "    'classical monocyte': ['intermediate monocyte', 'non-classical monocyte'],\n",
    "    'B cell': ['memory B cell', 'naive B cell'],\n",
    "    'mature NK T cell': ['type I NK T cell'],\n",
    "}\n",
    "for kept_label, folded_labels in merged_labels.items():\n",
    "    ix = np.isin(ref_cluster, folded_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B cell', 'CD141-positive myeloid dendritic cell',\n",
    "       \n",
    "      'T cell',\n",
    "       'classical monocyte',\n",
    "        'endothelial cell',\n",
    "       'erythrocyte','hematopoietic stem cell', 'innate lymphoid cell',\n",
    "      'macrophage', 'mast cell', 'mature NK T cell',\n",
    "       'neutrophil',\n",
    "        'plasma cell', 'stromal cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype',['B cells', 'Dendritic cells',\n",
    "        'T cells',\n",
    "       'Monocytes',\n",
    "        'Endothelial cells',\n",
    "       'Erythroid cells', 'Hematopoietic stem cells', 'Innate lymphoid cells',\n",
    "       'Macrophages', 'Mast cells', 'NK cells',\n",
    "       'Neutrophils',\n",
    "       'Plasma cells', 'Mesenchymal stromal cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/pandas/core/arrays/categorical.py:2631: FutureWarning: The `inplace` parameter in pandas.Categorical.rename_categories is deprecated and will be removed in a future version. Removing unused categories will always return a new Categorical object.\n",
      "  res = method(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "# Rename the tissue categories by name instead of by position, so that the result\n",
    "# does not depend on the order in which the categories happen to be stored.\n",
    "# An unexpected tissue label raises a KeyError instead of being silently mislabelled.\n",
    "tissue_labels = {'inguinal lymph node': 'LymphNode_Inguinal', 'lymph node': 'LymphNode'}\n",
    "adata.rename_categories('tissue', [tissue_labels[t] for t in adata.obs['tissue'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename the sex categories by name instead of by position, so that the result\n",
    "# does not depend on the order in which the categories happen to be stored.\n",
    "# An unexpected label raises a KeyError instead of being silently mislabelled.\n",
    "sex_labels = {'female': 'Female', 'male': 'Male'}\n",
    "adata.rename_categories('sex', [sex_labels[s] for s in adata.obs['sex'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename the ethnicity categories by name instead of by position, so that the result\n",
    "# does not depend on the order in which the categories happen to be stored.\n",
    "# An unexpected label raises a KeyError instead of being silently mislabelled.\n",
    "ethnicity_labels = {'African American or Afro-Caribbean': 'African-American or Afro-Caribbean',\n",
    "                    'European': 'European'}\n",
    "adata.rename_categories('ethnicity', [ethnicity_labels[e] for e in adata.obs['ethnicity'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shorten the development stage to the plain age, mapping by name instead of by position\n",
    "# so that the result does not depend on the order in which the categories are stored.\n",
    "# An unexpected label raises a KeyError instead of being silently mislabelled.\n",
    "age_labels = {'59-year-old human stage': '59',\n",
    "              '61-year-old human stage': '61',\n",
    "              '69-year-old human stage': '69'}\n",
    "adata.rename_categories('development_stage',\n",
    "                        [age_labels[a] for a in adata.obs['development_stage'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the donor identifiers of the source dataset.\n",
    "# This cell used to rename the categories positionally to ['TSP1', 'TSP2', 'TSP14'];\n",
    "# with the category order listed in that cell ('TSP2', 'TSP7', 'TSP14') this relabelled\n",
    "# 'TSP2' as 'TSP1' and 'TSP7' as 'TSP2', so 'TSP2' ended up naming a different donor.\n",
    "# NOTE(review): confirm that no downstream step relies on the old 'TSP1' label.\n",
    "expected_donors = ['TSP2', 'TSP7', 'TSP14']\n",
    "unexpected_donors = set(adata.obs['donor'].cat.categories) - set(expected_donors)\n",
    "assert not unexpected_donors, 'unexpected donor ids: {}'.format(sorted(unexpected_donors))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised per-cell metadata columns for this dataset.\n",
    "# 'celltype', 'sex' and 'ethnicity' were already set or renamed under their final names\n",
    "# in the cells above, so the former no-op self-assignments are dropped.\n",
    "# Fields without information are filled with the string placeholder 'NaN'.\n",
    "\n",
    "# dataset / tissue provenance\n",
    "adata.obs['Organ'] = 'LymphNode'\n",
    "adata.obs['Organ_Specific'] = adata.obs['tissue']\n",
    "adata.obs['Dataset'] = 'Pisco_LymphNode'\n",
    "adata.obs['InternDatasetNumber'] = '08-4-LymphNode-Pisco-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "# cell-level annotation\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "# donor-level annotation\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "# annotations as provided by the source dataset\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '08-4-LymphNode-Pisco-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 08-6-LymphNode_ImmuneCells-Teichmann-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_analysis.obs['tissue_major'], ['LymphNode'])\n",
    "adata=adata_analysis[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised per-cell metadata columns for this dataset.\n",
    "# These columns are expected to exist already under their final names; check this\n",
    "# explicitly instead of re-assigning each column to itself.\n",
    "required_obs = ['Dataset', 'celltype', 'sex', 'ethnicity']\n",
    "missing_obs = [col for col in required_obs if col not in adata.obs.columns]\n",
    "if missing_obs:\n",
    "    raise KeyError('missing obs columns: {}'.format(missing_obs))\n",
    "\n",
    "# dataset / tissue provenance\n",
    "adata.obs['Organ'] =  adata.obs['tissue_major']\n",
    "adata.obs['Organ_Specific'] = adata.obs['sub_tissue']\n",
    "adata.obs['InternDatasetNumber'] = '08-6-LymphNode_ImmuneCells-Teichmann-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "# cell-level annotation; fields without information get the string placeholder 'NaN'\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "# donor-level annotation\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "# annotations as provided by the source dataset\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '08-6-LymphNode_ImmuneCells-Teichmann-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# 09-Lung"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For the lung datasets of Travaglini, Madissoon and Reyfman, the data and cell annotations were obtained from a study integrating various lung scRNA-seq datasets (https://doi.org/10.1038/s41591-020-01227-z)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "lung_data=anndata.read_h5ad(writepath + 'Muus2021_raw.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Stanford_Krasnow_bioRxivTravaglini    60993\n",
       "Sanger_Meyer_2019Madissoon            57020\n",
       "Northwestern_Misharin_2018Reyfman     41778\n",
       "Name: dataset, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lung_data.obs.dataset.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 09-1-Lung-Travaglini-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(lung_data.obs['dataset'],['Stanford_Krasnow_bioRxivTravaglini']) \n",
    "adata=lung_data[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of cells: 60993\n",
      "filtered out 252 cells that have more than 50000 counts\n",
      "Number of cells after max count filter: 60741\n",
      "Number of cells after MT filter: 60741\n",
      "filtered out 108 cells that have more than 6000 genes expressed\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Trying to set attribute `.obs` of view, copying.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of cells after gene filter: 60633\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "print('Total number of cells: {:d}'.format(adata.n_obs))\n",
    "\n",
    "#Filter out cells with more than 50000 counts\n",
    "sc.pp.filter_cells(adata, max_counts = 50000)\n",
    "print('Number of cells after max count filter: {:d}'.format(adata.n_obs))\n",
    "\n",
    "#Mito filter: keep cells with a mitochondrial fraction below 0.2.\n",
    "#Copy the subset so that adata is a standalone object rather than a view;\n",
    "#the next filter writes to adata.obs and would otherwise trigger an implicit copy with a warning.\n",
    "adata = adata[adata.obs['mito_frac'] < 0.2].copy()\n",
    "print('Number of cells after MT filter: {:d}'.format(adata.n_obs))\n",
    "\n",
    "#Filter out cells with more than 6000 detected genes\n",
    "sc.pp.filter_cells(adata, max_genes = 6000)\n",
    "print('Number of cells after gene filter: {:d}'.format(adata.n_obs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of genes: 33704\n",
      "filtered out 15129 genes that are detected in less than 20 cells\n",
      "Number of genes after cell filter: 18575\n"
     ]
    }
   ],
   "source": [
    "#Filter genes:\n",
    "print('Total number of genes: {:d}'.format(adata.n_vars))\n",
    "\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)\n",
    "print('Number of genes after cell filter: {:d}'.format(adata.n_vars))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 60633 × 18575\n",
       "    obs: 'dataset', 'donor', 'last_author/PI', 'original_celltype_ann', 'sample', 'total_counts', 'log10_total_counts', 'n_genes_detected', 'mito_frac', 'ribo_frac', 'compl', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'n_counts', 'n_genes'\n",
       "    var: 'n_cells'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Densify the sparse expression matrix for the normalisation steps below.\n",
    "# toarray() returns a plain numpy ndarray; todense() would return the np.matrix type,\n",
    "# which NumPy no longer recommends using.\n",
    "adata.X = adata.X.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:08): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:29)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:17)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 17 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:20)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "R[write to console]: Loading required package: scran\n",
      "\n",
      "R[write to console]: Loading required package: SingleCellExperiment\n",
      "\n",
      "R[write to console]: Loading required package: SummarizedExperiment\n",
      "\n",
      "R[write to console]: Loading required package: GenomicRanges\n",
      "\n",
      "R[write to console]: Loading required package: stats4\n",
      "\n",
      "R[write to console]: Loading required package: BiocGenerics\n",
      "\n",
      "R[write to console]: Loading required package: parallel\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘BiocGenerics’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:parallel’:\n",
      "\n",
      "    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n",
      "    clusterExport, clusterMap, parApply, parCapply, parLapply,\n",
      "    parLapplyLB, parRapply, parSapply, parSapplyLB\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:stats’:\n",
      "\n",
      "    IQR, mad, sd, var, xtabs\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:base’:\n",
      "\n",
      "    anyDuplicated, append, as.data.frame, basename, cbind, colnames,\n",
      "    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,\n",
      "    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,\n",
      "    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,\n",
      "    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,\n",
      "    union, unique, unsplit, which, which.max, which.min\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: S4Vectors\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘S4Vectors’\n",
      "\n",
      "\n",
      "R[write to console]: The following object is masked from ‘package:base’:\n",
      "\n",
      "    expand.grid\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: IRanges\n",
      "\n",
      "R[write to console]: Loading required package: GenomeInfoDb\n",
      "\n",
      "R[write to console]: Loading required package: Biobase\n",
      "\n",
      "R[write to console]: Welcome to Bioconductor\n",
      "\n",
      "    Vignettes contain introductory material; view with\n",
      "    'browseVignettes()'. To cite Bioconductor, see\n",
      "    'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: DelayedArray\n",
      "\n",
      "R[write to console]: Loading required package: matrixStats\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘matrixStats’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:Biobase’:\n",
      "\n",
      "    anyMissing, rowMedians\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: BiocParallel\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘DelayedArray’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:matrixStats’:\n",
      "\n",
      "    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:base’:\n",
      "\n",
      "    aperm, apply, rowsum\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 60633 × 18575\n",
       "    obs: 'dataset', 'donor', 'last_author/PI', 'original_celltype_ann', 'sample', 'total_counts', 'log10_total_counts', 'n_genes_detected', 'mito_frac', 'ribo_frac', 'compl', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'n_counts', 'n_genes', 'size_factors'\n",
       "    var: 'n_cells'\n",
       "    uns: 'log1p'\n",
       "    layers: 'counts'"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', \n",
    "                              n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:18)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:15)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:01:06)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['2_Mesothelium', '2_Smooth Muscle', 'AT1', 'AT2',\n",
       "       'Airway smooth muscle', 'Arterial', 'B cell lineage', 'Basal',\n",
       "       'Bronchial Vessel 1', 'Bronchial Vessel 2', 'Capillary',\n",
       "       'Capillary Intermediate 1', 'Capillary Intermediate 2',\n",
       "       'Dendritic cells', 'Fibroblasts', 'Fibromyocyte',\n",
       "       'Innate lymphoid cells', 'Lymphatic EC', 'Macrophages', 'Mast cells',\n",
       "       'Megakaryocytes', 'Monocytes', 'Multiciliated lineage',\n",
       "       'Myofibroblasts', 'Rare', 'Secretory', 'Submucosal Secretory',\n",
       "       'T cell lineage', 'Venous'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['CellType_Atlas']=adata.obs['ann_level_3'].copy()\n",
    "adata.obs['CellType_Atlas'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['CellType_Atlas'],\n",
    "                           categories=['2_Mesothelium', '2_Smooth Muscle', 'AT1', 'AT2',\n",
    "       'Airway smooth muscle', 'Arterial', 'B cell lineage', 'Basal',\n",
    "       'Bronchial Vessel 1', 'Bronchial Vessel 2', 'Capillary',\n",
    "       'Capillary Intermediate 1', 'Capillary Intermediate 2',\n",
    "       'Dendritic cells', 'Fibroblasts', 'Fibromyocyte',\n",
    "       'Innate lymphoid cells', 'Lymphatic EC', 'Macrophages', 'Mast cells',\n",
    "       'Megakaryocytes', 'Monocytes', 'Multiciliated lineage',\n",
    "       'Myofibroblasts', 'Rare', 'Secretory', 'Submucosal Secretory',\n",
    "       'T cell lineage', 'Venous'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fold closely related ann_level_3 labels into one representative label each.\n",
    "# Keys are the labels that are kept, values are the labels folded into them;\n",
    "# the entries are applied in the order in which they are written.\n",
    "merged_labels = {\n",
    "    'AT1': ['AT2'],\n",
    "    '2_Smooth Muscle': ['Airway smooth muscle'],\n",
    "    'Bronchial Vessel 1': ['Bronchial Vessel 2'],\n",
    "    'Capillary': ['Capillary Intermediate 1', 'Capillary Intermediate 2'],\n",
    "    'Fibroblasts': ['Fibromyocyte'],\n",
    "    'Secretory': ['Submucosal Secretory'],\n",
    "}\n",
    "for kept_label, folded_labels in merged_labels.items():\n",
    "    ix = np.isin(ref_cluster, folded_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['CellType_Atlas']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['2_Mesothelium', '2_Smooth Muscle', 'AT1', \n",
    "                                                       'Arterial', 'B cell lineage', 'Basal',\n",
    "       'Bronchial Vessel 1',  'Capillary',\n",
    "       'Dendritic cells', 'Fibroblasts',\n",
    "       'Innate lymphoid cells', 'Lymphatic EC', 'Macrophages', 'Mast cells',\n",
    "       'Megakaryocytes', 'Monocytes', 'Multiciliated lineage',\n",
    "       'Myofibroblasts', 'Rare', 'Secretory',\n",
    "       'T cell lineage', 'Venous'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['2_Mesothelium', '2_Smooth Muscle', 'AT1', 'Arterial', 'B cell lineage',\n",
       "       'Basal', 'Bronchial Vessel 1', 'Capillary', 'Dendritic cells',\n",
       "       'Fibroblasts', 'Innate lymphoid cells', 'Lymphatic EC', 'Macrophages',\n",
       "       'Mast cells', 'Megakaryocytes', 'Monocytes', 'Multiciliated lineage',\n",
       "       'Myofibroblasts', 'Rare', 'Secretory', 'T cell lineage', 'Venous'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['CellType_Atlas'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Short atlas names, keyed by the merged ann_level_3 label so that every rename is explicit\n",
    "# and independent of the category order. An unexpected label raises a KeyError.\n",
    "# 'Myofibroblasts' keeps its own name: the former positional list renamed it to 'Lymphoid'.\n",
    "# NOTE(review): confirm that no downstream step selects cells by the old 'Lymphoid' label.\n",
    "atlas_labels = {\n",
    "    '2_Mesothelium': 'Mesothelial',\n",
    "    '2_Smooth Muscle': 'Smooth muscle',\n",
    "    'AT1': 'Alveolar',\n",
    "    'Arterial': 'Arterial',\n",
    "    'B cell lineage': 'B cells',\n",
    "    'Basal': 'Basal',\n",
    "    'Bronchial Vessel 1': 'Bronchial',\n",
    "    'Capillary': 'Capillary',\n",
    "    'Dendritic cells': 'Dendritics',\n",
    "    'Fibroblasts': 'Fibroblasts',\n",
    "    'Innate lymphoid cells': 'Innate Lymphoid',\n",
    "    'Lymphatic EC': 'Endothelial',\n",
    "    'Macrophages': 'Macrophages',\n",
    "    'Mast cells': 'Mast',\n",
    "    'Megakaryocytes': 'Megakaryocytes',\n",
    "    'Monocytes': 'Monocytes',\n",
    "    'Multiciliated lineage': 'Multiciliated',\n",
    "    'Myofibroblasts': 'Myofibroblasts',\n",
    "    'Rare': 'Remove',\n",
    "    'Secretory': 'Secretory',\n",
    "    'T cell lineage': 'T cells',\n",
    "    'Venous': 'Venous',\n",
    "}\n",
    "adata.rename_categories('CellType_Atlas',\n",
    "                        [atlas_labels[c] for c in adata.obs['CellType_Atlas'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] = '09-1-Lung-Travaglini-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '09-1-Lung-Travaglini-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 09-2-Lung-Madissoon-2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(lung_data.obs['dataset'],['Sanger_Meyer_2019Madissoon']) \n",
    "adata=lung_data[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of cells: 57020\n",
      "filtered out 706 cells that have more than 30000 counts\n",
      "Number of cells after max count filter: 56314\n",
      "Number of cells after MT filter: 56314\n",
      "filtered out 10 cells that have more than 5500 genes expressed\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Trying to set attribute `.obs` of view, copying.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of cells after gene filter: 56304\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "print('Total number of cells: {:d}'.format(adata.n_obs))\n",
    "\n",
    "#Filter out cells with more than 30000 counts\n",
    "sc.pp.filter_cells(adata, max_counts = 30000)\n",
    "print('Number of cells after max count filter: {:d}'.format(adata.n_obs))\n",
    "\n",
    "#Mito filter: keep cells with a mitochondrial fraction below 0.2.\n",
    "#Copy the subset so that adata is a standalone object rather than a view;\n",
    "#the next filter writes to adata.obs and would otherwise trigger an implicit copy with a warning.\n",
    "adata = adata[adata.obs['mito_frac'] < 0.2].copy()\n",
    "print('Number of cells after MT filter: {:d}'.format(adata.n_obs))\n",
    "\n",
    "#Filter out cells with more than 5500 detected genes\n",
    "sc.pp.filter_cells(adata, max_genes = 5500)\n",
    "print('Number of cells after gene filter: {:d}'.format(adata.n_obs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of genes: 33704\n",
      "filtered out 13275 genes that are detected in less than 20 cells\n",
      "Number of genes after cell filter: 20429\n"
     ]
    }
   ],
   "source": [
    "#Filter genes:\n",
    "print('Total number of genes: {:d}'.format(adata.n_vars))\n",
    "\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)\n",
    "print('Number of genes after cell filter: {:d}'.format(adata.n_vars))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Densify the sparse expression matrix for the normalisation steps below.\n",
    "# toarray() returns a plain numpy ndarray; todense() would return the np.matrix type,\n",
    "# which NumPy no longer recommends using.\n",
    "adata.X = adata.X.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:09): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:30)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:09)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 16 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:17)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Free the temporary clustering copy and store the scran size factors per cell\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the current adata.X values (before size-factor normalization) for downstream analysis\n",
    "\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 56304 × 20429\n",
       "    obs: 'dataset', 'donor', 'last_author/PI', 'original_celltype_ann', 'sample', 'total_counts', 'log10_total_counts', 'n_genes_detected', 'mito_frac', 'ribo_frac', 'compl', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'n_counts', 'n_genes', 'size_factors'\n",
       "    var: 'n_cells'\n",
       "    uns: 'log1p'\n",
       "    layers: 'counts'"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', \n",
    "                              n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:15)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:18)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:55)\n"
     ]
    }
   ],
   "source": [
    "# Compute PCA (on the highly variable genes), the neighbor graph and the UMAP embedding\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['2_Blood vessels', '2_Fibroblast lineage', '2_Smooth Muscle', 'AT1',\n",
       "       'AT2', 'B cell lineage', 'Dendritic cells', 'Innate lymphoid cells',\n",
       "       'Lymphatic EC', 'Macrophages', 'Mast cells', 'Monocytes',\n",
       "       'Multiciliated lineage', 'T cell lineage'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['CellType_Atlas']=adata.obs['ann_level_3'].copy()\n",
    "adata.obs['CellType_Atlas'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Detached Categorical copy of the labels, so subtypes can be merged before writing back.\n",
    "# The categories are read from the column itself rather than from a hand-copied list,\n",
    "# so a label missing from such a list can no longer turn into NaN silently.\n",
    "ref_cluster = pd.Categorical(adata.obs['CellType_Atlas'],\n",
    "                             categories=adata.obs['CellType_Atlas'].cat.categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fold fine-grained labels into the label that is kept (kept label -> labels merged into it)\n",
    "merge_into = {'2_Blood vessels': ['Lymphatic EC'],\n",
    "              'AT1': ['AT2']}\n",
    "for kept_label, merged_labels in merge_into.items():\n",
    "    ix = np.isin(ref_cluster, merged_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['CellType_Atlas']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['2_Blood vessels', '2_Fibroblast lineage', '2_Smooth Muscle', 'AT1',\n",
    "                                                       'B cell lineage', 'Dendritic cells', 'Innate lymphoid cells',\n",
    "                                                       'Macrophages', 'Mast cells', 'Monocytes',\n",
    "                                                       'Multiciliated lineage', 'T cell lineage'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['2_Blood vessels', '2_Fibroblast lineage', '2_Smooth Muscle', 'AT1',\n",
       "       'B cell lineage', 'Dendritic cells', 'Innate lymphoid cells',\n",
       "       'Macrophages', 'Mast cells', 'Monocytes', 'Multiciliated lineage',\n",
       "       'T cell lineage'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['CellType_Atlas'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('CellType_Atlas', ['Endothelial', 'Fibroblasts', 'Smooth muscle', 'Alveolar',\n",
    "       'B cells', 'Dendritics', 'Innate Lymphoid',\n",
    "       'Macrophages', 'Mast', 'Monocytes', 'Multiciliated',\n",
    "       'T cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] = '09-2-Lung-Madissoon-2019'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '09-2-Lung-Madissoon-2019-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 09-3-Lung-Reyfman-2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(lung_data.obs['dataset'],['Northwestern_Misharin_2018Reyfman']) \n",
    "adata=lung_data[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of cells: 41778\n",
      "filtered out 206 cells that have more than 35000 counts\n",
      "Number of cells after max count filter: 41572\n",
      "Number of cells after MT filter: 41517\n",
      "filtered out 14 cells that have more than 6000 genes expressed\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Trying to set attribute `.obs` of view, copying.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of cells after gene filter: 41503\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "print('Total number of cells: {:d}'.format(adata.n_obs))\n",
    "\n",
    "# Drop cells with more than 35000 total counts\n",
    "sc.pp.filter_cells(adata, max_counts = 35000)\n",
    "print('Number of cells after max count filter: {:d}'.format(adata.n_obs))\n",
    "\n",
    "# Keep cells with a mitochondrial fraction below 0.2.\n",
    "# .copy() materializes the subset so the next filter_cells call does not operate on a view\n",
    "# (which previously triggered the implicit-copy warning).\n",
    "adata = adata[adata.obs['mito_frac'] < 0.2].copy()\n",
    "print('Number of cells after MT filter: {:d}'.format(adata.n_obs))\n",
    "\n",
    "# Drop cells in which more than 6000 genes are detected\n",
    "sc.pp.filter_cells(adata, max_genes = 6000)\n",
    "print('Number of cells after gene filter: {:d}'.format(adata.n_obs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of genes: 33704\n",
      "filtered out 14289 genes that are detected in less than 20 cells\n",
      "Number of genes after cell filter: 19415\n"
     ]
    }
   ],
   "source": [
    "# Gene-level filter: report how many genes are present before filtering\n",
    "print(f'Total number of genes: {adata.n_vars:d}')\n",
    "\n",
    "# Keep only genes that are detected in at least 20 cells\n",
    "sc.pp.filter_genes(adata, min_cells=20)\n",
    "print(f'Number of genes after cell filter: {adata.n_vars:d}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 41503 × 19415\n",
       "    obs: 'dataset', 'donor', 'last_author/PI', 'original_celltype_ann', 'sample', 'total_counts', 'log10_total_counts', 'n_genes_detected', 'mito_frac', 'ribo_frac', 'compl', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'n_counts', 'n_genes'\n",
       "    var: 'n_cells'"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Densify to a plain ndarray; todense() would return the discouraged np.matrix type.\n",
    "# The dense matrix is needed because adata.X.T is handed to R/scran further below.\n",
    "adata.X = adata.X.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:05): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:21)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 20 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:08)\n"
     ]
    }
   ],
   "source": [
    "# Coarse clustering on a CPM-normalized, log-transformed copy; the cluster labels are passed to scran as `clusters`\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Free the temporary clustering copy and store the scran size factors per cell\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the current adata.X values (before size-factor normalization) for downstream analysis\n",
    "\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 41503 × 19415\n",
       "    obs: 'dataset', 'donor', 'last_author/PI', 'original_celltype_ann', 'sample', 'total_counts', 'log10_total_counts', 'n_genes_detected', 'mito_frac', 'ribo_frac', 'compl', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'n_counts', 'n_genes', 'size_factors'\n",
       "    var: 'n_cells'\n",
       "    uns: 'log1p'\n",
       "    layers: 'counts'"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', \n",
    "                              n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:15)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:12)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:39)\n"
     ]
    }
   ],
   "source": [
    "# Compute PCA (on the highly variable genes), the neighbor graph and the UMAP embedding\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['1_Epithelial', '1_Stroma', '1_Unicorns and artifacts',\n",
       "       '2_Blood vessels', '2_Lymphoid', 'AT1', 'AT2', 'Dendritic cells',\n",
       "       'Epithelial cells, proliferating', 'Lymphatic EC', 'Macrophages',\n",
       "       'Mast cells', 'Monocytes', 'Multiciliated lineage', 'Secretory'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['CellType_Atlas']=adata.obs['ann_level_3'].copy()\n",
    "adata.obs['CellType_Atlas'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Detached Categorical copy of the labels, so subtypes can be merged before writing back.\n",
    "# The categories are read from the column itself rather than from a hand-copied list,\n",
    "# so a label missing from such a list can no longer turn into NaN silently.\n",
    "ref_cluster = pd.Categorical(adata.obs['CellType_Atlas'],\n",
    "                             categories=adata.obs['CellType_Atlas'].cat.categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fold fine-grained labels into the label that is kept (kept label -> labels merged into it)\n",
    "merge_into = {'1_Epithelial': ['Epithelial cells, proliferating'],\n",
    "              '2_Blood vessels': ['Lymphatic EC'],\n",
    "              'AT1': ['AT2']}\n",
    "for kept_label, merged_labels in merge_into.items():\n",
    "    ix = np.isin(ref_cluster, merged_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['CellType_Atlas']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['1_Epithelial', '1_Stroma', '1_Unicorns and artifacts',\n",
    "       '2_Blood vessels', '2_Lymphoid', 'AT1', 'Dendritic cells',\n",
    "       'Macrophages',\n",
    "       'Mast cells', 'Monocytes', 'Multiciliated lineage', 'Secretory'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['1_Epithelial', '1_Stroma', '1_Unicorns and artifacts',\n",
       "       '2_Blood vessels', '2_Lymphoid', 'AT1', 'Dendritic cells',\n",
       "       'Macrophages', 'Mast cells', 'Monocytes', 'Multiciliated lineage',\n",
       "       'Secretory'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['CellType_Atlas'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('CellType_Atlas', ['Epithelial', 'Stroma', 'Remove','Endothelial', 'Lymphoid', 'Alveolar', \n",
    "                                           'Dendritics','Macrophages', 'Mast', 'Monocytes', 'Multiciliated', 'Secretory'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] = '09-3-Lung-Reyfman-2019'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '09-3-Lung-Reyfman-2019-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 09-4-Lung-Kim-2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
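    "For the lung dataset of Kim et al. (2020), the data were obtained from the original study (https://doi.org/10.1038/s41467-020-16164-1). Note that, as its file name indicates, the expression matrix loaded below (`GSE131907_Lung_Cancer_normalized_log2TPM_matrix.txt`) contains normalized log2(TPM) values rather than raw counts."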
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata=sc.read_text(writepath + 'GSE131907_Lung_Cancer_normalized_log2TPM_matrix.txt', \n",
    "                   delimiter=None, \n",
    "                   first_column_names=None, \n",
    "                   dtype='float32')\n",
    "adata=adata.transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the per-cell annotation table (tab-separated); read_csv already returns a DataFrame\n",
    "annotations = pd.read_csv(writepath + 'GSE131907_Lung_Cancer_cell_annotation.txt', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "#merge the two dfs\n",
    "df1=pd.DataFrame(adata.obs)\n",
    "annotations.index=annotations['Index']\n",
    "df2=annotations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_merged=pd.merge(df1, df2, left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the selected annotation columns into adata.obs (obs column -> annotation column)\n",
    "obs_from_annotation = {'sample': 'Sample',\n",
    "                       'origin': 'Sample_Origin',\n",
    "                       'cell type': 'Cell_type',\n",
    "                       'cell type refined': 'Cell_type.refined',\n",
    "                       'cell subtype': 'Cell_subtype'}\n",
    "for obs_key, annotation_col in obs_from_annotation.items():\n",
    "    adata.obs[obs_key] = df_merged[annotation_col]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "# subset to the cells whose origin is 'nLung' (lymph node samples are not selected here)\n",
    "ix=np.isin(adata.obs['origin'],['nLung']) \n",
    "adata_nLung=adata[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata_nLung.obs['n_counts'] = adata_nLung.X.sum(1)\n",
    "adata_nLung.obs['log_counts'] = np.log(adata_nLung.obs['n_counts'])\n",
    "adata_nLung.obs['n_genes'] = (adata_nLung.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['MT-ATP6', 'MT-ATP8', 'MT-CO1', 'MT-CO2', 'MT-CO3', 'MT-CYB',\n",
       "       'MT-ND1', 'MT-ND2', 'MT-ND3', 'MT-ND4', 'MT-ND4L', 'MT-ND5',\n",
       "       'MT-ND6'], dtype=object)"
      ]
     },
     "execution_count": 92,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Names of all genes carrying the mitochondrial 'MT-' prefix (displayed for inspection)\n",
    "is_mito = adata_nLung.var_names.str.startswith('MT-')\n",
    "mt_genes = adata_nLung.var_names[is_mito]\n",
    "np.array(mt_genes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata_nLung.var_names]\n",
    "adata_nLung.obs['mt_frac'] = adata_nLung.X[:, mt_gene_mask].sum(1)/adata_nLung.obs['n_counts']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp = adata_nLung.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:07): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:32)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)\n"
     ]
    }
   ],
   "source": [
    "# Coarse clustering on a CPM-normalized, log-transformed copy; the cluster labels are passed to scran as `clusters`\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 16 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:08)\n"
     ]
    }
   ],
   "source": [
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata_nLung.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_nLung.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_nLung.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the current adata_nLung.X values (before size-factor normalization) for downstream analysis\n",
    "\n",
    "#Keep the count data in a counts layer\n",
    "adata_nLung.layers[\"counts\"] = adata_nLung.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "\n",
    "adata_nLung.X /= adata_nLung.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata_nLung)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 42995 × 29634\n",
       "    obs: 'sample', 'origin', 'cell type', 'cell type refined', 'cell subtype', 'n_counts', 'log_counts', 'n_genes', 'mt_frac', 'size_factors'\n",
       "    uns: 'log1p'\n",
       "    layers: 'counts'"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata_nLung"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:06)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/moritz.thomas/.local/lib/python3.7/site-packages/scanpy/preprocessing/_deprecated/highly_variable_genes.py:194: RuntimeWarning: invalid value encountered in true_divide\n",
      "  / disp_mad_bin[df['mean_bin'].values].values\n",
      "/home/moritz.thomas/.local/lib/python3.7/site-packages/scanpy/preprocessing/_deprecated/highly_variable_genes.py:205: RuntimeWarning: invalid value encountered in greater_equal\n",
      "  gene_subset = df['dispersion_norm'].values >= disp_cut_off\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata_nLung, flavor='cell_ranger', \n",
    "                              n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:19)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:11)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:40)\n"
     ]
    }
   ],
   "source": [
    "sc.pp.pca(adata_nLung, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata_nLung)\n",
    "\n",
    "sc.tl.umap(adata_nLung)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata=adata_nLung.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['AT1', 'AT2', 'Activated DCs', 'Alveolar Mac', 'CD1c+ DCs', 'CD4+ Th',\n",
       "       'CD8 low T', 'CD8+/CD4+ Mixed Th', 'CD141+ DCs', 'CD163+CD14+ DCs',\n",
       "       'CD207+CD1a+ LCs', 'COL13A1+ matrix FBs', 'COL14A1+ matrix FBs',\n",
       "       'Ciliated', 'Club', 'Cytotoxic CD8+ T', 'EPCs', 'Exhausted CD8+ T',\n",
       "       'Exhausted Tfh', 'FB-like cells', 'Follicular B cells',\n",
       "       'GC B cells in the DZ', 'GrB-secreting B cells', 'Lymphatic ECs',\n",
       "       'MALT B cells', 'MAST', 'Mesothelial cells', 'Monocytes',\n",
       "       'Myofibroblasts', 'NK', 'Naive CD4+ T', 'Naive CD8+ T', 'Pericytes',\n",
       "       'Plasma cells', 'Pleural Mac', 'Smooth muscle cells', 'Stalk-like ECs',\n",
       "       'Tip-like ECs', 'Treg', 'Tumor ECs', 'Undetermined', 'mo-Mac', 'nan',\n",
       "       'pDCs'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 86,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['CellType_Atlas']=adata.obs['cell subtype'].copy()\n",
    "adata.obs['CellType_Atlas'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Detached Categorical copy of the labels, so subtypes can be merged before writing back.\n",
    "# The categories are read from the column itself rather than from a hand-copied list,\n",
    "# so a label missing from such a list can no longer turn into NaN silently.\n",
    "ref_cluster = pd.Categorical(adata.obs['CellType_Atlas'],\n",
    "                             categories=adata.obs['CellType_Atlas'].cat.categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained subtypes into one representative label per lineage.\n",
    "# Each key is the label that is kept; the labels listed as its value are relabelled to it.\n",
    "merge_into = {\n",
    "    'AT1': ['AT2'],\n",
    "    'Activated DCs': ['CD1c+ DCs', 'CD141+ DCs', 'CD163+CD14+ DCs', 'CD207+CD1a+ LCs', 'pDCs'],\n",
    "    'Alveolar Mac': ['Pleural Mac', 'mo-Mac'],\n",
    "    'CD4+ Th': ['CD8 low T', 'CD8+/CD4+ Mixed Th', 'Cytotoxic CD8+ T', 'Exhausted CD8+ T',\n",
    "                'Exhausted Tfh', 'Naive CD4+ T', 'Naive CD8+ T', 'Treg'],\n",
    "    'COL13A1+ matrix FBs': ['COL14A1+ matrix FBs', 'FB-like cells', 'Pericytes'],\n",
    "    'Ciliated': ['Club'],\n",
    "    'EPCs': ['Lymphatic ECs', 'Stalk-like ECs', 'Tip-like ECs', 'Tumor ECs'],\n",
    "    # 'Plasma cells' previously had no merge rule and is not part of the final category list\n",
    "    # applied afterwards, so those cells silently became NaN. They are grouped with the\n",
    "    # B-cell lineage here; adjust if plasma cells should be reported separately.\n",
    "    'Follicular B cells': ['GC B cells in the DZ', 'GrB-secreting B cells', 'MALT B cells',\n",
    "                           'Plasma cells'],\n",
    "}\n",
    "for kept_label, merged_labels in merge_into.items():\n",
    "    ix = np.isin(ref_cluster, merged_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['CellType_Atlas']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['AT1','Activated DCs', 'Alveolar Mac', 'CD4+ Th',\n",
    "                                                       'COL13A1+ matrix FBs', 'Ciliated', 'EPCs','Follicular B cells',\n",
    "                                                       'MAST','Mesothelial cells', 'Monocytes','Myofibroblasts', 'NK', \n",
    "                                                       'Smooth muscle cells', 'Undetermined','nan'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['AT1', 'Activated DCs', 'Alveolar Mac', 'CD4+ Th',\n",
       "       'COL13A1+ matrix FBs', 'Ciliated', 'EPCs', 'Follicular B cells', 'MAST',\n",
       "       'Mesothelial cells', 'Monocytes', 'Myofibroblasts', 'NK',\n",
       "       'Smooth muscle cells', 'Undetermined', 'nan'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['CellType_Atlas'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('CellType_Atlas', ['Alveolar', 'Dendritics', 'Macrophages', 'T cells',\n",
    "       'Fibroblasts', 'Epithelial', 'Endothelial', 'B cells', 'Mast',\n",
    "       'Mesothelial', 'Monocytes', 'Lymphoid', 'NK',\n",
    "       'Smooth muscle','Unknown','toassign'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "testdf=pd.DataFrame()\n",
    "testdf['Atlas']=adata.obs['CellType_Atlas']\n",
    "testdf['celltype']=adata.obs['cell type']\n",
    "testdf['fin']=testdf['Atlas'].astype(str) + str('__') + testdf['celltype'].astype(str)\n",
    "testdf['fin'] = testdf.fin.astype('category')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['CellType_Atlas']=np.array(testdf['fin'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "... storing 'CellType_Atlas' as categorical\n"
     ]
    }
   ],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('CellType_Atlas', ['Alveolar', 'B cells',\n",
    "       'Dendritics', 'Endothelial',\n",
    "       'Epithelial', 'Fibroblasts',\n",
    "       'Lymphoid', 'Macrophages',\n",
    "       'Mast', 'Mesothelial',\n",
    "       'Monocytes', 'NK', 'NK MERGE',\n",
    "       'Smooth muscle', 'T cells',\n",
    "       'T cells MERGE', 'Unknown',\n",
    "       'Unknown MERGE', 'Unknown MERGE2',\n",
    "       'Unknown MERGE3', 'Unknown MERGE4', 'Unknown MERGE5',\n",
    "       'Unknown MERGE6', 'Endothelial MERGE',\n",
    "       'Fibroblasts MERGE', 'NK MERGE2',\n",
    "       'T cells MERGE2'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Alveolar', 'B cells', 'Dendritics', 'Endothelial', 'Epithelial',\n",
       "       'Fibroblasts', 'Lymphoid', 'Macrophages', 'Mast', 'Mesothelial',\n",
       "       'Monocytes', 'NK', 'NK MERGE', 'Smooth muscle', 'T cells',\n",
       "       'T cells MERGE', 'Unknown', 'Unknown MERGE', 'Unknown MERGE2',\n",
       "       'Unknown MERGE3', 'Unknown MERGE4', 'Unknown MERGE5', 'Unknown MERGE6',\n",
       "       'Endothelial MERGE', 'Fibroblasts MERGE', 'NK MERGE2',\n",
       "       'T cells MERGE2'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['CellType_Atlas'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['CellType_Atlas'],\n",
    "                           categories=['Alveolar', 'B cells', 'Dendritics', 'Endothelial', 'Epithelial',\n",
    "       'Fibroblasts', 'Lymphoid', 'Macrophages', 'Mast', 'Mesothelial',\n",
    "       'Monocytes', 'NK', 'NK MERGE', 'Smooth muscle', 'T cells',\n",
    "       'T cells MERGE', 'Unknown', 'Unknown MERGE', 'Unknown MERGE2',\n",
    "       'Unknown MERGE3', 'Unknown MERGE4', 'Unknown MERGE5', 'Unknown MERGE6',\n",
    "       'Endothelial MERGE', 'Fibroblasts MERGE', 'NK MERGE2',\n",
    "       'T cells MERGE2'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['Endothelial MERGE'])\n",
    "ref_cluster[ix]='Endothelial'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Fibroblasts MERGE'])\n",
    "ref_cluster[ix]='Fibroblasts'\n",
    "\n",
    "ix=np.isin(ref_cluster,['NK MERGE','NK MERGE2'])\n",
    "ref_cluster[ix]='NK'\n",
    "\n",
    "ix=np.isin(ref_cluster,['T cells MERGE','T cells MERGE2'])\n",
    "ref_cluster[ix]='T cells'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Unknown MERGE', 'Unknown MERGE2',\n",
    "       'Unknown MERGE3', 'Unknown MERGE4', 'Unknown MERGE5', 'Unknown MERGE6'])\n",
    "ref_cluster[ix]='Unknown'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['CellType_Atlas']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Alveolar', 'B cells', 'Dendritics', 'Endothelial', 'Epithelial',\n",
    "                                                       'Fibroblasts', 'Lymphoid', 'Macrophages', 'Mast', 'Mesothelial',\n",
    "                                                       'Monocytes', 'NK',  'Smooth muscle', 'T cells','Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Alveolar', 'B cells', 'Dendritics', 'Endothelial', 'Epithelial',\n",
       "       'Fibroblasts', 'Lymphoid', 'Macrophages', 'Mast', 'Mesothelial',\n",
       "       'Monocytes', 'NK', 'Smooth muscle', 'T cells', 'Unknown'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['CellType_Atlas'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('CellType_Atlas', ['Alveolar', 'B cells', 'Dendritics', 'Endothelial', 'Epithelial',\n",
    "       'Fibroblasts', 'Lymphoid', 'Macrophages', 'Mast', 'Mesothelial',\n",
    "       'Monocytes', 'NK cells', 'Smooth muscle', 'T cells', 'Unknown'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] = '09-4-Lung-Kim-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '09-4-Lung-Kim-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "##  09-5-Lung-Pisco-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_pisco.obs['tissue'],['lung']) \n",
    "adata=adata_pisco[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 11 cells that have more than 15000 counts\n",
      "filtered out 17 cells that have more than 9000 genes expressed\n",
      "filtered out 30257 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 15000)\n",
    "sc.pp.filter_cells(adata, max_genes = 9000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes\n",
    "mt_genes = adata.var_names[[gene.startswith('MT-') for gene in adata.var_names]]\n",
    "np.array(mt_genes)\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "\n",
    "# convert to spare\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score\n",
    "y = np.bincount(mt_gene_mask)\n",
    "ii = np.nonzero(y)[0]\n",
    "np.vstack((ii,y[ii])).T\n",
    "adata.X[:, mt_gene_mask].sum(1)\n",
    "mt_sum=np.array(adata.X[:, mt_gene_mask].sum(1))\n",
    "mt_sum=np.array(pd.DataFrame(mt_sum)[0])\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 20% mito fraction\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 205,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 206,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:05): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:05)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 20 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:04)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:03)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:04)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:28)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 234,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cell', 'CD4-positive, alpha-beta T cell',\n",
       "       'CD8-positive, alpha-beta T cell', 'adventitial cell', 'basal cell',\n",
       "       'basophil', 'blood vessel endothelial cell',\n",
       "       'bronchial smooth muscle cell', 'capillary endothelial cell',\n",
       "       'classical monocyte', 'club cell', 'dendritic cell',\n",
       "       'effector CD4-positive, alpha-beta T cell',\n",
       "       'effector CD8-positive, alpha-beta T cell',\n",
       "       'endothelial cell of artery', 'endothelial cell of lymphatic vessel',\n",
       "       'fibroblast', 'intermediate monocyte', 'lung ciliated cell',\n",
       "       'lung microvascular endothelial cell', 'macrophage', 'mature NK T cell',\n",
       "       'mesothelial cell', 'myofibroblast cell', 'neutrophil',\n",
       "       'non-classical monocyte', 'pericyte cell', 'plasma cell',\n",
       "       'plasmacytoid dendritic cell', 'pulmonary ionocyte',\n",
       "       'respiratory goblet cell', 'serous cell of epithelium of bronchus',\n",
       "       'smooth muscle cell', 'type I pneumocyte', 'type II pneumocyte',\n",
       "       'vascular associated smooth muscle cell', 'vein endothelial cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 235,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B cell', 'CD4-positive, alpha-beta T cell',\n",
    "       'CD8-positive, alpha-beta T cell', 'adventitial cell', 'basal cell',\n",
    "       'basophil', 'blood vessel endothelial cell',\n",
    "       'bronchial smooth muscle cell', 'capillary endothelial cell',\n",
    "       'classical monocyte', 'club cell', 'dendritic cell',\n",
    "       'effector CD4-positive, alpha-beta T cell',\n",
    "       'effector CD8-positive, alpha-beta T cell',\n",
    "       'endothelial cell of artery', 'endothelial cell of lymphatic vessel',\n",
    "       'fibroblast', 'intermediate monocyte', 'lung ciliated cell',\n",
    "       'lung microvascular endothelial cell', 'macrophage', 'mature NK T cell',\n",
    "       'mesothelial cell', 'myofibroblast cell', 'neutrophil',\n",
    "       'non-classical monocyte', 'pericyte cell', 'plasma cell',\n",
    "       'plasmacytoid dendritic cell', 'pulmonary ionocyte',\n",
    "       'respiratory goblet cell', 'serous cell of epithelium of bronchus',\n",
    "       'smooth muscle cell', 'type I pneumocyte', 'type II pneumocyte',\n",
    "       'vascular associated smooth muscle cell', 'vein endothelial cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['CD8-positive, alpha-beta T cell','effector CD4-positive, alpha-beta T cell','effector CD8-positive, alpha-beta T cell',])\n",
    "ref_cluster[ix]='CD4-positive, alpha-beta T cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['blood vessel endothelial cell','capillary endothelial cell', 'endothelial cell of artery', 'lung microvascular endothelial cell', 'vein endothelial cell'])\n",
    "ref_cluster[ix]='adventitial cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['intermediate monocyte',  'non-classical monocyte'])\n",
    "ref_cluster[ix]='classical monocyte'\n",
    "\n",
    "ix=np.isin(ref_cluster,['smooth muscle cell','vascular associated smooth muscle cell'])\n",
    "ref_cluster[ix]='bronchial smooth muscle cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['respiratory goblet cell'])\n",
    "ref_cluster[ix]='club cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['type II pneumocyte'])\n",
    "ref_cluster[ix]='type I pneumocyte'\n",
    "\n",
    "ix=np.isin(ref_cluster,['serous cell of epithelium of bronchus'])\n",
    "ref_cluster[ix]='club cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['plasmacytoid dendritic cell'])\n",
    "ref_cluster[ix]='dendritic cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 238,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B cell', 'CD4-positive, alpha-beta T cell',\n",
    "       'adventitial cell', 'basal cell',\n",
    "       'basophil',\n",
    "       'bronchial smooth muscle cell',\n",
    "       'classical monocyte', 'club cell', 'dendritic cell',                                               \n",
    "        'endothelial cell of lymphatic vessel',\n",
    "       'fibroblast','lung ciliated cell', \n",
    "       'macrophage', 'mature NK T cell',\n",
    "       'mesothelial cell', 'myofibroblast cell', 'neutrophil',\n",
    "       'pericyte cell', 'plasma cell',\n",
    "       'pulmonary ionocyte',\n",
    "        'type I pneumocyte'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 239,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype',['B cells', 'T cells',\n",
    "       'Endothelial cells', 'Basal cells',\n",
    "       'Basophil cells',\n",
    "       'Smooth muscle cells',\n",
    "       'Monocytes', 'Airway epithelial cells', 'Dendritic cells',                                               \n",
    "        'Lymphatic endothelial cells',\n",
    "       'Fibroblast cells','Multiciliated cells',\n",
    "       'Macrophages', 'NK cells',\n",
    "       'Mesothelial cells', 'Myofibroblast cells', 'Neutrophils',\n",
    "       'Pericytes', 'Plasma cells',\n",
    "        'Unknown',\n",
    "        'Alveolar cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 243,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['tissue'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['tissue'],\n",
    "                           categories=['lung'])\n",
    "adata.rename_categories('tissue', ['Lung'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 244,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex'],\n",
    "                           categories=['female', 'male'])\n",
    "adata.rename_categories('sex', ['Female', 'Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 245,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['ethnicity'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['ethnicity'],\n",
    "                           categories=['African American or Afro-Caribbean', 'European'])\n",
    "adata.rename_categories('ethnicity', ['African-American or Afro-Caribbean', 'European'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 246,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['development_stage'],\n",
    "                           categories=['59-year-old human stage', '61-year-old human stage'])\n",
    "adata.rename_categories('development_stage',['59', '61'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['donor'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['TSP1', 'TSP2', 'TSP14'])\n",
    "adata.rename_categories('donor', ['TSP1', 'TSP2', 'TSP14'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 248,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Lung'\n",
    "adata.obs['Organ_Specific'] = adata.obs['tissue']\n",
    "adata.obs['Dataset'] = 'Pisco_Lung'\n",
    "adata.obs['InternDatasetNumber'] = '09-5-Lung-Pisco-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 250,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 251,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 252,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '09-5-Lung-Pisco-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 09-7-Lung-Han-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_han.obs['sub_tissue'],['AdultLung']) \n",
    "adata=adata_han[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 624,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='09-7-Lung-Han-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# FILTER PARAMETERS#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 5000)\n",
    "sc.pp.filter_cells(adata, max_genes = 2000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes\n",
    "mt_genes = adata.var_names[[gene.startswith('MT-') for gene in adata.var_names]]\n",
    "np.array(mt_genes)\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "\n",
    "# convert to spare\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score\n",
    "y = np.bincount(mt_gene_mask)\n",
    "ii = np.nonzero(y)[0]\n",
    "np.vstack((ii,y[ii])).T\n",
    "adata.X[:, mt_gene_mask].sum(1)\n",
    "mt_sum=np.array(adata.X[:, mt_gene_mask].sum(1))\n",
    "mt_sum=np.array(pd.DataFrame(mt_sum)[0])\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 20% mito fraction\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['celltype_specific'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['AT1 cell', 'AT2 cell', 'Actived T cell',\n",
    "       'Alveolar bipotent/intermediate cell', 'Arterial endothelial cell',\n",
    "       'Artry endothelial cell', 'B cell', 'B cell (Plasmocyte)',\n",
    "       'B cell (Plasmocyte)_IGHA/HM high', 'B cell (Plasmocyte)_IGHG high',\n",
    "       'Basal/Epithelial cell', 'Chondrocyte', 'Ciliated cell', 'Club cell',\n",
    "       'Club cell_BPIFB1 high', 'Club cell_KLK11 high',\n",
    "       'Conventional dendritic cell', 'Dendritic cell',\n",
    "       'Endothelial cell_ACKR1 high', 'Endothelial cell_SELE high',\n",
    "       'Endothelial cell_SPARCL1 high', 'Endothelial cell_TMEM100 high',\n",
    "       'Endothelial cell_VWF high', 'Epithelial cell_PLA2G2A high',\n",
    "       'Epithelial cell_S100A2 high', 'Fibroblast', 'Fibroblast_A2M high',\n",
    "       'Fibroblast_SFRP high', 'Lymphatic endothelial cell', 'M2 macrophage',\n",
    "       'Macrophage', 'Macrophage_CCL20 high', 'Macrophage_M2',\n",
    "       'Macrophage_VSIG4 high', 'Mast cell', 'Megakaryocyte', 'Monocyte',\n",
    "       'Myeloid cell', 'Natural killer cell', 'Neutrophil',\n",
    "       'Proliferating T cell',\n",
    "       'Proliferating alveolar bipotent progenitor cell', 'Proliferating cell',\n",
    "       'Smooth muscle cell', 'T cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 647,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,[ 'AT2 cell',  'Alveolar bipotent/intermediate cell',   'Proliferating alveolar bipotent progenitor cell'])\n",
    "ref_cluster[ix]= 'AT1 cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,[  'Proliferating T cell',  'T cell'])\n",
    "ref_cluster[ix]= 'Actived T cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,[  'Artry endothelial cell', 'Endothelial cell_ACKR1 high', 'Endothelial cell_SELE high',\n",
    "       'Endothelial cell_SPARCL1 high', 'Endothelial cell_TMEM100 high',\n",
    "       'Endothelial cell_VWF high',])\n",
    "ref_cluster[ix]=  'Arterial endothelial cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'B cell (Plasmocyte)_IGHA/HM high', 'B cell (Plasmocyte)_IGHG high'])\n",
    "ref_cluster[ix]= 'B cell (Plasmocyte)'\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'Club cell_BPIFB1 high', 'Club cell_KLK11 high', 'Epithelial cell_PLA2G2A high',\n",
    "       'Epithelial cell_S100A2 high'])\n",
    "ref_cluster[ix]=  'Club cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'Dendritic cell'])\n",
    "ref_cluster[ix]= 'Conventional dendritic cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Fibroblast_A2M high', 'Fibroblast_SFRP high'])\n",
    "ref_cluster[ix]=  'Fibroblast'\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'Macrophage', 'Macrophage_CCL20 high', 'Macrophage_M2', 'Macrophage_VSIG4 high', ])\n",
    "ref_cluster[ix]= 'M2 macrophage'\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'Dendritic cell'])\n",
    "ref_cluster[ix]= 'Conventional dendritic cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'Dendritic cell'])\n",
    "ref_cluster[ix]= 'Conventional dendritic cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 648,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['AT1 cell', 'Actived T cell',\n",
    "      'Arterial endothelial cell',\n",
    "     'B cell', 'B cell (Plasmocyte)',\n",
    "       \n",
    "       'Basal/Epithelial cell', 'Chondrocyte', 'Ciliated cell', 'Club cell',\n",
    "       \n",
    "       'Conventional dendritic cell',\n",
    "       \n",
    "                                                        'Fibroblast', 'Fibroblast_A2M high',\n",
    "       'Fibroblast_SFRP high', 'Lymphatic endothelial cell', 'M2 macrophage',\n",
    "       'Mast cell', 'Megakaryocyte', 'Monocyte',\n",
    "       'Myeloid cell', 'Natural killer cell', 'Neutrophil',\n",
    "     \n",
    "      'Proliferating cell',\n",
    "       'Smooth muscle cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 649,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Alveolar cells', 'T cells',\n",
    "      'Endothelial cells',\n",
    "     'B cells', 'Plasma cells',\n",
    "       \n",
    "       'Basal cells', 'Chondrocytes', 'Multiciliated cells', 'Airway epithelial cells',\n",
    "       \n",
    "       'Dendritic cells',\n",
    "       \n",
    "                                                        'Fibroblast', 'Fibroblast_A2M high',\n",
    "       'Fibroblast cells', 'Lymphatic endothelial cells', 'Macrophages',\n",
    "       'Mast cells', 'Megakaryocytes', 'Monocytes',\n",
    "       'Myeloid cells', 'NK cells', 'Neutrophils',\n",
    "     \n",
    "      'Unknown',\n",
    "       'Smooth muscle cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 653,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sub_tissue'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sub_tissue'],\n",
    "                           categories=['AdultLung'])\n",
    "adata.rename_categories('sub_tissue', ['Lung'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 654,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex'],\n",
    "                           categories=['female', 'male', 'unknown'])\n",
    "adata.rename_categories('sex', ['Female', 'Male', 'NaN'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 655,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['age'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['age'],\n",
    "                           categories=['21Y', '49Y'])\n",
    "adata.rename_categories('age',['21', '49'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 656,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['donor'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['Donor38', 'Donor41', 'Donor42'])\n",
    "adata.rename_categories('donor', ['Han-Donor38', 'Han-Donor41', 'Han-Donor42'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 657,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Lung'\n",
    "adata.obs['Organ_Specific'] = adata.obs['sub_tissue']\n",
    "adata.obs['Dataset'] = 'Han_Lung'\n",
    "adata.obs['InternDatasetNumber'] = '09-7-Lung-Han-2020'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['age']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['celltype_specific']\n",
    "adata.obs['original_celltype_2'] = adata.obs['celltype_global']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Alveolar cells', 'T cells', 'Endothelial cells', 'B cells',\n",
       "       'Plasma cells', 'Basal cells', 'Chondrocytes', 'Multiciliated cells',\n",
       "       'Airway epithelial cells', 'Dendritic cells', 'Fibroblast cells',\n",
       "       'Lymphatic endothelial cells', 'Macrophages', 'Mast cells',\n",
       "       'Megakaryocytes', 'Monocytes', 'Myeloid cells', 'NK cells',\n",
       "       'Neutrophils', 'Unknown', 'Smooth muscle cells'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Alveolar cells', 'T cells', 'Endothelial cells', 'B cells',\n",
    "       'Plasma cells', 'Basal cells', 'Chondrocytes', 'Multiciliated cells',\n",
    "       'Airway epithelial cells', 'Dendritic cells', 'Fibroblast',\n",
    "       'Fibroblast_A2M high', 'Fibroblast cells',\n",
    "       'Lymphatic endothelial cells', 'Macrophages', 'Mast cells',\n",
    "       'Megakaryocytes', 'Monocytes', 'Myeloid cells', 'NK cells',\n",
    "       'Neutrophils', 'Unknown', 'Smooth muscle cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,[ 'Fibroblast', 'Fibroblast_A2M high'])\n",
    "ref_cluster[ix]= 'Fibroblast cells'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Alveolar cells', 'T cells', 'Endothelial cells', 'B cells',\n",
    "       'Plasma cells', 'Basal cells', 'Chondrocytes', 'Multiciliated cells',\n",
    "       'Airway epithelial cells', 'Dendritic cells',  'Fibroblast cells',\n",
    "       'Lymphatic endothelial cells', 'Macrophages', 'Mast cells',\n",
    "       'Megakaryocytes', 'Monocytes', 'Myeloid cells', 'NK cells',\n",
    "       'Neutrophils', 'Unknown', 'Smooth muscle cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Alveolar cells', 'T cells', 'Endothelial cells', 'B cells',\n",
    "       'Plasma cells', 'Basal cells', 'Chondrocytes', 'Multiciliated cells',\n",
    "       'Airway epithelial cells', 'Dendritic cells',  'Fibroblast cells',\n",
    "       'Lymphatic endothelial cells', 'Macrophages', 'Mast cells',\n",
    "       'Megakaryocytes', 'Monocytes', 'Myeloid cells', 'NK cells',\n",
    "       'Neutrophils', 'Unknown', 'Smooth muscle cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '09-7-Lung-Han-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 09-9-Lung_ImmuneCells-Teichmann-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_analysis.obs['tissue_major'], ['Lung'])\n",
    "adata=adata_analysis[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] =  adata.obs['tissue_major']\n",
    "adata.obs['Organ_Specific'] = adata.obs['sub_tissue']\n",
    "adata.obs['Dataset'] = adata.obs['Dataset']\n",
    "adata.obs['InternDatasetNumber'] = '09-9-Lung_ImmuneCells-Teichmann-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 205,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 206,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '09-9-Lung_ImmuneCells-Teichmann-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "# 10-Heart"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 10-1-Heart-Han-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [],
   "source": [
    "# here we use sfaira to import available datasets with annotations\n",
    "# note that the following steps may change depending on the current sfaira version and the path to your repository\n",
    "\n",
    "datadir = 'path/to/repo/'\n",
    "\n",
    "ds = sfaira.data.human.DatasetGroupHeart(path=datadir)  # This links all data sets available"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['human_heart_2020_microwell_han_001_10.1038/s41586-020-2157-4',\n",
       " 'human_heart_2020_microwell_han_002_10.1038/s41586-020-2157-4',\n",
       " 'human_heart_2020_microwell_han_003_10.1038/s41586-020-2157-4',\n",
       " 'human_heart_2020_microwell_han_004_10.1038/s41586-020-2157-4']"
      ]
     },
     "execution_count": 176,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds.ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pick\n",
    "idx = ds.ids[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'human_heart_2020_microwell_han_002_10.1038/s41586-020-2157-4'"
      ]
     },
     "execution_count": 178,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/icb/moritz.thomas/miniconda3/lib/python3.7/site-packages/sfaira-master/sfaira/data/base.py:84: UserWarning: using default genomes Homo_sapiens_GRCh38_97\n",
      "  warnings.warn(f\"using default genomes {genome}\")\n",
      "Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n"
     ]
    }
   ],
   "source": [
    "ds.datasets[idx].load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata1=ds.datasets[idx].adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pick\n",
    "idx = ds.ids[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'human_heart_2020_microwell_han_003_10.1038/s41586-020-2157-4'"
      ]
     },
     "execution_count": 184,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/icb/moritz.thomas/miniconda3/lib/python3.7/site-packages/sfaira-master/sfaira/data/base.py:84: UserWarning: using default genomes Homo_sapiens_GRCh38_97\n",
      "  warnings.warn(f\"using default genomes {genome}\")\n",
      "Variable names are not unique. To make them unique, call `.var_names_make_unique`.\n"
     ]
    }
   ],
   "source": [
    "ds.datasets[idx].load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata2=ds.datasets[idx].adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata=adata1.concatenate(adata2, batch_key='batch')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 246,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.var.index=adata.var['names'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 249,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 33 cells that have more than 1000 genes expressed\n",
      "filtered out 18573 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "#sc.pp.filter_cells(adata, max_counts = 4000)\n",
    "sc.pp.filter_cells(adata, max_genes = 1000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.var.index=adata.var['names'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes\n",
    "mt_genes = adata.var_names[[gene.startswith('MT-') for gene in adata.var_names]]\n",
    "np.array(mt_genes)\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "\n",
    "# convert to spare\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score\n",
    "y = np.bincount(mt_gene_mask)\n",
    "ii = np.nonzero(y)[0]\n",
    "np.vstack((ii,y[ii])).T\n",
    "adata.X[:, mt_gene_mask].sum(1)\n",
    "mt_sum=np.array(adata.X[:, mt_gene_mask].sum(1))\n",
    "mt_sum=np.array(pd.DataFrame(mt_sum)[0])\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 25% mito fraction\n",
    "adata = adata[adata.obs['mt_frac'] < 0.25]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 8 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "R[write to console]: Loading required package: scran\n",
      "\n",
      "R[write to console]: Loading required package: SingleCellExperiment\n",
      "\n",
      "R[write to console]: Loading required package: SummarizedExperiment\n",
      "\n",
      "R[write to console]: Loading required package: GenomicRanges\n",
      "\n",
      "R[write to console]: Loading required package: stats4\n",
      "\n",
      "R[write to console]: Loading required package: BiocGenerics\n",
      "\n",
      "R[write to console]: Loading required package: parallel\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘BiocGenerics’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:parallel’:\n",
      "\n",
      "    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n",
      "    clusterExport, clusterMap, parApply, parCapply, parLapply,\n",
      "    parLapplyLB, parRapply, parSapply, parSapplyLB\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:stats’:\n",
      "\n",
      "    IQR, mad, sd, var, xtabs\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:base’:\n",
      "\n",
      "    anyDuplicated, append, as.data.frame, basename, cbind, colnames,\n",
      "    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,\n",
      "    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,\n",
      "    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,\n",
      "    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,\n",
      "    union, unique, unsplit, which, which.max, which.min\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: S4Vectors\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘S4Vectors’\n",
      "\n",
      "\n",
      "R[write to console]: The following object is masked from ‘package:base’:\n",
      "\n",
      "    expand.grid\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: IRanges\n",
      "\n",
      "R[write to console]: Loading required package: GenomeInfoDb\n",
      "\n",
      "R[write to console]: Loading required package: Biobase\n",
      "\n",
      "R[write to console]: Welcome to Bioconductor\n",
      "\n",
      "    Vignettes contain introductory material; view with\n",
      "    'browseVignettes()'. To cite Bioconductor, see\n",
      "    'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: DelayedArray\n",
      "\n",
      "R[write to console]: Loading required package: matrixStats\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘matrixStats’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:Biobase’:\n",
      "\n",
      "    anyMissing, rowMedians\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: BiocParallel\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘DelayedArray’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:matrixStats’:\n",
      "\n",
      "    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:base’:\n",
      "\n",
      "    aperm, apply, rowsum\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:05)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['sub_celltype']=adata.obs['celltype_specific'].copy()\n",
    "adata.obs['celltype']=adata.obs['celltype_specific'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Apoptotic cell', 'Cardiomyocyte', 'Conventional dendritic cell',\n",
    "       'Dendritic cell', 'Endothelial cell', 'Fibroblast', 'M1 Macrophage',\n",
    "       'M2 Macrophage', 'Macrophage', 'Mast cell', 'Neutrophil',\n",
    "       'Smooth muscle cell', 'T cell', 'Vascular endothelial cell',\n",
    "       'Ventricle cardiomyocyte'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['Ventricle cardiomyocyte'])\n",
    "ref_cluster[ix]='Cardiomyocyte'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Dendritic cell'])\n",
    "ref_cluster[ix]='Conventional dendritic cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Vascular endothelial cell'])\n",
    "ref_cluster[ix]='Endothelial cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'M2 Macrophage', 'Macrophage'])\n",
    "ref_cluster[ix]='M1 Macrophage',"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Apoptotic cell', 'Cardiomyocyte', 'Conventional dendritic cell',\n",
    "    'Endothelial cell', 'Fibroblast', 'M1 Macrophage',\n",
    "        'Mast cell', 'Neutrophil',\n",
    "       'Smooth muscle cell', 'T cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Unknown', 'Cardiomyocytes', 'Dendritic cells',\n",
    "       'Endothelial cells', 'Fibroblast cells', 'Macrophages', 'Mast cells','Neutrophils', 'Smooth muscle', 'T cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Heart'\n",
    "adata.obs['Organ_Specific'] = 'Heart'\n",
    "adata.obs['Dataset'] = 'Han_Heart'\n",
    "adata.obs['InternDatasetNumber'] = '10-1-Heart-Han-2020'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = adata.obs['sample']\n",
    "adata.obs['age'] = adata.obs['dev_stage']\n",
    "adata.obs['sex'] = adata.obs['gender']\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['celltype_specific']\n",
    "adata.obs['original_celltype_2'] = adata.obs['celltype_global']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '10-1-Heart-Han-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 10-2_1-Heart-Teichmann-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ontology <class 'sfaira.versions.metadata.base.OntologyUberonLifecyclestage'> is not a DAG, treat child-parent reasoning with care.\n",
      "Ontology <class 'sfaira.versions.metadata.base.OntologyMondo'> is not a DAG, treat child-parent reasoning with care.\n",
      "Ontology <class 'sfaira.versions.metadata.base.OntologyUberon'> is not a DAG, treat child-parent reasoning with care.\n"
     ]
    }
   ],
   "source": [
    "target_collections = [\"b52eb423-5d0d-4645-b217-e1c6d38b2e72\"]\n",
    "cache_path = os.path.join(\".\", \"data\")\n",
    "dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)\n",
    "dsg.subset(key=\"collection_id\", values=target_collections)\n",
    "dsg.datasets\n",
    "dsg.download()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/path/to/repo/b52eb423-5d0d-4645-b217-e1c6d38b2e72/'\n",
    "files = [f for f in listdir(path) if isfile(join(path, f))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['ed852810-a003-4386-9846-1638362cee39.h5ad',\n",
       " '1009f384-b12d-448e-ba9f-1b7d2ecfbb4e.h5ad',\n",
       " '84f1a631-910b-4fbb-9f76-d915a07316d2.h5ad',\n",
       " 'd4e69e01-3ba2-4d6b-a15d-e7048f78f22e.h5ad',\n",
       " 'f75f2ff4-2884-4c2d-b375-70de37a34507.h5ad',\n",
       " '572f3f3e-d3e4-4d13-8e2b-88215e508481.h5ad',\n",
       " '78fd69d2-75e4-4207-819a-563139f273c6.h5ad',\n",
       " '9d584fcb-a28a-4b91-a886-ceb66a88ef81.h5ad']"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/path/to/repo/b52eb423-5d0d-4645-b217-e1c6d38b2e72/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = ['ed852810-a003-4386-9846-1638362cee39.h5ad',\n",
    " '1009f384-b12d-448e-ba9f-1b7d2ecfbb4e.h5ad',\n",
    " '84f1a631-910b-4fbb-9f76-d915a07316d2.h5ad']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ed852810-a003-4386-9846-1638362cee39.h5ad\n",
      "1009f384-b12d-448e-ba9f-1b7d2ecfbb4e.h5ad\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
      "  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "84f1a631-910b-4fbb-9f76-d915a07316d2.h5ad\n"
     ]
    }
   ],
   "source": [
    "# Read every selected .h5ad file, tag its cells with the source file name and\n",
    "# stack the files into one AnnData (join='outer' keeps the union of genes).\n",
    "# Starting from None ensures a leftover `adata` from an earlier section is never\n",
    "# reused by accident when `files` is empty.\n",
    "adata = None\n",
    "for file_name in files:\n",
    "    print(file_name)\n",
    "    # join() works whether or not `path` ends with a path separator\n",
    "    u = sc.read_h5ad(join(path, file_name))\n",
    "    u.obs['id'] = file_name\n",
    "    adata = u if adata is None else adata.concatenate(u, join='outer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "# .sum(1) on a sparse (or np.matrix) X returns an (n_cells, 1) matrix; flatten it\n",
    "# so that every obs column receives a plain 1-D array.\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 159 cells that have more than 1600 counts\n",
      "filtered out 2 cells that have more than 4500 genes expressed\n",
      "filtered out 9356 genes that are detected in less than 30 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 1600)\n",
    "sc.pp.filter_cells(adata, max_genes = 4500)\n",
    "# Keep only genes detected in at least 30 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the gene symbols stored in var['feature_name'] as the var index, converted\n",
    "# to plain strings. The 'feature_name' column itself is left in place.\n",
    "gene_symbols = pd.Index(adata.var['feature_name']).astype(str)\n",
    "adata.var.index = gene_symbols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes: flag them by their 'MT-' symbol prefix\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names], dtype=bool)\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense (the following cells operate on a dense matrix);\n",
    "# the guard keeps this cell re-runnable once X is already dense\n",
    "if sp.sparse.issparse(adata.X):\n",
    "    adata.X = adata.X.todense()\n",
    "\n",
    "# manually define mt score: mitochondrial counts divided by the per-cell\n",
    "# 'n_counts' column already stored in adata.obs\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 10% mito fraction\n",
    "# .copy() materialises the subset: a bare slice is only a view of the parent\n",
    "# object, and later writes to adata.obs raise an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.10].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:06): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:21)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:12)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 15 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:16)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "# library() stops with an error if scran is not installed, whereas require()\n",
    "# only warns and lets the next line fail with a less obvious message.\n",
    "library(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:10)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:19)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:01:00)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cell', 'CD4-positive, alpha-beta cytotoxic T cell',\n",
       "       'CD8-positive, alpha-beta cytotoxic T cell',\n",
       "       'CD14-positive, CD16-positive monocyte',\n",
       "       'activated CD4-positive, alpha-beta T cell',\n",
       "       'activated CD8-positive, alpha-beta T cell', 'dendritic cell',\n",
       "       'macrophage', 'mast cell', 'mature NK T cell', 'monocyte',\n",
       "       'native cell', 'natural killer cell', 'neural cell',\n",
       "       'regular atrial cardiac myocyte'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B cell', 'CD4-positive, alpha-beta cytotoxic T cell',\n",
    "       'CD8-positive, alpha-beta cytotoxic T cell',\n",
    "       'CD14-positive, CD16-positive monocyte',\n",
    "       'activated CD4-positive, alpha-beta T cell',\n",
    "       'activated CD8-positive, alpha-beta T cell', 'dendritic cell',\n",
    "       'macrophage', 'mast cell', 'mature NK T cell', 'monocyte',\n",
    "       'native cell', 'natural killer cell', 'neural cell',\n",
    "       'regular atrial cardiac myocyte'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fold fine-grained labels into one representative label per broad cell type.\n",
    "# Each entry is (label to keep, labels merged into it); applied in this order.\n",
    "celltype_merges = [\n",
    "    ('CD4-positive, alpha-beta cytotoxic T cell',\n",
    "     ['CD8-positive, alpha-beta cytotoxic T cell',\n",
    "      'activated CD4-positive, alpha-beta T cell',\n",
    "      'activated CD8-positive, alpha-beta T cell']),\n",
    "    ('monocyte', ['CD14-positive, CD16-positive monocyte']),\n",
    "    ('natural killer cell', ['mature NK T cell']),\n",
    "]\n",
    "for kept_label, merged_labels in celltype_merges:\n",
    "    ix = np.isin(ref_cluster, merged_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B cell', 'CD4-positive, alpha-beta cytotoxic T cell',\n",
    "         'dendritic cell',\n",
    "       'macrophage', 'mast cell', 'monocyte',\n",
    "       'native cell', 'natural killer cell', 'neural cell',\n",
    "       'regular atrial cardiac myocyte'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['B cells', 'T cells',\n",
    "         'Dendritic cells',\n",
    "       'Macrophages', 'Mast cells', 'Monocytes',\n",
    "       'Unknown', 'NK cells', 'Neurons',\n",
    "       'Cardiomyocytes'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Capitalise the sex labels. Renaming by name (rather than by position) cannot\n",
    "# swap the labels if the category order differs; unlisted categories are kept.\n",
    "adata.obs['sex'] = adata.obs['sex'].cat.rename_categories({'female': 'Female', 'male': 'Male'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every listed development stage is collapsed into a single category, which is\n",
    "# then labelled 'Adult'. Values outside this list become NaN.\n",
    "stage_labels = ['eighth decade human stage', 'fifth decade human stage',\n",
    "                'human late adulthood stage', 'human middle aged stage',\n",
    "                'seventh decade human stage', 'sixth decade human stage']\n",
    "ref_cluster = pd.Categorical(adata.obs['development_stage'], categories=stage_labels)\n",
    "\n",
    "# fold all other stages into the first one ...\n",
    "ix = np.isin(ref_cluster, stage_labels[1:])\n",
    "ref_cluster[ix] = stage_labels[0]\n",
    "\n",
    "# ... keep only that category and give it its final name\n",
    "adata.obs['development_stage'] = pd.Categorical(ref_cluster, categories=stage_labels[:1])\n",
    "adata.rename_categories('development_stage', ['Adult'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the sampled heart regions into apex / ventricle / atrium.\n",
    "tissue_labels = ['apex of heart', 'heart left ventricle', 'heart right ventricle',\n",
    "                 'interventricular septum', 'left cardiac atrium',\n",
    "                 'right cardiac atrium']\n",
    "ref_cluster = pd.Categorical(adata.obs['tissue'], categories=tissue_labels)\n",
    "\n",
    "# (label to keep, labels merged into it)\n",
    "tissue_merges = [\n",
    "    ('heart left ventricle', ['heart right ventricle', 'interventricular septum']),\n",
    "    ('left cardiac atrium', ['right cardiac atrium']),\n",
    "]\n",
    "for kept_label, merged_labels in tissue_merges:\n",
    "    ix = np.isin(ref_cluster, merged_labels)\n",
    "    ref_cluster[ix] = kept_label\n",
    "\n",
    "# the explicit category order below is what the positional rename relies on\n",
    "adata.obs['tissue'] = pd.Categorical(\n",
    "    ref_cluster, categories=['apex of heart', 'heart left ventricle', 'left cardiac atrium'])\n",
    "adata.rename_categories('tissue', ['Heart_Apex', 'Heart_Ventricle', 'Heart_Atrium'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Donor IDs are kept unchanged; only the category order is fixed.\n",
    "# reorder_categories() matches categories by name and raises if the set of\n",
    "# donors differs from this list, whereas a positional rename would silently\n",
    "# relabel donors whenever the stored category order is not the one listed here.\n",
    "donor_order = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11',\n",
    "               'H2', 'H3', 'H4', 'H5', 'H6', 'H7']\n",
    "adata.obs['donor'] = adata.obs['donor'].cat.reorder_categories(donor_order)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset-level annotation columns, written to adata.obs in this order.\n",
    "# Strings are broadcast to every cell; the other entries copy existing obs\n",
    "# columns (the self-assignments leave the column unchanged).\n",
    "harmonised_columns = [\n",
    "    ('Organ', 'Heart'),\n",
    "    ('Organ_Specific', adata.obs['tissue']),\n",
    "    ('Dataset', 'Teichmann_Heart'),\n",
    "    ('InternDatasetNumber', '10-2_1-Heart-Teichmann-2020'),\n",
    "    ('Dataset_status', 'Healthy_Dataset'),\n",
    "    ('celltype', adata.obs['celltype']),\n",
    "    ('sub_celltype', 'NaN'),\n",
    "    ('Malignant', 'NonMalignant'),\n",
    "    ('Patient', adata.obs['donor']),\n",
    "    ('Patient_Number', adata.obs['sample']),\n",
    "    ('age', adata.obs['development_stage']),\n",
    "    ('sex', adata.obs['sex']),\n",
    "    ('ethnicity', adata.obs['ethnicity']),\n",
    "    ('health_status', 'NaN'),\n",
    "    ('original_celltype_1', adata.obs['cell_type']),\n",
    "    ('original_celltype_2', 'NaN'),\n",
    "    ('original_celltype_3', 'NaN'),\n",
    "]\n",
    "for column, value in harmonised_columns:\n",
    "    adata.obs[column] = value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 10-2_2-Heart-Teichmann-2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## 10-2_2-Heart-Teichmann-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/path/to/repo/b52eb423-5d0d-4645-b217-e1c6d38b2e72/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = ['d4e69e01-3ba2-4d6b-a15d-e7048f78f22e.h5ad']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "d4e69e01-3ba2-4d6b-a15d-e7048f78f22e.h5ad\n"
     ]
    }
   ],
   "source": [
    "# Read every selected .h5ad file, tag its cells with the source file name and\n",
    "# stack the files into one AnnData (join='outer' keeps the union of genes).\n",
    "# Starting from None ensures the `adata` of the previous section is never\n",
    "# reused by accident when `files` is empty.\n",
    "adata = None\n",
    "for file_name in files:\n",
    "    print(file_name)\n",
    "    # join() works whether or not `path` ends with a path separator\n",
    "    u = sc.read_h5ad(join(path, file_name))\n",
    "    u.obs['id'] = file_name\n",
    "    adata = u if adata is None else adata.concatenate(u, join='outer')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "# .sum(1) on a sparse (or np.matrix) X returns an (n_cells, 1) matrix; flatten it\n",
    "# so that every obs column receives a plain 1-D array.\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 184 cells that have more than 5000 genes expressed\n",
      "filtered out 7608 genes that are detected in less than 100 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 1500)\n",
    "sc.pp.filter_cells(adata, max_genes = 5000)\n",
    "# Keep only genes detected in at least 100 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the gene symbols stored in var['feature_name'] as the var index, converted\n",
    "# to plain strings. The 'feature_name' column itself is left in place.\n",
    "gene_symbols = pd.Index(adata.var['feature_name']).astype(str)\n",
    "adata.var.index = gene_symbols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes: flag them by their 'MT-' symbol prefix\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names], dtype=bool)\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense (the following cells operate on a dense matrix);\n",
    "# the guard keeps this cell re-runnable once X is already dense\n",
    "if sp.sparse.issparse(adata.X):\n",
    "    adata.X = adata.X.todense()\n",
    "\n",
    "# manually define mt score: mitochondrial counts divided by the per-cell\n",
    "# 'n_counts' column already stored in adata.obs\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 10% mito fraction\n",
    "# .copy() materialises the subset: a bare slice is only a view of the parent\n",
    "# object, and later writes to adata.obs raise an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.10].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:47): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:02:30)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:01:46)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 28 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:04:28)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "# library() stops with an error if scran is not installed, whereas require()\n",
    "# only warns and lets the next line fail with a less obvious message.\n",
    "library(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:03)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:45)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:03:48)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:09:33)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cell', 'CD14-positive monocyte',\n",
       "       'CD14-positive, CD16-positive monocyte',\n",
       "       'CD4-positive, alpha-beta cytotoxic T cell',\n",
       "       'CD8-positive, alpha-beta cytotoxic T cell',\n",
       "       'activated CD4-positive, alpha-beta T cell',\n",
       "       'activated CD8-positive, alpha-beta T cell',\n",
       "       'capillary endothelial cell', 'dendritic cell', 'endothelial cell',\n",
       "       'endothelial cell of artery', 'endothelial cell of lymphatic vessel',\n",
       "       'epicardial adipocyte', 'fibroblast', 'macrophage', 'mast cell',\n",
       "       'mature NK T cell', 'mesothelial cell', 'monocyte', 'native cell',\n",
       "       'natural killer cell', 'neural cell', 'pericyte cell',\n",
       "       'regular atrial cardiac myocyte', 'regular ventricular cardiac myocyte',\n",
       "       'smooth muscle cell', 'vein endothelial cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B cell', 'CD14-positive monocyte',\n",
    "       'CD14-positive, CD16-positive monocyte',\n",
    "       'CD4-positive, alpha-beta cytotoxic T cell',\n",
    "       'CD8-positive, alpha-beta cytotoxic T cell',\n",
    "       'activated CD4-positive, alpha-beta T cell',\n",
    "       'activated CD8-positive, alpha-beta T cell',\n",
    "       'capillary endothelial cell', 'dendritic cell', 'endothelial cell',\n",
    "       'endothelial cell of artery', 'endothelial cell of lymphatic vessel',\n",
    "       'epicardial adipocyte', 'fibroblast', 'macrophage', 'mast cell',\n",
    "       'mature NK T cell', 'mesothelial cell', 'monocyte', 'native cell',\n",
    "       'natural killer cell', 'neural cell', 'pericyte cell',\n",
    "       'regular atrial cardiac myocyte', 'regular ventricular cardiac myocyte',\n",
    "       'smooth muscle cell', 'vein endothelial cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fold fine-grained labels into one representative label per broad cell type.\n",
    "# Each entry is (label to keep, labels merged into it); applied in this order.\n",
    "celltype_merges = [\n",
    "    ('CD14-positive monocyte',\n",
    "     ['CD14-positive, CD16-positive monocyte', 'monocyte']),\n",
    "    ('CD4-positive, alpha-beta cytotoxic T cell',\n",
    "     ['CD8-positive, alpha-beta cytotoxic T cell',\n",
    "      'activated CD4-positive, alpha-beta T cell',\n",
    "      'activated CD8-positive, alpha-beta T cell']),\n",
    "    ('capillary endothelial cell',\n",
    "     ['endothelial cell', 'endothelial cell of artery', 'vein endothelial cell']),\n",
    "    ('regular atrial cardiac myocyte', ['regular ventricular cardiac myocyte']),\n",
    "    ('natural killer cell', ['mature NK T cell']),\n",
    "]\n",
    "for kept_label, merged_labels in celltype_merges:\n",
    "    ix = np.isin(ref_cluster, merged_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B cell', 'CD14-positive monocyte',\n",
    "      \n",
    "       'CD4-positive, alpha-beta cytotoxic T cell',\n",
    "     \n",
    "       'capillary endothelial cell', 'dendritic cell', \n",
    "       'endothelial cell of lymphatic vessel',\n",
    "       'epicardial adipocyte', 'fibroblast', 'macrophage', 'mast cell',\n",
    "       'mesothelial cell','native cell',\n",
    "       'natural killer cell', 'neural cell', 'pericyte cell',\n",
    "       'regular atrial cardiac myocyte',\n",
    "       'smooth muscle cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['B cells', 'Monocytes',\n",
    "      \n",
    "       'T cells',\n",
    "     \n",
    "       'Endothelial cells', 'Dendritic cells', \n",
    "       'Lymphatic endothelial cells',\n",
    "       'Adipocytes', 'Fibroblast cells', 'Macrophages', 'Mast cells',\n",
    "       'Mesothelial cells','Unknown',\n",
    "       'NK cells', 'Neurons', 'Pericytes',\n",
    "       'Cardiomyocytes',\n",
    "       'Smooth muscle cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Capitalise the sex labels. Renaming by name (rather than by position) cannot\n",
    "# swap the labels if the category order differs; unlisted categories are kept.\n",
    "adata.obs['sex'] = adata.obs['sex'].cat.rename_categories({'female': 'Female', 'male': 'Male'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every listed development stage is collapsed into a single category, which is\n",
    "# then labelled 'Adult'. Values outside this list become NaN.\n",
    "stage_labels = ['eighth decade human stage', 'fifth decade human stage',\n",
    "                'human late adulthood stage', 'human middle aged stage',\n",
    "                'seventh decade human stage', 'sixth decade human stage']\n",
    "ref_cluster = pd.Categorical(adata.obs['development_stage'], categories=stage_labels)\n",
    "\n",
    "# fold all other stages into the first one ...\n",
    "ix = np.isin(ref_cluster, stage_labels[1:])\n",
    "ref_cluster[ix] = stage_labels[0]\n",
    "\n",
    "# ... keep only that category and give it its final name\n",
    "adata.obs['development_stage'] = pd.Categorical(ref_cluster, categories=stage_labels[:1])\n",
    "adata.rename_categories('development_stage', ['Adult'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the sampled heart regions into apex / ventricle / atrium.\n",
    "tissue_labels = ['apex of heart', 'heart left ventricle', 'heart right ventricle',\n",
    "                 'interventricular septum', 'left cardiac atrium',\n",
    "                 'right cardiac atrium']\n",
    "ref_cluster = pd.Categorical(adata.obs['tissue'], categories=tissue_labels)\n",
    "\n",
    "# (label to keep, labels merged into it)\n",
    "tissue_merges = [\n",
    "    ('heart left ventricle', ['heart right ventricle', 'interventricular septum']),\n",
    "    ('left cardiac atrium', ['right cardiac atrium']),\n",
    "]\n",
    "for kept_label, merged_labels in tissue_merges:\n",
    "    ix = np.isin(ref_cluster, merged_labels)\n",
    "    ref_cluster[ix] = kept_label\n",
    "\n",
    "# the explicit category order below is what the positional rename relies on\n",
    "adata.obs['tissue'] = pd.Categorical(\n",
    "    ref_cluster, categories=['apex of heart', 'heart left ventricle', 'left cardiac atrium'])\n",
    "adata.rename_categories('tissue', ['Heart_Apex', 'Heart_Ventricle', 'Heart_Atrium'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Donor IDs are kept unchanged; only the category order is fixed.\n",
    "# reorder_categories() matches categories by name and raises if the set of\n",
    "# donors differs from this list, whereas a positional rename would silently\n",
    "# relabel donors whenever the stored category order is not the one listed here.\n",
    "donor_order = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11',\n",
    "               'H2', 'H3', 'H4', 'H5', 'H6', 'H7']\n",
    "adata.obs['donor'] = adata.obs['donor'].cat.reorder_categories(donor_order)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised obs annotations for this dataset; columns are written in the order listed.\n",
    "# Fields without a value in this dataset are set to the literal string 'NaN'.\n",
    "harmonised_obs = {\n",
    "    'Organ': 'Heart',\n",
    "    'Organ_Specific': adata.obs['tissue'],\n",
    "    'Dataset': 'Teichmann_Heart',\n",
    "    'InternDatasetNumber': '10-2_2-Heart-Teichmann-2020',\n",
    "    'Dataset_status': 'Healthy_Dataset',\n",
    "\n",
    "    'celltype': adata.obs['celltype'],\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "\n",
    "    'Patient': adata.obs['donor'],\n",
    "    'Patient_Number': adata.obs['sample'],\n",
    "    'age': adata.obs['development_stage'],\n",
    "    'sex': adata.obs['sex'],\n",
    "    'ethnicity': adata.obs['ethnicity'],\n",
    "    'health_status': 'NaN',\n",
    "\n",
    "    'original_celltype_1': adata.obs['cell_type'],\n",
    "    'original_celltype_2': 'NaN',\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "for obs_key, obs_value in harmonised_obs.items():\n",
    "    adata.obs[obs_key] = obs_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 10-2_3-Heart-Teichmann-2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## 10-2_3-Heart-Teichmann-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/path/to/repo/b52eb423-5d0d-4645-b217-e1c6d38b2e72/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [],
   "source": [
    "# .h5ad file names for this dataset (presumably located under `path` - confirm).\n",
    "# NOTE(review): no cell shown here reads these files into `adata` before `adata` is\n",
    "# used below (execution count jumps from 162 to 164); the loading/concatenation step\n",
    "# appears to be missing from the notebook - restore it so Restart & Run All works.\n",
    "files = ['f75f2ff4-2884-4c2d-b375-70de37a34507.h5ad',\n",
    " '572f3f3e-d3e4-4d13-8e2b-88215e508481.h5ad',\n",
    " '78fd69d2-75e4-4207-819a-563139f273c6.h5ad',\n",
    " '9d584fcb-a28a-4b91-a886-ceb66a88ef81.h5ad']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 383824 × 33178\n",
       "    obs: 'NRP', 'cell_source', 'donor', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'sample', 'scrublet_score', 'type', 'cell_states', 'Used', 'disease_ontology_term_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_type', 'ethnicity_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'ethnicity', 'development_stage', 'id', 'source', 'cell_type_original', 'batch'\n",
       "    var: 'feature_biotype', 'feature_is_filtered', 'feature_name', 'feature_reference'\n",
       "    obsm: 'X_pca', 'X_umap'"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='10-2_3-Heart-Teichmann-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate QC covariates.\n",
    "# Summing a sparse matrix along axis 1 returns an (n_obs, 1) np.matrix; flatten it\n",
    "# so every obs column receives a plain 1-D array (also works if X is dense).\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 650 cells that have more than 2000 counts\n",
      "filtered out 69 cells that have more than 5000 genes expressed\n",
      "filtered out 5535 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 2000)\n",
    "sc.pp.filter_cells(adata, max_genes = 5000)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index the genes by the values of `feature_name`, stored as plain strings\n",
    "gene_symbols = pd.Index(adata.var['feature_name'])\n",
    "adata.var.index = gene_symbols.astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag mitochondrial genes (names starting with 'MT-'); the mask is built once and reused\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names], dtype=bool)\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# Per-cell fraction of counts coming from mitochondrial genes.\n",
    "# The row sum is computed a single time and flattened to 1-D so that it divides\n",
    "# element-wise by the stored n_counts.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Filter out cells with over 10% mito fraction.\n",
    "# .copy() materialises the subset so later cells modify a real AnnData, not a view.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.10].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:01:47): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:04:03)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:01:30)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 23 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:02:23)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 186,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:05)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:40)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:02:49)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:07:03)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['capillary endothelial cell', 'endothelial cell',\n",
       "       'endothelial cell of artery', 'endothelial cell of lymphatic vessel',\n",
       "       'epicardial adipocyte', 'fibroblast', 'mesothelial cell',\n",
       "       'pericyte cell', 'regular ventricular cardiac myocyte',\n",
       "       'smooth muscle cell', 'vein endothelial cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 192,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['capillary endothelial cell', 'endothelial cell',\n",
    "       'endothelial cell of artery', 'endothelial cell of lymphatic vessel',\n",
    "       'epicardial adipocyte', 'fibroblast', 'mesothelial cell',\n",
    "       'pericyte cell', 'regular ventricular cardiac myocyte',\n",
    "       'smooth muscle cell', 'vein endothelial cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['endothelial cell',  'endothelial cell of artery',  'vein endothelial cell'])\n",
    "ref_cluster[ix]='capillary endothelial cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['capillary endothelial cell',\n",
    "       'endothelial cell of lymphatic vessel',\n",
    "       'epicardial adipocyte', 'fibroblast', 'mesothelial cell',\n",
    "       'pericyte cell', 'regular ventricular cardiac myocyte',\n",
    "       'smooth muscle cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename the celltype categories to the harmonised names (eight names are given).\n",
    "# The new names are presumably matched to the existing categories by position.\n",
    "# NOTE(review): under that assumption 'mesothelial cell' (fifth category set in the\n",
    "# previous cell) becomes 'Mesenchymal stromal cells' - confirm this label is intended.\n",
    "adata.rename_categories('celltype', ['Endothelial cells',\n",
    "       'Lymphatic endothelial cells',\n",
    "       'Adipocytes', 'Fibroblast cells', 'Mesenchymal stromal cells',\n",
    "       'Pericytes', 'Cardiomyocytes',\n",
    "       'Smooth muscle cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `rename_categories` assigns the new names by position: pin the category order first\n",
    "# (`reorder_categories` raises if the existing categories differ from this list).\n",
    "adata.obs['sex'] = adata.obs['sex'].cat.reorder_categories(['female', 'male'])\n",
    "adata.rename_categories('sex', ['Female', 'Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fold every listed age bracket into a single category, then rename it to 'Adult'\n",
    "age_brackets = ['eighth decade human stage', 'fifth decade human stage',\n",
    "                'human late adulthood stage', 'human middle aged stage',\n",
    "                'seventh decade human stage', 'sixth decade human stage']\n",
    "kept_bracket = age_brackets[0]\n",
    "\n",
    "ref_cluster = pd.Categorical(adata.obs['development_stage'], categories=age_brackets)\n",
    "ref_cluster[np.isin(ref_cluster, age_brackets[1:])] = kept_bracket\n",
    "\n",
    "adata.obs['development_stage'] = pd.Categorical(ref_cluster, categories=[kept_bracket])\n",
    "adata.rename_categories('development_stage', ['Adult'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 202,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the six sampled heart regions into three groups:\n",
    "# right ventricle + interventricular septum -> left ventricle, right atrium -> left atrium\n",
    "region_merges = {'heart right ventricle': 'heart left ventricle',\n",
    "                 'interventricular septum': 'heart left ventricle',\n",
    "                 'right cardiac atrium': 'left cardiac atrium'}\n",
    "\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['tissue'],\n",
    "    categories=['apex of heart', 'heart left ventricle', 'heart right ventricle',\n",
    "                'interventricular septum', 'left cardiac atrium', 'right cardiac atrium'])\n",
    "for merged_region, kept_region in region_merges.items():\n",
    "    ref_cluster[ref_cluster == merged_region] = kept_region\n",
    "\n",
    "# Keep only the three merged groups as categories, then give them their harmonised names\n",
    "adata.obs['tissue'] = pd.Categorical(\n",
    "    ref_cluster, categories=['apex of heart', 'heart left ventricle', 'left cardiac atrium'])\n",
    "adata.rename_categories('tissue', ['Heart_Apex', 'Heart_Ventricle', 'Heart_Atrium'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `rename_categories` maps old -> new names by position, so pin the category order\n",
    "# first; `reorder_categories` raises if the set of donors differs from this list.\n",
    "donor_ids = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11', 'H2', 'H3', 'H4', 'H5',\n",
    "             'H6', 'H7']\n",
    "adata.obs['donor'] = adata.obs['donor'].cat.reorder_categories(donor_ids)\n",
    "adata.rename_categories('donor', donor_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised obs annotations for this dataset; columns are written in the order listed.\n",
    "# Fields without a value in this dataset are set to the literal string 'NaN'.\n",
    "harmonised_obs = {\n",
    "    'Organ': 'Heart',\n",
    "    'Organ_Specific': adata.obs['tissue'],\n",
    "    'Dataset': 'Teichmann_Heart',\n",
    "    'InternDatasetNumber': '10-2_3-Heart-Teichmann-2020',\n",
    "    'Dataset_status': 'Healthy_Dataset',\n",
    "\n",
    "    'celltype': adata.obs['celltype'],\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "\n",
    "    'Patient': adata.obs['donor'],\n",
    "    'Patient_Number': adata.obs['sample'],\n",
    "    'age': adata.obs['development_stage'],\n",
    "    'sex': adata.obs['sex'],\n",
    "    'ethnicity': adata.obs['ethnicity'],\n",
    "    'health_status': 'NaN',\n",
    "\n",
    "    'original_celltype_1': adata.obs['cell_type'],\n",
    "    'original_celltype_2': 'NaN',\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "for obs_key, obs_value in harmonised_obs.items():\n",
    "    adata.obs[obs_key] = obs_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 206,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 10-3-Heart-Pisco-2022"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "##  10-3-Heart-Pisco-2022 (human) - Pisco Multiple Organs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_pisco.obs['tissue'],['cardiac atrium', 'cardiac ventricle'])\n",
    "adata=adata_pisco[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='10-3-Heart-Pisco-2022'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.calculate_qc_metrics(adata, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate QC covariates.\n",
    "# Summing a sparse matrix along axis 1 returns an (n_obs, 1) np.matrix; flatten it\n",
    "# so every obs column receives a plain 1-D array (also works if X is dense).\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(axis=1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 36195 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "# Cell-level filters are disabled (commented out) in this cell:\n",
    "#sc.pp.filter_cells(adata, max_counts = 13000)\n",
    "#sc.pp.filter_cells(adata, max_genes = 7500)\n",
    "# Min 20 cells - filters out low count genes\n",
    "# NOTE(review): leftover scratch notes, presumably about other min_cells values tried:\n",
    "# 500 works; 450 (30494); 400 (29837); 300 (28268) not - meaning unclear, confirm or remove.\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag mitochondrial genes (names starting with 'MT-'); the mask is built once and reused\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names], dtype=bool)\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# Per-cell fraction of counts coming from mitochondrial genes.\n",
    "# The row sum is computed a single time and flattened to 1-D so that it divides\n",
    "# element-wise by the stored n_counts.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(axis=1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Filter out cells with over 20% mito fraction.\n",
    "# .copy() materialises the subset so later cells modify a real AnnData, not a view.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 13 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "R[write to console]: Loading required package: scran\n",
      "\n",
      "R[write to console]: Loading required package: SingleCellExperiment\n",
      "\n",
      "R[write to console]: Loading required package: SummarizedExperiment\n",
      "\n",
      "R[write to console]: Loading required package: GenomicRanges\n",
      "\n",
      "R[write to console]: Loading required package: stats4\n",
      "\n",
      "R[write to console]: Loading required package: BiocGenerics\n",
      "\n",
      "R[write to console]: Loading required package: parallel\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘BiocGenerics’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:parallel’:\n",
      "\n",
      "    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,\n",
      "    clusterExport, clusterMap, parApply, parCapply, parLapply,\n",
      "    parLapplyLB, parRapply, parSapply, parSapplyLB\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:stats’:\n",
      "\n",
      "    IQR, mad, sd, var, xtabs\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:base’:\n",
      "\n",
      "    anyDuplicated, append, as.data.frame, basename, cbind, colnames,\n",
      "    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,\n",
      "    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,\n",
      "    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,\n",
      "    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,\n",
      "    union, unique, unsplit, which, which.max, which.min\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: S4Vectors\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘S4Vectors’\n",
      "\n",
      "\n",
      "R[write to console]: The following object is masked from ‘package:base’:\n",
      "\n",
      "    expand.grid\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: IRanges\n",
      "\n",
      "R[write to console]: Loading required package: GenomeInfoDb\n",
      "\n",
      "R[write to console]: Loading required package: Biobase\n",
      "\n",
      "R[write to console]: Welcome to Bioconductor\n",
      "\n",
      "    Vignettes contain introductory material; view with\n",
      "    'browseVignettes()'. To cite Bioconductor, see\n",
      "    'citation(\"Biobase\")', and for packages 'citation(\"pkgname\")'.\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: DelayedArray\n",
      "\n",
      "R[write to console]: Loading required package: matrixStats\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘matrixStats’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:Biobase’:\n",
      "\n",
      "    anyMissing, rowMedians\n",
      "\n",
      "\n",
      "R[write to console]: Loading required package: BiocParallel\n",
      "\n",
      "R[write to console]: \n",
      "Attaching package: ‘DelayedArray’\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:matrixStats’:\n",
      "\n",
      "    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges\n",
      "\n",
      "\n",
      "R[write to console]: The following objects are masked from ‘package:base’:\n",
      "\n",
      "    aperm, apply, rowsum\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['cardiac endothelial cell', 'cardiac muscle cell',\n",
       "       'fibroblast of cardiac tissue', 'hepatocyte', 'macrophage',\n",
       "       'smooth muscle cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['cardiac endothelial cell', 'cardiac muscle cell',\n",
    "       'fibroblast of cardiac tissue', 'hepatocyte', 'macrophage',\n",
    "       'smooth muscle cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['cardiac endothelial cell', 'cardiac muscle cell',\n",
    "       'fibroblast of cardiac tissue', 'hepatocyte', 'macrophage',\n",
    "       'smooth muscle cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Endothelial cells', 'Cardiomyocytes',\n",
    "       'Fibroblast cells', 'Unknown', 'Macrophages',\n",
    "       'Smooth muscle cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `rename_categories` assigns the new names by position: pin the category order first\n",
    "# (`reorder_categories` raises if the existing categories differ from this list).\n",
    "adata.obs['tissue'] = adata.obs['tissue'].cat.reorder_categories(\n",
    "    ['cardiac atrium', 'cardiac ventricle'])\n",
    "adata.rename_categories('tissue', ['Heart_Atrium', 'Heart_Ventricle'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check that 'male' is the only category before the positional rename\n",
    "# (`reorder_categories` raises if the existing categories differ from this list).\n",
    "adata.obs['sex'] = adata.obs['sex'].cat.reorder_categories(['male'])\n",
    "adata.rename_categories('sex', ['Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check the single expected category before the positional rename\n",
    "# (`reorder_categories` raises if the existing categories differ from this list).\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity'].cat.reorder_categories(\n",
    "    ['Hispanic or Latin American'])\n",
    "adata.rename_categories('ethnicity', ['Hispanic or Latin-American'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shorten the development-stage label to the age in years (positional rename).\n",
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['development_stage'],\n",
    "    categories=['74-year-old human stage'],\n",
    ")\n",
    "adata.rename_categories(\n",
    "    'development_stage',\n",
    "    ['74'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Donor label is kept as-is; the rename maps 'TSP12' onto itself.\n",
    "adata.obs['donor'].cat.categories\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['donor'],\n",
    "    categories=['TSP12'],\n",
    ")\n",
    "adata.rename_categories(\n",
    "    'donor',\n",
    "    ['TSP12'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised obs schema: plain strings are broadcast to every cell, Series are copied\n",
    "# from existing columns, and the string 'NaN' marks fields that are not available.\n",
    "# Columns are written in dict order.\n",
    "harmonised_obs = {\n",
    "    'Organ': 'Heart',\n",
    "    'Organ_Specific': adata.obs['tissue'],\n",
    "    'Dataset': 'Pisco_Heart',\n",
    "    'InternDatasetNumber': '10-3-Heart-Pisco-2022',\n",
    "    'Dataset_status': 'Healthy_Dataset',\n",
    "    'celltype': adata.obs['celltype'],\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "    'Patient': adata.obs['donor'],\n",
    "    'Patient_Number': 'NaN',\n",
    "    'age': adata.obs['development_stage'],\n",
    "    'sex': adata.obs['sex'],\n",
    "    'ethnicity': adata.obs['ethnicity'],\n",
    "    'health_status': 'NaN',\n",
    "    'original_celltype_1': adata.obs['cell_type'],\n",
    "    'original_celltype_2': 'NaN',\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "for column, value in harmonised_obs.items():\n",
    "    adata.obs[column] = value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the expression matrix in compressed sparse row (CSR) format\n",
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make the cell identifiers (obs names) unique\n",
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# 11-Blood"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 11-2-Blood-Pisco-2022"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## 11-2-Blood-Pisco-2022 (human) - Pisco Multiple Organs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): QC-metric computation is disabled here, yet the mitochondrial-fraction cell\n",
    "# further down reads adata.obs['n_counts'] - confirm where that column is created.\n",
    "#sc.pp.calculate_qc_metrics(adata, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<50115x58559 sparse matrix of type '<class 'numpy.float32'>'\n",
       "\twith 94755802 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the expression matrix summary (shape, dtype, number of stored values)\n",
    "adata.X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<50054x23807 sparse matrix of type '<class 'numpy.float32'>'\n",
       "\twith 94045592 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the expression matrix summary again.\n",
    "# NOTE(review): the saved output is smaller than in the previous cell, but no cell/gene\n",
    "# filtering step is present in between - confirm which filters were applied.\n",
    "adata.X"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag mitochondrial genes (gene symbols starting with 'MT-')\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert the sparse matrix to dense: the size-factor normalisation further down divides\n",
    "# adata.X in place. It is converted back to CSR at the end of this section.\n",
    "adata.X = adata.X.todense()\n",
    "\n",
    "# Mitochondrial fraction = counts in MT genes / total counts per cell.\n",
    "# The row sum of the dense matrix is (n_cells, 1); flatten it to 1-D.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep cells with a mitochondrial fraction below 5%.\n",
    "# NOTE(review): the previous comment said 20% while the code used 0.05 - confirm the threshold.\n",
    "# .copy() materialises the subset so that later writes to .obs do not hit a view\n",
    "# (which raised an ImplicitModificationWarning).\n",
    "adata = adata[adata.obs['mt_frac'] < 0.05].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on a copy so the preliminary normalisation/clustering leaves adata untouched\n",
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:05): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:07)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 15 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:05)\n"
     ]
    }
   ],
   "source": [
    "# Coarse clustering on the temporary copy; the cluster labels are passed to scran below\n",
    "# Scale every cell to 1e6 total counts, then log1p-transform\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "# 15-component PCA -> neighbour graph -> Louvain clusters stored in obs['groups']\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare the inputs handed to the R cell for scran normalization\n",
    "input_groups = adata_pp.obs['groups']  # Louvain cluster label per cell\n",
    "data_mat = adata.X.T  # transposed expression matrix (AnnData stores cells as rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "# R cell (rpy2 magic): data_mat and input_groups are passed in, size_factors is passed back to Python\n",
    "require(scran)\n",
    "# scran size factors, computed with the supplied cluster labels\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "# The temporary clustering copy is no longer needed\n",
    "del adata_pp\n",
    "# Attach the size factors returned by the R cell to the cells of adata\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert string-valued annotation columns to categoricals\n",
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep an untouched copy of adata.X in the 'counts' layer for downstream analysis,\n",
    "# taken before the normalisation below modifies adata.X\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Snapshot the current, not yet normalised, state in adata.raw\n",
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize data: divide each cell (row) by its size factor, then log1p-transform\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:04)\n"
     ]
    }
   ],
   "source": [
    "# Select the 4000 most variable genes (cell_ranger flavour) without subsetting the object\n",
    "# (subset=False); the PCA below is restricted to them via use_highly_variable=True\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:08)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:12)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:40)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations: 50-component PCA on the highly variable genes,\n",
    "# neighbour graph, then UMAP\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make annotations consistent across datasets: start from a copy of the original\n",
    "# 'cell_type' labels, which the following cells merge and rename\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['CD141-positive myeloid dendritic cell',\n",
       "       'CD4-positive, alpha-beta T cell',\n",
       "       'CD4-positive, alpha-beta memory T cell',\n",
       "       'CD8-positive, alpha-beta T cell',\n",
       "       'CD8-positive, alpha-beta cytokine secreting effector T cell', 'T cell',\n",
       "       'basophil', 'classical monocyte', 'common myeloid progenitor',\n",
       "       'erythrocyte', 'granulocyte', 'hematopoietic stem cell', 'macrophage',\n",
       "       'mature NK T cell', 'memory B cell', 'monocyte', 'naive B cell',\n",
       "       'naive thymus-derived CD4-positive, alpha-beta T cell', 'neutrophil',\n",
       "       'non-classical monocyte', 'plasma cell', 'plasmablast',\n",
       "       'plasmacytoid dendritic cell', 'platelet', 'type I NK T cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 160,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the cell-type labels present before harmonisation\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Editable Categorical of the original labels with all 25 categories declared explicitly.\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['celltype'],\n",
    "    categories=[\n",
    "        'CD141-positive myeloid dendritic cell',\n",
    "        'CD4-positive, alpha-beta T cell',\n",
    "        'CD4-positive, alpha-beta memory T cell',\n",
    "        'CD8-positive, alpha-beta T cell',\n",
    "        'CD8-positive, alpha-beta cytokine secreting effector T cell',\n",
    "        'T cell',\n",
    "        'basophil',\n",
    "        'classical monocyte',\n",
    "        'common myeloid progenitor',\n",
    "        'erythrocyte',\n",
    "        'granulocyte',\n",
    "        'hematopoietic stem cell',\n",
    "        'macrophage',\n",
    "        'mature NK T cell',\n",
    "        'memory B cell',\n",
    "        'monocyte',\n",
    "        'naive B cell',\n",
    "        'naive thymus-derived CD4-positive, alpha-beta T cell',\n",
    "        'neutrophil',\n",
    "        'non-classical monocyte',\n",
    "        'plasma cell',\n",
    "        'plasmablast',\n",
    "        'plasmacytoid dendritic cell',\n",
    "        'platelet',\n",
    "        'type I NK T cell',\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained annotations: each key is the label that is kept,\n",
    "# each value lists the labels folded into it.\n",
    "merge_into = {\n",
    "    'CD141-positive myeloid dendritic cell': ['plasmacytoid dendritic cell'],\n",
    "    'CD4-positive, alpha-beta T cell': [\n",
    "        'CD4-positive, alpha-beta memory T cell',\n",
    "        'CD8-positive, alpha-beta T cell',\n",
    "        'CD8-positive, alpha-beta cytokine secreting effector T cell',\n",
    "        'T cell',\n",
    "        'naive thymus-derived CD4-positive, alpha-beta T cell',\n",
    "        'type I NK T cell',\n",
    "    ],\n",
    "    'classical monocyte': ['monocyte', 'non-classical monocyte'],\n",
    "    'memory B cell': ['naive B cell'],\n",
    "    'plasma cell': ['plasmablast'],\n",
    "}\n",
    "for kept_label, folded_labels in merge_into.items():\n",
    "    ix = np.isin(ref_cluster, folded_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the merged labels, declaring only the 14 categories that remain after merging.\n",
    "adata.obs['celltype'] = pd.Categorical(\n",
    "    ref_cluster,\n",
    "    categories=[\n",
    "        'CD141-positive myeloid dendritic cell',\n",
    "        'CD4-positive, alpha-beta T cell',\n",
    "        'basophil',\n",
    "        'classical monocyte',\n",
    "        'common myeloid progenitor',\n",
    "        'erythrocyte',\n",
    "        'granulocyte',\n",
    "        'hematopoietic stem cell',\n",
    "        'macrophage',\n",
    "        'mature NK T cell',\n",
    "        'memory B cell',\n",
    "        'neutrophil',\n",
    "        'plasma cell',\n",
    "        'platelet',\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positional rename to the harmonised cell-type vocabulary\n",
    "# (the i-th name replaces the i-th existing category).\n",
    "adata.rename_categories(\n",
    "    'celltype',\n",
    "    [\n",
    "        'Dendritic cells',\n",
    "        'T cells',\n",
    "        'Basophil cells',\n",
    "        'Monocytes',\n",
    "        'Common myeloid progenitor cells',\n",
    "        'Erythroid cells',\n",
    "        'Granulocyte cells',\n",
    "        'Hematopoietic stem cells',\n",
    "        'Macrophages',\n",
    "        'NK cells',\n",
    "        'B cells',\n",
    "        'Neutrophils',\n",
    "        'Plasma cells',\n",
    "        'Thrombocytes',\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Capitalise the tissue label (positional rename).\n",
    "adata.obs['tissue'].cat.categories\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['tissue'],\n",
    "    categories=['blood'],\n",
    ")\n",
    "adata.rename_categories(\n",
    "    'tissue',\n",
    "    ['Blood'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Capitalise the sex labels (positional rename: new names follow the existing category order).\n",
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['sex'],\n",
    "    categories=['female', 'male'],\n",
    ")\n",
    "adata.rename_categories(\n",
    "    'sex',\n",
    "    ['Female', 'Male'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonise the spelling of the ethnicity labels (positional rename).\n",
    "adata.obs['ethnicity'].cat.categories\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['ethnicity'],\n",
    "    categories=[\n",
    "        'African American or Afro-Caribbean',\n",
    "        'European',\n",
    "        'Hispanic or Latin-American',\n",
    "    ],\n",
    ")\n",
    "adata.rename_categories(\n",
    "    'ethnicity',\n",
    "    [\n",
    "        'African-American or Afro-Caribbean',\n",
    "        'European',\n",
    "        'Hispanic or Latin-American',\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shorten the development-stage labels to the age in years (positional rename).\n",
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['development_stage'],\n",
    "    categories=[\n",
    "        '33-year-old human stage',\n",
    "        '56-year-old human stage',\n",
    "        '59-year-old human stage',\n",
    "        '61-year-old human stage',\n",
    "        '69-year-old human stage',\n",
    "    ],\n",
    ")\n",
    "adata.rename_categories(\n",
    "    'development_stage',\n",
    "    ['33', '56', '59', '61', '69'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Donor IDs are kept unchanged. A positional adata.rename_categories() with these same\n",
    "# names would silently swap donors whenever the stored category order differs from the\n",
    "# list (a plain string sort gives 'TSP1', 'TSP10', 'TSP14', 'TSP2', ...). Only the\n",
    "# category ORDER is enforced here; the labels attached to each cell never change.\n",
    "# reorder_categories raises ValueError if the donors present are not exactly this list.\n",
    "donor_order = ['TSP1', 'TSP2', 'TSP7', 'TSP8', 'TSP10', 'TSP14']\n",
    "adata.obs['donor'] = adata.obs['donor'].cat.reorder_categories(donor_order)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised obs schema: plain strings are broadcast to every cell, Series are copied\n",
    "# from existing columns, and the string 'NaN' marks fields that are not available.\n",
    "# Columns are written in dict order.\n",
    "harmonised_obs = {\n",
    "    'Organ': 'Blood',\n",
    "    'Organ_Specific': adata.obs['tissue'],\n",
    "    'Dataset': 'Pisco_Blood',\n",
    "    'InternDatasetNumber': '11-2-Blood-Pisco-2022',\n",
    "    'Dataset_status': 'Healthy_Dataset',\n",
    "    'celltype': adata.obs['celltype'],\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "    'Patient': adata.obs['donor'],\n",
    "    'Patient_Number': 'NaN',\n",
    "    'age': adata.obs['development_stage'],\n",
    "    'sex': adata.obs['sex'],\n",
    "    'ethnicity': adata.obs['ethnicity'],\n",
    "    'health_status': 'NaN',\n",
    "    'original_celltype_1': adata.obs['cell_type'],\n",
    "    'original_celltype_2': 'NaN',\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "for column, value in harmonised_obs.items():\n",
    "    adata.obs[column] = value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the expression matrix in compressed sparse row (CSR) format again\n",
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make the cell identifiers (obs names) unique\n",
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 11-5-Blood-Han-2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## 11-5-Blood-Han-2020-RubenBrabenec_HealthyProject"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 771,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restrict the Han et al. object to adult peripheral blood cells\n",
    "ix = np.isin(\n",
    "    adata_han.obs['sub_tissue'],\n",
    "    ['AdultPeripheralBlood'],\n",
    ")\n",
    "adata = adata_han[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 772,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag every cell with the internal dataset identifier\n",
    "adata.obs['InternDatasetNumber'] ='11-5-Blood-Han-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 776,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate QC covariates per cell.\n",
    "# Summing a sparse matrix along axis 1 returns an (n_cells, 1) np.matrix; flatten it to\n",
    "# 1-D so it is stored as an ordinary obs column regardless of the pandas version.\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 779,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 11 cells that have more than 3000 counts\n",
      "filtered out 1 cells that have more than 1500 genes expressed\n",
      "filtered out 16679 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# Filter parameters: drop cells with more than 3000 counts or more than 1500 detected genes\n",
    "sc.pp.filter_cells(adata, max_counts = 3000)\n",
    "sc.pp.filter_cells(adata, max_genes = 1500)\n",
    "# Keep only genes detected in at least 20 cells\n",
    "sc.pp.filter_genes(adata, min_cells=20) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 780,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flag mitochondrial genes (gene symbols starting with 'MT-')\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert the sparse matrix to dense: the size-factor normalisation further down divides\n",
    "# adata.X in place. It is converted back to CSR at the end of this section.\n",
    "adata.X = adata.X.todense()\n",
    "\n",
    "# Mitochondrial fraction = counts in MT genes / total counts per cell.\n",
    "# The row sum of the dense matrix is (n_cells, 1); flatten it to 1-D.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep cells with a mitochondrial fraction below 20%.\n",
    "# .copy() materialises the subset so that later writes to .obs do not hit a view\n",
    "# (which raised an ImplicitModificationWarning).\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 781,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work on a copy so the preliminary normalisation/clustering leaves adata untouched\n",
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 782,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 11 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "# Coarse clustering on the temporary copy; the cluster labels are passed to scran below\n",
    "# Scale every cell to 1e6 total counts, then log1p-transform\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "# 15-component PCA -> neighbour graph -> Louvain clusters stored in obs['groups']\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 783,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare the inputs handed to the R cell for scran normalization\n",
    "input_groups = adata_pp.obs['groups']  # Louvain cluster label per cell\n",
    "data_mat = adata.X.T  # transposed expression matrix (AnnData stores cells as rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 784,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "# R cell (rpy2 magic): data_mat and input_groups are passed in, size_factors is passed back to Python\n",
    "require(scran)\n",
    "# scran size factors, computed with the supplied cluster labels\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 785,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "# The temporary clustering copy is no longer needed\n",
    "del adata_pp\n",
    "# Attach the size factors returned by the R cell to the cells of adata\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 786,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert string-valued annotation columns to categoricals\n",
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 787,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep an untouched copy of adata.X in the 'counts' layer for downstream analysis,\n",
    "# taken before the normalisation below modifies adata.X\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 788,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Snapshot the current, not yet normalised, state in adata.raw\n",
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 789,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize data: divide each cell (row) by its size factor, then log1p-transform\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n"
     ]
    }
   ],
   "source": [
    "# Select the 4000 most variable genes (cell_ranger flavour) without subsetting the object\n",
    "# (subset=False); the PCA below is restricted to them via use_highly_variable=True\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 791,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:09)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations: 50-component PCA on the highly variable genes,\n",
    "# neighbour graph, then UMAP\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 793,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cell', 'B cell(Centrocyte)', 'B cell(Plasmocyte)',\n",
       "       'B cell(Plasmocyte)_IGHG4 high', 'B cell(Plasmocyte)_IGHM high',\n",
       "       'CD4_T cell', 'CD8+ T cell', 'CD8_T cell',\n",
       "       'Conventional dendritic cell', 'Dendritic cell',\n",
       "       'Dendritic cell_FCER1A high', 'Dendritic cell_LGALS2 high',\n",
       "       'Dendritic cell_WDFY4 high', 'Eosinophil', 'Macrophage',\n",
       "       'Macrophage_FCGR3A high', 'Monocyte', 'Monocyte_CXCR2 high',\n",
       "       'Monocyte_IGHG4 high', 'Monocyte_ISG15 high', 'Monocyte_S100A12 high',\n",
       "       'NK cell', 'Neutrophil_CAMP high', 'Neutrophil_DEFA3 high',\n",
       "       'Plasmacytoid dendritic cell', 'Proliferating  B cell',\n",
       "       'Proliferating T cell', 'T cell', 'T cell_GNLY high',\n",
       "       'T cell_TRAC high', 'activative T cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 793,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Make annotations consistent across datasets: start from a copy of the fine-grained\n",
    "# 'celltype_specific' labels and list the categories that need to be merged\n",
    "adata.obs['celltype']=adata.obs['celltype_specific'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 794,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Editable Categorical of the original labels with all 31 categories declared explicitly.\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['celltype'],\n",
    "    categories=[\n",
    "        'B cell',\n",
    "        'B cell(Centrocyte)',\n",
    "        'B cell(Plasmocyte)',\n",
    "        'B cell(Plasmocyte)_IGHG4 high',\n",
    "        'B cell(Plasmocyte)_IGHM high',\n",
    "        'CD4_T cell',\n",
    "        'CD8+ T cell',\n",
    "        'CD8_T cell',\n",
    "        'Conventional dendritic cell',\n",
    "        'Dendritic cell',\n",
    "        'Dendritic cell_FCER1A high',\n",
    "        'Dendritic cell_LGALS2 high',\n",
    "        'Dendritic cell_WDFY4 high',\n",
    "        'Eosinophil',\n",
    "        'Macrophage',\n",
    "        'Macrophage_FCGR3A high',\n",
    "        'Monocyte',\n",
    "        'Monocyte_CXCR2 high',\n",
    "        'Monocyte_IGHG4 high',\n",
    "        'Monocyte_ISG15 high',\n",
    "        'Monocyte_S100A12 high',\n",
    "        'NK cell',\n",
    "        'Neutrophil_CAMP high',\n",
    "        'Neutrophil_DEFA3 high',\n",
    "        'Plasmacytoid dendritic cell',\n",
    "        'Proliferating  B cell',\n",
    "        'Proliferating T cell',\n",
    "        'T cell',\n",
    "        'T cell_GNLY high',\n",
    "        'T cell_TRAC high',\n",
    "        'activative T cell',\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 795,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained annotations: each key is the label that is kept,\n",
    "# each value lists the labels folded into it.\n",
    "merge_into = {\n",
    "    'B cell': ['B cell(Centrocyte)', 'Proliferating  B cell'],\n",
    "    'B cell(Plasmocyte)': [\n",
    "        'B cell(Plasmocyte)_IGHG4 high',\n",
    "        'B cell(Plasmocyte)_IGHM high',\n",
    "    ],\n",
    "    'CD4_T cell': [\n",
    "        'CD8+ T cell',\n",
    "        'CD8_T cell',\n",
    "        'Proliferating T cell',\n",
    "        'T cell',\n",
    "        'T cell_GNLY high',\n",
    "        'T cell_TRAC high',\n",
    "        'activative T cell',\n",
    "    ],\n",
    "    'Conventional dendritic cell': [\n",
    "        'Dendritic cell',\n",
    "        'Dendritic cell_FCER1A high',\n",
    "        'Dendritic cell_LGALS2 high',\n",
    "        'Dendritic cell_WDFY4 high',\n",
    "        'Plasmacytoid dendritic cell',\n",
    "    ],\n",
    "    'Macrophage': ['Macrophage_FCGR3A high'],\n",
    "    'Monocyte': [\n",
    "        'Monocyte_CXCR2 high',\n",
    "        'Monocyte_IGHG4 high',\n",
    "        'Monocyte_ISG15 high',\n",
    "        'Monocyte_S100A12 high',\n",
    "    ],\n",
    "    'Neutrophil_CAMP high': ['Neutrophil_DEFA3 high'],\n",
    "}\n",
    "for kept_label, folded_labels in merge_into.items():\n",
    "    ix = np.isin(ref_cluster, folded_labels)\n",
    "    ref_cluster[ix] = kept_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 796,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the merged labels, declaring only the 9 categories that remain after merging.\n",
    "adata.obs['celltype'] = pd.Categorical(\n",
    "    ref_cluster,\n",
    "    categories=[\n",
    "        'B cell',\n",
    "        'B cell(Plasmocyte)',\n",
    "        'CD4_T cell',\n",
    "        'Conventional dendritic cell',\n",
    "        'Eosinophil',\n",
    "        'Macrophage',\n",
    "        'Monocyte',\n",
    "        'NK cell',\n",
    "        'Neutrophil_CAMP high',\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 797,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Positional rename to the harmonised cell-type vocabulary\n",
    "# (the i-th name replaces the i-th existing category).\n",
    "adata.rename_categories(\n",
    "    'celltype',\n",
    "    [\n",
    "        'B cells',\n",
    "        'Plasma cells',\n",
    "        'T cells',\n",
    "        'Dendritic cells',\n",
    "        'Eosinophils',\n",
    "        'Macrophages',\n",
    "        'Monocytes',\n",
    "        'NK cells',\n",
    "        'Neutrophils',\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 801,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relabel the sub-tissue as 'Blood' (positional rename).\n",
    "adata.obs['sub_tissue'].cat.categories\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['sub_tissue'],\n",
    "    categories=['AdultPeripheralBlood'],\n",
    ")\n",
    "adata.rename_categories(\n",
    "    'sub_tissue',\n",
    "    ['Blood'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 802,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Capitalise the sex label (positional rename).\n",
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['sex'],\n",
    "    categories=['male'],\n",
    ")\n",
    "adata.rename_categories(\n",
    "    'sex',\n",
    "    ['Male'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 803,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip the trailing 'Y' from the age labels (positional rename).\n",
    "adata.obs['age'].cat.categories\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['age'],\n",
    "    categories=['25Y', '27Y', '34Y'],\n",
    ")\n",
    "adata.rename_categories(\n",
    "    'age',\n",
    "    ['25', '27', '34'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 804,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prefix donor IDs with the study name (positional rename).\n",
    "adata.obs['donor'].cat.categories\n",
    "ref_cluster = pd.Categorical(\n",
    "    adata.obs['donor'],\n",
    "    categories=['Donor45', 'Donor47', 'Donor48'],\n",
    ")\n",
    "adata.rename_categories(\n",
    "    'donor',\n",
    "    ['Han-Donor45', 'Han-Donor47', 'Han-Donor48'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 805,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised obs schema: plain strings are broadcast to every cell, Series are copied\n",
    "# from existing columns, and the string 'NaN' marks fields that are not available.\n",
    "# Columns are written in dict order.\n",
    "harmonised_obs = {\n",
    "    'Organ': 'Blood',\n",
    "    'Organ_Specific': adata.obs['sub_tissue'],\n",
    "    'Dataset': 'Han_Blood',\n",
    "    'InternDatasetNumber': '11-5-Blood-Han-2020',\n",
    "    'Dataset_status': 'Healthy_Dataset',\n",
    "    'celltype': adata.obs['celltype'],\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "    'Patient': adata.obs['donor'],\n",
    "    'Patient_Number': 'NaN',\n",
    "    'age': adata.obs['age'],\n",
    "    'sex': adata.obs['sex'],\n",
    "    'ethnicity': 'NaN',\n",
    "    'health_status': 'NaN',\n",
    "    'original_celltype_1': adata.obs['celltype_specific'],\n",
    "    'original_celltype_2': adata.obs['celltype_global'],\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "for column, value in harmonised_obs.items():\n",
    "    adata.obs[column] = value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 807,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 11-6-Blood_ImmuneCells-Teichmann-2022"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## 11-6-Blood_ImmuneCells-Teichmann-2022-RubenBrabenec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_analysis.obs['tissue_major'], ['Blood'])\n",
    "adata=adata_analysis[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] =  adata.obs['tissue_major']\n",
    "adata.obs['Organ_Specific'] = adata.obs['sub_tissue']\n",
    "adata.obs['Dataset'] = adata.obs['Dataset']\n",
    "adata.obs['InternDatasetNumber'] = '11-6-Blood_ImmuneCells-Teichmann-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = adata.obs['Majority_voting_CellTypist_high']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": "true",
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# 12-BoneMarrow"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 12-1-BoneMarrow-VanGalen-2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "path = '/storage/groups/ml01/workspace/moritz.thomas/VanGalen2019/raw_data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "matrices = ['GSM3587996_BM1','GSM3587997_BM2','GSM3587998_BM3','GSM3588000_BM4',\n",
    "            'GSM3588002_BM5-34p','GSM3588003_BM5-34p38n']\n",
    "annotations = ['GSM3587996_BM1','GSM3587997_BM2','GSM3587999_BM3','GSM3588001_BM4',\n",
    "               'GSM3588002_BM5-34p','GSM3588003_BM5-34p38n']\n",
    "matrix_file_end = '.dem.txt'\n",
    "anno_file_end = '.anno.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the file names of the first data set & its annotation.\n",
    "# Files are read from the raw-data directory `path` defined at the top of this section.\n",
    "# NOTE: pop(0) removes the entry from the lists (the loading loop further down relies on\n",
    "# that), so this cell must be run exactly once after the file lists are (re)defined.\n",
    "sample = matrices.pop(0)\n",
    "data_file = path + sample + matrix_file_end\n",
    "\n",
    "anno = annotations.pop(0)\n",
    "anno_file = path + anno + anno_file_end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Load data\n",
    "adata=sc.read_text(data_file, delimiter='\\t', dtype='float32')\n",
    "adata=adata.transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Load annotation\n",
    "\n",
    "annotation = pd.read_csv(anno_file, delimiter='\\t')\n",
    "annotation.rename(columns={'Cell':'barcode'}, inplace=True)\n",
    "annotation.set_index('barcode', inplace=True)\n",
    "\n",
    "adata.obs = annotation\n",
    "adata.obs['sample'] = 'BM1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the remaining data sets and append each one to the main AnnData object.\n",
    "# `matrices` / `annotations` only hold the samples that are not loaded yet, because\n",
    "# the first entry of each list was popped when the first data set was read.\n",
    "for sample, anno in zip(matrices, annotations):\n",
    "\n",
    "    # Build file names relative to the raw-data directory `path`\n",
    "    data_file = path + sample + matrix_file_end\n",
    "    anno_file = path + anno + anno_file_end\n",
    "\n",
    "    # Load data and transpose it (presumably genes x cells on disk - TODO confirm)\n",
    "    adata_tmp = sc.read_text(data_file, delimiter='\\t', dtype='float32')\n",
    "    adata_tmp = adata_tmp.transpose()\n",
    "\n",
    "    # Annotate data: per-cell annotation table indexed by barcode\n",
    "    annotation_tmp = pd.read_csv(anno_file, delimiter='\\t')\n",
    "    annotation_tmp = annotation_tmp.rename(columns={'Cell': 'barcode'}).set_index('barcode')\n",
    "    adata_tmp.obs = annotation_tmp\n",
    "    # Sample label is the part after the accession prefix, e.g. 'GSM3587997_BM2' -> 'BM2'\n",
    "    adata_tmp.obs['sample'] = sample.split('_')[1]\n",
    "\n",
    "    # Concatenate to main adata object; the temporary batch column is dropped again\n",
    "    adata = adata.concatenate(adata_tmp, batch_key='sample_id')\n",
    "    adata.obs.drop(columns=['sample_id'], inplace=True)\n",
    "    adata.obs_names_make_unique(join='_')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {},
   "outputs": [],
   "source": [
    "barcodes = adata.obs_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [],
   "source": [
    "first_barcodes=barcodes[0:4677]\n",
    "first_barcodes_true = [c.split(\"-\")[0] for c in first_barcodes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [],
   "source": [
    "last_barcodes=barcodes[4677:7698]\n",
    "last_barcodes_true = [c.split(\"-\")[1] for c in last_barcodes]\n",
    "last_barcodes_true = ['BM5_' + s for s in last_barcodes_true]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_barcodes=np.append(first_barcodes_true, last_barcodes_true)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs.index=new_barcodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='12-1-BoneMarrow-VanGalen-2019'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate QC covariates as flat 1-D vectors.\n",
    "# adata.X is a scipy sparse matrix here, so `.sum(1)` returns an (n_cells, 1) np.matrix;\n",
    "# flatten it explicitly before storing it as an .obs column.\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# FILTER PARAMETERS#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 1700)\n",
    "sc.pp.filter_cells(adata, max_genes = 4200)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 220,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes: boolean mask over var_names and the matching gene names ('MT-' prefix)\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names], dtype=bool)\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense\n",
    "adata.X = sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score: fraction of each cell's counts coming from mt genes\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 20% mito fraction\n",
    "# .copy() turns the subset into a real AnnData object; without it adata is a view and\n",
    "# the next write to adata.obs raises an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 223,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 8 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 224,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 225,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 226,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 227,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 228,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 229,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 231,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 233,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:16)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "metadata": {},
   "outputs": [],
   "source": [
    "#sc.pl.umap(adata, color='celltype', palette=palette)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B', 'CTL', 'GMP', 'Mono', 'NK', 'Plasma', 'ProB', 'ProMono', 'Prog',\n",
       "       'T', 'cDC', 'earlyEry', 'lateEry', 'HSC', 'pDC'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 242,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype'] = adata.obs['CellType']\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 243,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B', 'CTL', 'GMP', 'Mono', 'NK', 'Plasma', 'ProB', 'ProMono', 'Prog',\n",
    "       'T', 'cDC', 'earlyEry', 'lateEry', 'HSC', 'pDC'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 244,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained labels into the coarser label they are reported under\n",
    "merged_labels = {\n",
    "    'ProB': 'B',\n",
    "    'CTL': 'T',\n",
    "    'ProMono': 'Mono',\n",
    "    'earlyEry': 'lateEry',\n",
    "    'pDC': 'cDC',\n",
    "}\n",
    "for merged_from, merged_into in merged_labels.items():\n",
    "    ix = np.isin(ref_cluster, [merged_from])\n",
    "    ref_cluster[ix] = merged_into"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 245,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B', 'GMP', 'Mono', 'NK', 'Plasma', 'Prog',\n",
    "       'T', 'cDC', 'lateEry', 'HSC'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 246,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B', 'GMP', 'Mono', 'NK', 'Plasma', 'Prog', 'T', 'cDC', 'lateEry',\n",
       "       'HSC'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 246,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 247,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['B cells', 'Granulocyte-monocyte progenitor cells', 'Monocytes', 'NK cells', 'Plasma cells', 'Multipotent progenitor cells',\n",
    "                                          'T cells', 'Dendritic cells', 'Erythroid cells', 'Hematopoietic stem cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 251,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonised .obs columns shared by all dataset sections of this notebook\n",
    "adata.obs['Organ'] = 'BoneMarrow'\n",
    "adata.obs['Organ_Specific'] = 'BoneMarrow'\n",
    "# TODO(review): 'XX_BoneMarrow' and 'NameXX-Donor1' look like unfilled template\n",
    "# placeholders - confirm the intended values before relying on these columns\n",
    "adata.obs['Dataset'] = 'XX_BoneMarrow'\n",
    "adata.obs['InternDatasetNumber'] ='12-1-BoneMarrow-VanGalen-2019'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] ='NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = 'NameXX-Donor1'\n",
    "adata.obs['Patient_Number'] = adata.obs['sample']\n",
    "adata.obs['age'] = 'NaN'\n",
    "adata.obs['sex'] = 'NaN'\n",
    "# the other dataset sections set an ethnicity column; add it here as well so that\n",
    "# every dataset carries the same set of harmonised columns\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['CellType_Atlas']\n",
    "adata.obs['original_celltype_2'] = adata.obs['CellType']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 253,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 12-3-BoneMarrow-Pisco-2022"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## 12-3-BoneMarrow-Pisco-2022 (human) - Pisco Multiple Organs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_pisco.obs['tissue'],['bone marrow']) \n",
    "adata=adata_pisco[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate QC covariates as flat 1-D vectors.\n",
    "# On a scipy sparse matrix `.sum(1)` returns an (n_cells, 1) np.matrix, so the result\n",
    "# is flattened explicitly before it is stored as an .obs column.\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 1 cells that have more than 15000 counts\n",
      "filtered out 2 cells that have more than 10000 genes expressed\n",
      "filtered out 36049 genes that are detected in less than 30 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 15000)\n",
    "sc.pp.filter_cells(adata, max_genes = 10000)\n",
    "# Min 30 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=30) #500 works # # 450(30494) #400(29837) #300(28268) not"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes: boolean mask over var_names and the matching gene names ('MT-' prefix)\n",
    "mt_gene_mask = np.array([gene.startswith('MT-') for gene in adata.var_names], dtype=bool)\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense\n",
    "adata.X = sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score: fraction of each cell's counts coming from mt genes\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 20% mito fraction\n",
    "# .copy() turns the subset into a real AnnData object; without it adata is a view and\n",
    "# the next write to adata.obs raises an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 22 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 195,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 196,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 199,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 200,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:10)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['CD4-positive, alpha-beta T cell', 'CD8-positive, alpha-beta T cell',\n",
       "       'common myeloid progenitor', 'erythrocyte', 'erythroid progenitor cell',\n",
       "       'granulocyte', 'hematopoietic stem cell', 'macrophage',\n",
       "       'mature NK T cell', 'memory B cell', 'monocyte', 'naive B cell',\n",
       "       'neutrophil', 'plasma cell', 'plasmablast'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 204,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 205,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['CD4-positive, alpha-beta T cell', 'CD8-positive, alpha-beta T cell',\n",
    "       'common myeloid progenitor', 'erythrocyte', 'erythroid progenitor cell',\n",
    "       'granulocyte', 'hematopoietic stem cell', 'macrophage',\n",
    "       'mature NK T cell', 'memory B cell', 'monocyte', 'naive B cell',\n",
    "       'neutrophil', 'plasma cell', 'plasmablast'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 206,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse related labels into one representative label each\n",
    "merged_labels = {\n",
    "    'CD8-positive, alpha-beta T cell': 'CD4-positive, alpha-beta T cell',\n",
    "    'naive B cell': 'memory B cell',\n",
    "    'plasmablast': 'plasma cell',\n",
    "}\n",
    "for merged_from, merged_into in merged_labels.items():\n",
    "    ix = np.isin(ref_cluster, [merged_from])\n",
    "    ref_cluster[ix] = merged_into"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 207,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['CD4-positive, alpha-beta T cell',\n",
    "       'common myeloid progenitor', 'erythrocyte', 'erythroid progenitor cell',\n",
    "       'granulocyte', 'hematopoietic stem cell', 'macrophage',\n",
    "       'mature NK T cell', 'memory B cell', 'monocyte',\n",
    "       'neutrophil', 'plasma cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 208,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['T cells',\n",
    "       'Common myeloid progenitor cells', 'Erythroid cells', 'Erythroid progenitor cells',\n",
    "       'Granulocyte cells', 'Hematopoietic stem cells', 'Macrophages',\n",
    "       'NK cells', 'B cells', 'Monocytes',\n",
    "       'Neutrophils', 'Plasma cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 211,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['TSP2', 'TSP11', 'TSP13', 'TSP14'], dtype='object')"
      ]
     },
     "execution_count": 211,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['donor'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['tissue'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['tissue'],\n",
    "                           categories=['bone marrow'])\n",
    "adata.rename_categories('tissue', ['BoneMarrow'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex'],\n",
    "                           categories=['female', 'male'])\n",
    "adata.rename_categories('sex', ['Female', 'Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 214,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['ethnicity'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['ethnicity'],\n",
    "                           categories=['African American or Afro-Caribbean', 'Asian', 'European',\n",
    "       'Hispanic or Latin American'])\n",
    "adata.rename_categories('ethnicity', ['African-American or Afro-Caribbean', 'Asian', 'European', 'Hispanic or Latin-American'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['development_stage'],\n",
    "                           categories=['22-year-old human stage', '42-year-old human stage',\n",
    "       '59-year-old human stage', '61-year-old human stage'])\n",
    "adata.rename_categories('development_stage',['22', '42',\n",
    "       '59', '61'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['donor'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['TSP2', 'TSP11', 'TSP13', 'TSP14'])\n",
    "adata.rename_categories('donor', ['TSP2', 'TSP11', 'TSP13', 'TSP14'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'BoneMarrow'\n",
    "adata.obs['Organ_Specific'] = adata.obs['tissue']\n",
    "adata.obs['Dataset'] = 'Pisco_BoneMarrow'\n",
    "adata.obs['InternDatasetNumber'] = '12-3-BoneMarrow-Pisco-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 220,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 12-4-BoneMarrow-Han-2020"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "## 12-4-BoneMarrow-Han-2020-RubenBrabenec_HealthyProject"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 317,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_han.obs['sub_tissue'],['AdultBoneMarrow']) \n",
    "adata=adata_han[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 318,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='12-4-BoneMarrow-Han-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 322,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-cell QC covariates derived from the current adata.X\n",
    "counts_per_cell = adata.X.sum(1)\n",
    "genes_per_cell = (adata.X > 0).sum(1)\n",
    "adata.obs['n_counts'] = counts_per_cell\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = genes_per_cell"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 325,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 37 cells that have more than 4200 counts\n",
      "filtered out 14569 genes that are detected in less than 10 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "# Filter out cells with more than 4200 counts or more than 2200 detected genes\n",
    "sc.pp.filter_cells(adata, max_counts = 4200)\n",
    "sc.pp.filter_cells(adata, max_genes = 2200)\n",
    "# Keep only genes detected in at least 10 cells\n",
    "sc.pp.filter_genes(adata, min_cells=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 326,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify mitochondrial genes by their 'MT-' prefix\n",
    "mt_gene_mask = adata.var_names.str.startswith('MT-')\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert the count matrix to dense form (the following steps index and divide it directly)\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# Fraction of each cell's total counts that comes from mitochondrial genes\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep only cells with a mitochondrial fraction below 20%.\n",
    "# .copy() turns the subset into a real AnnData instead of a view, so later writes to .obs\n",
    "# do not trigger ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 327,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 328,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 12 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "# adata_pp is a scratch copy of adata: scale each cell to 1e6 total counts, then log1p-transform\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "# 15-component PCA followed by a neighbourhood graph with default settings\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "# Louvain cluster labels are stored in adata_pp.obs['groups']\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 329,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "# Cluster labels from the coarse clustering; passed to the R cell as `input_groups`\n",
    "input_groups = adata_pp.obs['groups']\n",
    "# Transposed expression matrix of the unnormalised adata; passed to the R cell as `data_mat`\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 330,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 331,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 332,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 333,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 334,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 335,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data: divide every row of adata.X by the matching entry of obs['size_factors'], in place\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "# then apply the log(x + 1) transform\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 336,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 337,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:17)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 339,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cell', 'B cell (Centrocyte)', 'B cell (Plasmocyte)',\n",
       "       'Dendritic cell', 'Erythroid cell', 'Erythroid progenitor cell', 'HSPC',\n",
       "       'M2 Macrophage', 'Monocyte', 'Monocyte/DC progenitor',\n",
       "       'Neutrophil_DEFA3 high', 'Neutrophil_LTF high', 'Neutrophil_PRTN3 high',\n",
       "       'Neutrophil_S100A9 high', 'Neutrophil_S100A12 high', 'T cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 339,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['celltype_specific'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 340,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Working copy of the cell type labels with the full set of source categories\n",
    "han_bm_labels = [\n",
    "    'B cell', 'B cell (Centrocyte)', 'B cell (Plasmocyte)',\n",
    "    'Dendritic cell', 'Erythroid cell', 'Erythroid progenitor cell', 'HSPC',\n",
    "    'M2 Macrophage', 'Monocyte', 'Monocyte/DC progenitor',\n",
    "    'Neutrophil_DEFA3 high', 'Neutrophil_LTF high', 'Neutrophil_PRTN3 high',\n",
    "    'Neutrophil_S100A9 high', 'Neutrophil_S100A12 high', 'T cell',\n",
    "]\n",
    "ref_cluster = pd.Categorical(adata.obs['celltype'], categories=han_bm_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 341,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained labels: each key absorbs the labels listed as its value\n",
    "merge_plan = {\n",
    "    'B cell (Plasmocyte)': ['B cell (Centrocyte)'],\n",
    "    'Monocyte': ['Monocyte/DC progenitor'],\n",
    "    'Neutrophil_DEFA3 high': ['Neutrophil_LTF high', 'Neutrophil_PRTN3 high',\n",
    "                              'Neutrophil_S100A9 high', 'Neutrophil_S100A12 high'],\n",
    "}\n",
    "for merged_label, absorbed_labels in merge_plan.items():\n",
    "    ix = np.isin(ref_cluster, absorbed_labels)\n",
    "    ref_cluster[ix] = merged_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 342,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the merged labels, declaring only the categories that remain after merging\n",
    "kept_labels = [\n",
    "    'B cell', 'B cell (Plasmocyte)',\n",
    "    'Dendritic cell', 'Erythroid cell', 'Erythroid progenitor cell', 'HSPC',\n",
    "    'M2 Macrophage', 'Monocyte',\n",
    "    'Neutrophil_DEFA3 high', 'T cell',\n",
    "]\n",
    "adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=kept_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 343,
   "metadata": {},
   "outputs": [],
   "source": [
    "# New names, listed in the same order as the current 'celltype' categories (renaming is positional)\n",
    "harmonised_names = [\n",
    "    'B cells', 'Plasma cells',\n",
    "    'Dendritic cells', 'Erythroid cells', 'Erythroid progenitor cells', 'Hematopoietic stem cells',\n",
    "    'Macrophages', 'Monocytes',\n",
    "    'Neutrophils', 'T cells',\n",
    "]\n",
    "adata.rename_categories('celltype', harmonised_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 347,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expected current label and its replacement; rename_categories assigns new names by position\n",
    "expected_labels = ['AdultBoneMarrow']\n",
    "new_labels = ['BoneMarrow']\n",
    "adata.obs['sub_tissue'].cat.categories\n",
    "ref_cluster = pd.Categorical(adata.obs['sub_tissue'], categories=expected_labels)\n",
    "adata.rename_categories('sub_tissue', new_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 348,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expected current label and its replacement; rename_categories assigns new names by position\n",
    "expected_labels = ['female']\n",
    "new_labels = ['Female']\n",
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster = pd.Categorical(adata.obs['sex'], categories=expected_labels)\n",
    "adata.rename_categories('sex', new_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 349,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expected current labels and their replacements; rename_categories assigns new names by position\n",
    "expected_labels = ['49Y', '60Y']\n",
    "new_labels = ['49', '60']\n",
    "adata.obs['age'].cat.categories\n",
    "ref_cluster = pd.Categorical(adata.obs['age'], categories=expected_labels)\n",
    "adata.rename_categories('age', new_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 350,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expected current labels and their replacements; rename_categories assigns new names by position\n",
    "expected_labels = ['Donor27', 'Donor28']\n",
    "new_labels = ['Han-Donor27', 'Han-Donor28']\n",
    "adata.obs['donor'].cat.categories\n",
    "ref_cluster = pd.Categorical(adata.obs['donor'], categories=expected_labels)\n",
    "adata.rename_categories('donor', new_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 353,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 354,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '12-4-BoneMarrow-Han-2020-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 12-5-BoneMarrow_ImmuneCells-Teichmann-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_analysis.obs['tissue_major'], ['BoneMarrow'])\n",
    "adata=adata_analysis[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard annotation columns for this dataset; the literal string 'NaN' is used as a placeholder.\n",
    "obs_annotations = {\n",
    "    # dataset / tissue identifiers\n",
    "    'Organ': adata.obs['tissue_major'],\n",
    "    'Organ_Specific': adata.obs['sub_tissue'],\n",
    "    'Dataset': adata.obs['Dataset'],\n",
    "    'InternDatasetNumber': '12-5-BoneMarrow_ImmuneCells-Teichmann-2022',\n",
    "    'Dataset_status': 'Healthy_Dataset',\n",
    "    # cell type annotation\n",
    "    'celltype': adata.obs['celltype'],\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "    # donor metadata\n",
    "    'Patient': adata.obs['donor'],\n",
    "    'Patient_Number': 'NaN',\n",
    "    'age': adata.obs['development_stage'],\n",
    "    'sex': adata.obs['sex'],\n",
    "    'ethnicity': adata.obs['ethnicity'],\n",
    "    'health_status': 'NaN',\n",
    "    # source annotation columns kept under original_celltype_*\n",
    "    'original_celltype_1': adata.obs['cell_type'],\n",
    "    'original_celltype_2': adata.obs['Majority_voting_CellTypist_high'],\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "# Assign in dict order so new columns are appended to adata.obs in the same order as before\n",
    "for obs_key, obs_value in obs_annotations.items():\n",
    "    adata.obs[obs_key] = obs_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '12-5-BoneMarrow_ImmuneCells-Teichmann-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# 13-Skin"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 13-1-Skin-Cheng-2018"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "# here we use sfaira to import available datasets with annotations\n",
    "# note that the following steps may change depending on the current sfaira version and the path to your repository\n",
    "\n",
    "datadir = '/path/to/repo/'\n",
    "\n",
    "ds = sfaira.data.human.DatasetGroupSkin(path=datadir)  # This links all data sets available"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['human_skin_2018_10x_cheng_001_10.1016/j.celrep.2018.09.006',\n",
       " 'human_skin_2020_microwell_han_001_10.1038/s41586-020-2157-4',\n",
       " 'human_skin_2020_microwell_han_002_10.1038/s41586-020-2157-4']"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds.ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pick first one\n",
    "idx = ds.ids[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'human_skin_2018_10x_cheng_001_10.1016/j.celrep.2018.09.006'"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/icb/moritz.thomas/miniconda3/lib/python3.7/site-packages/sfaira-master/sfaira/data/base.py:84: UserWarning: using default genomes Homo_sapiens_GRCh38_97\n",
      "  warnings.warn(f\"using default genomes {genome}\")\n"
     ]
    }
   ],
   "source": [
    "ds.datasets[idx].load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata=ds.datasets[idx].adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.var.index=np.array(adata.var.names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 267,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='13-1-Skin-Cheng-2018'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 270,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 271,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-cell QC covariates derived from the current adata.X\n",
    "total_counts = adata.X.sum(1)\n",
    "detected_genes = (adata.X > 0).sum(1)\n",
    "adata.obs['n_counts'] = total_counts\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = detected_genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 274,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 424 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "# Cell-level filters are disabled for this dataset (thresholds kept for reference):\n",
    "#sc.pp.filter_cells(adata, max_counts = 1700)\n",
    "#sc.pp.filter_cells(adata, max_genes = 4200)\n",
    "# Keep only genes detected in at least 20 cells\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 275,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/compat/__init__.py:268: FutureWarning: During AnnData slicing, found matrix at .uns['neighbors_hm']['connectivities'] that happens to be dimensioned at n_obs×n_obs (68036×68036).\n",
      "\n",
      "These matrices should now be stored in the .obsp attribute.\n",
      "This slicing behavior will be removed in anndata 0.8.\n",
      "  FutureWarning,\n",
      "/home/ruben.brabenec/.local/lib/python3.7/site-packages/anndata/compat/__init__.py:268: FutureWarning: During AnnData slicing, found matrix at .uns['neighbors_hm']['distances'] that happens to be dimensioned at n_obs×n_obs (68036×68036).\n",
      "\n",
      "These matrices should now be stored in the .obsp attribute.\n",
      "This slicing behavior will be removed in anndata 0.8.\n",
      "  FutureWarning,\n"
     ]
    }
   ],
   "source": [
    "# Identify mitochondrial genes by their 'MT-' prefix\n",
    "mt_gene_mask = adata.var_names.str.startswith('MT-')\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert the count matrix to dense form (the following steps index and divide it directly)\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# Fraction of each cell's total counts that comes from mitochondrial genes\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep only cells with a mitochondrial fraction below 20%.\n",
    "# .copy() turns the subset into a real AnnData instead of a view, so later writes to .obs\n",
    "# do not trigger ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 277,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 278,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:07): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:02)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:10)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 17 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:11)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "# adata_pp is a scratch copy of adata: scale each cell to 1e6 total counts, then log1p-transform\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "# 15-component PCA followed by a neighbourhood graph with default settings\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "# Louvain cluster labels are stored in adata_pp.obs['groups']\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 279,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "# Cluster labels from the coarse clustering; passed to the R cell as `input_groups`\n",
    "input_groups = adata_pp.obs['groups']\n",
    "# Transposed expression matrix of the unnormalised adata; passed to the R cell as `data_mat`\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 280,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 281,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp (only needed for the clustering) and keep the size factors returned by the R cell\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 282,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 283,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data: divide every row of adata.X by the matching entry of obs['size_factors'], in place\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "# then apply the log(x + 1) transform\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 286,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 287,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 288,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:18)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:15)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:01:00)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 290,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_ontology_class'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 291,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Working copy of the cell type labels with the full set of source categories\n",
    "cheng_labels = ['Basal cell 1', 'Basal cell 2', 'WNT1', 'channel', 'folicular',\n",
    "                'granular', 'immune', 'melanocyte', 'mitotic', 'spinous']\n",
    "ref_cluster = pd.Categorical(adata.obs['celltype'], categories=cheng_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 292,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained labels: each key absorbs the labels listed as its value\n",
    "merge_plan = {\n",
    "    'Basal cell 1': ['Basal cell 2'],\n",
    "    'WNT1': ['channel', 'folicular', 'granular', 'mitotic', 'spinous'],\n",
    "}\n",
    "for merged_label, absorbed_labels in merge_plan.items():\n",
    "    ix = np.isin(ref_cluster, absorbed_labels)\n",
    "    ref_cluster[ix] = merged_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 293,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the merged labels, declaring only the categories that remain after merging\n",
    "kept_labels = ['Basal cell 1', 'WNT1', 'immune', 'melanocyte']\n",
    "adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=kept_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 294,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Basal cells', 'Keratinocytes', 'Monocytes', 'Melanocytes'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 298,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Expected current labels and their replacements; rename_categories assigns new names by position\n",
    "expected_labels = ['foreskin', 'scalp', 'trunk']\n",
    "new_labels = ['Skin_Foreskin', 'Skin_Scalp', 'Skin_Trunk']\n",
    "adata.obs['tissue'].cat.categories\n",
    "ref_cluster = pd.Categorical(adata.obs['tissue'], categories=expected_labels)\n",
    "adata.rename_categories('tissue', new_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 299,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the sample identifiers with Cheng_Skin-Donor1..9; rename_categories assigns new names by position\n",
    "sample_ids = ['abd4', 'br41epi', 'br53epi', 'fore8', 'fore9', 'fore12', 's11',\n",
    "              'scalp26', 'scalp32']\n",
    "donor_labels = ['Cheng_Skin-Donor%d' % donor_no for donor_no in range(1, 10)]\n",
    "adata.obs['donor'] = adata.obs['sample']\n",
    "ref_cluster = pd.Categorical(adata.obs['donor'], categories=sample_ids)\n",
    "adata.rename_categories('donor', donor_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 300,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard annotation columns for this dataset; the literal string 'NaN' is used as a placeholder.\n",
    "obs_annotations = {\n",
    "    # dataset / tissue identifiers\n",
    "    'Organ': 'Skin',\n",
    "    'Organ_Specific': adata.obs['tissue'],\n",
    "    'Dataset': 'Cheng_Skin',\n",
    "    'InternDatasetNumber': '13-1-Skin-Cheng-2018',\n",
    "    'Dataset_status': 'Healthy_Dataset',\n",
    "    # cell type annotation\n",
    "    'celltype': adata.obs['celltype'],\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "    # donor metadata\n",
    "    'Patient': adata.obs['donor'],\n",
    "    'Patient_Number': adata.obs['sample'],\n",
    "    'age': 'NaN',\n",
    "    'sex': 'NaN',\n",
    "    'health_status': 'NaN',\n",
    "    # source annotation columns kept under original_celltype_*\n",
    "    'original_celltype_1': adata.obs['cell_ontology_class'],\n",
    "    'original_celltype_2': adata.obs['CellType'],\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "# Assign in dict order so new columns are appended to adata.obs in the same order as before\n",
    "for obs_key, obs_value in obs_annotations.items():\n",
    "    adata.obs[obs_key] = obs_value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 302,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 303,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 304,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '13-1-Skin-Cheng-2018-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 13-2-Skin-Pisco-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 519,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(adata_pisco.obs['tissue'],['skin of abdomen', 'skin of body', 'skin of chest']) \n",
    "adata=adata_pisco[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 524,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-cell QC covariates derived from the current adata.X\n",
    "row_totals = adata.X.sum(1)\n",
    "row_nonzero = (adata.X > 0).sum(1)\n",
    "adata.obs['n_counts'] = row_totals\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = row_nonzero"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 528,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 1 cells that have more than 10000 counts\n",
      "filtered out 1 cells that have more than 7000 genes expressed\n",
      "filtered out 37950 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "# Filter out cells with more than 10000 counts or more than 7000 detected genes\n",
    "sc.pp.filter_cells(adata, max_counts = 10000)\n",
    "sc.pp.filter_cells(adata, max_genes = 7000)\n",
    "# Keep only genes detected in at least 20 cells\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 530,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify mitochondrial genes by their 'MT-' prefix\n",
    "mt_gene_mask = adata.var_names.str.startswith('MT-')\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert the count matrix to dense form (the following steps index and divide it directly)\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# Fraction of each cell's total counts that comes from mitochondrial genes\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep only cells with a mitochondrial fraction below 20%.\n",
    "# .copy() turns the subset into a real AnnData instead of a view, so later writes to .obs\n",
    "# do not trigger ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 531,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 532,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 10 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "# adata_pp is a scratch copy of adata: scale each cell to 1e6 total counts, then log1p-transform\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "# 15-component PCA followed by a neighbourhood graph with default settings\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "# Louvain cluster labels are stored in adata_pp.obs['groups']\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 533,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 534,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 535,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 536,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 537,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 538,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 539,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 540,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 541,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:18)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 542,
   "metadata": {},
   "outputs": [],
   "source": [
    "#sc.pl.umap(adata, color='cell_type')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 543,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 544,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['CD141-positive myeloid dendritic cell',\n",
       "       'CD1c-positive myeloid dendritic cell', 'CD4-positive helper T cell',\n",
       "       'CD4-positive, alpha-beta memory T cell',\n",
       "       'CD8-positive, alpha-beta cytotoxic T cell',\n",
       "       'CD8-positive, alpha-beta memory T cell', 'Langerhans cell', 'T cell',\n",
       "       'cell of skeletal muscle', 'endothelial cell', 'epithelial cell',\n",
       "       'macrophage', 'mast cell', 'mature NK T cell', 'melanocyte',\n",
       "       'memory B cell', 'muscle cell', 'naive B cell',\n",
       "       'naive thymus-derived CD4-positive, alpha-beta T cell',\n",
       "       'naive thymus-derived CD8-positive, alpha-beta T cell', 'plasma cell',\n",
       "       'regulatory T cell', 'smooth muscle cell', 'stromal cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 544,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 545,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['CD141-positive myeloid dendritic cell',\n",
    "       'CD1c-positive myeloid dendritic cell', 'CD4-positive helper T cell',\n",
    "       'CD4-positive, alpha-beta memory T cell',\n",
    "       'CD8-positive, alpha-beta cytotoxic T cell',\n",
    "       'CD8-positive, alpha-beta memory T cell', 'Langerhans cell', 'T cell',\n",
    "       'cell of skeletal muscle', 'endothelial cell', 'epithelial cell',\n",
    "       'macrophage', 'mast cell', 'mature NK T cell', 'melanocyte',\n",
    "       'memory B cell', 'muscle cell', 'naive B cell',\n",
    "       'naive thymus-derived CD4-positive, alpha-beta T cell',\n",
    "       'naive thymus-derived CD8-positive, alpha-beta T cell', 'plasma cell',\n",
    "       'regulatory T cell', 'smooth muscle cell', 'stromal cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 546,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,[  'CD1c-positive myeloid dendritic cell'])\n",
    "ref_cluster[ix]='CD141-positive myeloid dendritic cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'CD4-positive, alpha-beta memory T cell','CD8-positive, alpha-beta cytotoxic T cell', 'CD8-positive, alpha-beta memory T cell', 'T cell', \n",
    "                        'naive thymus-derived CD4-positive, alpha-beta T cell', 'naive thymus-derived CD8-positive, alpha-beta T cell',  'regulatory T cell'])\n",
    "ref_cluster[ix]='CD4-positive helper T cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['naive B cell'])\n",
    "ref_cluster[ix]= 'memory B cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Langerhans cell'])\n",
    "ref_cluster[ix]= 'macrophage'\n",
    "\n",
    "ix=np.isin(ref_cluster,['muscle cell'])\n",
    "ref_cluster[ix]= 'smooth muscle cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 547,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['CD141-positive myeloid dendritic cell',\n",
    "      'CD4-positive helper T cell',\n",
    "       'cell of skeletal muscle', 'endothelial cell', 'epithelial cell',\n",
    "       'macrophage', 'mast cell', 'mature NK T cell', 'melanocyte',\n",
    "       'memory B cell',\n",
    "      'plasma cell',\n",
    "        'smooth muscle cell', 'stromal cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 548,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype',['Dendritic cells',\n",
    "      'T cells',\n",
    "       'Skeletal muscle cells', 'Endothelial cells', 'Keratinocytes',\n",
    "       'Macrophages', 'Mast cells', 'NK cells', 'Melanocytes',\n",
    "       'B cells',\n",
    "      'Plasma cells',\n",
    "        'Smooth muscle cells', 'Mesenchymal stromal cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 552,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['tissue'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['tissue'],\n",
    "                           categories=['skin of abdomen', 'skin of body', 'skin of chest'])\n",
    "adata.rename_categories('tissue', ['Skin_Abdomen', 'Skin', 'Skin_Chest'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 553,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex'],\n",
    "                           categories=['male'])\n",
    "adata.rename_categories('sex', ['Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 554,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['ethnicity'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['ethnicity'],\n",
    "                           categories=['European', 'Hispanic or Latin American'])\n",
    "adata.rename_categories('ethnicity', ['European', 'Hispanic or Latin-American'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 555,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['development_stage'],\n",
    "                           categories=['33-year-old human stage', '59-year-old human stage'])\n",
    "adata.rename_categories('development_stage',['33', '59'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 556,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['donor'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['TSP10', 'TSP14'])\n",
    "adata.rename_categories('donor', ['TSP10', 'TSP14'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 557,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['Organ'] = 'Skin'\n",
    "adata.obs['Organ_Specific'] = adata.obs['tissue']\n",
    "adata.obs['Dataset'] = 'Pisco_Skin'\n",
    "adata.obs['InternDatasetNumber'] = '13-1-Skin-Pisco-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 559,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 560,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 561,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '13-2-Skin-Pisco-2022-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "jp-MarkdownHeadingCollapsed": true,
    "tags": []
   },
   "source": [
    "# 15-Kidney"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 15-1-Kidney-Steward-2019"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# here we use sfaira to import available datasets with annotations\n",
    "# note that the following steps may change depending on the current sfaira version and the path to your repository"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "datadir = '/path/to/repo/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['human_kidney_2019_10xSn_lake_001_10.1038/s41467-019-10861-2',\n",
       " 'human_kidney_2019_10x_stewart_001_10.1126/science.aat5031',\n",
       " 'human_kidney_2020_10x_liao_001_10.1038/s41597-019-0351-8',\n",
       " 'human_kidney_2020_microwell_han_001_10.1038/s41586-020-2157-4',\n",
       " 'human_kidney_2020_microwell_han_002_10.1038/s41586-020-2157-4',\n",
       " 'human_kidney_2020_microwell_han_003_10.1038/s41586-020-2157-4',\n",
       " 'human_kidney_2020_microwell_han_004_10.1038/s41586-020-2157-4',\n",
       " 'human_kidney_2020_microwell_han_005_10.1038/s41586-020-2157-4',\n",
       " 'human_kidney_2020_microwell_han_006_10.1038/s41586-020-2157-4',\n",
       " 'human_kidney_2020_microwell_han_007_10.1038/s41586-020-2157-4']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds = sfaira.data.human.DatasetGroupKidney(path=datadir)  # This links all data sets available\n",
    "ds.ids "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "idx = ds.ids[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'human_kidney_2019_10x_stewart_001_10.1126/science.aat5031'"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "idx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/icb/moritz.thomas/miniconda3/lib/python3.7/site-packages/sfaira-master/sfaira/data/base.py:84: UserWarning: using default genomes Homo_sapiens_GRCh38_97\n",
      "  warnings.warn(f\"using default genomes {genome}\")\n"
     ]
    }
   ],
   "source": [
    "ds.datasets[idx].load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata=ds.datasets[idx].adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 315,
   "metadata": {},
   "outputs": [],
   "source": [
    "#subset to only 40268 mature kidney cells (batch 1)\n",
    "ix=np.isin(adata.obs['batch'],['1']) \n",
    "adata=adata[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 317,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.var.index = adata.var['names'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 318,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 319,
   "metadata": {},
   "outputs": [],
   "source": [
    "#calculate QC covariates\n",
    "adata.obs['n_counts'] = adata.X.sum(1)\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = (adata.X > 0).sum(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 322,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 2 cells that have more than 7500 genes expressed\n",
      "filtered out 14602 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS#Filter out cells\n",
    "#sc.pp.filter_cells(adata, max_counts = 1700)\n",
    "sc.pp.filter_cells(adata, max_genes = 7500)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 323,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get mt genes\n",
    "mt_genes = adata.var_names[[gene.startswith('MT-') for gene in adata.var_names]]\n",
    "np.array(mt_genes)\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "\n",
    "# convert to spare\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score\n",
    "y = np.bincount(mt_gene_mask)\n",
    "ii = np.nonzero(y)[0]\n",
    "np.vstack((ii,y[ii])).T\n",
    "adata.X[:, mt_gene_mask].sum(1)\n",
    "mt_sum=np.array(adata.X[:, mt_gene_mask].sum(1))\n",
    "mt_sum=np.array(pd.DataFrame(mt_sum)[0])\n",
    "mt_frac=mt_sum/adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "#Filter out cells with over 20% mito fraction\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 325,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 326,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:02): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:10)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:05)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 14 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:06)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 327,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 328,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 329,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Deletstrings_to_categoricals_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 330,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 331,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 332,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 333,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 334,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 335,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 336,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:07)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:09)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:31)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 338,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['cell_ontology_class_broad']=adata.obs['cell_ontology_class']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 339,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['cell_ontology_class_broad'],\n",
    "                           categories=['B cell', 'CD4 T cell', 'CD8 T cell',\n",
    "       'Collecting Duct - Intercalated Cells Type B', 'Connecting tubule',\n",
    "       'Distinct proximal tubule 1', 'Distinct proximal tubule 2',\n",
    "       'Endothelial Cells - AEA & DVR', 'Endothelial Cells - AVR',\n",
    "       'Endothelial Cells - glomerular capillaries', 'Epithelial progenitor',\n",
    "       'Fibroblast', 'Indistinct intercalated cell',\n",
    "       'MNP-a/classical monocyte derived',\n",
    "       'MNP-b/non-classical monocyte derived', 'MNP-c/dendritic cell',\n",
    "       'MNP-d/Tissue macrophage', 'Mast cell', 'Myofibroblast', 'NK cell',\n",
    "       'NKT cell', 'Neutrophil', 'Pelvic epithelium',\n",
    "       'Peritubular capillary endothelium 1',\n",
    "       'Peritubular capillary endothelium 2', 'Plasmacytoid dendritic cell',\n",
    "       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule',\n",
    "       'Proximal tubule', 'Thick ascending limb of Loop of Henle',\n",
    "       'Transitional urothelium', 'Type A intercalated cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 340,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['Distinct proximal tubule 2'])\n",
    "ref_cluster[ix]='Distinct proximal tubule 1'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Endothelial Cells - AVR',\n",
    "       'Endothelial Cells - glomerular capillaries'])\n",
    "ref_cluster[ix]= 'Endothelial Cells - AEA & DVR'\n",
    "\n",
    "ix=np.isin(ref_cluster,['MNP-b/non-classical monocyte derived', 'MNP-c/dendritic cell',\n",
    "       'MNP-d/Tissue macrophage'])\n",
    "ref_cluster[ix]='MNP-a/classical monocyte derived'\n",
    "\n",
    "ix=np.isin(ref_cluster,[ 'Peritubular capillary endothelium 2'])\n",
    "ref_cluster[ix]= 'Peritubular capillary endothelium 1'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 341,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['cell_ontology_class_broad']=pd.Categorical(ref_cluster,\n",
    "                                                        categories=['B cell', 'CD4 T cell', 'CD8 T cell',\n",
    "       'Collecting Duct - Intercalated Cells Type B', 'Connecting tubule',\n",
    "       'Distinct proximal tubule 1',\n",
    "       'Endothelial Cells - AEA & DVR', 'Epithelial progenitor',\n",
    "       'Fibroblast', 'Indistinct intercalated cell',\n",
    "       'MNP-a/classical monocyte derived','Mast cell', 'Myofibroblast', 'NK cell',\n",
    "       'NKT cell', 'Neutrophil', 'Pelvic epithelium',\n",
    "       'Peritubular capillary endothelium 1','Plasmacytoid dendritic cell',\n",
    "       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule',\n",
    "       'Proximal tubule', 'Thick ascending limb of Loop of Henle',\n",
    "       'Transitional urothelium', 'Type A intercalated cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 342,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('cell_ontology_class_broad',\n",
    "                        ['B cell', 'CD4 T cell', 'CD8 T cell',\n",
    "       'Collecting Duct - Intercalated Cells Type B', 'Connecting tubule',\n",
    "       'Distinct proximal tubule', 'Endothelial Cells',\n",
    "       'Epithelial progenitor', 'Fibroblast', 'Indistinct intercalated cell',\n",
    "       'MNP', 'Mast cell', 'Myofibroblast',\n",
    "       'NK cell', 'NKT cell', 'Neutrophil', 'Pelvic epithelium',\n",
    "       'Peritubular capillary endothelium', 'Plasmacytoid dendritic cell',\n",
    "       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule',\n",
    "       'Proximal tubule', 'Thick ascending limb of Loop of Henle',\n",
    "       'Transitional urothelium', 'Type A intercalated cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 344,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Ascending vasa recta endothelium', 'B cell', 'CD4 T cell',\n",
       "       'CD8 T cell', 'Connecting tubule', 'Descending vasa recta endothelium',\n",
       "       'Distinct proximal tubule 1', 'Distinct proximal tubule 2',\n",
       "       'Epithelial progenitor cell', 'Fibroblast', 'Glomerular endothelium',\n",
       "       'Indistinct intercalated cell', 'MNP-a/classical monocyte derived',\n",
       "       'MNP-b/non-classical monocyte derived', 'MNP-c/dendritic cell',\n",
       "       'MNP-d/Tissue macrophage', 'Mast cell', 'Myofibroblast', 'NK cell',\n",
       "       'NKT cell', 'Neutrophil', 'Pelvic epithelium',\n",
       "       'Peritubular capillary endothelium 1',\n",
       "       'Peritubular capillary endothelium 2', 'Plasmacytoid dendritic cell',\n",
       "       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule',\n",
       "       'Proximal tubule', 'Thick ascending limb of Loop of Henle',\n",
       "       'Transitional urothelium', 'Type A intercalated cell',\n",
       "       'Type B intercalated cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 344,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['celltype'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 345,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['Ascending vasa recta endothelium', 'B cell', 'CD4 T cell',\n",
    "       'CD8 T cell', 'Connecting tubule', 'Descending vasa recta endothelium',\n",
    "       'Distinct proximal tubule 1', 'Distinct proximal tubule 2',\n",
    "       'Epithelial progenitor cell', 'Fibroblast', 'Glomerular endothelium',\n",
    "       'Indistinct intercalated cell', 'MNP-a/classical monocyte derived',\n",
    "       'MNP-b/non-classical monocyte derived', 'MNP-c/dendritic cell',\n",
    "       'MNP-d/Tissue macrophage', 'Mast cell', 'Myofibroblast', 'NK cell',\n",
    "       'NKT cell', 'Neutrophil', 'Pelvic epithelium',\n",
    "       'Peritubular capillary endothelium 1',\n",
    "       'Peritubular capillary endothelium 2', 'Plasmacytoid dendritic cell',\n",
    "       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule',\n",
    "       'Proximal tubule', 'Thick ascending limb of Loop of Henle',\n",
    "       'Transitional urothelium', 'Type A intercalated cell',\n",
    "       'Type B intercalated cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 346,
   "metadata": {},
   "outputs": [],
   "source": [
    "ix=np.isin(ref_cluster,['Descending vasa recta endothelium','Glomerular endothelium','Peritubular capillary endothelium 1',\n",
    "       'Peritubular capillary endothelium 2'])\n",
    "ref_cluster[ix]='Ascending vasa recta endothelium'\n",
    "\n",
    "ix=np.isin(ref_cluster,['CD8 T cell','NKT cell'])\n",
    "ref_cluster[ix]='CD4 T cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Transitional urothelium'])\n",
    "ref_cluster[ix]='Pelvic epithelium'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Type A intercalated cell','Type B intercalated cell', 'Indistinct intercalated cell'])\n",
    "ref_cluster[ix]='Principal cell'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Distinct proximal tubule 1', 'Distinct proximal tubule 2', 'Proximal tubule', 'Connecting tubule', 'Epithelial progenitor cell', \n",
    "                       'Thick ascending limb of Loop of Henle'])\n",
    "ref_cluster[ix]= 'Proliferating Proximal Tubule'\n",
    "\n",
    "ix=np.isin(ref_cluster,['MNP-b/non-classical monocyte derived'])\n",
    "ref_cluster[ix]='MNP-a/classical monocyte derived'\n",
    "\n",
    "ix=np.isin(ref_cluster,['Plasmacytoid dendritic cell'])\n",
    "ref_cluster[ix]='MNP-c/dendritic cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 347,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['Ascending vasa recta endothelium', 'B cell', 'CD4 T cell',\n",
    "                                                      'Fibroblast','MNP-a/classical monocyte derived',\n",
    "                                                       'MNP-c/dendritic cell','MNP-d/Tissue macrophage', \n",
    "                                                       'Mast cell', 'Myofibroblast', 'NK cell',\n",
    "                                                       'Neutrophil', 'Pelvic epithelium', 'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 348,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Ascending vasa recta endothelium', 'B cell', 'CD4 T cell',\n",
       "       'Fibroblast', 'MNP-a/classical monocyte derived',\n",
       "       'MNP-c/dendritic cell', 'MNP-d/Tissue macrophage', 'Mast cell',\n",
       "       'Myofibroblast', 'NK cell', 'Neutrophil', 'Pelvic epithelium',\n",
       "       'Podocyte', 'Principal cell', 'Proliferating Proximal Tubule'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 348,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 349,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['Endothelial cells', 'B cells', 'T cells',\n",
    "        'Fibroblast cells','Monocytes',\n",
    "        'Dendritic cells','Macrophages', 'Mast cells', \n",
    "        'Myofibroblast cells', 'NK cells','Neutrophils', 'Urothelial cells',\n",
    "        'Podocytes', 'Collecting duct system cells','Tubule cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 350,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['Endothelial cells', 'B cells', 'T cells', 'Fibroblast cells',\n",
       "       'Monocytes', 'Dendritic cells', 'Macrophages', 'Mast cells',\n",
       "       'Myofibroblast cells', 'NK cells', 'Neutrophils', 'Urothelial cells',\n",
       "       'Podocytes', 'Collecting duct system cells', 'Tubule cells'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 350,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 354,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the harmonised per-cell metadata columns; a scalar on the right-hand side is\n",
    "# written to every cell. The string 'NaN' is presumably a placeholder for fields that are\n",
    "# not available for this dataset (it is a literal string, not a missing value).\n",
    "adata.obs['Organ'] = 'Kidney'\n",
    "adata.obs['Organ_Specific'] = 'Kidney'\n",
    "adata.obs['Dataset'] = 'Steward_Kidney'\n",
    "adata.obs['InternDatasetNumber'] = '15-1-Kidney-Steward-2019'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "# Self-assignment: leaves the harmonised 'celltype' column as it is (KeyError if it is missing).\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] ='NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = 'Steward_Kidney-Donor1'\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = 'NaN'\n",
    "adata.obs['sex'] = 'NaN'\n",
    "adata.obs['ethnicity'] = 'NaN'\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "# Keep the source annotation columns next to the harmonised 'celltype' column.\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_ontology_class']\n",
    "adata.obs['original_celltype_2'] = adata.obs['cell_ontology_class_broad']\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 356,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 357,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 358,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '15-1-Kidney-Steward-2019-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 15-2-Kidney-Wilson-2021"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 361,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the datasets of one collection through sfaira's database interface.\n",
    "# Collection id to fetch, and the local directory handed to sfaira as data_path.\n",
    "target_collections = [\"9b02383a-9358-4f0f-9795-a891ec523bcc\"]\n",
    "cache_path = os.path.join(\".\", \"data\")\n",
    "dsg = sfaira.data.dataloaders.databases.DatasetSuperGroupDatabases(data_path=cache_path, cache_metadata=True)\n",
    "# Restrict the super-group to the target collection.\n",
    "dsg.subset(key=\"collection_id\", values=target_collections)\n",
    "# Bare expression in the middle of the cell: it is evaluated but its value is not displayed.\n",
    "dsg.datasets\n",
    "# NOTE(review): the next cell lists files under a different, hard-coded directory -\n",
    "# confirm that it matches the location sfaira downloads to.\n",
    "dsg.download()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 362,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the downloaded .h5ad files of this collection (placeholder path, adjust locally).\n",
    "path = '/path/to/repo/9b02383a-9358-4f0f-9795-a891ec523bcc/'\n",
    "# listdir returns entries in arbitrary order; sort them so the file order is reproducible.\n",
    "files = sorted(f for f in listdir(path) if isfile(join(path, f)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 363,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['13a027de-ea3e-432b-9a5e-6bc7048498fc.h5ad',\n",
       " '9df60c57-fdf3-4e93-828e-fe9303f20438.h5ad']"
      ]
     },
     "execution_count": 363,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 364,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "13a027de-ea3e-432b-9a5e-6bc7048498fc.h5ad\n",
      "9df60c57-fdf3-4e93-828e-fe9303f20438.h5ad\n"
     ]
    }
   ],
   "source": [
    "# Read every downloaded file and keep the one whose cell count matches the wanted dataset.\n",
    "expected_n_obs = 19985  # cell count used to pick the dataset for this section\n",
    "selected = None\n",
    "for fname in files:\n",
    "    print(fname)\n",
    "    u = sc.read_h5ad(join(path, fname))\n",
    "    u.obs['id'] = fname  # remember which file the cells came from\n",
    "    u.var.index = u.var['feature_name']  # index genes by the 'feature_name' column\n",
    "    if u.n_obs == expected_n_obs:\n",
    "        selected = u\n",
    "\n",
    "# Fail loudly instead of silently continuing with an `adata` left over from a previous section.\n",
    "if selected is None:\n",
    "    raise ValueError(f'No file in {path} contains {expected_n_obs} cells')\n",
    "adata = selected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 368,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['InternDatasetNumber'] ='15-2-Kidney-Wilson-2021'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 372,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate per-cell QC covariates: total counts, their log, and number of detected genes.\n",
    "# Row sums of a scipy sparse matrix come back as an (n_cells, 1) np.matrix, so flatten them\n",
    "# to 1-D before storing them as obs columns (harmless when X is already dense).\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 376,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 1 cells that have more than 2000 counts\n",
      "filtered out 2 cells that have more than 2500 genes expressed\n",
      "filtered out 14058 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "#Filter out cells\n",
    "sc.pp.filter_cells(adata, max_counts = 2000)\n",
    "sc.pp.filter_cells(adata, max_genes = 2500)\n",
    "# Min 20 cells - filters out low count genes\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 378,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boolean mask and names of the mitochondrial genes (gene names starting with 'MT-')\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score: mitochondrial counts divided by the stored 'n_counts' of each cell\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep only cells with a mitochondrial fraction below 20%.\n",
    "# .copy() turns the subset into a real AnnData object instead of a view, so the later\n",
    "# writes to adata.obs no longer trigger ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 379,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 380,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:05)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 18 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 381,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 382,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 383,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 384,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 385,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 386,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 387,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 388,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:02)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 389,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:03)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:03)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:16)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 391,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 392,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['epithelial cell of proximal tubule', 'fibroblast',\n",
       "       'glomerular visceral epithelial cell',\n",
       "       'kidney capillary endothelial cell',\n",
       "       'kidney connecting tubule epithelial cell',\n",
       "       'kidney distal convoluted tubule epithelial cell',\n",
       "       'kidney loop of Henle thick ascending limb epithelial cell',\n",
       "       'leukocyte', 'mesangial cell', 'parietal epithelial cell',\n",
       "       'renal alpha-intercalated cell', 'renal beta-intercalated cell',\n",
       "       'renal principal cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 392,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 393,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['epithelial cell of proximal tubule', 'fibroblast',\n",
    "       'glomerular visceral epithelial cell',\n",
    "       'kidney capillary endothelial cell',\n",
    "       'kidney connecting tubule epithelial cell',\n",
    "       'kidney distal convoluted tubule epithelial cell',\n",
    "       'kidney loop of Henle thick ascending limb epithelial cell',\n",
    "       'leukocyte', 'mesangial cell', 'parietal epithelial cell',\n",
    "       'renal alpha-intercalated cell', 'renal beta-intercalated cell',\n",
    "       'renal principal cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 394,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse fine-grained annotations: cells of the listed categories are relabelled with one\n",
    "# existing category that stands for the whole group.\n",
    "# Tubule segments -> 'epithelial cell of proximal tubule' (renamed to 'Tubule cells' in a later cell).\n",
    "ix=np.isin(ref_cluster,['kidney connecting tubule epithelial cell', 'kidney distal convoluted tubule epithelial cell',  'kidney loop of Henle thick ascending limb epithelial cell'])\n",
    "ref_cluster[ix]='epithelial cell of proximal tubule'\n",
    "\n",
    "# Beta-intercalated and principal cells -> 'renal alpha-intercalated cell'\n",
    "# (renamed to 'Collecting duct system cells' in a later cell).\n",
    "ix=np.isin(ref_cluster,['renal beta-intercalated cell', 'renal principal cell'])\n",
    "ref_cluster[ix]='renal alpha-intercalated cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 395,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the merged labels as the 'celltype' column, listing only the categories that are\n",
    "# kept after merging (pandas turns any value outside this list into NaN).\n",
    "adata.obs['celltype'] = pd.Categorical(\n",
    "    ref_cluster,\n",
    "    categories=[\n",
    "        'epithelial cell of proximal tubule',\n",
    "        'fibroblast',\n",
    "        'glomerular visceral epithelial cell',\n",
    "        'kidney capillary endothelial cell',\n",
    "        'leukocyte',\n",
    "        'mesangial cell',\n",
    "        'parietal epithelial cell',\n",
    "        'renal alpha-intercalated cell',\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 396,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename the remaining categories to the harmonised cell type names; the new names are\n",
    "# matched to the current categories by position.\n",
    "adata.rename_categories(\n",
    "    'celltype',\n",
    "    [\n",
    "        'Tubule cells',\n",
    "        'Fibroblast cells',\n",
    "        'Glomerular visceral epithelial cells',\n",
    "        'Endothelial cells',\n",
    "        'Leukocytes',\n",
    "        'Mesangial cells',\n",
    "        'Parietal epithelial cells',\n",
    "        'Collecting duct system cells',\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 400,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['reported_diseases'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['reported_diseases'],\n",
    "                           categories=['[Lyme disease,anxiety disorder,depressive disorder,kidney cancer,diverticulitis,gastroesophageal reflux disease,arthritic joint disease]',\n",
    "       '[benign prostatic hyperplasia,pericardial effusion,acute kidney tubular necrosis,essential hypertension,kidney cancer]',\n",
    "       '[kidney cancer,obstructive sleep apnea syndrome,acoustic neuroma]',\n",
    "       '[kidney cancer]'])\n",
    "adata.rename_categories('reported_diseases', ['Lyme disease,Anxiety disorder,Depressive disorder,Kidney cancer,Diverticulitis,Gastroesophageal reflux disease,Arthritic joint disease',\n",
    "       'Benign prostatic hyperplasia,Pericardial effusion,Acute kidney tubular necrosis,Essential hypertension,Kidney cancer',\n",
    "       'Kidney cancer,Obstructive sleep apnea syndrome,Acoustic neuroma',\n",
    "       'Kidney cancer'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 401,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['tissue'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['tissue'],\n",
    "                           categories=['cortex of kidney'])\n",
    "adata.rename_categories('tissue', ['Kidney_Cortex'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 402,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex'],\n",
    "                           categories=['female', 'male'])\n",
    "adata.rename_categories('sex', ['Female', 'Male'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 403,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['development_stage'],\n",
    "                           categories=['50-year-old human stage', '52-year-old human stage',\n",
    "       '54-year-old human stage', '61-year-old human stage',\n",
    "       '62-year-old human stage'])\n",
    "adata.rename_categories('development_stage', ['50', '52',\n",
    "       '54', '61',\n",
    "       '62'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 404,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy the donor uuids into a 'donor' column and replace them with readable labels.\n",
    "# rename_categories assigns the new names by position, so the list is built from an explicit\n",
    "# uuid -> label mapping in the order of the stored categories; the labelling then cannot\n",
    "# silently shift if the categories are stored in a different order than written here.\n",
    "donor_labels = {\n",
    "    '8c570254-4bef-48d8-bd79-c812f60835a5': 'Wilson_Kidney-Donor1',\n",
    "    '5028f75a-8c09-4155-a232-ad7dbfa6042e': 'Wilson_Kidney-Donor2',\n",
    "    '8213a3f7-2437-4e8a-b836-caec33df901d': 'Wilson_Kidney-Donor3',\n",
    "    'e0def004-9e30-4a3b-9a65-007110f3a1f2': 'Wilson_Kidney-Donor4',\n",
    "    'f6c0f811-2fb8-4989-b796-37c14b055517': 'Wilson_Kidney-Donor5',\n",
    "}\n",
    "adata.obs['donor'] = adata.obs['donor_uuid']\n",
    "adata.rename_categories('donor', [donor_labels[uuid] for uuid in adata.obs['donor'].cat.categories])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 405,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the harmonised per-cell metadata columns. Scalars are written to every cell, the\n",
    "# other columns are copied from the annotations of the source dataset. The string 'NaN' is\n",
    "# presumably a placeholder for fields without a value (it is a literal string).\n",
    "adata.obs['Organ'] = 'Kidney'\n",
    "adata.obs['Organ_Specific'] = adata.obs['tissue']\n",
    "adata.obs['Dataset'] = 'Wilson_Kidney'\n",
    "adata.obs['InternDatasetNumber'] = '15-2-Kidney-Wilson-2021'\n",
    "adata.obs['Dataset_status'] = 'Ill_Dataset'\n",
    "\n",
    "# Self-assignment: leaves the harmonised 'celltype' column as it is (KeyError if it is missing).\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] ='NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = adata.obs['sample_uuid']\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "# 'sex' and 'ethnicity' already carry the harmonised names; these self-assignments change nothing.\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = adata.obs['reported_diseases']\n",
    "\n",
    "# Keep the source annotation next to the harmonised 'celltype' column.\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 407,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 408,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 411,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.write(writepath + '15-2-Kidney-Wilson-2021-processed.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 15-3-Kidney-Pisco-2022"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the kidney cells of adata_pisco (created outside this section) as an independent copy.\n",
    "ix = adata_pisco.obs['tissue'].isin(['kidney']).values\n",
    "adata = adata_pisco[ix].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate per-cell QC covariates: total counts, their log, and number of detected genes.\n",
    "# Row sums of a scipy sparse matrix come back as an (n_cells, 1) np.matrix, so flatten them\n",
    "# to 1-D before storing them as obs columns (harmless when X is already dense).\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 4 cells that have more than 13000 counts\n",
      "filtered out 3 cells that have more than 7500 genes expressed\n",
      "filtered out 40007 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# FILTER PARAMETERS\n",
    "# Filter out cells with more than 13000 counts or more than 7500 expressed genes\n",
    "sc.pp.filter_cells(adata, max_counts = 13000)\n",
    "sc.pp.filter_cells(adata, max_genes = 7500)\n",
    "# Min 20 cells - filters out genes detected in fewer than 20 cells\n",
    "# NOTE(review): leftover tuning notes from the original cell, meaning unclear:\n",
    "#   500 works; 450 (30494); 400 (29837); 300 (28268) not\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boolean mask and names of the mitochondrial genes (gene names starting with 'MT-')\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# convert to dense\n",
    "adata.X=sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# manually define mt score: mitochondrial counts divided by the stored 'n_counts' of each cell\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep only cells with a mitochondrial fraction below 20%.\n",
    "# .copy() turns the subset into a real AnnData object instead of a view, so the later\n",
    "# writes to adata.obs no longer trigger ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata_pp=adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:00): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=15\n",
      "    finished (0:00:00)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:01)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 12 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:00)\n"
     ]
    }
   ],
   "source": [
    "#Perform a clustering for scran normalization in clusters\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Preprocess variables for scran normalization\n",
    "input_groups = adata_pp.obs['groups']\n",
    "data_mat = adata.X.T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "require(scran)\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "#Delete adata_pp\n",
    "del adata_pp\n",
    "adata.obs['size_factors'] = size_factors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "#make  (adata.X) copy of counts of raw data for downstream analysis\n",
    "#Keep the count data in a counts layer\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Normalize data\n",
    "adata.X /= adata.obs['size_factors'].values[:, None]\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:01)\n"
     ]
    }
   ],
   "source": [
    "# extract highly variable genes\n",
    "sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:01)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:19)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "# make consistent annotations across datasets\n",
    "adata.obs['celltype']=adata.obs['cell_type'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cell', 'CD4-positive helper T cell',\n",
       "       'CD8-positive, alpha-beta T cell', 'endothelial cell',\n",
       "       'kidney epithelial cell', 'macrophage', 'mature NK T cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "ref_cluster=pd.Categorical(adata.obs['celltype'],\n",
    "                           categories=['B cell', 'CD4-positive helper T cell',\n",
    "       'CD8-positive, alpha-beta T cell', 'endothelial cell',\n",
    "       'kidney epithelial cell', 'macrophage', 'mature NK T cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the CD8 T cells into the 'CD4-positive helper T cell' category, which a later cell\n",
    "# renames to the common label 'T cells'.\n",
    "ix=np.isin(ref_cluster,['CD8-positive, alpha-beta T cell'])\n",
    "ref_cluster[ix]='CD4-positive helper T cell'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['celltype']=pd.Categorical(ref_cluster,\n",
    "                                           categories=['B cell', 'CD4-positive helper T cell',\n",
    "       'endothelial cell',\n",
    "       'kidney epithelial cell', 'macrophage', 'mature NK T cell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.rename_categories('celltype', ['B cells', 'T cells',\n",
    "       'Endothelial cells',\n",
    "       'Tubule cells', 'Macrophages', 'NK cells'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['tissue'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['tissue'],\n",
    "                           categories=['kidney'])\n",
    "adata.rename_categories('tissue', ['Kidney'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['sex'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['sex'],\n",
    "                           categories=['female'])\n",
    "adata.rename_categories('sex', ['Female'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['ethnicity'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['ethnicity'],\n",
    "                           categories=['African American or Afro-Caribbean'])\n",
    "adata.rename_categories('ethnicity', ['African-American or Afro-Caribbean'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['development_stage'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['development_stage'],\n",
    "                           categories=['61-year-old human stage'])\n",
    "adata.rename_categories('development_stage',['61'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs['donor'].cat.categories\n",
    "ref_cluster=pd.Categorical(adata.obs['donor'],\n",
    "                           categories=['TSP2'])\n",
    "adata.rename_categories('donor', ['TSP2'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Attach the harmonised per-cell metadata columns. Scalars are written to every cell, the\n",
    "# other columns are copied from the annotations of the source dataset. The string 'NaN' is\n",
    "# presumably a placeholder for fields without a value (it is a literal string).\n",
    "adata.obs['Organ'] = 'Kidney'\n",
    "adata.obs['Organ_Specific'] = adata.obs['tissue']\n",
    "# Dataset label follows the Author_Organ pattern of the other kidney datasets\n",
    "# ('Steward_Kidney', 'Wilson_Kidney'); the previous value 'Kidney_Heart' looked like a\n",
    "# copy-paste leftover from another organ section.\n",
    "adata.obs['Dataset'] = 'Pisco_Kidney'\n",
    "adata.obs['InternDatasetNumber'] = '15-3-Kidney-Pisco-2022'\n",
    "adata.obs['Dataset_status'] = 'Healthy_Dataset'\n",
    "\n",
    "# Self-assignment: leaves the harmonised 'celltype' column as it is (KeyError if it is missing).\n",
    "adata.obs['celltype'] = adata.obs['celltype']\n",
    "adata.obs['sub_celltype'] = 'NaN'\n",
    "adata.obs['Malignant'] = 'NonMalignant'\n",
    "\n",
    "adata.obs['Patient'] = adata.obs['donor']\n",
    "adata.obs['Patient_Number'] = 'NaN'\n",
    "adata.obs['age'] = adata.obs['development_stage']\n",
    "# 'sex' and 'ethnicity' already carry the harmonised names; these self-assignments change nothing.\n",
    "adata.obs['sex'] = adata.obs['sex']\n",
    "adata.obs['ethnicity'] = adata.obs['ethnicity']\n",
    "adata.obs['health_status'] = 'NaN'\n",
    "\n",
    "# Keep the source annotation next to the harmonised 'celltype' column.\n",
    "adata.obs['original_celltype_1'] = adata.obs['cell_type']\n",
    "adata.obs['original_celltype_2'] = 'NaN'\n",
    "adata.obs['original_celltype_3'] = 'NaN'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "adata.obs_names_make_unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the processed dataset under `writepath`.\n",
    "processed_file = writepath + '15-3-Kidney-Pisco-2022-processed.h5ad'\n",
    "adata.write(processed_file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## 15-4-Kidney-Han-2020"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 547,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag every cell with the internal identifier of this dataset.\n",
    "# NOTE(review): no loading step for this dataset is visible in this section;\n",
    "# presumably `adata` already holds the Han 2020 kidney data at this point - confirm.\n",
    "adata.obs['InternDatasetNumber'] = '15-4-Kidney-Han-2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 551,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate QC covariates: total counts, log total counts and detected genes per cell.\n",
    "# The row sums are flattened explicitly because `.sum(1)` on a SciPy sparse matrix\n",
    "# returns an (n_cells, 1) np.matrix, while an obs column holds 1-D data.\n",
    "adata.obs['n_counts'] = np.asarray(adata.X.sum(1)).ravel()\n",
    "adata.obs['log_counts'] = np.log(adata.obs['n_counts'])\n",
    "adata.obs['n_genes'] = np.asarray((adata.X > 0).sum(1)).ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 554,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "filtered out 39 cells that have more than 4400 counts\n",
      "filtered out 10454 genes that are detected in less than 20 cells\n"
     ]
    }
   ],
   "source": [
    "# Cell filters: drop cells with more than 4400 counts or more than 2200 detected genes.\n",
    "sc.pp.filter_cells(adata, max_counts=4400)\n",
    "sc.pp.filter_cells(adata, max_genes=2200)\n",
    "\n",
    "# Gene filter: keep only genes that are detected in at least 20 cells.\n",
    "sc.pp.filter_genes(adata, min_cells=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 555,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boolean mask over the genes marking mitochondrial genes (names starting with 'MT-').\n",
    "mt_gene_mask = [gene.startswith('MT-') for gene in adata.var_names]\n",
    "mt_genes = adata.var_names[mt_gene_mask]\n",
    "\n",
    "# Convert the sparse matrix to a dense one for the steps below.\n",
    "adata.X = sp.sparse.csr_matrix.todense(adata.X)\n",
    "\n",
    "# Mitochondrial fraction = summed MT counts per cell / total counts per cell.\n",
    "# `n_counts` is read from adata.obs as stored by the preceding QC / filtering cells.\n",
    "mt_sum = np.asarray(adata.X[:, mt_gene_mask].sum(1)).ravel()\n",
    "mt_frac = mt_sum / adata.obs['n_counts'].values\n",
    "adata.obs['mt_frac'] = mt_frac\n",
    "\n",
    "# Keep only cells with a mitochondrial fraction below 20%.\n",
    "# `.copy()` turns the subset into a real AnnData object instead of a view, so that\n",
    "# later writes to `adata.obs` do not trigger an ImplicitModificationWarning.\n",
    "adata = adata[adata.obs['mt_frac'] < 0.20].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 556,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate copy for the preliminary clustering, so that `adata` itself stays unchanged.\n",
    "adata_pp = adata.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 557,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "normalizing by total count per cell\n",
      "    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)\n",
      "computing PCA\n",
      "    with n_comps=15\n",
      "    finished (0:00:05)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 15\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)\n",
      "running Louvain clustering\n",
      "    using the \"louvain\" package of Traag (2017)\n",
      "    finished: found 16 clusters and added\n",
      "    'groups', the cluster labels (adata.obs, categorical) (0:00:03)\n"
     ]
    }
   ],
   "source": [
    "# Preliminary clustering on the copy; the clusters are used to group the cells\n",
    "# for the scran normalization.\n",
    "\n",
    "# Scale every cell to 1e6 total counts and log-transform.\n",
    "sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6)\n",
    "sc.pp.log1p(adata_pp)\n",
    "\n",
    "# 15-component PCA, neighborhood graph and Louvain clusters stored in obs['groups'].\n",
    "sc.pp.pca(adata_pp, n_comps=15)\n",
    "sc.pp.neighbors(adata_pp)\n",
    "sc.tl.louvain(adata_pp, key_added='groups', resolution=0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 558,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inputs passed to the R cell for scran: the transposed expression matrix of `adata`\n",
    "# and the cluster labels from the preliminary clustering.\n",
    "data_mat = adata.X.T\n",
    "input_groups = adata_pp.obs['groups']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 559,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%R -i data_mat -i input_groups -o size_factors\n",
    "# library() stops with an error if scran is not installed; require() would only warn\n",
    "# and the call below would then fail with a less helpful message.\n",
    "library(scran)\n",
    "# Size factors estimated with scran, using the preliminary clusters as groups.\n",
    "size_factors = computeSumFactors(data_mat, clusters=input_groups, min.mean=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 560,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/python/lib/python3.7/site-packages/ipykernel_launcher.py:3: ImplicitModificationWarning: Trying to modify attribute `.obs` of view, initializing view as actual.\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    }
   ],
   "source": [
    "# Attach the size factors returned by the R cell to the cells ...\n",
    "adata.obs['size_factors'] = size_factors\n",
    "# ... and drop the temporary clustering copy, which is no longer needed.\n",
    "del adata_pp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 561,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert string annotations to categoricals (the cells below rely on `.cat`).\n",
    "adata.strings_to_categoricals()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 562,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep a copy of the current adata.X (the count data, before normalization)\n",
    "# in the 'counts' layer for downstream analysis.\n",
    "adata.layers[\"counts\"] = adata.X.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 563,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Register the current AnnData (before the normalization in the next cell) as `adata.raw`.\n",
    "adata.raw = adata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 564,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize the data: divide every cell (row) by its size factor in place,\n",
    "# then apply the log1p transformation.\n",
    "per_cell_factor = adata.obs['size_factors'].values[:, None]\n",
    "adata.X /= per_cell_factor\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 565,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "If you pass `n_top_genes`, all cutoffs are ignored.\n",
      "extracting highly variable genes\n",
      "    finished (0:00:02)\n"
     ]
    }
   ],
   "source": [
    "# Extract the 4000 most highly variable genes ('cell_ranger' flavor).\n",
    "# subset=False: the genes are only annotated, the matrix is not reduced to them.\n",
    "sc.pp.filter_genes_dispersion(\n",
    "    adata,\n",
    "    flavor='cell_ranger',\n",
    "    n_top_genes=4000,\n",
    "    log=False,\n",
    "    subset=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 566,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "computing PCA\n",
      "    on highly variable genes\n",
      "    with n_comps=50\n",
      "    finished (0:00:03)\n",
      "computing neighbors\n",
      "    using 'X_pca' with n_pcs = 50\n",
      "    finished: added to `.uns['neighbors']`\n",
      "    `.obsp['distances']`, distances for each pair of neighbors\n",
      "    `.obsp['connectivities']`, weighted adjacency matrix (0:00:04)\n",
      "computing UMAP\n",
      "    finished: added\n",
      "    'X_umap', UMAP coordinates (adata.obsm) (0:00:18)\n"
     ]
    }
   ],
   "source": [
    "# Calculate the visualizations.\n",
    "\n",
    "# 50-component PCA restricted to the highly variable genes.\n",
    "sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')\n",
    "\n",
    "# Neighborhood graph, followed by the UMAP embedding.\n",
    "sc.pp.neighbors(adata)\n",
    "sc.tl.umap(adata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 568,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['B cell', 'B cell (Plasmocyte)', 'B cell(Plasmocyte)',\n",
       "       'Conventional dendritic cell', 'Dendritic cell', 'Distal tubule cell',\n",
       "       'Distal tubule cell_SLC12A3 high', 'Endothelial cell',\n",
       "       'Endothelial cell_EMCN high', 'Endothelial cell_IGFBP5 high',\n",
       "       'Epithelial_cell_NUPR1 high', 'Fenestrated endothelial cell_EMCN high',\n",
       "       'Fenestrated endothelial cell_SELE high', 'Fibroblast',\n",
       "       'Glomerular endothelial cell_AQP1 high', 'IC-tran-PC',\n",
       "       'Intercalated cell', 'Intercalated cell_SLC26A4 high',\n",
       "       'Intercalated cell_SPINK1 high', 'Kidney Epithelial cell',\n",
       "       'Loop of Henle (Thick ascending limb)',\n",
       "       'Loop of Henle(Thick ascending limb)', 'Loop of henle _ANXA1 high',\n",
       "       'Loop of henle _KNG1 high', 'Loop of henle _UMOD high',\n",
       "       'Loop of henle_SFN high', 'Loop of henle_SLPI high',\n",
       "       'Loop of henle_SOD3 high', 'Loop of henle_SPP1 high', 'Macrophage',\n",
       "       'Macrophage_APOC1 high', 'Macrophage_GPR183 high', 'Mast cell',\n",
       "       'Myocyte', 'Myofibroblast', 'Neutrophil', 'Principle cell',\n",
       "       'Proximal tubule cell', 'Proximal tubule cell_ALDOB high',\n",
       "       'Proximal tubule cell_MT1G high', 'Proximal tubule cell_SOX4 high',\n",
       "       'Smooth muscle cell', 'T cell', 'Unknown', 'Ureteric Epithelial cell'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 568,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Make the annotations consistent across datasets: start `celltype` as a copy of the\n",
    "# dataset's specific labels and display the categories that have to be mapped.\n",
    "adata.obs['celltype'] = adata.obs['celltype_specific'].copy()\n",
    "adata.obs['celltype'].cat.categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 569,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disambiguate the second plasmocyte spelling by suffixing it with '_2'; every other\n",
    "# category keeps its name. The new names are derived from the current categories,\n",
    "# so there is no hard-coded list that has to match them in number and order.\n",
    "adata.rename_categories(\n",
    "    'celltype',\n",
    "    ['B cell(Plasmocyte)_2' if label == 'B cell(Plasmocyte)' else label\n",
    "     for label in adata.obs['celltype'].cat.categories],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 570,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Working copy of the cell type labels on which the fine-grained labels are merged.\n",
    "# The categories are taken from the column itself instead of a second hard-coded\n",
    "# list that would have to be kept in sync with it.\n",
    "ref_cluster = pd.Categorical(adata.obs['celltype'],\n",
    "                             categories=list(adata.obs['celltype'].cat.categories))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 571,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the fine-grained labels into coarse groups: each key is the label that is\n",
    "# kept, the list holds the labels that are folded into it.\n",
    "# NOTE(review): 'Glomerular endothelial cell_AQP1 high' is merged into the tubule\n",
    "# group rather than into 'Endothelial cell' - confirm that this is intended.\n",
    "merged_into = {\n",
    "    'B cell (Plasmocyte)': ['B cell(Plasmocyte)_2'],\n",
    "    'Dendritic cell': ['Conventional dendritic cell'],\n",
    "    'Endothelial cell': [\n",
    "        'Endothelial cell_EMCN high', 'Endothelial cell_IGFBP5 high',\n",
    "        'Fenestrated endothelial cell_EMCN high',\n",
    "        'Fenestrated endothelial cell_SELE high',\n",
    "    ],\n",
    "    'Distal tubule cell': [\n",
    "        'Distal tubule cell_SLC12A3 high', 'Glomerular endothelial cell_AQP1 high',\n",
    "        'Loop of Henle (Thick ascending limb)', 'Loop of Henle(Thick ascending limb)',\n",
    "        'Loop of henle _ANXA1 high', 'Loop of henle _KNG1 high',\n",
    "        'Loop of henle _UMOD high', 'Loop of henle_SFN high',\n",
    "        'Loop of henle_SLPI high', 'Loop of henle_SOD3 high',\n",
    "        'Loop of henle_SPP1 high',\n",
    "        'Proximal tubule cell', 'Proximal tubule cell_ALDOB high',\n",
    "        'Proximal tubule cell_MT1G high', 'Proximal tubule cell_SOX4 high',\n",
    "        'Kidney Epithelial cell', 'Epithelial_cell_NUPR1 high',\n",
    "    ],\n",
    "    'Intercalated cell': [\n",
    "        'Intercalated cell_SLC26A4 high', 'Intercalated cell_SPINK1 high',\n",
    "        'IC-tran-PC', 'Principle cell',\n",
    "    ],\n",
    "    'Macrophage': ['Macrophage_APOC1 high', 'Macrophage_GPR183 high'],\n",
    "    'Smooth muscle cell': ['Myocyte'],\n",
    "}\n",
    "for coarse_label, fine_labels in merged_into.items():\n",
    "    ix = np.isin(ref_cluster, fine_labels)\n",
    "    ref_cluster[ix] = coarse_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 572,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the merged labels as a categorical restricted to the coarse groups;\n",
    "# any label that is not in this list becomes NaN.\n",
    "coarse_celltypes = [\n",
    "    'B cell', 'B cell (Plasmocyte)', 'Dendritic cell', 'Distal tubule cell',\n",
    "    'Endothelial cell', 'Fibroblast', 'Intercalated cell', 'Macrophage',\n",
    "    'Mast cell', 'Myofibroblast', 'Neutrophil', 'Smooth muscle cell',\n",
    "    'T cell', 'Unknown', 'Ureteric Epithelial cell',\n",
    "]\n",
    "adata.obs['celltype'] = pd.Categorical(ref_cluster, categories=coarse_celltypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 573,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final cell type names, written as an explicit old-to-new mapping so that the\n",
    "# renaming does not depend on category order. Labels without an entry are kept\n",
    "# unchanged, which also makes re-running the cell a no-op.\n",
    "celltype_names = {\n",
    "    'B cell': 'B cells',\n",
    "    'B cell (Plasmocyte)': 'Plasma cells',\n",
    "    'Dendritic cell': 'Dendritic cells',\n",
    "    'Distal tubule cell': 'Tubule cells',\n",
    "    'Endothelial cell': 'Endothelial cells',\n",
    "    'Fibroblast': 'Fibroblast cells',\n",
    "    'Intercalated cell': 'Collecting duct system cells',\n",
    "    'Macrophage': 'Macrophages',\n",
    "    'Mast cell': 'Mast cells',\n",
    "    'Myofibroblast': 'Myofibroblast cells',\n",
    "    'Neutrophil': 'Neutrophils',\n",
    "    'Smooth muscle cell': 'Smooth muscle cells',\n",
    "    'T cell': 'T cells',\n",
    "    'Unknown': 'Unknown',\n",
    "    'Ureteric Epithelial cell': 'Ureteric epithelial cells',\n",
    "}\n",
    "adata.rename_categories(\n",
    "    'celltype',\n",
    "    [celltype_names.get(label, label) for label in adata.obs['celltype'].cat.categories],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 577,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonize the tissue label with an explicit old-to-new mapping instead of a\n",
    "# positional list: the result does not depend on category order, labels without\n",
    "# an entry are kept unchanged, and re-running the cell is a no-op.\n",
    "sub_tissue_labels = {'AdultKidney': 'Kidney'}\n",
    "adata.rename_categories(\n",
    "    'sub_tissue',\n",
    "    [sub_tissue_labels.get(label, label) for label in adata.obs['sub_tissue'].cat.categories],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 578,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonize the sex label with an explicit old-to-new mapping instead of a\n",
    "# positional list: the result does not depend on category order, labels without\n",
    "# an entry are kept unchanged, and re-running the cell is a no-op.\n",
    "sex_labels = {'male': 'Male'}\n",
    "adata.rename_categories(\n",
    "    'sex',\n",
    "    [sex_labels.get(label, label) for label in adata.obs['sex'].cat.categories],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 579,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip the 'Y' suffix from the age labels with an explicit old-to-new mapping\n",
    "# instead of a positional list: the result does not depend on category order, labels\n",
    "# without an entry are kept unchanged, and re-running the cell is a no-op.\n",
    "age_labels = {'41Y': '41', '57Y': '57', '66Y': '66'}\n",
    "adata.rename_categories(\n",
    "    'age',\n",
    "    [age_labels.get(label, label) for label in adata.obs['age'].cat.categories],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 580,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prefix the donor labels with 'Han-' using an explicit old-to-new mapping instead\n",
    "# of a positional list: the result does not depend on category order, labels without\n",
    "# an entry are kept unchanged, and re-running the cell is a no-op.\n",
    "donor_labels = {\n",
    "    'Donor34': 'Han-Donor34',\n",
    "    'Donor36': 'Han-Donor36',\n",
    "    'Donor37': 'Han-Donor37',\n",
    "}\n",
    "adata.rename_categories(\n",
    "    'donor',\n",
    "    [donor_labels.get(label, label) for label in adata.obs['donor'].cat.categories],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 581,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Harmonized per-cell metadata for dataset 15-4. Entries are written to adata.obs\n",
    "# in the order listed; information that is not available is stored as the string 'NaN'.\n",
    "harmonized_obs = {\n",
    "    # dataset / tissue\n",
    "    'Organ': 'Kidney',\n",
    "    'Organ_Specific': adata.obs['sub_tissue'],\n",
    "    'Dataset': 'Han_Kidney',\n",
    "    'InternDatasetNumber': '15-4-Kidney-Han-2020',\n",
    "    'Dataset_status': 'Healthy_Dataset',\n",
    "    # cell identity\n",
    "    'celltype': adata.obs['celltype'],  # already present, re-assigned unchanged\n",
    "    'sub_celltype': 'NaN',\n",
    "    'Malignant': 'NonMalignant',\n",
    "    # donor\n",
    "    'Patient': adata.obs['donor'],\n",
    "    'Patient_Number': 'NaN',\n",
    "    'age': adata.obs['age'],  # already present, re-assigned unchanged\n",
    "    'sex': adata.obs['sex'],  # already present, re-assigned unchanged\n",
    "    'ethnicity': 'NaN',\n",
    "    'health_status': 'NaN',\n",
    "    # annotations as provided with the dataset\n",
    "    'original_celltype_1': adata.obs['celltype_specific'],\n",
    "    'original_celltype_2': adata.obs['celltype_global'],\n",
    "    'original_celltype_3': 'NaN',\n",
    "}\n",
    "for column, value in harmonized_obs.items():\n",
    "    adata.obs[column] = value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 583,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the (dense) expression matrix back to a SciPy CSR sparse matrix.\n",
    "adata.X = sp.sparse.csr_matrix(adata.X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 584,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the processed dataset under `writepath`.\n",
    "processed_file = writepath + '15-4-Kidney-Han-2020-processed.h5ad'\n",
    "adata.write(processed_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}