{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## AML Data Processing Notebook\n", "\n", "This notebook processes the AML dataset obtained from [GEO (GSE116256)](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116256) and saves it as a single count matrix.\n", "\n", "### Data Sources\n", "- The count matrix and annotations were downloaded from [GEO (GSE116256)](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116256).\n", "- The compressed files are stored in: \n", " `/scMEDAL_for_scRNAseq/Experiments/data/AML_data/zip_files`\n", "- The processed count matrix is saved in: \n", " `/scMEDAL_for_scRNAseq/Experiments/data/AML_data/adata_merged`\n", "\n", "\n", "Environment: preprocess_and_plot_umaps_env\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data_base_path: /endosome/archive/bioinformatics/DLLab/src/AixaAndrade/gitfront/scMEDAL_for_scRNAseq/Experiments/AML/../data/AML_data\n", "outputs_path: /endosome/archive/bioinformatics/DLLab/src/AixaAndrade/gitfront/scMEDAL_for_scRNAseq/Experiments/AML/../outputs/AML_outputs\n" ] } ], "source": [ "import sys\n", "# Add the parent directory to the Python path\n", "sys.path.append(\"../\")\n", "import os\n", "# Now you can import from the parent directory\n", "from paths_config import data_base_path\n", "\n", "from scMEDAL.utils.preprocessing_utils import AML_data_reader\n", "from scMEDAL.utils.utils import save_adata" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " id Patient_group counts\n", "0 AML1012 AML 1\n", "1 AML210A AML 1\n", "2 AML314 AML 2\n", "3 AML328 AML 4\n", "4 AML329 AML 3\n", "5 AML371 AML 2\n", "6 AML419A AML 1\n", "7 AML420B AML 3\n", "8 AML475 AML 2\n", "9 AML556 AML 3\n", "10 AML707B AML 5\n", "11 AML722B AML 2\n", "12 AML870 AML 2\n", "13 AML916 AML 1\n", "14 AML921A AML 1\n", "15 AML997 
AML 2\n", "16 BM1 control 1\n", "17 BM2 control 1\n", "18 BM3 control 1\n", "19 BM4 control 1\n", "20 BM5 control 2\n", "21 MUTZ3 cellline 1\n", "22 OCI cellline 1\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
matrix_pathidfile_noteaccession_matrix_numanno_pathaccession_anno_numDayunique_idPatient_group
0/endosome/archive/bioinformatics/DLLab/src/Aix...AML328D0GSM3587931/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587932D0AML328_D0AML
1/endosome/archive/bioinformatics/DLLab/src/Aix...AML420BD14GSM3587955/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587956D14AML420B_D14AML
2/endosome/archive/bioinformatics/DLLab/src/Aix...AML314D0GSM3587927/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587928D0AML314_D0AML
3/endosome/archive/bioinformatics/DLLab/src/Aix...AML556D15GSM3587965/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587966D15AML556_D15AML
4/endosome/archive/bioinformatics/DLLab/src/Aix...AML314D31GSM3587929/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587930D31AML314_D31AML
5/endosome/archive/bioinformatics/DLLab/src/Aix...AML371D34GSM3587948/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587949D34AML371_D34AML
6/endosome/archive/bioinformatics/DLLab/src/Aix...AML210AD0GSM3587925/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587926D0AML210A_D0AML
7/endosome/archive/bioinformatics/DLLab/src/Aix...AML707BD41GSM3587975/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587976D41AML707B_D41AML
8/endosome/archive/bioinformatics/DLLab/src/Aix...AML475D0GSM3587959/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587960D0AML475_D0AML
9/endosome/archive/bioinformatics/DLLab/src/Aix...AML707BD113GSM3587971/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587972D113AML707B_D113AML
10/endosome/archive/bioinformatics/DLLab/src/Aix...AML328D113GSM3587933/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587934D113AML328_D113AML
11/endosome/archive/bioinformatics/DLLab/src/Aix...AML916D0GSM3587988/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587989D0AML916_D0AML
12/endosome/archive/bioinformatics/DLLab/src/Aix...AML329D37GSM3587944/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587945D37AML329_D37AML
13/endosome/archive/bioinformatics/DLLab/src/Aix...AML707BD18GSM3587973/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587974D18AML707B_D18AML
14/endosome/archive/bioinformatics/DLLab/src/Aix...AML556D0GSM3587963/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587964D0AML556_D0AML
15/endosome/archive/bioinformatics/DLLab/src/Aix...AML997D0GSM3587992/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587993D0AML997_D0AML
16/endosome/archive/bioinformatics/DLLab/src/Aix...AML722BD0GSM3587980/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587981D0AML722B_D0AML
17/endosome/archive/bioinformatics/DLLab/src/Aix...AML707BD97GSM3587977/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587978D97AML707B_D97AML
18/endosome/archive/bioinformatics/DLLab/src/Aix...AML1012D0GSM3587923/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587924D0AML1012_D0AML
19/endosome/archive/bioinformatics/DLLab/src/Aix...AML870D0GSM3587984/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587985D0AML870_D0AML
20/endosome/archive/bioinformatics/DLLab/src/Aix...AML329D20GSM3587942/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587943D20AML329_D20AML
21/endosome/archive/bioinformatics/DLLab/src/Aix...AML419AD0GSM3587950/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587951D0AML419A_D0AML
22/endosome/archive/bioinformatics/DLLab/src/Aix...AML707BD0GSM3587969/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587970D0AML707B_D0AML
23/endosome/archive/bioinformatics/DLLab/src/Aix...AML329D0GSM3587940/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587941D0AML329_D0AML
24/endosome/archive/bioinformatics/DLLab/src/Aix...AML556D31GSM3587967/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587968D31AML556_D31AML
25/endosome/archive/bioinformatics/DLLab/src/Aix...AML870D14GSM3587986/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587987D14AML870_D14AML
26/endosome/archive/bioinformatics/DLLab/src/Aix...BM4BM4GSM3588000/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3588001NaNBM4control
27/endosome/archive/bioinformatics/DLLab/src/Aix...AML475D29GSM3587961/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587962D29AML475_D29AML
28/endosome/archive/bioinformatics/DLLab/src/Aix...OCINaNGSM3588005/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3588005NaNOCIcellline
29/endosome/archive/bioinformatics/DLLab/src/Aix...AML997D35GSM3587994/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587995D35AML997_D35AML
30/endosome/archive/bioinformatics/DLLab/src/Aix...BM2BM2GSM3587997/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587997NaNBM2control
31/endosome/archive/bioinformatics/DLLab/src/Aix...AML328D29GSM3587937/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587938D29AML328_D29AML
32/endosome/archive/bioinformatics/DLLab/src/Aix...AML420BD35GSM3587957/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587958D35AML420B_D35AML
33/endosome/archive/bioinformatics/DLLab/src/Aix...AML921AD0GSM3587990/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587991D0AML921A_D0AML
34/endosome/archive/bioinformatics/DLLab/src/Aix...AML420BD0GSM3587953/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587954D0AML420B_D0AML
35/endosome/archive/bioinformatics/DLLab/src/Aix...BM3BM3GSM3587998/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587999NaNBM3control
36/endosome/archive/bioinformatics/DLLab/src/Aix...BM1BM1GSM3587996/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587996NaNBM1control
37/endosome/archive/bioinformatics/DLLab/src/Aix...BM5BM5-34p38nGSM3588003/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3588003NaNBM5-34p38ncontrol
38/endosome/archive/bioinformatics/DLLab/src/Aix...AML371D0GSM3587946/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587947D0AML371_D0AML
39/endosome/archive/bioinformatics/DLLab/src/Aix...AML722BD49GSM3587982/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587983D49AML722B_D49AML
40/endosome/archive/bioinformatics/DLLab/src/Aix...MUTZ3NaNGSM3588004/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3588004NaNMUTZ3cellline
41/endosome/archive/bioinformatics/DLLab/src/Aix...BM5BM5-34pGSM3588002/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3588002NaNBM5-34pcontrol
42/endosome/archive/bioinformatics/DLLab/src/Aix...AML328D171GSM3587935/endosome/archive/bioinformatics/DLLab/src/Aix...GSM3587936D171AML328_D171AML
\n", "
" ], "text/plain": [ " matrix_path id file_note \\\n", "0 /endosome/archive/bioinformatics/DLLab/src/Aix... AML328 D0 \n", "1 /endosome/archive/bioinformatics/DLLab/src/Aix... AML420B D14 \n", "2 /endosome/archive/bioinformatics/DLLab/src/Aix... AML314 D0 \n", "3 /endosome/archive/bioinformatics/DLLab/src/Aix... AML556 D15 \n", "4 /endosome/archive/bioinformatics/DLLab/src/Aix... AML314 D31 \n", "5 /endosome/archive/bioinformatics/DLLab/src/Aix... AML371 D34 \n", "6 /endosome/archive/bioinformatics/DLLab/src/Aix... AML210A D0 \n", "7 /endosome/archive/bioinformatics/DLLab/src/Aix... AML707B D41 \n", "8 /endosome/archive/bioinformatics/DLLab/src/Aix... AML475 D0 \n", "9 /endosome/archive/bioinformatics/DLLab/src/Aix... AML707B D113 \n", "10 /endosome/archive/bioinformatics/DLLab/src/Aix... AML328 D113 \n", "11 /endosome/archive/bioinformatics/DLLab/src/Aix... AML916 D0 \n", "12 /endosome/archive/bioinformatics/DLLab/src/Aix... AML329 D37 \n", "13 /endosome/archive/bioinformatics/DLLab/src/Aix... AML707B D18 \n", "14 /endosome/archive/bioinformatics/DLLab/src/Aix... AML556 D0 \n", "15 /endosome/archive/bioinformatics/DLLab/src/Aix... AML997 D0 \n", "16 /endosome/archive/bioinformatics/DLLab/src/Aix... AML722B D0 \n", "17 /endosome/archive/bioinformatics/DLLab/src/Aix... AML707B D97 \n", "18 /endosome/archive/bioinformatics/DLLab/src/Aix... AML1012 D0 \n", "19 /endosome/archive/bioinformatics/DLLab/src/Aix... AML870 D0 \n", "20 /endosome/archive/bioinformatics/DLLab/src/Aix... AML329 D20 \n", "21 /endosome/archive/bioinformatics/DLLab/src/Aix... AML419A D0 \n", "22 /endosome/archive/bioinformatics/DLLab/src/Aix... AML707B D0 \n", "23 /endosome/archive/bioinformatics/DLLab/src/Aix... AML329 D0 \n", "24 /endosome/archive/bioinformatics/DLLab/src/Aix... AML556 D31 \n", "25 /endosome/archive/bioinformatics/DLLab/src/Aix... AML870 D14 \n", "26 /endosome/archive/bioinformatics/DLLab/src/Aix... BM4 BM4 \n", "27 /endosome/archive/bioinformatics/DLLab/src/Aix... 
AML475 D29 \n", "28 /endosome/archive/bioinformatics/DLLab/src/Aix... OCI NaN \n", "29 /endosome/archive/bioinformatics/DLLab/src/Aix... AML997 D35 \n", "30 /endosome/archive/bioinformatics/DLLab/src/Aix... BM2 BM2 \n", "31 /endosome/archive/bioinformatics/DLLab/src/Aix... AML328 D29 \n", "32 /endosome/archive/bioinformatics/DLLab/src/Aix... AML420B D35 \n", "33 /endosome/archive/bioinformatics/DLLab/src/Aix... AML921A D0 \n", "34 /endosome/archive/bioinformatics/DLLab/src/Aix... AML420B D0 \n", "35 /endosome/archive/bioinformatics/DLLab/src/Aix... BM3 BM3 \n", "36 /endosome/archive/bioinformatics/DLLab/src/Aix... BM1 BM1 \n", "37 /endosome/archive/bioinformatics/DLLab/src/Aix... BM5 BM5-34p38n \n", "38 /endosome/archive/bioinformatics/DLLab/src/Aix... AML371 D0 \n", "39 /endosome/archive/bioinformatics/DLLab/src/Aix... AML722B D49 \n", "40 /endosome/archive/bioinformatics/DLLab/src/Aix... MUTZ3 NaN \n", "41 /endosome/archive/bioinformatics/DLLab/src/Aix... BM5 BM5-34p \n", "42 /endosome/archive/bioinformatics/DLLab/src/Aix... AML328 D171 \n", "\n", " accession_matrix_num anno_path \\\n", "0 GSM3587931 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "1 GSM3587955 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "2 GSM3587927 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "3 GSM3587965 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "4 GSM3587929 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "5 GSM3587948 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "6 GSM3587925 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "7 GSM3587975 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "8 GSM3587959 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "9 GSM3587971 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "10 GSM3587933 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "11 GSM3587988 /endosome/archive/bioinformatics/DLLab/src/Aix... 
\n", "12 GSM3587944 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "13 GSM3587973 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "14 GSM3587963 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "15 GSM3587992 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "16 GSM3587980 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "17 GSM3587977 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "18 GSM3587923 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "19 GSM3587984 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "20 GSM3587942 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "21 GSM3587950 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "22 GSM3587969 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "23 GSM3587940 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "24 GSM3587967 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "25 GSM3587986 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "26 GSM3588000 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "27 GSM3587961 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "28 GSM3588005 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "29 GSM3587994 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "30 GSM3587997 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "31 GSM3587937 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "32 GSM3587957 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "33 GSM3587990 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "34 GSM3587953 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "35 GSM3587998 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "36 GSM3587996 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "37 GSM3588003 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "38 GSM3587946 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "39 GSM3587982 /endosome/archive/bioinformatics/DLLab/src/Aix... 
\n", "40 GSM3588004 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "41 GSM3588002 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "42 GSM3587935 /endosome/archive/bioinformatics/DLLab/src/Aix... \n", "\n", " accession_anno_num Day unique_id Patient_group \n", "0 GSM3587932 D0 AML328_D0 AML \n", "1 GSM3587956 D14 AML420B_D14 AML \n", "2 GSM3587928 D0 AML314_D0 AML \n", "3 GSM3587966 D15 AML556_D15 AML \n", "4 GSM3587930 D31 AML314_D31 AML \n", "5 GSM3587949 D34 AML371_D34 AML \n", "6 GSM3587926 D0 AML210A_D0 AML \n", "7 GSM3587976 D41 AML707B_D41 AML \n", "8 GSM3587960 D0 AML475_D0 AML \n", "9 GSM3587972 D113 AML707B_D113 AML \n", "10 GSM3587934 D113 AML328_D113 AML \n", "11 GSM3587989 D0 AML916_D0 AML \n", "12 GSM3587945 D37 AML329_D37 AML \n", "13 GSM3587974 D18 AML707B_D18 AML \n", "14 GSM3587964 D0 AML556_D0 AML \n", "15 GSM3587993 D0 AML997_D0 AML \n", "16 GSM3587981 D0 AML722B_D0 AML \n", "17 GSM3587978 D97 AML707B_D97 AML \n", "18 GSM3587924 D0 AML1012_D0 AML \n", "19 GSM3587985 D0 AML870_D0 AML \n", "20 GSM3587943 D20 AML329_D20 AML \n", "21 GSM3587951 D0 AML419A_D0 AML \n", "22 GSM3587970 D0 AML707B_D0 AML \n", "23 GSM3587941 D0 AML329_D0 AML \n", "24 GSM3587968 D31 AML556_D31 AML \n", "25 GSM3587987 D14 AML870_D14 AML \n", "26 GSM3588001 NaN BM4 control \n", "27 GSM3587962 D29 AML475_D29 AML \n", "28 GSM3588005 NaN OCI cellline \n", "29 GSM3587995 D35 AML997_D35 AML \n", "30 GSM3587997 NaN BM2 control \n", "31 GSM3587938 D29 AML328_D29 AML \n", "32 GSM3587958 D35 AML420B_D35 AML \n", "33 GSM3587991 D0 AML921A_D0 AML \n", "34 GSM3587954 D0 AML420B_D0 AML \n", "35 GSM3587999 NaN BM3 control \n", "36 GSM3587996 NaN BM1 control \n", "37 GSM3588003 NaN BM5-34p38n control \n", "38 GSM3587947 D0 AML371_D0 AML \n", "39 GSM3587983 D49 AML722B_D49 AML \n", "40 GSM3588004 NaN MUTZ3 cellline \n", "41 GSM3588002 NaN BM5-34p control \n", "42 GSM3587936 D171 AML328_D171 AML " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "\n", "# I downloaded the dataset count matrix and annotations from: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116256\n", "# I saved the compressed files under /scMEDAL_for_scRNAseq/Experiments/data/AML_data/zip_files\n", "# Path to the directory containing zip files\n", "parent_path = os.path.join(data_base_path, \"zip_files\")\n", "# 1.Read adata\n", "\n", "\n", "AML_reader = AML_data_reader(parent_path)\n", "# Get df_paths\n", "df_paths = AML_reader.get_df_paths()\n", "df_paths\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/archive/bioinformatics/DLLab/shared/CondaEnvironments/Aixa_scDML/lib/python3.8/site-packages/anndata/_core/anndata.py:120: ImplicitModificationWarning: Transforming to str index.\n", " warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Created 43 AnnData objects.\n" ] } ], "source": [ "# Create a dict of adata objects\n", "adata_dict = AML_reader.create_adata_dict(df_paths)\n", "print(f\"Created {len(adata_dict)} AnnData objects.\")\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "merged_adata = AML_reader.merge_adata_objects(adata_dict)\n", "\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnnData object with n_obs × n_vars = 41090 × 27899\n", " obs: 'Cell', 'NumberOfReads', 'AlignedToGenome', 'AlignedToTranscriptome', 'TranscriptomeUMIs', 'NumberOfGenes', 'CyclingScore', 'CyclingBinary', 'MutTranscripts', 'WtTranscripts', 'PredictionRF2', 'PredictionRefined', 'CellType', 'Score_HSC', 'Score_Prog', 'Score_GMP', 'Score_ProMono', 'Score_Mono', 'Score_cDC', 'Score_pDC', 'Score_earlyEry', 'Score_lateEry', 'Score_ProB', 'Score_B', 'Score_Plasma', 'Score_T', 'Score_CTL', 'Score_NK', 'NanoporeTranscripts', 'id', 'Day', 'unique_id', 
'Patient_group'\n", " var: 'Gene'" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_adata" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CellNumberOfReadsAlignedToGenomeAlignedToTranscriptomeTranscriptomeUMIsNumberOfGenesCyclingScoreCyclingBinaryMutTranscriptsWtTranscripts...Score_BScore_PlasmaScore_TScore_CTLScore_NKNanoporeTranscriptsidDayunique_idPatient_group
0AML328-D0_AAAAACAGAAGT249941539174771236581-0.351noNaNNaN...0.0420.0090.1320.1840.447NaNAML328D0AML328_D0AML
1AML328-D0_AAAACCGCTACT55122346331725233941238-0.409noNaNNaN...0.0710.0200.0700.0520.037NaNAML328D0AML328_D0AML
2AML328-D0_AAAACCGGCTTT43393268131614826491243-0.401noNaNNaN...0.0620.0520.0460.0320.034NaNAML328D0AML328_D0AML
3AML328-D0_AAAAGCTTATCA250851540494831582633-0.378noNaNNaN...0.0600.0090.4910.1670.080NaNAML328D0AML328_D0AML
4AML328-D0_AAAAGTCCCCGT54911332262054532801376-0.629noNaNNaN...0.0120.0030.0060.0100.009NaNAML328D0AML328_D0AML
..................................................................
41085AML328-D171_CCATCATCCACC2641419298136742516939-0.257noNaNNaN...0.0580.0160.4640.1350.109NaNAML328D171AML328_D171AML
41086AML328-D171_TTTTATCATTCT274602007394331651878-0.524noNaNNaN...0.0660.0230.2810.2260.213NaNAML328D171AML328_D171AML
41087AML328-D171_AAGATGTAGCGT12394878765431331504-0.348noNaNNaN...0.0880.0190.1360.0740.041NaNAML328D171AML328_D171AML
41088AML328-D171_CTGTAGCTCCTA1917213904103511792745-0.394noNaNNaN...0.1070.0150.1210.0450.052NaNAML328D171AML328_D171AML
41089AML328-D171_GATTTGGACGTT159121165261201153580-0.310noNaNNaN...0.0770.0200.3650.1380.067NaNAML328D171AML328_D171AML
\n", "

41090 rows × 33 columns

\n", "
" ], "text/plain": [ " Cell NumberOfReads AlignedToGenome \\\n", "0 AML328-D0_AAAAACAGAAGT 24994 15391 \n", "1 AML328-D0_AAAACCGCTACT 55122 34633 \n", "2 AML328-D0_AAAACCGGCTTT 43393 26813 \n", "3 AML328-D0_AAAAGCTTATCA 25085 15404 \n", "4 AML328-D0_AAAAGTCCCCGT 54911 33226 \n", "... ... ... ... \n", "41085 AML328-D171_CCATCATCCACC 26414 19298 \n", "41086 AML328-D171_TTTTATCATTCT 27460 20073 \n", "41087 AML328-D171_AAGATGTAGCGT 12394 8787 \n", "41088 AML328-D171_CTGTAGCTCCTA 19172 13904 \n", "41089 AML328-D171_GATTTGGACGTT 15912 11652 \n", "\n", " AlignedToTranscriptome TranscriptomeUMIs NumberOfGenes CyclingScore \\\n", "0 7477 1236 581 -0.351 \n", "1 17252 3394 1238 -0.409 \n", "2 16148 2649 1243 -0.401 \n", "3 9483 1582 633 -0.378 \n", "4 20545 3280 1376 -0.629 \n", "... ... ... ... ... \n", "41085 13674 2516 939 -0.257 \n", "41086 9433 1651 878 -0.524 \n", "41087 6543 1331 504 -0.348 \n", "41088 10351 1792 745 -0.394 \n", "41089 6120 1153 580 -0.310 \n", "\n", " CyclingBinary MutTranscripts WtTranscripts ... Score_B Score_Plasma \\\n", "0 no NaN NaN ... 0.042 0.009 \n", "1 no NaN NaN ... 0.071 0.020 \n", "2 no NaN NaN ... 0.062 0.052 \n", "3 no NaN NaN ... 0.060 0.009 \n", "4 no NaN NaN ... 0.012 0.003 \n", "... ... ... ... ... ... ... \n", "41085 no NaN NaN ... 0.058 0.016 \n", "41086 no NaN NaN ... 0.066 0.023 \n", "41087 no NaN NaN ... 0.088 0.019 \n", "41088 no NaN NaN ... 0.107 0.015 \n", "41089 no NaN NaN ... 0.077 0.020 \n", "\n", " Score_T Score_CTL Score_NK NanoporeTranscripts id Day \\\n", "0 0.132 0.184 0.447 NaN AML328 D0 \n", "1 0.070 0.052 0.037 NaN AML328 D0 \n", "2 0.046 0.032 0.034 NaN AML328 D0 \n", "3 0.491 0.167 0.080 NaN AML328 D0 \n", "4 0.006 0.010 0.009 NaN AML328 D0 \n", "... ... ... ... ... ... ... 
\n", "41085 0.464 0.135 0.109 NaN AML328 D171 \n", "41086 0.281 0.226 0.213 NaN AML328 D171 \n", "41087 0.136 0.074 0.041 NaN AML328 D171 \n", "41088 0.121 0.045 0.052 NaN AML328 D171 \n", "41089 0.365 0.138 0.067 NaN AML328 D171 \n", "\n", " unique_id Patient_group \n", "0 AML328_D0 AML \n", "1 AML328_D0 AML \n", "2 AML328_D0 AML \n", "3 AML328_D0 AML \n", "4 AML328_D0 AML \n", "... ... ... \n", "41085 AML328_D171 AML \n", "41086 AML328_D171 AML \n", "41087 AML328_D171 AML \n", "41088 AML328_D171 AML \n", "41089 AML328_D171 AML \n", "\n", "[41090 rows x 33 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_adata.obs" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Created folder: /archive/bioinformatics/DLLab/AixaAndrade/data/Genomic_data/VanGallen_2019/adata_merged\n" ] } ], "source": [ "# Change to True to save adata\n", "# I saved the merged count matrix under /scMEDAL_for_scRNAseq/Experiments/data/AML_data/adata_merged\n", "save_data = False\n", "if save_data:\n", " #save merged adata\n", " save_adata(merged_adata,output_path=data_base_path+\"/adata_merged\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.18" }, "vscode": { "interpreter": { "hash": "a3db3b14d5711d9666b8e92978eba797bf5a2f0cdbf745a2f197400c14fa420c" } } }, "nbformat": 4, "nbformat_minor": 4 }