{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## AML Data Processing Notebook\n", "\n", "This notebook processes the AML dataset obtained from [GEO (GSE116256)](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116256) and saves it as a single count matrix.\n", "\n", "### Data Sources\n", "- The count matrix and annotations were downloaded from [GEO (GSE116256)](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116256).\n", "- The compressed files are stored in: \n", " `/scMEDAL_for_scRNAseq/Experiments/data/AML_data/zip_files`\n", "- The processed count matrix is saved in: \n", " `/scMEDAL_for_scRNAseq/Experiments/data/AML_data/adata_merged`\n", "\n", "\n", "Environment: preprocess_and_plot_umaps_env\n", "\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data_base_path: /endosome/archive/bioinformatics/DLLab/src/AixaAndrade/gitfront/scMEDAL_for_scRNAseq/Experiments/AML/../data/AML_data\n", "outputs_path: /endosome/archive/bioinformatics/DLLab/src/AixaAndrade/gitfront/scMEDAL_for_scRNAseq/Experiments/AML/../outputs/AML_outputs\n" ] } ], "source": [ "import sys\n", "# Add the parent directory to the Python path\n", "sys.path.append(\"../\")\n", "import os\n", "# Now you can import from the parent directory\n", "from paths_config import data_base_path\n", "\n", "from scMEDAL.utils.preprocessing_utils import AML_data_reader\n", "from scMEDAL.utils.utils import save_adata" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " id Patient_group counts\n", "0 AML1012 AML 1\n", "1 AML210A AML 1\n", "2 AML314 AML 2\n", "3 AML328 AML 4\n", "4 AML329 AML 3\n", "5 AML371 AML 2\n", "6 AML419A AML 1\n", "7 AML420B AML 3\n", "8 AML475 AML 2\n", "9 AML556 AML 3\n", "10 AML707B AML 5\n", "11 AML722B AML 2\n", "12 AML870 AML 2\n", "13 AML916 AML 1\n", "14 AML921A AML 1\n", "15 AML997 AML 2\n", "16 BM1 control 1\n", "17 BM2 control 1\n", "18 BM3 control 1\n", "19 BM4 control 1\n", "20 BM5 control 2\n", "21 MUTZ3 cellline 1\n", "22 OCI cellline 1\n" ] }, { "data": { "text/html": [ "
\n", " | matrix_path | \n", "id | \n", "file_note | \n", "accession_matrix_num | \n", "anno_path | \n", "accession_anno_num | \n", "Day | \n", "unique_id | \n", "Patient_group | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML328 | \n", "D0 | \n", "GSM3587931 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587932 | \n", "D0 | \n", "AML328_D0 | \n", "AML | \n", "
1 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML420B | \n", "D14 | \n", "GSM3587955 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587956 | \n", "D14 | \n", "AML420B_D14 | \n", "AML | \n", "
2 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML314 | \n", "D0 | \n", "GSM3587927 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587928 | \n", "D0 | \n", "AML314_D0 | \n", "AML | \n", "
3 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML556 | \n", "D15 | \n", "GSM3587965 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587966 | \n", "D15 | \n", "AML556_D15 | \n", "AML | \n", "
4 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML314 | \n", "D31 | \n", "GSM3587929 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587930 | \n", "D31 | \n", "AML314_D31 | \n", "AML | \n", "
5 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML371 | \n", "D34 | \n", "GSM3587948 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587949 | \n", "D34 | \n", "AML371_D34 | \n", "AML | \n", "
6 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML210A | \n", "D0 | \n", "GSM3587925 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587926 | \n", "D0 | \n", "AML210A_D0 | \n", "AML | \n", "
7 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML707B | \n", "D41 | \n", "GSM3587975 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587976 | \n", "D41 | \n", "AML707B_D41 | \n", "AML | \n", "
8 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML475 | \n", "D0 | \n", "GSM3587959 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587960 | \n", "D0 | \n", "AML475_D0 | \n", "AML | \n", "
9 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML707B | \n", "D113 | \n", "GSM3587971 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587972 | \n", "D113 | \n", "AML707B_D113 | \n", "AML | \n", "
10 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML328 | \n", "D113 | \n", "GSM3587933 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587934 | \n", "D113 | \n", "AML328_D113 | \n", "AML | \n", "
11 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML916 | \n", "D0 | \n", "GSM3587988 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587989 | \n", "D0 | \n", "AML916_D0 | \n", "AML | \n", "
12 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML329 | \n", "D37 | \n", "GSM3587944 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587945 | \n", "D37 | \n", "AML329_D37 | \n", "AML | \n", "
13 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML707B | \n", "D18 | \n", "GSM3587973 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587974 | \n", "D18 | \n", "AML707B_D18 | \n", "AML | \n", "
14 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML556 | \n", "D0 | \n", "GSM3587963 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587964 | \n", "D0 | \n", "AML556_D0 | \n", "AML | \n", "
15 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML997 | \n", "D0 | \n", "GSM3587992 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587993 | \n", "D0 | \n", "AML997_D0 | \n", "AML | \n", "
16 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML722B | \n", "D0 | \n", "GSM3587980 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587981 | \n", "D0 | \n", "AML722B_D0 | \n", "AML | \n", "
17 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML707B | \n", "D97 | \n", "GSM3587977 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587978 | \n", "D97 | \n", "AML707B_D97 | \n", "AML | \n", "
18 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML1012 | \n", "D0 | \n", "GSM3587923 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587924 | \n", "D0 | \n", "AML1012_D0 | \n", "AML | \n", "
19 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML870 | \n", "D0 | \n", "GSM3587984 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587985 | \n", "D0 | \n", "AML870_D0 | \n", "AML | \n", "
20 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML329 | \n", "D20 | \n", "GSM3587942 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587943 | \n", "D20 | \n", "AML329_D20 | \n", "AML | \n", "
21 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML419A | \n", "D0 | \n", "GSM3587950 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587951 | \n", "D0 | \n", "AML419A_D0 | \n", "AML | \n", "
22 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML707B | \n", "D0 | \n", "GSM3587969 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587970 | \n", "D0 | \n", "AML707B_D0 | \n", "AML | \n", "
23 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML329 | \n", "D0 | \n", "GSM3587940 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587941 | \n", "D0 | \n", "AML329_D0 | \n", "AML | \n", "
24 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML556 | \n", "D31 | \n", "GSM3587967 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587968 | \n", "D31 | \n", "AML556_D31 | \n", "AML | \n", "
25 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML870 | \n", "D14 | \n", "GSM3587986 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587987 | \n", "D14 | \n", "AML870_D14 | \n", "AML | \n", "
26 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "BM4 | \n", "BM4 | \n", "GSM3588000 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3588001 | \n", "NaN | \n", "BM4 | \n", "control | \n", "
27 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML475 | \n", "D29 | \n", "GSM3587961 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587962 | \n", "D29 | \n", "AML475_D29 | \n", "AML | \n", "
28 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "OCI | \n", "NaN | \n", "GSM3588005 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3588005 | \n", "NaN | \n", "OCI | \n", "cellline | \n", "
29 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML997 | \n", "D35 | \n", "GSM3587994 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587995 | \n", "D35 | \n", "AML997_D35 | \n", "AML | \n", "
30 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "BM2 | \n", "BM2 | \n", "GSM3587997 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587997 | \n", "NaN | \n", "BM2 | \n", "control | \n", "
31 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML328 | \n", "D29 | \n", "GSM3587937 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587938 | \n", "D29 | \n", "AML328_D29 | \n", "AML | \n", "
32 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML420B | \n", "D35 | \n", "GSM3587957 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587958 | \n", "D35 | \n", "AML420B_D35 | \n", "AML | \n", "
33 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML921A | \n", "D0 | \n", "GSM3587990 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587991 | \n", "D0 | \n", "AML921A_D0 | \n", "AML | \n", "
34 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML420B | \n", "D0 | \n", "GSM3587953 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587954 | \n", "D0 | \n", "AML420B_D0 | \n", "AML | \n", "
35 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "BM3 | \n", "BM3 | \n", "GSM3587998 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587999 | \n", "NaN | \n", "BM3 | \n", "control | \n", "
36 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "BM1 | \n", "BM1 | \n", "GSM3587996 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587996 | \n", "NaN | \n", "BM1 | \n", "control | \n", "
37 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "BM5 | \n", "BM5-34p38n | \n", "GSM3588003 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3588003 | \n", "NaN | \n", "BM5-34p38n | \n", "control | \n", "
38 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML371 | \n", "D0 | \n", "GSM3587946 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587947 | \n", "D0 | \n", "AML371_D0 | \n", "AML | \n", "
39 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML722B | \n", "D49 | \n", "GSM3587982 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587983 | \n", "D49 | \n", "AML722B_D49 | \n", "AML | \n", "
40 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "MUTZ3 | \n", "NaN | \n", "GSM3588004 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3588004 | \n", "NaN | \n", "MUTZ3 | \n", "cellline | \n", "
41 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "BM5 | \n", "BM5-34p | \n", "GSM3588002 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3588002 | \n", "NaN | \n", "BM5-34p | \n", "control | \n", "
42 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "AML328 | \n", "D171 | \n", "GSM3587935 | \n", "/endosome/archive/bioinformatics/DLLab/src/Aix... | \n", "GSM3587936 | \n", "D171 | \n", "AML328_D171 | \n", "AML | \n", "
\n", " | Cell | \n", "NumberOfReads | \n", "AlignedToGenome | \n", "AlignedToTranscriptome | \n", "TranscriptomeUMIs | \n", "NumberOfGenes | \n", "CyclingScore | \n", "CyclingBinary | \n", "MutTranscripts | \n", "WtTranscripts | \n", "... | \n", "Score_B | \n", "Score_Plasma | \n", "Score_T | \n", "Score_CTL | \n", "Score_NK | \n", "NanoporeTranscripts | \n", "id | \n", "Day | \n", "unique_id | \n", "Patient_group | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "AML328-D0_AAAAACAGAAGT | \n", "24994 | \n", "15391 | \n", "7477 | \n", "1236 | \n", "581 | \n", "-0.351 | \n", "no | \n", "NaN | \n", "NaN | \n", "... | \n", "0.042 | \n", "0.009 | \n", "0.132 | \n", "0.184 | \n", "0.447 | \n", "NaN | \n", "AML328 | \n", "D0 | \n", "AML328_D0 | \n", "AML | \n", "
1 | \n", "AML328-D0_AAAACCGCTACT | \n", "55122 | \n", "34633 | \n", "17252 | \n", "3394 | \n", "1238 | \n", "-0.409 | \n", "no | \n", "NaN | \n", "NaN | \n", "... | \n", "0.071 | \n", "0.020 | \n", "0.070 | \n", "0.052 | \n", "0.037 | \n", "NaN | \n", "AML328 | \n", "D0 | \n", "AML328_D0 | \n", "AML | \n", "
2 | \n", "AML328-D0_AAAACCGGCTTT | \n", "43393 | \n", "26813 | \n", "16148 | \n", "2649 | \n", "1243 | \n", "-0.401 | \n", "no | \n", "NaN | \n", "NaN | \n", "... | \n", "0.062 | \n", "0.052 | \n", "0.046 | \n", "0.032 | \n", "0.034 | \n", "NaN | \n", "AML328 | \n", "D0 | \n", "AML328_D0 | \n", "AML | \n", "
3 | \n", "AML328-D0_AAAAGCTTATCA | \n", "25085 | \n", "15404 | \n", "9483 | \n", "1582 | \n", "633 | \n", "-0.378 | \n", "no | \n", "NaN | \n", "NaN | \n", "... | \n", "0.060 | \n", "0.009 | \n", "0.491 | \n", "0.167 | \n", "0.080 | \n", "NaN | \n", "AML328 | \n", "D0 | \n", "AML328_D0 | \n", "AML | \n", "
4 | \n", "AML328-D0_AAAAGTCCCCGT | \n", "54911 | \n", "33226 | \n", "20545 | \n", "3280 | \n", "1376 | \n", "-0.629 | \n", "no | \n", "NaN | \n", "NaN | \n", "... | \n", "0.012 | \n", "0.003 | \n", "0.006 | \n", "0.010 | \n", "0.009 | \n", "NaN | \n", "AML328 | \n", "D0 | \n", "AML328_D0 | \n", "AML | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
41085 | \n", "AML328-D171_CCATCATCCACC | \n", "26414 | \n", "19298 | \n", "13674 | \n", "2516 | \n", "939 | \n", "-0.257 | \n", "no | \n", "NaN | \n", "NaN | \n", "... | \n", "0.058 | \n", "0.016 | \n", "0.464 | \n", "0.135 | \n", "0.109 | \n", "NaN | \n", "AML328 | \n", "D171 | \n", "AML328_D171 | \n", "AML | \n", "
41086 | \n", "AML328-D171_TTTTATCATTCT | \n", "27460 | \n", "20073 | \n", "9433 | \n", "1651 | \n", "878 | \n", "-0.524 | \n", "no | \n", "NaN | \n", "NaN | \n", "... | \n", "0.066 | \n", "0.023 | \n", "0.281 | \n", "0.226 | \n", "0.213 | \n", "NaN | \n", "AML328 | \n", "D171 | \n", "AML328_D171 | \n", "AML | \n", "
41087 | \n", "AML328-D171_AAGATGTAGCGT | \n", "12394 | \n", "8787 | \n", "6543 | \n", "1331 | \n", "504 | \n", "-0.348 | \n", "no | \n", "NaN | \n", "NaN | \n", "... | \n", "0.088 | \n", "0.019 | \n", "0.136 | \n", "0.074 | \n", "0.041 | \n", "NaN | \n", "AML328 | \n", "D171 | \n", "AML328_D171 | \n", "AML | \n", "
41088 | \n", "AML328-D171_CTGTAGCTCCTA | \n", "19172 | \n", "13904 | \n", "10351 | \n", "1792 | \n", "745 | \n", "-0.394 | \n", "no | \n", "NaN | \n", "NaN | \n", "... | \n", "0.107 | \n", "0.015 | \n", "0.121 | \n", "0.045 | \n", "0.052 | \n", "NaN | \n", "AML328 | \n", "D171 | \n", "AML328_D171 | \n", "AML | \n", "
41089 | \n", "AML328-D171_GATTTGGACGTT | \n", "15912 | \n", "11652 | \n", "6120 | \n", "1153 | \n", "580 | \n", "-0.310 | \n", "no | \n", "NaN | \n", "NaN | \n", "... | \n", "0.077 | \n", "0.020 | \n", "0.365 | \n", "0.138 | \n", "0.067 | \n", "NaN | \n", "AML328 | \n", "D171 | \n", "AML328_D171 | \n", "AML | \n", "
41090 rows × 33 columns
\n", "