This notebook processes the AML dataset obtained from GEO (GSE116256) and saves it as a single count matrix.
Input (compressed GEO downloads): /scMEDAL_for_scRNAseq/Experiments/data/AML_data/zip_files
Output (merged count matrix): /scMEDAL_for_scRNAseq/Experiments/data/AML_data/adata_merged
Environment: preprocess_and_plot_umaps_env
import sys
# Add the parent directory to the Python path
sys.path.append("../")
import os
# Now you can import from the parent directory
from paths_config import data_base_path
from scMEDAL.utils.preprocessing_utils import AML_data_reader
from scMEDAL.utils.utils import save_adata
data_base_path: /endosome/archive/bioinformatics/DLLab/src/AixaAndrade/gitfront/scMEDAL_for_scRNAseq/Experiments/AML/../data/AML_data
outputs_path: /endosome/archive/bioinformatics/DLLab/src/AixaAndrade/gitfront/scMEDAL_for_scRNAseq/Experiments/AML/../outputs/AML_outputs
# I downloaded the dataset count matrix and annotations from: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116256
# I saved the compressed files under /scMEDAL_for_scRNAseq/Experiments/data/AML_data/zip_files
# Directory holding the compressed count-matrix and annotation files downloaded from GEO.
parent_path = os.path.join(data_base_path, "zip_files")
# 1. Create the reader over the download directory.
AML_reader = AML_data_reader(parent_path)
# Table of input files and sample metadata; in the recorded output it has one row per
# sample with matrix_path, anno_path, id, Day, unique_id and Patient_group columns.
df_paths = AML_reader.get_df_paths()
# Bare expression so the notebook displays the table.
df_paths
id Patient_group counts
0 AML1012 AML 1
1 AML210A AML 1
2 AML314 AML 2
3 AML328 AML 4
4 AML329 AML 3
5 AML371 AML 2
6 AML419A AML 1
7 AML420B AML 3
8 AML475 AML 2
9 AML556 AML 3
10 AML707B AML 5
11 AML722B AML 2
12 AML870 AML 2
13 AML916 AML 1
14 AML921A AML 1
15 AML997 AML 2
16 BM1 control 1
17 BM2 control 1
18 BM3 control 1
19 BM4 control 1
20 BM5 control 2
21 MUTZ3 cellline 1
22 OCI cellline 1
matrix_path | id | file_note | accession_matrix_num | anno_path | accession_anno_num | Day | unique_id | Patient_group | |
---|---|---|---|---|---|---|---|---|---|
0 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML328 | D0 | GSM3587931 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587932 | D0 | AML328_D0 | AML |
1 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML420B | D14 | GSM3587955 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587956 | D14 | AML420B_D14 | AML |
2 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML314 | D0 | GSM3587927 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587928 | D0 | AML314_D0 | AML |
3 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML556 | D15 | GSM3587965 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587966 | D15 | AML556_D15 | AML |
4 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML314 | D31 | GSM3587929 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587930 | D31 | AML314_D31 | AML |
5 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML371 | D34 | GSM3587948 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587949 | D34 | AML371_D34 | AML |
6 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML210A | D0 | GSM3587925 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587926 | D0 | AML210A_D0 | AML |
7 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML707B | D41 | GSM3587975 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587976 | D41 | AML707B_D41 | AML |
8 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML475 | D0 | GSM3587959 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587960 | D0 | AML475_D0 | AML |
9 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML707B | D113 | GSM3587971 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587972 | D113 | AML707B_D113 | AML |
10 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML328 | D113 | GSM3587933 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587934 | D113 | AML328_D113 | AML |
11 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML916 | D0 | GSM3587988 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587989 | D0 | AML916_D0 | AML |
12 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML329 | D37 | GSM3587944 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587945 | D37 | AML329_D37 | AML |
13 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML707B | D18 | GSM3587973 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587974 | D18 | AML707B_D18 | AML |
14 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML556 | D0 | GSM3587963 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587964 | D0 | AML556_D0 | AML |
15 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML997 | D0 | GSM3587992 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587993 | D0 | AML997_D0 | AML |
16 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML722B | D0 | GSM3587980 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587981 | D0 | AML722B_D0 | AML |
17 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML707B | D97 | GSM3587977 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587978 | D97 | AML707B_D97 | AML |
18 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML1012 | D0 | GSM3587923 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587924 | D0 | AML1012_D0 | AML |
19 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML870 | D0 | GSM3587984 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587985 | D0 | AML870_D0 | AML |
20 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML329 | D20 | GSM3587942 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587943 | D20 | AML329_D20 | AML |
21 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML419A | D0 | GSM3587950 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587951 | D0 | AML419A_D0 | AML |
22 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML707B | D0 | GSM3587969 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587970 | D0 | AML707B_D0 | AML |
23 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML329 | D0 | GSM3587940 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587941 | D0 | AML329_D0 | AML |
24 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML556 | D31 | GSM3587967 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587968 | D31 | AML556_D31 | AML |
25 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML870 | D14 | GSM3587986 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587987 | D14 | AML870_D14 | AML |
26 | /endosome/archive/bioinformatics/DLLab/src/Aix... | BM4 | BM4 | GSM3588000 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3588001 | NaN | BM4 | control |
27 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML475 | D29 | GSM3587961 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587962 | D29 | AML475_D29 | AML |
28 | /endosome/archive/bioinformatics/DLLab/src/Aix... | OCI | NaN | GSM3588005 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3588005 | NaN | OCI | cellline |
29 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML997 | D35 | GSM3587994 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587995 | D35 | AML997_D35 | AML |
30 | /endosome/archive/bioinformatics/DLLab/src/Aix... | BM2 | BM2 | GSM3587997 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587997 | NaN | BM2 | control |
31 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML328 | D29 | GSM3587937 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587938 | D29 | AML328_D29 | AML |
32 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML420B | D35 | GSM3587957 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587958 | D35 | AML420B_D35 | AML |
33 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML921A | D0 | GSM3587990 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587991 | D0 | AML921A_D0 | AML |
34 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML420B | D0 | GSM3587953 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587954 | D0 | AML420B_D0 | AML |
35 | /endosome/archive/bioinformatics/DLLab/src/Aix... | BM3 | BM3 | GSM3587998 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587999 | NaN | BM3 | control |
36 | /endosome/archive/bioinformatics/DLLab/src/Aix... | BM1 | BM1 | GSM3587996 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587996 | NaN | BM1 | control |
37 | /endosome/archive/bioinformatics/DLLab/src/Aix... | BM5 | BM5-34p38n | GSM3588003 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3588003 | NaN | BM5-34p38n | control |
38 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML371 | D0 | GSM3587946 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587947 | D0 | AML371_D0 | AML |
39 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML722B | D49 | GSM3587982 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587983 | D49 | AML722B_D49 | AML |
40 | /endosome/archive/bioinformatics/DLLab/src/Aix... | MUTZ3 | NaN | GSM3588004 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3588004 | NaN | MUTZ3 | cellline |
41 | /endosome/archive/bioinformatics/DLLab/src/Aix... | BM5 | BM5-34p | GSM3588002 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3588002 | NaN | BM5-34p | control |
42 | /endosome/archive/bioinformatics/DLLab/src/Aix... | AML328 | D171 | GSM3587935 | /endosome/archive/bioinformatics/DLLab/src/Aix... | GSM3587936 | D171 | AML328_D171 | AML |
# 2. Build a dict of AnnData objects from the paths table.
# The recorded run reports 43 objects, matching the 43 rows of df_paths
# (presumably one AnnData per sample file pair; verify against create_adata_dict).
adata_dict = AML_reader.create_adata_dict(df_paths)
print(f"Created {len(adata_dict)} AnnData objects.")
/archive/bioinformatics/DLLab/shared/CondaEnvironments/Aixa_scDML/lib/python3.8/site-packages/anndata/_core/anndata.py:120: ImplicitModificationWarning: Transforming to str index.
warnings.warn("Transforming to str index.", ImplicitModificationWarning)
Created 43 AnnData objects.
# 3. Merge the AnnData objects in adata_dict into a single AnnData
# (n_obs × n_vars = 41090 × 27899 in the recorded run).
merged_adata = AML_reader.merge_adata_objects(adata_dict)
# Bare expression so the notebook displays the AnnData summary.
merged_adata
AnnData object with n_obs × n_vars = 41090 × 27899
obs: 'Cell', 'NumberOfReads', 'AlignedToGenome', 'AlignedToTranscriptome', 'TranscriptomeUMIs', 'NumberOfGenes', 'CyclingScore', 'CyclingBinary', 'MutTranscripts', 'WtTranscripts', 'PredictionRF2', 'PredictionRefined', 'CellType', 'Score_HSC', 'Score_Prog', 'Score_GMP', 'Score_ProMono', 'Score_Mono', 'Score_cDC', 'Score_pDC', 'Score_earlyEry', 'Score_lateEry', 'Score_ProB', 'Score_B', 'Score_Plasma', 'Score_T', 'Score_CTL', 'Score_NK', 'NanoporeTranscripts', 'id', 'Day', 'unique_id', 'Patient_group'
var: 'Gene'
# Display the obs table of the merged object (41090 rows × 33 columns in the recorded run).
merged_adata.obs
Cell | NumberOfReads | AlignedToGenome | AlignedToTranscriptome | TranscriptomeUMIs | NumberOfGenes | CyclingScore | CyclingBinary | MutTranscripts | WtTranscripts | ... | Score_B | Score_Plasma | Score_T | Score_CTL | Score_NK | NanoporeTranscripts | id | Day | unique_id | Patient_group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AML328-D0_AAAAACAGAAGT | 24994 | 15391 | 7477 | 1236 | 581 | -0.351 | no | NaN | NaN | ... | 0.042 | 0.009 | 0.132 | 0.184 | 0.447 | NaN | AML328 | D0 | AML328_D0 | AML |
1 | AML328-D0_AAAACCGCTACT | 55122 | 34633 | 17252 | 3394 | 1238 | -0.409 | no | NaN | NaN | ... | 0.071 | 0.020 | 0.070 | 0.052 | 0.037 | NaN | AML328 | D0 | AML328_D0 | AML |
2 | AML328-D0_AAAACCGGCTTT | 43393 | 26813 | 16148 | 2649 | 1243 | -0.401 | no | NaN | NaN | ... | 0.062 | 0.052 | 0.046 | 0.032 | 0.034 | NaN | AML328 | D0 | AML328_D0 | AML |
3 | AML328-D0_AAAAGCTTATCA | 25085 | 15404 | 9483 | 1582 | 633 | -0.378 | no | NaN | NaN | ... | 0.060 | 0.009 | 0.491 | 0.167 | 0.080 | NaN | AML328 | D0 | AML328_D0 | AML |
4 | AML328-D0_AAAAGTCCCCGT | 54911 | 33226 | 20545 | 3280 | 1376 | -0.629 | no | NaN | NaN | ... | 0.012 | 0.003 | 0.006 | 0.010 | 0.009 | NaN | AML328 | D0 | AML328_D0 | AML |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
41085 | AML328-D171_CCATCATCCACC | 26414 | 19298 | 13674 | 2516 | 939 | -0.257 | no | NaN | NaN | ... | 0.058 | 0.016 | 0.464 | 0.135 | 0.109 | NaN | AML328 | D171 | AML328_D171 | AML |
41086 | AML328-D171_TTTTATCATTCT | 27460 | 20073 | 9433 | 1651 | 878 | -0.524 | no | NaN | NaN | ... | 0.066 | 0.023 | 0.281 | 0.226 | 0.213 | NaN | AML328 | D171 | AML328_D171 | AML |
41087 | AML328-D171_AAGATGTAGCGT | 12394 | 8787 | 6543 | 1331 | 504 | -0.348 | no | NaN | NaN | ... | 0.088 | 0.019 | 0.136 | 0.074 | 0.041 | NaN | AML328 | D171 | AML328_D171 | AML |
41088 | AML328-D171_CTGTAGCTCCTA | 19172 | 13904 | 10351 | 1792 | 745 | -0.394 | no | NaN | NaN | ... | 0.107 | 0.015 | 0.121 | 0.045 | 0.052 | NaN | AML328 | D171 | AML328_D171 | AML |
41089 | AML328-D171_GATTTGGACGTT | 15912 | 11652 | 6120 | 1153 | 580 | -0.310 | no | NaN | NaN | ... | 0.077 | 0.020 | 0.365 | 0.138 | 0.067 | NaN | AML328 | D171 | AML328_D171 | AML |
41090 rows × 33 columns
# Set save_data to True to write the merged AnnData to disk; it is off by default so
# re-running the notebook does not overwrite a previously saved result.
# The merged count matrix is saved under /scMEDAL_for_scRNAseq/Experiments/data/AML_data/adata_merged
save_data = False
if save_data:
    # Build the output directory with os.path.join rather than string concatenation,
    # so the result is well-formed whether or not data_base_path ends with a separator.
    save_adata(merged_adata, output_path=os.path.join(data_base_path, "adata_merged"))
Created folder: /archive/bioinformatics/DLLab/AixaAndrade/data/Genomic_data/VanGallen_2019/adata_merged