scMEDAL_for_scRNAseq / Experiments / AML / preprocessing / AML_data_reader.ipynb
AML_data_reader.ipynb
Raw

AML Data Processing Notebook

This notebook reads the AML dataset obtained from GEO (GSE116256), merges the per-sample count matrices and annotations into a single AnnData object, and optionally saves the merged object to disk.

Data Sources

  • The count matrix and annotations were downloaded from GEO (GSE116256).
  • The compressed files are stored in:
    /scMEDAL_for_scRNAseq/Experiments/data/AML_data/zip_files
  • The processed count matrix is saved in:
    /scMEDAL_for_scRNAseq/Experiments/data/AML_data/adata_merged

Environment: preprocess_and_plot_umaps_env

import sys
# Add the parent directory to the Python path
sys.path.append("../")
import os
# Now you can import from the parent directory
from paths_config import data_base_path

from scMEDAL.utils.preprocessing_utils import AML_data_reader
from scMEDAL.utils.utils import save_adata
data_base_path: /endosome/archive/bioinformatics/DLLab/src/AixaAndrade/gitfront/scMEDAL_for_scRNAseq/Experiments/AML/../data/AML_data
outputs_path: /endosome/archive/bioinformatics/DLLab/src/AixaAndrade/gitfront/scMEDAL_for_scRNAseq/Experiments/AML/../outputs/AML_outputs

# The dataset count matrix and annotations were downloaded from:
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE116256
# The compressed files are stored under
# /scMEDAL_for_scRNAseq/Experiments/data/AML_data/zip_files
# Path to the directory containing the zip files
parent_path = os.path.join(data_base_path, "zip_files")
# 1. Read adata


# Create the reader for the downloaded files located in parent_path
AML_reader = AML_data_reader(parent_path)
# Get df_paths: a table with the matrix/annotation file paths and sample metadata
# (presumably one row per count-matrix/annotation pair; confirm in AML_data_reader)
df_paths = AML_reader.get_df_paths()
# Display the table
df_paths

         id Patient_group  counts
0   AML1012           AML       1
1   AML210A           AML       1
2    AML314           AML       2
3    AML328           AML       4
4    AML329           AML       3
5    AML371           AML       2
6   AML419A           AML       1
7   AML420B           AML       3
8    AML475           AML       2
9    AML556           AML       3
10  AML707B           AML       5
11  AML722B           AML       2
12   AML870           AML       2
13   AML916           AML       1
14  AML921A           AML       1
15   AML997           AML       2
16      BM1       control       1
17      BM2       control       1
18      BM3       control       1
19      BM4       control       1
20      BM5       control       2
21    MUTZ3      cellline       1
22      OCI      cellline       1

matrix_path id file_note accession_matrix_num anno_path accession_anno_num Day unique_id Patient_group
0 /endosome/archive/bioinformatics/DLLab/src/Aix... AML328 D0 GSM3587931 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587932 D0 AML328_D0 AML
1 /endosome/archive/bioinformatics/DLLab/src/Aix... AML420B D14 GSM3587955 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587956 D14 AML420B_D14 AML
2 /endosome/archive/bioinformatics/DLLab/src/Aix... AML314 D0 GSM3587927 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587928 D0 AML314_D0 AML
3 /endosome/archive/bioinformatics/DLLab/src/Aix... AML556 D15 GSM3587965 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587966 D15 AML556_D15 AML
4 /endosome/archive/bioinformatics/DLLab/src/Aix... AML314 D31 GSM3587929 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587930 D31 AML314_D31 AML
5 /endosome/archive/bioinformatics/DLLab/src/Aix... AML371 D34 GSM3587948 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587949 D34 AML371_D34 AML
6 /endosome/archive/bioinformatics/DLLab/src/Aix... AML210A D0 GSM3587925 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587926 D0 AML210A_D0 AML
7 /endosome/archive/bioinformatics/DLLab/src/Aix... AML707B D41 GSM3587975 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587976 D41 AML707B_D41 AML
8 /endosome/archive/bioinformatics/DLLab/src/Aix... AML475 D0 GSM3587959 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587960 D0 AML475_D0 AML
9 /endosome/archive/bioinformatics/DLLab/src/Aix... AML707B D113 GSM3587971 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587972 D113 AML707B_D113 AML
10 /endosome/archive/bioinformatics/DLLab/src/Aix... AML328 D113 GSM3587933 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587934 D113 AML328_D113 AML
11 /endosome/archive/bioinformatics/DLLab/src/Aix... AML916 D0 GSM3587988 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587989 D0 AML916_D0 AML
12 /endosome/archive/bioinformatics/DLLab/src/Aix... AML329 D37 GSM3587944 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587945 D37 AML329_D37 AML
13 /endosome/archive/bioinformatics/DLLab/src/Aix... AML707B D18 GSM3587973 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587974 D18 AML707B_D18 AML
14 /endosome/archive/bioinformatics/DLLab/src/Aix... AML556 D0 GSM3587963 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587964 D0 AML556_D0 AML
15 /endosome/archive/bioinformatics/DLLab/src/Aix... AML997 D0 GSM3587992 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587993 D0 AML997_D0 AML
16 /endosome/archive/bioinformatics/DLLab/src/Aix... AML722B D0 GSM3587980 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587981 D0 AML722B_D0 AML
17 /endosome/archive/bioinformatics/DLLab/src/Aix... AML707B D97 GSM3587977 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587978 D97 AML707B_D97 AML
18 /endosome/archive/bioinformatics/DLLab/src/Aix... AML1012 D0 GSM3587923 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587924 D0 AML1012_D0 AML
19 /endosome/archive/bioinformatics/DLLab/src/Aix... AML870 D0 GSM3587984 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587985 D0 AML870_D0 AML
20 /endosome/archive/bioinformatics/DLLab/src/Aix... AML329 D20 GSM3587942 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587943 D20 AML329_D20 AML
21 /endosome/archive/bioinformatics/DLLab/src/Aix... AML419A D0 GSM3587950 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587951 D0 AML419A_D0 AML
22 /endosome/archive/bioinformatics/DLLab/src/Aix... AML707B D0 GSM3587969 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587970 D0 AML707B_D0 AML
23 /endosome/archive/bioinformatics/DLLab/src/Aix... AML329 D0 GSM3587940 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587941 D0 AML329_D0 AML
24 /endosome/archive/bioinformatics/DLLab/src/Aix... AML556 D31 GSM3587967 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587968 D31 AML556_D31 AML
25 /endosome/archive/bioinformatics/DLLab/src/Aix... AML870 D14 GSM3587986 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587987 D14 AML870_D14 AML
26 /endosome/archive/bioinformatics/DLLab/src/Aix... BM4 BM4 GSM3588000 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3588001 NaN BM4 control
27 /endosome/archive/bioinformatics/DLLab/src/Aix... AML475 D29 GSM3587961 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587962 D29 AML475_D29 AML
28 /endosome/archive/bioinformatics/DLLab/src/Aix... OCI NaN GSM3588005 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3588005 NaN OCI cellline
29 /endosome/archive/bioinformatics/DLLab/src/Aix... AML997 D35 GSM3587994 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587995 D35 AML997_D35 AML
30 /endosome/archive/bioinformatics/DLLab/src/Aix... BM2 BM2 GSM3587997 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587997 NaN BM2 control
31 /endosome/archive/bioinformatics/DLLab/src/Aix... AML328 D29 GSM3587937 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587938 D29 AML328_D29 AML
32 /endosome/archive/bioinformatics/DLLab/src/Aix... AML420B D35 GSM3587957 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587958 D35 AML420B_D35 AML
33 /endosome/archive/bioinformatics/DLLab/src/Aix... AML921A D0 GSM3587990 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587991 D0 AML921A_D0 AML
34 /endosome/archive/bioinformatics/DLLab/src/Aix... AML420B D0 GSM3587953 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587954 D0 AML420B_D0 AML
35 /endosome/archive/bioinformatics/DLLab/src/Aix... BM3 BM3 GSM3587998 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587999 NaN BM3 control
36 /endosome/archive/bioinformatics/DLLab/src/Aix... BM1 BM1 GSM3587996 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587996 NaN BM1 control
37 /endosome/archive/bioinformatics/DLLab/src/Aix... BM5 BM5-34p38n GSM3588003 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3588003 NaN BM5-34p38n control
38 /endosome/archive/bioinformatics/DLLab/src/Aix... AML371 D0 GSM3587946 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587947 D0 AML371_D0 AML
39 /endosome/archive/bioinformatics/DLLab/src/Aix... AML722B D49 GSM3587982 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587983 D49 AML722B_D49 AML
40 /endosome/archive/bioinformatics/DLLab/src/Aix... MUTZ3 NaN GSM3588004 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3588004 NaN MUTZ3 cellline
41 /endosome/archive/bioinformatics/DLLab/src/Aix... BM5 BM5-34p GSM3588002 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3588002 NaN BM5-34p control
42 /endosome/archive/bioinformatics/DLLab/src/Aix... AML328 D171 GSM3587935 /endosome/archive/bioinformatics/DLLab/src/Aix... GSM3587936 D171 AML328_D171 AML
# Create a dict of AnnData objects from the files listed in df_paths
# (how the dict is keyed is defined in AML_data_reader.create_adata_dict, not shown here)
adata_dict = AML_reader.create_adata_dict(df_paths)
# Report how many AnnData objects were created
print(f"Created {len(adata_dict)} AnnData objects.")

/archive/bioinformatics/DLLab/shared/CondaEnvironments/Aixa_scDML/lib/python3.8/site-packages/anndata/_core/anndata.py:120: ImplicitModificationWarning: Transforming to str index.
  warnings.warn("Transforming to str index.", ImplicitModificationWarning)


Created 43 AnnData objects.
# Merge the AnnData objects stored in adata_dict into a single AnnData object
merged_adata = AML_reader.merge_adata_objects(adata_dict)


# Display a summary of the merged object
merged_adata
AnnData object with n_obs × n_vars = 41090 × 27899
    obs: 'Cell', 'NumberOfReads', 'AlignedToGenome', 'AlignedToTranscriptome', 'TranscriptomeUMIs', 'NumberOfGenes', 'CyclingScore', 'CyclingBinary', 'MutTranscripts', 'WtTranscripts', 'PredictionRF2', 'PredictionRefined', 'CellType', 'Score_HSC', 'Score_Prog', 'Score_GMP', 'Score_ProMono', 'Score_Mono', 'Score_cDC', 'Score_pDC', 'Score_earlyEry', 'Score_lateEry', 'Score_ProB', 'Score_B', 'Score_Plasma', 'Score_T', 'Score_CTL', 'Score_NK', 'NanoporeTranscripts', 'id', 'Day', 'unique_id', 'Patient_group'
    var: 'Gene'
# Inspect the obs table (one row per cell) of the merged object
merged_adata.obs

Cell NumberOfReads AlignedToGenome AlignedToTranscriptome TranscriptomeUMIs NumberOfGenes CyclingScore CyclingBinary MutTranscripts WtTranscripts ... Score_B Score_Plasma Score_T Score_CTL Score_NK NanoporeTranscripts id Day unique_id Patient_group
0 AML328-D0_AAAAACAGAAGT 24994 15391 7477 1236 581 -0.351 no NaN NaN ... 0.042 0.009 0.132 0.184 0.447 NaN AML328 D0 AML328_D0 AML
1 AML328-D0_AAAACCGCTACT 55122 34633 17252 3394 1238 -0.409 no NaN NaN ... 0.071 0.020 0.070 0.052 0.037 NaN AML328 D0 AML328_D0 AML
2 AML328-D0_AAAACCGGCTTT 43393 26813 16148 2649 1243 -0.401 no NaN NaN ... 0.062 0.052 0.046 0.032 0.034 NaN AML328 D0 AML328_D0 AML
3 AML328-D0_AAAAGCTTATCA 25085 15404 9483 1582 633 -0.378 no NaN NaN ... 0.060 0.009 0.491 0.167 0.080 NaN AML328 D0 AML328_D0 AML
4 AML328-D0_AAAAGTCCCCGT 54911 33226 20545 3280 1376 -0.629 no NaN NaN ... 0.012 0.003 0.006 0.010 0.009 NaN AML328 D0 AML328_D0 AML
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
41085 AML328-D171_CCATCATCCACC 26414 19298 13674 2516 939 -0.257 no NaN NaN ... 0.058 0.016 0.464 0.135 0.109 NaN AML328 D171 AML328_D171 AML
41086 AML328-D171_TTTTATCATTCT 27460 20073 9433 1651 878 -0.524 no NaN NaN ... 0.066 0.023 0.281 0.226 0.213 NaN AML328 D171 AML328_D171 AML
41087 AML328-D171_AAGATGTAGCGT 12394 8787 6543 1331 504 -0.348 no NaN NaN ... 0.088 0.019 0.136 0.074 0.041 NaN AML328 D171 AML328_D171 AML
41088 AML328-D171_CTGTAGCTCCTA 19172 13904 10351 1792 745 -0.394 no NaN NaN ... 0.107 0.015 0.121 0.045 0.052 NaN AML328 D171 AML328_D171 AML
41089 AML328-D171_GATTTGGACGTT 15912 11652 6120 1153 580 -0.310 no NaN NaN ... 0.077 0.020 0.365 0.138 0.067 NaN AML328 D171 AML328_D171 AML

41090 rows × 33 columns

# Set save_data to True to write the merged AnnData object to disk.
# The merged count matrix is saved under
# /scMEDAL_for_scRNAseq/Experiments/data/AML_data/adata_merged
save_data = False
if save_data:
    # Build the output directory with os.path.join (the same way parent_path
    # is built) instead of string concatenation, so path separators are
    # handled consistently.
    save_adata(merged_adata, output_path=os.path.join(data_base_path, "adata_merged"))
Created folder: /archive/bioinformatics/DLLab/AixaAndrade/data/Genomic_data/VanGallen_2019/adata_merged