This script processes the Healthy Heart dataset obtained from Figshare (Yu et al., 2023).
Dataset Location
The dataset should be saved in `scMEDAL_for_scRNAseq/Experiments/data/HealthyHeart_data/raw`.
Preprocessing and Output
The script preprocesses the data and stores the processed output in `scMEDAL_for_scRNAseq/Experiments/data/HealthyHeart_data/scenario_id`.
By default, `scenario_id` is set to `log_transformed_3000hvggenes`.
Environment
This script should be run in the `preprocess_and_plot_umaps_env` environment.
import anndata
# Print the installed anndata version so it appears in the run output.
print(anndata.__version__)
import os
from scMEDAL.utils.preprocessing_utils import H5ADLoader, scRNAseq_pipeline_log, scRNAseq_pipeline_loghvg
from scMEDAL.utils.utils import save_adata
import sys
# paths_config (which provides data_base_path and scenario_id) is expected to
# live in the parent directory, so that directory is appended to sys.path.
# NOTE(review): "../" is a relative entry, so it is resolved against the
# current working directory - run this from the directory that sits directly
# below the one containing paths_config.
sys.path.append("../")
# This import must stay after the sys.path.append call above.
from paths_config import data_base_path,scenario_id
# Name of the raw Healthy Heart file expected inside the raw-data directory.
healthy_heart_file = "Healthy_human_heart_adata.h5ad"

# Raw-data directory, located under data_base_path.
# Example: /MyscMEDALExpt/Experiments/data/HealthyHeart_data/raw
parent_path = os.path.join(data_base_path, "raw")
file_path = os.path.join(parent_path, healthy_heart_file)

# Read the .h5ad file with the project's H5ADLoader, then build the AnnData
# object from what was read.
loader = H5ADLoader(file_path)
loader.load_h5ad()
adata = loader.create_anndata()

# Optional: uncomment to also save the unprocessed AnnData in the raw directory.
# save_adata(adata, parent_path)
Reading X
CSR matrix created successfully with shape: [486134 33538], Transposed: False
Reading: X
Reading: layers
No data found under key 'layers'
Reading: obs
Reading: obsm
Reading: obsp
No data found under key 'obsp'
Reading: uns
Reading: var
Reading: varm
No data found under key 'varm'
Reading: varp
No data found under key 'varp'
adata
AnnData object with n_obs × n_vars = 486134 × 33538
obs: 'Age', 'AgeBin', 'DeathType', 'DonorID', 'Gender', 'Organ', 'Race', 'SampleType', 'Source', 'Tissue', 'TissueDetail', '_index', 'celltype', 'protocol', 'sampleID'
var: '_index', 'gene_ids'
uns: 'celltype_colors'
obsm: 'X_umap'
# Set to True to write the processed AnnData to disk; with False the
# preprocessing still runs but nothing is saved.
save_data = False

# Name of the preprocessing scenario to run. It doubles as the name of the
# output sub-directory under data_base_path.
# To use a custom scenario instead, set e.g.: expt = "expt_name"
expt = scenario_id  # In this case the scenario_id has predefined preprocessing
# Report the scenario that is actually used below (expt), so the message stays
# correct if expt is overridden.
print("expt:", expt)

if save_data:
    out_path = os.path.join(data_base_path, expt)
    print(out_path)
    # Create the output directory if it does not exist yet. exist_ok=True
    # avoids a separate existence check before creating it.
    os.makedirs(out_path, exist_ok=True)

# Adapt to the type of preprocessing you need.
# NOTE(review): an expt value matching neither branch below is silently
# skipped, and the later cells that display adata_log_hvg would then fail.
# Log transform log(x+1)
if expt == "log_transformed":
    adata_log = scRNAseq_pipeline_log(
        adata,
        min_genes_per_cell=10,
        min_cells_per_gene=3,
        total_counts_per_cell=10000,
    )
    if save_data:
        save_adata(adata_log, out_path)
# Log transform log(x+1) + HVG genes
elif expt == "log_transformed_3000hvggenes":
    adata_log_hvg = scRNAseq_pipeline_loghvg(
        adata,
        min_genes_per_cell=10,
        min_cells_per_gene=3,
        total_counts_per_cell=10000,
        n_top_genes=3000,
    )
    if save_data:
        save_adata(adata_log_hvg, out_path)
expt: log_transformed_3000hvggenes
adata_log_hvg
AnnData object with n_obs × n_vars = 486134 × 3000
obs: 'Age', 'AgeBin', 'DeathType', 'DonorID', 'Gender', 'Organ', 'Race', 'SampleType', 'Source', 'Tissue', 'TissueDetail', '_index', 'celltype', 'protocol', 'sampleID', 'n_genes'
var: '_index', 'gene_ids', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
uns: 'celltype_colors', 'log1p', 'hvg'
obsm: 'X_umap'
adata_log_hvg.var
| | _index | gene_ids | n_cells | highly_variable | means | dispersions | dispersions_norm |
---|---|---|---|---|---|---|---|
5 | b'AL627309.2' | b'ENSG00000239906' | 11 | True | 0.000101 | 2.430229 | 1.432752 |
7 | b'AL732372.1' | b'ENSG00000236601' | 6 | True | 0.000108 | 2.301117 | 1.012375 |
19 | b'AL645608.1' | b'ENSG00000223764' | 901 | True | 0.011911 | 2.308639 | 1.036867 |
20 | b'SAMD11' | b'ENSG00000187634' | 4056 | True | 0.047151 | 2.298307 | 1.003227 |
26 | b'HES4' | b'ENSG00000188290' | 95117 | True | 0.942373 | 2.632907 | 0.875079 |
... | ... | ... | ... | ... | ... | ... | ... |
33494 | b'S100B' | b'ENSG00000160307' | 2761 | True | 0.052005 | 3.101684 | 3.618944 |
33497 | b'MT-ND2' | b'ENSG00000198763' | 176135 | True | 2.562359 | 3.970598 | 1.051245 |
33499 | b'MT-CO2' | b'ENSG00000198712' | 245109 | True | 3.264734 | 4.525665 | 1.025560 |
33505 | b'MT-ND4' | b'ENSG00000198886' | 208143 | True | 3.113622 | 4.532405 | 1.050785 |
33513 | b'AC136616.2' | b'ENSG00000277761' | 7 | True | 0.000113 | 2.311633 | 1.046614 |
3000 rows × 7 columns