"""AML Data Processing Script
This script processes the unified count matrix of the AML dataset.
**Prerequisite:** Run `AML_data_reader.ipynb` first and save the unified count matrix in:
`/scMEDAL_for_scRNAseq/Experiments/data/AML_data/adata_merged`
The script then preprocesses the data and saves the processed output in:
`/scMEDAL_for_scRNAseq/Experiments/data/AML_data/scenario_id`
By default, `scenario_id` is set to `"log_transformed_2916hvggenes"`.
Environment: preprocess_and_plot_umaps_env
"""
import os
import sys
import anndata
# ---- Add Parent Directory to Path ----
sys.path.append("../") # To import data_base_path from paths_config.py
from paths_config import data_base_path, scenario_id
from scMEDAL.utils.utils import read_adata, save_adata
from scMEDAL.utils.preprocessing_utils import scRNAseq_pipeline_loghvg
# ---- Define Data Paths ----
parent_path = os.path.join(data_base_path, "adata_merged")
# ---- Read AnnData ----
X, var, obs = read_adata(folder_path=parent_path)
adata = anndata.AnnData(X=X, obs=obs, var=var)
# ---- Convert 'nan' to String ----
adata.obs["CellType"] = adata.obs["CellType"].astype("str")
# ---- Filter Cells ----
# van Galen et al. 2019 randomly filtered 783 cells of 1,590 BM5 CD34+CD38- cells to reduce representation of this population
# Condition 1: Exclude 'nan' cells
keep_cells_condition1 = adata.obs["CellType"] != "nan"
# Dai et al. 2021 excludes AML314, AML371, AML722B, and AML997 samples due to unconfident annotations
# https://www.frontiersin.org/articles/10.3389/fcell.2021.762260/full
# Condition 2: Exclude specific IDs
exclude_ids = ["AML314", "AML371", "AML722B", "AML997"]
keep_cells_condition2 = ~adata.obs["id"].isin(exclude_ids)
# Combine both conditions using logical AND (&)
keep_cells_combined_condition = keep_cells_condition1 & keep_cells_condition2
# Apply the combined condition to filter `adata` directly
filtered_adata = adata[keep_cells_combined_condition, :].copy()
# ---- Rename Columns for Clarity ----
filtered_adata.obs["celltype"] = filtered_adata.obs["CellType"]
filtered_adata.obs["batch"] = filtered_adata.obs["id"]
print(f"Original number of cells: {adata.X.shape[0]}")
print(f"Number of cells after filtering: {filtered_adata.X.shape[0]}")
print("adata before preprocessing:", filtered_adata)
# ---- Save Processed Data ----
save_data = True
expt = scenario_id
if save_data:
out_path = os.path.join(data_base_path, expt)
print(out_path)
os.makedirs(out_path, exist_ok=True) # Create directory if it doesn't exist
# ---- Preprocessing: Log Transformation and HVG Selection ----
if expt == "log_transformed_2916hvggenes":
adata_log_hvg = scRNAseq_pipeline_loghvg(
filtered_adata,
min_genes_per_cell=10,
min_cells_per_gene=3,
total_counts_per_cell=10_000,
n_top_genes=2916,
)
if save_data:
save_adata(adata_log_hvg, out_path)
print("adata after preprocessing:", adata_log_hvg)