""" ASD Data Processing Script This script processes the ASD dataset obtained from https://cells.ucsc.edu/?ds=autism. The dataset should be saved in: scMEDAL_for_scRNAseq/Experiments/data/ASD_data/norm The script preprocesses the data and stores the processed output in: scMEDAL_for_scRNAseq/Experiments/data/ASD_data/scenario_id By default, `scenario_id` is set to "log_transformed_2916hvggenes". Environment: This script should be run in the `Aixa_scDML` environment. """ import os import pandas as pd import numpy as np import scanpy as sc import copy import sys # ---- Importing Required Modules and Configurations ---- # Add the parent directory to the Python path to import data_base_path from paths_config sys.path.append("../") from paths_config import data_base_path, scenario_id from scMEDAL.utils.utils import save_adata from scMEDAL.utils.preprocessing_utils import scRNAseq_pipeline_loghvg # ---- Define Paths ---- # Path to the directory containing normalized data parent_path = os.path.join(data_base_path, "norm") # File name for the dataset healthy_heart_file = "Healthy_human_heart_adata.h5ad" file_path = os.path.join(parent_path, healthy_heart_file) # ---- Step 1: Read the Dataset and Metadata ---- # Load gene expression data (transpose since dataset is in genes*cells format) adata_log2 = sc.read_text(parent_path + "/exprMatrix.tsv", first_column_names=True).T # Load metadata and gene IDs meta = pd.read_csv(parent_path + "/meta.tsv", sep="\t") gene_ids = pd.read_csv(parent_path + "/geneids.txt", sep="\t") # Assign metadata and gene IDs to the AnnData object adata_log2.obs = meta adata_log2.var['gene_ids'] = gene_ids.values # Ensure uniqueness of gene and cell names adata_log2.var_names_make_unique(join="-") adata_log2.obs_names_make_unique(join="-") # Print dimensions of the AnnData object to confirm cells*genes format print("AnnData object dimensions (cells*genes):", adata_log2.X.shape) # ---- Step 2: Preprocessing ---- # Reverse log2 transformation of the data adata = copy.deepcopy(adata_log2) # Rename cluster column to celltype for clarity adata.obs['celltype'] = adata.obs['cluster'].astype('category') # Format donor information adata.obs['donor'] = ['donor_' + str(d) for d in adata.obs['individual']] adata.obs['donor'] = adata.obs['donor'].astype('category') # Assign donor information as batch adata.obs['batch'] = adata.obs['donor'] # Print range of normalized data before transformation print("Normalized data range before reversing log2:", np.min(adata.X), np.max(adata.X)) # Reverse log2 transformation: Add 1 before applying exp2 to revert log2(x+1) adata.X = np.exp2(adata.X) - 1 # Print range of data after transformation print("Data range after reversing log2 transformation:", np.min(adata.X), np.max(adata.X)) print("adata:", adata) # ---- Step 3: Save Processed Data (Optional) ---- save_data = False expt = scenario_id # Scenario ID determines predefined preprocessing steps if save_data: out_path = os.path.join(data_base_path, expt) print(out_path) # Create directory if it does not exist if not os.path.exists(out_path): os.makedirs(out_path) # ---- Step 4: Additional Preprocessing for Specific Experiments ---- if expt == "log_transformed_2916hvggenes": adata_log_hvg = scRNAseq_pipeline_loghvg( adata, min_genes_per_cell=10, min_cells_per_gene=3, total_counts_per_cell=10000, n_top_genes=2916 ) if save_data: save_adata(adata_log_hvg, out_path) print("adata after preprocessing:", adata_log_hvg)