geneExtraction.py · precision-medicine

### import the modules and packages needed
import pandas as pd
import numpy as np
from glob import glob
import requests
import re
import json
from urllib.request import urlretrieve
import Bio
from Bio import SeqIO, SearchIO, Entrez
from Bio.Seq import Seq
from Bio.SeqUtils import GC
from Bio.Blast import NCBIWWW
from Bio.Data import CodonTable
from ratelimit import limits
import time
from typing import List, Any
import tqdm 





#### Extract the allele frequency table of a given SNP from the NCBI database, using the SNP ID obtained from PharmGKB.
def frequency_table(*, snp_id=None, gene_name=None, gene_consequence=None, timeout=30):
    """
    Retrieve the allele frequency table for a given SNP from the NCBI website.

    The table is downloaded from
    ``https://www.ncbi.nlm.nih.gov/snp/<snp_id>/download/frequency`` and parsed
    from tab-separated text into a DataFrame.

    Args:
        snp_id (str): The SNP identifier (e.g., 'rs12345'). It is converted
            with ``str`` before being inserted into the URL.
        gene_name (str): The name of the gene associated with the SNP.
        gene_consequence (str): The effect or consequence of the SNP on the gene.
        timeout (float): Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller forever.

    Returns:
        pd.DataFrame | list: On success, a DataFrame whose columns are taken
        from the first line of the downloaded table, plus the extra columns
        "gene", "gene_consequence" and "snp_id". On failure (network/HTTP
        error or a payload without a parsable table) a message is printed and
        an empty list is returned; callers rely on that to skip the SNP.
    """
    # Construct the URL to access the SNP frequency data from NCBI
    snp_id = str(snp_id)
    actual_link = "https://www.ncbi.nlm.nih.gov/snp/" + snp_id + "/download/frequency"

    try:
        # Request the frequency data; fail fast on timeouts and HTTP errors
        response = requests.get(actual_link, timeout=timeout)
        response.raise_for_status()

        # The table is the second segment after splitting on the '#' separator;
        # the first and last lines of that segment are discarded.
        table_section = response.text.split("#################")[1]
        lines = table_section.split("\n")[1:-1]

        # Convert the raw text into a DataFrame
        data = pd.DataFrame([line.split("\t") for line in lines])
        data.columns = data.iloc[0]  # Set the first row as the header
        data = data.drop(data.index[0])  # Drop the header row from the data

        # Add gene metadata columns
        data["gene"] = gene_name
        data["gene_consequence"] = gene_consequence
        data["snp_id"] = snp_id
        return data
    except (requests.RequestException, IndexError, ValueError):
        # Only expected failures are handled here (request errors, missing
        # separator, empty table); KeyboardInterrupt and genuine programming
        # errors are no longer silently swallowed.
        print(f"There is no info for the snp_id entered: {snp_id}")
        return []

    
    



#### Compile the frequency tables of all SNPs listed in the input SNP data.
def compile_data(snps_data=None):
    """
    Compile allele frequency data for every SNP listed in a DataFrame.

    Args:
        snps_data (pd.DataFrame): DataFrame containing SNP information with columns
                                  "Variant", "Genes", and "Gene_consequence".

    Returns:
        pd.DataFrame: Concatenated DataFrame with allele frequency data for all
        SNPs for which ``frequency_table`` returned a DataFrame. An empty
        DataFrame is returned when no SNP yielded any data.
    """
    list_of_data = []

    # The file imports the `tqdm` module (not the class), so the progress bar
    # must be created via `tqdm.tqdm`; calling the module itself raises
    # TypeError. `total` lets the bar show completion, since iterrows() is a
    # generator with no length.
    for _, row in tqdm.tqdm(snps_data.iterrows(), total=len(snps_data)):
        # Retrieve frequency table for the current SNP
        data = frequency_table(
            snp_id=row["Variant"],
            gene_name=row["Genes"],
            gene_consequence=row["Gene_consequence"],
        )
        # frequency_table returns a non-DataFrame value when nothing was found;
        # only keep real tables.
        if isinstance(data, pd.DataFrame):
            list_of_data.append(data)

    # pd.concat raises ValueError on an empty list, so handle "no data at all"
    # explicitly instead of crashing after the whole download loop.
    if not list_of_data:
        return pd.DataFrame()

    # Concatenate all individual DataFrames into a single DataFrame
    return pd.concat(list_of_data)



if __name__ == "__main__":
    # Script entry point: only a placeholder value is assigned here; nothing
    # is downloaded or compiled when the file is run directly.
    snps_data = "Pass"