### import the modules and packages needed
import pandas as pd
import numpy as np
from glob import glob
import requests
import re
import json
from urllib.request import urlretrieve
import Bio
from Bio import SeqIO, SearchIO, Entrez
from Bio.Seq import Seq
from Bio.SeqUtils import GC
from Bio.Blast import NCBIWWW
from Bio.Data import CodonTable
from ratelimit import limits
import time
from typing import List, Any
import tqdm
#### Extract the allele frequency table of a given SNP from the NCBI SNP database, using the SNP ID (snp_id) obtained from PharmGKB.
def frequency_table(*, snp_id=None, gene_name=None, gene_consequence=None, timeout=30):
    """
    Retrieves the allele frequency table for a given SNP from the NCBI database.

    Args:
        snp_id (str): The SNP identifier (e.g., 'rs12345').
        gene_name (str): The name of the gene associated with the SNP.
        gene_consequence (str): The effect or consequence of the SNP on the gene.
        timeout (float): Seconds to wait for the server before giving up, so a
            stalled connection cannot block a batch run indefinitely.

    Returns:
        pd.DataFrame: DataFrame containing allele frequency data with additional
        columns "gene", "gene_consequence" and "snp_id". Returns an empty list
        (and prints a message) if the SNP ID is missing or blank, the request
        fails, or the response does not contain a parsable frequency table.
    """
    # A missing or blank identifier can never resolve to a record, so skip the
    # network round-trip entirely.
    if snp_id is None or not str(snp_id).strip():
        print(f"There is no info for the snp_id entered: {snp_id}")
        return []

    snp_id = str(snp_id)
    # Construct the URL to access the SNP frequency data from NCBI
    actual_link = "https://www.ncbi.nlm.nih.gov/snp/" + snp_id + "/download/frequency"

    try:
        # Request the frequency data from the constructed URL
        response = requests.get(actual_link, timeout=timeout)
        response.raise_for_status()
        # The table is taken from the text that follows the first run of '#'
        # characters; the first and last lines of that section are discarded.
        table_section = response.text.split("#################")[1]
        lines = table_section.split("\n")[1:-1]
        # Convert the tab-separated lines into a DataFrame
        data = pd.DataFrame([line.split("\t") for line in lines])
        data.columns = data.iloc[0]  # Set the first row as the header
        data = data.drop(data.index[0])  # Drop the header row from the data
    except (requests.RequestException, IndexError, ValueError):
        # Only request failures and malformed/absent tables are treated as
        # "no data"; anything else (e.g. Ctrl-C, programming errors) propagates.
        print(f"There is no info for the snp_id entered: {snp_id}")
        return []

    # Add gene metadata columns
    data["gene"] = gene_name
    data["gene_consequence"] = gene_consequence
    data["snp_id"] = snp_id
    return data
#### Compile the allele frequency data for every SNP listed in the input table.
def compile_data(snps_data=None):
    """
    Compiles allele frequency data for a list of SNPs.

    Args:
        snps_data (pd.DataFrame): DataFrame containing SNP information with columns
            "Variant", "Genes", and "Gene_consequence".

    Returns:
        pd.DataFrame: Concatenated DataFrame with allele frequency data for all
        SNPs for which a frequency table was retrieved. An empty DataFrame is
        returned when `snps_data` is None or no SNP yielded any data.
    """
    if snps_data is None:
        return pd.DataFrame()

    list_of_data = []
    # `tqdm` is imported as a module in this file, so the progress-bar class
    # must be reached as `tqdm.tqdm`; `total` lets the bar show completion,
    # since `iterrows()` is a generator with no length.
    for _, row in tqdm.tqdm(snps_data.iterrows(), total=len(snps_data)):
        # Retrieve frequency table for the current SNP
        result = frequency_table(
            snp_id=row["Variant"],
            gene_name=row["Genes"],
            gene_consequence=row["Gene_consequence"],
        )
        # frequency_table signals "no data" with a non-DataFrame value; skip those.
        if isinstance(result, pd.DataFrame):
            list_of_data.append(result)

    # pd.concat raises ValueError on an empty list, so handle that case explicitly.
    if not list_of_data:
        return pd.DataFrame()

    # Concatenate all individual DataFrames into a single DataFrame
    return pd.concat(list_of_data)
# Script entry point. Nothing is fetched here: `snps_data` is only bound to a
# string and is not passed to any function.
# NOTE(review): this looks like a placeholder — compile_data documents its
# input as a DataFrame with "Variant", "Genes" and "Gene_consequence" columns.
if __name__ == "__main__":
    snps_data = "Pass"