### import the modules and packages needed
import json
import re
import time
from glob import glob
from typing import List, Any
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
import requests
import Bio
from Bio import SeqIO, SearchIO, Entrez
from Bio.Seq import Seq
# NOTE(review): `GC` appears to have been removed from recent Biopython
# releases in favour of `gc_fraction` -- confirm against the installed version.
from Bio.SeqUtils import GC
from Bio.Blast import NCBIWWW
from Bio.Data import CodonTable
from ratelimit import limits
import tqdm

# Base URL and suffix used to build the per-SNP frequency download link.
NCBI_SNP_URL = "https://www.ncbi.nlm.nih.gov/snp/"
FREQUENCY_SUFFIX = "/download/frequency"

# Marker line that precedes the frequency table in the downloaded text.
SECTION_SEPARATOR = "#################"

# Seconds to wait for NCBI before giving up on a single SNP, so that one
# stalled connection cannot hang a whole batch run.
REQUEST_TIMEOUT_SECONDS = 30


def _parse_frequency_text(text):
    """Turn the raw text of a frequency download into a DataFrame.

    Args:
        text (str): Body of the response returned by the frequency URL.

    Returns:
        pd.DataFrame: One row per data line of the table, with column names
        taken from the table's first line.

    Raises:
        IndexError: If the separator is missing or the table section is empty.
    """
    # Everything after the first separator is treated as the table section.
    table_section = text.split(SECTION_SEPARATOR)[1]
    # Drop the first and last elements of the split, as the original code did.
    lines = table_section.split("\n")[1:-1]

    table = pd.DataFrame([line.split("\t") for line in lines])
    table.columns = table.iloc[0]  # Use the first row as the header
    return table.drop(table.index[0])  # Remove the header row from the data


#### The code below is used to extract the frequency table of a given gene
#### from the NCBI database, using the snp_id of the gene extracted from Pharmgkb.
def frequency_table(*, snp_id=None, gene_name=None, gene_consequence=None):
    """Retrieve the allele frequency table for a given SNP from NCBI.

    Args:
        snp_id (str): The SNP identifier (e.g., 'rs12345').
        gene_name (str): The name of the gene associated with the SNP.
        gene_consequence (str): The effect or consequence of the SNP on the gene.

    Returns:
        pd.DataFrame: Allele frequency data with extra "gene",
        "gene_consequence" and "snp_id" columns. An empty list is returned
        when the data cannot be downloaded or parsed, so callers can tell
        the two outcomes apart by type.
    """
    snp_id = str(snp_id)
    actual_link = NCBI_SNP_URL + snp_id + FREQUENCY_SUFFIX

    try:
        response = requests.get(actual_link, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
        data = _parse_frequency_text(response.text)
    except (requests.RequestException, IndexError, ValueError):
        # Only expected failures are handled here (network/HTTP errors or a
        # page without a frequency table); KeyboardInterrupt and genuine
        # programming errors are deliberately left to propagate.
        print(f"There is no info for the snp_id entered: {snp_id}")
        return []

    # Add gene metadata columns
    data["gene"] = gene_name
    data["gene_consequence"] = gene_consequence
    data["snp_id"] = snp_id
    return data


#### compile all the data by passing the snps_ids information.
def compile_data(snps_data=None):
    """Compile allele frequency data for a table of SNPs.

    Args:
        snps_data (pd.DataFrame): SNP information with columns "Variant",
            "Genes", and "Gene_consequence".

    Returns:
        pd.DataFrame: Concatenation of the frequency tables of every SNP for
        which data was found; an empty DataFrame if none was found.

    Raises:
        ValueError: If ``snps_data`` is not provided.
    """
    if snps_data is None:
        raise ValueError("snps_data must be a DataFrame of SNP information")

    list_of_data = []

    # `tqdm` is imported as a module, so the progress bar class is tqdm.tqdm.
    for _, row in tqdm.tqdm(snps_data.iterrows(), total=len(snps_data)):
        data = frequency_table(
            snp_id=row["Variant"],
            gene_name=row["Genes"],
            gene_consequence=row["Gene_consequence"],
        )
        # frequency_table returns an empty list when no data was found.
        if isinstance(data, pd.DataFrame):
            list_of_data.append(data)

    # pd.concat raises ValueError when given nothing to concatenate.
    if not list_of_data:
        return pd.DataFrame()

    return pd.concat(list_of_data)


if __name__ == '__main__':
    # NOTE(review): looks like a placeholder; compile_data needs a DataFrame.
    snps_data = "Pass"