# cgmlst-hinf / pangenome_tools_result_processing.py
# Consolidate the core genome detected by the PIRATE pangenome tool* with the results from other pangenome tool(s)*
##by Krisna, M & Monteith, W 2023

## *Also applicable, with modifications, to results from Panaroo, PEPPAN, and chewBBACA

import os
import sys

import pandas as pd

# Directory holding the modified GFF files that are scanned for the loci;
# edit this to point at the local results directory.
GFF_DIR = '/path/to/modified/gffs/result/'

# Columns the input CSV must provide; they are copied unchanged to the output.
REQUIRED_COLUMNS = ('gene_family', 'locus_id', 'isolate_id')

# A line is only usable as a feature record if it has at least this many
# tab-separated columns: index 3 (start), index 4 (end) and index 8
# (attribute column holding ``prev_locus=``) are read from it.
MIN_GFF_COLUMNS = 9


def _find_feature_fields(loci, gff_dir):
    """Return ``{locus: fields}`` for the first usable line mentioning each locus.

    Every regular file in *gff_dir* is read exactly once (in sorted name
    order). A line is assigned to a locus when the locus id occurs in it as
    a substring and the line has at least ``MIN_GFF_COLUMNS`` tab-separated
    columns; *fields* is that line split on tabs with the line ending removed.
    Loci that never match are simply absent from the returned dict.
    """
    # NOTE(review): matching is by substring, as in the original script, so an
    # id that is a prefix of another id (e.g. "x_1" vs "x_10") could match the
    # wrong line - confirm locus ids are fixed-width.
    pending = set(loci)
    found = {}
    filenames = sorted(
        name for name in os.listdir(gff_dir)
        # Sub-directories cannot be opened as text files; skip them.
        if os.path.isfile(os.path.join(gff_dir, name))
    )
    for name in filenames:
        if not pending:
            break  # every locus already located; no need to read more files
        with open(os.path.join(gff_dir, name), 'r') as handle:
            for line in handle:
                hits = [locus for locus in pending if locus in line]
                if not hits:
                    continue
                # Drop the line ending so a trailing attribute value does not
                # carry a newline into the output table.
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) < MIN_GFF_COLUMNS:
                    # Not a feature record (e.g. a header line or a stray
                    # CSV sitting in the directory); keep looking.
                    continue
                for locus in hits:
                    found[locus] = fields
                pending.difference_update(hits)
                if not pending:
                    break
    return found


def _prev_locus(attributes):
    """Return the value of ``prev_locus=`` in *attributes*, or None if absent.

    The value runs from just after the first ``prev_locus=`` up to the next
    ``;`` (or the end of the string).
    """
    _, marker, remainder = attributes.partition('prev_locus=')
    if not marker:
        return None
    return remainder.split(';', 1)[0]


def main(argv=None, gff_dir=GFF_DIR):
    """Annotate the loci listed in a CSV with coordinates taken from GFF files.

    Args:
        argv: ``[infile, outfile]``; defaults to ``sys.argv[1:]``. *infile*
            is a CSV with the columns ``gene_family``, ``locus_id`` and
            ``isolate_id``, formatted as detailed in
            https://www.protocols.io/private/EF6DB7FE429311EEB8630A58A9FEAC02
            *outfile* is the CSV to write.
        gff_dir: directory whose files are searched for each ``locus_id``.

    Returns:
        The DataFrame that was written to *outfile*. It has the three input
        columns plus ``start_loc``, ``end_loc`` and ``prokka_locus_name``.
        Loci not found in any file are reported on stdout and get empty
        values in the three added columns.

    Raises:
        SystemExit: if fewer than two arguments are given or the input CSV
            lacks a required column.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit(
            'usage: pangenome_tools_result_processing.py '
            '<infile.csv> <outfile.csv>'
        )
    infile, outfile = args[0], args[1]

    df_p2 = pd.read_csv(infile)
    absent = [col for col in REQUIRED_COLUMNS if col not in df_p2.columns]
    if absent:
        sys.exit('input CSV is missing column(s): ' + ', '.join(absent))
    core_gene_list_p2 = df_p2['locus_id'].to_list()

    features = _find_feature_fields(core_gene_list_p2, gff_dir)

    start_loc_p2 = []
    end_loc_p2 = []
    prev_locus = []
    for locus in core_gene_list_p2:
        fields = features.get(locus)
        if fields is None:
            # Keep the row (so the columns stay aligned with the input) but
            # leave its coordinates empty instead of aborting the whole run.
            print(locus, 'not found in gffs')
            start_loc_p2.append(None)
            end_loc_p2.append(None)
            prev_locus.append(None)
            continue
        start_loc_p2.append(fields[3])
        end_loc_p2.append(fields[4])
        prev_locus.append(_prev_locus(fields[8]))

    pirate_output = pd.DataFrame(
        {
            'gene_family': df_p2['gene_family'],
            'locus_id': df_p2['locus_id'],
            'isolate_id': df_p2['isolate_id'],
            'start_loc': start_loc_p2,
            'end_loc': end_loc_p2,
            'prokka_locus_name': prev_locus,
        }
    )

    # The original script changed into the GFF directory before writing, so a
    # relative outfile landed there. Resolving against gff_dir keeps that
    # location without changing the process working directory; an absolute
    # outfile is used as given.
    pirate_output.to_csv(os.path.join(gff_dir, outfile))
    return pirate_output


if __name__ == '__main__':
    main()