#!/usr/bin/env python3 import sys import numpy as np import tempfile import os import pysam os.environ['TMPDIR'] = os.getcwd() def process_genome(file_path): hs_genome = {} with open(file_path, 'r') as file: tag = None for line in file: line = line.strip() if line.startswith('>'): tag = line[1:].split()[0] hs_genome[tag] = "" else: hs_genome[tag] += line return hs_genome def process_target(file_path, hs_genome, temp_file_path): with open(file_path, 'r') as file, open(temp_file_path, 'a') as temp_file: for line in file: line = line.strip() #if not line.startswith('chr'): # continue # Skip lines not starting with 'chr' ar = line.split('\t') # Genomic coordinates chrom = ar[1] start = int(ar[5]) + 1 end = int(ar[6]) + 1 seq_length = end - start # Form the tag for the output tag = f"{ar[0]}\t0\t{chrom}\t{start}\t60\t{seq_length}M\t*\t0\t0" # Extract 6mA and 5mC sites ref6mA = ar[8] ref5mC = ar[10] try: ref6mA_array = np.fromstring(ref6mA[1:-1], sep=', ') # Assuming comma-separated values without brackets #ref5mC_array = np.fromstring(ref5mC[1:-1], sep=', ') except ValueError: #print(f"Error parsing arrays for line: {line}") continue # Combine and filter valid modification sites refmodsite = ref6mA_array #refmodsite = np.concatenate([ref6mA_array, ref5mC_array]) refmodsite = refmodsite[refmodsite != -1] if refmodsite.size > 0: refmodml = np.full(refmodsite.shape, 255) # Create an array filled with 255 refmodsite = np.sort(refmodsite - int(ar[5])) # Sort the mod sites and adjust by start refmodsitedf = (np.diff(refmodsite)) - 1 refmodfinal = np.insert(refmodsitedf, 0, refmodsite[0]) # Keep the first element refmodfinal = [int(x) for x in refmodfinal] mmstr = ','.join(map(str, refmodfinal)) mlstr = ','.join(map(str, refmodml)) try: sequence = hs_genome[chrom][int(ar[5]):int(ar[6])] except KeyError: print(f"Chromosome {chrom} not found in genome data.") continue temp_file.write(f"{tag}\t{sequence}\t*\tMM:Z:N+n?,{mmstr}\tML:B:C,{mlstr}\n") else: try: sequence = hs_genome[chrom][int(ar[5]):int(ar[6])] except KeyError: print(f"Chromosome {chrom} not found in genome data.") continue temp_file.write(f"{tag}\t{sequence}\t*\n") def extract_header(bam_file_path, temp_file): with pysam.AlignmentFile(bam_file_path, 'rb', check_sq = False) as bam_file: for header_line in bam_file.text.splitlines(): temp_file.write(header_line + '\n') def merge_and_sort(temp_file_path, output_bam_path): # Convert the temporary file to a BAM file temp_bam_path = temp_file_path.replace(".sam", ".bam") pysam.view("-bS", "-o", temp_bam_path, temp_file_path, catch_stdout = False) # Sort the BAM file #sorted_bam_path = output_bam_path.replace(".bam", ".sorted.bam") pysam.sort("-o", output_bam_path, temp_bam_path) pysam.index(output_bam_path) # Clean up temporary BAM file os.remove(temp_bam_path) #return sorted_bam_path def main(): if len(sys.argv) < 5: print("Usage: script.py ") sys.exit(1) genome_file = sys.argv[1] target_file = sys.argv[2] alignbam_file = sys.argv[3] out_bam = sys.argv[4] # Process the genome file hs_genome = process_genome(genome_file) # Create a temporary file to store processed target data with tempfile.NamedTemporaryFile(delete=False, suffix = ".sam") as temp_file: temp_file_path = temp_file.name with open(temp_file_path, 'w') as temp_file: extract_header(alignbam_file, temp_file) # Process the target file process_target(target_file, hs_genome, temp_file_path) #header = extract_header(alignbam_file, temp_file_path) # Optionally, print the header if provided merge_and_sort(temp_file_path, out_bam) print(f"Sorted BAM file created: {out_bam}") os.remove(temp_file_path) #os.remove(out_bam) if __name__ == "__main__": main()