import numpy as np import sys import pandas as pd def cluster_array(input_array, distance_threshold): if len(input_array) == 0: return [] input_array = np.array(input_array) clusters = [] current_cluster = [input_array[0]] for i in range(1, len(input_array)): if input_array[i] - input_array[i - 1] < distance_threshold: current_cluster.append(input_array[i]) else: if len(current_cluster) >= 3: clusters.append((current_cluster[0], current_cluster[-1])) current_cluster = [input_array[i]] if len(current_cluster) >= 3: clusters.append((current_cluster[0], current_cluster[-1])) return clusters def calculate_adjacent_distances(clusters): distances = [] size = [int(clusters[i][1] - clusters[i][0]+1) for i in range(len(clusters))] for i in range(1, len(clusters)): previous_cluster_end = clusters[i - 1][1] current_cluster_start = clusters[i][0] distance = current_cluster_start - previous_cluster_end distances.append(distance) return distances,size # Example usage def main(ipdcsv, distance_threshold): ipdout = sys.argv[3] + '_clusterallfmt.xls' sm6mA_distri = np.zeros(600, dtype = int) smsize_distri = np.zeros(600, dtype = int) smadj_distri = np.zeros(600, dtype = int) bin_edges = np.linspace(0, 600, 601) with open(ipdcsv, 'r') as f, open(ipdout, 'w') as fout: for line in f: row = line.strip().split('\t') if len(row) < 9: # Ensure there are enough columns print(line.strip(), "Missing columns", sep="\t") continue sm6mA = row[7] ref6mA = row[8] try: sm6mA_array = np.fromstring(sm6mA[1:-1], sep=', ') ref6mA_array = np.fromstring(ref6mA[1:-1], sep=', ') except ValueError: print(line.strip(), "Invalid array format", sep="\t") continue if sm6mA_array.size == 0 or ref6mA_array.size == 0: print(line.strip(), "Empty array", sep="\t") continue sm6mA_arraydf = np.diff(sm6mA_array) sm6mA_arraydf = [int(x) for x in sm6mA_arraydf] output_arr = cluster_array(sm6mA_array, distance_threshold) adjdis_arr, sizes = calculate_adjacent_distances(output_arr) smadj_distri += np.histogram(adjdis_arr, bins = bin_edges)[0] smsize_distri += np.histogram(sizes, bins = bin_edges)[0] sm6mA_distri += np.histogram(sm6mA_arraydf, bins = bin_edges)[0] fout.write(f"{line.strip()}\t{output_arr}\t{adjdis_arr}\t{sizes}\t{sm6mA_arraydf}\n") smcluadjcfmt = pd.DataFrame({'Distance': bin_edges[:-1], 'Count': smadj_distri}) smcluadjcfmt.to_csv(sys.argv[3]+'_cluadjdistance.xls', header = False, sep = "\t", index = False) smadjcfmt = pd.DataFrame({'Distance': bin_edges[:-1], 'Count': sm6mA_distri}) smadjcfmt.to_csv(sys.argv[3]+'_6mAadjdistance.xls', header = False, sep = "\t", index = False) smsizecfmt = pd.DataFrame({'Distance': bin_edges[:-1], 'Count': smsize_distri}) smsizecfmt.to_csv(sys.argv[3]+'_smsizesdistance.xls', header = False, sep = "\t", index = False) if __name__ == "__main__": if len(sys.argv) < 3: print("Usage: python array_distance.py 6mAvs5mC.csv cluster_threshold") sys.exit(1) main(sys.argv[1], int(sys.argv[2]))