"""Audit per-speaker transcript files for consistency.

The working directory is expected to contain one sub-directory per speaker,
each holding transcript files named ``<speaker>/<speaker>_<NNN>.txt``.  For
every file index the script reports whether all speakers have the file and
whether its (whitespace-stripped) text is identical across speakers, then
prints the first distinct sentences in sorted order.
"""
import os

import numpy as np

# Transcript files are numbered 001..502 (inclusive).
FIRST_FILE_INDEX = 1
LAST_FILE_INDEX = 502
# At most this many speaker names are shown in an "absent" report line.
MAX_LISTED_SPEAKERS = 10
# At most this many sentences are printed in the final sorted listing.
MAX_PRINTED_SENTENCES = 100


def list_speakers(root):
    """Return the sorted names of the sub-directories of *root*.

    Every sub-directory is treated as a speaker.  Plain files (such as this
    script itself) are skipped so they are not reported as speakers that are
    missing every transcript.
    """
    return sorted(
        entry
        for entry in os.listdir(root)
        if os.path.isdir(os.path.join(root, entry))
    )


def collect_versions(root, speakers, index):
    """Read transcript number *index* for every speaker.

    Returns a ``(versions, missing)`` pair: ``versions`` holds the distinct
    stripped texts in first-seen order, ``missing`` holds the speakers whose
    file could not be read.
    """
    versions = []
    missing = []
    tag = "{:03d}".format(index)
    for speaker in speakers:
        path = os.path.join(root, speaker, speaker + "_" + tag + ".txt")
        try:
            with open(path) as handle:
                text = handle.read().strip()
        except (OSError, UnicodeDecodeError):
            # Best effort: an absent or unreadable file counts as missing.
            missing.append(speaker)
            continue
        if text not in versions:
            versions.append(text)
    return versions, missing


def describe_file(index, versions, missing):
    """Return the report lines for transcript number *index*.

    Missing files take priority over version mismatches.  Nothing is reported
    when there are neither versions nor missing speakers.
    """
    tag = "{:03d}".format(index)
    if len(versions) == 1 and not missing:
        return ["FILE: " + tag + " is okay."]
    if missing:
        shown = missing[:MAX_LISTED_SPEAKERS]
        return ["\tFile: " + tag + " is absent for " + str(shown)]
    if len(versions) > 1:
        header = (
            "\tFile: " + tag + " has " + str(len(versions))
            + " versions across all speakers:"
        )
        return [header] + ["\t\t" + version for version in versions]
    return []


def main(root="./"):
    """Audit every transcript index under *root* and print the report."""
    speakers = list_speakers(root)
    sentences = []
    for index in range(FIRST_FILE_INDEX, LAST_FILE_INDEX + 1):
        versions, missing = collect_versions(root, speakers, index)
        # Each distinct text is recorded together with its file index.
        sentences.extend(text + ", " + str(index) for text in versions)
        for line in describe_file(index, versions, missing):
            print(line)

    print("I found a total of " + str(len(sentences)) + " unique sentences.")
    print("Here they are alphabetically:")
    for sentence in np.sort(sentences)[:MAX_PRINTED_SENTENCES]:
        print(sentence)
    print("\n\nWhoever did this is a war criminal.")


if __name__ == "__main__":
    main()